author | Jiri Kosina <jkosina@suse.cz> | 2012-10-28 14:28:52 -0400
---|---|---
committer | Jiri Kosina <jkosina@suse.cz> | 2012-10-28 14:29:19 -0400
commit | 3bd7bf1f0fe14f591c089ae61bbfa9bd356f178a (patch) |
tree | 0058693cc9e70b7461dae551f8a19aff2efd13ca /mm |
parent | f16f84937d769c893492160b1a8c3672e3992beb (diff) |
parent | e657e078d3dfa9f96976db7a2b5fd7d7c9f1f1a6 (diff) |
Merge branch 'master' into for-next
Sync up with Linus' tree to be able to apply Cesar's patch
against a newer version of the code.
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3
-rw-r--r-- | mm/Makefile | 4
-rw-r--r-- | mm/backing-dev.c | 50
-rw-r--r-- | mm/bootmem.c | 10
-rw-r--r-- | mm/compaction.c | 562
-rw-r--r-- | mm/fadvise.c | 34
-rw-r--r-- | mm/filemap.c | 6
-rw-r--r-- | mm/filemap_xip.c | 10
-rw-r--r-- | mm/fremap.c | 19
-rw-r--r-- | mm/frontswap.c | 34
-rw-r--r-- | mm/huge_memory.c | 441
-rw-r--r-- | mm/hugetlb.c | 34
-rw-r--r-- | mm/internal.h | 52
-rw-r--r-- | mm/interval_tree.c | 112
-rw-r--r-- | mm/kmemleak.c | 106
-rw-r--r-- | mm/ksm.c | 40
-rw-r--r-- | mm/madvise.c | 8
-rw-r--r-- | mm/memblock.c | 29
-rw-r--r-- | mm/memcontrol.c | 29
-rw-r--r-- | mm/memory-failure.c | 8
-rw-r--r-- | mm/memory.c | 115
-rw-r--r-- | mm/memory_hotplug.c | 77
-rw-r--r-- | mm/mempolicy.c | 153
-rw-r--r-- | mm/mlock.c | 27
-rw-r--r-- | mm/mmap.c | 210
-rw-r--r-- | mm/mmu_notifier.c | 89
-rw-r--r-- | mm/mremap.c | 73
-rw-r--r-- | mm/nobootmem.c | 5
-rw-r--r-- | mm/nommu.c | 39
-rw-r--r-- | mm/oom_kill.c | 4
-rw-r--r-- | mm/page-writeback.c | 14
-rw-r--r-- | mm/page_alloc.c | 319
-rw-r--r-- | mm/page_isolation.c | 43
-rw-r--r-- | mm/percpu.c | 2
-rw-r--r-- | mm/pgtable-generic.c | 50
-rw-r--r-- | mm/prio_tree.c | 208
-rw-r--r-- | mm/readahead.c | 14
-rw-r--r-- | mm/rmap.c | 179
-rw-r--r-- | mm/shmem.c | 180
-rw-r--r-- | mm/slab.c | 350
-rw-r--r-- | mm/slab.h | 19
-rw-r--r-- | mm/slab_common.c | 162
-rw-r--r-- | mm/slob.c | 97
-rw-r--r-- | mm/slub.c | 208
-rw-r--r-- | mm/swap.c | 13
-rw-r--r-- | mm/swapfile.c | 11
-rw-r--r-- | mm/truncate.c | 3
-rw-r--r-- | mm/util.c | 35
-rw-r--r-- | mm/vmalloc.c | 5
-rw-r--r-- | mm/vmscan.c | 111
-rw-r--r-- | mm/vmstat.c | 16
51 files changed, 2446 insertions(+), 1976 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS | |||
191 | # support for memory compaction | 191 | # support for memory compaction |
192 | config COMPACTION | 192 | config COMPACTION |
193 | bool "Allow for memory compaction" | 193 | bool "Allow for memory compaction" |
194 | def_bool y | ||
194 | select MIGRATION | 195 | select MIGRATION |
195 | depends on MMU | 196 | depends on MMU |
196 | help | 197 | help |
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS | |||
318 | 319 | ||
319 | config TRANSPARENT_HUGEPAGE | 320 | config TRANSPARENT_HUGEPAGE |
320 | bool "Transparent Hugepage Support" | 321 | bool "Transparent Hugepage Support" |
321 | depends on X86 && MMU | 322 | depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE |
322 | select COMPACTION | 323 | select COMPACTION |
323 | help | 324 | help |
324 | Transparent Hugepages allows the kernel to use huge pages and | 325 | Transparent Hugepages allows the kernel to use huge pages and |
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif | |||
14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
15 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o $(mmu-y) | 19 | compaction.o interval_tree.o $(mmu-y) |
20 | 20 | ||
21 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
22 | 22 | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b41823cc05e6..d3ca2b3ee176 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -158,16 +158,16 @@ static ssize_t read_ahead_kb_store(struct device *dev, | |||
158 | const char *buf, size_t count) | 158 | const char *buf, size_t count) |
159 | { | 159 | { |
160 | struct backing_dev_info *bdi = dev_get_drvdata(dev); | 160 | struct backing_dev_info *bdi = dev_get_drvdata(dev); |
161 | char *end; | ||
162 | unsigned long read_ahead_kb; | 161 | unsigned long read_ahead_kb; |
163 | ssize_t ret = -EINVAL; | 162 | ssize_t ret; |
164 | 163 | ||
165 | read_ahead_kb = simple_strtoul(buf, &end, 10); | 164 | ret = kstrtoul(buf, 10, &read_ahead_kb); |
166 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { | 165 | if (ret < 0) |
167 | bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); | 166 | return ret; |
168 | ret = count; | 167 | |
169 | } | 168 | bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); |
170 | return ret; | 169 | |
170 | return count; | ||
171 | } | 171 | } |
172 | 172 | ||
173 | #define K(pages) ((pages) << (PAGE_SHIFT - 10)) | 173 | #define K(pages) ((pages) << (PAGE_SHIFT - 10)) |
@@ -187,16 +187,17 @@ static ssize_t min_ratio_store(struct device *dev, | |||
187 | struct device_attribute *attr, const char *buf, size_t count) | 187 | struct device_attribute *attr, const char *buf, size_t count) |
188 | { | 188 | { |
189 | struct backing_dev_info *bdi = dev_get_drvdata(dev); | 189 | struct backing_dev_info *bdi = dev_get_drvdata(dev); |
190 | char *end; | ||
191 | unsigned int ratio; | 190 | unsigned int ratio; |
192 | ssize_t ret = -EINVAL; | 191 | ssize_t ret; |
192 | |||
193 | ret = kstrtouint(buf, 10, &ratio); | ||
194 | if (ret < 0) | ||
195 | return ret; | ||
196 | |||
197 | ret = bdi_set_min_ratio(bdi, ratio); | ||
198 | if (!ret) | ||
199 | ret = count; | ||
193 | 200 | ||
194 | ratio = simple_strtoul(buf, &end, 10); | ||
195 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { | ||
196 | ret = bdi_set_min_ratio(bdi, ratio); | ||
197 | if (!ret) | ||
198 | ret = count; | ||
199 | } | ||
200 | return ret; | 201 | return ret; |
201 | } | 202 | } |
202 | BDI_SHOW(min_ratio, bdi->min_ratio) | 203 | BDI_SHOW(min_ratio, bdi->min_ratio) |
@@ -205,16 +206,17 @@ static ssize_t max_ratio_store(struct device *dev, | |||
205 | struct device_attribute *attr, const char *buf, size_t count) | 206 | struct device_attribute *attr, const char *buf, size_t count) |
206 | { | 207 | { |
207 | struct backing_dev_info *bdi = dev_get_drvdata(dev); | 208 | struct backing_dev_info *bdi = dev_get_drvdata(dev); |
208 | char *end; | ||
209 | unsigned int ratio; | 209 | unsigned int ratio; |
210 | ssize_t ret = -EINVAL; | 210 | ssize_t ret; |
211 | |||
212 | ret = kstrtouint(buf, 10, &ratio); | ||
213 | if (ret < 0) | ||
214 | return ret; | ||
215 | |||
216 | ret = bdi_set_max_ratio(bdi, ratio); | ||
217 | if (!ret) | ||
218 | ret = count; | ||
211 | 219 | ||
212 | ratio = simple_strtoul(buf, &end, 10); | ||
213 | if (*buf && (end[0] == '\0' || (end[0] == '\n' && end[1] == '\0'))) { | ||
214 | ret = bdi_set_max_ratio(bdi, ratio); | ||
215 | if (!ret) | ||
216 | ret = count; | ||
217 | } | ||
218 | return ret; | 220 | return ret; |
219 | } | 221 | } |
220 | BDI_SHOW(max_ratio, bdi->max_ratio) | 222 | BDI_SHOW(max_ratio, bdi->max_ratio) |
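The backing-dev.c hunks above replace open-coded simple_strtoul() parsing in the bdi sysfs store handlers with kstrtoul()/kstrtouint(), which reject trailing garbage and report failure as a negative errno. A minimal sketch of that store pattern follows; `struct demo_dev` and `demo_limit_store` are hypothetical names standing in for the bdi fields, not part of the patch.

```c
#include <linux/device.h>
#include <linux/kernel.h>

/* Hypothetical per-device state; stands in for struct backing_dev_info. */
struct demo_dev {
	unsigned long limit;
};

static ssize_t demo_limit_store(struct device *dev,
				struct device_attribute *attr,
				const char *buf, size_t count)
{
	struct demo_dev *d = dev_get_drvdata(dev);
	unsigned long val;
	int ret;

	/* Fails with -EINVAL/-ERANGE instead of silently accepting junk. */
	ret = kstrtoul(buf, 10, &val);
	if (ret < 0)
		return ret;

	d->limit = val;
	return count;
}
```

The min_ratio and max_ratio handlers above repeat the same shape, handing the parsed value to bdi_set_min_ratio()/bdi_set_max_ratio() instead of storing it directly.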
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
198 | int order = ilog2(BITS_PER_LONG); | 198 | int order = ilog2(BITS_PER_LONG); |
199 | 199 | ||
200 | __free_pages_bootmem(pfn_to_page(start), order); | 200 | __free_pages_bootmem(pfn_to_page(start), order); |
201 | fixup_zone_present_pages(page_to_nid(pfn_to_page(start)), | ||
202 | start, start + BITS_PER_LONG); | ||
201 | count += BITS_PER_LONG; | 203 | count += BITS_PER_LONG; |
202 | start += BITS_PER_LONG; | 204 | start += BITS_PER_LONG; |
203 | } else { | 205 | } else { |
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
208 | if (vec & 1) { | 210 | if (vec & 1) { |
209 | page = pfn_to_page(start + off); | 211 | page = pfn_to_page(start + off); |
210 | __free_pages_bootmem(page, 0); | 212 | __free_pages_bootmem(page, 0); |
213 | fixup_zone_present_pages( | ||
214 | page_to_nid(page), | ||
215 | start + off, start + off + 1); | ||
211 | count++; | 216 | count++; |
212 | } | 217 | } |
213 | vec >>= 1; | 218 | vec >>= 1; |
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
221 | pages = bdata->node_low_pfn - bdata->node_min_pfn; | 226 | pages = bdata->node_low_pfn - bdata->node_min_pfn; |
222 | pages = bootmem_bootmap_pages(pages); | 227 | pages = bootmem_bootmap_pages(pages); |
223 | count += pages; | 228 | count += pages; |
224 | while (pages--) | 229 | while (pages--) { |
230 | fixup_zone_present_pages(page_to_nid(page), | ||
231 | page_to_pfn(page), page_to_pfn(page) + 1); | ||
225 | __free_pages_bootmem(page++, 0); | 232 | __free_pages_bootmem(page++, 0); |
233 | } | ||
226 | 234 | ||
227 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); | 235 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); |
228 | 236 | ||
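The bootmem.c hunks pair every __free_pages_bootmem() call with a fixup_zone_present_pages() call so the zone's present-page accounting tracks what is actually handed to the buddy allocator. A simplified sketch of that pairing, assuming the same helpers the patch uses (their declarations come from mm/internal.h and the headers bootmem.c already includes); `demo_release_range` is an illustrative name, not part of the patch.

```c
#include <linux/init.h>
#include <linux/mm.h>
#include "internal.h"	/* __free_pages_bootmem(), as mm/bootmem.c does */

/*
 * Free a pfn range page by page and report each released page to the
 * zone accounting, mirroring the pairing added in the hunks above.
 */
static void __init demo_release_range(unsigned long start_pfn,
				      unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page = pfn_to_page(pfn);

		__free_pages_bootmem(page, 0);
		fixup_zone_present_pages(page_to_nid(page), pfn, pfn + 1);
	}
}
```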
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..9eef55838fca 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype) | |||
50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
51 | } | 51 | } |
52 | 52 | ||
53 | #ifdef CONFIG_COMPACTION | ||
54 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | ||
55 | static inline bool isolation_suitable(struct compact_control *cc, | ||
56 | struct page *page) | ||
57 | { | ||
58 | if (cc->ignore_skip_hint) | ||
59 | return true; | ||
60 | |||
61 | return !get_pageblock_skip(page); | ||
62 | } | ||
63 | |||
64 | /* | ||
65 | * This function is called to clear all cached information on pageblocks that | ||
66 | * should be skipped for page isolation when the migrate and free page scanner | ||
67 | * meet. | ||
68 | */ | ||
69 | static void __reset_isolation_suitable(struct zone *zone) | ||
70 | { | ||
71 | unsigned long start_pfn = zone->zone_start_pfn; | ||
72 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
73 | unsigned long pfn; | ||
74 | |||
75 | zone->compact_cached_migrate_pfn = start_pfn; | ||
76 | zone->compact_cached_free_pfn = end_pfn; | ||
77 | zone->compact_blockskip_flush = false; | ||
78 | |||
79 | /* Walk the zone and mark every pageblock as suitable for isolation */ | ||
80 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
81 | struct page *page; | ||
82 | |||
83 | cond_resched(); | ||
84 | |||
85 | if (!pfn_valid(pfn)) | ||
86 | continue; | ||
87 | |||
88 | page = pfn_to_page(pfn); | ||
89 | if (zone != page_zone(page)) | ||
90 | continue; | ||
91 | |||
92 | clear_pageblock_skip(page); | ||
93 | } | ||
94 | } | ||
95 | |||
96 | void reset_isolation_suitable(pg_data_t *pgdat) | ||
97 | { | ||
98 | int zoneid; | ||
99 | |||
100 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | ||
101 | struct zone *zone = &pgdat->node_zones[zoneid]; | ||
102 | if (!populated_zone(zone)) | ||
103 | continue; | ||
104 | |||
105 | /* Only flush if a full compaction finished recently */ | ||
106 | if (zone->compact_blockskip_flush) | ||
107 | __reset_isolation_suitable(zone); | ||
108 | } | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * If no pages were isolated then mark this pageblock to be skipped in the | ||
113 | * future. The information is later cleared by __reset_isolation_suitable(). | ||
114 | */ | ||
115 | static void update_pageblock_skip(struct compact_control *cc, | ||
116 | struct page *page, unsigned long nr_isolated, | ||
117 | bool migrate_scanner) | ||
118 | { | ||
119 | struct zone *zone = cc->zone; | ||
120 | if (!page) | ||
121 | return; | ||
122 | |||
123 | if (!nr_isolated) { | ||
124 | unsigned long pfn = page_to_pfn(page); | ||
125 | set_pageblock_skip(page); | ||
126 | |||
127 | /* Update where compaction should restart */ | ||
128 | if (migrate_scanner) { | ||
129 | if (!cc->finished_update_migrate && | ||
130 | pfn > zone->compact_cached_migrate_pfn) | ||
131 | zone->compact_cached_migrate_pfn = pfn; | ||
132 | } else { | ||
133 | if (!cc->finished_update_free && | ||
134 | pfn < zone->compact_cached_free_pfn) | ||
135 | zone->compact_cached_free_pfn = pfn; | ||
136 | } | ||
137 | } | ||
138 | } | ||
139 | #else | ||
140 | static inline bool isolation_suitable(struct compact_control *cc, | ||
141 | struct page *page) | ||
142 | { | ||
143 | return true; | ||
144 | } | ||
145 | |||
146 | static void update_pageblock_skip(struct compact_control *cc, | ||
147 | struct page *page, unsigned long nr_isolated, | ||
148 | bool migrate_scanner) | ||
149 | { | ||
150 | } | ||
151 | #endif /* CONFIG_COMPACTION */ | ||
152 | |||
153 | static inline bool should_release_lock(spinlock_t *lock) | ||
154 | { | ||
155 | return need_resched() || spin_is_contended(lock); | ||
156 | } | ||
157 | |||
53 | /* | 158 | /* |
54 | * Compaction requires the taking of some coarse locks that are potentially | 159 | * Compaction requires the taking of some coarse locks that are potentially |
55 | * very heavily contended. Check if the process needs to be scheduled or | 160 | * very heavily contended. Check if the process needs to be scheduled or |
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype) | |||
62 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | 167 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, |
63 | bool locked, struct compact_control *cc) | 168 | bool locked, struct compact_control *cc) |
64 | { | 169 | { |
65 | if (need_resched() || spin_is_contended(lock)) { | 170 | if (should_release_lock(lock)) { |
66 | if (locked) { | 171 | if (locked) { |
67 | spin_unlock_irqrestore(lock, *flags); | 172 | spin_unlock_irqrestore(lock, *flags); |
68 | locked = false; | 173 | locked = false; |
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | |||
70 | 175 | ||
71 | /* async aborts if taking too long or contended */ | 176 | /* async aborts if taking too long or contended */ |
72 | if (!cc->sync) { | 177 | if (!cc->sync) { |
73 | if (cc->contended) | 178 | cc->contended = true; |
74 | *cc->contended = true; | ||
75 | return false; | 179 | return false; |
76 | } | 180 | } |
77 | 181 | ||
78 | cond_resched(); | 182 | cond_resched(); |
79 | if (fatal_signal_pending(current)) | ||
80 | return false; | ||
81 | } | 183 | } |
82 | 184 | ||
83 | if (!locked) | 185 | if (!locked) |
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock, | |||
91 | return compact_checklock_irqsave(lock, flags, false, cc); | 193 | return compact_checklock_irqsave(lock, flags, false, cc); |
92 | } | 194 | } |
93 | 195 | ||
196 | /* Returns true if the page is within a block suitable for migration to */ | ||
197 | static bool suitable_migration_target(struct page *page) | ||
198 | { | ||
199 | int migratetype = get_pageblock_migratetype(page); | ||
200 | |||
201 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
202 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
203 | return false; | ||
204 | |||
205 | /* If the page is a large free page, then allow migration */ | ||
206 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
207 | return true; | ||
208 | |||
209 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
210 | if (migrate_async_suitable(migratetype)) | ||
211 | return true; | ||
212 | |||
213 | /* Otherwise skip the block */ | ||
214 | return false; | ||
215 | } | ||
216 | |||
217 | static void compact_capture_page(struct compact_control *cc) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | int mtype, mtype_low, mtype_high; | ||
221 | |||
222 | if (!cc->page || *cc->page) | ||
223 | return; | ||
224 | |||
225 | /* | ||
226 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
227 | * regardless of the migratetype of the freelist is is captured from. | ||
228 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
229 | * allocation is typically at least a pageblock size and overall | ||
230 | * fragmentation is not impaired. Other allocation types must | ||
231 | * capture pages from their own migratelist because otherwise they | ||
232 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
233 | * difficult to move pages and making fragmentation worse overall. | ||
234 | */ | ||
235 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
236 | mtype_low = 0; | ||
237 | mtype_high = MIGRATE_PCPTYPES; | ||
238 | } else { | ||
239 | mtype_low = cc->migratetype; | ||
240 | mtype_high = cc->migratetype + 1; | ||
241 | } | ||
242 | |||
243 | /* Speculatively examine the free lists without zone lock */ | ||
244 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
245 | int order; | ||
246 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
247 | struct page *page; | ||
248 | struct free_area *area; | ||
249 | area = &(cc->zone->free_area[order]); | ||
250 | if (list_empty(&area->free_list[mtype])) | ||
251 | continue; | ||
252 | |||
253 | /* Take the lock and attempt capture of the page */ | ||
254 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
255 | return; | ||
256 | if (!list_empty(&area->free_list[mtype])) { | ||
257 | page = list_entry(area->free_list[mtype].next, | ||
258 | struct page, lru); | ||
259 | if (capture_free_page(page, cc->order, mtype)) { | ||
260 | spin_unlock_irqrestore(&cc->zone->lock, | ||
261 | flags); | ||
262 | *cc->page = page; | ||
263 | return; | ||
264 | } | ||
265 | } | ||
266 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
267 | } | ||
268 | } | ||
269 | } | ||
270 | |||
94 | /* | 271 | /* |
95 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 272 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
96 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 273 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
97 | * pages inside of the pageblock (even though it may still end up isolating | 274 | * pages inside of the pageblock (even though it may still end up isolating |
98 | * some pages). | 275 | * some pages). |
99 | */ | 276 | */ |
100 | static unsigned long isolate_freepages_block(unsigned long blockpfn, | 277 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
278 | unsigned long blockpfn, | ||
101 | unsigned long end_pfn, | 279 | unsigned long end_pfn, |
102 | struct list_head *freelist, | 280 | struct list_head *freelist, |
103 | bool strict) | 281 | bool strict) |
104 | { | 282 | { |
105 | int nr_scanned = 0, total_isolated = 0; | 283 | int nr_scanned = 0, total_isolated = 0; |
106 | struct page *cursor; | 284 | struct page *cursor, *valid_page = NULL; |
285 | unsigned long nr_strict_required = end_pfn - blockpfn; | ||
286 | unsigned long flags; | ||
287 | bool locked = false; | ||
107 | 288 | ||
108 | cursor = pfn_to_page(blockpfn); | 289 | cursor = pfn_to_page(blockpfn); |
109 | 290 | ||
110 | /* Isolate free pages. This assumes the block is valid */ | 291 | /* Isolate free pages. */ |
111 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | 292 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { |
112 | int isolated, i; | 293 | int isolated, i; |
113 | struct page *page = cursor; | 294 | struct page *page = cursor; |
114 | 295 | ||
115 | if (!pfn_valid_within(blockpfn)) { | ||
116 | if (strict) | ||
117 | return 0; | ||
118 | continue; | ||
119 | } | ||
120 | nr_scanned++; | 296 | nr_scanned++; |
297 | if (!pfn_valid_within(blockpfn)) | ||
298 | continue; | ||
299 | if (!valid_page) | ||
300 | valid_page = page; | ||
301 | if (!PageBuddy(page)) | ||
302 | continue; | ||
121 | 303 | ||
122 | if (!PageBuddy(page)) { | 304 | /* |
123 | if (strict) | 305 | * The zone lock must be held to isolate freepages. |
124 | return 0; | 306 | * Unfortunately this is a very coarse lock and can be |
307 | * heavily contended if there are parallel allocations | ||
308 | * or parallel compactions. For async compaction do not | ||
309 | * spin on the lock and we acquire the lock as late as | ||
310 | * possible. | ||
311 | */ | ||
312 | locked = compact_checklock_irqsave(&cc->zone->lock, &flags, | ||
313 | locked, cc); | ||
314 | if (!locked) | ||
315 | break; | ||
316 | |||
317 | /* Recheck this is a suitable migration target under lock */ | ||
318 | if (!strict && !suitable_migration_target(page)) | ||
319 | break; | ||
320 | |||
321 | /* Recheck this is a buddy page under lock */ | ||
322 | if (!PageBuddy(page)) | ||
125 | continue; | 323 | continue; |
126 | } | ||
127 | 324 | ||
128 | /* Found a free page, break it into order-0 pages */ | 325 | /* Found a free page, break it into order-0 pages */ |
129 | isolated = split_free_page(page); | 326 | isolated = split_free_page(page); |
130 | if (!isolated && strict) | 327 | if (!isolated && strict) |
131 | return 0; | 328 | break; |
132 | total_isolated += isolated; | 329 | total_isolated += isolated; |
133 | for (i = 0; i < isolated; i++) { | 330 | for (i = 0; i < isolated; i++) { |
134 | list_add(&page->lru, freelist); | 331 | list_add(&page->lru, freelist); |
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, | |||
143 | } | 340 | } |
144 | 341 | ||
145 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | 342 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); |
343 | |||
344 | /* | ||
345 | * If strict isolation is requested by CMA then check that all the | ||
346 | * pages requested were isolated. If there were any failures, 0 is | ||
347 | * returned and CMA will fail. | ||
348 | */ | ||
349 | if (strict && nr_strict_required > total_isolated) | ||
350 | total_isolated = 0; | ||
351 | |||
352 | if (locked) | ||
353 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
354 | |||
355 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
356 | if (blockpfn == end_pfn) | ||
357 | update_pageblock_skip(cc, valid_page, total_isolated, false); | ||
358 | |||
146 | return total_isolated; | 359 | return total_isolated; |
147 | } | 360 | } |
148 | 361 | ||
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn, | |||
160 | * a free page). | 373 | * a free page). |
161 | */ | 374 | */ |
162 | unsigned long | 375 | unsigned long |
163 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | 376 | isolate_freepages_range(struct compact_control *cc, |
377 | unsigned long start_pfn, unsigned long end_pfn) | ||
164 | { | 378 | { |
165 | unsigned long isolated, pfn, block_end_pfn, flags; | 379 | unsigned long isolated, pfn, block_end_pfn; |
166 | struct zone *zone = NULL; | ||
167 | LIST_HEAD(freelist); | 380 | LIST_HEAD(freelist); |
168 | 381 | ||
169 | if (pfn_valid(start_pfn)) | ||
170 | zone = page_zone(pfn_to_page(start_pfn)); | ||
171 | |||
172 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { | 382 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { |
173 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) | 383 | if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) |
174 | break; | 384 | break; |
175 | 385 | ||
176 | /* | 386 | /* |
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | |||
180 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 390 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
181 | block_end_pfn = min(block_end_pfn, end_pfn); | 391 | block_end_pfn = min(block_end_pfn, end_pfn); |
182 | 392 | ||
183 | spin_lock_irqsave(&zone->lock, flags); | 393 | isolated = isolate_freepages_block(cc, pfn, block_end_pfn, |
184 | isolated = isolate_freepages_block(pfn, block_end_pfn, | ||
185 | &freelist, true); | 394 | &freelist, true); |
186 | spin_unlock_irqrestore(&zone->lock, flags); | ||
187 | 395 | ||
188 | /* | 396 | /* |
189 | * In strict mode, isolate_freepages_block() returns 0 if | 397 | * In strict mode, isolate_freepages_block() returns 0 if |
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone) | |||
253 | * @cc: Compaction control structure. | 461 | * @cc: Compaction control structure. |
254 | * @low_pfn: The first PFN of the range. | 462 | * @low_pfn: The first PFN of the range. |
255 | * @end_pfn: The one-past-the-last PFN of the range. | 463 | * @end_pfn: The one-past-the-last PFN of the range. |
464 | * @unevictable: true if it allows to isolate unevictable pages | ||
256 | * | 465 | * |
257 | * Isolate all pages that can be migrated from the range specified by | 466 | * Isolate all pages that can be migrated from the range specified by |
258 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal | 467 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal |
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone) | |||
268 | */ | 477 | */ |
269 | unsigned long | 478 | unsigned long |
270 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 479 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
271 | unsigned long low_pfn, unsigned long end_pfn) | 480 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable) |
272 | { | 481 | { |
273 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 482 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
274 | unsigned long nr_scanned = 0, nr_isolated = 0; | 483 | unsigned long nr_scanned = 0, nr_isolated = 0; |
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
276 | isolate_mode_t mode = 0; | 485 | isolate_mode_t mode = 0; |
277 | struct lruvec *lruvec; | 486 | struct lruvec *lruvec; |
278 | unsigned long flags; | 487 | unsigned long flags; |
279 | bool locked; | 488 | bool locked = false; |
489 | struct page *page = NULL, *valid_page = NULL; | ||
280 | 490 | ||
281 | /* | 491 | /* |
282 | * Ensure that there are not too many pages isolated from the LRU | 492 | * Ensure that there are not too many pages isolated from the LRU |
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
296 | 506 | ||
297 | /* Time to isolate some pages for migration */ | 507 | /* Time to isolate some pages for migration */ |
298 | cond_resched(); | 508 | cond_resched(); |
299 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
300 | locked = true; | ||
301 | for (; low_pfn < end_pfn; low_pfn++) { | 509 | for (; low_pfn < end_pfn; low_pfn++) { |
302 | struct page *page; | ||
303 | |||
304 | /* give a chance to irqs before checking need_resched() */ | 510 | /* give a chance to irqs before checking need_resched() */ |
305 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 511 | if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) { |
306 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 512 | if (should_release_lock(&zone->lru_lock)) { |
307 | locked = false; | 513 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
514 | locked = false; | ||
515 | } | ||
308 | } | 516 | } |
309 | 517 | ||
310 | /* Check if it is ok to still hold the lock */ | ||
311 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
312 | locked, cc); | ||
313 | if (!locked) | ||
314 | break; | ||
315 | |||
316 | /* | 518 | /* |
317 | * migrate_pfn does not necessarily start aligned to a | 519 | * migrate_pfn does not necessarily start aligned to a |
318 | * pageblock. Ensure that pfn_valid is called when moving | 520 | * pageblock. Ensure that pfn_valid is called when moving |
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
340 | if (page_zone(page) != zone) | 542 | if (page_zone(page) != zone) |
341 | continue; | 543 | continue; |
342 | 544 | ||
545 | if (!valid_page) | ||
546 | valid_page = page; | ||
547 | |||
548 | /* If isolation recently failed, do not retry */ | ||
549 | pageblock_nr = low_pfn >> pageblock_order; | ||
550 | if (!isolation_suitable(cc, page)) | ||
551 | goto next_pageblock; | ||
552 | |||
343 | /* Skip if free */ | 553 | /* Skip if free */ |
344 | if (PageBuddy(page)) | 554 | if (PageBuddy(page)) |
345 | continue; | 555 | continue; |
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
349 | * migration is optimistic to see if the minimum amount of work | 559 | * migration is optimistic to see if the minimum amount of work |
350 | * satisfies the allocation | 560 | * satisfies the allocation |
351 | */ | 561 | */ |
352 | pageblock_nr = low_pfn >> pageblock_order; | ||
353 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 562 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
354 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 563 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
355 | low_pfn += pageblock_nr_pages; | 564 | cc->finished_update_migrate = true; |
356 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 565 | goto next_pageblock; |
357 | last_pageblock_nr = pageblock_nr; | ||
358 | continue; | ||
359 | } | 566 | } |
360 | 567 | ||
568 | /* Check may be lockless but that's ok as we recheck later */ | ||
361 | if (!PageLRU(page)) | 569 | if (!PageLRU(page)) |
362 | continue; | 570 | continue; |
363 | 571 | ||
364 | /* | 572 | /* |
365 | * PageLRU is set, and lru_lock excludes isolation, | 573 | * PageLRU is set. lru_lock normally excludes isolation |
366 | * splitting and collapsing (collapsing has already | 574 | * splitting and collapsing (collapsing has already happened |
367 | * happened if PageLRU is set). | 575 | * if PageLRU is set) but the lock is not necessarily taken |
576 | * here and it is wasteful to take it just to check transhuge. | ||
577 | * Check TransHuge without lock and skip the whole pageblock if | ||
578 | * it's either a transhuge or hugetlbfs page, as calling | ||
579 | * compound_order() without preventing THP from splitting the | ||
580 | * page underneath us may return surprising results. | ||
368 | */ | 581 | */ |
369 | if (PageTransHuge(page)) { | 582 | if (PageTransHuge(page)) { |
583 | if (!locked) | ||
584 | goto next_pageblock; | ||
585 | low_pfn += (1 << compound_order(page)) - 1; | ||
586 | continue; | ||
587 | } | ||
588 | |||
589 | /* Check if it is ok to still hold the lock */ | ||
590 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | ||
591 | locked, cc); | ||
592 | if (!locked || fatal_signal_pending(current)) | ||
593 | break; | ||
594 | |||
595 | /* Recheck PageLRU and PageTransHuge under lock */ | ||
596 | if (!PageLRU(page)) | ||
597 | continue; | ||
598 | if (PageTransHuge(page)) { | ||
370 | low_pfn += (1 << compound_order(page)) - 1; | 599 | low_pfn += (1 << compound_order(page)) - 1; |
371 | continue; | 600 | continue; |
372 | } | 601 | } |
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
374 | if (!cc->sync) | 603 | if (!cc->sync) |
375 | mode |= ISOLATE_ASYNC_MIGRATE; | 604 | mode |= ISOLATE_ASYNC_MIGRATE; |
376 | 605 | ||
606 | if (unevictable) | ||
607 | mode |= ISOLATE_UNEVICTABLE; | ||
608 | |||
377 | lruvec = mem_cgroup_page_lruvec(page, zone); | 609 | lruvec = mem_cgroup_page_lruvec(page, zone); |
378 | 610 | ||
379 | /* Try isolate the page */ | 611 | /* Try isolate the page */ |
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
383 | VM_BUG_ON(PageTransCompound(page)); | 615 | VM_BUG_ON(PageTransCompound(page)); |
384 | 616 | ||
385 | /* Successfully isolated */ | 617 | /* Successfully isolated */ |
618 | cc->finished_update_migrate = true; | ||
386 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 619 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
387 | list_add(&page->lru, migratelist); | 620 | list_add(&page->lru, migratelist); |
388 | cc->nr_migratepages++; | 621 | cc->nr_migratepages++; |
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
393 | ++low_pfn; | 626 | ++low_pfn; |
394 | break; | 627 | break; |
395 | } | 628 | } |
629 | |||
630 | continue; | ||
631 | |||
632 | next_pageblock: | ||
633 | low_pfn += pageblock_nr_pages; | ||
634 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | ||
635 | last_pageblock_nr = pageblock_nr; | ||
396 | } | 636 | } |
397 | 637 | ||
398 | acct_isolated(zone, locked, cc); | 638 | acct_isolated(zone, locked, cc); |
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
400 | if (locked) | 640 | if (locked) |
401 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 641 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
402 | 642 | ||
643 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
644 | if (low_pfn == end_pfn) | ||
645 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | ||
646 | |||
403 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 647 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
404 | 648 | ||
405 | return low_pfn; | 649 | return low_pfn; |
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
407 | 651 | ||
408 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 652 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
409 | #ifdef CONFIG_COMPACTION | 653 | #ifdef CONFIG_COMPACTION |
410 | |||
411 | /* Returns true if the page is within a block suitable for migration to */ | ||
412 | static bool suitable_migration_target(struct page *page) | ||
413 | { | ||
414 | |||
415 | int migratetype = get_pageblock_migratetype(page); | ||
416 | |||
417 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | ||
418 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | ||
419 | return false; | ||
420 | |||
421 | /* If the page is a large free page, then allow migration */ | ||
422 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | ||
423 | return true; | ||
424 | |||
425 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
426 | if (migrate_async_suitable(migratetype)) | ||
427 | return true; | ||
428 | |||
429 | /* Otherwise skip the block */ | ||
430 | return false; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
435 | * point for full compaction of a zone. Compaction searches for free pages from | ||
436 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
437 | * page block. | ||
438 | */ | ||
439 | static unsigned long start_free_pfn(struct zone *zone) | ||
440 | { | ||
441 | unsigned long free_pfn; | ||
442 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
443 | free_pfn &= ~(pageblock_nr_pages-1); | ||
444 | return free_pfn; | ||
445 | } | ||
446 | |||
447 | /* | 654 | /* |
448 | * Based on information in the current compact_control, find blocks | 655 | * Based on information in the current compact_control, find blocks |
449 | * suitable for isolating free pages from and then isolate them. | 656 | * suitable for isolating free pages from and then isolate them. |
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone, | |||
453 | { | 660 | { |
454 | struct page *page; | 661 | struct page *page; |
455 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 662 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; |
456 | unsigned long flags; | ||
457 | int nr_freepages = cc->nr_freepages; | 663 | int nr_freepages = cc->nr_freepages; |
458 | struct list_head *freelist = &cc->freepages; | 664 | struct list_head *freelist = &cc->freepages; |
459 | 665 | ||
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone, | |||
501 | if (!suitable_migration_target(page)) | 707 | if (!suitable_migration_target(page)) |
502 | continue; | 708 | continue; |
503 | 709 | ||
504 | /* | 710 | /* If isolation recently failed, do not retry */ |
505 | * Found a block suitable for isolating free pages from. Now | 711 | if (!isolation_suitable(cc, page)) |
506 | * we disabled interrupts, double check things are ok and | 712 | continue; |
507 | * isolate the pages. This is to minimise the time IRQs | ||
508 | * are disabled | ||
509 | */ | ||
510 | isolated = 0; | ||
511 | 713 | ||
512 | /* | 714 | /* Found a block suitable for isolating free pages from */ |
513 | * The zone lock must be held to isolate freepages. This | 715 | isolated = 0; |
514 | * unfortunately this is a very coarse lock and can be | 716 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); |
515 | * heavily contended if there are parallel allocations | 717 | isolated = isolate_freepages_block(cc, pfn, end_pfn, |
516 | * or parallel compactions. For async compaction do not | 718 | freelist, false); |
517 | * spin on the lock | 719 | nr_freepages += isolated; |
518 | */ | ||
519 | if (!compact_trylock_irqsave(&zone->lock, &flags, cc)) | ||
520 | break; | ||
521 | if (suitable_migration_target(page)) { | ||
522 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | ||
523 | isolated = isolate_freepages_block(pfn, end_pfn, | ||
524 | freelist, false); | ||
525 | nr_freepages += isolated; | ||
526 | } | ||
527 | spin_unlock_irqrestore(&zone->lock, flags); | ||
528 | 720 | ||
529 | /* | 721 | /* |
530 | * Record the highest PFN we isolated pages from. When next | 722 | * Record the highest PFN we isolated pages from. When next |
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone, | |||
532 | * page migration may have returned some pages to the allocator | 724 | * page migration may have returned some pages to the allocator |
533 | */ | 725 | */ |
534 | if (isolated) { | 726 | if (isolated) { |
727 | cc->finished_update_free = true; | ||
535 | high_pfn = max(high_pfn, pfn); | 728 | high_pfn = max(high_pfn, pfn); |
536 | |||
537 | /* | ||
538 | * If the free scanner has wrapped, update | ||
539 | * compact_cached_free_pfn to point to the highest | ||
540 | * pageblock with free pages. This reduces excessive | ||
541 | * scanning of full pageblocks near the end of the | ||
542 | * zone | ||
543 | */ | ||
544 | if (cc->order > 0 && cc->wrapped) | ||
545 | zone->compact_cached_free_pfn = high_pfn; | ||
546 | } | 729 | } |
547 | } | 730 | } |
548 | 731 | ||
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone, | |||
551 | 734 | ||
552 | cc->free_pfn = high_pfn; | 735 | cc->free_pfn = high_pfn; |
553 | cc->nr_freepages = nr_freepages; | 736 | cc->nr_freepages = nr_freepages; |
554 | |||
555 | /* If compact_cached_free_pfn is reset then set it now */ | ||
556 | if (cc->order > 0 && !cc->wrapped && | ||
557 | zone->compact_cached_free_pfn == start_free_pfn(zone)) | ||
558 | zone->compact_cached_free_pfn = high_pfn; | ||
559 | } | 737 | } |
560 | 738 | ||
561 | /* | 739 | /* |
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
633 | } | 811 | } |
634 | 812 | ||
635 | /* Perform the isolation */ | 813 | /* Perform the isolation */ |
636 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); | 814 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); |
637 | if (!low_pfn) | 815 | if (!low_pfn || cc->contended) |
638 | return ISOLATE_ABORT; | 816 | return ISOLATE_ABORT; |
639 | 817 | ||
640 | cc->migrate_pfn = low_pfn; | 818 | cc->migrate_pfn = low_pfn; |
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
645 | static int compact_finished(struct zone *zone, | 823 | static int compact_finished(struct zone *zone, |
646 | struct compact_control *cc) | 824 | struct compact_control *cc) |
647 | { | 825 | { |
648 | unsigned int order; | ||
649 | unsigned long watermark; | 826 | unsigned long watermark; |
650 | 827 | ||
651 | if (fatal_signal_pending(current)) | 828 | if (fatal_signal_pending(current)) |
652 | return COMPACT_PARTIAL; | 829 | return COMPACT_PARTIAL; |
653 | 830 | ||
654 | /* | 831 | /* Compaction run completes if the migrate and free scanner meet */ |
655 | * A full (order == -1) compaction run starts at the beginning and | ||
656 | * end of a zone; it completes when the migrate and free scanner meet. | ||
657 | * A partial (order > 0) compaction can start with the free scanner | ||
658 | * at a random point in the zone, and may have to restart. | ||
659 | */ | ||
660 | if (cc->free_pfn <= cc->migrate_pfn) { | 832 | if (cc->free_pfn <= cc->migrate_pfn) { |
661 | if (cc->order > 0 && !cc->wrapped) { | 833 | /* |
662 | /* We started partway through; restart at the end. */ | 834 | * Mark that the PG_migrate_skip information should be cleared |
663 | unsigned long free_pfn = start_free_pfn(zone); | 835 | * by kswapd when it goes to sleep. kswapd does not set the |
664 | zone->compact_cached_free_pfn = free_pfn; | 836 | * flag itself as the decision to be clear should be directly |
665 | cc->free_pfn = free_pfn; | 837 | * based on an allocation request. |
666 | cc->wrapped = 1; | 838 | */ |
667 | return COMPACT_CONTINUE; | 839 | if (!current_is_kswapd()) |
668 | } | 840 | zone->compact_blockskip_flush = true; |
669 | return COMPACT_COMPLETE; | ||
670 | } | ||
671 | 841 | ||
672 | /* We wrapped around and ended up where we started. */ | ||
673 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
674 | return COMPACT_COMPLETE; | 842 | return COMPACT_COMPLETE; |
843 | } | ||
675 | 844 | ||
676 | /* | 845 | /* |
677 | * order == -1 is expected when compacting via | 846 | * order == -1 is expected when compacting via |
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone, | |||
688 | return COMPACT_CONTINUE; | 857 | return COMPACT_CONTINUE; |
689 | 858 | ||
690 | /* Direct compactor: Is a suitable page free? */ | 859 | /* Direct compactor: Is a suitable page free? */ |
691 | for (order = cc->order; order < MAX_ORDER; order++) { | 860 | if (cc->page) { |
692 | /* Job done if page is free of the right migratetype */ | 861 | /* Was a suitable page captured? */ |
693 | if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) | 862 | if (*cc->page) |
694 | return COMPACT_PARTIAL; | ||
695 | |||
696 | /* Job done if allocation would set block type */ | ||
697 | if (order >= pageblock_order && zone->free_area[order].nr_free) | ||
698 | return COMPACT_PARTIAL; | 863 | return COMPACT_PARTIAL; |
864 | } else { | ||
865 | unsigned int order; | ||
866 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
867 | struct free_area *area = &zone->free_area[cc->order]; | ||
868 | /* Job done if page is free of the right migratetype */ | ||
869 | if (!list_empty(&area->free_list[cc->migratetype])) | ||
870 | return COMPACT_PARTIAL; | ||
871 | |||
872 | /* Job done if allocation would set block type */ | ||
873 | if (cc->order >= pageblock_order && area->nr_free) | ||
874 | return COMPACT_PARTIAL; | ||
875 | } | ||
699 | } | 876 | } |
700 | 877 | ||
701 | return COMPACT_CONTINUE; | 878 | return COMPACT_CONTINUE; |
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
754 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 931 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
755 | { | 932 | { |
756 | int ret; | 933 | int ret; |
934 | unsigned long start_pfn = zone->zone_start_pfn; | ||
935 | unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
757 | 936 | ||
758 | ret = compaction_suitable(zone, cc->order); | 937 | ret = compaction_suitable(zone, cc->order); |
759 | switch (ret) { | 938 | switch (ret) { |
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
766 | ; | 945 | ; |
767 | } | 946 | } |
768 | 947 | ||
769 | /* Setup to move all movable pages to the end of the zone */ | 948 | /* |
770 | cc->migrate_pfn = zone->zone_start_pfn; | 949 | * Setup to move all movable pages to the end of the zone. Used cached |
771 | 950 | * information on where the scanners should start but check that it | |
772 | if (cc->order > 0) { | 951 | * is initialised by ensuring the values are within zone boundaries. |
773 | /* Incremental compaction. Start where the last one stopped. */ | 952 | */ |
774 | cc->free_pfn = zone->compact_cached_free_pfn; | 953 | cc->migrate_pfn = zone->compact_cached_migrate_pfn; |
775 | cc->start_free_pfn = cc->free_pfn; | 954 | cc->free_pfn = zone->compact_cached_free_pfn; |
776 | } else { | 955 | if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { |
777 | /* Order == -1 starts at the end of the zone. */ | 956 | cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); |
778 | cc->free_pfn = start_free_pfn(zone); | 957 | zone->compact_cached_free_pfn = cc->free_pfn; |
958 | } | ||
959 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { | ||
960 | cc->migrate_pfn = start_pfn; | ||
961 | zone->compact_cached_migrate_pfn = cc->migrate_pfn; | ||
779 | } | 962 | } |
780 | 963 | ||
964 | /* | ||
965 | * Clear pageblock skip if there were failures recently and compaction | ||
966 | * is about to be retried after being deferred. kswapd does not do | ||
967 | * this reset as it'll reset the cached information when going to sleep. | ||
968 | */ | ||
969 | if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) | ||
970 | __reset_isolation_suitable(zone); | ||
971 | |||
781 | migrate_prep_local(); | 972 | migrate_prep_local(); |
782 | 973 | ||
783 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 974 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
787 | switch (isolate_migratepages(zone, cc)) { | 978 | switch (isolate_migratepages(zone, cc)) { |
788 | case ISOLATE_ABORT: | 979 | case ISOLATE_ABORT: |
789 | ret = COMPACT_PARTIAL; | 980 | ret = COMPACT_PARTIAL; |
981 | putback_lru_pages(&cc->migratepages); | ||
982 | cc->nr_migratepages = 0; | ||
790 | goto out; | 983 | goto out; |
791 | case ISOLATE_NONE: | 984 | case ISOLATE_NONE: |
792 | continue; | 985 | continue; |
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
817 | goto out; | 1010 | goto out; |
818 | } | 1011 | } |
819 | } | 1012 | } |
1013 | |||
1014 | /* Capture a page now if it is a suitable size */ | ||
1015 | compact_capture_page(cc); | ||
820 | } | 1016 | } |
821 | 1017 | ||
822 | out: | 1018 | out: |
@@ -829,8 +1025,10 @@ out: | |||
829 | 1025 | ||
830 | static unsigned long compact_zone_order(struct zone *zone, | 1026 | static unsigned long compact_zone_order(struct zone *zone, |
831 | int order, gfp_t gfp_mask, | 1027 | int order, gfp_t gfp_mask, |
832 | bool sync, bool *contended) | 1028 | bool sync, bool *contended, |
1029 | struct page **page) | ||
833 | { | 1030 | { |
1031 | unsigned long ret; | ||
834 | struct compact_control cc = { | 1032 | struct compact_control cc = { |
835 | .nr_freepages = 0, | 1033 | .nr_freepages = 0, |
836 | .nr_migratepages = 0, | 1034 | .nr_migratepages = 0, |
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
838 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1036 | .migratetype = allocflags_to_migratetype(gfp_mask), |
839 | .zone = zone, | 1037 | .zone = zone, |
840 | .sync = sync, | 1038 | .sync = sync, |
841 | .contended = contended, | 1039 | .page = page, |
842 | }; | 1040 | }; |
843 | INIT_LIST_HEAD(&cc.freepages); | 1041 | INIT_LIST_HEAD(&cc.freepages); |
844 | INIT_LIST_HEAD(&cc.migratepages); | 1042 | INIT_LIST_HEAD(&cc.migratepages); |
845 | 1043 | ||
846 | return compact_zone(zone, &cc); | 1044 | ret = compact_zone(zone, &cc); |
1045 | |||
1046 | VM_BUG_ON(!list_empty(&cc.freepages)); | ||
1047 | VM_BUG_ON(!list_empty(&cc.migratepages)); | ||
1048 | |||
1049 | *contended = cc.contended; | ||
1050 | return ret; | ||
847 | } | 1051 | } |
848 | 1052 | ||
849 | int sysctl_extfrag_threshold = 500; | 1053 | int sysctl_extfrag_threshold = 500; |
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500; | |||
855 | * @gfp_mask: The GFP mask of the current allocation | 1059 | * @gfp_mask: The GFP mask of the current allocation |
856 | * @nodemask: The allowed nodes to allocate from | 1060 | * @nodemask: The allowed nodes to allocate from |
857 | * @sync: Whether migration is synchronous or not | 1061 | * @sync: Whether migration is synchronous or not |
1062 | * @contended: Return value that is true if compaction was aborted due to lock contention | ||
1063 | * @page: Optionally capture a free page of the requested order during compaction | ||
858 | * | 1064 | * |
859 | * This is the main entry point for direct page compaction. | 1065 | * This is the main entry point for direct page compaction. |
860 | */ | 1066 | */ |
861 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1067 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
862 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1068 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
863 | bool sync, bool *contended) | 1069 | bool sync, bool *contended, struct page **page) |
864 | { | 1070 | { |
865 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1071 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
866 | int may_enter_fs = gfp_mask & __GFP_FS; | 1072 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
868 | struct zoneref *z; | 1074 | struct zoneref *z; |
869 | struct zone *zone; | 1075 | struct zone *zone; |
870 | int rc = COMPACT_SKIPPED; | 1076 | int rc = COMPACT_SKIPPED; |
1077 | int alloc_flags = 0; | ||
871 | 1078 | ||
872 | /* | 1079 | /* Check if the GFP flags allow compaction */ |
873 | * Check whether it is worth even starting compaction. The order check is | ||
874 | * made because an assumption is made that the page allocator can satisfy | ||
875 | * the "cheaper" orders without taking special steps | ||
876 | */ | ||
877 | if (!order || !may_enter_fs || !may_perform_io) | 1080 | if (!order || !may_enter_fs || !may_perform_io) |
878 | return rc; | 1081 | return rc; |
879 | 1082 | ||
880 | count_vm_event(COMPACTSTALL); | 1083 | count_vm_event(COMPACTSTALL); |
881 | 1084 | ||
1085 | #ifdef CONFIG_CMA | ||
1086 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
1087 | alloc_flags |= ALLOC_CMA; | ||
1088 | #endif | ||
882 | /* Compact each zone in the list */ | 1089 | /* Compact each zone in the list */ |
883 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1090 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
884 | nodemask) { | 1091 | nodemask) { |
885 | int status; | 1092 | int status; |
886 | 1093 | ||
887 | status = compact_zone_order(zone, order, gfp_mask, sync, | 1094 | status = compact_zone_order(zone, order, gfp_mask, sync, |
888 | contended); | 1095 | contended, page); |
889 | rc = max(status, rc); | 1096 | rc = max(status, rc); |
890 | 1097 | ||
891 | /* If a normal allocation would succeed, stop compacting */ | 1098 | /* If a normal allocation would succeed, stop compacting */ |
892 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | 1099 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, |
1100 | alloc_flags)) | ||
893 | break; | 1101 | break; |
894 | } | 1102 | } |
895 | 1103 | ||
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) | |||
940 | struct compact_control cc = { | 1148 | struct compact_control cc = { |
941 | .order = order, | 1149 | .order = order, |
942 | .sync = false, | 1150 | .sync = false, |
1151 | .page = NULL, | ||
943 | }; | 1152 | }; |
944 | 1153 | ||
945 | return __compact_pgdat(pgdat, &cc); | 1154 | return __compact_pgdat(pgdat, &cc); |
@@ -950,6 +1159,7 @@ static int compact_node(int nid) | |||
950 | struct compact_control cc = { | 1159 | struct compact_control cc = { |
951 | .order = -1, | 1160 | .order = -1, |
952 | .sync = true, | 1161 | .sync = true, |
1162 | .page = NULL, | ||
953 | }; | 1163 | }; |
954 | 1164 | ||
955 | return __compact_pgdat(NODE_DATA(nid), &cc); | 1165 | return __compact_pgdat(NODE_DATA(nid), &cc); |
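Most of the compaction.c changes above teach compaction to remember where isolation recently failed: update_pageblock_skip() marks a pageblock, later scans consult isolation_suitable(), and __reset_isolation_suitable() clears the hints (and the cached scanner positions) once the migrate and free scanners meet or when kswapd goes to sleep. The following is a toy user-space model of that caching idea, with illustrative names and fabricated isolation results, intended only to show the control flow, not the kernel implementation.

```c
/*
 * Toy model: blocks where isolation recently failed are marked and
 * skipped on later passes until the hints are reset.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_BLOCKS 8

static bool skip_hint[NR_BLOCKS];	/* ~ per-pageblock skip bit */

static void reset_isolation_hints(void)	/* ~ __reset_isolation_suitable() */
{
	memset(skip_hint, 0, sizeof(skip_hint));
}

/* Pages isolated from one block; 0 simulates a failed block. */
static int isolate_block(int block)
{
	return (block % 3) ? 4 : 0;	/* fake result for the demo */
}

static void scan_blocks(void)
{
	for (int b = 0; b < NR_BLOCKS; b++) {
		if (skip_hint[b])	/* ~ !isolation_suitable() */
			continue;
		if (isolate_block(b) == 0)
			skip_hint[b] = true;	/* ~ update_pageblock_skip() */
	}
}

int main(void)
{
	scan_blocks();			/* first pass marks failing blocks */
	scan_blocks();			/* second pass skips them */
	for (int b = 0; b < NR_BLOCKS; b++)
		printf("block %d skip=%d\n", b, skip_hint[b]);
	reset_isolation_hints();	/* ~ when the scanners meet */
	return 0;
}
```

The kernel version additionally caches compact_cached_migrate_pfn and compact_cached_free_pfn so the next compaction attempt restarts near where the previous one stopped, as the compact_zone() hunk above shows.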
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 9b75a045dbf4..a47f0f50c89f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,7 +26,7 @@ | |||
26 | */ | 26 | */ |
27 | SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | 27 | SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) |
28 | { | 28 | { |
29 | struct file *file = fget(fd); | 29 | struct fd f = fdget(fd); |
30 | struct address_space *mapping; | 30 | struct address_space *mapping; |
31 | struct backing_dev_info *bdi; | 31 | struct backing_dev_info *bdi; |
32 | loff_t endbyte; /* inclusive */ | 32 | loff_t endbyte; /* inclusive */ |
@@ -35,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
35 | unsigned long nrpages; | 35 | unsigned long nrpages; |
36 | int ret = 0; | 36 | int ret = 0; |
37 | 37 | ||
38 | if (!file) | 38 | if (!f.file) |
39 | return -EBADF; | 39 | return -EBADF; |
40 | 40 | ||
41 | if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) { | 41 | if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) { |
42 | ret = -ESPIPE; | 42 | ret = -ESPIPE; |
43 | goto out; | 43 | goto out; |
44 | } | 44 | } |
45 | 45 | ||
46 | mapping = file->f_mapping; | 46 | mapping = f.file->f_mapping; |
47 | if (!mapping || len < 0) { | 47 | if (!mapping || len < 0) { |
48 | ret = -EINVAL; | 48 | ret = -EINVAL; |
49 | goto out; | 49 | goto out; |
@@ -76,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
76 | 76 | ||
77 | switch (advice) { | 77 | switch (advice) { |
78 | case POSIX_FADV_NORMAL: | 78 | case POSIX_FADV_NORMAL: |
79 | file->f_ra.ra_pages = bdi->ra_pages; | 79 | f.file->f_ra.ra_pages = bdi->ra_pages; |
80 | spin_lock(&file->f_lock); | 80 | spin_lock(&f.file->f_lock); |
81 | file->f_mode &= ~FMODE_RANDOM; | 81 | f.file->f_mode &= ~FMODE_RANDOM; |
82 | spin_unlock(&file->f_lock); | 82 | spin_unlock(&f.file->f_lock); |
83 | break; | 83 | break; |
84 | case POSIX_FADV_RANDOM: | 84 | case POSIX_FADV_RANDOM: |
85 | spin_lock(&file->f_lock); | 85 | spin_lock(&f.file->f_lock); |
86 | file->f_mode |= FMODE_RANDOM; | 86 | f.file->f_mode |= FMODE_RANDOM; |
87 | spin_unlock(&file->f_lock); | 87 | spin_unlock(&f.file->f_lock); |
88 | break; | 88 | break; |
89 | case POSIX_FADV_SEQUENTIAL: | 89 | case POSIX_FADV_SEQUENTIAL: |
90 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 90 | f.file->f_ra.ra_pages = bdi->ra_pages * 2; |
91 | spin_lock(&file->f_lock); | 91 | spin_lock(&f.file->f_lock); |
92 | file->f_mode &= ~FMODE_RANDOM; | 92 | f.file->f_mode &= ~FMODE_RANDOM; |
93 | spin_unlock(&file->f_lock); | 93 | spin_unlock(&f.file->f_lock); |
94 | break; | 94 | break; |
95 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
96 | /* First and last PARTIAL page! */ | 96 | /* First and last PARTIAL page! */ |
@@ -106,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
106 | * Ignore return value because fadvise() shall return | 106 | * Ignore return value because fadvise() shall return |
107 | * success even if filesystem can't retrieve a hint, | 107 | * success even if filesystem can't retrieve a hint, |
108 | */ | 108 | */ |
109 | force_page_cache_readahead(mapping, file, start_index, | 109 | force_page_cache_readahead(mapping, f.file, start_index, |
110 | nrpages); | 110 | nrpages); |
111 | break; | 111 | break; |
112 | case POSIX_FADV_NOREUSE: | 112 | case POSIX_FADV_NOREUSE: |
@@ -128,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
128 | ret = -EINVAL; | 128 | ret = -EINVAL; |
129 | } | 129 | } |
130 | out: | 130 | out: |
131 | fput(file); | 131 | fdput(f); |
132 | return ret; | 132 | return ret; |
133 | } | 133 | } |
134 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | 134 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS |
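The hunks above only change how the struct file reference is obtained (fdget()/fdput() instead of fget()/fput()); the advice handling itself is untouched. For readers less familiar with that path, here is a minimal userspace sketch, not part of the patch, that drives it through the POSIX wrapper; the file path is just a placeholder and any readable file will do.

/*
 * Minimal userspace sketch: exercising the fadvise64_64() path above
 * via posix_fadvise(). Illustration only; "/etc/hostname" is a
 * placeholder path.
 */
#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Hint sequential access: the kernel may bump readahead (ra_pages * 2). */
	int err = posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
	if (err)
		fprintf(stderr, "POSIX_FADV_SEQUENTIAL: %s\n", strerror(err));

	/* Ask for the whole file to be pulled into the page cache now. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
	if (err)
		fprintf(stderr, "POSIX_FADV_WILLNEED: %s\n", strerror(err));

	close(fd);
	return 0;
}

Note that posix_fadvise() returns the error number directly instead of setting errno, which is why the sketch prints strerror(err).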
diff --git a/mm/filemap.c b/mm/filemap.c index 384344575c37..83efee76a5c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1607 | * Do we have something in the page cache already? | 1607 | * Do we have something in the page cache already? |
1608 | */ | 1608 | */ |
1609 | page = find_get_page(mapping, offset); | 1609 | page = find_get_page(mapping, offset); |
1610 | if (likely(page)) { | 1610 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
1611 | /* | 1611 | /* |
1612 | * We found the page, so try async readahead before | 1612 | * We found the page, so try async readahead before |
1613 | * waiting for the lock. | 1613 | * waiting for the lock. |
1614 | */ | 1614 | */ |
1615 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1615 | do_async_mmap_readahead(vma, ra, file, page, offset); |
1616 | } else { | 1616 | } else if (!page) { |
1617 | /* No page in the page cache at all */ | 1617 | /* No page in the page cache at all */ |
1618 | do_sync_mmap_readahead(vma, ra, file, offset); | 1618 | do_sync_mmap_readahead(vma, ra, file, offset); |
1619 | count_vm_event(PGMAJFAULT); | 1619 | count_vm_event(PGMAJFAULT); |
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite); | |||
1737 | const struct vm_operations_struct generic_file_vm_ops = { | 1737 | const struct vm_operations_struct generic_file_vm_ops = { |
1738 | .fault = filemap_fault, | 1738 | .fault = filemap_fault, |
1739 | .page_mkwrite = filemap_page_mkwrite, | 1739 | .page_mkwrite = filemap_page_mkwrite, |
1740 | .remap_pages = generic_file_remap_pages, | ||
1740 | }; | 1741 | }; |
1741 | 1742 | ||
1742 | /* This is used for a general mmap of a disk file */ | 1743 | /* This is used for a general mmap of a disk file */ |
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
1749 | return -ENOEXEC; | 1750 | return -ENOEXEC; |
1750 | file_accessed(file); | 1751 | file_accessed(file); |
1751 | vma->vm_ops = &generic_file_vm_ops; | 1752 | vma->vm_ops = &generic_file_vm_ops; |
1752 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1753 | return 0; | 1753 | return 0; |
1754 | } | 1754 | } |
1755 | 1755 | ||
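filemap_fault() above now skips the async-readahead branch when FAULT_FLAG_TRIED is set (a repeated fault after a retry), and generic_file_vm_ops gains a ->remap_pages handler for the nonlinear-mapping rework in mm/fremap.c below. A small userspace illustration of the path it services, faulting a read-only file mapping in page by page, might look like the following; the default path is an assumption, any readable file works.

/*
 * Userspace sketch, illustration only: every first touch of an uncached
 * page in this mapping goes through ->fault (filemap_fault) and may
 * trigger sync or async readahead.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/etc/hostname";
	long pagesz = sysconf(_SC_PAGESIZE);
	struct stat st;
	int fd = open(path, O_RDONLY);

	if (fd < 0 || fstat(fd, &st) < 0 || st.st_size == 0)
		return 1;

	unsigned char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Touch the first byte of every page to fault the file in. */
	unsigned long sum = 0;
	for (off_t off = 0; off < st.st_size; off += pagesz)
		sum += p[off];

	printf("touched %s, checksum %lu\n", path, sum);
	munmap(p, st.st_size);
	close(fd);
	return 0;
}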
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 13e013b1270c..a912da6ddfd4 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping, | |||
167 | { | 167 | { |
168 | struct vm_area_struct *vma; | 168 | struct vm_area_struct *vma; |
169 | struct mm_struct *mm; | 169 | struct mm_struct *mm; |
170 | struct prio_tree_iter iter; | ||
171 | unsigned long address; | 170 | unsigned long address; |
172 | pte_t *pte; | 171 | pte_t *pte; |
173 | pte_t pteval; | 172 | pte_t pteval; |
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping, | |||
184 | 183 | ||
185 | retry: | 184 | retry: |
186 | mutex_lock(&mapping->i_mmap_mutex); | 185 | mutex_lock(&mapping->i_mmap_mutex); |
187 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 186 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
188 | mm = vma->vm_mm; | 187 | mm = vma->vm_mm; |
189 | address = vma->vm_start + | 188 | address = vma->vm_start + |
190 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 189 | ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
@@ -193,11 +192,13 @@ retry: | |||
193 | if (pte) { | 192 | if (pte) { |
194 | /* Nuke the page table entry. */ | 193 | /* Nuke the page table entry. */ |
195 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
196 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush(vma, address, pte); |
197 | page_remove_rmap(page); | 196 | page_remove_rmap(page); |
198 | dec_mm_counter(mm, MM_FILEPAGES); | 197 | dec_mm_counter(mm, MM_FILEPAGES); |
199 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
200 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
200 | /* must invalidate_page _before_ freeing the page */ | ||
201 | mmu_notifier_invalidate_page(mm, address); | ||
201 | page_cache_release(page); | 202 | page_cache_release(page); |
202 | } | 203 | } |
203 | } | 204 | } |
@@ -305,6 +306,7 @@ out: | |||
305 | static const struct vm_operations_struct xip_file_vm_ops = { | 306 | static const struct vm_operations_struct xip_file_vm_ops = { |
306 | .fault = xip_file_fault, | 307 | .fault = xip_file_fault, |
307 | .page_mkwrite = filemap_page_mkwrite, | 308 | .page_mkwrite = filemap_page_mkwrite, |
309 | .remap_pages = generic_file_remap_pages, | ||
308 | }; | 310 | }; |
309 | 311 | ||
310 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | 312 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) |
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | |||
313 | 315 | ||
314 | file_accessed(file); | 316 | file_accessed(file); |
315 | vma->vm_ops = &xip_file_vm_ops; | 317 | vma->vm_ops = &xip_file_vm_ops; |
316 | vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP; | 318 | vma->vm_flags |= VM_MIXEDMAP; |
317 | return 0; | 319 | return 0; |
318 | } | 320 | } |
319 | EXPORT_SYMBOL_GPL(xip_file_mmap); | 321 | EXPORT_SYMBOL_GPL(xip_file_mmap); |
diff --git a/mm/fremap.c b/mm/fremap.c index 9ed4fd432467..a0aaf0e56800 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
@@ -5,6 +5,7 @@ | |||
5 | * | 5 | * |
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | 6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 |
7 | */ | 7 | */ |
8 | #include <linux/export.h> | ||
8 | #include <linux/backing-dev.h> | 9 | #include <linux/backing-dev.h> |
9 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 11 | #include <linux/swap.h> |
@@ -80,9 +81,10 @@ out: | |||
80 | return err; | 81 | return err; |
81 | } | 82 | } |
82 | 83 | ||
83 | static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | 84 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, |
84 | unsigned long addr, unsigned long size, pgoff_t pgoff) | 85 | unsigned long size, pgoff_t pgoff) |
85 | { | 86 | { |
87 | struct mm_struct *mm = vma->vm_mm; | ||
86 | int err; | 88 | int err; |
87 | 89 | ||
88 | do { | 90 | do { |
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, | |||
95 | pgoff++; | 97 | pgoff++; |
96 | } while (size); | 98 | } while (size); |
97 | 99 | ||
98 | return 0; | 100 | return 0; |
99 | |||
100 | } | 101 | } |
102 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
101 | 103 | ||
102 | /** | 104 | /** |
103 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | 105 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma |
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
167 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) | 169 | if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR)) |
168 | goto out; | 170 | goto out; |
169 | 171 | ||
170 | if (!(vma->vm_flags & VM_CAN_NONLINEAR)) | 172 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) |
171 | goto out; | 173 | goto out; |
172 | 174 | ||
173 | if (start < vma->vm_start || start + size > vma->vm_end) | 175 | if (start < vma->vm_start || start + size > vma->vm_end) |
@@ -195,10 +197,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
195 | */ | 197 | */ |
196 | if (mapping_cap_account_dirty(mapping)) { | 198 | if (mapping_cap_account_dirty(mapping)) { |
197 | unsigned long addr; | 199 | unsigned long addr; |
198 | struct file *file = vma->vm_file; | 200 | struct file *file = get_file(vma->vm_file); |
199 | 201 | ||
200 | flags &= MAP_NONBLOCK; | 202 | flags &= MAP_NONBLOCK; |
201 | get_file(file); | ||
202 | addr = mmap_region(file, start, size, | 203 | addr = mmap_region(file, start, size, |
203 | flags, vma->vm_flags, pgoff); | 204 | flags, vma->vm_flags, pgoff); |
204 | fput(file); | 205 | fput(file); |
@@ -213,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
213 | mutex_lock(&mapping->i_mmap_mutex); | 214 | mutex_lock(&mapping->i_mmap_mutex); |
214 | flush_dcache_mmap_lock(mapping); | 215 | flush_dcache_mmap_lock(mapping); |
215 | vma->vm_flags |= VM_NONLINEAR; | 216 | vma->vm_flags |= VM_NONLINEAR; |
216 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 217 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
217 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 218 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
218 | flush_dcache_mmap_unlock(mapping); | 219 | flush_dcache_mmap_unlock(mapping); |
219 | mutex_unlock(&mapping->i_mmap_mutex); | 220 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -229,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
229 | } | 230 | } |
230 | 231 | ||
231 | mmu_notifier_invalidate_range_start(mm, start, start + size); | 232 | mmu_notifier_invalidate_range_start(mm, start, start + size); |
232 | err = populate_range(mm, vma, start, size, pgoff); | 233 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); |
233 | mmu_notifier_invalidate_range_end(mm, start, start + size); | 234 | mmu_notifier_invalidate_range_end(mm, start, start + size); |
234 | if (!err && !(flags & MAP_NONBLOCK)) { | 235 | if (!err && !(flags & MAP_NONBLOCK)) { |
235 | if (vma->vm_flags & VM_LOCKED) { | 236 | if (vma->vm_flags & VM_LOCKED) { |
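With VM_CAN_NONLINEAR gone, sys_remap_file_pages() now keys off ->remap_pages in the vma's vm_ops and calls it in place of the old populate_range() helper. The userspace side it serves is the Linux-specific remap_file_pages() syscall; below is a minimal sketch (illustration only, scratch file path assumed) that swings the first virtual page of a shared mapping onto the second file page.

/*
 * Userspace sketch of the nonlinear remapping this code implements.
 * remap_file_pages() is Linux-specific; the scratch path is a
 * placeholder. prot must be 0 and pgoff is in units of pages.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/fremap-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0 || ftruncate(fd, 2 * page) < 0)
		return 1;

	/* Nonlinear remapping requires a shared file mapping. */
	char *map = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	strcpy(map, "page zero");
	strcpy(map + page, "page one");

	/* Point the first virtual page at file page 1. */
	if (remap_file_pages(map, page, 0, 1, 0) < 0) {
		perror("remap_file_pages");
		return 1;
	}

	printf("first page now reads: %s\n", map); /* "page one" */

	munmap(map, 2 * page);
	close(fd);
	unlink("/tmp/fremap-demo");
	return 0;
}

After the call, reads through the first virtual page return the contents of file page 1, which is exactly the nonlinear case the interval-tree/nonlinear-list split above has to track.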
diff --git a/mm/frontswap.c b/mm/frontswap.c index 6b3e71a2cd48..2890e67d6026 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -44,6 +44,13 @@ EXPORT_SYMBOL(frontswap_enabled); | |||
44 | */ | 44 | */ |
45 | static bool frontswap_writethrough_enabled __read_mostly; | 45 | static bool frontswap_writethrough_enabled __read_mostly; |
46 | 46 | ||
47 | /* | ||
48 | * If enabled, the underlying tmem implementation is capable of doing | ||
49 | * exclusive gets, so frontswap_load, on a successful tmem_get must | ||
50 | * mark the page as no longer in frontswap AND mark it dirty. | ||
51 | */ | ||
52 | static bool frontswap_tmem_exclusive_gets_enabled __read_mostly; | ||
53 | |||
47 | #ifdef CONFIG_DEBUG_FS | 54 | #ifdef CONFIG_DEBUG_FS |
48 | /* | 55 | /* |
49 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | 56 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is |
@@ -97,6 +104,15 @@ void frontswap_writethrough(bool enable) | |||
97 | EXPORT_SYMBOL(frontswap_writethrough); | 104 | EXPORT_SYMBOL(frontswap_writethrough); |
98 | 105 | ||
99 | /* | 106 | /* |
107 | * Enable/disable frontswap exclusive gets (see above). | ||
108 | */ | ||
109 | void frontswap_tmem_exclusive_gets(bool enable) | ||
110 | { | ||
111 | frontswap_tmem_exclusive_gets_enabled = enable; | ||
112 | } | ||
113 | EXPORT_SYMBOL(frontswap_tmem_exclusive_gets); | ||
114 | |||
115 | /* | ||
100 | * Called when a swap device is swapon'd. | 116 | * Called when a swap device is swapon'd. |
101 | */ | 117 | */ |
102 | void __frontswap_init(unsigned type) | 118 | void __frontswap_init(unsigned type) |
@@ -174,8 +190,13 @@ int __frontswap_load(struct page *page) | |||
174 | BUG_ON(sis == NULL); | 190 | BUG_ON(sis == NULL); |
175 | if (frontswap_test(sis, offset)) | 191 | if (frontswap_test(sis, offset)) |
176 | ret = frontswap_ops.load(type, offset, page); | 192 | ret = frontswap_ops.load(type, offset, page); |
177 | if (ret == 0) | 193 | if (ret == 0) { |
178 | inc_frontswap_loads(); | 194 | inc_frontswap_loads(); |
195 | if (frontswap_tmem_exclusive_gets_enabled) { | ||
196 | SetPageDirty(page); | ||
197 | frontswap_clear(sis, offset); | ||
198 | } | ||
199 | } | ||
179 | return ret; | 200 | return ret; |
180 | } | 201 | } |
181 | EXPORT_SYMBOL(__frontswap_load); | 202 | EXPORT_SYMBOL(__frontswap_load); |
@@ -263,6 +284,11 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | |||
263 | return ret; | 284 | return ret; |
264 | } | 285 | } |
265 | 286 | ||
287 | /* | ||
288 | * Used to check if it's necessary and feasible to unuse pages. | ||
289 | * Return 1 when nothing to do, 0 when we need to shrink pages, | ||
290 | * error code when there is an error. | ||
291 | */ | ||
266 | static int __frontswap_shrink(unsigned long target_pages, | 292 | static int __frontswap_shrink(unsigned long target_pages, |
267 | unsigned long *pages_to_unuse, | 293 | unsigned long *pages_to_unuse, |
268 | int *type) | 294 | int *type) |
@@ -275,7 +301,7 @@ static int __frontswap_shrink(unsigned long target_pages, | |||
275 | if (total_pages <= target_pages) { | 301 | if (total_pages <= target_pages) { |
276 | /* Nothing to do */ | 302 | /* Nothing to do */ |
277 | *pages_to_unuse = 0; | 303 | *pages_to_unuse = 0; |
278 | return 0; | 304 | return 1; |
279 | } | 305 | } |
280 | total_pages_to_unuse = total_pages - target_pages; | 306 | total_pages_to_unuse = total_pages - target_pages; |
281 | return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); | 307 | return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); |
@@ -292,7 +318,7 @@ static int __frontswap_shrink(unsigned long target_pages, | |||
292 | void frontswap_shrink(unsigned long target_pages) | 318 | void frontswap_shrink(unsigned long target_pages) |
293 | { | 319 | { |
294 | unsigned long pages_to_unuse = 0; | 320 | unsigned long pages_to_unuse = 0; |
295 | int type, ret; | 321 | int uninitialized_var(type), ret; |
296 | 322 | ||
297 | /* | 323 | /* |
298 | * we don't want to hold swap_lock while doing a very | 324 | * we don't want to hold swap_lock while doing a very |
@@ -302,7 +328,7 @@ void frontswap_shrink(unsigned long target_pages) | |||
302 | spin_lock(&swap_lock); | 328 | spin_lock(&swap_lock); |
303 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | 329 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); |
304 | spin_unlock(&swap_lock); | 330 | spin_unlock(&swap_lock); |
305 | if (ret == 0 && pages_to_unuse) | 331 | if (ret == 0) |
306 | try_to_unuse(type, true, pages_to_unuse); | 332 | try_to_unuse(type, true, pages_to_unuse); |
307 | return; | 333 | return; |
308 | } | 334 | } |
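The new frontswap_tmem_exclusive_gets knob changes load semantics: on a hit the backend gives up its copy, so the kernel must mark the page dirty and clear the frontswap bit, as the __frontswap_load() hunk shows. A standalone conceptual sketch of that contract follows; it is plain C, and the names are illustrative, not frontswap's API.

/*
 * Conceptual sketch of "exclusive gets": a successful load consumes the
 * entry, so the caller's copy becomes the only one and must be treated
 * as dirty. Not kernel code; names are made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

#define SLOTS 16

struct tiny_store {
	bool present[SLOTS];
	char data[SLOTS][64];
	bool exclusive_gets;
};

static void store_put(struct tiny_store *s, int slot, const char *val)
{
	snprintf(s->data[slot], sizeof(s->data[slot]), "%s", val);
	s->present[slot] = true;
}

/* Returns 0 on hit; with exclusive gets the entry is consumed. */
static int store_load(struct tiny_store *s, int slot, char *out, size_t len,
		      bool *caller_must_mark_dirty)
{
	if (slot < 0 || slot >= SLOTS || !s->present[slot])
		return -1;
	snprintf(out, len, "%s", s->data[slot]);
	*caller_must_mark_dirty = false;
	if (s->exclusive_gets) {
		s->present[slot] = false;       /* like frontswap_clear() */
		*caller_must_mark_dirty = true; /* like SetPageDirty()    */
	}
	return 0;
}

int main(void)
{
	struct tiny_store s = { .exclusive_gets = true };
	char buf[64];
	bool dirty;

	store_put(&s, 3, "swapped-out page contents");
	if (store_load(&s, 3, buf, sizeof(buf), &dirty) == 0)
		printf("hit: \"%s\" (mark dirty: %s)\n", buf,
		       dirty ? "yes" : "no");
	/* Second load misses: the backend no longer holds a copy. */
	printf("second load: %s\n",
	       store_load(&s, 3, buf, sizeof(buf), &dirty) ? "miss" : "hit");
	return 0;
}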
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 141dbb695097..40f17c34b415 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/khugepaged.h> | 17 | #include <linux/khugepaged.h> |
18 | #include <linux/freezer.h> | 18 | #include <linux/freezer.h> |
19 | #include <linux/mman.h> | 19 | #include <linux/mman.h> |
20 | #include <linux/pagemap.h> | ||
20 | #include <asm/tlb.h> | 21 | #include <asm/tlb.h> |
21 | #include <asm/pgalloc.h> | 22 | #include <asm/pgalloc.h> |
22 | #include "internal.h" | 23 | #include "internal.h" |
@@ -102,10 +103,7 @@ static int set_recommended_min_free_kbytes(void) | |||
102 | unsigned long recommended_min; | 103 | unsigned long recommended_min; |
103 | extern int min_free_kbytes; | 104 | extern int min_free_kbytes; |
104 | 105 | ||
105 | if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, | 106 | if (!khugepaged_enabled()) |
106 | &transparent_hugepage_flags) && | ||
107 | !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
108 | &transparent_hugepage_flags)) | ||
109 | return 0; | 107 | return 0; |
110 | 108 | ||
111 | for_each_populated_zone(zone) | 109 | for_each_populated_zone(zone) |
@@ -139,12 +137,6 @@ static int start_khugepaged(void) | |||
139 | { | 137 | { |
140 | int err = 0; | 138 | int err = 0; |
141 | if (khugepaged_enabled()) { | 139 | if (khugepaged_enabled()) { |
142 | int wakeup; | ||
143 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
144 | err = -ENOMEM; | ||
145 | goto out; | ||
146 | } | ||
147 | mutex_lock(&khugepaged_mutex); | ||
148 | if (!khugepaged_thread) | 140 | if (!khugepaged_thread) |
149 | khugepaged_thread = kthread_run(khugepaged, NULL, | 141 | khugepaged_thread = kthread_run(khugepaged, NULL, |
150 | "khugepaged"); | 142 | "khugepaged"); |
@@ -154,16 +146,16 @@ static int start_khugepaged(void) | |||
154 | err = PTR_ERR(khugepaged_thread); | 146 | err = PTR_ERR(khugepaged_thread); |
155 | khugepaged_thread = NULL; | 147 | khugepaged_thread = NULL; |
156 | } | 148 | } |
157 | wakeup = !list_empty(&khugepaged_scan.mm_head); | 149 | |
158 | mutex_unlock(&khugepaged_mutex); | 150 | if (!list_empty(&khugepaged_scan.mm_head)) |
159 | if (wakeup) | ||
160 | wake_up_interruptible(&khugepaged_wait); | 151 | wake_up_interruptible(&khugepaged_wait); |
161 | 152 | ||
162 | set_recommended_min_free_kbytes(); | 153 | set_recommended_min_free_kbytes(); |
163 | } else | 154 | } else if (khugepaged_thread) { |
164 | /* wakeup to exit */ | 155 | kthread_stop(khugepaged_thread); |
165 | wake_up_interruptible(&khugepaged_wait); | 156 | khugepaged_thread = NULL; |
166 | out: | 157 | } |
158 | |||
167 | return err; | 159 | return err; |
168 | } | 160 | } |
169 | 161 | ||
@@ -224,18 +216,16 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
224 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 216 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
225 | 217 | ||
226 | if (ret > 0) { | 218 | if (ret > 0) { |
227 | int err = start_khugepaged(); | 219 | int err; |
220 | |||
221 | mutex_lock(&khugepaged_mutex); | ||
222 | err = start_khugepaged(); | ||
223 | mutex_unlock(&khugepaged_mutex); | ||
224 | |||
228 | if (err) | 225 | if (err) |
229 | ret = err; | 226 | ret = err; |
230 | } | 227 | } |
231 | 228 | ||
232 | if (ret > 0 && | ||
233 | (test_bit(TRANSPARENT_HUGEPAGE_FLAG, | ||
234 | &transparent_hugepage_flags) || | ||
235 | test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | ||
236 | &transparent_hugepage_flags))) | ||
237 | set_recommended_min_free_kbytes(); | ||
238 | |||
239 | return ret; | 229 | return ret; |
240 | } | 230 | } |
241 | static struct kobj_attribute enabled_attr = | 231 | static struct kobj_attribute enabled_attr = |
@@ -570,8 +560,6 @@ static int __init hugepage_init(void) | |||
570 | 560 | ||
571 | start_khugepaged(); | 561 | start_khugepaged(); |
572 | 562 | ||
573 | set_recommended_min_free_kbytes(); | ||
574 | |||
575 | return 0; | 563 | return 0; |
576 | out: | 564 | out: |
577 | hugepage_exit_sysfs(hugepage_kobj); | 565 | hugepage_exit_sysfs(hugepage_kobj); |
@@ -611,19 +599,6 @@ out: | |||
611 | } | 599 | } |
612 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 600 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
613 | 601 | ||
614 | static void prepare_pmd_huge_pte(pgtable_t pgtable, | ||
615 | struct mm_struct *mm) | ||
616 | { | ||
617 | assert_spin_locked(&mm->page_table_lock); | ||
618 | |||
619 | /* FIFO */ | ||
620 | if (!mm->pmd_huge_pte) | ||
621 | INIT_LIST_HEAD(&pgtable->lru); | ||
622 | else | ||
623 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
624 | mm->pmd_huge_pte = pgtable; | ||
625 | } | ||
626 | |||
627 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 602 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
628 | { | 603 | { |
629 | if (likely(vma->vm_flags & VM_WRITE)) | 604 | if (likely(vma->vm_flags & VM_WRITE)) |
@@ -665,7 +640,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
665 | */ | 640 | */ |
666 | page_add_new_anon_rmap(page, vma, haddr); | 641 | page_add_new_anon_rmap(page, vma, haddr); |
667 | set_pmd_at(mm, haddr, pmd, entry); | 642 | set_pmd_at(mm, haddr, pmd, entry); |
668 | prepare_pmd_huge_pte(pgtable, mm); | 643 | pgtable_trans_huge_deposit(mm, pgtable); |
669 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 644 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
670 | mm->nr_ptes++; | 645 | mm->nr_ptes++; |
671 | spin_unlock(&mm->page_table_lock); | 646 | spin_unlock(&mm->page_table_lock); |
@@ -791,7 +766,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
791 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 766 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
792 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 767 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
793 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 768 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
794 | prepare_pmd_huge_pte(pgtable, dst_mm); | 769 | pgtable_trans_huge_deposit(dst_mm, pgtable); |
795 | dst_mm->nr_ptes++; | 770 | dst_mm->nr_ptes++; |
796 | 771 | ||
797 | ret = 0; | 772 | ret = 0; |
@@ -802,25 +777,6 @@ out: | |||
802 | return ret; | 777 | return ret; |
803 | } | 778 | } |
804 | 779 | ||
805 | /* no "address" argument so destroys page coloring of some arch */ | ||
806 | pgtable_t get_pmd_huge_pte(struct mm_struct *mm) | ||
807 | { | ||
808 | pgtable_t pgtable; | ||
809 | |||
810 | assert_spin_locked(&mm->page_table_lock); | ||
811 | |||
812 | /* FIFO */ | ||
813 | pgtable = mm->pmd_huge_pte; | ||
814 | if (list_empty(&pgtable->lru)) | ||
815 | mm->pmd_huge_pte = NULL; | ||
816 | else { | ||
817 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
818 | struct page, lru); | ||
819 | list_del(&pgtable->lru); | ||
820 | } | ||
821 | return pgtable; | ||
822 | } | ||
823 | |||
824 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 780 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
825 | struct vm_area_struct *vma, | 781 | struct vm_area_struct *vma, |
826 | unsigned long address, | 782 | unsigned long address, |
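The two hunks above drop the open-coded FIFO of preallocated pte pages (prepare_pmd_huge_pte()/get_pmd_huge_pte()) in favour of the generic pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() helpers. The underlying pattern, parking spare objects per owner and handing them back in FIFO order when a huge mapping is split or torn down, can be sketched in plain userspace C; all names below are illustrative only.

/*
 * Conceptual sketch of the deposit/withdraw pattern. "struct spare"
 * stands in for a preallocated pte page, "struct owner" for the
 * per-mm stash. Not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

struct spare {
	int id;
	struct spare *next;
};

struct owner {
	struct spare *head, *tail;
};

static void deposit(struct owner *o, struct spare *s)
{
	s->next = NULL;
	if (o->tail)
		o->tail->next = s;
	else
		o->head = s;
	o->tail = s;
}

static struct spare *withdraw(struct owner *o)   /* FIFO order */
{
	struct spare *s = o->head;
	if (!s)
		return NULL;
	o->head = s->next;
	if (!o->head)
		o->tail = NULL;
	return s;
}

int main(void)
{
	struct owner o = { 0 };
	for (int i = 0; i < 3; i++) {
		struct spare *s = malloc(sizeof(*s));
		s->id = i;
		deposit(&o, s);
	}
	for (struct spare *s; (s = withdraw(&o)); free(s))
		printf("withdrew spare %d\n", s->id);   /* 0, 1, 2 */
	return 0;
}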
@@ -832,6 +788,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
832 | pmd_t _pmd; | 788 | pmd_t _pmd; |
833 | int ret = 0, i; | 789 | int ret = 0, i; |
834 | struct page **pages; | 790 | struct page **pages; |
791 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
792 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
835 | 793 | ||
836 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, | 794 | pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, |
837 | GFP_KERNEL); | 795 | GFP_KERNEL); |
@@ -868,15 +826,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
868 | cond_resched(); | 826 | cond_resched(); |
869 | } | 827 | } |
870 | 828 | ||
829 | mmun_start = haddr; | ||
830 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
831 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
832 | |||
871 | spin_lock(&mm->page_table_lock); | 833 | spin_lock(&mm->page_table_lock); |
872 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 834 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
873 | goto out_free_pages; | 835 | goto out_free_pages; |
874 | VM_BUG_ON(!PageHead(page)); | 836 | VM_BUG_ON(!PageHead(page)); |
875 | 837 | ||
876 | pmdp_clear_flush_notify(vma, haddr, pmd); | 838 | pmdp_clear_flush(vma, haddr, pmd); |
877 | /* leave pmd empty until pte is filled */ | 839 | /* leave pmd empty until pte is filled */ |
878 | 840 | ||
879 | pgtable = get_pmd_huge_pte(mm); | 841 | pgtable = pgtable_trans_huge_withdraw(mm); |
880 | pmd_populate(mm, &_pmd, pgtable); | 842 | pmd_populate(mm, &_pmd, pgtable); |
881 | 843 | ||
882 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 844 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -896,6 +858,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
896 | page_remove_rmap(page); | 858 | page_remove_rmap(page); |
897 | spin_unlock(&mm->page_table_lock); | 859 | spin_unlock(&mm->page_table_lock); |
898 | 860 | ||
861 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
862 | |||
899 | ret |= VM_FAULT_WRITE; | 863 | ret |= VM_FAULT_WRITE; |
900 | put_page(page); | 864 | put_page(page); |
901 | 865 | ||
@@ -904,6 +868,7 @@ out: | |||
904 | 868 | ||
905 | out_free_pages: | 869 | out_free_pages: |
906 | spin_unlock(&mm->page_table_lock); | 870 | spin_unlock(&mm->page_table_lock); |
871 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
907 | mem_cgroup_uncharge_start(); | 872 | mem_cgroup_uncharge_start(); |
908 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 873 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
909 | mem_cgroup_uncharge_page(pages[i]); | 874 | mem_cgroup_uncharge_page(pages[i]); |
@@ -920,6 +885,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
920 | int ret = 0; | 885 | int ret = 0; |
921 | struct page *page, *new_page; | 886 | struct page *page, *new_page; |
922 | unsigned long haddr; | 887 | unsigned long haddr; |
888 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
889 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
923 | 890 | ||
924 | VM_BUG_ON(!vma->anon_vma); | 891 | VM_BUG_ON(!vma->anon_vma); |
925 | spin_lock(&mm->page_table_lock); | 892 | spin_lock(&mm->page_table_lock); |
@@ -934,7 +901,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
934 | entry = pmd_mkyoung(orig_pmd); | 901 | entry = pmd_mkyoung(orig_pmd); |
935 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 902 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
936 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | 903 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) |
937 | update_mmu_cache(vma, address, entry); | 904 | update_mmu_cache_pmd(vma, address, pmd); |
938 | ret |= VM_FAULT_WRITE; | 905 | ret |= VM_FAULT_WRITE; |
939 | goto out_unlock; | 906 | goto out_unlock; |
940 | } | 907 | } |
@@ -970,38 +937,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
970 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 937 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); |
971 | __SetPageUptodate(new_page); | 938 | __SetPageUptodate(new_page); |
972 | 939 | ||
940 | mmun_start = haddr; | ||
941 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
942 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
943 | |||
973 | spin_lock(&mm->page_table_lock); | 944 | spin_lock(&mm->page_table_lock); |
974 | put_page(page); | 945 | put_page(page); |
975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 946 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | 947 | spin_unlock(&mm->page_table_lock); |
977 | mem_cgroup_uncharge_page(new_page); | 948 | mem_cgroup_uncharge_page(new_page); |
978 | put_page(new_page); | 949 | put_page(new_page); |
979 | goto out; | 950 | goto out_mn; |
980 | } else { | 951 | } else { |
981 | pmd_t entry; | 952 | pmd_t entry; |
982 | VM_BUG_ON(!PageHead(page)); | 953 | VM_BUG_ON(!PageHead(page)); |
983 | entry = mk_pmd(new_page, vma->vm_page_prot); | 954 | entry = mk_pmd(new_page, vma->vm_page_prot); |
984 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 955 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
985 | entry = pmd_mkhuge(entry); | 956 | entry = pmd_mkhuge(entry); |
986 | pmdp_clear_flush_notify(vma, haddr, pmd); | 957 | pmdp_clear_flush(vma, haddr, pmd); |
987 | page_add_new_anon_rmap(new_page, vma, haddr); | 958 | page_add_new_anon_rmap(new_page, vma, haddr); |
988 | set_pmd_at(mm, haddr, pmd, entry); | 959 | set_pmd_at(mm, haddr, pmd, entry); |
989 | update_mmu_cache(vma, address, entry); | 960 | update_mmu_cache_pmd(vma, address, pmd); |
990 | page_remove_rmap(page); | 961 | page_remove_rmap(page); |
991 | put_page(page); | 962 | put_page(page); |
992 | ret |= VM_FAULT_WRITE; | 963 | ret |= VM_FAULT_WRITE; |
993 | } | 964 | } |
994 | out_unlock: | ||
995 | spin_unlock(&mm->page_table_lock); | 965 | spin_unlock(&mm->page_table_lock); |
966 | out_mn: | ||
967 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
996 | out: | 968 | out: |
997 | return ret; | 969 | return ret; |
970 | out_unlock: | ||
971 | spin_unlock(&mm->page_table_lock); | ||
972 | return ret; | ||
998 | } | 973 | } |
999 | 974 | ||
1000 | struct page *follow_trans_huge_pmd(struct mm_struct *mm, | 975 | struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
1001 | unsigned long addr, | 976 | unsigned long addr, |
1002 | pmd_t *pmd, | 977 | pmd_t *pmd, |
1003 | unsigned int flags) | 978 | unsigned int flags) |
1004 | { | 979 | { |
980 | struct mm_struct *mm = vma->vm_mm; | ||
1005 | struct page *page = NULL; | 981 | struct page *page = NULL; |
1006 | 982 | ||
1007 | assert_spin_locked(&mm->page_table_lock); | 983 | assert_spin_locked(&mm->page_table_lock); |
@@ -1024,6 +1000,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, | |||
1024 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 1000 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
1025 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | 1001 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); |
1026 | } | 1002 | } |
1003 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1004 | if (page->mapping && trylock_page(page)) { | ||
1005 | lru_add_drain(); | ||
1006 | if (page->mapping) | ||
1007 | mlock_vma_page(page); | ||
1008 | unlock_page(page); | ||
1009 | } | ||
1010 | } | ||
1027 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1011 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1028 | VM_BUG_ON(!PageCompound(page)); | 1012 | VM_BUG_ON(!PageCompound(page)); |
1029 | if (flags & FOLL_GET) | 1013 | if (flags & FOLL_GET) |
@@ -1041,9 +1025,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1041 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1025 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1042 | struct page *page; | 1026 | struct page *page; |
1043 | pgtable_t pgtable; | 1027 | pgtable_t pgtable; |
1044 | pgtable = get_pmd_huge_pte(tlb->mm); | 1028 | pmd_t orig_pmd; |
1045 | page = pmd_page(*pmd); | 1029 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
1046 | pmd_clear(pmd); | 1030 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1031 | page = pmd_page(orig_pmd); | ||
1047 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1032 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1048 | page_remove_rmap(page); | 1033 | page_remove_rmap(page); |
1049 | VM_BUG_ON(page_mapcount(page) < 0); | 1034 | VM_BUG_ON(page_mapcount(page) < 0); |
@@ -1207,7 +1192,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1207 | struct mm_struct *mm = vma->vm_mm; | 1192 | struct mm_struct *mm = vma->vm_mm; |
1208 | pmd_t *pmd; | 1193 | pmd_t *pmd; |
1209 | int ret = 0; | 1194 | int ret = 0; |
1195 | /* For mmu_notifiers */ | ||
1196 | const unsigned long mmun_start = address; | ||
1197 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | ||
1210 | 1198 | ||
1199 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1211 | spin_lock(&mm->page_table_lock); | 1200 | spin_lock(&mm->page_table_lock); |
1212 | pmd = page_check_address_pmd(page, mm, address, | 1201 | pmd = page_check_address_pmd(page, mm, address, |
1213 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); | 1202 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); |
@@ -1219,10 +1208,11 @@ static int __split_huge_page_splitting(struct page *page, | |||
1219 | * and it won't wait on the anon_vma->root->mutex to | 1208 | * and it won't wait on the anon_vma->root->mutex to |
1220 | * serialize against split_huge_page*. | 1209 | * serialize against split_huge_page*. |
1221 | */ | 1210 | */ |
1222 | pmdp_splitting_flush_notify(vma, address, pmd); | 1211 | pmdp_splitting_flush(vma, address, pmd); |
1223 | ret = 1; | 1212 | ret = 1; |
1224 | } | 1213 | } |
1225 | spin_unlock(&mm->page_table_lock); | 1214 | spin_unlock(&mm->page_table_lock); |
1215 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1226 | 1216 | ||
1227 | return ret; | 1217 | return ret; |
1228 | } | 1218 | } |
@@ -1358,11 +1348,11 @@ static int __split_huge_page_map(struct page *page, | |||
1358 | pmd = page_check_address_pmd(page, mm, address, | 1348 | pmd = page_check_address_pmd(page, mm, address, |
1359 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | 1349 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); |
1360 | if (pmd) { | 1350 | if (pmd) { |
1361 | pgtable = get_pmd_huge_pte(mm); | 1351 | pgtable = pgtable_trans_huge_withdraw(mm); |
1362 | pmd_populate(mm, &_pmd, pgtable); | 1352 | pmd_populate(mm, &_pmd, pgtable); |
1363 | 1353 | ||
1364 | for (i = 0, haddr = address; i < HPAGE_PMD_NR; | 1354 | haddr = address; |
1365 | i++, haddr += PAGE_SIZE) { | 1355 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1366 | pte_t *pte, entry; | 1356 | pte_t *pte, entry; |
1367 | BUG_ON(PageCompound(page+i)); | 1357 | BUG_ON(PageCompound(page+i)); |
1368 | entry = mk_pte(page + i, vma->vm_page_prot); | 1358 | entry = mk_pte(page + i, vma->vm_page_prot); |
@@ -1406,8 +1396,7 @@ static int __split_huge_page_map(struct page *page, | |||
1406 | * SMP TLB and finally we write the non-huge version | 1396 | * SMP TLB and finally we write the non-huge version |
1407 | * of the pmd entry with pmd_populate. | 1397 | * of the pmd entry with pmd_populate. |
1408 | */ | 1398 | */ |
1409 | set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); | 1399 | pmdp_invalidate(vma, address, pmd); |
1410 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
1411 | pmd_populate(mm, pmd, pgtable); | 1400 | pmd_populate(mm, pmd, pgtable); |
1412 | ret = 1; | 1401 | ret = 1; |
1413 | } | 1402 | } |
@@ -1421,18 +1410,17 @@ static void __split_huge_page(struct page *page, | |||
1421 | struct anon_vma *anon_vma) | 1410 | struct anon_vma *anon_vma) |
1422 | { | 1411 | { |
1423 | int mapcount, mapcount2; | 1412 | int mapcount, mapcount2; |
1413 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1424 | struct anon_vma_chain *avc; | 1414 | struct anon_vma_chain *avc; |
1425 | 1415 | ||
1426 | BUG_ON(!PageHead(page)); | 1416 | BUG_ON(!PageHead(page)); |
1427 | BUG_ON(PageTail(page)); | 1417 | BUG_ON(PageTail(page)); |
1428 | 1418 | ||
1429 | mapcount = 0; | 1419 | mapcount = 0; |
1430 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1420 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1431 | struct vm_area_struct *vma = avc->vma; | 1421 | struct vm_area_struct *vma = avc->vma; |
1432 | unsigned long addr = vma_address(page, vma); | 1422 | unsigned long addr = vma_address(page, vma); |
1433 | BUG_ON(is_vma_temporary_stack(vma)); | 1423 | BUG_ON(is_vma_temporary_stack(vma)); |
1434 | if (addr == -EFAULT) | ||
1435 | continue; | ||
1436 | mapcount += __split_huge_page_splitting(page, vma, addr); | 1424 | mapcount += __split_huge_page_splitting(page, vma, addr); |
1437 | } | 1425 | } |
1438 | /* | 1426 | /* |
@@ -1453,12 +1441,10 @@ static void __split_huge_page(struct page *page, | |||
1453 | __split_huge_page_refcount(page); | 1441 | __split_huge_page_refcount(page); |
1454 | 1442 | ||
1455 | mapcount2 = 0; | 1443 | mapcount2 = 0; |
1456 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1444 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1457 | struct vm_area_struct *vma = avc->vma; | 1445 | struct vm_area_struct *vma = avc->vma; |
1458 | unsigned long addr = vma_address(page, vma); | 1446 | unsigned long addr = vma_address(page, vma); |
1459 | BUG_ON(is_vma_temporary_stack(vma)); | 1447 | BUG_ON(is_vma_temporary_stack(vma)); |
1460 | if (addr == -EFAULT) | ||
1461 | continue; | ||
1462 | mapcount2 += __split_huge_page_map(page, vma, addr); | 1448 | mapcount2 += __split_huge_page_map(page, vma, addr); |
1463 | } | 1449 | } |
1464 | if (mapcount != mapcount2) | 1450 | if (mapcount != mapcount2) |
@@ -1491,12 +1477,13 @@ out: | |||
1491 | return ret; | 1477 | return ret; |
1492 | } | 1478 | } |
1493 | 1479 | ||
1494 | #define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ | 1480 | #define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) |
1495 | VM_HUGETLB|VM_SHARED|VM_MAYSHARE) | ||
1496 | 1481 | ||
1497 | int hugepage_madvise(struct vm_area_struct *vma, | 1482 | int hugepage_madvise(struct vm_area_struct *vma, |
1498 | unsigned long *vm_flags, int advice) | 1483 | unsigned long *vm_flags, int advice) |
1499 | { | 1484 | { |
1485 | struct mm_struct *mm = vma->vm_mm; | ||
1486 | |||
1500 | switch (advice) { | 1487 | switch (advice) { |
1501 | case MADV_HUGEPAGE: | 1488 | case MADV_HUGEPAGE: |
1502 | /* | 1489 | /* |
@@ -1504,6 +1491,8 @@ int hugepage_madvise(struct vm_area_struct *vma, | |||
1504 | */ | 1491 | */ |
1505 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) | 1492 | if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) |
1506 | return -EINVAL; | 1493 | return -EINVAL; |
1494 | if (mm->def_flags & VM_NOHUGEPAGE) | ||
1495 | return -EINVAL; | ||
1507 | *vm_flags &= ~VM_NOHUGEPAGE; | 1496 | *vm_flags &= ~VM_NOHUGEPAGE; |
1508 | *vm_flags |= VM_HUGEPAGE; | 1497 | *vm_flags |= VM_HUGEPAGE; |
1509 | /* | 1498 | /* |
@@ -1655,11 +1644,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | |||
1655 | if (vma->vm_ops) | 1644 | if (vma->vm_ops) |
1656 | /* khugepaged not yet working on file or special mappings */ | 1645 | /* khugepaged not yet working on file or special mappings */ |
1657 | return 0; | 1646 | return 0; |
1658 | /* | 1647 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1659 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1660 | * true too, verify it here. | ||
1661 | */ | ||
1662 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1663 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 1648 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
1664 | hend = vma->vm_end & HPAGE_PMD_MASK; | 1649 | hend = vma->vm_end & HPAGE_PMD_MASK; |
1665 | if (hstart < hend) | 1650 | if (hstart < hend) |
@@ -1833,28 +1818,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
1833 | } | 1818 | } |
1834 | } | 1819 | } |
1835 | 1820 | ||
1836 | static void collapse_huge_page(struct mm_struct *mm, | 1821 | static void khugepaged_alloc_sleep(void) |
1837 | unsigned long address, | ||
1838 | struct page **hpage, | ||
1839 | struct vm_area_struct *vma, | ||
1840 | int node) | ||
1841 | { | 1822 | { |
1842 | pgd_t *pgd; | 1823 | wait_event_freezable_timeout(khugepaged_wait, false, |
1843 | pud_t *pud; | 1824 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); |
1844 | pmd_t *pmd, _pmd; | 1825 | } |
1845 | pte_t *pte; | ||
1846 | pgtable_t pgtable; | ||
1847 | struct page *new_page; | ||
1848 | spinlock_t *ptl; | ||
1849 | int isolated; | ||
1850 | unsigned long hstart, hend; | ||
1851 | 1826 | ||
1852 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 1827 | #ifdef CONFIG_NUMA |
1853 | #ifndef CONFIG_NUMA | 1828 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) |
1854 | up_read(&mm->mmap_sem); | 1829 | { |
1855 | VM_BUG_ON(!*hpage); | 1830 | if (IS_ERR(*hpage)) { |
1856 | new_page = *hpage; | 1831 | if (!*wait) |
1857 | #else | 1832 | return false; |
1833 | |||
1834 | *wait = false; | ||
1835 | *hpage = NULL; | ||
1836 | khugepaged_alloc_sleep(); | ||
1837 | } else if (*hpage) { | ||
1838 | put_page(*hpage); | ||
1839 | *hpage = NULL; | ||
1840 | } | ||
1841 | |||
1842 | return true; | ||
1843 | } | ||
1844 | |||
1845 | static struct page | ||
1846 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1847 | struct vm_area_struct *vma, unsigned long address, | ||
1848 | int node) | ||
1849 | { | ||
1858 | VM_BUG_ON(*hpage); | 1850 | VM_BUG_ON(*hpage); |
1859 | /* | 1851 | /* |
1860 | * Allocate the page while the vma is still valid and under | 1852 | * Allocate the page while the vma is still valid and under |
@@ -1866,7 +1858,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1866 | * mmap_sem in read mode is good idea also to allow greater | 1858 | * mmap_sem in read mode is good idea also to allow greater |
1867 | * scalability. | 1859 | * scalability. |
1868 | */ | 1860 | */ |
1869 | new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, | 1861 | *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, |
1870 | node, __GFP_OTHER_NODE); | 1862 | node, __GFP_OTHER_NODE); |
1871 | 1863 | ||
1872 | /* | 1864 | /* |
@@ -1874,20 +1866,85 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1874 | * preparation for taking it in write mode. | 1866 | * preparation for taking it in write mode. |
1875 | */ | 1867 | */ |
1876 | up_read(&mm->mmap_sem); | 1868 | up_read(&mm->mmap_sem); |
1877 | if (unlikely(!new_page)) { | 1869 | if (unlikely(!*hpage)) { |
1878 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 1870 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
1879 | *hpage = ERR_PTR(-ENOMEM); | 1871 | *hpage = ERR_PTR(-ENOMEM); |
1880 | return; | 1872 | return NULL; |
1881 | } | 1873 | } |
1882 | #endif | ||
1883 | 1874 | ||
1884 | count_vm_event(THP_COLLAPSE_ALLOC); | 1875 | count_vm_event(THP_COLLAPSE_ALLOC); |
1885 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1876 | return *hpage; |
1886 | #ifdef CONFIG_NUMA | 1877 | } |
1887 | put_page(new_page); | 1878 | #else |
1879 | static struct page *khugepaged_alloc_hugepage(bool *wait) | ||
1880 | { | ||
1881 | struct page *hpage; | ||
1882 | |||
1883 | do { | ||
1884 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
1885 | if (!hpage) { | ||
1886 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
1887 | if (!*wait) | ||
1888 | return NULL; | ||
1889 | |||
1890 | *wait = false; | ||
1891 | khugepaged_alloc_sleep(); | ||
1892 | } else | ||
1893 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
1894 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | ||
1895 | |||
1896 | return hpage; | ||
1897 | } | ||
1898 | |||
1899 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
1900 | { | ||
1901 | if (!*hpage) | ||
1902 | *hpage = khugepaged_alloc_hugepage(wait); | ||
1903 | |||
1904 | if (unlikely(!*hpage)) | ||
1905 | return false; | ||
1906 | |||
1907 | return true; | ||
1908 | } | ||
1909 | |||
1910 | static struct page | ||
1911 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | ||
1912 | struct vm_area_struct *vma, unsigned long address, | ||
1913 | int node) | ||
1914 | { | ||
1915 | up_read(&mm->mmap_sem); | ||
1916 | VM_BUG_ON(!*hpage); | ||
1917 | return *hpage; | ||
1918 | } | ||
1888 | #endif | 1919 | #endif |
1920 | |||
1921 | static void collapse_huge_page(struct mm_struct *mm, | ||
1922 | unsigned long address, | ||
1923 | struct page **hpage, | ||
1924 | struct vm_area_struct *vma, | ||
1925 | int node) | ||
1926 | { | ||
1927 | pgd_t *pgd; | ||
1928 | pud_t *pud; | ||
1929 | pmd_t *pmd, _pmd; | ||
1930 | pte_t *pte; | ||
1931 | pgtable_t pgtable; | ||
1932 | struct page *new_page; | ||
1933 | spinlock_t *ptl; | ||
1934 | int isolated; | ||
1935 | unsigned long hstart, hend; | ||
1936 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1937 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1938 | |||
1939 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1940 | |||
1941 | /* release the mmap_sem read lock. */ | ||
1942 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | ||
1943 | if (!new_page) | ||
1944 | return; | ||
1945 | |||
1946 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | ||
1889 | return; | 1947 | return; |
1890 | } | ||
1891 | 1948 | ||
1892 | /* | 1949 | /* |
1893 | * Prevent all access to pagetables with the exception of | 1950 | * Prevent all access to pagetables with the exception of |
@@ -1912,11 +1969,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1912 | goto out; | 1969 | goto out; |
1913 | if (is_vma_temporary_stack(vma)) | 1970 | if (is_vma_temporary_stack(vma)) |
1914 | goto out; | 1971 | goto out; |
1915 | /* | 1972 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
1916 | * If is_pfn_mapping() is true is_learn_pfn_mapping() must be | ||
1917 | * true too, verify it here. | ||
1918 | */ | ||
1919 | VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP); | ||
1920 | 1973 | ||
1921 | pgd = pgd_offset(mm, address); | 1974 | pgd = pgd_offset(mm, address); |
1922 | if (!pgd_present(*pgd)) | 1975 | if (!pgd_present(*pgd)) |
@@ -1936,6 +1989,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1936 | pte = pte_offset_map(pmd, address); | 1989 | pte = pte_offset_map(pmd, address); |
1937 | ptl = pte_lockptr(mm, pmd); | 1990 | ptl = pte_lockptr(mm, pmd); |
1938 | 1991 | ||
1992 | mmun_start = address; | ||
1993 | mmun_end = address + HPAGE_PMD_SIZE; | ||
1994 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1939 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | 1995 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ |
1940 | /* | 1996 | /* |
1941 | * After this gup_fast can't run anymore. This also removes | 1997 | * After this gup_fast can't run anymore. This also removes |
@@ -1943,8 +1999,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1943 | * huge and small TLB entries for the same virtual address | 1999 | * huge and small TLB entries for the same virtual address |
1944 | * to avoid the risk of CPU bugs in that area. | 2000 | * to avoid the risk of CPU bugs in that area. |
1945 | */ | 2001 | */ |
1946 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | 2002 | _pmd = pmdp_clear_flush(vma, address, pmd); |
1947 | spin_unlock(&mm->page_table_lock); | 2003 | spin_unlock(&mm->page_table_lock); |
2004 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1948 | 2005 | ||
1949 | spin_lock(ptl); | 2006 | spin_lock(ptl); |
1950 | isolated = __collapse_huge_page_isolate(vma, address, pte); | 2007 | isolated = __collapse_huge_page_isolate(vma, address, pte); |
@@ -1970,8 +2027,6 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1970 | pte_unmap(pte); | 2027 | pte_unmap(pte); |
1971 | __SetPageUptodate(new_page); | 2028 | __SetPageUptodate(new_page); |
1972 | pgtable = pmd_pgtable(_pmd); | 2029 | pgtable = pmd_pgtable(_pmd); |
1973 | VM_BUG_ON(page_count(pgtable) != 1); | ||
1974 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
1975 | 2030 | ||
1976 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | 2031 | _pmd = mk_pmd(new_page, vma->vm_page_prot); |
1977 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | 2032 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); |
@@ -1988,13 +2043,12 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1988 | BUG_ON(!pmd_none(*pmd)); | 2043 | BUG_ON(!pmd_none(*pmd)); |
1989 | page_add_new_anon_rmap(new_page, vma, address); | 2044 | page_add_new_anon_rmap(new_page, vma, address); |
1990 | set_pmd_at(mm, address, pmd, _pmd); | 2045 | set_pmd_at(mm, address, pmd, _pmd); |
1991 | update_mmu_cache(vma, address, _pmd); | 2046 | update_mmu_cache_pmd(vma, address, pmd); |
1992 | prepare_pmd_huge_pte(pgtable, mm); | 2047 | pgtable_trans_huge_deposit(mm, pgtable); |
1993 | spin_unlock(&mm->page_table_lock); | 2048 | spin_unlock(&mm->page_table_lock); |
1994 | 2049 | ||
1995 | #ifndef CONFIG_NUMA | ||
1996 | *hpage = NULL; | 2050 | *hpage = NULL; |
1997 | #endif | 2051 | |
1998 | khugepaged_pages_collapsed++; | 2052 | khugepaged_pages_collapsed++; |
1999 | out_up_write: | 2053 | out_up_write: |
2000 | up_write(&mm->mmap_sem); | 2054 | up_write(&mm->mmap_sem); |
@@ -2002,9 +2056,6 @@ out_up_write: | |||
2002 | 2056 | ||
2003 | out: | 2057 | out: |
2004 | mem_cgroup_uncharge_page(new_page); | 2058 | mem_cgroup_uncharge_page(new_page); |
2005 | #ifdef CONFIG_NUMA | ||
2006 | put_page(new_page); | ||
2007 | #endif | ||
2008 | goto out_up_write; | 2059 | goto out_up_write; |
2009 | } | 2060 | } |
2010 | 2061 | ||
@@ -2154,12 +2205,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
2154 | goto skip; | 2205 | goto skip; |
2155 | if (is_vma_temporary_stack(vma)) | 2206 | if (is_vma_temporary_stack(vma)) |
2156 | goto skip; | 2207 | goto skip; |
2157 | /* | 2208 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); |
2158 | * If is_pfn_mapping() is true is_learn_pfn_mapping() | ||
2159 | * must be true too, verify it here. | ||
2160 | */ | ||
2161 | VM_BUG_ON(is_linear_pfn_mapping(vma) || | ||
2162 | vma->vm_flags & VM_NO_THP); | ||
2163 | 2209 | ||
2164 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2210 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2165 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2211 | hend = vma->vm_end & HPAGE_PMD_MASK; |
@@ -2234,32 +2280,23 @@ static int khugepaged_has_work(void) | |||
2234 | static int khugepaged_wait_event(void) | 2280 | static int khugepaged_wait_event(void) |
2235 | { | 2281 | { |
2236 | return !list_empty(&khugepaged_scan.mm_head) || | 2282 | return !list_empty(&khugepaged_scan.mm_head) || |
2237 | !khugepaged_enabled(); | 2283 | kthread_should_stop(); |
2238 | } | 2284 | } |
2239 | 2285 | ||
2240 | static void khugepaged_do_scan(struct page **hpage) | 2286 | static void khugepaged_do_scan(void) |
2241 | { | 2287 | { |
2288 | struct page *hpage = NULL; | ||
2242 | unsigned int progress = 0, pass_through_head = 0; | 2289 | unsigned int progress = 0, pass_through_head = 0; |
2243 | unsigned int pages = khugepaged_pages_to_scan; | 2290 | unsigned int pages = khugepaged_pages_to_scan; |
2291 | bool wait = true; | ||
2244 | 2292 | ||
2245 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | 2293 | barrier(); /* write khugepaged_pages_to_scan to local stack */ |
2246 | 2294 | ||
2247 | while (progress < pages) { | 2295 | while (progress < pages) { |
2248 | cond_resched(); | 2296 | if (!khugepaged_prealloc_page(&hpage, &wait)) |
2249 | |||
2250 | #ifndef CONFIG_NUMA | ||
2251 | if (!*hpage) { | ||
2252 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
2253 | if (unlikely(!*hpage)) { | ||
2254 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2255 | break; | ||
2256 | } | ||
2257 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2258 | } | ||
2259 | #else | ||
2260 | if (IS_ERR(*hpage)) | ||
2261 | break; | 2297 | break; |
2262 | #endif | 2298 | |
2299 | cond_resched(); | ||
2263 | 2300 | ||
2264 | if (unlikely(kthread_should_stop() || freezing(current))) | 2301 | if (unlikely(kthread_should_stop() || freezing(current))) |
2265 | break; | 2302 | break; |
@@ -2270,73 +2307,32 @@ static void khugepaged_do_scan(struct page **hpage) | |||
2270 | if (khugepaged_has_work() && | 2307 | if (khugepaged_has_work() && |
2271 | pass_through_head < 2) | 2308 | pass_through_head < 2) |
2272 | progress += khugepaged_scan_mm_slot(pages - progress, | 2309 | progress += khugepaged_scan_mm_slot(pages - progress, |
2273 | hpage); | 2310 | &hpage); |
2274 | else | 2311 | else |
2275 | progress = pages; | 2312 | progress = pages; |
2276 | spin_unlock(&khugepaged_mm_lock); | 2313 | spin_unlock(&khugepaged_mm_lock); |
2277 | } | 2314 | } |
2278 | } | ||
2279 | 2315 | ||
2280 | static void khugepaged_alloc_sleep(void) | 2316 | if (!IS_ERR_OR_NULL(hpage)) |
2281 | { | 2317 | put_page(hpage); |
2282 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2283 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | ||
2284 | } | 2318 | } |
2285 | 2319 | ||
2286 | #ifndef CONFIG_NUMA | 2320 | static void khugepaged_wait_work(void) |
2287 | static struct page *khugepaged_alloc_hugepage(void) | ||
2288 | { | 2321 | { |
2289 | struct page *hpage; | 2322 | try_to_freeze(); |
2290 | |||
2291 | do { | ||
2292 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
2293 | if (!hpage) { | ||
2294 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2295 | khugepaged_alloc_sleep(); | ||
2296 | } else | ||
2297 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2298 | } while (unlikely(!hpage) && | ||
2299 | likely(khugepaged_enabled())); | ||
2300 | return hpage; | ||
2301 | } | ||
2302 | #endif | ||
2303 | 2323 | ||
2304 | static void khugepaged_loop(void) | 2324 | if (khugepaged_has_work()) { |
2305 | { | 2325 | if (!khugepaged_scan_sleep_millisecs) |
2306 | struct page *hpage; | 2326 | return; |
2307 | 2327 | ||
2308 | #ifdef CONFIG_NUMA | 2328 | wait_event_freezable_timeout(khugepaged_wait, |
2309 | hpage = NULL; | 2329 | kthread_should_stop(), |
2310 | #endif | 2330 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); |
2311 | while (likely(khugepaged_enabled())) { | 2331 | return; |
2312 | #ifndef CONFIG_NUMA | ||
2313 | hpage = khugepaged_alloc_hugepage(); | ||
2314 | if (unlikely(!hpage)) | ||
2315 | break; | ||
2316 | #else | ||
2317 | if (IS_ERR(hpage)) { | ||
2318 | khugepaged_alloc_sleep(); | ||
2319 | hpage = NULL; | ||
2320 | } | ||
2321 | #endif | ||
2322 | |||
2323 | khugepaged_do_scan(&hpage); | ||
2324 | #ifndef CONFIG_NUMA | ||
2325 | if (hpage) | ||
2326 | put_page(hpage); | ||
2327 | #endif | ||
2328 | try_to_freeze(); | ||
2329 | if (unlikely(kthread_should_stop())) | ||
2330 | break; | ||
2331 | if (khugepaged_has_work()) { | ||
2332 | if (!khugepaged_scan_sleep_millisecs) | ||
2333 | continue; | ||
2334 | wait_event_freezable_timeout(khugepaged_wait, false, | ||
2335 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); | ||
2336 | } else if (khugepaged_enabled()) | ||
2337 | wait_event_freezable(khugepaged_wait, | ||
2338 | khugepaged_wait_event()); | ||
2339 | } | 2332 | } |
2333 | |||
2334 | if (khugepaged_enabled()) | ||
2335 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | ||
2340 | } | 2336 | } |
2341 | 2337 | ||
2342 | static int khugepaged(void *none) | 2338 | static int khugepaged(void *none) |
@@ -2346,20 +2342,9 @@ static int khugepaged(void *none) | |||
2346 | set_freezable(); | 2342 | set_freezable(); |
2347 | set_user_nice(current, 19); | 2343 | set_user_nice(current, 19); |
2348 | 2344 | ||
2349 | /* serialize with start_khugepaged() */ | 2345 | while (!kthread_should_stop()) { |
2350 | mutex_lock(&khugepaged_mutex); | 2346 | khugepaged_do_scan(); |
2351 | 2347 | khugepaged_wait_work(); | |
2352 | for (;;) { | ||
2353 | mutex_unlock(&khugepaged_mutex); | ||
2354 | VM_BUG_ON(khugepaged_thread != current); | ||
2355 | khugepaged_loop(); | ||
2356 | VM_BUG_ON(khugepaged_thread != current); | ||
2357 | |||
2358 | mutex_lock(&khugepaged_mutex); | ||
2359 | if (!khugepaged_enabled()) | ||
2360 | break; | ||
2361 | if (unlikely(kthread_should_stop())) | ||
2362 | break; | ||
2363 | } | 2348 | } |
2364 | 2349 | ||
2365 | spin_lock(&khugepaged_mm_lock); | 2350 | spin_lock(&khugepaged_mm_lock); |
@@ -2368,10 +2353,6 @@ static int khugepaged(void *none) | |||
2368 | if (mm_slot) | 2353 | if (mm_slot) |
2369 | collect_mm_slot(mm_slot); | 2354 | collect_mm_slot(mm_slot); |
2370 | spin_unlock(&khugepaged_mm_lock); | 2355 | spin_unlock(&khugepaged_mm_lock); |
2371 | |||
2372 | khugepaged_thread = NULL; | ||
2373 | mutex_unlock(&khugepaged_mutex); | ||
2374 | |||
2375 | return 0; | 2356 | return 0; |
2376 | } | 2357 | } |
2377 | 2358 | ||
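The khugepaged() main loop is reduced to "scan, then wait, until kthread_should_stop()", with start_khugepaged() now stopping the thread via kthread_stop() rather than signalling through shared enable state. A userspace analogy with pthreads and an atomic stop flag is sketched below; it is an illustration only, not kernel code, and the helper names are invented.

/*
 * Pthread analogy for the simplified daemon lifecycle: the worker loops
 * until its owner asks it to stop, then the owner joins it.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool should_stop = false;

static void do_scan(void)       { puts("scan pass"); }       /* khugepaged_do_scan()  */
static void wait_for_work(void) { usleep(100 * 1000); }      /* khugepaged_wait_work() */

static void *worker(void *arg)
{
	(void)arg;
	while (!atomic_load(&should_stop)) {   /* kthread_should_stop() */
		do_scan();
		wait_for_work();
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	pthread_create(&tid, NULL, worker, NULL);   /* kthread_run()  */
	sleep(1);
	atomic_store(&should_stop, true);           /* kthread_stop() */
	pthread_join(tid, NULL);
	return 0;
}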
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc727122dd44..59a0059b39e2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -30,7 +30,6 @@ | |||
30 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | 31 | #include <linux/hugetlb_cgroup.h> |
32 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
34 | #include "internal.h" | 33 | #include "internal.h" |
35 | 34 | ||
36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 35 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page) | |||
637 | h->surplus_huge_pages--; | 636 | h->surplus_huge_pages--; |
638 | h->surplus_huge_pages_node[nid]--; | 637 | h->surplus_huge_pages_node[nid]--; |
639 | } else { | 638 | } else { |
639 | arch_clear_hugepage_flags(page); | ||
640 | enqueue_huge_page(h, page); | 640 | enqueue_huge_page(h, page); |
641 | } | 641 | } |
642 | spin_unlock(&hugetlb_lock); | 642 | spin_unlock(&hugetlb_lock); |
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
671 | } | 671 | } |
672 | } | 672 | } |
673 | 673 | ||
674 | /* | ||
675 | * PageHuge() only returns true for hugetlbfs pages, but not for normal or | ||
676 | * transparent huge pages. See the PageTransHuge() documentation for more | ||
677 | * details. | ||
678 | */ | ||
674 | int PageHuge(struct page *page) | 679 | int PageHuge(struct page *page) |
675 | { | 680 | { |
676 | compound_page_dtor *dtor; | 681 | compound_page_dtor *dtor; |
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2355 | struct page *page; | 2360 | struct page *page; |
2356 | struct hstate *h = hstate_vma(vma); | 2361 | struct hstate *h = hstate_vma(vma); |
2357 | unsigned long sz = huge_page_size(h); | 2362 | unsigned long sz = huge_page_size(h); |
2363 | const unsigned long mmun_start = start; /* For mmu_notifiers */ | ||
2364 | const unsigned long mmun_end = end; /* For mmu_notifiers */ | ||
2358 | 2365 | ||
2359 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2366 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2360 | BUG_ON(start & ~huge_page_mask(h)); | 2367 | BUG_ON(start & ~huge_page_mask(h)); |
2361 | BUG_ON(end & ~huge_page_mask(h)); | 2368 | BUG_ON(end & ~huge_page_mask(h)); |
2362 | 2369 | ||
2363 | tlb_start_vma(tlb, vma); | 2370 | tlb_start_vma(tlb, vma); |
2364 | mmu_notifier_invalidate_range_start(mm, start, end); | 2371 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2365 | again: | 2372 | again: |
2366 | spin_lock(&mm->page_table_lock); | 2373 | spin_lock(&mm->page_table_lock); |
2367 | for (address = start; address < end; address += sz) { | 2374 | for (address = start; address < end; address += sz) { |
@@ -2425,7 +2432,7 @@ again: | |||
2425 | if (address < end && !ref_page) | 2432 | if (address < end && !ref_page) |
2426 | goto again; | 2433 | goto again; |
2427 | } | 2434 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | 2435 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2429 | tlb_end_vma(tlb, vma); | 2436 | tlb_end_vma(tlb, vma); |
2430 | } | 2437 | } |
2431 | 2438 | ||
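Editor's note: the hunk above snapshots the unmap range once into const mmun_start/mmun_end, so mmu_notifier_invalidate_range_start() and ..._end() are guaranteed to see identical bounds even though the loop variables move. A compile-and-run sketch of that bracketing discipline (range_invalidate_start/end are illustrative stand-ins, not kernel API):

#include <stdio.h>

/* Illustrative stand-ins for mmu_notifier_invalidate_range_{start,end}(). */
static void range_invalidate_start(unsigned long s, unsigned long e)
{ printf("invalidate start [%#lx, %#lx)\n", s, e); }
static void range_invalidate_end(unsigned long s, unsigned long e)
{ printf("invalidate end   [%#lx, %#lx)\n", s, e); }

static void unmap_range(unsigned long start, unsigned long end)
{
	/* snapshot the bounds once; the closing notifier must see the same range */
	const unsigned long mmun_start = start;
	const unsigned long mmun_end = end;

	range_invalidate_start(mmun_start, mmun_end);
	while (start < end)
		start += 0x1000;   /* ... tear down one page at a time; 'start' may advance ... */
	range_invalidate_end(mmun_start, mmun_end);   /* unchanged bounds */
}

int main(void)
{
	unmap_range(0x200000, 0x400000);
	return 0;
}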
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2473 | struct hstate *h = hstate_vma(vma); | 2480 | struct hstate *h = hstate_vma(vma); |
2474 | struct vm_area_struct *iter_vma; | 2481 | struct vm_area_struct *iter_vma; |
2475 | struct address_space *mapping; | 2482 | struct address_space *mapping; |
2476 | struct prio_tree_iter iter; | ||
2477 | pgoff_t pgoff; | 2483 | pgoff_t pgoff; |
2478 | 2484 | ||
2479 | /* | 2485 | /* |
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2481 | * from page cache lookup which is in HPAGE_SIZE units. | 2487 | * from page cache lookup which is in HPAGE_SIZE units. |
2482 | */ | 2488 | */ |
2483 | address = address & huge_page_mask(h); | 2489 | address = address & huge_page_mask(h); |
2484 | pgoff = vma_hugecache_offset(h, vma, address); | 2490 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + |
2491 | vma->vm_pgoff; | ||
2485 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; | 2492 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2486 | 2493 | ||
2487 | /* | 2494 | /* |
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2490 | * __unmap_hugepage_range() is called as the lock is already held | 2497 | * __unmap_hugepage_range() is called as the lock is already held |
2491 | */ | 2498 | */ |
2492 | mutex_lock(&mapping->i_mmap_mutex); | 2499 | mutex_lock(&mapping->i_mmap_mutex); |
2493 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2500 | vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) { |
2494 | /* Do not unmap the current VMA */ | 2501 | /* Do not unmap the current VMA */ |
2495 | if (iter_vma == vma) | 2502 | if (iter_vma == vma) |
2496 | continue; | 2503 | continue; |
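Editor's note: unmap_ref_private() now derives the file page offset by hand, pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff, and hands that single offset to vma_interval_tree_foreach() as both ends of the query. A quick worked example of the arithmetic (all values invented):

#include <stdio.h>

#define PAGE_SHIFT 12UL   /* 4 KiB pages */

int main(void)
{
	unsigned long vm_start = 0x40000000UL;   /* hypothetical mapping start */
	unsigned long vm_pgoff = 512;            /* mapping begins 512 pages into the file */
	unsigned long address  = 0x40203000UL;   /* address inside the mapping */

	/* page offset within the file backing this mapping */
	unsigned long pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

	/* 0x203000 bytes past vm_start = 515 pages, plus 512 -> 1027 */
	printf("pgoff = %lu\n", pgoff);
	return 0;
}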
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2525 | struct page *old_page, *new_page; | 2532 | struct page *old_page, *new_page; |
2526 | int avoidcopy; | 2533 | int avoidcopy; |
2527 | int outside_reserve = 0; | 2534 | int outside_reserve = 0; |
2535 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2536 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2528 | 2537 | ||
2529 | old_page = pte_page(pte); | 2538 | old_page = pte_page(pte); |
2530 | 2539 | ||
@@ -2611,6 +2620,9 @@ retry_avoidcopy: | |||
2611 | pages_per_huge_page(h)); | 2620 | pages_per_huge_page(h)); |
2612 | __SetPageUptodate(new_page); | 2621 | __SetPageUptodate(new_page); |
2613 | 2622 | ||
2623 | mmun_start = address & huge_page_mask(h); | ||
2624 | mmun_end = mmun_start + huge_page_size(h); | ||
2625 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2614 | /* | 2626 | /* |
2615 | * Retake the page_table_lock to check for racing updates | 2627 | * Retake the page_table_lock to check for racing updates |
2616 | * before the page tables are altered | 2628 | * before the page tables are altered |
@@ -2619,9 +2631,6 @@ retry_avoidcopy: | |||
2619 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2631 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2620 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2632 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
2621 | /* Break COW */ | 2633 | /* Break COW */ |
2622 | mmu_notifier_invalidate_range_start(mm, | ||
2623 | address & huge_page_mask(h), | ||
2624 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2625 | huge_ptep_clear_flush(vma, address, ptep); | 2634 | huge_ptep_clear_flush(vma, address, ptep); |
2626 | set_huge_pte_at(mm, address, ptep, | 2635 | set_huge_pte_at(mm, address, ptep, |
2627 | make_huge_pte(vma, new_page, 1)); | 2636 | make_huge_pte(vma, new_page, 1)); |
@@ -2629,10 +2638,11 @@ retry_avoidcopy: | |||
2629 | hugepage_add_new_anon_rmap(new_page, vma, address); | 2638 | hugepage_add_new_anon_rmap(new_page, vma, address); |
2630 | /* Make the old page be freed below */ | 2639 | /* Make the old page be freed below */ |
2631 | new_page = old_page; | 2640 | new_page = old_page; |
2632 | mmu_notifier_invalidate_range_end(mm, | ||
2633 | address & huge_page_mask(h), | ||
2634 | (address & huge_page_mask(h)) + huge_page_size(h)); | ||
2635 | } | 2641 | } |
2642 | spin_unlock(&mm->page_table_lock); | ||
2643 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2644 | /* Caller expects lock to be held */ | ||
2645 | spin_lock(&mm->page_table_lock); | ||
2636 | page_cache_release(new_page); | 2646 | page_cache_release(new_page); |
2637 | page_cache_release(old_page); | 2647 | page_cache_release(old_page); |
2638 | return 0; | 2648 | return 0; |
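Editor's note: in hugetlb_cow() the mmu_notifier_invalidate_range_end() call now runs only after spin_unlock(&mm->page_table_lock), because notifier callbacks may sleep, and the lock is re-taken afterwards since the caller expects it held. A toy sketch of that unlock/notify/relock ordering (a pthread mutex stands in for the page-table spinlock; notifier_end() for the sleepable callback):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for mmu_notifier_invalidate_range_end(); it may block,
 * so it must never run with the (spin)lock held. */
static void notifier_end(void)
{
	usleep(1000);   /* pretend a secondary-MMU driver does slow work here */
	puts("notifier end ran without the lock held");
}

static void break_cow(void)
{
	pthread_mutex_lock(&page_table_lock);
	puts("page tables updated under the lock");
	pthread_mutex_unlock(&page_table_lock);   /* drop the lock first ... */

	notifier_end();                           /* ... then run the sleepable callback */

	pthread_mutex_lock(&page_table_lock);     /* caller expects the lock to be held */
	puts("lock re-taken for the caller");
	pthread_mutex_unlock(&page_table_lock);   /* released here only so main() can exit */
}

int main(void)
{
	break_cow();
	return 0;
}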
diff --git a/mm/internal.h b/mm/internal.h index b8c91b342e24..a4fa284f6bc2 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -118,26 +118,27 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
123 | bool sync; /* Synchronous migration */ | 122 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | 123 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
125 | incremental, once free_pfn | 124 | bool finished_update_free; /* True when the zone cached pfns are |
126 | and migrate_pfn meet, we restart | 125 | * no longer being updated |
127 | from the top of the zone; | 126 | */ |
128 | remember we wrapped around. */ | 127 | bool finished_update_migrate; |
129 | 128 | ||
130 | int order; /* order a direct compactor needs */ | 129 | int order; /* order a direct compactor needs */ |
131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 130 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
132 | struct zone *zone; | 131 | struct zone *zone; |
133 | bool *contended; /* True if a lock was contended */ | 132 | bool contended; /* True if a lock was contended */ |
133 | struct page **page; /* Page captured of requested size */ | ||
134 | }; | 134 | }; |
135 | 135 | ||
136 | unsigned long | 136 | unsigned long |
137 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); | 137 | isolate_freepages_range(struct compact_control *cc, |
138 | unsigned long start_pfn, unsigned long end_pfn); | ||
138 | unsigned long | 139 | unsigned long |
139 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 140 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
140 | unsigned long low_pfn, unsigned long end_pfn); | 141 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable); |
141 | 142 | ||
142 | #endif | 143 | #endif |
143 | 144 | ||
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
167 | } | 168 | } |
168 | 169 | ||
169 | /* | 170 | /* |
170 | * Called only in fault path via page_evictable() for a new page | 171 | * Called only in fault path, to determine if a new page is being |
171 | * to determine if it's being mapped into a LOCKED vma. | 172 | * mapped into a LOCKED vma. If it is, mark page as mlocked. |
172 | * If so, mark page as mlocked. | ||
173 | */ | 173 | */ |
174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | 174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
175 | struct page *page) | 175 | struct page *page) |
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | |||
180 | return 0; | 180 | return 0; |
181 | 181 | ||
182 | if (!TestSetPageMlocked(page)) { | 182 | if (!TestSetPageMlocked(page)) { |
183 | inc_zone_page_state(page, NR_MLOCK); | 183 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
184 | hpage_nr_pages(page)); | ||
184 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 185 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
185 | } | 186 | } |
186 | return 1; | 187 | return 1; |
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page); | |||
201 | * If called for a page that is still mapped by mlocked vmas, all we do | 202 | * If called for a page that is still mapped by mlocked vmas, all we do |
202 | * is revert to lazy LRU behaviour -- semantics are not broken. | 203 | * is revert to lazy LRU behaviour -- semantics are not broken. |
203 | */ | 204 | */ |
204 | extern void __clear_page_mlock(struct page *page); | 205 | extern void clear_page_mlock(struct page *page); |
205 | static inline void clear_page_mlock(struct page *page) | ||
206 | { | ||
207 | if (unlikely(TestClearPageMlocked(page))) | ||
208 | __clear_page_mlock(page); | ||
209 | } | ||
210 | 206 | ||
211 | /* | 207 | /* |
212 | * mlock_migrate_page - called only from migrate_page_copy() to | 208 | * mlock_migrate_page - called only from migrate_page_copy() to |
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
340 | #define ZONE_RECLAIM_FULL -1 | 336 | #define ZONE_RECLAIM_FULL -1 |
341 | #define ZONE_RECLAIM_SOME 0 | 337 | #define ZONE_RECLAIM_SOME 0 |
342 | #define ZONE_RECLAIM_SUCCESS 1 | 338 | #define ZONE_RECLAIM_SUCCESS 1 |
343 | #endif | ||
344 | 339 | ||
345 | extern int hwpoison_filter(struct page *p); | 340 | extern int hwpoison_filter(struct page *p); |
346 | 341 | ||
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | |||
356 | unsigned long, unsigned long); | 351 | unsigned long, unsigned long); |
357 | 352 | ||
358 | extern void set_pageblock_order(void); | 353 | extern void set_pageblock_order(void); |
354 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
355 | struct list_head *page_list); | ||
356 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
357 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
358 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
359 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
360 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
361 | |||
362 | /* Mask to get the watermark bits */ | ||
363 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
364 | |||
365 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
366 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
367 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
368 | #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ | ||
369 | |||
370 | #endif /* __MM_INTERNAL_H */ | ||
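Editor's note: the allocation flags moved into internal.h lean on a small bit trick: the two low bits index zone->watermark[] (ALLOC_WMARK_MASK is ALLOC_NO_WATERMARKS - 1, i.e. 0x03), while the higher bits are independent boolean modifiers. A standalone demonstration of decoding such a flags word (flag values copied from the hunk; WMARK_MIN/LOW/HIGH assumed to be 0/1/2 as in the zone code):

#include <stdio.h>

#define ALLOC_WMARK_MIN      0x00
#define ALLOC_WMARK_LOW      0x01
#define ALLOC_WMARK_HIGH     0x02
#define ALLOC_NO_WATERMARKS  0x04   /* don't check watermarks at all */

#define ALLOC_WMARK_MASK     (ALLOC_NO_WATERMARKS - 1)   /* == 0x03 */

#define ALLOC_HARDER         0x10
#define ALLOC_HIGH           0x20
#define ALLOC_CPUSET         0x40
#define ALLOC_CMA            0x80

int main(void)
{
	int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_CMA;

	/* low two bits select the watermark; the rest are boolean modifiers */
	printf("watermark index : %d\n", alloc_flags & ALLOC_WMARK_MASK);   /* 1 */
	printf("check cpuset    : %s\n", (alloc_flags & ALLOC_CPUSET) ? "yes" : "no");
	printf("skip watermarks : %s\n", (alloc_flags & ALLOC_NO_WATERMARKS) ? "yes" : "no");
	return 0;
}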
diff --git a/mm/interval_tree.c b/mm/interval_tree.c new file mode 100644 index 000000000000..4a5822a586e6 --- /dev/null +++ b/mm/interval_tree.c | |||
@@ -0,0 +1,112 @@ | |||
1 | /* | ||
2 | * mm/interval_tree.c - interval tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2012, Michel Lespinasse <walken@google.com> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | */ | ||
8 | |||
9 | #include <linux/mm.h> | ||
10 | #include <linux/fs.h> | ||
11 | #include <linux/rmap.h> | ||
12 | #include <linux/interval_tree_generic.h> | ||
13 | |||
14 | static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) | ||
15 | { | ||
16 | return v->vm_pgoff; | ||
17 | } | ||
18 | |||
19 | static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | ||
20 | { | ||
21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | ||
22 | } | ||
23 | |||
24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | ||
25 | unsigned long, shared.linear.rb_subtree_last, | ||
26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | ||
27 | |||
28 | /* Insert node immediately after prev in the interval tree */ | ||
29 | void vma_interval_tree_insert_after(struct vm_area_struct *node, | ||
30 | struct vm_area_struct *prev, | ||
31 | struct rb_root *root) | ||
32 | { | ||
33 | struct rb_node **link; | ||
34 | struct vm_area_struct *parent; | ||
35 | unsigned long last = vma_last_pgoff(node); | ||
36 | |||
37 | VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); | ||
38 | |||
39 | if (!prev->shared.linear.rb.rb_right) { | ||
40 | parent = prev; | ||
41 | link = &prev->shared.linear.rb.rb_right; | ||
42 | } else { | ||
43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | ||
44 | struct vm_area_struct, shared.linear.rb); | ||
45 | if (parent->shared.linear.rb_subtree_last < last) | ||
46 | parent->shared.linear.rb_subtree_last = last; | ||
47 | while (parent->shared.linear.rb.rb_left) { | ||
48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | ||
49 | struct vm_area_struct, shared.linear.rb); | ||
50 | if (parent->shared.linear.rb_subtree_last < last) | ||
51 | parent->shared.linear.rb_subtree_last = last; | ||
52 | } | ||
53 | link = &parent->shared.linear.rb.rb_left; | ||
54 | } | ||
55 | |||
56 | node->shared.linear.rb_subtree_last = last; | ||
57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | ||
58 | rb_insert_augmented(&node->shared.linear.rb, root, | ||
59 | &vma_interval_tree_augment); | ||
60 | } | ||
61 | |||
62 | static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc) | ||
63 | { | ||
64 | return vma_start_pgoff(avc->vma); | ||
65 | } | ||
66 | |||
67 | static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc) | ||
68 | { | ||
69 | return vma_last_pgoff(avc->vma); | ||
70 | } | ||
71 | |||
72 | INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last, | ||
73 | avc_start_pgoff, avc_last_pgoff, | ||
74 | static inline, __anon_vma_interval_tree) | ||
75 | |||
76 | void anon_vma_interval_tree_insert(struct anon_vma_chain *node, | ||
77 | struct rb_root *root) | ||
78 | { | ||
79 | #ifdef CONFIG_DEBUG_VM_RB | ||
80 | node->cached_vma_start = avc_start_pgoff(node); | ||
81 | node->cached_vma_last = avc_last_pgoff(node); | ||
82 | #endif | ||
83 | __anon_vma_interval_tree_insert(node, root); | ||
84 | } | ||
85 | |||
86 | void anon_vma_interval_tree_remove(struct anon_vma_chain *node, | ||
87 | struct rb_root *root) | ||
88 | { | ||
89 | __anon_vma_interval_tree_remove(node, root); | ||
90 | } | ||
91 | |||
92 | struct anon_vma_chain * | ||
93 | anon_vma_interval_tree_iter_first(struct rb_root *root, | ||
94 | unsigned long first, unsigned long last) | ||
95 | { | ||
96 | return __anon_vma_interval_tree_iter_first(root, first, last); | ||
97 | } | ||
98 | |||
99 | struct anon_vma_chain * | ||
100 | anon_vma_interval_tree_iter_next(struct anon_vma_chain *node, | ||
101 | unsigned long first, unsigned long last) | ||
102 | { | ||
103 | return __anon_vma_interval_tree_iter_next(node, first, last); | ||
104 | } | ||
105 | |||
106 | #ifdef CONFIG_DEBUG_VM_RB | ||
107 | void anon_vma_interval_tree_verify(struct anon_vma_chain *node) | ||
108 | { | ||
109 | WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node)); | ||
110 | WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node)); | ||
111 | } | ||
112 | #endif | ||
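Editor's note: the new interval tree indexes each VMA by the closed range [vma_start_pgoff(), vma_last_pgoff()]; the "- 1" makes last the final page actually covered, not one past it. A small standalone check of that arithmetic and of the overlap test the tree answers (sizes and offsets are arbitrary):

#include <stdio.h>

#define PAGE_SHIFT 12UL

struct vma { unsigned long vm_start, vm_end, vm_pgoff; };

static unsigned long vma_start_pgoff(const struct vma *v) { return v->vm_pgoff; }
static unsigned long vma_last_pgoff(const struct vma *v)
{
	/* last page covered, inclusive - hence the "- 1" */
	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* Does the vma's [start, last] range intersect the query [first, last]? */
static int overlaps(const struct vma *v, unsigned long first, unsigned long last)
{
	return vma_start_pgoff(v) <= last && first <= vma_last_pgoff(v);
}

int main(void)
{
	/* a 5-page mapping starting at file page 10 covers pgoff 10..14 */
	struct vma v = { .vm_start = 0x1000, .vm_end = 0x6000, .vm_pgoff = 10 };

	printf("range [%lu, %lu]\n", vma_start_pgoff(&v), vma_last_pgoff(&v));
	printf("overlaps [14,14]? %d\n", overlaps(&v, 14, 14));   /* 1 */
	printf("overlaps [15,20]? %d\n", overlaps(&v, 15, 20));   /* 0 */
	return 0;
}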
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 45eb6217bf38..a217cc544060 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -29,7 +29,7 @@ | |||
29 | * - kmemleak_lock (rwlock): protects the object_list modifications and | 29 | * - kmemleak_lock (rwlock): protects the object_list modifications and |
30 | * accesses to the object_tree_root. The object_list is the main list | 30 | * accesses to the object_tree_root. The object_list is the main list |
31 | * holding the metadata (struct kmemleak_object) for the allocated memory | 31 | * holding the metadata (struct kmemleak_object) for the allocated memory |
32 | * blocks. The object_tree_root is a priority search tree used to look-up | 32 | * blocks. The object_tree_root is a red black tree used to look-up |
33 | * metadata based on a pointer to the corresponding memory block. The | 33 | * metadata based on a pointer to the corresponding memory block. The |
34 | * kmemleak_object structures are added to the object_list and | 34 | * kmemleak_object structures are added to the object_list and |
35 | * object_tree_root in the create_object() function called from the | 35 | * object_tree_root in the create_object() function called from the |
@@ -71,7 +71,7 @@ | |||
71 | #include <linux/delay.h> | 71 | #include <linux/delay.h> |
72 | #include <linux/export.h> | 72 | #include <linux/export.h> |
73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
74 | #include <linux/prio_tree.h> | 74 | #include <linux/rbtree.h> |
75 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
76 | #include <linux/debugfs.h> | 76 | #include <linux/debugfs.h> |
77 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
@@ -132,7 +132,7 @@ struct kmemleak_scan_area { | |||
132 | * Structure holding the metadata for each allocated memory block. | 132 | * Structure holding the metadata for each allocated memory block. |
133 | * Modifications to such objects should be made while holding the | 133 | * Modifications to such objects should be made while holding the |
134 | * object->lock. Insertions or deletions from object_list, gray_list or | 134 | * object->lock. Insertions or deletions from object_list, gray_list or |
135 | * tree_node are already protected by the corresponding locks or mutex (see | 135 | * rb_node are already protected by the corresponding locks or mutex (see |
136 | * the notes on locking above). These objects are reference-counted | 136 | * the notes on locking above). These objects are reference-counted |
137 | * (use_count) and freed using the RCU mechanism. | 137 | * (use_count) and freed using the RCU mechanism. |
138 | */ | 138 | */ |
@@ -141,7 +141,7 @@ struct kmemleak_object { | |||
141 | unsigned long flags; /* object status flags */ | 141 | unsigned long flags; /* object status flags */ |
142 | struct list_head object_list; | 142 | struct list_head object_list; |
143 | struct list_head gray_list; | 143 | struct list_head gray_list; |
144 | struct prio_tree_node tree_node; | 144 | struct rb_node rb_node; |
145 | struct rcu_head rcu; /* object_list lockless traversal */ | 145 | struct rcu_head rcu; /* object_list lockless traversal */ |
146 | /* object usage count; object freed when use_count == 0 */ | 146 | /* object usage count; object freed when use_count == 0 */ |
147 | atomic_t use_count; | 147 | atomic_t use_count; |
@@ -182,9 +182,9 @@ struct kmemleak_object { | |||
182 | static LIST_HEAD(object_list); | 182 | static LIST_HEAD(object_list); |
183 | /* the list of gray-colored objects (see color_gray comment below) */ | 183 | /* the list of gray-colored objects (see color_gray comment below) */ |
184 | static LIST_HEAD(gray_list); | 184 | static LIST_HEAD(gray_list); |
185 | /* prio search tree for object boundaries */ | 185 | /* search tree for object boundaries */ |
186 | static struct prio_tree_root object_tree_root; | 186 | static struct rb_root object_tree_root = RB_ROOT; |
187 | /* rw_lock protecting the access to object_list and prio_tree_root */ | 187 | /* rw_lock protecting the access to object_list and object_tree_root */ |
188 | static DEFINE_RWLOCK(kmemleak_lock); | 188 | static DEFINE_RWLOCK(kmemleak_lock); |
189 | 189 | ||
190 | /* allocation caches for kmemleak internal data */ | 190 | /* allocation caches for kmemleak internal data */ |
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
380 | trace.entries = object->trace; | 380 | trace.entries = object->trace; |
381 | 381 | ||
382 | pr_notice("Object 0x%08lx (size %zu):\n", | 382 | pr_notice("Object 0x%08lx (size %zu):\n", |
383 | object->tree_node.start, object->size); | 383 | object->pointer, object->size); |
384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", | 384 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", |
385 | object->comm, object->pid, object->jiffies); | 385 | object->comm, object->pid, object->jiffies); |
386 | pr_notice(" min_count = %d\n", object->min_count); | 386 | pr_notice(" min_count = %d\n", object->min_count); |
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object) | |||
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * Look-up a memory block metadata (kmemleak_object) in the priority search | 395 | * Look-up a memory block metadata (kmemleak_object) in the object search |
396 | * tree based on a pointer value. If alias is 0, only values pointing to the | 396 | * tree based on a pointer value. If alias is 0, only values pointing to the |
397 | * beginning of the memory block are allowed. The kmemleak_lock must be held | 397 | * beginning of the memory block are allowed. The kmemleak_lock must be held |
398 | * when calling this function. | 398 | * when calling this function. |
399 | */ | 399 | */ |
400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) | 400 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) |
401 | { | 401 | { |
402 | struct prio_tree_node *node; | 402 | struct rb_node *rb = object_tree_root.rb_node; |
403 | struct prio_tree_iter iter; | 403 | |
404 | struct kmemleak_object *object; | 404 | while (rb) { |
405 | 405 | struct kmemleak_object *object = | |
406 | prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); | 406 | rb_entry(rb, struct kmemleak_object, rb_node); |
407 | node = prio_tree_next(&iter); | 407 | if (ptr < object->pointer) |
408 | if (node) { | 408 | rb = object->rb_node.rb_left; |
409 | object = prio_tree_entry(node, struct kmemleak_object, | 409 | else if (object->pointer + object->size <= ptr) |
410 | tree_node); | 410 | rb = object->rb_node.rb_right; |
411 | if (!alias && object->pointer != ptr) { | 411 | else if (object->pointer == ptr || alias) |
412 | return object; | ||
413 | else { | ||
412 | kmemleak_warn("Found object by alias at 0x%08lx\n", | 414 | kmemleak_warn("Found object by alias at 0x%08lx\n", |
413 | ptr); | 415 | ptr); |
414 | dump_object_info(object); | 416 | dump_object_info(object); |
415 | object = NULL; | 417 | break; |
416 | } | 418 | } |
417 | } else | 419 | } |
418 | object = NULL; | 420 | return NULL; |
419 | |||
420 | return object; | ||
421 | } | 421 | } |
422 | 422 | ||
423 | /* | 423 | /* |
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object) | |||
471 | } | 471 | } |
472 | 472 | ||
473 | /* | 473 | /* |
474 | * Look up an object in the prio search tree and increase its use_count. | 474 | * Look up an object in the object search tree and increase its use_count. |
475 | */ | 475 | */ |
476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | 476 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) |
477 | { | 477 | { |
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
516 | int min_count, gfp_t gfp) | 516 | int min_count, gfp_t gfp) |
517 | { | 517 | { |
518 | unsigned long flags; | 518 | unsigned long flags; |
519 | struct kmemleak_object *object; | 519 | struct kmemleak_object *object, *parent; |
520 | struct prio_tree_node *node; | 520 | struct rb_node **link, *rb_parent; |
521 | 521 | ||
522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); | 522 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); |
523 | if (!object) { | 523 | if (!object) { |
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
560 | /* kernel backtrace */ | 560 | /* kernel backtrace */ |
561 | object->trace_len = __save_stack_trace(object->trace); | 561 | object->trace_len = __save_stack_trace(object->trace); |
562 | 562 | ||
563 | INIT_PRIO_TREE_NODE(&object->tree_node); | ||
564 | object->tree_node.start = ptr; | ||
565 | object->tree_node.last = ptr + size - 1; | ||
566 | |||
567 | write_lock_irqsave(&kmemleak_lock, flags); | 563 | write_lock_irqsave(&kmemleak_lock, flags); |
568 | 564 | ||
569 | min_addr = min(min_addr, ptr); | 565 | min_addr = min(min_addr, ptr); |
570 | max_addr = max(max_addr, ptr + size); | 566 | max_addr = max(max_addr, ptr + size); |
571 | node = prio_tree_insert(&object_tree_root, &object->tree_node); | 567 | link = &object_tree_root.rb_node; |
572 | /* | 568 | rb_parent = NULL; |
573 | * The code calling the kernel does not yet have the pointer to the | 569 | while (*link) { |
574 | * memory block to be able to free it. However, we still hold the | 570 | rb_parent = *link; |
575 | * kmemleak_lock here in case parts of the kernel started freeing | 571 | parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); |
576 | * random memory blocks. | 572 | if (ptr + size <= parent->pointer) |
577 | */ | 573 | link = &parent->rb_node.rb_left; |
578 | if (node != &object->tree_node) { | 574 | else if (parent->pointer + parent->size <= ptr) |
579 | kmemleak_stop("Cannot insert 0x%lx into the object search tree " | 575 | link = &parent->rb_node.rb_right; |
580 | "(already existing)\n", ptr); | 576 | else { |
581 | object = lookup_object(ptr, 1); | 577 | kmemleak_stop("Cannot insert 0x%lx into the object " |
582 | spin_lock(&object->lock); | 578 | "search tree (overlaps existing)\n", |
583 | dump_object_info(object); | 579 | ptr); |
584 | spin_unlock(&object->lock); | 580 | kmem_cache_free(object_cache, object); |
585 | 581 | object = parent; | |
586 | goto out; | 582 | spin_lock(&object->lock); |
583 | dump_object_info(object); | ||
584 | spin_unlock(&object->lock); | ||
585 | goto out; | ||
586 | } | ||
587 | } | 587 | } |
588 | rb_link_node(&object->rb_node, rb_parent, link); | ||
589 | rb_insert_color(&object->rb_node, &object_tree_root); | ||
590 | |||
588 | list_add_tail_rcu(&object->object_list, &object_list); | 591 | list_add_tail_rcu(&object->object_list, &object_list); |
589 | out: | 592 | out: |
590 | write_unlock_irqrestore(&kmemleak_lock, flags); | 593 | write_unlock_irqrestore(&kmemleak_lock, flags); |
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object) | |||
600 | unsigned long flags; | 603 | unsigned long flags; |
601 | 604 | ||
602 | write_lock_irqsave(&kmemleak_lock, flags); | 605 | write_lock_irqsave(&kmemleak_lock, flags); |
603 | prio_tree_remove(&object_tree_root, &object->tree_node); | 606 | rb_erase(&object->rb_node, &object_tree_root); |
604 | list_del_rcu(&object->object_list); | 607 | list_del_rcu(&object->object_list); |
605 | write_unlock_irqrestore(&kmemleak_lock, flags); | 608 | write_unlock_irqrestore(&kmemleak_lock, flags); |
606 | 609 | ||
@@ -1483,13 +1486,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) | |||
1483 | { | 1486 | { |
1484 | struct kmemleak_object *prev_obj = v; | 1487 | struct kmemleak_object *prev_obj = v; |
1485 | struct kmemleak_object *next_obj = NULL; | 1488 | struct kmemleak_object *next_obj = NULL; |
1486 | struct list_head *n = &prev_obj->object_list; | 1489 | struct kmemleak_object *obj = prev_obj; |
1487 | 1490 | ||
1488 | ++(*pos); | 1491 | ++(*pos); |
1489 | 1492 | ||
1490 | list_for_each_continue_rcu(n, &object_list) { | 1493 | list_for_each_entry_continue_rcu(obj, &object_list, object_list) { |
1491 | struct kmemleak_object *obj = | ||
1492 | list_entry(n, struct kmemleak_object, object_list); | ||
1493 | if (get_object(obj)) { | 1494 | if (get_object(obj)) { |
1494 | next_obj = obj; | 1495 | next_obj = obj; |
1495 | break; | 1496 | break; |
@@ -1768,7 +1769,6 @@ void __init kmemleak_init(void) | |||
1768 | 1769 | ||
1769 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | 1770 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); |
1770 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | 1771 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); |
1771 | INIT_PRIO_TREE_ROOT(&object_tree_root); | ||
1772 | 1772 | ||
1773 | if (crt_early_log >= ARRAY_SIZE(early_log)) | 1773 | if (crt_early_log >= ARRAY_SIZE(early_log)) |
1774 | pr_warning("Early log buffer exceeded (%d), please increase " | 1774 | pr_warning("Early log buffer exceeded (%d), please increase " |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
709 | spinlock_t *ptl; | 709 | spinlock_t *ptl; |
710 | int swapped; | 710 | int swapped; |
711 | int err = -EFAULT; | 711 | int err = -EFAULT; |
712 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
713 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
712 | 714 | ||
713 | addr = page_address_in_vma(page, vma); | 715 | addr = page_address_in_vma(page, vma); |
714 | if (addr == -EFAULT) | 716 | if (addr == -EFAULT) |
715 | goto out; | 717 | goto out; |
716 | 718 | ||
717 | BUG_ON(PageTransCompound(page)); | 719 | BUG_ON(PageTransCompound(page)); |
720 | |||
721 | mmun_start = addr; | ||
722 | mmun_end = addr + PAGE_SIZE; | ||
723 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
724 | |||
718 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 725 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
719 | if (!ptep) | 726 | if (!ptep) |
720 | goto out; | 727 | goto out_mn; |
721 | 728 | ||
722 | if (pte_write(*ptep) || pte_dirty(*ptep)) { | 729 | if (pte_write(*ptep) || pte_dirty(*ptep)) { |
723 | pte_t entry; | 730 | pte_t entry; |
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
752 | 759 | ||
753 | out_unlock: | 760 | out_unlock: |
754 | pte_unmap_unlock(ptep, ptl); | 761 | pte_unmap_unlock(ptep, ptl); |
762 | out_mn: | ||
763 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
755 | out: | 764 | out: |
756 | return err; | 765 | return err; |
757 | } | 766 | } |
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
776 | spinlock_t *ptl; | 785 | spinlock_t *ptl; |
777 | unsigned long addr; | 786 | unsigned long addr; |
778 | int err = -EFAULT; | 787 | int err = -EFAULT; |
788 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
789 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
779 | 790 | ||
780 | addr = page_address_in_vma(page, vma); | 791 | addr = page_address_in_vma(page, vma); |
781 | if (addr == -EFAULT) | 792 | if (addr == -EFAULT) |
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
794 | if (!pmd_present(*pmd)) | 805 | if (!pmd_present(*pmd)) |
795 | goto out; | 806 | goto out; |
796 | 807 | ||
808 | mmun_start = addr; | ||
809 | mmun_end = addr + PAGE_SIZE; | ||
810 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
811 | |||
797 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | 812 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); |
798 | if (!pte_same(*ptep, orig_pte)) { | 813 | if (!pte_same(*ptep, orig_pte)) { |
799 | pte_unmap_unlock(ptep, ptl); | 814 | pte_unmap_unlock(ptep, ptl); |
800 | goto out; | 815 | goto out_mn; |
801 | } | 816 | } |
802 | 817 | ||
803 | get_page(kpage); | 818 | get_page(kpage); |
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
814 | 829 | ||
815 | pte_unmap_unlock(ptep, ptl); | 830 | pte_unmap_unlock(ptep, ptl); |
816 | err = 0; | 831 | err = 0; |
832 | out_mn: | ||
833 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
817 | out: | 834 | out: |
818 | return err; | 835 | return err; |
819 | } | 836 | } |
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1469 | */ | 1486 | */ |
1470 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1487 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1471 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1488 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1472 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1489 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) |
1473 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) | ||
1474 | return 0; /* just ignore the advice */ | 1490 | return 0; /* just ignore the advice */ |
1475 | 1491 | ||
1492 | #ifdef VM_SAO | ||
1493 | if (*vm_flags & VM_SAO) | ||
1494 | return 0; | ||
1495 | #endif | ||
1496 | |||
1476 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1497 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
1477 | err = __ksm_enter(mm); | 1498 | err = __ksm_enter(mm); |
1478 | if (err) | 1499 | if (err) |
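Editor's note: ksm_madvise() is what backs madvise(MADV_MERGEABLE); the hunk above only tightens which VMAs it refuses and makes the powerpc-only VM_SAO check conditional. The userspace call itself is unchanged. A minimal, runnable example of opting an anonymous region into KSM (needs a kernel built with CONFIG_KSM; error handling kept short):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) { perror("mmap"); return 1; }

	memset(p, 0x5a, len);   /* identical pages are merge candidates */

	/* Ask KSM to consider this range; VMAs it cannot handle
	 * (VM_PFNMAP, VM_HUGETLB, ...) simply have the advice ignored. */
	if (madvise(p, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");
	else
		puts("range registered with KSM");

	madvise(p, len, MADV_UNMERGEABLE);   /* and opt back out */
	munmap(p, len);
	return 0;
}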
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page, | |||
1582 | SetPageSwapBacked(new_page); | 1603 | SetPageSwapBacked(new_page); |
1583 | __set_page_locked(new_page); | 1604 | __set_page_locked(new_page); |
1584 | 1605 | ||
1585 | if (page_evictable(new_page, vma)) | 1606 | if (!mlocked_vma_newpage(vma, new_page)) |
1586 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | 1607 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); |
1587 | else | 1608 | else |
1588 | add_page_to_unevictable_list(new_page); | 1609 | add_page_to_unevictable_list(new_page); |
@@ -1614,7 +1635,8 @@ again: | |||
1614 | struct vm_area_struct *vma; | 1635 | struct vm_area_struct *vma; |
1615 | 1636 | ||
1616 | anon_vma_lock(anon_vma); | 1637 | anon_vma_lock(anon_vma); |
1617 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1638 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1639 | 0, ULONG_MAX) { | ||
1618 | vma = vmac->vma; | 1640 | vma = vmac->vma; |
1619 | if (rmap_item->address < vma->vm_start || | 1641 | if (rmap_item->address < vma->vm_start || |
1620 | rmap_item->address >= vma->vm_end) | 1642 | rmap_item->address >= vma->vm_end) |
@@ -1667,7 +1689,8 @@ again: | |||
1667 | struct vm_area_struct *vma; | 1689 | struct vm_area_struct *vma; |
1668 | 1690 | ||
1669 | anon_vma_lock(anon_vma); | 1691 | anon_vma_lock(anon_vma); |
1670 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1692 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1693 | 0, ULONG_MAX) { | ||
1671 | vma = vmac->vma; | 1694 | vma = vmac->vma; |
1672 | if (rmap_item->address < vma->vm_start || | 1695 | if (rmap_item->address < vma->vm_start || |
1673 | rmap_item->address >= vma->vm_end) | 1696 | rmap_item->address >= vma->vm_end) |
@@ -1719,7 +1742,8 @@ again: | |||
1719 | struct vm_area_struct *vma; | 1742 | struct vm_area_struct *vma; |
1720 | 1743 | ||
1721 | anon_vma_lock(anon_vma); | 1744 | anon_vma_lock(anon_vma); |
1722 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | 1745 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1746 | 0, ULONG_MAX) { | ||
1723 | vma = vmac->vma; | 1747 | vma = vmac->vma; |
1724 | if (rmap_item->address < vma->vm_start || | 1748 | if (rmap_item->address < vma->vm_start || |
1725 | rmap_item->address >= vma->vm_end) | 1749 | rmap_item->address >= vma->vm_end) |
diff --git a/mm/madvise.c b/mm/madvise.c index 14d260fa0d17..03dfa5c7adb3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
69 | new_flags &= ~VM_DONTCOPY; | 69 | new_flags &= ~VM_DONTCOPY; |
70 | break; | 70 | break; |
71 | case MADV_DONTDUMP: | 71 | case MADV_DONTDUMP: |
72 | new_flags |= VM_NODUMP; | 72 | new_flags |= VM_DONTDUMP; |
73 | break; | 73 | break; |
74 | case MADV_DODUMP: | 74 | case MADV_DODUMP: |
75 | new_flags &= ~VM_NODUMP; | 75 | if (new_flags & VM_SPECIAL) { |
76 | error = -EINVAL; | ||
77 | goto out; | ||
78 | } | ||
79 | new_flags &= ~VM_DONTDUMP; | ||
76 | break; | 80 | break; |
77 | case MADV_MERGEABLE: | 81 | case MADV_MERGEABLE: |
78 | case MADV_UNMERGEABLE: | 82 | case MADV_UNMERGEABLE: |
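Editor's note: the flag rename is visible to userspace through MADV_DONTDUMP/MADV_DODUMP, which set and clear VM_DONTDUMP on the VMA (with MADV_DODUMP now rejected for VM_SPECIAL mappings). A short runnable example that excludes a buffer from core dumps and then re-includes it:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;
	void *secret = mmap(NULL, len, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (secret == MAP_FAILED) { perror("mmap"); return 1; }

	/* Keep this region out of any core dump (sets VM_DONTDUMP). */
	if (madvise(secret, len, MADV_DONTDUMP))
		perror("madvise(MADV_DONTDUMP)");

	/* ... use the buffer for sensitive data ... */

	/* Put it back into core dumps (clears VM_DONTDUMP). */
	if (madvise(secret, len, MADV_DODUMP))
		perror("madvise(MADV_DODUMP)");

	munmap(secret, len);
	return 0;
}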
diff --git a/mm/memblock.c b/mm/memblock.c index 82aa349d2f7a..625905523c2a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0; | |||
41 | static int memblock_reserved_in_slab __initdata_memblock = 0; | 41 | static int memblock_reserved_in_slab __initdata_memblock = 0; |
42 | 42 | ||
43 | /* inline so we don't get a warning when pr_debug is compiled out */ | 43 | /* inline so we don't get a warning when pr_debug is compiled out */ |
44 | static inline const char *memblock_type_name(struct memblock_type *type) | 44 | static __init_memblock const char * |
45 | memblock_type_name(struct memblock_type *type) | ||
45 | { | 46 | { |
46 | if (type == &memblock.memory) | 47 | if (type == &memblock.memory) |
47 | return "memory"; | 48 | return "memory"; |
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, | |||
756 | return ret; | 757 | return ret; |
757 | 758 | ||
758 | for (i = start_rgn; i < end_rgn; i++) | 759 | for (i = start_rgn; i < end_rgn; i++) |
759 | type->regions[i].nid = nid; | 760 | memblock_set_region_node(&type->regions[i], nid); |
760 | 761 | ||
761 | memblock_merge_regions(type); | 762 | memblock_merge_regions(type); |
762 | return 0; | 763 | return 0; |
@@ -929,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si | |||
929 | return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; | 930 | return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; |
930 | } | 931 | } |
931 | 932 | ||
933 | void __init_memblock memblock_trim_memory(phys_addr_t align) | ||
934 | { | ||
935 | int i; | ||
936 | phys_addr_t start, end, orig_start, orig_end; | ||
937 | struct memblock_type *mem = &memblock.memory; | ||
938 | |||
939 | for (i = 0; i < mem->cnt; i++) { | ||
940 | orig_start = mem->regions[i].base; | ||
941 | orig_end = mem->regions[i].base + mem->regions[i].size; | ||
942 | start = round_up(orig_start, align); | ||
943 | end = round_down(orig_end, align); | ||
944 | |||
945 | if (start == orig_start && end == orig_end) | ||
946 | continue; | ||
947 | |||
948 | if (start < end) { | ||
949 | mem->regions[i].base = start; | ||
950 | mem->regions[i].size = end - start; | ||
951 | } else { | ||
952 | memblock_remove_region(mem, i); | ||
953 | i--; | ||
954 | } | ||
955 | } | ||
956 | } | ||
932 | 957 | ||
933 | void __init_memblock memblock_set_current_limit(phys_addr_t limit) | 958 | void __init_memblock memblock_set_current_limit(phys_addr_t limit) |
934 | { | 959 | { |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 795e525afaba..7acf43bf04a2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/oom.h> | 51 | #include <linux/oom.h> |
52 | #include "internal.h" | 52 | #include "internal.h" |
53 | #include <net/sock.h> | 53 | #include <net/sock.h> |
54 | #include <net/ip.h> | ||
54 | #include <net/tcp_memcontrol.h> | 55 | #include <net/tcp_memcontrol.h> |
55 | 56 | ||
56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
@@ -326,7 +327,7 @@ struct mem_cgroup { | |||
326 | struct mem_cgroup_stat_cpu nocpu_base; | 327 | struct mem_cgroup_stat_cpu nocpu_base; |
327 | spinlock_t pcp_counter_lock; | 328 | spinlock_t pcp_counter_lock; |
328 | 329 | ||
329 | #ifdef CONFIG_INET | 330 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
330 | struct tcp_memcontrol tcp_mem; | 331 | struct tcp_memcontrol tcp_mem; |
331 | #endif | 332 | #endif |
332 | }; | 333 | }; |
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | |||
411 | return container_of(s, struct mem_cgroup, css); | 412 | return container_of(s, struct mem_cgroup, css); |
412 | } | 413 | } |
413 | 414 | ||
415 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
416 | { | ||
417 | return (memcg == root_mem_cgroup); | ||
418 | } | ||
419 | |||
414 | /* Writing them here to avoid exposing memcg's inner layout */ | 420 | /* Writing them here to avoid exposing memcg's inner layout */ |
415 | #ifdef CONFIG_MEMCG_KMEM | 421 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
416 | #include <net/sock.h> | ||
417 | #include <net/ip.h> | ||
418 | 422 | ||
419 | static bool mem_cgroup_is_root(struct mem_cgroup *memcg); | ||
420 | void sock_update_memcg(struct sock *sk) | 423 | void sock_update_memcg(struct sock *sk) |
421 | { | 424 | { |
422 | if (mem_cgroup_sockets_enabled) { | 425 | if (mem_cgroup_sockets_enabled) { |
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk) | |||
461 | } | 464 | } |
462 | } | 465 | } |
463 | 466 | ||
464 | #ifdef CONFIG_INET | ||
465 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | 467 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) |
466 | { | 468 | { |
467 | if (!memcg || mem_cgroup_is_root(memcg)) | 469 | if (!memcg || mem_cgroup_is_root(memcg)) |
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
470 | return &memcg->tcp_mem.cg_proto; | 472 | return &memcg->tcp_mem.cg_proto; |
471 | } | 473 | } |
472 | EXPORT_SYMBOL(tcp_proto_cgroup); | 474 | EXPORT_SYMBOL(tcp_proto_cgroup); |
473 | #endif /* CONFIG_INET */ | ||
474 | #endif /* CONFIG_MEMCG_KMEM */ | ||
475 | 475 | ||
476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) | ||
477 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 476 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
478 | { | 477 | { |
479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 478 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
1016 | iter != NULL; \ | 1015 | iter != NULL; \ |
1017 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1018 | 1017 | ||
1019 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | ||
1020 | { | ||
1021 | return (memcg == root_mem_cgroup); | ||
1022 | } | ||
1023 | |||
1024 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1025 | { | 1019 | { |
1026 | struct mem_cgroup *memcg; | 1020 | struct mem_cgroup *memcg; |
@@ -4973,6 +4967,13 @@ mem_cgroup_create(struct cgroup *cont) | |||
4973 | } else { | 4967 | } else { |
4974 | res_counter_init(&memcg->res, NULL); | 4968 | res_counter_init(&memcg->res, NULL); |
4975 | res_counter_init(&memcg->memsw, NULL); | 4969 | res_counter_init(&memcg->memsw, NULL); |
4970 | /* | ||
4971 | * Deeper hierarchy with use_hierarchy == false doesn't make | ||
4972 | * much sense so let cgroup subsystem know about this | ||
4973 | * unfortunate state in our controller. | ||
4974 | */ | ||
4975 | if (parent && parent != root_mem_cgroup) | ||
4976 | mem_cgroup_subsys.broken_hierarchy = true; | ||
4976 | } | 4977 | } |
4977 | memcg->last_scanned_node = MAX_NUMNODES; | 4978 | memcg->last_scanned_node = MAX_NUMNODES; |
4978 | INIT_LIST_HEAD(&memcg->oom_notify); | 4979 | INIT_LIST_HEAD(&memcg->oom_notify); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a6e2141a6610..6c5899b9034a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
400 | struct vm_area_struct *vma; | 400 | struct vm_area_struct *vma; |
401 | struct task_struct *tsk; | 401 | struct task_struct *tsk; |
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | ||
403 | 404 | ||
404 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma(page); |
405 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
406 | return; | 407 | return; |
407 | 408 | ||
409 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
408 | read_lock(&tasklist_lock); | 410 | read_lock(&tasklist_lock); |
409 | for_each_process (tsk) { | 411 | for_each_process (tsk) { |
410 | struct anon_vma_chain *vmac; | 412 | struct anon_vma_chain *vmac; |
411 | 413 | ||
412 | if (!task_early_kill(tsk)) | 414 | if (!task_early_kill(tsk)) |
413 | continue; | 415 | continue; |
414 | list_for_each_entry(vmac, &av->head, same_anon_vma) { | 416 | anon_vma_interval_tree_foreach(vmac, &av->rb_root, |
417 | pgoff, pgoff) { | ||
415 | vma = vmac->vma; | 418 | vma = vmac->vma; |
416 | if (!page_mapped_in_vma(page, vma)) | 419 | if (!page_mapped_in_vma(page, vma)) |
417 | continue; | 420 | continue; |
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
431 | { | 434 | { |
432 | struct vm_area_struct *vma; | 435 | struct vm_area_struct *vma; |
433 | struct task_struct *tsk; | 436 | struct task_struct *tsk; |
434 | struct prio_tree_iter iter; | ||
435 | struct address_space *mapping = page->mapping; | 437 | struct address_space *mapping = page->mapping; |
436 | 438 | ||
437 | mutex_lock(&mapping->i_mmap_mutex); | 439 | mutex_lock(&mapping->i_mmap_mutex); |
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
442 | if (!task_early_kill(tsk)) | 444 | if (!task_early_kill(tsk)) |
443 | continue; | 445 | continue; |
444 | 446 | ||
445 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, | 447 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, |
446 | pgoff) { | 448 | pgoff) { |
447 | /* | 449 | /* |
448 | * Send early kill signal to tasks where a vma covers | 450 | * Send early kill signal to tasks where a vma covers |
diff --git a/mm/memory.c b/mm/memory.c index 57361708d1a5..fb135ba4aba9 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
712 | add_taint(TAINT_BAD_PAGE); | 712 | add_taint(TAINT_BAD_PAGE); |
713 | } | 713 | } |
714 | 714 | ||
715 | static inline int is_cow_mapping(vm_flags_t flags) | 715 | static inline bool is_cow_mapping(vm_flags_t flags) |
716 | { | 716 | { |
717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
718 | } | 718 | } |
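Editor's note: is_cow_mapping() (now returning bool) encodes "private mapping that could ever be written": VM_SHARED clear and VM_MAYWRITE set. A tiny table-driven check of which flag combinations count as copy-on-write (the bit values are arbitrary stand-ins for the kernel's VM_* flags):

#include <stdbool.h>
#include <stdio.h>

/* Arbitrary bit values standing in for the kernel's VM_* flags. */
#define VM_SHARED    0x1
#define VM_MAYWRITE  0x2

typedef unsigned long vm_flags_t;

static bool is_cow_mapping(vm_flags_t flags)
{
	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}

int main(void)
{
	struct { const char *desc; vm_flags_t flags; } cases[] = {
		{ "MAP_PRIVATE, writable   ", VM_MAYWRITE },              /* COW */
		{ "MAP_SHARED, writable    ", VM_SHARED | VM_MAYWRITE },  /* not COW */
		{ "MAP_PRIVATE, never write", 0 },                        /* not COW */
	};

	for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("%s -> %s\n", cases[i].desc,
		       is_cow_mapping(cases[i].flags) ? "COW" : "not COW");
	return 0;
}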
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1039 | unsigned long next; | 1039 | unsigned long next; |
1040 | unsigned long addr = vma->vm_start; | 1040 | unsigned long addr = vma->vm_start; |
1041 | unsigned long end = vma->vm_end; | 1041 | unsigned long end = vma->vm_end; |
1042 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1043 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1044 | bool is_cow; | ||
1042 | int ret; | 1045 | int ret; |
1043 | 1046 | ||
1044 | /* | 1047 | /* |
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1047 | * readonly mappings. The tradeoff is that copy_page_range is more | 1050 | * readonly mappings. The tradeoff is that copy_page_range is more |
1048 | * efficient than faulting. | 1051 | * efficient than faulting. |
1049 | */ | 1052 | */ |
1050 | if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { | 1053 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | |
1054 | VM_PFNMAP | VM_MIXEDMAP))) { | ||
1051 | if (!vma->anon_vma) | 1055 | if (!vma->anon_vma) |
1052 | return 0; | 1056 | return 0; |
1053 | } | 1057 | } |
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1055 | if (is_vm_hugetlb_page(vma)) | 1059 | if (is_vm_hugetlb_page(vma)) |
1056 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1060 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
1057 | 1061 | ||
1058 | if (unlikely(is_pfn_mapping(vma))) { | 1062 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
1059 | /* | 1063 | /* |
1060 | * We do not free on error cases below as remove_vma | 1064 | * We do not free on error cases below as remove_vma |
1061 | * gets called on error from higher level routine | 1065 | * gets called on error from higher level routine |
1062 | */ | 1066 | */ |
1063 | ret = track_pfn_vma_copy(vma); | 1067 | ret = track_pfn_copy(vma); |
1064 | if (ret) | 1068 | if (ret) |
1065 | return ret; | 1069 | return ret; |
1066 | } | 1070 | } |
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1071 | * parent mm. And a permission downgrade will only happen if | 1075 | * parent mm. And a permission downgrade will only happen if |
1072 | * is_cow_mapping() returns true. | 1076 | * is_cow_mapping() returns true. |
1073 | */ | 1077 | */ |
1074 | if (is_cow_mapping(vma->vm_flags)) | 1078 | is_cow = is_cow_mapping(vma->vm_flags); |
1075 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | 1079 | mmun_start = addr; |
1080 | mmun_end = end; | ||
1081 | if (is_cow) | ||
1082 | mmu_notifier_invalidate_range_start(src_mm, mmun_start, | ||
1083 | mmun_end); | ||
1076 | 1084 | ||
1077 | ret = 0; | 1085 | ret = 0; |
1078 | dst_pgd = pgd_offset(dst_mm, addr); | 1086 | dst_pgd = pgd_offset(dst_mm, addr); |
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1088 | } | 1096 | } |
1089 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 1097 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
1090 | 1098 | ||
1091 | if (is_cow_mapping(vma->vm_flags)) | 1099 | if (is_cow) |
1092 | mmu_notifier_invalidate_range_end(src_mm, | 1100 | mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end); |
1093 | vma->vm_start, end); | ||
1094 | return ret; | 1101 | return ret; |
1095 | } | 1102 | } |
1096 | 1103 | ||
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1327 | if (vma->vm_file) | 1334 | if (vma->vm_file) |
1328 | uprobe_munmap(vma, start, end); | 1335 | uprobe_munmap(vma, start, end); |
1329 | 1336 | ||
1330 | if (unlikely(is_pfn_mapping(vma))) | 1337 | if (unlikely(vma->vm_flags & VM_PFNMAP)) |
1331 | untrack_pfn_vma(vma, 0, 0); | 1338 | untrack_pfn(vma, 0, 0); |
1332 | 1339 | ||
1333 | if (start != end) { | 1340 | if (start != end) { |
1334 | if (unlikely(is_vm_hugetlb_page(vma))) { | 1341 | if (unlikely(is_vm_hugetlb_page(vma))) { |
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1521 | spin_unlock(&mm->page_table_lock); | 1528 | spin_unlock(&mm->page_table_lock); |
1522 | wait_split_huge_page(vma->anon_vma, pmd); | 1529 | wait_split_huge_page(vma->anon_vma, pmd); |
1523 | } else { | 1530 | } else { |
1524 | page = follow_trans_huge_pmd(mm, address, | 1531 | page = follow_trans_huge_pmd(vma, address, |
1525 | pmd, flags); | 1532 | pmd, flags); |
1526 | spin_unlock(&mm->page_table_lock); | 1533 | spin_unlock(&mm->page_table_lock); |
1527 | goto out; | 1534 | goto out; |
@@ -1576,12 +1583,12 @@ split_fallthrough: | |||
1576 | if (page->mapping && trylock_page(page)) { | 1583 | if (page->mapping && trylock_page(page)) { |
1577 | lru_add_drain(); /* push cached pages to LRU */ | 1584 | lru_add_drain(); /* push cached pages to LRU */ |
1578 | /* | 1585 | /* |
1579 | * Because we lock page here and migration is | 1586 | * Because we lock page here, and migration is |
1580 | * blocked by the pte's page reference, we need | 1587 | * blocked by the pte's page reference, and we |
1581 | * only check for file-cache page truncation. | 1588 | * know the page is still mapped, we don't even |
1589 | * need to check for file-cache page truncation. | ||
1582 | */ | 1590 | */ |
1583 | if (page->mapping) | 1591 | mlock_vma_page(page); |
1584 | mlock_vma_page(page); | ||
1585 | unlock_page(page); | 1592 | unlock_page(page); |
1586 | } | 1593 | } |
1587 | } | 1594 | } |
@@ -2085,6 +2092,11 @@ out: | |||
2085 | * ask for a shared writable mapping! | 2092 | * ask for a shared writable mapping! |
2086 | * | 2093 | * |
2087 | * The page does not need to be reserved. | 2094 | * The page does not need to be reserved. |
2095 | * | ||
2096 | * Usually this function is called from f_op->mmap() handler | ||
2097 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. | ||
2098 | * Caller must set VM_MIXEDMAP on vma if it wants to call this | ||
2099 | * function from other places, for example from page-fault handler. | ||
2088 | */ | 2100 | */ |
2089 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 2101 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
2090 | struct page *page) | 2102 | struct page *page) |
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
2093 | return -EFAULT; | 2105 | return -EFAULT; |
2094 | if (!page_count(page)) | 2106 | if (!page_count(page)) |
2095 | return -EINVAL; | 2107 | return -EINVAL; |
2096 | vma->vm_flags |= VM_INSERTPAGE; | 2108 | if (!(vma->vm_flags & VM_MIXEDMAP)) { |
2109 | BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); | ||
2110 | BUG_ON(vma->vm_flags & VM_PFNMAP); | ||
2111 | vma->vm_flags |= VM_MIXEDMAP; | ||
2112 | } | ||
2097 | return insert_page(vma, addr, page, vma->vm_page_prot); | 2113 | return insert_page(vma, addr, page, vma->vm_page_prot); |
2098 | } | 2114 | } |
2099 | EXPORT_SYMBOL(vm_insert_page); | 2115 | EXPORT_SYMBOL(vm_insert_page); |
@@ -2132,7 +2148,7 @@ out: | |||
2132 | * @addr: target user address of this page | 2148 | * @addr: target user address of this page |
2133 | * @pfn: source kernel pfn | 2149 | * @pfn: source kernel pfn |
2134 | * | 2150 | * |
2135 | * Similar to vm_inert_page, this allows drivers to insert individual pages | 2151 | * Similar to vm_insert_page, this allows drivers to insert individual pages |
2136 | * they've allocated into a user vma. Same comments apply. | 2152 | * they've allocated into a user vma. Same comments apply. |
2137 | * | 2153 | * |
2138 | * This function should only be called from a vm_ops->fault handler, and | 2154 | * This function should only be called from a vm_ops->fault handler, and |
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
2162 | 2178 | ||
2163 | if (addr < vma->vm_start || addr >= vma->vm_end) | 2179 | if (addr < vma->vm_start || addr >= vma->vm_end) |
2164 | return -EFAULT; | 2180 | return -EFAULT; |
2165 | if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) | 2181 | if (track_pfn_insert(vma, &pgprot, pfn)) |
2166 | return -EINVAL; | 2182 | return -EINVAL; |
2167 | 2183 | ||
2168 | ret = insert_pfn(vma, addr, pfn, pgprot); | 2184 | ret = insert_pfn(vma, addr, pfn, pgprot); |
2169 | 2185 | ||
2170 | if (ret) | ||
2171 | untrack_pfn_vma(vma, pfn, PAGE_SIZE); | ||
2172 | |||
2173 | return ret; | 2186 | return ret; |
2174 | } | 2187 | } |
2175 | EXPORT_SYMBOL(vm_insert_pfn); | 2188 | EXPORT_SYMBOL(vm_insert_pfn); |
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2290 | * rest of the world about it: | 2303 | * rest of the world about it: |
2291 | * VM_IO tells people not to look at these pages | 2304 | * VM_IO tells people not to look at these pages |
2292 | * (accesses can have side effects). | 2305 | * (accesses can have side effects). |
2293 | * VM_RESERVED is specified all over the place, because | ||
2294 | * in 2.4 it kept swapout's vma scan off this vma; but | ||
2295 | * in 2.6 the LRU scan won't even find its pages, so this | ||
2296 | * flag means no more than count its pages in reserved_vm, | ||
2297 | * and omit it from core dump, even when VM_IO turned off. | ||
2298 | * VM_PFNMAP tells the core MM that the base pages are just | 2306 | * VM_PFNMAP tells the core MM that the base pages are just |
2299 | * raw PFN mappings, and do not have a "struct page" associated | 2307 | * raw PFN mappings, and do not have a "struct page" associated |
2300 | * with them. | 2308 | * with them. |
2309 | * VM_DONTEXPAND | ||
2310 | * Disable vma merging and expanding with mremap(). | ||
2311 | * VM_DONTDUMP | ||
2312 | * Omit vma from core dump, even when VM_IO turned off. | ||
2301 | * | 2313 | * |
2302 | * There's a horrible special case to handle copy-on-write | 2314 | * There's a horrible special case to handle copy-on-write |
2303 | * behaviour that some programs depend on. We mark the "original" | 2315 | * behaviour that some programs depend on. We mark the "original" |
2304 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". | 2316 | * un-COW'ed pages by matching them up with "vma->vm_pgoff". |
2317 | * See vm_normal_page() for details. | ||
2305 | */ | 2318 | */ |
2306 | if (addr == vma->vm_start && end == vma->vm_end) { | 2319 | if (is_cow_mapping(vma->vm_flags)) { |
2320 | if (addr != vma->vm_start || end != vma->vm_end) | ||
2321 | return -EINVAL; | ||
2307 | vma->vm_pgoff = pfn; | 2322 | vma->vm_pgoff = pfn; |
2308 | vma->vm_flags |= VM_PFN_AT_MMAP; | 2323 | } |
2309 | } else if (is_cow_mapping(vma->vm_flags)) | ||
2310 | return -EINVAL; | ||
2311 | |||
2312 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | ||
2313 | 2324 | ||
2314 | err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); | 2325 | err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); |
2315 | if (err) { | 2326 | if (err) |
2316 | /* | ||
2317 | * To indicate that track_pfn related cleanup is not | ||
2318 | * needed from higher level routine calling unmap_vmas | ||
2319 | */ | ||
2320 | vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); | ||
2321 | vma->vm_flags &= ~VM_PFN_AT_MMAP; | ||
2322 | return -EINVAL; | 2327 | return -EINVAL; |
2323 | } | 2328 | |
2329 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; | ||
2324 | 2330 | ||
2325 | BUG_ON(addr >= end); | 2331 | BUG_ON(addr >= end); |
2326 | pfn -= addr >> PAGE_SHIFT; | 2332 | pfn -= addr >> PAGE_SHIFT; |
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
2335 | } while (pgd++, addr = next, addr != end); | 2341 | } while (pgd++, addr = next, addr != end); |
2336 | 2342 | ||
2337 | if (err) | 2343 | if (err) |
2338 | untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); | 2344 | untrack_pfn(vma, pfn, PAGE_ALIGN(size)); |
2339 | 2345 | ||
2340 | return err; | 2346 | return err; |
2341 | } | 2347 | } |
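Note: remap_pfn_range() now applies VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP itself (VM_RESERVED is retired, its remaining meanings split into VM_DONTEXPAND and VM_DONTDUMP), and the whole-vma requirement is enforced only for COW mappings. A typical driver ->mmap therefore reduces to one call; DEMO_MMIO_PHYS is a made-up device address and pgprot_noncached() is assumed to be what the device wants:

        static int demo_mmap(struct file *file, struct vm_area_struct *vma)
        {
                unsigned long size = vma->vm_end - vma->vm_start;

                vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
                return remap_pfn_range(vma, vma->vm_start,
                                       DEMO_MMIO_PHYS >> PAGE_SHIFT,
                                       size, vma->vm_page_prot);
        }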
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2516 | spinlock_t *ptl, pte_t orig_pte) | 2522 | spinlock_t *ptl, pte_t orig_pte) |
2517 | __releases(ptl) | 2523 | __releases(ptl) |
2518 | { | 2524 | { |
2519 | struct page *old_page, *new_page; | 2525 | struct page *old_page, *new_page = NULL; |
2520 | pte_t entry; | 2526 | pte_t entry; |
2521 | int ret = 0; | 2527 | int ret = 0; |
2522 | int page_mkwrite = 0; | 2528 | int page_mkwrite = 0; |
2523 | struct page *dirty_page = NULL; | 2529 | struct page *dirty_page = NULL; |
2530 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2531 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2532 | bool mmun_called = false; /* For mmu_notifiers */ | ||
2524 | 2533 | ||
2525 | old_page = vm_normal_page(vma, address, orig_pte); | 2534 | old_page = vm_normal_page(vma, address, orig_pte); |
2526 | if (!old_page) { | 2535 | if (!old_page) { |
@@ -2698,6 +2707,11 @@ gotten: | |||
2698 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2707 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
2699 | goto oom_free_new; | 2708 | goto oom_free_new; |
2700 | 2709 | ||
2710 | mmun_start = address & PAGE_MASK; | ||
2711 | mmun_end = (address & PAGE_MASK) + PAGE_SIZE; | ||
2712 | mmun_called = true; | ||
2713 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2714 | |||
2701 | /* | 2715 | /* |
2702 | * Re-check the pte - we dropped the lock | 2716 | * Re-check the pte - we dropped the lock |
2703 | */ | 2717 | */ |
@@ -2764,6 +2778,8 @@ gotten: | |||
2764 | page_cache_release(new_page); | 2778 | page_cache_release(new_page); |
2765 | unlock: | 2779 | unlock: |
2766 | pte_unmap_unlock(page_table, ptl); | 2780 | pte_unmap_unlock(page_table, ptl); |
2781 | if (mmun_called) | ||
2782 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2767 | if (old_page) { | 2783 | if (old_page) { |
2768 | /* | 2784 | /* |
2769 | * Don't let another task, with possibly unlocked vma, | 2785 | * Don't let another task, with possibly unlocked vma, |
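Note: do_wp_page() now brackets the COW copy with an mmu_notifier invalidation covering just the faulting page. The range is opened before the pte lock is re-taken and closed only after pte_unmap_unlock(), because notifier callbacks are allowed to sleep; mmun_called exists because several exit paths (page reuse, OOM) never reach the open. The pattern in isolation:

        unsigned long mmun_start = address & PAGE_MASK;
        unsigned long mmun_end   = mmun_start + PAGE_SIZE;

        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        /* ... retake the pte lock, install the new pte, drop the lock ... */
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);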
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma, | |||
2801 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); | 2817 | zap_page_range_single(vma, start_addr, end_addr - start_addr, details); |
2802 | } | 2818 | } |
2803 | 2819 | ||
2804 | static inline void unmap_mapping_range_tree(struct prio_tree_root *root, | 2820 | static inline void unmap_mapping_range_tree(struct rb_root *root, |
2805 | struct zap_details *details) | 2821 | struct zap_details *details) |
2806 | { | 2822 | { |
2807 | struct vm_area_struct *vma; | 2823 | struct vm_area_struct *vma; |
2808 | struct prio_tree_iter iter; | ||
2809 | pgoff_t vba, vea, zba, zea; | 2824 | pgoff_t vba, vea, zba, zea; |
2810 | 2825 | ||
2811 | vma_prio_tree_foreach(vma, &iter, root, | 2826 | vma_interval_tree_foreach(vma, root, |
2812 | details->first_index, details->last_index) { | 2827 | details->first_index, details->last_index) { |
2813 | 2828 | ||
2814 | vba = vma->vm_pgoff; | 2829 | vba = vma->vm_pgoff; |
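Note: this is one of the prio_tree → interval tree conversions (mm/prio_tree.c gives way to mm/interval_tree.c in the diffstat); the iterator keeps the same shape and yields every vma whose file range overlaps [first_index, last_index]. A minimal walker sketch, assuming the caller holds i_mmap_mutex as unmap_mapping_range() does below:

        struct vm_area_struct *vma;

        mutex_lock(&mapping->i_mmap_mutex);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, last_index) {
                /* vma maps file pages overlapping [first_index, last_index];
                 * clamp to the overlap (vba/vea vs. zba/zea above) before use */
        }
        mutex_unlock(&mapping->i_mmap_mutex);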
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head, | |||
2839 | * across *all* the pages in each nonlinear VMA, not just the pages | 2854 | * across *all* the pages in each nonlinear VMA, not just the pages |
2840 | * whose virtual address lies outside the file truncation point. | 2855 | * whose virtual address lies outside the file truncation point. |
2841 | */ | 2856 | */ |
2842 | list_for_each_entry(vma, head, shared.vm_set.list) { | 2857 | list_for_each_entry(vma, head, shared.nonlinear) { |
2843 | details->nonlinear_vma = vma; | 2858 | details->nonlinear_vma = vma; |
2844 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | 2859 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); |
2845 | } | 2860 | } |
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2883 | 2898 | ||
2884 | 2899 | ||
2885 | mutex_lock(&mapping->i_mmap_mutex); | 2900 | mutex_lock(&mapping->i_mmap_mutex); |
2886 | if (unlikely(!prio_tree_empty(&mapping->i_mmap))) | 2901 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2887 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2902 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2888 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | 2903 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) |
2889 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | 2904 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6a5b90d0cfd7..56b758ae57d2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
107 | { | 107 | { |
108 | unsigned long type; | 108 | unsigned long type; |
109 | struct zone *zone; | ||
109 | 110 | ||
110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page) | |||
116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
118 | __free_pages_bootmem(page, 0); | 119 | __free_pages_bootmem(page, 0); |
120 | |||
121 | zone = page_zone(page); | ||
122 | zone_span_writelock(zone); | ||
123 | zone->present_pages++; | ||
124 | zone_span_writeunlock(zone); | ||
125 | totalram_pages++; | ||
119 | } | 126 | } |
120 | 127 | ||
121 | } | 128 | } |
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
362 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 369 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
363 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 370 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
364 | 371 | ||
372 | release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); | ||
373 | |||
365 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 374 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
366 | for (i = 0; i < sections_to_remove; i++) { | 375 | for (i = 0; i < sections_to_remove; i++) { |
367 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | 376 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; |
368 | release_mem_region(pfn << PAGE_SHIFT, | ||
369 | PAGES_PER_SECTION << PAGE_SHIFT); | ||
370 | ret = __remove_section(zone, __pfn_to_section(pfn)); | 377 | ret = __remove_section(zone, __pfn_to_section(pfn)); |
371 | if (ret) | 378 | if (ret) |
372 | break; | 379 | break; |
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end) | |||
756 | return 0; | 763 | return 0; |
757 | } | 764 | } |
758 | 765 | ||
759 | static struct page * | ||
760 | hotremove_migrate_alloc(struct page *page, unsigned long private, int **x) | ||
761 | { | ||
762 | /* This should be improooooved!! */ | ||
763 | return alloc_page(GFP_HIGHUSER_MOVABLE); | ||
764 | } | ||
765 | |||
766 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | 766 | #define NR_OFFLINE_AT_ONCE_PAGES (256) |
767 | static int | 767 | static int |
768 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | 768 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) |
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
813 | putback_lru_pages(&source); | 813 | putback_lru_pages(&source); |
814 | goto out; | 814 | goto out; |
815 | } | 815 | } |
816 | /* this function returns # of failed pages */ | 816 | |
817 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, | 817 | /* |
818 | * alloc_migrate_target should be improooooved!! | ||
819 | * migrate_pages returns # of failed pages. | ||
820 | */ | ||
821 | ret = migrate_pages(&source, alloc_migrate_target, 0, | ||
818 | true, MIGRATE_SYNC); | 822 | true, MIGRATE_SYNC); |
819 | if (ret) | 823 | if (ret) |
820 | putback_lru_pages(&source); | 824 | putback_lru_pages(&source); |
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
870 | return offlined; | 874 | return offlined; |
871 | } | 875 | } |
872 | 876 | ||
873 | static int __ref offline_pages(unsigned long start_pfn, | 877 | static int __ref __offline_pages(unsigned long start_pfn, |
874 | unsigned long end_pfn, unsigned long timeout) | 878 | unsigned long end_pfn, unsigned long timeout) |
875 | { | 879 | { |
876 | unsigned long pfn, nr_pages, expire; | 880 | unsigned long pfn, nr_pages, expire; |
@@ -970,8 +974,13 @@ repeat: | |||
970 | 974 | ||
971 | init_per_zone_wmark_min(); | 975 | init_per_zone_wmark_min(); |
972 | 976 | ||
973 | if (!populated_zone(zone)) | 977 | if (!populated_zone(zone)) { |
974 | zone_pcp_reset(zone); | 978 | zone_pcp_reset(zone); |
979 | mutex_lock(&zonelists_mutex); | ||
980 | build_all_zonelists(NULL, NULL); | ||
981 | mutex_unlock(&zonelists_mutex); | ||
982 | } else | ||
983 | zone_pcp_update(zone); | ||
975 | 984 | ||
976 | if (!node_present_pages(node)) { | 985 | if (!node_present_pages(node)) { |
977 | node_clear_state(node, N_HIGH_MEMORY); | 986 | node_clear_state(node, N_HIGH_MEMORY); |
@@ -998,15 +1007,55 @@ out: | |||
998 | return ret; | 1007 | return ret; |
999 | } | 1008 | } |
1000 | 1009 | ||
1010 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1011 | { | ||
1012 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | ||
1013 | } | ||
1014 | |||
1001 | int remove_memory(u64 start, u64 size) | 1015 | int remove_memory(u64 start, u64 size) |
1002 | { | 1016 | { |
1017 | struct memory_block *mem = NULL; | ||
1018 | struct mem_section *section; | ||
1003 | unsigned long start_pfn, end_pfn; | 1019 | unsigned long start_pfn, end_pfn; |
1020 | unsigned long pfn, section_nr; | ||
1021 | int ret; | ||
1004 | 1022 | ||
1005 | start_pfn = PFN_DOWN(start); | 1023 | start_pfn = PFN_DOWN(start); |
1006 | end_pfn = start_pfn + PFN_DOWN(size); | 1024 | end_pfn = start_pfn + PFN_DOWN(size); |
1007 | return offline_pages(start_pfn, end_pfn, 120 * HZ); | 1025 | |
1026 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | ||
1027 | section_nr = pfn_to_section_nr(pfn); | ||
1028 | if (!present_section_nr(section_nr)) | ||
1029 | continue; | ||
1030 | |||
1031 | section = __nr_to_section(section_nr); | ||
1032 | /* same memblock? */ | ||
1033 | if (mem) | ||
1034 | if ((section_nr >= mem->start_section_nr) && | ||
1035 | (section_nr <= mem->end_section_nr)) | ||
1036 | continue; | ||
1037 | |||
1038 | mem = find_memory_block_hinted(section, mem); | ||
1039 | if (!mem) | ||
1040 | continue; | ||
1041 | |||
1042 | ret = offline_memory_block(mem); | ||
1043 | if (ret) { | ||
1044 | kobject_put(&mem->dev.kobj); | ||
1045 | return ret; | ||
1046 | } | ||
1047 | } | ||
1048 | |||
1049 | if (mem) | ||
1050 | kobject_put(&mem->dev.kobj); | ||
1051 | |||
1052 | return 0; | ||
1008 | } | 1053 | } |
1009 | #else | 1054 | #else |
1055 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1056 | { | ||
1057 | return -EINVAL; | ||
1058 | } | ||
1010 | int remove_memory(u64 start, u64 size) | 1059 | int remove_memory(u64 start, u64 size) |
1011 | { | 1060 | { |
1012 | return -EINVAL; | 1061 | return -EINVAL; |
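Note: remove_memory() now goes through the memory-block device layer instead of calling the offline path directly: it walks the present sections of [start_pfn, end_pfn), resolves each to its struct memory_block (using the previous block as a lookup hint and skipping sections that block already covers), and offlines block by block via offline_memory_block(); offline_pages() becomes a thin wrapper around __offline_pages() with the same 120*HZ timeout as before. Condensed, the per-section skip logic is:

        section_nr = pfn_to_section_nr(pfn);
        if (!present_section_nr(section_nr))
                continue;       /* hole, nothing to offline */
        if (mem && section_nr >= mem->start_section_nr &&
                   section_nr <= mem->end_section_nr)
                continue;       /* already covered by the current block */
        mem = find_memory_block_hinted(__nr_to_section(section_nr), mem);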
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ada3be6e252..d04a8a54c294 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
607 | return first; | 607 | return first; |
608 | } | 608 | } |
609 | 609 | ||
610 | /* | ||
611 | * Apply policy to a single VMA | ||
612 | * This must be called with the mmap_sem held for writing. | ||
613 | */ | ||
614 | static int vma_replace_policy(struct vm_area_struct *vma, | ||
615 | struct mempolicy *pol) | ||
616 | { | ||
617 | int err; | ||
618 | struct mempolicy *old; | ||
619 | struct mempolicy *new; | ||
620 | |||
621 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
622 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
623 | vma->vm_ops, vma->vm_file, | ||
624 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
625 | |||
626 | new = mpol_dup(pol); | ||
627 | if (IS_ERR(new)) | ||
628 | return PTR_ERR(new); | ||
629 | |||
630 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
631 | err = vma->vm_ops->set_policy(vma, new); | ||
632 | if (err) | ||
633 | goto err_out; | ||
634 | } | ||
635 | |||
636 | old = vma->vm_policy; | ||
637 | vma->vm_policy = new; /* protected by mmap_sem */ | ||
638 | mpol_put(old); | ||
639 | |||
640 | return 0; | ||
641 | err_out: | ||
642 | mpol_put(new); | ||
643 | return err; | ||
644 | } | ||
645 | |||
610 | /* Step 2: apply policy to a range and do splits. */ | 646 | /* Step 2: apply policy to a range and do splits. */ |
611 | static int mbind_range(struct mm_struct *mm, unsigned long start, | 647 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
612 | unsigned long end, struct mempolicy *new_pol) | 648 | unsigned long end, struct mempolicy *new_pol) |
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
655 | if (err) | 691 | if (err) |
656 | goto out; | 692 | goto out; |
657 | } | 693 | } |
658 | 694 | err = vma_replace_policy(vma, new_pol); | |
659 | /* | 695 | if (err) |
660 | * Apply policy to a single VMA. The reference counting of | 696 | goto out; |
661 | * policy for vma_policy linkages has already been handled by | ||
662 | * vma_merge and split_vma as necessary. If this is a shared | ||
663 | * policy then ->set_policy will increment the reference count | ||
664 | * for an sp node. | ||
665 | */ | ||
666 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | ||
667 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | ||
668 | vma->vm_ops, vma->vm_file, | ||
669 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | ||
670 | if (vma->vm_ops && vma->vm_ops->set_policy) { | ||
671 | err = vma->vm_ops->set_policy(vma, new_pol); | ||
672 | if (err) | ||
673 | goto out; | ||
674 | } | ||
675 | } | 697 | } |
676 | 698 | ||
677 | out: | 699 | out: |
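Note: the per-vma policy application moves into the new vma_replace_policy() helper above, which duplicates the policy, gives a set_policy vm_op the chance to reject it, and only then swaps vma->vm_policy. Both it and mbind_range() rely on mmap_sem being held for writing; roughly, the calling context in do_mbind() looks like this (a sketch, not the literal code):

        down_write(&mm->mmap_sem);
        err = mbind_range(mm, start, end, new_pol);     /* split/merge vmas, then
                                                         * vma_replace_policy() each */
        up_write(&mm->mmap_sem);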
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
924 | nodemask_t nmask; | 946 | nodemask_t nmask; |
925 | LIST_HEAD(pagelist); | 947 | LIST_HEAD(pagelist); |
926 | int err = 0; | 948 | int err = 0; |
927 | struct vm_area_struct *vma; | ||
928 | 949 | ||
929 | nodes_clear(nmask); | 950 | nodes_clear(nmask); |
930 | node_set(source, nmask); | 951 | node_set(source, nmask); |
931 | 952 | ||
932 | vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | 953 | /* |
954 | * This does not "check" the range but isolates all pages that | ||
955 | * need migration. Between passing in the full user address | ||
956 | * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. | ||
957 | */ | ||
958 | VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); | ||
959 | check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, | ||
933 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 960 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
934 | if (IS_ERR(vma)) | ||
935 | return PTR_ERR(vma); | ||
936 | 961 | ||
937 | if (!list_empty(&pagelist)) { | 962 | if (!list_empty(&pagelist)) { |
938 | err = migrate_pages(&pagelist, new_node_page, dest, | 963 | err = migrate_pages(&pagelist, new_node_page, dest, |
@@ -1511,9 +1536,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1511 | * | 1536 | * |
1512 | * Returns effective policy for a VMA at specified address. | 1537 | * Returns effective policy for a VMA at specified address. |
1513 | * Falls back to @task or system default policy, as necessary. | 1538 | * Falls back to @task or system default policy, as necessary. |
1514 | * Current or other task's task mempolicy and non-shared vma policies | 1539 | * Current or other task's task mempolicy and non-shared vma policies must be |
1515 | * are protected by the task's mmap_sem, which must be held for read by | 1540 | * protected by task_lock(task) by the caller. |
1516 | * the caller. | ||
1517 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference | 1541 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference |
1518 | * count--added by the get_policy() vm_op, as appropriate--to protect against | 1542 | * count--added by the get_policy() vm_op, as appropriate--to protect against |
1519 | * freeing by another task. It is the caller's responsibility to free the | 1543 | * freeing by another task. It is the caller's responsibility to free the |
@@ -1530,8 +1554,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1530 | addr); | 1554 | addr); |
1531 | if (vpol) | 1555 | if (vpol) |
1532 | pol = vpol; | 1556 | pol = vpol; |
1533 | } else if (vma->vm_policy) | 1557 | } else if (vma->vm_policy) { |
1534 | pol = vma->vm_policy; | 1558 | pol = vma->vm_policy; |
1559 | |||
1560 | /* | ||
1561 | * shmem_alloc_page() passes MPOL_F_SHARED policy with | ||
1562 | * a pseudo vma whose vma->vm_ops=NULL. Take a reference | ||
1563 | * count on these policies which will be dropped by | ||
1564 | * mpol_cond_put() later | ||
1565 | */ | ||
1566 | if (mpol_needs_cond_ref(pol)) | ||
1567 | mpol_get(pol); | ||
1568 | } | ||
1535 | } | 1569 | } |
1536 | if (!pol) | 1570 | if (!pol) |
1537 | pol = &default_policy; | 1571 | pol = &default_policy; |
@@ -2061,7 +2095,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) | |||
2061 | */ | 2095 | */ |
2062 | 2096 | ||
2063 | /* lookup first element intersecting start-end */ | 2097 | /* lookup first element intersecting start-end */ |
2064 | /* Caller holds sp->lock */ | 2098 | /* Caller holds sp->mutex */ |
2065 | static struct sp_node * | 2099 | static struct sp_node * |
2066 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) | 2100 | sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) |
2067 | { | 2101 | { |
@@ -2125,36 +2159,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | |||
2125 | 2159 | ||
2126 | if (!sp->root.rb_node) | 2160 | if (!sp->root.rb_node) |
2127 | return NULL; | 2161 | return NULL; |
2128 | spin_lock(&sp->lock); | 2162 | mutex_lock(&sp->mutex); |
2129 | sn = sp_lookup(sp, idx, idx+1); | 2163 | sn = sp_lookup(sp, idx, idx+1); |
2130 | if (sn) { | 2164 | if (sn) { |
2131 | mpol_get(sn->policy); | 2165 | mpol_get(sn->policy); |
2132 | pol = sn->policy; | 2166 | pol = sn->policy; |
2133 | } | 2167 | } |
2134 | spin_unlock(&sp->lock); | 2168 | mutex_unlock(&sp->mutex); |
2135 | return pol; | 2169 | return pol; |
2136 | } | 2170 | } |
2137 | 2171 | ||
2172 | static void sp_free(struct sp_node *n) | ||
2173 | { | ||
2174 | mpol_put(n->policy); | ||
2175 | kmem_cache_free(sn_cache, n); | ||
2176 | } | ||
2177 | |||
2138 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2178 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2139 | { | 2179 | { |
2140 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); | 2180 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
2141 | rb_erase(&n->nd, &sp->root); | 2181 | rb_erase(&n->nd, &sp->root); |
2142 | mpol_put(n->policy); | 2182 | sp_free(n); |
2143 | kmem_cache_free(sn_cache, n); | ||
2144 | } | 2183 | } |
2145 | 2184 | ||
2146 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | 2185 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
2147 | struct mempolicy *pol) | 2186 | struct mempolicy *pol) |
2148 | { | 2187 | { |
2149 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 2188 | struct sp_node *n; |
2189 | struct mempolicy *newpol; | ||
2150 | 2190 | ||
2191 | n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | ||
2151 | if (!n) | 2192 | if (!n) |
2152 | return NULL; | 2193 | return NULL; |
2194 | |||
2195 | newpol = mpol_dup(pol); | ||
2196 | if (IS_ERR(newpol)) { | ||
2197 | kmem_cache_free(sn_cache, n); | ||
2198 | return NULL; | ||
2199 | } | ||
2200 | newpol->flags |= MPOL_F_SHARED; | ||
2201 | |||
2153 | n->start = start; | 2202 | n->start = start; |
2154 | n->end = end; | 2203 | n->end = end; |
2155 | mpol_get(pol); | 2204 | n->policy = newpol; |
2156 | pol->flags |= MPOL_F_SHARED; /* for unref */ | 2205 | |
2157 | n->policy = pol; | ||
2158 | return n; | 2206 | return n; |
2159 | } | 2207 | } |
2160 | 2208 | ||
@@ -2162,10 +2210,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, | |||
2162 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, | 2210 | static int shared_policy_replace(struct shared_policy *sp, unsigned long start, |
2163 | unsigned long end, struct sp_node *new) | 2211 | unsigned long end, struct sp_node *new) |
2164 | { | 2212 | { |
2165 | struct sp_node *n, *new2 = NULL; | 2213 | struct sp_node *n; |
2214 | int ret = 0; | ||
2166 | 2215 | ||
2167 | restart: | 2216 | mutex_lock(&sp->mutex); |
2168 | spin_lock(&sp->lock); | ||
2169 | n = sp_lookup(sp, start, end); | 2217 | n = sp_lookup(sp, start, end); |
2170 | /* Take care of old policies in the same range. */ | 2218 | /* Take care of old policies in the same range. */ |
2171 | while (n && n->start < end) { | 2219 | while (n && n->start < end) { |
@@ -2178,16 +2226,14 @@ restart: | |||
2178 | } else { | 2226 | } else { |
2179 | /* Old policy spanning whole new range. */ | 2227 | /* Old policy spanning whole new range. */ |
2180 | if (n->end > end) { | 2228 | if (n->end > end) { |
2229 | struct sp_node *new2; | ||
2230 | new2 = sp_alloc(end, n->end, n->policy); | ||
2181 | if (!new2) { | 2231 | if (!new2) { |
2182 | spin_unlock(&sp->lock); | 2232 | ret = -ENOMEM; |
2183 | new2 = sp_alloc(end, n->end, n->policy); | 2233 | goto out; |
2184 | if (!new2) | ||
2185 | return -ENOMEM; | ||
2186 | goto restart; | ||
2187 | } | 2234 | } |
2188 | n->end = start; | 2235 | n->end = start; |
2189 | sp_insert(sp, new2); | 2236 | sp_insert(sp, new2); |
2190 | new2 = NULL; | ||
2191 | break; | 2237 | break; |
2192 | } else | 2238 | } else |
2193 | n->end = start; | 2239 | n->end = start; |
@@ -2198,12 +2244,9 @@ restart: | |||
2198 | } | 2244 | } |
2199 | if (new) | 2245 | if (new) |
2200 | sp_insert(sp, new); | 2246 | sp_insert(sp, new); |
2201 | spin_unlock(&sp->lock); | 2247 | out: |
2202 | if (new2) { | 2248 | mutex_unlock(&sp->mutex); |
2203 | mpol_put(new2->policy); | 2249 | return ret; |
2204 | kmem_cache_free(sn_cache, new2); | ||
2205 | } | ||
2206 | return 0; | ||
2207 | } | 2250 | } |
2208 | 2251 | ||
2209 | /** | 2252 | /** |
@@ -2221,7 +2264,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | |||
2221 | int ret; | 2264 | int ret; |
2222 | 2265 | ||
2223 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 2266 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
2224 | spin_lock_init(&sp->lock); | 2267 | mutex_init(&sp->mutex); |
2225 | 2268 | ||
2226 | if (mpol) { | 2269 | if (mpol) { |
2227 | struct vm_area_struct pvma; | 2270 | struct vm_area_struct pvma; |
@@ -2275,7 +2318,7 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
2275 | } | 2318 | } |
2276 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); | 2319 | err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); |
2277 | if (err && new) | 2320 | if (err && new) |
2278 | kmem_cache_free(sn_cache, new); | 2321 | sp_free(new); |
2279 | return err; | 2322 | return err; |
2280 | } | 2323 | } |
2281 | 2324 | ||
@@ -2287,16 +2330,14 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2287 | 2330 | ||
2288 | if (!p->root.rb_node) | 2331 | if (!p->root.rb_node) |
2289 | return; | 2332 | return; |
2290 | spin_lock(&p->lock); | 2333 | mutex_lock(&p->mutex); |
2291 | next = rb_first(&p->root); | 2334 | next = rb_first(&p->root); |
2292 | while (next) { | 2335 | while (next) { |
2293 | n = rb_entry(next, struct sp_node, nd); | 2336 | n = rb_entry(next, struct sp_node, nd); |
2294 | next = rb_next(&n->nd); | 2337 | next = rb_next(&n->nd); |
2295 | rb_erase(&n->nd, &p->root); | 2338 | sp_delete(p, n); |
2296 | mpol_put(n->policy); | ||
2297 | kmem_cache_free(sn_cache, n); | ||
2298 | } | 2339 | } |
2299 | spin_unlock(&p->lock); | 2340 | mutex_unlock(&p->mutex); |
2300 | } | 2341 | } |
2301 | 2342 | ||
2302 | /* assumes fs == KERNEL_DS */ | 2343 | /* assumes fs == KERNEL_DS */ |
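Note on the sp->lock → sp->mutex conversion above: sp_alloc() allocates with GFP_KERNEL and now also calls mpol_dup(), both of which may sleep, so the old spinlock forced shared_policy_replace() to drop the lock, allocate new2, and restart the lookup. Under a mutex the allocation can be done in place and the retry loop goes away; every shared-policy operation now follows the simple shape:

        mutex_lock(&sp->mutex);
        /* lookup / trim / sp_alloc() (may sleep) / sp_insert() ... */
        mutex_unlock(&sp->mutex);

Each sp_node also owns a private copy of the mempolicy (made in sp_alloc(), released in sp_free()) rather than sharing the caller's.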
diff --git a/mm/mlock.c b/mm/mlock.c index ef726e8aa8e9..f0b9ce572fc7 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock); | |||
51 | /* | 51 | /* |
52 | * LRU accounting for clear_page_mlock() | 52 | * LRU accounting for clear_page_mlock() |
53 | */ | 53 | */ |
54 | void __clear_page_mlock(struct page *page) | 54 | void clear_page_mlock(struct page *page) |
55 | { | 55 | { |
56 | VM_BUG_ON(!PageLocked(page)); | 56 | if (!TestClearPageMlocked(page)) |
57 | |||
58 | if (!page->mapping) { /* truncated ? */ | ||
59 | return; | 57 | return; |
60 | } | ||
61 | 58 | ||
62 | dec_zone_page_state(page, NR_MLOCK); | 59 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
60 | -hpage_nr_pages(page)); | ||
63 | count_vm_event(UNEVICTABLE_PGCLEARED); | 61 | count_vm_event(UNEVICTABLE_PGCLEARED); |
64 | if (!isolate_lru_page(page)) { | 62 | if (!isolate_lru_page(page)) { |
65 | putback_lru_page(page); | 63 | putback_lru_page(page); |
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page) | |||
81 | BUG_ON(!PageLocked(page)); | 79 | BUG_ON(!PageLocked(page)); |
82 | 80 | ||
83 | if (!TestSetPageMlocked(page)) { | 81 | if (!TestSetPageMlocked(page)) { |
84 | inc_zone_page_state(page, NR_MLOCK); | 82 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
83 | hpage_nr_pages(page)); | ||
85 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 84 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
86 | if (!isolate_lru_page(page)) | 85 | if (!isolate_lru_page(page)) |
87 | putback_lru_page(page); | 86 | putback_lru_page(page); |
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page) | |||
108 | BUG_ON(!PageLocked(page)); | 107 | BUG_ON(!PageLocked(page)); |
109 | 108 | ||
110 | if (TestClearPageMlocked(page)) { | 109 | if (TestClearPageMlocked(page)) { |
111 | dec_zone_page_state(page, NR_MLOCK); | 110 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
111 | -hpage_nr_pages(page)); | ||
112 | if (!isolate_lru_page(page)) { | 112 | if (!isolate_lru_page(page)) { |
113 | int ret = SWAP_AGAIN; | 113 | int ret = SWAP_AGAIN; |
114 | 114 | ||
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | 227 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) |
228 | goto no_mlock; | 228 | goto no_mlock; |
229 | 229 | ||
230 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 230 | if (!((vma->vm_flags & VM_DONTEXPAND) || |
231 | is_vm_hugetlb_page(vma) || | 231 | is_vm_hugetlb_page(vma) || |
232 | vma == get_gate_vma(current->mm))) { | 232 | vma == get_gate_vma(current->mm))) { |
233 | 233 | ||
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); | 290 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); |
291 | if (page && !IS_ERR(page)) { | 291 | if (page && !IS_ERR(page)) { |
292 | lock_page(page); | 292 | lock_page(page); |
293 | /* | 293 | munlock_vma_page(page); |
294 | * Like in __mlock_vma_pages_range(), | ||
295 | * because we lock page here and migration is | ||
296 | * blocked by the elevated reference, we need | ||
297 | * only check for file-cache page truncation. | ||
298 | */ | ||
299 | if (page->mapping) | ||
300 | munlock_vma_page(page); | ||
301 | unlock_page(page); | 294 | unlock_page(page); |
302 | put_page(page); | 295 | put_page(page); |
303 | } | 296 | } |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm, | |||
51 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 51 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
52 | unsigned long start, unsigned long end); | 52 | unsigned long start, unsigned long end); |
53 | 53 | ||
54 | /* | ||
55 | * WARNING: the debugging will use recursive algorithms so never enable this | ||
56 | * unless you know what you are doing. | ||
57 | */ | ||
58 | #undef DEBUG_MM_RB | ||
59 | |||
60 | /* description of effects of mapping type and prot in current implementation. | 54 | /* description of effects of mapping type and prot in current implementation. |
61 | * this is due to the limited x86 page protection hardware. The expected | 55 | * this is due to the limited x86 page protection hardware. The expected |
62 | * behavior is in parens: | 56 | * behavior is in parens: |
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
199 | 193 | ||
200 | flush_dcache_mmap_lock(mapping); | 194 | flush_dcache_mmap_lock(mapping); |
201 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 195 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
202 | list_del_init(&vma->shared.vm_set.list); | 196 | list_del_init(&vma->shared.nonlinear); |
203 | else | 197 | else |
204 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 198 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
205 | flush_dcache_mmap_unlock(mapping); | 199 | flush_dcache_mmap_unlock(mapping); |
206 | } | 200 | } |
207 | 201 | ||
208 | /* | 202 | /* |
209 | * Unlink a file-based vm structure from its prio_tree, to hide | 203 | * Unlink a file-based vm structure from its interval tree, to hide |
210 | * vma from rmap and vmtruncate before freeing its page tables. | 204 | * vma from rmap and vmtruncate before freeing its page tables. |
211 | */ | 205 | */ |
212 | void unlink_file_vma(struct vm_area_struct *vma) | 206 | void unlink_file_vma(struct vm_area_struct *vma) |
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
231 | might_sleep(); | 225 | might_sleep(); |
232 | if (vma->vm_ops && vma->vm_ops->close) | 226 | if (vma->vm_ops && vma->vm_ops->close) |
233 | vma->vm_ops->close(vma); | 227 | vma->vm_ops->close(vma); |
234 | if (vma->vm_file) { | 228 | if (vma->vm_file) |
235 | fput(vma->vm_file); | 229 | fput(vma->vm_file); |
236 | if (vma->vm_flags & VM_EXECUTABLE) | ||
237 | removed_exe_file_vma(vma->vm_mm); | ||
238 | } | ||
239 | mpol_put(vma_policy(vma)); | 230 | mpol_put(vma_policy(vma)); |
240 | kmem_cache_free(vm_area_cachep, vma); | 231 | kmem_cache_free(vm_area_cachep, vma); |
241 | return next; | 232 | return next; |
@@ -306,7 +297,7 @@ out: | |||
306 | return retval; | 297 | return retval; |
307 | } | 298 | } |
308 | 299 | ||
309 | #ifdef DEBUG_MM_RB | 300 | #ifdef CONFIG_DEBUG_VM_RB |
310 | static int browse_rb(struct rb_root *root) | 301 | static int browse_rb(struct rb_root *root) |
311 | { | 302 | { |
312 | int i = 0, j; | 303 | int i = 0, j; |
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm) | |||
340 | { | 331 | { |
341 | int bug = 0; | 332 | int bug = 0; |
342 | int i = 0; | 333 | int i = 0; |
343 | struct vm_area_struct *tmp = mm->mmap; | 334 | struct vm_area_struct *vma = mm->mmap; |
344 | while (tmp) { | 335 | while (vma) { |
345 | tmp = tmp->vm_next; | 336 | struct anon_vma_chain *avc; |
337 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
338 | anon_vma_interval_tree_verify(avc); | ||
339 | vma = vma->vm_next; | ||
346 | i++; | 340 | i++; |
347 | } | 341 | } |
348 | if (i != mm->map_count) | 342 | if (i != mm->map_count) |
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm) | |||
356 | #define validate_mm(mm) do { } while (0) | 350 | #define validate_mm(mm) do { } while (0) |
357 | #endif | 351 | #endif |
358 | 352 | ||
359 | static struct vm_area_struct * | 353 | /* |
360 | find_vma_prepare(struct mm_struct *mm, unsigned long addr, | 354 | * vma has some anon_vma assigned, and is already inserted on that |
361 | struct vm_area_struct **pprev, struct rb_node ***rb_link, | 355 | * anon_vma's interval trees. |
362 | struct rb_node ** rb_parent) | 356 | * |
357 | * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the | ||
358 | * vma must be removed from the anon_vma's interval trees using | ||
359 | * anon_vma_interval_tree_pre_update_vma(). | ||
360 | * | ||
361 | * After the update, the vma will be reinserted using | ||
362 | * anon_vma_interval_tree_post_update_vma(). | ||
363 | * | ||
364 | * The entire update must be protected by exclusive mmap_sem and by | ||
365 | * the root anon_vma's mutex. | ||
366 | */ | ||
367 | static inline void | ||
368 | anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) | ||
363 | { | 369 | { |
364 | struct vm_area_struct * vma; | 370 | struct anon_vma_chain *avc; |
365 | struct rb_node ** __rb_link, * __rb_parent, * rb_prev; | 371 | |
372 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
373 | anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); | ||
374 | } | ||
375 | |||
376 | static inline void | ||
377 | anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) | ||
378 | { | ||
379 | struct anon_vma_chain *avc; | ||
380 | |||
381 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | ||
382 | anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); | ||
383 | } | ||
384 | |||
385 | static int find_vma_links(struct mm_struct *mm, unsigned long addr, | ||
386 | unsigned long end, struct vm_area_struct **pprev, | ||
387 | struct rb_node ***rb_link, struct rb_node **rb_parent) | ||
388 | { | ||
389 | struct rb_node **__rb_link, *__rb_parent, *rb_prev; | ||
366 | 390 | ||
367 | __rb_link = &mm->mm_rb.rb_node; | 391 | __rb_link = &mm->mm_rb.rb_node; |
368 | rb_prev = __rb_parent = NULL; | 392 | rb_prev = __rb_parent = NULL; |
369 | vma = NULL; | ||
370 | 393 | ||
371 | while (*__rb_link) { | 394 | while (*__rb_link) { |
372 | struct vm_area_struct *vma_tmp; | 395 | struct vm_area_struct *vma_tmp; |
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
375 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); | 398 | vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); |
376 | 399 | ||
377 | if (vma_tmp->vm_end > addr) { | 400 | if (vma_tmp->vm_end > addr) { |
378 | vma = vma_tmp; | 401 | /* Fail if an existing vma overlaps the area */ |
379 | if (vma_tmp->vm_start <= addr) | 402 | if (vma_tmp->vm_start < end) |
380 | break; | 403 | return -ENOMEM; |
381 | __rb_link = &__rb_parent->rb_left; | 404 | __rb_link = &__rb_parent->rb_left; |
382 | } else { | 405 | } else { |
383 | rb_prev = __rb_parent; | 406 | rb_prev = __rb_parent; |
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
390 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); | 413 | *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); |
391 | *rb_link = __rb_link; | 414 | *rb_link = __rb_link; |
392 | *rb_parent = __rb_parent; | 415 | *rb_parent = __rb_parent; |
393 | return vma; | 416 | return 0; |
394 | } | 417 | } |
395 | 418 | ||
396 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, | 419 | void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, |
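Note: find_vma_prepare() handed back the nearest vma and every caller re-checked for overlap by hand; find_vma_links() folds that check in and returns -ENOMEM as soon as an existing vma intersects [addr, end), while still filling in prev/rb_link/rb_parent for the insertion. Call sites collapse to the pattern used by mmap_region() and do_brk() further down:

munmap_back:
        if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
                if (do_munmap(mm, addr, len))
                        return -ENOMEM;
                goto munmap_back;       /* range is clear now, redo the lookup */
        }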
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
417 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 440 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) |
418 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | 441 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); |
419 | else | 442 | else |
420 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 443 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
421 | flush_dcache_mmap_unlock(mapping); | 444 | flush_dcache_mmap_unlock(mapping); |
422 | } | 445 | } |
423 | } | 446 | } |
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
455 | 478 | ||
456 | /* | 479 | /* |
457 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the | 480 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
458 | * mm's list and rbtree. It has already been inserted into the prio_tree. | 481 | * mm's list and rbtree. It has already been inserted into the interval tree. |
459 | */ | 482 | */ |
460 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 483 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
461 | { | 484 | { |
462 | struct vm_area_struct *__vma, *prev; | 485 | struct vm_area_struct *prev; |
463 | struct rb_node **rb_link, *rb_parent; | 486 | struct rb_node **rb_link, *rb_parent; |
464 | 487 | ||
465 | __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); | 488 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
466 | BUG_ON(__vma && __vma->vm_start < vma->vm_end); | 489 | &prev, &rb_link, &rb_parent)) |
490 | BUG(); | ||
467 | __vma_link(mm, vma, prev, rb_link, rb_parent); | 491 | __vma_link(mm, vma, prev, rb_link, rb_parent); |
468 | mm->map_count++; | 492 | mm->map_count++; |
469 | } | 493 | } |
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
496 | struct vm_area_struct *next = vma->vm_next; | 520 | struct vm_area_struct *next = vma->vm_next; |
497 | struct vm_area_struct *importer = NULL; | 521 | struct vm_area_struct *importer = NULL; |
498 | struct address_space *mapping = NULL; | 522 | struct address_space *mapping = NULL; |
499 | struct prio_tree_root *root = NULL; | 523 | struct rb_root *root = NULL; |
500 | struct anon_vma *anon_vma = NULL; | 524 | struct anon_vma *anon_vma = NULL; |
501 | struct file *file = vma->vm_file; | 525 | struct file *file = vma->vm_file; |
502 | long adjust_next = 0; | 526 | long adjust_next = 0; |
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
559 | mutex_lock(&mapping->i_mmap_mutex); | 583 | mutex_lock(&mapping->i_mmap_mutex); |
560 | if (insert) { | 584 | if (insert) { |
561 | /* | 585 | /* |
562 | * Put into prio_tree now, so instantiated pages | 586 | * Put into interval tree now, so instantiated pages |
563 | * are visible to arm/parisc __flush_dcache_page | 587 | * are visible to arm/parisc __flush_dcache_page |
564 | * throughout; but we cannot insert into address | 588 | * throughout; but we cannot insert into address |
565 | * space until vma start or end is updated. | 589 | * space until vma start or end is updated. |
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end); | |||
570 | 594 | ||
571 | vma_adjust_trans_huge(vma, start, end, adjust_next); | 595 | vma_adjust_trans_huge(vma, start, end, adjust_next); |
572 | 596 | ||
573 | /* | 597 | anon_vma = vma->anon_vma; |
574 | * When changing only vma->vm_end, we don't really need anon_vma | 598 | if (!anon_vma && adjust_next) |
575 | * lock. This is a fairly rare case by itself, but the anon_vma | 599 | anon_vma = next->anon_vma; |
576 | * lock may be shared between many sibling processes. Skipping | 600 | if (anon_vma) { |
577 | * the lock for brk adjustments makes a difference sometimes. | 601 | VM_BUG_ON(adjust_next && next->anon_vma && |
578 | */ | 602 | anon_vma != next->anon_vma); |
579 | if (vma->anon_vma && (importer || start != vma->vm_start)) { | ||
580 | anon_vma = vma->anon_vma; | ||
581 | anon_vma_lock(anon_vma); | 603 | anon_vma_lock(anon_vma); |
604 | anon_vma_interval_tree_pre_update_vma(vma); | ||
605 | if (adjust_next) | ||
606 | anon_vma_interval_tree_pre_update_vma(next); | ||
582 | } | 607 | } |
583 | 608 | ||
584 | if (root) { | 609 | if (root) { |
585 | flush_dcache_mmap_lock(mapping); | 610 | flush_dcache_mmap_lock(mapping); |
586 | vma_prio_tree_remove(vma, root); | 611 | vma_interval_tree_remove(vma, root); |
587 | if (adjust_next) | 612 | if (adjust_next) |
588 | vma_prio_tree_remove(next, root); | 613 | vma_interval_tree_remove(next, root); |
589 | } | 614 | } |
590 | 615 | ||
591 | vma->vm_start = start; | 616 | vma->vm_start = start; |
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
598 | 623 | ||
599 | if (root) { | 624 | if (root) { |
600 | if (adjust_next) | 625 | if (adjust_next) |
601 | vma_prio_tree_insert(next, root); | 626 | vma_interval_tree_insert(next, root); |
602 | vma_prio_tree_insert(vma, root); | 627 | vma_interval_tree_insert(vma, root); |
603 | flush_dcache_mmap_unlock(mapping); | 628 | flush_dcache_mmap_unlock(mapping); |
604 | } | 629 | } |
605 | 630 | ||
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end); | |||
620 | __insert_vm_struct(mm, insert); | 645 | __insert_vm_struct(mm, insert); |
621 | } | 646 | } |
622 | 647 | ||
623 | if (anon_vma) | 648 | if (anon_vma) { |
649 | anon_vma_interval_tree_post_update_vma(vma); | ||
650 | if (adjust_next) | ||
651 | anon_vma_interval_tree_post_update_vma(next); | ||
624 | anon_vma_unlock(anon_vma); | 652 | anon_vma_unlock(anon_vma); |
653 | } | ||
625 | if (mapping) | 654 | if (mapping) |
626 | mutex_unlock(&mapping->i_mmap_mutex); | 655 | mutex_unlock(&mapping->i_mmap_mutex); |
627 | 656 | ||
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
636 | if (file) { | 665 | if (file) { |
637 | uprobe_munmap(next, next->vm_start, next->vm_end); | 666 | uprobe_munmap(next, next->vm_start, next->vm_end); |
638 | fput(file); | 667 | fput(file); |
639 | if (next->vm_flags & VM_EXECUTABLE) | ||
640 | removed_exe_file_vma(mm); | ||
641 | } | 668 | } |
642 | if (next->anon_vma) | 669 | if (next->anon_vma) |
643 | anon_vma_merge(vma, next); | 670 | anon_vma_merge(vma, next); |
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
669 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 696 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
670 | struct file *file, unsigned long vm_flags) | 697 | struct file *file, unsigned long vm_flags) |
671 | { | 698 | { |
672 | /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ | 699 | if (vma->vm_flags ^ vm_flags) |
673 | if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) | ||
674 | return 0; | 700 | return 0; |
675 | if (vma->vm_file != file) | 701 | if (vma->vm_file != file) |
676 | return 0; | 702 | return 0; |
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
951 | mm->exec_vm += pages; | 977 | mm->exec_vm += pages; |
952 | } else if (flags & stack_flags) | 978 | } else if (flags & stack_flags) |
953 | mm->stack_vm += pages; | 979 | mm->stack_vm += pages; |
954 | if (flags & (VM_RESERVED|VM_IO)) | ||
955 | mm->reserved_vm += pages; | ||
956 | } | 980 | } |
957 | #endif /* CONFIG_PROC_FS */ | 981 | #endif /* CONFIG_PROC_FS */ |
958 | 982 | ||
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) | |||
1190 | return 0; | 1214 | return 0; |
1191 | 1215 | ||
1192 | /* Specialty mapping? */ | 1216 | /* Specialty mapping? */ |
1193 | if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) | 1217 | if (vm_flags & VM_PFNMAP) |
1194 | return 0; | 1218 | return 0; |
1195 | 1219 | ||
1196 | /* Can the mapping track the dirty pages? */ | 1220 | /* Can the mapping track the dirty pages? */ |
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
1229 | /* Clear old maps */ | 1253 | /* Clear old maps */ |
1230 | error = -ENOMEM; | 1254 | error = -ENOMEM; |
1231 | munmap_back: | 1255 | munmap_back: |
1232 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 1256 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
1233 | if (vma && vma->vm_start < addr + len) { | ||
1234 | if (do_munmap(mm, addr, len)) | 1257 | if (do_munmap(mm, addr, len)) |
1235 | return -ENOMEM; | 1258 | return -ENOMEM; |
1236 | goto munmap_back; | 1259 | goto munmap_back; |
@@ -1301,13 +1324,10 @@ munmap_back: | |||
1301 | goto free_vma; | 1324 | goto free_vma; |
1302 | correct_wcount = 1; | 1325 | correct_wcount = 1; |
1303 | } | 1326 | } |
1304 | vma->vm_file = file; | 1327 | vma->vm_file = get_file(file); |
1305 | get_file(file); | ||
1306 | error = file->f_op->mmap(file, vma); | 1328 | error = file->f_op->mmap(file, vma); |
1307 | if (error) | 1329 | if (error) |
1308 | goto unmap_and_free_vma; | 1330 | goto unmap_and_free_vma; |
1309 | if (vm_flags & VM_EXECUTABLE) | ||
1310 | added_exe_file_vma(mm); | ||
1311 | 1331 | ||
1312 | /* Can addr have changed?? | 1332 | /* Can addr have changed?? |
1313 | * | 1333 | * |
@@ -1758,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
1758 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 1778 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
1759 | error = acct_stack_growth(vma, size, grow); | 1779 | error = acct_stack_growth(vma, size, grow); |
1760 | if (!error) { | 1780 | if (!error) { |
1781 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1761 | vma->vm_end = address; | 1782 | vma->vm_end = address; |
1783 | anon_vma_interval_tree_post_update_vma(vma); | ||
1762 | perf_event_mmap(vma); | 1784 | perf_event_mmap(vma); |
1763 | } | 1785 | } |
1764 | } | 1786 | } |
1765 | } | 1787 | } |
1766 | vma_unlock_anon_vma(vma); | 1788 | vma_unlock_anon_vma(vma); |
1767 | khugepaged_enter_vma_merge(vma); | 1789 | khugepaged_enter_vma_merge(vma); |
1790 | validate_mm(vma->vm_mm); | ||
1768 | return error; | 1791 | return error; |
1769 | } | 1792 | } |
1770 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ | 1793 | #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ |
@@ -1808,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma, | |||
1808 | if (grow <= vma->vm_pgoff) { | 1831 | if (grow <= vma->vm_pgoff) { |
1809 | error = acct_stack_growth(vma, size, grow); | 1832 | error = acct_stack_growth(vma, size, grow); |
1810 | if (!error) { | 1833 | if (!error) { |
1834 | anon_vma_interval_tree_pre_update_vma(vma); | ||
1811 | vma->vm_start = address; | 1835 | vma->vm_start = address; |
1812 | vma->vm_pgoff -= grow; | 1836 | vma->vm_pgoff -= grow; |
1837 | anon_vma_interval_tree_post_update_vma(vma); | ||
1813 | perf_event_mmap(vma); | 1838 | perf_event_mmap(vma); |
1814 | } | 1839 | } |
1815 | } | 1840 | } |
1816 | } | 1841 | } |
1817 | vma_unlock_anon_vma(vma); | 1842 | vma_unlock_anon_vma(vma); |
1818 | khugepaged_enter_vma_merge(vma); | 1843 | khugepaged_enter_vma_merge(vma); |
1844 | validate_mm(vma->vm_mm); | ||
1819 | return error; | 1845 | return error; |
1820 | } | 1846 | } |
1821 | 1847 | ||
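Note: both expand_upwards() and expand_downwards() now wrap the vm_start/vm_end/vm_pgoff update in the interval-tree helpers defined near the top of the file, so the vma is taken out of every anon_vma interval tree while its keys change and reinserted afterwards, all under the anon_vma lock already held here. The bracket in isolation (mirroring the expand_downwards() hunk above):

        vma_lock_anon_vma(vma);
        anon_vma_interval_tree_pre_update_vma(vma);     /* remove from the trees */
        vma->vm_start = address;                        /* tree keys change */
        vma->vm_pgoff -= grow;
        anon_vma_interval_tree_post_update_vma(vma);    /* reinsert with new keys */
        vma_unlock_anon_vma(vma);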
@@ -1989,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1989 | if (anon_vma_clone(new, vma)) | 2015 | if (anon_vma_clone(new, vma)) |
1990 | goto out_free_mpol; | 2016 | goto out_free_mpol; |
1991 | 2017 | ||
1992 | if (new->vm_file) { | 2018 | if (new->vm_file) |
1993 | get_file(new->vm_file); | 2019 | get_file(new->vm_file); |
1994 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1995 | added_exe_file_vma(mm); | ||
1996 | } | ||
1997 | 2020 | ||
1998 | if (new->vm_ops && new->vm_ops->open) | 2021 | if (new->vm_ops && new->vm_ops->open) |
1999 | new->vm_ops->open(new); | 2022 | new->vm_ops->open(new); |
@@ -2011,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
2011 | /* Clean everything up if vma_adjust failed. */ | 2034 | /* Clean everything up if vma_adjust failed. */ |
2012 | if (new->vm_ops && new->vm_ops->close) | 2035 | if (new->vm_ops && new->vm_ops->close) |
2013 | new->vm_ops->close(new); | 2036 | new->vm_ops->close(new); |
2014 | if (new->vm_file) { | 2037 | if (new->vm_file) |
2015 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2016 | removed_exe_file_vma(mm); | ||
2017 | fput(new->vm_file); | 2038 | fput(new->vm_file); |
2018 | } | ||
2019 | unlink_anon_vmas(new); | 2039 | unlink_anon_vmas(new); |
2020 | out_free_mpol: | 2040 | out_free_mpol: |
2021 | mpol_put(pol); | 2041 | mpol_put(pol); |
@@ -2200,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2200 | * Clear old maps. this also does some error checking for us | 2220 | * Clear old maps. this also does some error checking for us |
2201 | */ | 2221 | */ |
2202 | munmap_back: | 2222 | munmap_back: |
2203 | vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2223 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { |
2204 | if (vma && vma->vm_start < addr + len) { | ||
2205 | if (do_munmap(mm, addr, len)) | 2224 | if (do_munmap(mm, addr, len)) |
2206 | return -ENOMEM; | 2225 | return -ENOMEM; |
2207 | goto munmap_back; | 2226 | goto munmap_back; |
@@ -2315,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm) | |||
2315 | * and into the inode's i_mmap tree. If vm_file is non-NULL | 2334 | * and into the inode's i_mmap tree. If vm_file is non-NULL |
2316 | * then i_mmap_mutex is taken here. | 2335 | * then i_mmap_mutex is taken here. |
2317 | */ | 2336 | */ |
2318 | int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | 2337 | int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
2319 | { | 2338 | { |
2320 | struct vm_area_struct * __vma, * prev; | 2339 | struct vm_area_struct *prev; |
2321 | struct rb_node ** rb_link, * rb_parent; | 2340 | struct rb_node **rb_link, *rb_parent; |
2322 | 2341 | ||
2323 | /* | 2342 | /* |
2324 | * The vm_pgoff of a purely anonymous vma should be irrelevant | 2343 | * The vm_pgoff of a purely anonymous vma should be irrelevant |
@@ -2336,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2336 | BUG_ON(vma->anon_vma); | 2355 | BUG_ON(vma->anon_vma); |
2337 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; | 2356 | vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; |
2338 | } | 2357 | } |
2339 | __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); | 2358 | if (find_vma_links(mm, vma->vm_start, vma->vm_end, |
2340 | if (__vma && __vma->vm_start < vma->vm_end) | 2359 | &prev, &rb_link, &rb_parent)) |
2341 | return -ENOMEM; | 2360 | return -ENOMEM; |
2342 | if ((vma->vm_flags & VM_ACCOUNT) && | 2361 | if ((vma->vm_flags & VM_ACCOUNT) && |
2343 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2362 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
@@ -2352,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2352 | * prior to moving page table entries, to effect an mremap move. | 2371 | * prior to moving page table entries, to effect an mremap move. |
2353 | */ | 2372 | */ |
2354 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | 2373 | struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, |
2355 | unsigned long addr, unsigned long len, pgoff_t pgoff) | 2374 | unsigned long addr, unsigned long len, pgoff_t pgoff, |
2375 | bool *need_rmap_locks) | ||
2356 | { | 2376 | { |
2357 | struct vm_area_struct *vma = *vmap; | 2377 | struct vm_area_struct *vma = *vmap; |
2358 | unsigned long vma_start = vma->vm_start; | 2378 | unsigned long vma_start = vma->vm_start; |
@@ -2371,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2371 | faulted_in_anon_vma = false; | 2391 | faulted_in_anon_vma = false; |
2372 | } | 2392 | } |
2373 | 2393 | ||
2374 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2394 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) |
2395 | return NULL; /* should never get here */ | ||
2375 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2396 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
2376 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); | 2397 | vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); |
2377 | if (new_vma) { | 2398 | if (new_vma) { |
@@ -2393,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2393 | * linear if there are no pages mapped yet. | 2414 | * linear if there are no pages mapped yet. |
2394 | */ | 2415 | */ |
2395 | VM_BUG_ON(faulted_in_anon_vma); | 2416 | VM_BUG_ON(faulted_in_anon_vma); |
2396 | *vmap = new_vma; | 2417 | *vmap = vma = new_vma; |
2397 | } else | 2418 | } |
2398 | anon_vma_moveto_tail(new_vma); | 2419 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
2399 | } else { | 2420 | } else { |
2400 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2421 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2401 | if (new_vma) { | 2422 | if (new_vma) { |
2402 | *new_vma = *vma; | 2423 | *new_vma = *vma; |
2424 | new_vma->vm_start = addr; | ||
2425 | new_vma->vm_end = addr + len; | ||
2426 | new_vma->vm_pgoff = pgoff; | ||
2403 | pol = mpol_dup(vma_policy(vma)); | 2427 | pol = mpol_dup(vma_policy(vma)); |
2404 | if (IS_ERR(pol)) | 2428 | if (IS_ERR(pol)) |
2405 | goto out_free_vma; | 2429 | goto out_free_vma; |
2430 | vma_set_policy(new_vma, pol); | ||
2406 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); | 2431 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2407 | if (anon_vma_clone(new_vma, vma)) | 2432 | if (anon_vma_clone(new_vma, vma)) |
2408 | goto out_free_mempol; | 2433 | goto out_free_mempol; |
2409 | vma_set_policy(new_vma, pol); | 2434 | if (new_vma->vm_file) |
2410 | new_vma->vm_start = addr; | ||
2411 | new_vma->vm_end = addr + len; | ||
2412 | new_vma->vm_pgoff = pgoff; | ||
2413 | if (new_vma->vm_file) { | ||
2414 | get_file(new_vma->vm_file); | 2435 | get_file(new_vma->vm_file); |
2415 | |||
2416 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2417 | added_exe_file_vma(mm); | ||
2418 | } | ||
2419 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2436 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2420 | new_vma->vm_ops->open(new_vma); | 2437 | new_vma->vm_ops->open(new_vma); |
2421 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2438 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
2439 | *need_rmap_locks = false; | ||
2422 | } | 2440 | } |
2423 | } | 2441 | } |
2424 | return new_vma; | 2442 | return new_vma; |
@@ -2536,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex); | |||
2536 | 2554 | ||
2537 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | 2555 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) |
2538 | { | 2556 | { |
2539 | if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2557 | if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2540 | /* | 2558 | /* |
2541 | * The LSB of head.next can't change from under us | 2559 | * The LSB of head.next can't change from under us |
2542 | * because we hold the mm_all_locks_mutex. | 2560 | * because we hold the mm_all_locks_mutex. |
@@ -2552,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2552 | * anon_vma->root->mutex. | 2570 | * anon_vma->root->mutex. |
2553 | */ | 2571 | */ |
2554 | if (__test_and_set_bit(0, (unsigned long *) | 2572 | if (__test_and_set_bit(0, (unsigned long *) |
2555 | &anon_vma->root->head.next)) | 2573 | &anon_vma->root->rb_root.rb_node)) |
2556 | BUG(); | 2574 | BUG(); |
2557 | } | 2575 | } |
2558 | } | 2576 | } |
@@ -2593,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2593 | * A single task can't take more than one mm_take_all_locks() in a row | 2611 | * A single task can't take more than one mm_take_all_locks() in a row |
2594 | * or it would deadlock. | 2612 | * or it would deadlock. |
2595 | * | 2613 | * |
2596 | * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in | 2614 | * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in |
2597 | * mapping->flags avoid taking the same lock twice, if more than one | 2615 | * mapping->flags avoid taking the same lock twice, if more than one |
2598 | * vma in this mm is backed by the same anon_vma or address_space. | 2616 | * vma in this mm is backed by the same anon_vma or address_space. |
2599 | * | 2617 | * |
@@ -2640,13 +2658,13 @@ out_unlock: | |||
2640 | 2658 | ||
2641 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | 2659 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) |
2642 | { | 2660 | { |
2643 | if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { | 2661 | if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) { |
2644 | /* | 2662 | /* |
2645 | * The LSB of head.next can't change to 0 from under | 2663 | * The LSB of head.next can't change to 0 from under |
2646 | * us because we hold the mm_all_locks_mutex. | 2664 | * us because we hold the mm_all_locks_mutex. |
2647 | * | 2665 | * |
2648 | * We must however clear the bitflag before unlocking | 2666 | * We must however clear the bitflag before unlocking |
2649 | * the vma so the users using the anon_vma->head will | 2667 | * the vma so the users using the anon_vma->rb_root will |
2650 | * never see our bitflag. | 2668 | * never see our bitflag. |
2651 | * | 2669 | * |
2652 | * No need of atomic instructions here, head.next | 2670 | * No need of atomic instructions here, head.next |
@@ -2654,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2654 | * anon_vma->root->mutex. | 2672 | * anon_vma->root->mutex. |
2655 | */ | 2673 | */ |
2656 | if (!__test_and_clear_bit(0, (unsigned long *) | 2674 | if (!__test_and_clear_bit(0, (unsigned long *) |
2657 | &anon_vma->root->head.next)) | 2675 | &anon_vma->root->rb_root.rb_node)) |
2658 | BUG(); | 2676 | BUG(); |
2659 | anon_vma_unlock(anon_vma); | 2677 | anon_vma_unlock(anon_vma); |
2660 | } | 2678 | } |
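The vm_lock_anon_vma()/vm_unlock_anon_vma() hunks above keep the same trick and only re-point it at the rbtree field: an anon_vma root is marked "already locked for mm_take_all_locks()" by setting bit 0 of an aligned pointer (formerly head.next, now rb_root.rb_node). Aligned pointers never have that bit set on their own, and mm_all_locks_mutex serializes the writers, which is why the non-atomic __test_and_set_bit()/__test_and_clear_bit() are enough. Below is a minimal userspace sketch of the low-bit-in-pointer flag, assuming nothing beyond the C standard library; the toy_* names are invented for illustration.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct node { int payload; };            /* stand-in for an rb node */
struct root { struct node *first; };     /* stand-in for rb_root.rb_node */

/* Set/clear bit 0 of the stored pointer and report its old value.
 * Deliberately non-atomic: the caller is assumed to hold an outer
 * lock, just as mm_take_all_locks() holds mm_all_locks_mutex. */
static int toy_test_and_set_flag(struct root *r)
{
    uintptr_t v = (uintptr_t)r->first;

    r->first = (struct node *)(v | 1UL);
    return (int)(v & 1UL);
}

static int toy_test_and_clear_flag(struct root *r)
{
    uintptr_t v = (uintptr_t)r->first;

    r->first = (struct node *)(v & ~(uintptr_t)1);
    return (int)(v & 1UL);
}

int main(void)
{
    static struct node n;                /* aligned, so bit 0 starts clear */
    struct root r = { &n };

    assert(toy_test_and_set_flag(&r) == 0);   /* first "lock": bit was clear */
    assert(toy_test_and_set_flag(&r) == 1);   /* second attempt: already marked */
    assert(toy_test_and_clear_flag(&r) == 1); /* "unlock" removes the mark */
    printf("pointer restored intact: %s\n", r.first == &n ? "yes" : "no");
    return 0;
}

As in the kernel path, the helpers are intentionally non-atomic; it is the surrounding serialization, not the bit operation, that makes the flag safe.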
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 862b60822d9f..8a5ac8c686b0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -14,10 +14,14 @@ | |||
14 | #include <linux/export.h> | 14 | #include <linux/export.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
17 | #include <linux/srcu.h> | ||
17 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
18 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
19 | #include <linux/slab.h> | 20 | #include <linux/slab.h> |
20 | 21 | ||
22 | /* global SRCU for all MMs */ | ||
23 | static struct srcu_struct srcu; | ||
24 | |||
21 | /* | 25 | /* |
22 | * This function can't run concurrently against mmu_notifier_register | 26 | * This function can't run concurrently against mmu_notifier_register |
23 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | 27 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
@@ -25,8 +29,8 @@ | |||
25 | * in parallel despite there being no task using this mm any more, | 29 | * in parallel despite there being no task using this mm any more, |
26 | * through the vmas outside of the exit_mmap context, such as with | 30 | * through the vmas outside of the exit_mmap context, such as with |
27 | * vmtruncate. This serializes against mmu_notifier_unregister with | 31 | * vmtruncate. This serializes against mmu_notifier_unregister with |
28 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | 32 | * the mmu_notifier_mm->lock in addition to SRCU and it serializes |
29 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | 33 | * against the other mmu notifiers with SRCU. struct mmu_notifier_mm |
30 | * can't go away from under us as exit_mmap holds an mm_count pin | 34 | * can't go away from under us as exit_mmap holds an mm_count pin |
31 | * itself. | 35 | * itself. |
32 | */ | 36 | */ |
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
34 | { | 38 | { |
35 | struct mmu_notifier *mn; | 39 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | 40 | struct hlist_node *n; |
41 | int id; | ||
37 | 42 | ||
38 | /* | 43 | /* |
39 | * RCU here will block mmu_notifier_unregister until | 44 | * SRCU here will block mmu_notifier_unregister until |
40 | * ->release returns. | 45 | * ->release returns. |
41 | */ | 46 | */ |
42 | rcu_read_lock(); | 47 | id = srcu_read_lock(&srcu); |
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | 48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) |
44 | /* | 49 | /* |
45 | * if ->release runs before mmu_notifier_unregister it | 50 | * if ->release runs before mmu_notifier_unregister it |
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
50 | */ | 55 | */ |
51 | if (mn->ops->release) | 56 | if (mn->ops->release) |
52 | mn->ops->release(mn, mm); | 57 | mn->ops->release(mn, mm); |
53 | rcu_read_unlock(); | 58 | srcu_read_unlock(&srcu, id); |
54 | 59 | ||
55 | spin_lock(&mm->mmu_notifier_mm->lock); | 60 | spin_lock(&mm->mmu_notifier_mm->lock); |
56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
68 | spin_unlock(&mm->mmu_notifier_mm->lock); | 73 | spin_unlock(&mm->mmu_notifier_mm->lock); |
69 | 74 | ||
70 | /* | 75 | /* |
71 | * synchronize_rcu here prevents mmu_notifier_release to | 76 | * synchronize_srcu here prevents mmu_notifier_release to |
72 | * return to exit_mmap (which would proceed freeing all pages | 77 | * return to exit_mmap (which would proceed freeing all pages |
73 | * in the mm) until the ->release method returns, if it was | 78 | * in the mm) until the ->release method returns, if it was |
74 | * invoked by mmu_notifier_unregister. | 79 | * invoked by mmu_notifier_unregister. |
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
76 | * The mmu_notifier_mm can't go away from under us because one | 81 | * The mmu_notifier_mm can't go away from under us because one |
77 | * mm_count is held by exit_mmap. | 82 | * mm_count is held by exit_mmap. |
78 | */ | 83 | */ |
79 | synchronize_rcu(); | 84 | synchronize_srcu(&srcu); |
80 | } | 85 | } |
81 | 86 | ||
82 | /* | 87 | /* |
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
89 | { | 94 | { |
90 | struct mmu_notifier *mn; | 95 | struct mmu_notifier *mn; |
91 | struct hlist_node *n; | 96 | struct hlist_node *n; |
92 | int young = 0; | 97 | int young = 0, id; |
93 | 98 | ||
94 | rcu_read_lock(); | 99 | id = srcu_read_lock(&srcu); |
95 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 100 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
96 | if (mn->ops->clear_flush_young) | 101 | if (mn->ops->clear_flush_young) |
97 | young |= mn->ops->clear_flush_young(mn, mm, address); | 102 | young |= mn->ops->clear_flush_young(mn, mm, address); |
98 | } | 103 | } |
99 | rcu_read_unlock(); | 104 | srcu_read_unlock(&srcu, id); |
100 | 105 | ||
101 | return young; | 106 | return young; |
102 | } | 107 | } |
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
106 | { | 111 | { |
107 | struct mmu_notifier *mn; | 112 | struct mmu_notifier *mn; |
108 | struct hlist_node *n; | 113 | struct hlist_node *n; |
109 | int young = 0; | 114 | int young = 0, id; |
110 | 115 | ||
111 | rcu_read_lock(); | 116 | id = srcu_read_lock(&srcu); |
112 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 117 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
113 | if (mn->ops->test_young) { | 118 | if (mn->ops->test_young) { |
114 | young = mn->ops->test_young(mn, mm, address); | 119 | young = mn->ops->test_young(mn, mm, address); |
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm, | |||
116 | break; | 121 | break; |
117 | } | 122 | } |
118 | } | 123 | } |
119 | rcu_read_unlock(); | 124 | srcu_read_unlock(&srcu, id); |
120 | 125 | ||
121 | return young; | 126 | return young; |
122 | } | 127 | } |
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | |||
126 | { | 131 | { |
127 | struct mmu_notifier *mn; | 132 | struct mmu_notifier *mn; |
128 | struct hlist_node *n; | 133 | struct hlist_node *n; |
134 | int id; | ||
129 | 135 | ||
130 | rcu_read_lock(); | 136 | id = srcu_read_lock(&srcu); |
131 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
132 | if (mn->ops->change_pte) | 138 | if (mn->ops->change_pte) |
133 | mn->ops->change_pte(mn, mm, address, pte); | 139 | mn->ops->change_pte(mn, mm, address, pte); |
134 | /* | ||
135 | * Some drivers don't have change_pte, | ||
136 | * so we must call invalidate_page in that case. | ||
137 | */ | ||
138 | else if (mn->ops->invalidate_page) | ||
139 | mn->ops->invalidate_page(mn, mm, address); | ||
140 | } | 140 | } |
141 | rcu_read_unlock(); | 141 | srcu_read_unlock(&srcu, id); |
142 | } | 142 | } |
143 | 143 | ||
144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | 144 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, |
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm, | |||
146 | { | 146 | { |
147 | struct mmu_notifier *mn; | 147 | struct mmu_notifier *mn; |
148 | struct hlist_node *n; | 148 | struct hlist_node *n; |
149 | int id; | ||
149 | 150 | ||
150 | rcu_read_lock(); | 151 | id = srcu_read_lock(&srcu); |
151 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 152 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
152 | if (mn->ops->invalidate_page) | 153 | if (mn->ops->invalidate_page) |
153 | mn->ops->invalidate_page(mn, mm, address); | 154 | mn->ops->invalidate_page(mn, mm, address); |
154 | } | 155 | } |
155 | rcu_read_unlock(); | 156 | srcu_read_unlock(&srcu, id); |
156 | } | 157 | } |
157 | 158 | ||
158 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | 159 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, |
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | |||
160 | { | 161 | { |
161 | struct mmu_notifier *mn; | 162 | struct mmu_notifier *mn; |
162 | struct hlist_node *n; | 163 | struct hlist_node *n; |
164 | int id; | ||
163 | 165 | ||
164 | rcu_read_lock(); | 166 | id = srcu_read_lock(&srcu); |
165 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 167 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
166 | if (mn->ops->invalidate_range_start) | 168 | if (mn->ops->invalidate_range_start) |
167 | mn->ops->invalidate_range_start(mn, mm, start, end); | 169 | mn->ops->invalidate_range_start(mn, mm, start, end); |
168 | } | 170 | } |
169 | rcu_read_unlock(); | 171 | srcu_read_unlock(&srcu, id); |
170 | } | 172 | } |
171 | 173 | ||
172 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | 174 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | |||
174 | { | 176 | { |
175 | struct mmu_notifier *mn; | 177 | struct mmu_notifier *mn; |
176 | struct hlist_node *n; | 178 | struct hlist_node *n; |
179 | int id; | ||
177 | 180 | ||
178 | rcu_read_lock(); | 181 | id = srcu_read_lock(&srcu); |
179 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | 182 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
180 | if (mn->ops->invalidate_range_end) | 183 | if (mn->ops->invalidate_range_end) |
181 | mn->ops->invalidate_range_end(mn, mm, start, end); | 184 | mn->ops->invalidate_range_end(mn, mm, start, end); |
182 | } | 185 | } |
183 | rcu_read_unlock(); | 186 | srcu_read_unlock(&srcu, id); |
184 | } | 187 | } |
185 | 188 | ||
186 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | 189 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
@@ -192,6 +195,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
192 | 195 | ||
193 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 196 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
194 | 197 | ||
198 | /* | ||
199 | * Verify that mmu_notifier_init() already run and the global srcu is | ||
200 | * initialized. | ||
201 | */ | ||
202 | BUG_ON(!srcu.per_cpu_ref); | ||
203 | |||
195 | ret = -ENOMEM; | 204 | ret = -ENOMEM; |
196 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | 205 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); |
197 | if (unlikely(!mmu_notifier_mm)) | 206 | if (unlikely(!mmu_notifier_mm)) |
@@ -201,11 +210,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
201 | down_write(&mm->mmap_sem); | 210 | down_write(&mm->mmap_sem); |
202 | ret = mm_take_all_locks(mm); | 211 | ret = mm_take_all_locks(mm); |
203 | if (unlikely(ret)) | 212 | if (unlikely(ret)) |
204 | goto out_cleanup; | 213 | goto out_clean; |
205 | 214 | ||
206 | if (!mm_has_notifiers(mm)) { | 215 | if (!mm_has_notifiers(mm)) { |
207 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | 216 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); |
208 | spin_lock_init(&mmu_notifier_mm->lock); | 217 | spin_lock_init(&mmu_notifier_mm->lock); |
218 | |||
209 | mm->mmu_notifier_mm = mmu_notifier_mm; | 219 | mm->mmu_notifier_mm = mmu_notifier_mm; |
210 | mmu_notifier_mm = NULL; | 220 | mmu_notifier_mm = NULL; |
211 | } | 221 | } |
@@ -224,10 +234,9 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, | |||
224 | spin_unlock(&mm->mmu_notifier_mm->lock); | 234 | spin_unlock(&mm->mmu_notifier_mm->lock); |
225 | 235 | ||
226 | mm_drop_all_locks(mm); | 236 | mm_drop_all_locks(mm); |
227 | out_cleanup: | 237 | out_clean: |
228 | if (take_mmap_sem) | 238 | if (take_mmap_sem) |
229 | up_write(&mm->mmap_sem); | 239 | up_write(&mm->mmap_sem); |
230 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | ||
231 | kfree(mmu_notifier_mm); | 240 | kfree(mmu_notifier_mm); |
232 | out: | 241 | out: |
233 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | 242 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm) | |||
274 | /* | 283 | /* |
275 | * This releases the mm_count pin automatically and frees the mm | 284 | * This releases the mm_count pin automatically and frees the mm |
276 | * structure if it was the last user of it. It serializes against | 285 | * structure if it was the last user of it. It serializes against |
277 | * running mmu notifiers with RCU and against mmu_notifier_unregister | 286 | * running mmu notifiers with SRCU and against mmu_notifier_unregister |
278 | * with the unregister lock + RCU. All sptes must be dropped before | 287 | * with the unregister lock + SRCU. All sptes must be dropped before |
279 | * calling mmu_notifier_unregister. ->release or any other notifier | 288 | * calling mmu_notifier_unregister. ->release or any other notifier |
280 | * method may be invoked concurrently with mmu_notifier_unregister, | 289 | * method may be invoked concurrently with mmu_notifier_unregister, |
281 | * and only after mmu_notifier_unregister returned we're guaranteed | 290 | * and only after mmu_notifier_unregister returned we're guaranteed |
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
287 | 296 | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 297 | if (!hlist_unhashed(&mn->hlist)) { |
289 | /* | 298 | /* |
290 | * RCU here will force exit_mmap to wait ->release to finish | 299 | * SRCU here will force exit_mmap to wait ->release to finish |
291 | * before freeing the pages. | 300 | * before freeing the pages. |
292 | */ | 301 | */ |
293 | rcu_read_lock(); | 302 | int id; |
294 | 303 | ||
304 | id = srcu_read_lock(&srcu); | ||
295 | /* | 305 | /* |
296 | * exit_mmap will block in mmu_notifier_release to | 306 | * exit_mmap will block in mmu_notifier_release to |
297 | * guarantee ->release is called before freeing the | 307 | * guarantee ->release is called before freeing the |
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
299 | */ | 309 | */ |
300 | if (mn->ops->release) | 310 | if (mn->ops->release) |
301 | mn->ops->release(mn, mm); | 311 | mn->ops->release(mn, mm); |
302 | rcu_read_unlock(); | 312 | srcu_read_unlock(&srcu, id); |
303 | 313 | ||
304 | spin_lock(&mm->mmu_notifier_mm->lock); | 314 | spin_lock(&mm->mmu_notifier_mm->lock); |
305 | hlist_del_rcu(&mn->hlist); | 315 | hlist_del_rcu(&mn->hlist); |
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
310 | * Wait for any running method to finish, of course including | 320 | * Wait for any running method to finish, of course including |
311 | * ->release if it was run by mmu_notifier_release instead of us. | 321 | * ->release if it was run by mmu_notifier_release instead of us. |
312 | */ | 322 | */ |
313 | synchronize_rcu(); | 323 | synchronize_srcu(&srcu); |
314 | 324 | ||
315 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 325 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
316 | 326 | ||
317 | mmdrop(mm); | 327 | mmdrop(mm); |
318 | } | 328 | } |
319 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | 329 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
330 | |||
331 | static int __init mmu_notifier_init(void) | ||
332 | { | ||
333 | return init_srcu_struct(&srcu); | ||
334 | } | ||
335 | |||
336 | module_init(mmu_notifier_init); | ||
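The mm/mmu_notifier.c conversion above swaps rcu_read_lock()/synchronize_rcu() for one global srcu_struct, so notifier callbacks such as ->release() may sleep while the unregister and exit paths still wait for every callback that already started. The sketch below is not SRCU itself — it is only a userspace stand-in (pthreads, invented toy_* names) for the guarantee the call sites rely on: "synchronize" returns only after read sections that began earlier have finished.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* A callback runs inside a read section that is allowed to sleep, and
 * toy_synchronize() blocks until every read section that started
 * earlier has finished; later readers are not waited for, matching the
 * synchronize_srcu() contract used above. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
static int readers;

static void toy_read_lock(void)
{
    pthread_mutex_lock(&lock);
    readers++;
    pthread_mutex_unlock(&lock);
}

static void toy_read_unlock(void)
{
    pthread_mutex_lock(&lock);
    if (--readers == 0)
        pthread_cond_broadcast(&drained);
    pthread_mutex_unlock(&lock);
}

static void toy_synchronize(void)
{
    pthread_mutex_lock(&lock);
    while (readers)
        pthread_cond_wait(&drained, &lock);
    pthread_mutex_unlock(&lock);
}

static void *notifier_walk(void *arg)
{
    (void)arg;
    toy_read_lock();
    usleep(100 * 1000);      /* a sleeping ->release()-style callback */
    puts("callback finished");
    toy_read_unlock();
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, notifier_walk, NULL);
    usleep(10 * 1000);       /* give the walker a chance to start */
    toy_synchronize();       /* returns only after any in-flight callback */
    puts("safe to tear down");
    pthread_join(t, NULL);
    return 0;
}

Real SRCU provides the same guarantee without a shared counter by keeping per-CPU counts, which is why a single static srcu_struct can serve every mm cheaply.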
diff --git a/mm/mremap.c b/mm/mremap.c index cc06d0e48d05..1b61c2d3307a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | 71 | static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, |
72 | unsigned long old_addr, unsigned long old_end, | 72 | unsigned long old_addr, unsigned long old_end, |
73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, | 73 | struct vm_area_struct *new_vma, pmd_t *new_pmd, |
74 | unsigned long new_addr) | 74 | unsigned long new_addr, bool need_rmap_locks) |
75 | { | 75 | { |
76 | struct address_space *mapping = NULL; | 76 | struct address_space *mapping = NULL; |
77 | struct anon_vma *anon_vma = NULL; | ||
77 | struct mm_struct *mm = vma->vm_mm; | 78 | struct mm_struct *mm = vma->vm_mm; |
78 | pte_t *old_pte, *new_pte, pte; | 79 | pte_t *old_pte, *new_pte, pte; |
79 | spinlock_t *old_ptl, *new_ptl; | 80 | spinlock_t *old_ptl, *new_ptl; |
80 | 81 | ||
81 | if (vma->vm_file) { | 82 | /* |
82 | /* | 83 | * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma |
83 | * Subtle point from Rajesh Venkatasubramanian: before | 84 | * locks to ensure that rmap will always observe either the old or the |
84 | * moving file-based ptes, we must lock truncate_pagecache | 85 | * new ptes. This is the easiest way to avoid races with |
85 | * out, since it might clean the dst vma before the src vma, | 86 | * truncate_pagecache(), page migration, etc... |
86 | * and we propagate stale pages into the dst afterward. | 87 | * |
87 | */ | 88 | * When need_rmap_locks is false, we use other ways to avoid |
88 | mapping = vma->vm_file->f_mapping; | 89 | * such races: |
89 | mutex_lock(&mapping->i_mmap_mutex); | 90 | * |
91 | * - During exec() shift_arg_pages(), we use a specially tagged vma | ||
92 | * which rmap call sites look for using is_vma_temporary_stack(). | ||
93 | * | ||
94 | * - During mremap(), new_vma is often known to be placed after vma | ||
95 | * in rmap traversal order. This ensures rmap will always observe | ||
96 | * either the old pte, or the new pte, or both (the page table locks | ||
97 | * serialize access to individual ptes, but only rmap traversal | ||
98 | * order guarantees that we won't miss both the old and new ptes). | ||
99 | */ | ||
100 | if (need_rmap_locks) { | ||
101 | if (vma->vm_file) { | ||
102 | mapping = vma->vm_file->f_mapping; | ||
103 | mutex_lock(&mapping->i_mmap_mutex); | ||
104 | } | ||
105 | if (vma->anon_vma) { | ||
106 | anon_vma = vma->anon_vma; | ||
107 | anon_vma_lock(anon_vma); | ||
108 | } | ||
90 | } | 109 | } |
91 | 110 | ||
92 | /* | 111 | /* |
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
114 | spin_unlock(new_ptl); | 133 | spin_unlock(new_ptl); |
115 | pte_unmap(new_pte - 1); | 134 | pte_unmap(new_pte - 1); |
116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 135 | pte_unmap_unlock(old_pte - 1, old_ptl); |
136 | if (anon_vma) | ||
137 | anon_vma_unlock(anon_vma); | ||
117 | if (mapping) | 138 | if (mapping) |
118 | mutex_unlock(&mapping->i_mmap_mutex); | 139 | mutex_unlock(&mapping->i_mmap_mutex); |
119 | } | 140 | } |
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
122 | 143 | ||
123 | unsigned long move_page_tables(struct vm_area_struct *vma, | 144 | unsigned long move_page_tables(struct vm_area_struct *vma, |
124 | unsigned long old_addr, struct vm_area_struct *new_vma, | 145 | unsigned long old_addr, struct vm_area_struct *new_vma, |
125 | unsigned long new_addr, unsigned long len) | 146 | unsigned long new_addr, unsigned long len, |
147 | bool need_rmap_locks) | ||
126 | { | 148 | { |
127 | unsigned long extent, next, old_end; | 149 | unsigned long extent, next, old_end; |
128 | pmd_t *old_pmd, *new_pmd; | 150 | pmd_t *old_pmd, *new_pmd; |
129 | bool need_flush = false; | 151 | bool need_flush = false; |
152 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
153 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
130 | 154 | ||
131 | old_end = old_addr + len; | 155 | old_end = old_addr + len; |
132 | flush_cache_range(vma, old_addr, old_end); | 156 | flush_cache_range(vma, old_addr, old_end); |
133 | 157 | ||
134 | mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); | 158 | mmun_start = old_addr; |
159 | mmun_end = old_end; | ||
160 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | ||
135 | 161 | ||
136 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { | 162 | for (; old_addr < old_end; old_addr += extent, new_addr += extent) { |
137 | cond_resched(); | 163 | cond_resched(); |
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
169 | if (extent > LATENCY_LIMIT) | 195 | if (extent > LATENCY_LIMIT) |
170 | extent = LATENCY_LIMIT; | 196 | extent = LATENCY_LIMIT; |
171 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, | 197 | move_ptes(vma, old_pmd, old_addr, old_addr + extent, |
172 | new_vma, new_pmd, new_addr); | 198 | new_vma, new_pmd, new_addr, need_rmap_locks); |
173 | need_flush = true; | 199 | need_flush = true; |
174 | } | 200 | } |
175 | if (likely(need_flush)) | 201 | if (likely(need_flush)) |
176 | flush_tlb_range(vma, old_end-len, old_addr); | 202 | flush_tlb_range(vma, old_end-len, old_addr); |
177 | 203 | ||
178 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); | 204 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
179 | 205 | ||
180 | return len + old_addr - old_end; /* how much done */ | 206 | return len + old_addr - old_end; /* how much done */ |
181 | } | 207 | } |
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
193 | unsigned long hiwater_vm; | 219 | unsigned long hiwater_vm; |
194 | int split = 0; | 220 | int split = 0; |
195 | int err; | 221 | int err; |
222 | bool need_rmap_locks; | ||
196 | 223 | ||
197 | /* | 224 | /* |
198 | * We'd prefer to avoid failure later on in do_munmap: | 225 | * We'd prefer to avoid failure later on in do_munmap: |
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
214 | return err; | 241 | return err; |
215 | 242 | ||
216 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); | 243 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); |
217 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); | 244 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, |
245 | &need_rmap_locks); | ||
218 | if (!new_vma) | 246 | if (!new_vma) |
219 | return -ENOMEM; | 247 | return -ENOMEM; |
220 | 248 | ||
221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | 249 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, |
250 | need_rmap_locks); | ||
222 | if (moved_len < old_len) { | 251 | if (moved_len < old_len) { |
223 | /* | 252 | /* |
224 | * Before moving the page tables from the new vma to | ||
225 | * the old vma, we need to be sure the old vma is | ||
226 | * queued after new vma in the same_anon_vma list to | ||
227 | * prevent SMP races with rmap_walk (that could lead | ||
228 | * rmap_walk to miss some page table). | ||
229 | */ | ||
230 | anon_vma_moveto_tail(vma); | ||
231 | |||
232 | /* | ||
233 | * On error, move entries back from new area to old, | 253 | * On error, move entries back from new area to old, |
234 | * which will succeed since page tables still there, | 254 | * which will succeed since page tables still there, |
235 | * and then proceed to unmap new area instead of old. | 255 | * and then proceed to unmap new area instead of old. |
236 | */ | 256 | */ |
237 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); | 257 | move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, |
258 | true); | ||
238 | vma = new_vma; | 259 | vma = new_vma; |
239 | old_len = new_len; | 260 | old_len = new_len; |
240 | old_addr = new_addr; | 261 | old_addr = new_addr; |
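move_ptes() above now takes the file and anon rmap locks only when copy_vma() reported that an rmap walk could otherwise miss the ptes being moved; it remembers what it locked through local pointers and releases in reverse order. Below is a structural sketch of that conditional-locking shape, using pthreads and invented names rather than the kernel's i_mmap_mutex/anon_vma lock types.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t toy_i_mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t toy_anon_vma_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Take both rmap-style locks only when the caller says the move can
 * race with an rmap walk; track what was taken via local pointers and
 * drop them in the reverse order of acquisition. */
static void move_range(bool has_file, bool has_anon, bool need_rmap_locks)
{
    pthread_mutex_t *mapping = NULL, *anon = NULL;

    if (need_rmap_locks) {
        if (has_file) {
            mapping = &toy_i_mmap_mutex;
            pthread_mutex_lock(mapping);
        }
        if (has_anon) {
            anon = &toy_anon_vma_mutex;
            pthread_mutex_lock(anon);
        }
    }

    puts("moving entries (an rmap walk sees either the old or new location)");

    if (anon)
        pthread_mutex_unlock(anon);
    if (mapping)
        pthread_mutex_unlock(mapping);
}

int main(void)
{
    move_range(true, true, true);    /* e.g. a move where rmap order is not enough */
    move_range(true, true, false);   /* e.g. exec()'s tagged stack shift */
    return 0;
}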
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 405573010f99..714d5d650470 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start, | |||
116 | return 0; | 116 | return 0; |
117 | 117 | ||
118 | __free_pages_memory(start_pfn, end_pfn); | 118 | __free_pages_memory(start_pfn, end_pfn); |
119 | fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT), | ||
120 | start_pfn, end_pfn); | ||
119 | 121 | ||
120 | return end_pfn - start_pfn; | 122 | return end_pfn - start_pfn; |
121 | } | 123 | } |
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
126 | phys_addr_t start, end, size; | 128 | phys_addr_t start, end, size; |
127 | u64 i; | 129 | u64 i; |
128 | 130 | ||
131 | reset_zone_present_pages(); | ||
129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) | 132 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
130 | count += __free_memory_core(start, end); | 133 | count += __free_memory_core(start, end); |
131 | 134 | ||
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void) | |||
162 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 165 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
163 | * because in some case like Node0 doesn't have RAM installed | 166 | * because in some case like Node0 doesn't have RAM installed |
164 | * low ram will be on Node1 | 167 | * low ram will be on Node1 |
165 | * Use MAX_NUMNODES will make sure all ranges in early_node_map[] | ||
166 | * will be used instead of only Node0 related | ||
167 | */ | 168 | */ |
168 | return free_low_memory_core_early(MAX_NUMNODES); | 169 | return free_low_memory_core_early(MAX_NUMNODES); |
169 | } | 170 | } |
diff --git a/mm/nommu.c b/mm/nommu.c index d4b0c10872de..45131b41bcdb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
698 | 698 | ||
699 | mutex_lock(&mapping->i_mmap_mutex); | 699 | mutex_lock(&mapping->i_mmap_mutex); |
700 | flush_dcache_mmap_lock(mapping); | 700 | flush_dcache_mmap_lock(mapping); |
701 | vma_prio_tree_insert(vma, &mapping->i_mmap); | 701 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
702 | flush_dcache_mmap_unlock(mapping); | 702 | flush_dcache_mmap_unlock(mapping); |
703 | mutex_unlock(&mapping->i_mmap_mutex); | 703 | mutex_unlock(&mapping->i_mmap_mutex); |
704 | } | 704 | } |
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
764 | 764 | ||
765 | mutex_lock(&mapping->i_mmap_mutex); | 765 | mutex_lock(&mapping->i_mmap_mutex); |
766 | flush_dcache_mmap_lock(mapping); | 766 | flush_dcache_mmap_lock(mapping); |
767 | vma_prio_tree_remove(vma, &mapping->i_mmap); | 767 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
768 | flush_dcache_mmap_unlock(mapping); | 768 | flush_dcache_mmap_unlock(mapping); |
769 | mutex_unlock(&mapping->i_mmap_mutex); | 769 | mutex_unlock(&mapping->i_mmap_mutex); |
770 | } | 770 | } |
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | |||
789 | kenter("%p", vma); | 789 | kenter("%p", vma); |
790 | if (vma->vm_ops && vma->vm_ops->close) | 790 | if (vma->vm_ops && vma->vm_ops->close) |
791 | vma->vm_ops->close(vma); | 791 | vma->vm_ops->close(vma); |
792 | if (vma->vm_file) { | 792 | if (vma->vm_file) |
793 | fput(vma->vm_file); | 793 | fput(vma->vm_file); |
794 | if (vma->vm_flags & VM_EXECUTABLE) | ||
795 | removed_exe_file_vma(mm); | ||
796 | } | ||
797 | put_nommu_region(vma->vm_region); | 794 | put_nommu_region(vma->vm_region); |
798 | kmem_cache_free(vm_area_cachep, vma); | 795 | kmem_cache_free(vm_area_cachep, vma); |
799 | } | 796 | } |
@@ -1282,14 +1279,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1282 | vma->vm_pgoff = pgoff; | 1279 | vma->vm_pgoff = pgoff; |
1283 | 1280 | ||
1284 | if (file) { | 1281 | if (file) { |
1285 | region->vm_file = file; | 1282 | region->vm_file = get_file(file); |
1286 | get_file(file); | 1283 | vma->vm_file = get_file(file); |
1287 | vma->vm_file = file; | ||
1288 | get_file(file); | ||
1289 | if (vm_flags & VM_EXECUTABLE) { | ||
1290 | added_exe_file_vma(current->mm); | ||
1291 | vma->vm_mm = current->mm; | ||
1292 | } | ||
1293 | } | 1284 | } |
1294 | 1285 | ||
1295 | down_write(&nommu_region_sem); | 1286 | down_write(&nommu_region_sem); |
@@ -1442,8 +1433,6 @@ error: | |||
1442 | kmem_cache_free(vm_region_jar, region); | 1433 | kmem_cache_free(vm_region_jar, region); |
1443 | if (vma->vm_file) | 1434 | if (vma->vm_file) |
1444 | fput(vma->vm_file); | 1435 | fput(vma->vm_file); |
1445 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1446 | removed_exe_file_vma(vma->vm_mm); | ||
1447 | kmem_cache_free(vm_area_cachep, vma); | 1436 | kmem_cache_free(vm_area_cachep, vma); |
1448 | kleave(" = %d", ret); | 1437 | kleave(" = %d", ret); |
1449 | return ret; | 1438 | return ret; |
@@ -1822,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | |||
1822 | if (addr != (pfn << PAGE_SHIFT)) | 1811 | if (addr != (pfn << PAGE_SHIFT)) |
1823 | return -EINVAL; | 1812 | return -EINVAL; |
1824 | 1813 | ||
1825 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | 1814 | vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; |
1826 | return 0; | 1815 | return 0; |
1827 | } | 1816 | } |
1828 | EXPORT_SYMBOL(remap_pfn_range); | 1817 | EXPORT_SYMBOL(remap_pfn_range); |
@@ -1963,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1963 | } | 1952 | } |
1964 | EXPORT_SYMBOL(filemap_fault); | 1953 | EXPORT_SYMBOL(filemap_fault); |
1965 | 1954 | ||
1955 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
1956 | unsigned long size, pgoff_t pgoff) | ||
1957 | { | ||
1958 | BUG(); | ||
1959 | return 0; | ||
1960 | } | ||
1961 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
1962 | |||
1966 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 1963 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
1967 | unsigned long addr, void *buf, int len, int write) | 1964 | unsigned long addr, void *buf, int len, int write) |
1968 | { | 1965 | { |
@@ -2047,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2047 | size_t newsize) | 2044 | size_t newsize) |
2048 | { | 2045 | { |
2049 | struct vm_area_struct *vma; | 2046 | struct vm_area_struct *vma; |
2050 | struct prio_tree_iter iter; | ||
2051 | struct vm_region *region; | 2047 | struct vm_region *region; |
2052 | pgoff_t low, high; | 2048 | pgoff_t low, high; |
2053 | size_t r_size, r_top; | 2049 | size_t r_size, r_top; |
@@ -2059,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2059 | mutex_lock(&inode->i_mapping->i_mmap_mutex); | 2055 | mutex_lock(&inode->i_mapping->i_mmap_mutex); |
2060 | 2056 | ||
2061 | /* search for VMAs that fall within the dead zone */ | 2057 | /* search for VMAs that fall within the dead zone */ |
2062 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2058 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) { |
2063 | low, high) { | ||
2064 | /* found one - only interested if it's shared out of the page | 2059 | /* found one - only interested if it's shared out of the page |
2065 | * cache */ | 2060 | * cache */ |
2066 | if (vma->vm_flags & VM_SHARED) { | 2061 | if (vma->vm_flags & VM_SHARED) { |
@@ -2076,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | |||
2076 | * we don't check for any regions that start beyond the EOF as there | 2071 | * we don't check for any regions that start beyond the EOF as there |
2077 | * shouldn't be any | 2072 | * shouldn't be any |
2078 | */ | 2073 | */ |
2079 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | 2074 | vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, |
2080 | 0, ULONG_MAX) { | 2075 | 0, ULONG_MAX) { |
2081 | if (!(vma->vm_flags & VM_SHARED)) | 2076 | if (!(vma->vm_flags & VM_SHARED)) |
2082 | continue; | 2077 | continue; |
2083 | 2078 | ||
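The nommu.c hunks above replace the prio-tree walk with vma_interval_tree_foreach(), whose contract is simply "visit every vma whose file-page interval overlaps [low, high]". The linear userspace sketch below only pins down that overlap predicate, with invented toy_vma field names; the interval tree delivers the same answers in O(log n + hits).

#include <stdio.h>

struct toy_vma {
    unsigned long pgoff;   /* first file page mapped */
    unsigned long last;    /* last file page mapped  */
};

/* A vma intersects the query window iff it starts at or before the
 * window's end and ends at or after the window's start. */
static int overlaps(const struct toy_vma *v, unsigned long low, unsigned long high)
{
    return v->pgoff <= high && v->last >= low;
}

int main(void)
{
    struct toy_vma vmas[] = { { 0, 3 }, { 4, 9 }, { 16, 31 } };
    unsigned long low = 8, high = 20;   /* e.g. the "dead zone" being shrunk */

    for (unsigned int i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
        if (overlaps(&vmas[i], low, high))
            printf("vma %u [%lu,%lu] intersects [%lu,%lu]\n",
                   i, vmas[i].pgoff, vmas[i].last, low, high);
    return 0;
}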
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 198600861638..79e0f3e24831 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
428 | { | 428 | { |
429 | task_lock(current); | 429 | task_lock(current); |
430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 430 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
431 | "oom_adj=%d, oom_score_adj=%d\n", | 431 | "oom_score_adj=%d\n", |
432 | current->comm, gfp_mask, order, current->signal->oom_adj, | 432 | current->comm, gfp_mask, order, |
433 | current->signal->oom_score_adj); | 433 | current->signal->oom_score_adj); |
434 | cpuset_print_task_mems_allowed(current); | 434 | cpuset_print_task_mems_allowed(current); |
435 | task_unlock(current); | 435 | task_unlock(current); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5ad5ce23c1e0..830893b2b3c7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1602,10 +1602,18 @@ void writeback_set_ratelimit(void) | |||
1602 | } | 1602 | } |
1603 | 1603 | ||
1604 | static int __cpuinit | 1604 | static int __cpuinit |
1605 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) | 1605 | ratelimit_handler(struct notifier_block *self, unsigned long action, |
1606 | void *hcpu) | ||
1606 | { | 1607 | { |
1607 | writeback_set_ratelimit(); | 1608 | |
1608 | return NOTIFY_DONE; | 1609 | switch (action & ~CPU_TASKS_FROZEN) { |
1610 | case CPU_ONLINE: | ||
1611 | case CPU_DEAD: | ||
1612 | writeback_set_ratelimit(); | ||
1613 | return NOTIFY_OK; | ||
1614 | default: | ||
1615 | return NOTIFY_DONE; | ||
1616 | } | ||
1609 | } | 1617 | } |
1610 | 1618 | ||
1611 | static struct notifier_block __cpuinitdata ratelimit_nb = { | 1619 | static struct notifier_block __cpuinitdata ratelimit_nb = { |
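The ratelimit_handler() rewrite above filters CPU hotplug events: suspend/resume replays the same transitions with CPU_TASKS_FROZEN ORed in, so masking that bit lets one switch cover both variants, while anything other than CPU_ONLINE/CPU_DEAD falls through as NOTIFY_DONE. A standalone sketch of the masking pattern follows; the constant values are illustrative, not copied from the kernel headers.

#include <stdio.h>

#define CPU_ONLINE        0x0002   /* illustrative values */
#define CPU_UP_PREPARE    0x0003
#define CPU_DEAD          0x0007
#define CPU_TASKS_FROZEN  0x0010
#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)

#define NOTIFY_DONE 0
#define NOTIFY_OK   1

/* Mask off the "frozen" modifier so the resume-time replays of online
 * and dead events are handled exactly like the hotplug originals. */
static int toy_ratelimit_handler(unsigned long action)
{
    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_ONLINE:
    case CPU_DEAD:
        puts("recompute writeback ratelimit");
        return NOTIFY_OK;
    default:
        return NOTIFY_DONE;
    }
}

int main(void)
{
    toy_ratelimit_handler(CPU_ONLINE);        /* hotplug     -> recompute */
    toy_ratelimit_handler(CPU_ONLINE_FROZEN); /* resume path -> recompute */
    toy_ratelimit_handler(CPU_UP_PREPARE);    /* ignored, NOTIFY_DONE     */
    return 0;
}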
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c13ea7538891..5b74de6702e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page, | |||
558 | if (page_is_guard(buddy)) { | 558 | if (page_is_guard(buddy)) { |
559 | clear_page_guard_flag(buddy); | 559 | clear_page_guard_flag(buddy); |
560 | set_page_private(page, 0); | 560 | set_page_private(page, 0); |
561 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 561 | __mod_zone_freepage_state(zone, 1 << order, |
562 | migratetype); | ||
562 | } else { | 563 | } else { |
563 | list_del(&buddy->lru); | 564 | list_del(&buddy->lru); |
564 | zone->free_area[order].nr_free--; | 565 | zone->free_area[order].nr_free--; |
@@ -597,17 +598,6 @@ out: | |||
597 | zone->free_area[order].nr_free++; | 598 | zone->free_area[order].nr_free++; |
598 | } | 599 | } |
599 | 600 | ||
600 | /* | ||
601 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
602 | * Page should not be on lru, so no need to fix that up. | ||
603 | * free_pages_check() will verify... | ||
604 | */ | ||
605 | static inline void free_page_mlock(struct page *page) | ||
606 | { | ||
607 | __dec_zone_page_state(page, NR_MLOCK); | ||
608 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
609 | } | ||
610 | |||
611 | static inline int free_pages_check(struct page *page) | 601 | static inline int free_pages_check(struct page *page) |
612 | { | 602 | { |
613 | if (unlikely(page_mapcount(page) | | 603 | if (unlikely(page_mapcount(page) | |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
668 | batch_free = to_free; | 658 | batch_free = to_free; |
669 | 659 | ||
670 | do { | 660 | do { |
661 | int mt; /* migratetype of the to-be-freed page */ | ||
662 | |||
671 | page = list_entry(list->prev, struct page, lru); | 663 | page = list_entry(list->prev, struct page, lru); |
672 | /* must delete as __free_one_page list manipulates */ | 664 | /* must delete as __free_one_page list manipulates */ |
673 | list_del(&page->lru); | 665 | list_del(&page->lru); |
666 | mt = get_freepage_migratetype(page); | ||
674 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 667 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
675 | __free_one_page(page, zone, 0, page_private(page)); | 668 | __free_one_page(page, zone, 0, mt); |
676 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | 669 | trace_mm_page_pcpu_drain(page, 0, mt); |
670 | if (is_migrate_cma(mt)) | ||
671 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); | ||
677 | } while (--to_free && --batch_free && !list_empty(list)); | 672 | } while (--to_free && --batch_free && !list_empty(list)); |
678 | } | 673 | } |
679 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 674 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
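free_pcppages_bulk() above now trusts a migratetype stored in the page itself (set_freepage_migratetype()/get_freepage_migratetype()) so that the deferred, batched free path can credit per-type counters such as NR_FREE_CMA_PAGES without re-reading the pageblock bits. The toy sketch below shows that tag-at-queue, read-at-free accounting with simplified stand-ins for struct page and the zone counters.

#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_CMA };

struct toy_page { int freepage_migratetype; };

static long nr_free, nr_free_cma;

/* Record the type once, when the page is queued for freeing. */
static void set_freepage_migratetype(struct toy_page *p, int mt)
{
    p->freepage_migratetype = mt;
}

/* Later, the batched free path only consults the stored tag. */
static void free_one(struct toy_page *p)
{
    int mt = p->freepage_migratetype;

    nr_free++;
    if (mt == MIGRATE_CMA)
        nr_free_cma++;
}

int main(void)
{
    struct toy_page pages[3];

    set_freepage_migratetype(&pages[0], MIGRATE_MOVABLE);
    set_freepage_migratetype(&pages[1], MIGRATE_CMA);
    set_freepage_migratetype(&pages[2], MIGRATE_CMA);

    for (int i = 0; i < 3; i++)
        free_one(&pages[i]);

    printf("free=%ld free_cma=%ld\n", nr_free, nr_free_cma);
    return 0;
}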
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
688 | zone->pages_scanned = 0; | 683 | zone->pages_scanned = 0; |
689 | 684 | ||
690 | __free_one_page(page, zone, order, migratetype); | 685 | __free_one_page(page, zone, order, migratetype); |
691 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 686 | if (unlikely(migratetype != MIGRATE_ISOLATE)) |
687 | __mod_zone_freepage_state(zone, 1 << order, migratetype); | ||
692 | spin_unlock(&zone->lock); | 688 | spin_unlock(&zone->lock); |
693 | } | 689 | } |
694 | 690 | ||
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
721 | static void __free_pages_ok(struct page *page, unsigned int order) | 717 | static void __free_pages_ok(struct page *page, unsigned int order) |
722 | { | 718 | { |
723 | unsigned long flags; | 719 | unsigned long flags; |
724 | int wasMlocked = __TestClearPageMlocked(page); | 720 | int migratetype; |
725 | 721 | ||
726 | if (!free_pages_prepare(page, order)) | 722 | if (!free_pages_prepare(page, order)) |
727 | return; | 723 | return; |
728 | 724 | ||
729 | local_irq_save(flags); | 725 | local_irq_save(flags); |
730 | if (unlikely(wasMlocked)) | ||
731 | free_page_mlock(page); | ||
732 | __count_vm_events(PGFREE, 1 << order); | 726 | __count_vm_events(PGFREE, 1 << order); |
733 | free_one_page(page_zone(page), page, order, | 727 | migratetype = get_pageblock_migratetype(page); |
734 | get_pageblock_migratetype(page)); | 728 | set_freepage_migratetype(page, migratetype); |
729 | free_one_page(page_zone(page), page, order, migratetype); | ||
735 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
736 | } | 731 | } |
737 | 732 | ||
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page, | |||
811 | set_page_guard_flag(&page[size]); | 806 | set_page_guard_flag(&page[size]); |
812 | set_page_private(&page[size], high); | 807 | set_page_private(&page[size], high); |
813 | /* Guard pages are not available for any usage */ | 808 | /* Guard pages are not available for any usage */ |
814 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | 809 | __mod_zone_freepage_state(zone, -(1 << high), |
810 | migratetype); | ||
815 | continue; | 811 | continue; |
816 | } | 812 | } |
817 | #endif | 813 | #endif |
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
915 | * Note that start_page and end_pages are not aligned on a pageblock | 911 | * Note that start_page and end_pages are not aligned on a pageblock |
916 | * boundary. If alignment is required, use move_freepages_block() | 912 | * boundary. If alignment is required, use move_freepages_block() |
917 | */ | 913 | */ |
918 | static int move_freepages(struct zone *zone, | 914 | int move_freepages(struct zone *zone, |
919 | struct page *start_page, struct page *end_page, | 915 | struct page *start_page, struct page *end_page, |
920 | int migratetype) | 916 | int migratetype) |
921 | { | 917 | { |
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone, | |||
951 | order = page_order(page); | 947 | order = page_order(page); |
952 | list_move(&page->lru, | 948 | list_move(&page->lru, |
953 | &zone->free_area[order].free_list[migratetype]); | 949 | &zone->free_area[order].free_list[migratetype]); |
950 | set_freepage_migratetype(page, migratetype); | ||
954 | page += 1 << order; | 951 | page += 1 << order; |
955 | pages_moved += 1 << order; | 952 | pages_moved += 1 << order; |
956 | } | 953 | } |
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
1135 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1132 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) |
1136 | mt = migratetype; | 1133 | mt = migratetype; |
1137 | } | 1134 | } |
1138 | set_page_private(page, mt); | 1135 | set_freepage_migratetype(page, mt); |
1139 | list = &page->lru; | 1136 | list = &page->lru; |
1137 | if (is_migrate_cma(mt)) | ||
1138 | __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, | ||
1139 | -(1 << order)); | ||
1140 | } | 1140 | } |
1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1141 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
1142 | spin_unlock(&zone->lock); | 1142 | spin_unlock(&zone->lock); |
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1296 | struct per_cpu_pages *pcp; | 1296 | struct per_cpu_pages *pcp; |
1297 | unsigned long flags; | 1297 | unsigned long flags; |
1298 | int migratetype; | 1298 | int migratetype; |
1299 | int wasMlocked = __TestClearPageMlocked(page); | ||
1300 | 1299 | ||
1301 | if (!free_pages_prepare(page, 0)) | 1300 | if (!free_pages_prepare(page, 0)) |
1302 | return; | 1301 | return; |
1303 | 1302 | ||
1304 | migratetype = get_pageblock_migratetype(page); | 1303 | migratetype = get_pageblock_migratetype(page); |
1305 | set_page_private(page, migratetype); | 1304 | set_freepage_migratetype(page, migratetype); |
1306 | local_irq_save(flags); | 1305 | local_irq_save(flags); |
1307 | if (unlikely(wasMlocked)) | ||
1308 | free_page_mlock(page); | ||
1309 | __count_vm_event(PGFREE); | 1306 | __count_vm_event(PGFREE); |
1310 | 1307 | ||
1311 | /* | 1308 | /* |
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order) | |||
1380 | } | 1377 | } |
1381 | 1378 | ||
1382 | /* | 1379 | /* |
1383 | * Similar to split_page except the page is already free. As this is only | 1380 | * Similar to the split_page family of functions except that the page |
1384 | * being used for migration, the migratetype of the block also changes. | 1381 | * required at the given order and being isolated now to prevent races |
1385 | * As this is called with interrupts disabled, the caller is responsible | 1382 | * with parallel allocators |
1386 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1387 | * are enabled. | ||
1388 | * | ||
1389 | * Note: this is probably too low level an operation for use in drivers. | ||
1390 | * Please consult with lkml before using this in your driver. | ||
1391 | */ | 1383 | */ |
1392 | int split_free_page(struct page *page) | 1384 | int capture_free_page(struct page *page, int alloc_order, int migratetype) |
1393 | { | 1385 | { |
1394 | unsigned int order; | 1386 | unsigned int order; |
1395 | unsigned long watermark; | 1387 | unsigned long watermark; |
1396 | struct zone *zone; | 1388 | struct zone *zone; |
1389 | int mt; | ||
1397 | 1390 | ||
1398 | BUG_ON(!PageBuddy(page)); | 1391 | BUG_ON(!PageBuddy(page)); |
1399 | 1392 | ||
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page) | |||
1409 | list_del(&page->lru); | 1402 | list_del(&page->lru); |
1410 | zone->free_area[order].nr_free--; | 1403 | zone->free_area[order].nr_free--; |
1411 | rmv_page_order(page); | 1404 | rmv_page_order(page); |
1412 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | ||
1413 | 1405 | ||
1414 | /* Split into individual pages */ | 1406 | mt = get_pageblock_migratetype(page); |
1415 | set_page_refcounted(page); | 1407 | if (unlikely(mt != MIGRATE_ISOLATE)) |
1416 | split_page(page, order); | 1408 | __mod_zone_freepage_state(zone, -(1UL << order), mt); |
1417 | 1409 | ||
1410 | if (alloc_order != order) | ||
1411 | expand(zone, page, alloc_order, order, | ||
1412 | &zone->free_area[order], migratetype); | ||
1413 | |||
1414 | /* Set the pageblock if the captured page is at least a pageblock */ | ||
1418 | if (order >= pageblock_order - 1) { | 1415 | if (order >= pageblock_order - 1) { |
1419 | struct page *endpage = page + (1 << order) - 1; | 1416 | struct page *endpage = page + (1 << order) - 1; |
1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1417 | for (; page < endpage; page += pageblock_nr_pages) { |
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page) | |||
1425 | } | 1422 | } |
1426 | } | 1423 | } |
1427 | 1424 | ||
1428 | return 1 << order; | 1425 | return 1UL << order; |
1426 | } | ||
1427 | |||
1428 | /* | ||
1429 | * Similar to split_page except the page is already free. As this is only | ||
1430 | * being used for migration, the migratetype of the block also changes. | ||
1431 | * As this is called with interrupts disabled, the caller is responsible | ||
1432 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
1433 | * are enabled. | ||
1434 | * | ||
1435 | * Note: this is probably too low level an operation for use in drivers. | ||
1436 | * Please consult with lkml before using this in your driver. | ||
1437 | */ | ||
1438 | int split_free_page(struct page *page) | ||
1439 | { | ||
1440 | unsigned int order; | ||
1441 | int nr_pages; | ||
1442 | |||
1443 | BUG_ON(!PageBuddy(page)); | ||
1444 | order = page_order(page); | ||
1445 | |||
1446 | nr_pages = capture_free_page(page, order, 0); | ||
1447 | if (!nr_pages) | ||
1448 | return 0; | ||
1449 | |||
1450 | /* Split into individual pages */ | ||
1451 | set_page_refcounted(page); | ||
1452 | split_page(page, order); | ||
1453 | return nr_pages; | ||
1429 | } | 1454 | } |
1430 | 1455 | ||
1431 | /* | 1456 | /* |
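capture_free_page() above generalizes split_free_page(): it pulls a free buddy block out of the allocator and, when the captured block is larger than the order actually wanted, hands the unused halves back via expand(). Below is a toy sketch of that remainder-splitting step only, using plain arrays as free lists and invented names; it is not the buddy allocator itself.

#include <stdio.h>

#define TOY_MAX_ORDER 11

static unsigned long freelist[TOY_MAX_ORDER][64];  /* pfns per order */
static int nfree[TOY_MAX_ORDER];

/* Split a captured order-`high` block down to order-`low`, returning
 * each unused upper half to the free list of its (shrinking) order. */
static void toy_expand(unsigned long pfn, int low, int high)
{
    unsigned long size = 1UL << high;

    while (high > low) {
        high--;
        size >>= 1;
        freelist[high][nfree[high]++] = pfn + size;
        printf("returned block: pfn %lu, order %d\n", pfn + size, high);
    }
}

int main(void)
{
    /* capture an order-4 block at pfn 128 when only order-1 is needed */
    toy_expand(128, 1, 4);
    printf("allocated block: pfn 128, order 1\n");
    return 0;
}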
@@ -1484,7 +1509,8 @@ again: | |||
1484 | spin_unlock(&zone->lock); | 1509 | spin_unlock(&zone->lock); |
1485 | if (!page) | 1510 | if (!page) |
1486 | goto failed; | 1511 | goto failed; |
1487 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | 1512 | __mod_zone_freepage_state(zone, -(1 << order), |
1513 | get_pageblock_migratetype(page)); | ||
1488 | } | 1514 | } |
1489 | 1515 | ||
1490 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1516 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
@@ -1501,19 +1527,6 @@ failed: | |||
1501 | return NULL; | 1527 | return NULL; |
1502 | } | 1528 | } |
1503 | 1529 | ||
1504 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | ||
1505 | #define ALLOC_WMARK_MIN WMARK_MIN | ||
1506 | #define ALLOC_WMARK_LOW WMARK_LOW | ||
1507 | #define ALLOC_WMARK_HIGH WMARK_HIGH | ||
1508 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
1509 | |||
1510 | /* Mask to get the watermark bits */ | ||
1511 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
1512 | |||
1513 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | ||
1514 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | ||
1515 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | ||
1516 | |||
1517 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1530 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1518 | 1531 | ||
1519 | static struct { | 1532 | static struct { |
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1608 | min -= min / 2; | 1621 | min -= min / 2; |
1609 | if (alloc_flags & ALLOC_HARDER) | 1622 | if (alloc_flags & ALLOC_HARDER) |
1610 | min -= min / 4; | 1623 | min -= min / 4; |
1611 | 1624 | #ifdef CONFIG_CMA | |
1625 | /* If allocation can't use CMA areas don't use free CMA pages */ | ||
1626 | if (!(alloc_flags & ALLOC_CMA)) | ||
1627 | free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); | ||
1628 | #endif | ||
1612 | if (free_pages <= min + lowmem_reserve) | 1629 | if (free_pages <= min + lowmem_reserve) |
1613 | return false; | 1630 | return false; |
1614 | for (o = 0; o < order; o++) { | 1631 | for (o = 0; o < order; o++) { |
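The __zone_watermark_ok() hunk above subtracts the zone's free CMA pages from the free count whenever the caller did not pass ALLOC_CMA, so an allocation that cannot live in CMA pageblocks is not admitted against memory it can never take. A small self-contained version of that check follows; ALLOC_HIGH's value comes from the defines removed earlier in this diff, while the ALLOC_CMA bit value is an assumption for the example.

#include <stdbool.h>
#include <stdio.h>

#define ALLOC_HIGH 0x20   /* __GFP_HIGH set */
#define ALLOC_CMA  0x80   /* assumed bit value for this sketch */

static bool toy_watermark_ok(long free_pages, long free_cma, long min,
                             long lowmem_reserve, int alloc_flags)
{
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (!(alloc_flags & ALLOC_CMA))
        free_pages -= free_cma;          /* CMA is off limits here */

    return free_pages > min + lowmem_reserve;
}

int main(void)
{
    /* 1000 pages free, but 600 of them sit in CMA pageblocks */
    printf("movable (may use CMA): %d\n",
           toy_watermark_ok(1000, 600, 500, 100, ALLOC_CMA));
    printf("unmovable (no CMA):    %d\n",
           toy_watermark_ok(1000, 600, 500, 100, 0));
    return 0;
}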
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) | |||
1782 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1799 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1783 | } | 1800 | } |
1784 | 1801 | ||
1802 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1803 | { | ||
1804 | return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); | ||
1805 | } | ||
1806 | |||
1807 | static void __paginginit init_zone_allows_reclaim(int nid) | ||
1808 | { | ||
1809 | int i; | ||
1810 | |||
1811 | for_each_online_node(i) | ||
1812 | if (node_distance(nid, i) <= RECLAIM_DISTANCE) | ||
1813 | node_set(i, NODE_DATA(nid)->reclaim_nodes); | ||
1814 | else | ||
1815 | zone_reclaim_mode = 1; | ||
1816 | } | ||
1817 | |||
1785 | #else /* CONFIG_NUMA */ | 1818 | #else /* CONFIG_NUMA */ |
1786 | 1819 | ||
1787 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1820 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1802 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1835 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1803 | { | 1836 | { |
1804 | } | 1837 | } |
1838 | |||
1839 | static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) | ||
1840 | { | ||
1841 | return true; | ||
1842 | } | ||
1843 | |||
1844 | static inline void init_zone_allows_reclaim(int nid) | ||
1845 | { | ||
1846 | } | ||
1805 | #endif /* CONFIG_NUMA */ | 1847 | #endif /* CONFIG_NUMA */ |
1806 | 1848 | ||
1807 | /* | 1849 | /* |
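zone_allows_reclaim()/init_zone_allows_reclaim() above precompute, per node, the set of nodes within RECLAIM_DISTANCE (stored in the pgdat's reclaim_nodes mask, per the hunk); the allocator then only attempts zone reclaim against zones on those nearby nodes. The sketch below uses a made-up distance table, node count, and threshold to show the same precompute-and-test shape.

#include <stdio.h>

#define TOY_MAX_NODES        4
#define TOY_RECLAIM_DISTANCE 30   /* example threshold */

static const int node_distance[TOY_MAX_NODES][TOY_MAX_NODES] = {
    { 10, 20, 40, 40 },
    { 20, 10, 40, 40 },
    { 40, 40, 10, 20 },
    { 40, 40, 20, 10 },
};

static unsigned int reclaim_nodes[TOY_MAX_NODES];   /* bitmask per node */

/* Once at init time: remember which nodes are close enough to be
 * worth reclaiming against from this node's allocations. */
static void toy_init_zone_allows_reclaim(int nid)
{
    for (int i = 0; i < TOY_MAX_NODES; i++)
        if (node_distance[nid][i] <= TOY_RECLAIM_DISTANCE)
            reclaim_nodes[nid] |= 1u << i;
}

static int toy_zone_allows_reclaim(int local_nid, int target_nid)
{
    return (reclaim_nodes[local_nid] >> target_nid) & 1u;
}

int main(void)
{
    for (int n = 0; n < TOY_MAX_NODES; n++)
        toy_init_zone_allows_reclaim(n);

    printf("node0 -> node1 reclaim allowed: %d\n", toy_zone_allows_reclaim(0, 1));
    printf("node0 -> node3 reclaim allowed: %d\n", toy_zone_allows_reclaim(0, 3));
    return 0;
}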
@@ -1886,7 +1928,8 @@ zonelist_scan: | |||
1886 | did_zlc_setup = 1; | 1928 | did_zlc_setup = 1; |
1887 | } | 1929 | } |
1888 | 1930 | ||
1889 | if (zone_reclaim_mode == 0) | 1931 | if (zone_reclaim_mode == 0 || |
1932 | !zone_allows_reclaim(preferred_zone, zone)) | ||
1890 | goto this_zone_full; | 1933 | goto this_zone_full; |
1891 | 1934 | ||
1892 | /* | 1935 | /* |
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2105 | bool *contended_compaction, bool *deferred_compaction, | 2148 | bool *contended_compaction, bool *deferred_compaction, |
2106 | unsigned long *did_some_progress) | 2149 | unsigned long *did_some_progress) |
2107 | { | 2150 | { |
2108 | struct page *page; | 2151 | struct page *page = NULL; |
2109 | 2152 | ||
2110 | if (!order) | 2153 | if (!order) |
2111 | return NULL; | 2154 | return NULL; |
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2118 | current->flags |= PF_MEMALLOC; | 2161 | current->flags |= PF_MEMALLOC; |
2119 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2162 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2120 | nodemask, sync_migration, | 2163 | nodemask, sync_migration, |
2121 | contended_compaction); | 2164 | contended_compaction, &page); |
2122 | current->flags &= ~PF_MEMALLOC; | 2165 | current->flags &= ~PF_MEMALLOC; |
2123 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2124 | 2166 | ||
2167 | /* If compaction captured a page, prep and use it */ | ||
2168 | if (page) { | ||
2169 | prep_new_page(page, order, gfp_mask); | ||
2170 | goto got_page; | ||
2171 | } | ||
2172 | |||
2173 | if (*did_some_progress != COMPACT_SKIPPED) { | ||
2125 | /* Page migration frees to the PCP lists but we want merging */ | 2174 | /* Page migration frees to the PCP lists but we want merging */ |
2126 | drain_pages(get_cpu()); | 2175 | drain_pages(get_cpu()); |
2127 | put_cpu(); | 2176 | put_cpu(); |
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2131 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2180 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2132 | preferred_zone, migratetype); | 2181 | preferred_zone, migratetype); |
2133 | if (page) { | 2182 | if (page) { |
2183 | got_page: | ||
2184 | preferred_zone->compact_blockskip_flush = false; | ||
2134 | preferred_zone->compact_considered = 0; | 2185 | preferred_zone->compact_considered = 0; |
2135 | preferred_zone->compact_defer_shift = 0; | 2186 | preferred_zone->compact_defer_shift = 0; |
2136 | if (order >= preferred_zone->compact_order_failed) | 2187 | if (order >= preferred_zone->compact_order_failed) |
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2315 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2366 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
2316 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2367 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2317 | } | 2368 | } |
2318 | 2369 | #ifdef CONFIG_CMA | |
2370 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2371 | alloc_flags |= ALLOC_CMA; | ||
2372 | #endif | ||
2319 | return alloc_flags; | 2373 | return alloc_flags; |
2320 | } | 2374 | } |
2321 | 2375 | ||
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2362 | goto nopage; | 2416 | goto nopage; |
2363 | 2417 | ||
2364 | restart: | 2418 | restart: |
2365 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2419 | wake_all_kswapd(order, zonelist, high_zoneidx, |
2366 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2420 | zone_idx(preferred_zone)); |
2367 | zone_idx(preferred_zone)); | ||
2368 | 2421 | ||
2369 | /* | 2422 | /* |
2370 | * OK, we're below the kswapd watermark and have kicked background | 2423 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2441,7 +2494,7 @@ rebalance: | |||
2441 | * system then fail the allocation instead of entering direct reclaim. | 2494 | * system then fail the allocation instead of entering direct reclaim. |
2442 | */ | 2495 | */ |
2443 | if ((deferred_compaction || contended_compaction) && | 2496 | if ((deferred_compaction || contended_compaction) && |
2444 | (gfp_mask & __GFP_NO_KSWAPD)) | 2497 | (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) |
2445 | goto nopage; | 2498 | goto nopage; |
2446 | 2499 | ||
2447 | /* Try direct reclaim and then allocating */ | 2500 | /* Try direct reclaim and then allocating */ |
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2541 | struct page *page = NULL; | 2594 | struct page *page = NULL; |
2542 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2595 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2543 | unsigned int cpuset_mems_cookie; | 2596 | unsigned int cpuset_mems_cookie; |
2597 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | ||
2544 | 2598 | ||
2545 | gfp_mask &= gfp_allowed_mask; | 2599 | gfp_mask &= gfp_allowed_mask; |
2546 | 2600 | ||
@@ -2569,9 +2623,13 @@ retry_cpuset: | |||
2569 | if (!preferred_zone) | 2623 | if (!preferred_zone) |
2570 | goto out; | 2624 | goto out; |
2571 | 2625 | ||
2626 | #ifdef CONFIG_CMA | ||
2627 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2628 | alloc_flags |= ALLOC_CMA; | ||
2629 | #endif | ||
2572 | /* First allocation attempt */ | 2630 | /* First allocation attempt */ |
2573 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2631 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2574 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | 2632 | zonelist, high_zoneidx, alloc_flags, |
2575 | preferred_zone, migratetype); | 2633 | preferred_zone, migratetype); |
2576 | if (unlikely(!page)) | 2634 | if (unlikely(!page)) |
2577 | page = __alloc_pages_slowpath(gfp_mask, order, | 2635 | page = __alloc_pages_slowpath(gfp_mask, order, |
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter) | |||
2852 | " unevictable:%lu" | 2910 | " unevictable:%lu" |
2853 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2911 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2854 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 2912 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2855 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | 2913 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
2914 | " free_cma:%lu\n", | ||
2856 | global_page_state(NR_ACTIVE_ANON), | 2915 | global_page_state(NR_ACTIVE_ANON), |
2857 | global_page_state(NR_INACTIVE_ANON), | 2916 | global_page_state(NR_INACTIVE_ANON), |
2858 | global_page_state(NR_ISOLATED_ANON), | 2917 | global_page_state(NR_ISOLATED_ANON), |
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter) | |||
2869 | global_page_state(NR_FILE_MAPPED), | 2928 | global_page_state(NR_FILE_MAPPED), |
2870 | global_page_state(NR_SHMEM), | 2929 | global_page_state(NR_SHMEM), |
2871 | global_page_state(NR_PAGETABLE), | 2930 | global_page_state(NR_PAGETABLE), |
2872 | global_page_state(NR_BOUNCE)); | 2931 | global_page_state(NR_BOUNCE), |
2932 | global_page_state(NR_FREE_CMA_PAGES)); | ||
2873 | 2933 | ||
2874 | for_each_populated_zone(zone) { | 2934 | for_each_populated_zone(zone) { |
2875 | int i; | 2935 | int i; |
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter) | |||
2901 | " pagetables:%lukB" | 2961 | " pagetables:%lukB" |
2902 | " unstable:%lukB" | 2962 | " unstable:%lukB" |
2903 | " bounce:%lukB" | 2963 | " bounce:%lukB" |
2964 | " free_cma:%lukB" | ||
2904 | " writeback_tmp:%lukB" | 2965 | " writeback_tmp:%lukB" |
2905 | " pages_scanned:%lu" | 2966 | " pages_scanned:%lu" |
2906 | " all_unreclaimable? %s" | 2967 | " all_unreclaimable? %s" |
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter) | |||
2930 | K(zone_page_state(zone, NR_PAGETABLE)), | 2991 | K(zone_page_state(zone, NR_PAGETABLE)), |
2931 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 2992 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
2932 | K(zone_page_state(zone, NR_BOUNCE)), | 2993 | K(zone_page_state(zone, NR_BOUNCE)), |
2994 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | ||
2933 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2995 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2934 | zone->pages_scanned, | 2996 | zone->pages_scanned, |
2935 | (zone->all_unreclaimable ? "yes" : "no") | 2997 | (zone->all_unreclaimable ? "yes" : "no") |
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat) | |||
3328 | j = 0; | 3390 | j = 0; |
3329 | 3391 | ||
3330 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 3392 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
3331 | int distance = node_distance(local_node, node); | ||
3332 | |||
3333 | /* | ||
3334 | * If another node is sufficiently far away then it is better | ||
3335 | * to reclaim pages in a zone before going off node. | ||
3336 | */ | ||
3337 | if (distance > RECLAIM_DISTANCE) | ||
3338 | zone_reclaim_mode = 1; | ||
3339 | |||
3340 | /* | 3393 | /* |
3341 | * We don't want to pressure a particular node. | 3394 | * We don't want to pressure a particular node. |
3342 | * So adding penalty to the first node in same | 3395 | * So adding penalty to the first node in same |
3343 | * distance group to make it round-robin. | 3396 | * distance group to make it round-robin. |
3344 | */ | 3397 | */ |
3345 | if (distance != node_distance(local_node, prev_node)) | 3398 | if (node_distance(local_node, node) != |
3399 | node_distance(local_node, prev_node)) | ||
3346 | node_load[node] = load; | 3400 | node_load[node] = load; |
3347 | 3401 | ||
3348 | prev_node = node; | 3402 | prev_node = node; |
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4438 | 4492 | ||
4439 | zone->spanned_pages = size; | 4493 | zone->spanned_pages = size; |
4440 | zone->present_pages = realsize; | 4494 | zone->present_pages = realsize; |
4441 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4442 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4443 | zone->spanned_pages; | ||
4444 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4445 | #endif | ||
4446 | #ifdef CONFIG_NUMA | 4495 | #ifdef CONFIG_NUMA |
4447 | zone->node = nid; | 4496 | zone->node = nid; |
4448 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4497 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4521 | 4570 | ||
4522 | pgdat->node_id = nid; | 4571 | pgdat->node_id = nid; |
4523 | pgdat->node_start_pfn = node_start_pfn; | 4572 | pgdat->node_start_pfn = node_start_pfn; |
4573 | init_zone_allows_reclaim(nid); | ||
4524 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4574 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
4525 | 4575 | ||
4526 | alloc_node_mem_map(pgdat); | 4576 | alloc_node_mem_map(pgdat); |
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4879 | zone_movable_pfn[i] << PAGE_SHIFT); | 4929 | zone_movable_pfn[i] << PAGE_SHIFT); |
4880 | } | 4930 | } |
4881 | 4931 | ||
4882 | /* Print out the early_node_map[] */ | 4932 | /* Print out the early node map */ |
4883 | printk("Early memory node ranges\n"); | 4933 | printk("Early memory node ranges\n"); |
4884 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4934 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4885 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 4935 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn) | |||
5619 | pageblock_nr_pages)); | 5669 | pageblock_nr_pages)); |
5620 | } | 5670 | } |
5621 | 5671 | ||
5622 | static struct page * | ||
5623 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | ||
5624 | int **resultp) | ||
5625 | { | ||
5626 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
5627 | |||
5628 | if (PageHighMem(page)) | ||
5629 | gfp_mask |= __GFP_HIGHMEM; | ||
5630 | |||
5631 | return alloc_page(gfp_mask); | ||
5632 | } | ||
5633 | |||
5634 | /* [start, end) must belong to a single zone. */ | 5672 | /* [start, end) must belong to a single zone. */ |
5635 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | 5673 | static int __alloc_contig_migrate_range(struct compact_control *cc, |
5674 | unsigned long start, unsigned long end) | ||
5636 | { | 5675 | { |
5637 | /* This function is based on compact_zone() from compaction.c. */ | 5676 | /* This function is based on compact_zone() from compaction.c. */ |
5638 | 5677 | unsigned long nr_reclaimed; | |
5639 | unsigned long pfn = start; | 5678 | unsigned long pfn = start; |
5640 | unsigned int tries = 0; | 5679 | unsigned int tries = 0; |
5641 | int ret = 0; | 5680 | int ret = 0; |
5642 | 5681 | ||
5643 | struct compact_control cc = { | ||
5644 | .nr_migratepages = 0, | ||
5645 | .order = -1, | ||
5646 | .zone = page_zone(pfn_to_page(start)), | ||
5647 | .sync = true, | ||
5648 | }; | ||
5649 | INIT_LIST_HEAD(&cc.migratepages); | ||
5650 | |||
5651 | migrate_prep_local(); | 5682 | migrate_prep_local(); |
5652 | 5683 | ||
5653 | while (pfn < end || !list_empty(&cc.migratepages)) { | 5684 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5654 | if (fatal_signal_pending(current)) { | 5685 | if (fatal_signal_pending(current)) { |
5655 | ret = -EINTR; | 5686 | ret = -EINTR; |
5656 | break; | 5687 | break; |
5657 | } | 5688 | } |
5658 | 5689 | ||
5659 | if (list_empty(&cc.migratepages)) { | 5690 | if (list_empty(&cc->migratepages)) { |
5660 | cc.nr_migratepages = 0; | 5691 | cc->nr_migratepages = 0; |
5661 | pfn = isolate_migratepages_range(cc.zone, &cc, | 5692 | pfn = isolate_migratepages_range(cc->zone, cc, |
5662 | pfn, end); | 5693 | pfn, end, true); |
5663 | if (!pfn) { | 5694 | if (!pfn) { |
5664 | ret = -EINTR; | 5695 | ret = -EINTR; |
5665 | break; | 5696 | break; |
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | |||
5670 | break; | 5701 | break; |
5671 | } | 5702 | } |
5672 | 5703 | ||
5673 | ret = migrate_pages(&cc.migratepages, | 5704 | nr_reclaimed = reclaim_clean_pages_from_list(cc->zone, |
5674 | __alloc_contig_migrate_alloc, | 5705 | &cc->migratepages); |
5706 | cc->nr_migratepages -= nr_reclaimed; | ||
5707 | |||
5708 | ret = migrate_pages(&cc->migratepages, | ||
5709 | alloc_migrate_target, | ||
5675 | 0, false, MIGRATE_SYNC); | 5710 | 0, false, MIGRATE_SYNC); |
5676 | } | 5711 | } |
5677 | 5712 | ||
5678 | putback_lru_pages(&cc.migratepages); | 5713 | putback_lru_pages(&cc->migratepages); |
5679 | return ret > 0 ? 0 : ret; | 5714 | return ret > 0 ? 0 : ret; |
5680 | } | 5715 | } |
5681 | 5716 | ||
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5754 | unsigned long outer_start, outer_end; | 5789 | unsigned long outer_start, outer_end; |
5755 | int ret = 0, order; | 5790 | int ret = 0, order; |
5756 | 5791 | ||
5792 | struct compact_control cc = { | ||
5793 | .nr_migratepages = 0, | ||
5794 | .order = -1, | ||
5795 | .zone = page_zone(pfn_to_page(start)), | ||
5796 | .sync = true, | ||
5797 | .ignore_skip_hint = true, | ||
5798 | }; | ||
5799 | INIT_LIST_HEAD(&cc.migratepages); | ||
5800 | |||
5757 | /* | 5801 | /* |
5758 | * What we do here is we mark all pageblocks in range as | 5802 | * What we do here is we mark all pageblocks in range as |
5759 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | 5803 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
@@ -5781,9 +5825,9 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5781 | ret = start_isolate_page_range(pfn_max_align_down(start), | 5825 | ret = start_isolate_page_range(pfn_max_align_down(start), |
5782 | pfn_max_align_up(end), migratetype); | 5826 | pfn_max_align_up(end), migratetype); |
5783 | if (ret) | 5827 | if (ret) |
5784 | goto done; | 5828 | return ret; |
5785 | 5829 | ||
5786 | ret = __alloc_contig_migrate_range(start, end); | 5830 | ret = __alloc_contig_migrate_range(&cc, start, end); |
5787 | if (ret) | 5831 | if (ret) |
5788 | goto done; | 5832 | goto done; |
5789 | 5833 | ||
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
5832 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | 5876 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); |
5833 | 5877 | ||
5834 | /* Grab isolated pages from freelists. */ | 5878 | /* Grab isolated pages from freelists. */ |
5835 | outer_end = isolate_freepages_range(outer_start, end); | 5879 | outer_end = isolate_freepages_range(&cc, outer_start, end); |
5836 | if (!outer_end) { | 5880 | if (!outer_end) { |
5837 | ret = -EBUSY; | 5881 | ret = -EBUSY; |
5838 | goto done; | 5882 | goto done; |
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data) | |||
5874 | local_irq_save(flags); | 5918 | local_irq_save(flags); |
5875 | if (pcp->count > 0) | 5919 | if (pcp->count > 0) |
5876 | free_pcppages_bulk(zone, pcp->count, pcp); | 5920 | free_pcppages_bulk(zone, pcp->count, pcp); |
5921 | drain_zonestat(zone, pset); | ||
5877 | setup_pageset(pset, batch); | 5922 | setup_pageset(pset, batch); |
5878 | local_irq_restore(flags); | 5923 | local_irq_restore(flags); |
5879 | } | 5924 | } |
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone) | |||
5890 | void zone_pcp_reset(struct zone *zone) | 5935 | void zone_pcp_reset(struct zone *zone) |
5891 | { | 5936 | { |
5892 | unsigned long flags; | 5937 | unsigned long flags; |
5938 | int cpu; | ||
5939 | struct per_cpu_pageset *pset; | ||
5893 | 5940 | ||
5894 | /* avoid races with drain_pages() */ | 5941 | /* avoid races with drain_pages() */ |
5895 | local_irq_save(flags); | 5942 | local_irq_save(flags); |
5896 | if (zone->pageset != &boot_pageset) { | 5943 | if (zone->pageset != &boot_pageset) { |
5944 | for_each_online_cpu(cpu) { | ||
5945 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5946 | drain_zonestat(zone, pset); | ||
5947 | } | ||
5897 | free_percpu(zone->pageset); | 5948 | free_percpu(zone->pageset); |
5898 | zone->pageset = &boot_pageset; | 5949 | zone->pageset = &boot_pageset; |
5899 | } | 5950 | } |
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page) | |||
6047 | dump_page_flags(page->flags); | 6098 | dump_page_flags(page->flags); |
6048 | mem_cgroup_print_bad_page(page); | 6099 | mem_cgroup_print_bad_page(page); |
6049 | } | 6100 | } |
6101 | |||
6102 | /* reset zone->present_pages */ | ||
6103 | void reset_zone_present_pages(void) | ||
6104 | { | ||
6105 | struct zone *z; | ||
6106 | int i, nid; | ||
6107 | |||
6108 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
6109 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6110 | z = NODE_DATA(nid)->node_zones + i; | ||
6111 | z->present_pages = 0; | ||
6112 | } | ||
6113 | } | ||
6114 | } | ||
6115 | |||
6116 | /* calculate zone's present pages in buddy system */ | ||
6117 | void fixup_zone_present_pages(int nid, unsigned long start_pfn, | ||
6118 | unsigned long end_pfn) | ||
6119 | { | ||
6120 | struct zone *z; | ||
6121 | unsigned long zone_start_pfn, zone_end_pfn; | ||
6122 | int i; | ||
6123 | |||
6124 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6125 | z = NODE_DATA(nid)->node_zones + i; | ||
6126 | zone_start_pfn = z->zone_start_pfn; | ||
6127 | zone_end_pfn = zone_start_pfn + z->spanned_pages; | ||
6128 | |||
6129 | /* if the two regions intersect */ | ||
6130 | if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)) | ||
6131 | z->present_pages += min(end_pfn, zone_end_pfn) - | ||
6132 | max(start_pfn, zone_start_pfn); | ||
6133 | } | ||
6134 | } | ||
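A rough illustration of the fixup_zone_present_pages() logic added above: each zone's present_pages is rebuilt by accumulating the intersection of the zone's pfn span with every registered memory range. The sketch below reduces that intersection arithmetic to self-contained userspace C; the zone and pfn types become plain integers, so this is only a model, not the kernel implementation.

#include <stdio.h>

/* Size of the overlap between a zone's pfn span and a memory range. */
static unsigned long range_intersection(unsigned long zone_start_pfn,
					unsigned long zone_end_pfn,
					unsigned long start_pfn,
					unsigned long end_pfn)
{
	/* Disjoint ranges contribute nothing, as in the kernel check. */
	if (zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn)
		return 0;

	/* Otherwise count the overlapping pfns: min(ends) - max(starts). */
	return (end_pfn < zone_end_pfn ? end_pfn : zone_end_pfn) -
	       (start_pfn > zone_start_pfn ? start_pfn : zone_start_pfn);
}

int main(void)
{
	/* Zone spans pfns [100, 200); the reported range is [150, 400). */
	printf("%lu\n", range_intersection(100, 200, 150, 400)); /* prints 50 */
	return 0;
}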
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 247d1f175739..f2f5b4818e94 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page) | |||
76 | 76 | ||
77 | out: | 77 | out: |
78 | if (!ret) { | 78 | if (!ret) { |
79 | unsigned long nr_pages; | ||
80 | int migratetype = get_pageblock_migratetype(page); | ||
81 | |||
79 | set_pageblock_isolate(page); | 82 | set_pageblock_isolate(page); |
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 83 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); |
84 | |||
85 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); | ||
81 | } | 86 | } |
82 | 87 | ||
83 | spin_unlock_irqrestore(&zone->lock, flags); | 88 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -89,12 +94,14 @@ out: | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | 94 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
90 | { | 95 | { |
91 | struct zone *zone; | 96 | struct zone *zone; |
92 | unsigned long flags; | 97 | unsigned long flags, nr_pages; |
98 | |||
93 | zone = page_zone(page); | 99 | zone = page_zone(page); |
94 | spin_lock_irqsave(&zone->lock, flags); | 100 | spin_lock_irqsave(&zone->lock, flags); |
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 101 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
96 | goto out; | 102 | goto out; |
97 | move_freepages_block(zone, page, migratetype); | 103 | nr_pages = move_freepages_block(zone, page, migratetype); |
104 | __mod_zone_freepage_state(zone, nr_pages, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | 105 | restore_pageblock_isolate(page, migratetype); |
99 | out: | 106 | out: |
100 | spin_unlock_irqrestore(&zone->lock, flags); | 107 | spin_unlock_irqrestore(&zone->lock, flags); |
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | |||
193 | continue; | 200 | continue; |
194 | } | 201 | } |
195 | page = pfn_to_page(pfn); | 202 | page = pfn_to_page(pfn); |
196 | if (PageBuddy(page)) | 203 | if (PageBuddy(page)) { |
204 | /* | ||
205 | * If race between isolatation and allocation happens, | ||
206 | * some free pages could be in MIGRATE_MOVABLE list | ||
207 | * although pageblock's migratation type of the page | ||
208 | * is MIGRATE_ISOLATE. Catch it and move the page into | ||
209 | * MIGRATE_ISOLATE list. | ||
210 | */ | ||
211 | if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) { | ||
212 | struct page *end_page; | ||
213 | |||
214 | end_page = page + (1 << page_order(page)) - 1; | ||
215 | move_freepages(page_zone(page), page, end_page, | ||
216 | MIGRATE_ISOLATE); | ||
217 | } | ||
197 | pfn += 1 << page_order(page); | 218 | pfn += 1 << page_order(page); |
219 | } | ||
198 | else if (page_count(page) == 0 && | 220 | else if (page_count(page) == 0 && |
199 | page_private(page) == MIGRATE_ISOLATE) | 221 | get_freepage_migratetype(page) == MIGRATE_ISOLATE) |
200 | pfn += 1; | 222 | pfn += 1; |
201 | else | 223 | else |
202 | break; | 224 | break; |
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
233 | spin_unlock_irqrestore(&zone->lock, flags); | 255 | spin_unlock_irqrestore(&zone->lock, flags); |
234 | return ret ? 0 : -EBUSY; | 256 | return ret ? 0 : -EBUSY; |
235 | } | 257 | } |
258 | |||
259 | struct page *alloc_migrate_target(struct page *page, unsigned long private, | ||
260 | int **resultp) | ||
261 | { | ||
262 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | ||
263 | |||
264 | if (PageHighMem(page)) | ||
265 | gfp_mask |= __GFP_HIGHMEM; | ||
266 | |||
267 | return alloc_page(gfp_mask); | ||
268 | } | ||
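A simplified model of the accounting introduced in set_migratetype_isolate() and unset_migratetype_isolate() above: the free pages that move_freepages_block() shifts onto the isolate list are subtracted from the zone's free-page counters, and added back when the block is unisolated, so watermark checks stop counting pages an allocator cannot actually take. The counter below is a stand-in for __mod_zone_freepage_state(); this is an illustration, not the kernel code.

#include <stdio.h>

static long nr_free = 512;	/* stand-in for the zone's free-page counter */

static long isolate_pageblock(long nr_pages_moved)
{
	/* Pages moved to the isolate list stop counting as free. */
	nr_free -= nr_pages_moved;	/* __mod_zone_freepage_state(zone, -nr, mt) */
	return nr_pages_moved;
}

static void unisolate_pageblock(long nr_pages_moved)
{
	/* The same pages count as free again once unisolated. */
	nr_free += nr_pages_moved;	/* __mod_zone_freepage_state(zone, nr, mt) */
}

int main(void)
{
	long moved = isolate_pageblock(128);

	printf("free while isolated: %ld\n", nr_free);	/* 384 */
	unisolate_pageblock(moved);
	printf("free after unisolation: %ld\n", nr_free);	/* 512 */
	return 0;
}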
diff --git a/mm/percpu.c b/mm/percpu.c index bb4be7435ce3..ddc5efb9c5bb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1370,7 +1370,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1370 | 1370 | ||
1371 | #ifdef CONFIG_SMP | 1371 | #ifdef CONFIG_SMP |
1372 | 1372 | ||
1373 | const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { | 1373 | const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { |
1374 | [PCPU_FC_AUTO] = "auto", | 1374 | [PCPU_FC_AUTO] = "auto", |
1375 | [PCPU_FC_EMBED] = "embed", | 1375 | [PCPU_FC_EMBED] = "embed", |
1376 | [PCPU_FC_PAGE] = "page", | 1376 | [PCPU_FC_PAGE] = "page", |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 74c0ddaa6fa0..e642627da6b7 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | |||
120 | } | 120 | } |
121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 121 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
122 | #endif | 122 | #endif |
123 | |||
124 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | ||
125 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
126 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) | ||
127 | { | ||
128 | assert_spin_locked(&mm->page_table_lock); | ||
129 | |||
130 | /* FIFO */ | ||
131 | if (!mm->pmd_huge_pte) | ||
132 | INIT_LIST_HEAD(&pgtable->lru); | ||
133 | else | ||
134 | list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); | ||
135 | mm->pmd_huge_pte = pgtable; | ||
136 | } | ||
137 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
138 | #endif | ||
139 | |||
140 | #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW | ||
141 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
142 | /* no "address" argument so destroys page coloring of some arch */ | ||
143 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) | ||
144 | { | ||
145 | pgtable_t pgtable; | ||
146 | |||
147 | assert_spin_locked(&mm->page_table_lock); | ||
148 | |||
149 | /* FIFO */ | ||
150 | pgtable = mm->pmd_huge_pte; | ||
151 | if (list_empty(&pgtable->lru)) | ||
152 | mm->pmd_huge_pte = NULL; | ||
153 | else { | ||
154 | mm->pmd_huge_pte = list_entry(pgtable->lru.next, | ||
155 | struct page, lru); | ||
156 | list_del(&pgtable->lru); | ||
157 | } | ||
158 | return pgtable; | ||
159 | } | ||
160 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
161 | #endif | ||
162 | |||
163 | #ifndef __HAVE_ARCH_PMDP_INVALIDATE | ||
164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
165 | void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | ||
166 | pmd_t *pmdp) | ||
167 | { | ||
168 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp)); | ||
169 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
170 | } | ||
171 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
172 | #endif | ||
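The generic pgtable_trans_huge_deposit() and pgtable_trans_huge_withdraw() helpers added above stash a preallocated page table per mm (threaded through mm->pmd_huge_pte and the pages' lru links) so it can be handed back when the huge pmd is later split or zapped. The userspace model below sketches only that deposit/withdraw protocol; the real list handling, ordering and locking are deliberately simplified, and every name here is illustrative.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct pgtable {
	int id;
	struct pgtable *next;
};

struct mm {
	struct pgtable *pmd_huge_pte;	/* stash of deposited page tables */
};

/* Model of pgtable_trans_huge_deposit(): park a preallocated table. */
static void pgtable_deposit(struct mm *mm, struct pgtable *pt)
{
	pt->next = mm->pmd_huge_pte;
	mm->pmd_huge_pte = pt;
}

/* Model of pgtable_trans_huge_withdraw(): take a deposited table back. */
static struct pgtable *pgtable_withdraw(struct mm *mm)
{
	struct pgtable *pt = mm->pmd_huge_pte;

	assert(pt);			/* caller must have deposited one */
	mm->pmd_huge_pte = pt->next;
	return pt;
}

int main(void)
{
	struct mm mm = { NULL };
	struct pgtable pt = { .id = 42, .next = NULL };

	pgtable_deposit(&mm, &pt);			/* at huge pmd setup */
	printf("%d\n", pgtable_withdraw(&mm)->id);	/* at split/zap: 42 */
	return 0;
}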
diff --git a/mm/prio_tree.c b/mm/prio_tree.c deleted file mode 100644 index 799dcfd7cd8c..000000000000 --- a/mm/prio_tree.c +++ /dev/null | |||
@@ -1,208 +0,0 @@ | |||
1 | /* | ||
2 | * mm/prio_tree.c - priority search tree for mapping->i_mmap | ||
3 | * | ||
4 | * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu> | ||
5 | * | ||
6 | * This file is released under the GPL v2. | ||
7 | * | ||
8 | * Based on the radix priority search tree proposed by Edward M. McCreight | ||
9 | * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 | ||
10 | * | ||
11 | * 02Feb2004 Initial version | ||
12 | */ | ||
13 | |||
14 | #include <linux/mm.h> | ||
15 | #include <linux/prio_tree.h> | ||
16 | #include <linux/prefetch.h> | ||
17 | |||
18 | /* | ||
19 | * See lib/prio_tree.c for details on the general radix priority search tree | ||
20 | * code. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * The following #defines are mirrored from lib/prio_tree.c. They're only used | ||
25 | * for debugging, and should be removed (along with the debugging code using | ||
26 | * them) when switching also VMAs to the regular prio_tree code. | ||
27 | */ | ||
28 | |||
29 | #define RADIX_INDEX(vma) ((vma)->vm_pgoff) | ||
30 | #define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) | ||
31 | /* avoid overflow */ | ||
32 | #define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) | ||
33 | |||
34 | /* | ||
35 | * Radix priority search tree for address_space->i_mmap | ||
36 | * | ||
37 | * For each vma that map a unique set of file pages i.e., unique [radix_index, | ||
38 | * heap_index] value, we have a corresponding priority search tree node. If | ||
39 | * multiple vmas have identical [radix_index, heap_index] value, then one of | ||
40 | * them is used as a tree node and others are stored in a vm_set list. The tree | ||
41 | * node points to the first vma (head) of the list using vm_set.head. | ||
42 | * | ||
43 | * prio_tree_root | ||
44 | * | | ||
45 | * A vm_set.head | ||
46 | * / \ / | ||
47 | * L R -> H-I-J-K-M-N-O-P-Q-S | ||
48 | * ^ ^ <-- vm_set.list --> | ||
49 | * tree nodes | ||
50 | * | ||
51 | * We need some way to identify whether a vma is a tree node, head of a vm_set | ||
52 | * list, or just a member of a vm_set list. We cannot use vm_flags to store | ||
53 | * such information. The reason is, in the above figure, it is possible that | ||
54 | * vm_flags' of R and H are covered by the different mmap_sems. When R is | ||
55 | * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold | ||
56 | * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. | ||
57 | * That's why some trick involving shared.vm_set.parent is used for identifying | ||
58 | * tree nodes and list head nodes. | ||
59 | * | ||
60 | * vma radix priority search tree node rules: | ||
61 | * | ||
62 | * vma->shared.vm_set.parent != NULL ==> a tree node | ||
63 | * vma->shared.vm_set.head != NULL ==> list of others mapping same range | ||
64 | * vma->shared.vm_set.head == NULL ==> no others map the same range | ||
65 | * | ||
66 | * vma->shared.vm_set.parent == NULL | ||
67 | * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range | ||
68 | * vma->shared.vm_set.head == NULL ==> a list node | ||
69 | */ | ||
70 | |||
71 | /* | ||
72 | * Add a new vma known to map the same set of pages as the old vma: | ||
73 | * useful for fork's dup_mmap as well as vma_prio_tree_insert below. | ||
74 | * Note that it just happens to work correctly on i_mmap_nonlinear too. | ||
75 | */ | ||
76 | void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old) | ||
77 | { | ||
78 | /* Leave these BUG_ONs till prio_tree patch stabilizes */ | ||
79 | BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); | ||
80 | BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); | ||
81 | |||
82 | vma->shared.vm_set.head = NULL; | ||
83 | vma->shared.vm_set.parent = NULL; | ||
84 | |||
85 | if (!old->shared.vm_set.parent) | ||
86 | list_add(&vma->shared.vm_set.list, | ||
87 | &old->shared.vm_set.list); | ||
88 | else if (old->shared.vm_set.head) | ||
89 | list_add_tail(&vma->shared.vm_set.list, | ||
90 | &old->shared.vm_set.head->shared.vm_set.list); | ||
91 | else { | ||
92 | INIT_LIST_HEAD(&vma->shared.vm_set.list); | ||
93 | vma->shared.vm_set.head = old; | ||
94 | old->shared.vm_set.head = vma; | ||
95 | } | ||
96 | } | ||
97 | |||
98 | void vma_prio_tree_insert(struct vm_area_struct *vma, | ||
99 | struct prio_tree_root *root) | ||
100 | { | ||
101 | struct prio_tree_node *ptr; | ||
102 | struct vm_area_struct *old; | ||
103 | |||
104 | vma->shared.vm_set.head = NULL; | ||
105 | |||
106 | ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node); | ||
107 | if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) { | ||
108 | old = prio_tree_entry(ptr, struct vm_area_struct, | ||
109 | shared.prio_tree_node); | ||
110 | vma_prio_tree_add(vma, old); | ||
111 | } | ||
112 | } | ||
113 | |||
114 | void vma_prio_tree_remove(struct vm_area_struct *vma, | ||
115 | struct prio_tree_root *root) | ||
116 | { | ||
117 | struct vm_area_struct *node, *head, *new_head; | ||
118 | |||
119 | if (!vma->shared.vm_set.head) { | ||
120 | if (!vma->shared.vm_set.parent) | ||
121 | list_del_init(&vma->shared.vm_set.list); | ||
122 | else | ||
123 | raw_prio_tree_remove(root, &vma->shared.prio_tree_node); | ||
124 | } else { | ||
125 | /* Leave this BUG_ON till prio_tree patch stabilizes */ | ||
126 | BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma); | ||
127 | if (vma->shared.vm_set.parent) { | ||
128 | head = vma->shared.vm_set.head; | ||
129 | if (!list_empty(&head->shared.vm_set.list)) { | ||
130 | new_head = list_entry( | ||
131 | head->shared.vm_set.list.next, | ||
132 | struct vm_area_struct, | ||
133 | shared.vm_set.list); | ||
134 | list_del_init(&head->shared.vm_set.list); | ||
135 | } else | ||
136 | new_head = NULL; | ||
137 | |||
138 | raw_prio_tree_replace(root, &vma->shared.prio_tree_node, | ||
139 | &head->shared.prio_tree_node); | ||
140 | head->shared.vm_set.head = new_head; | ||
141 | if (new_head) | ||
142 | new_head->shared.vm_set.head = head; | ||
143 | |||
144 | } else { | ||
145 | node = vma->shared.vm_set.head; | ||
146 | if (!list_empty(&vma->shared.vm_set.list)) { | ||
147 | new_head = list_entry( | ||
148 | vma->shared.vm_set.list.next, | ||
149 | struct vm_area_struct, | ||
150 | shared.vm_set.list); | ||
151 | list_del_init(&vma->shared.vm_set.list); | ||
152 | node->shared.vm_set.head = new_head; | ||
153 | new_head->shared.vm_set.head = node; | ||
154 | } else | ||
155 | node->shared.vm_set.head = NULL; | ||
156 | } | ||
157 | } | ||
158 | } | ||
159 | |||
160 | /* | ||
161 | * Helper function to enumerate vmas that map a given file page or a set of | ||
162 | * contiguous file pages. The function returns vmas that at least map a single | ||
163 | * page in the given range of contiguous file pages. | ||
164 | */ | ||
165 | struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma, | ||
166 | struct prio_tree_iter *iter) | ||
167 | { | ||
168 | struct prio_tree_node *ptr; | ||
169 | struct vm_area_struct *next; | ||
170 | |||
171 | if (!vma) { | ||
172 | /* | ||
173 | * First call is with NULL vma | ||
174 | */ | ||
175 | ptr = prio_tree_next(iter); | ||
176 | if (ptr) { | ||
177 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
178 | shared.prio_tree_node); | ||
179 | prefetch(next->shared.vm_set.head); | ||
180 | return next; | ||
181 | } else | ||
182 | return NULL; | ||
183 | } | ||
184 | |||
185 | if (vma->shared.vm_set.parent) { | ||
186 | if (vma->shared.vm_set.head) { | ||
187 | next = vma->shared.vm_set.head; | ||
188 | prefetch(next->shared.vm_set.list.next); | ||
189 | return next; | ||
190 | } | ||
191 | } else { | ||
192 | next = list_entry(vma->shared.vm_set.list.next, | ||
193 | struct vm_area_struct, shared.vm_set.list); | ||
194 | if (!next->shared.vm_set.head) { | ||
195 | prefetch(next->shared.vm_set.list.next); | ||
196 | return next; | ||
197 | } | ||
198 | } | ||
199 | |||
200 | ptr = prio_tree_next(iter); | ||
201 | if (ptr) { | ||
202 | next = prio_tree_entry(ptr, struct vm_area_struct, | ||
203 | shared.prio_tree_node); | ||
204 | prefetch(next->shared.vm_set.head); | ||
205 | return next; | ||
206 | } else | ||
207 | return NULL; | ||
208 | } | ||
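The prio_tree code removed above, like the interval-tree helpers the rmap hunks below switch to (vma_interval_tree_foreach(), anon_vma_interval_tree_foreach()), answers one query: which vmas map any file page in a given [first, last] offset range. The sketch below models only that overlap test with a naive O(n) scan in userspace C; the real replacement answers it through an rbtree-backed interval tree, so this illustrates the query semantics, not either implementation.

#include <stdio.h>

struct vma {
	unsigned long vm_pgoff;	/* first file page mapped by the vma */
	unsigned long pages;	/* number of file pages mapped       */
};

/* Does the vma map any page in the closed offset range [first, last]? */
static int vma_overlaps(const struct vma *vma,
			unsigned long first, unsigned long last)
{
	unsigned long vm_last = vma->vm_pgoff + vma->pages - 1;

	return vma->vm_pgoff <= last && vm_last >= first;
}

int main(void)
{
	struct vma vmas[] = { { 0, 4 }, { 2, 8 }, { 16, 4 } };
	unsigned long first = 3, last = 3;	/* single-page query, as in rmap */

	for (unsigned int i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		if (vma_overlaps(&vmas[i], first, last))
			printf("vma %u maps the page\n", i);	/* vmas 0 and 1 */
	return 0;
}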
diff --git a/mm/readahead.c b/mm/readahead.c index ea8f8fa21649..7963f2391236 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -579,19 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
579 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | 579 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) |
580 | { | 580 | { |
581 | ssize_t ret; | 581 | ssize_t ret; |
582 | struct file *file; | 582 | struct fd f; |
583 | 583 | ||
584 | ret = -EBADF; | 584 | ret = -EBADF; |
585 | file = fget(fd); | 585 | f = fdget(fd); |
586 | if (file) { | 586 | if (f.file) { |
587 | if (file->f_mode & FMODE_READ) { | 587 | if (f.file->f_mode & FMODE_READ) { |
588 | struct address_space *mapping = file->f_mapping; | 588 | struct address_space *mapping = f.file->f_mapping; |
589 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | 589 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; |
590 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | 590 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; |
591 | unsigned long len = end - start + 1; | 591 | unsigned long len = end - start + 1; |
592 | ret = do_readahead(mapping, file, start, len); | 592 | ret = do_readahead(mapping, f.file, start, len); |
593 | } | 593 | } |
594 | fput(file); | 594 | fdput(f); |
595 | } | 595 | } |
596 | return ret; | 596 | return ret; |
597 | } | 597 | } |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/mmu_notifier.h> | 56 | #include <linux/mmu_notifier.h> |
57 | #include <linux/migrate.h> | 57 | #include <linux/migrate.h> |
58 | #include <linux/hugetlb.h> | 58 | #include <linux/hugetlb.h> |
59 | #include <linux/backing-dev.h> | ||
59 | 60 | ||
60 | #include <asm/tlbflush.h> | 61 | #include <asm/tlbflush.h> |
61 | 62 | ||
@@ -127,12 +128,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
127 | avc->vma = vma; | 128 | avc->vma = vma; |
128 | avc->anon_vma = anon_vma; | 129 | avc->anon_vma = anon_vma; |
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 130 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
130 | 131 | anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); | |
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | 132 | } |
137 | 133 | ||
138 | /** | 134 | /** |
@@ -269,51 +265,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
269 | } | 265 | } |
270 | 266 | ||
271 | /* | 267 | /* |
272 | * Some rmap walk that needs to find all ptes/hugepmds without false | ||
273 | * negatives (like migrate and split_huge_page) running concurrent | ||
274 | * with operations that copy or move pagetables (like mremap() and | ||
275 | * fork()) to be safe. They depend on the anon_vma "same_anon_vma" | ||
276 | * list to be in a certain order: the dst_vma must be placed after the | ||
277 | * src_vma in the list. This is always guaranteed by fork() but | ||
278 | * mremap() needs to call this function to enforce it in case the | ||
279 | * dst_vma isn't newly allocated and chained with the anon_vma_clone() | ||
280 | * function but just an extension of a pre-existing vma through | ||
281 | * vma_merge. | ||
282 | * | ||
283 | * NOTE: the same_anon_vma list can still be changed by other | ||
284 | * processes while mremap runs because mremap doesn't hold the | ||
285 | * anon_vma mutex to prevent modifications to the list while it | ||
286 | * runs. All we need to enforce is that the relative order of this | ||
287 | * process vmas isn't changing (we don't care about other vmas | ||
288 | * order). Each vma corresponds to an anon_vma_chain structure so | ||
289 | * there's no risk that other processes calling anon_vma_moveto_tail() | ||
290 | * and changing the same_anon_vma list under mremap() will screw with | ||
291 | * the relative order of this process vmas in the list, because we | ||
292 | * they can't alter the order of any vma that belongs to this | ||
293 | * process. And there can't be another anon_vma_moveto_tail() running | ||
294 | * concurrently with mremap() coming from this process because we hold | ||
295 | * the mmap_sem for the whole mremap(). fork() ordering dependency | ||
296 | * also shouldn't be affected because fork() only cares that the | ||
297 | * parent vmas are placed in the list before the child vmas and | ||
298 | * anon_vma_moveto_tail() won't reorder vmas from either the fork() | ||
299 | * parent or child. | ||
300 | */ | ||
301 | void anon_vma_moveto_tail(struct vm_area_struct *dst) | ||
302 | { | ||
303 | struct anon_vma_chain *pavc; | ||
304 | struct anon_vma *root = NULL; | ||
305 | |||
306 | list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { | ||
307 | struct anon_vma *anon_vma = pavc->anon_vma; | ||
308 | VM_BUG_ON(pavc->vma != dst); | ||
309 | root = lock_anon_vma_root(root, anon_vma); | ||
310 | list_del(&pavc->same_anon_vma); | ||
311 | list_add_tail(&pavc->same_anon_vma, &anon_vma->head); | ||
312 | } | ||
313 | unlock_anon_vma_root(root); | ||
314 | } | ||
315 | |||
316 | /* | ||
317 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | 268 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
318 | * the corresponding VMA in the parent process is attached to. | 269 | * the corresponding VMA in the parent process is attached to. |
319 | * Returns 0 on success, non-zero on failure. | 270 | * Returns 0 on success, non-zero on failure. |
@@ -381,13 +332,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
381 | struct anon_vma *anon_vma = avc->anon_vma; | 332 | struct anon_vma *anon_vma = avc->anon_vma; |
382 | 333 | ||
383 | root = lock_anon_vma_root(root, anon_vma); | 334 | root = lock_anon_vma_root(root, anon_vma); |
384 | list_del(&avc->same_anon_vma); | 335 | anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); |
385 | 336 | ||
386 | /* | 337 | /* |
387 | * Leave empty anon_vmas on the list - we'll need | 338 | * Leave empty anon_vmas on the list - we'll need |
388 | * to free them outside the lock. | 339 | * to free them outside the lock. |
389 | */ | 340 | */ |
390 | if (list_empty(&anon_vma->head)) | 341 | if (RB_EMPTY_ROOT(&anon_vma->rb_root)) |
391 | continue; | 342 | continue; |
392 | 343 | ||
393 | list_del(&avc->same_vma); | 344 | list_del(&avc->same_vma); |
@@ -416,7 +367,7 @@ static void anon_vma_ctor(void *data) | |||
416 | 367 | ||
417 | mutex_init(&anon_vma->mutex); | 368 | mutex_init(&anon_vma->mutex); |
418 | atomic_set(&anon_vma->refcount, 0); | 369 | atomic_set(&anon_vma->refcount, 0); |
419 | INIT_LIST_HEAD(&anon_vma->head); | 370 | anon_vma->rb_root = RB_ROOT; |
420 | } | 371 | } |
421 | 372 | ||
422 | void __init anon_vma_init(void) | 373 | void __init anon_vma_init(void) |
@@ -560,22 +511,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) | |||
560 | 511 | ||
561 | /* | 512 | /* |
562 | * At what user virtual address is page expected in @vma? | 513 | * At what user virtual address is page expected in @vma? |
563 | * Returns virtual address or -EFAULT if page's index/offset is not | ||
564 | * within the range mapped the @vma. | ||
565 | */ | 514 | */ |
566 | inline unsigned long | 515 | static inline unsigned long |
567 | vma_address(struct page *page, struct vm_area_struct *vma) | 516 | __vma_address(struct page *page, struct vm_area_struct *vma) |
568 | { | 517 | { |
569 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 518 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
570 | unsigned long address; | ||
571 | 519 | ||
572 | if (unlikely(is_vm_hugetlb_page(vma))) | 520 | if (unlikely(is_vm_hugetlb_page(vma))) |
573 | pgoff = page->index << huge_page_order(page_hstate(page)); | 521 | pgoff = page->index << huge_page_order(page_hstate(page)); |
574 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 522 | |
575 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { | 523 | return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
576 | /* page should be within @vma mapping range */ | 524 | } |
577 | return -EFAULT; | 525 | |
578 | } | 526 | inline unsigned long |
527 | vma_address(struct page *page, struct vm_area_struct *vma) | ||
528 | { | ||
529 | unsigned long address = __vma_address(page, vma); | ||
530 | |||
531 | /* page should be within @vma mapping range */ | ||
532 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
533 | |||
579 | return address; | 534 | return address; |
580 | } | 535 | } |
581 | 536 | ||
@@ -585,6 +540,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
585 | */ | 540 | */ |
586 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 541 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
587 | { | 542 | { |
543 | unsigned long address; | ||
588 | if (PageAnon(page)) { | 544 | if (PageAnon(page)) { |
589 | struct anon_vma *page__anon_vma = page_anon_vma(page); | 545 | struct anon_vma *page__anon_vma = page_anon_vma(page); |
590 | /* | 546 | /* |
@@ -600,7 +556,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
600 | return -EFAULT; | 556 | return -EFAULT; |
601 | } else | 557 | } else |
602 | return -EFAULT; | 558 | return -EFAULT; |
603 | return vma_address(page, vma); | 559 | address = __vma_address(page, vma); |
560 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) | ||
561 | return -EFAULT; | ||
562 | return address; | ||
604 | } | 563 | } |
605 | 564 | ||
606 | /* | 565 | /* |
@@ -674,8 +633,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
674 | pte_t *pte; | 633 | pte_t *pte; |
675 | spinlock_t *ptl; | 634 | spinlock_t *ptl; |
676 | 635 | ||
677 | address = vma_address(page, vma); | 636 | address = __vma_address(page, vma); |
678 | if (address == -EFAULT) /* out of vma range */ | 637 | if (unlikely(address < vma->vm_start || address >= vma->vm_end)) |
679 | return 0; | 638 | return 0; |
680 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); | 639 | pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); |
681 | if (!pte) /* the page is not in this mm */ | 640 | if (!pte) /* the page is not in this mm */ |
@@ -769,6 +728,7 @@ static int page_referenced_anon(struct page *page, | |||
769 | { | 728 | { |
770 | unsigned int mapcount; | 729 | unsigned int mapcount; |
771 | struct anon_vma *anon_vma; | 730 | struct anon_vma *anon_vma; |
731 | pgoff_t pgoff; | ||
772 | struct anon_vma_chain *avc; | 732 | struct anon_vma_chain *avc; |
773 | int referenced = 0; | 733 | int referenced = 0; |
774 | 734 | ||
@@ -777,11 +737,10 @@ static int page_referenced_anon(struct page *page, | |||
777 | return referenced; | 737 | return referenced; |
778 | 738 | ||
779 | mapcount = page_mapcount(page); | 739 | mapcount = page_mapcount(page); |
780 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 740 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
741 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
781 | struct vm_area_struct *vma = avc->vma; | 742 | struct vm_area_struct *vma = avc->vma; |
782 | unsigned long address = vma_address(page, vma); | 743 | unsigned long address = vma_address(page, vma); |
783 | if (address == -EFAULT) | ||
784 | continue; | ||
785 | /* | 744 | /* |
786 | * If we are reclaiming on behalf of a cgroup, skip | 745 | * If we are reclaiming on behalf of a cgroup, skip |
787 | * counting on behalf of references from different | 746 | * counting on behalf of references from different |
@@ -820,7 +779,6 @@ static int page_referenced_file(struct page *page, | |||
820 | struct address_space *mapping = page->mapping; | 779 | struct address_space *mapping = page->mapping; |
821 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 780 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
822 | struct vm_area_struct *vma; | 781 | struct vm_area_struct *vma; |
823 | struct prio_tree_iter iter; | ||
824 | int referenced = 0; | 782 | int referenced = 0; |
825 | 783 | ||
826 | /* | 784 | /* |
@@ -846,10 +804,8 @@ static int page_referenced_file(struct page *page, | |||
846 | */ | 804 | */ |
847 | mapcount = page_mapcount(page); | 805 | mapcount = page_mapcount(page); |
848 | 806 | ||
849 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 807 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
850 | unsigned long address = vma_address(page, vma); | 808 | unsigned long address = vma_address(page, vma); |
851 | if (address == -EFAULT) | ||
852 | continue; | ||
853 | /* | 809 | /* |
854 | * If we are reclaiming on behalf of a cgroup, skip | 810 | * If we are reclaiming on behalf of a cgroup, skip |
855 | * counting on behalf of references from different | 811 | * counting on behalf of references from different |
@@ -929,7 +885,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
929 | pte_t entry; | 885 | pte_t entry; |
930 | 886 | ||
931 | flush_cache_page(vma, address, pte_pfn(*pte)); | 887 | flush_cache_page(vma, address, pte_pfn(*pte)); |
932 | entry = ptep_clear_flush_notify(vma, address, pte); | 888 | entry = ptep_clear_flush(vma, address, pte); |
933 | entry = pte_wrprotect(entry); | 889 | entry = pte_wrprotect(entry); |
934 | entry = pte_mkclean(entry); | 890 | entry = pte_mkclean(entry); |
935 | set_pte_at(mm, address, pte, entry); | 891 | set_pte_at(mm, address, pte, entry); |
@@ -937,6 +893,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
937 | } | 893 | } |
938 | 894 | ||
939 | pte_unmap_unlock(pte, ptl); | 895 | pte_unmap_unlock(pte, ptl); |
896 | |||
897 | if (ret) | ||
898 | mmu_notifier_invalidate_page(mm, address); | ||
940 | out: | 899 | out: |
941 | return ret; | 900 | return ret; |
942 | } | 901 | } |
@@ -945,17 +904,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
945 | { | 904 | { |
946 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 905 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
947 | struct vm_area_struct *vma; | 906 | struct vm_area_struct *vma; |
948 | struct prio_tree_iter iter; | ||
949 | int ret = 0; | 907 | int ret = 0; |
950 | 908 | ||
951 | BUG_ON(PageAnon(page)); | 909 | BUG_ON(PageAnon(page)); |
952 | 910 | ||
953 | mutex_lock(&mapping->i_mmap_mutex); | 911 | mutex_lock(&mapping->i_mmap_mutex); |
954 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 912 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
955 | if (vma->vm_flags & VM_SHARED) { | 913 | if (vma->vm_flags & VM_SHARED) { |
956 | unsigned long address = vma_address(page, vma); | 914 | unsigned long address = vma_address(page, vma); |
957 | if (address == -EFAULT) | ||
958 | continue; | ||
959 | ret += page_mkclean_one(page, vma, address); | 915 | ret += page_mkclean_one(page, vma, address); |
960 | } | 916 | } |
961 | } | 917 | } |
@@ -971,11 +927,8 @@ int page_mkclean(struct page *page) | |||
971 | 927 | ||
972 | if (page_mapped(page)) { | 928 | if (page_mapped(page)) { |
973 | struct address_space *mapping = page_mapping(page); | 929 | struct address_space *mapping = page_mapping(page); |
974 | if (mapping) { | 930 | if (mapping) |
975 | ret = page_mkclean_file(mapping, page); | 931 | ret = page_mkclean_file(mapping, page); |
976 | if (page_test_and_clear_dirty(page_to_pfn(page), 1)) | ||
977 | ret = 1; | ||
978 | } | ||
979 | } | 932 | } |
980 | 933 | ||
981 | return ret; | 934 | return ret; |
@@ -1128,7 +1081,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
1128 | else | 1081 | else |
1129 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1082 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1130 | __page_set_anon_rmap(page, vma, address, 1); | 1083 | __page_set_anon_rmap(page, vma, address, 1); |
1131 | if (page_evictable(page, vma)) | 1084 | if (!mlocked_vma_newpage(vma, page)) |
1132 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1085 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
1133 | else | 1086 | else |
1134 | add_page_to_unevictable_list(page); | 1087 | add_page_to_unevictable_list(page); |
@@ -1161,6 +1114,7 @@ void page_add_file_rmap(struct page *page) | |||
1161 | */ | 1114 | */ |
1162 | void page_remove_rmap(struct page *page) | 1115 | void page_remove_rmap(struct page *page) |
1163 | { | 1116 | { |
1117 | struct address_space *mapping = page_mapping(page); | ||
1164 | bool anon = PageAnon(page); | 1118 | bool anon = PageAnon(page); |
1165 | bool locked; | 1119 | bool locked; |
1166 | unsigned long flags; | 1120 | unsigned long flags; |
@@ -1183,8 +1137,19 @@ void page_remove_rmap(struct page *page) | |||
1183 | * this if the page is anon, so about to be freed; but perhaps | 1137 | * this if the page is anon, so about to be freed; but perhaps |
1184 | * not if it's in swapcache - there might be another pte slot | 1138 | * not if it's in swapcache - there might be another pte slot |
1185 | * containing the swap entry, but page not yet written to swap. | 1139 | * containing the swap entry, but page not yet written to swap. |
1140 | * | ||
1141 | * And we can skip it on file pages, so long as the filesystem | ||
1142 | * participates in dirty tracking; but need to catch shm and tmpfs | ||
1143 | * and ramfs pages which have been modified since creation by read | ||
1144 | * fault. | ||
1145 | * | ||
1146 | * Note that mapping must be decided above, before decrementing | ||
1147 | * mapcount (which luckily provides a barrier): once page is unmapped, | ||
1148 | * it could be truncated and page->mapping reset to NULL at any moment. | ||
1149 | * Note also that we are relying on page_mapping(page) to set mapping | ||
1150 | * to &swapper_space when PageSwapCache(page). | ||
1186 | */ | 1151 | */ |
1187 | if ((!anon || PageSwapCache(page)) && | 1152 | if (mapping && !mapping_cap_account_dirty(mapping) && |
1188 | page_test_and_clear_dirty(page_to_pfn(page), 1)) | 1153 | page_test_and_clear_dirty(page_to_pfn(page), 1)) |
1189 | set_page_dirty(page); | 1154 | set_page_dirty(page); |
1190 | /* | 1155 | /* |
@@ -1203,7 +1168,10 @@ void page_remove_rmap(struct page *page) | |||
1203 | } else { | 1168 | } else { |
1204 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1169 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
1205 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1170 | mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1171 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1206 | } | 1172 | } |
1173 | if (unlikely(PageMlocked(page))) | ||
1174 | clear_page_mlock(page); | ||
1207 | /* | 1175 | /* |
1208 | * It would be tidy to reset the PageAnon mapping here, | 1176 | * It would be tidy to reset the PageAnon mapping here, |
1209 | * but that might overwrite a racing page_add_anon_rmap | 1177 | * but that might overwrite a racing page_add_anon_rmap |
@@ -1213,6 +1181,7 @@ void page_remove_rmap(struct page *page) | |||
1213 | * Leaving it set also helps swapoff to reinstate ptes | 1181 | * Leaving it set also helps swapoff to reinstate ptes |
1214 | * faster for those pages still in swapcache. | 1182 | * faster for those pages still in swapcache. |
1215 | */ | 1183 | */ |
1184 | return; | ||
1216 | out: | 1185 | out: |
1217 | if (!anon) | 1186 | if (!anon) |
1218 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | 1187 | mem_cgroup_end_update_page_stat(page, &locked, &flags); |
@@ -1256,7 +1225,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1256 | 1225 | ||
1257 | /* Nuke the page table entry. */ | 1226 | /* Nuke the page table entry. */ |
1258 | flush_cache_page(vma, address, page_to_pfn(page)); | 1227 | flush_cache_page(vma, address, page_to_pfn(page)); |
1259 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1228 | pteval = ptep_clear_flush(vma, address, pte); |
1260 | 1229 | ||
1261 | /* Move the dirty bit to the physical page now the pte is gone. */ | 1230 | /* Move the dirty bit to the physical page now the pte is gone. */ |
1262 | if (pte_dirty(pteval)) | 1231 | if (pte_dirty(pteval)) |
@@ -1318,6 +1287,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1318 | 1287 | ||
1319 | out_unmap: | 1288 | out_unmap: |
1320 | pte_unmap_unlock(pte, ptl); | 1289 | pte_unmap_unlock(pte, ptl); |
1290 | if (ret != SWAP_FAIL) | ||
1291 | mmu_notifier_invalidate_page(mm, address); | ||
1321 | out: | 1292 | out: |
1322 | return ret; | 1293 | return ret; |
1323 | 1294 | ||
@@ -1382,6 +1353,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1382 | spinlock_t *ptl; | 1353 | spinlock_t *ptl; |
1383 | struct page *page; | 1354 | struct page *page; |
1384 | unsigned long address; | 1355 | unsigned long address; |
1356 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1357 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1385 | unsigned long end; | 1358 | unsigned long end; |
1386 | int ret = SWAP_AGAIN; | 1359 | int ret = SWAP_AGAIN; |
1387 | int locked_vma = 0; | 1360 | int locked_vma = 0; |
@@ -1405,6 +1378,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1405 | if (!pmd_present(*pmd)) | 1378 | if (!pmd_present(*pmd)) |
1406 | return ret; | 1379 | return ret; |
1407 | 1380 | ||
1381 | mmun_start = address; | ||
1382 | mmun_end = end; | ||
1383 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1384 | |||
1408 | /* | 1385 | /* |
1409 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | 1386 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
1410 | * keep the sem while scanning the cluster for mlocking pages. | 1387 | * keep the sem while scanning the cluster for mlocking pages. |
@@ -1438,7 +1415,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1438 | 1415 | ||
1439 | /* Nuke the page table entry. */ | 1416 | /* Nuke the page table entry. */ |
1440 | flush_cache_page(vma, address, pte_pfn(*pte)); | 1417 | flush_cache_page(vma, address, pte_pfn(*pte)); |
1441 | pteval = ptep_clear_flush_notify(vma, address, pte); | 1418 | pteval = ptep_clear_flush(vma, address, pte); |
1442 | 1419 | ||
1443 | /* If nonlinear, store the file page offset in the pte. */ | 1420 | /* If nonlinear, store the file page offset in the pte. */ |
1444 | if (page->index != linear_page_index(vma, address)) | 1421 | if (page->index != linear_page_index(vma, address)) |
@@ -1454,6 +1431,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1454 | (*mapcount)--; | 1431 | (*mapcount)--; |
1455 | } | 1432 | } |
1456 | pte_unmap_unlock(pte - 1, ptl); | 1433 | pte_unmap_unlock(pte - 1, ptl); |
1434 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1457 | if (locked_vma) | 1435 | if (locked_vma) |
1458 | up_read(&vma->vm_mm->mmap_sem); | 1436 | up_read(&vma->vm_mm->mmap_sem); |
1459 | return ret; | 1437 | return ret; |
@@ -1492,6 +1470,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma) | |||
1492 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1470 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1493 | { | 1471 | { |
1494 | struct anon_vma *anon_vma; | 1472 | struct anon_vma *anon_vma; |
1473 | pgoff_t pgoff; | ||
1495 | struct anon_vma_chain *avc; | 1474 | struct anon_vma_chain *avc; |
1496 | int ret = SWAP_AGAIN; | 1475 | int ret = SWAP_AGAIN; |
1497 | 1476 | ||
@@ -1499,7 +1478,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1499 | if (!anon_vma) | 1478 | if (!anon_vma) |
1500 | return ret; | 1479 | return ret; |
1501 | 1480 | ||
1502 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1481 | pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1482 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1503 | struct vm_area_struct *vma = avc->vma; | 1483 | struct vm_area_struct *vma = avc->vma; |
1504 | unsigned long address; | 1484 | unsigned long address; |
1505 | 1485 | ||
@@ -1516,8 +1496,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1516 | continue; | 1496 | continue; |
1517 | 1497 | ||
1518 | address = vma_address(page, vma); | 1498 | address = vma_address(page, vma); |
1519 | if (address == -EFAULT) | ||
1520 | continue; | ||
1521 | ret = try_to_unmap_one(page, vma, address, flags); | 1499 | ret = try_to_unmap_one(page, vma, address, flags); |
1522 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1500 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1523 | break; | 1501 | break; |
@@ -1547,7 +1525,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1547 | struct address_space *mapping = page->mapping; | 1525 | struct address_space *mapping = page->mapping; |
1548 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1526 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1549 | struct vm_area_struct *vma; | 1527 | struct vm_area_struct *vma; |
1550 | struct prio_tree_iter iter; | ||
1551 | int ret = SWAP_AGAIN; | 1528 | int ret = SWAP_AGAIN; |
1552 | unsigned long cursor; | 1529 | unsigned long cursor; |
1553 | unsigned long max_nl_cursor = 0; | 1530 | unsigned long max_nl_cursor = 0; |
@@ -1555,10 +1532,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1555 | unsigned int mapcount; | 1532 | unsigned int mapcount; |
1556 | 1533 | ||
1557 | mutex_lock(&mapping->i_mmap_mutex); | 1534 | mutex_lock(&mapping->i_mmap_mutex); |
1558 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1535 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1559 | unsigned long address = vma_address(page, vma); | 1536 | unsigned long address = vma_address(page, vma); |
1560 | if (address == -EFAULT) | ||
1561 | continue; | ||
1562 | ret = try_to_unmap_one(page, vma, address, flags); | 1537 | ret = try_to_unmap_one(page, vma, address, flags); |
1563 | if (ret != SWAP_AGAIN || !page_mapped(page)) | 1538 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
1564 | goto out; | 1539 | goto out; |
@@ -1576,7 +1551,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1576 | goto out; | 1551 | goto out; |
1577 | 1552 | ||
1578 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1553 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1579 | shared.vm_set.list) { | 1554 | shared.nonlinear) { |
1580 | cursor = (unsigned long) vma->vm_private_data; | 1555 | cursor = (unsigned long) vma->vm_private_data; |
1581 | if (cursor > max_nl_cursor) | 1556 | if (cursor > max_nl_cursor) |
1582 | max_nl_cursor = cursor; | 1557 | max_nl_cursor = cursor; |
@@ -1608,7 +1583,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1608 | 1583 | ||
1609 | do { | 1584 | do { |
1610 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1585 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
1611 | shared.vm_set.list) { | 1586 | shared.nonlinear) { |
1612 | cursor = (unsigned long) vma->vm_private_data; | 1587 | cursor = (unsigned long) vma->vm_private_data; |
1613 | while ( cursor < max_nl_cursor && | 1588 | while ( cursor < max_nl_cursor && |
1614 | cursor < vma->vm_end - vma->vm_start) { | 1589 | cursor < vma->vm_end - vma->vm_start) { |
@@ -1631,7 +1606,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
1631 | * in locked vmas). Reset cursor on all unreserved nonlinear | 1606 | * in locked vmas). Reset cursor on all unreserved nonlinear |
1632 | * vmas, now forgetting on which ones it had fallen behind. | 1607 | * vmas, now forgetting on which ones it had fallen behind. |
1633 | */ | 1608 | */ |
1634 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | 1609 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) |
1635 | vma->vm_private_data = NULL; | 1610 | vma->vm_private_data = NULL; |
1636 | out: | 1611 | out: |
1637 | mutex_unlock(&mapping->i_mmap_mutex); | 1612 | mutex_unlock(&mapping->i_mmap_mutex); |
@@ -1716,6 +1691,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1716 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1691 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1717 | { | 1692 | { |
1718 | struct anon_vma *anon_vma; | 1693 | struct anon_vma *anon_vma; |
1694 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1719 | struct anon_vma_chain *avc; | 1695 | struct anon_vma_chain *avc; |
1720 | int ret = SWAP_AGAIN; | 1696 | int ret = SWAP_AGAIN; |
1721 | 1697 | ||
@@ -1729,11 +1705,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1729 | if (!anon_vma) | 1705 | if (!anon_vma) |
1730 | return ret; | 1706 | return ret; |
1731 | anon_vma_lock(anon_vma); | 1707 | anon_vma_lock(anon_vma); |
1732 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | 1708 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1733 | struct vm_area_struct *vma = avc->vma; | 1709 | struct vm_area_struct *vma = avc->vma; |
1734 | unsigned long address = vma_address(page, vma); | 1710 | unsigned long address = vma_address(page, vma); |
1735 | if (address == -EFAULT) | ||
1736 | continue; | ||
1737 | ret = rmap_one(page, vma, address, arg); | 1711 | ret = rmap_one(page, vma, address, arg); |
1738 | if (ret != SWAP_AGAIN) | 1712 | if (ret != SWAP_AGAIN) |
1739 | break; | 1713 | break; |
@@ -1748,16 +1722,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | |||
1748 | struct address_space *mapping = page->mapping; | 1722 | struct address_space *mapping = page->mapping; |
1749 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1723 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
1750 | struct vm_area_struct *vma; | 1724 | struct vm_area_struct *vma; |
1751 | struct prio_tree_iter iter; | ||
1752 | int ret = SWAP_AGAIN; | 1725 | int ret = SWAP_AGAIN; |
1753 | 1726 | ||
1754 | if (!mapping) | 1727 | if (!mapping) |
1755 | return ret; | 1728 | return ret; |
1756 | mutex_lock(&mapping->i_mmap_mutex); | 1729 | mutex_lock(&mapping->i_mmap_mutex); |
1757 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1730 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1758 | unsigned long address = vma_address(page, vma); | 1731 | unsigned long address = vma_address(page, vma); |
1759 | if (address == -EFAULT) | ||
1760 | continue; | ||
1761 | ret = rmap_one(page, vma, address, arg); | 1732 | ret = rmap_one(page, vma, address, arg); |
1762 | if (ret != SWAP_AGAIN) | 1733 | if (ret != SWAP_AGAIN) |
1763 | break; | 1734 | break; |
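The mm/rmap.c hunks above convert both the anonymous and the file-backed reverse-map walks from the prio_tree (vma_prio_tree_foreach() plus an -EFAULT check on every vma_address() result) to interval trees stabbed at the page's offset (anon_vma_interval_tree_foreach() on anon_vma->rb_root, vma_interval_tree_foreach() on mapping->i_mmap): only VMAs whose range actually covers pgoff are visited, which is why the -EFAULT checks can go. A minimal userspace sketch of the "visit only covering VMAs, then compute the virtual address" step; a plain array stands in for the rbtree-backed interval tree, and the struct and names are illustrative, not the kernel's:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Illustrative, much-simplified vm_area_struct. */
struct vma {
        unsigned long vm_start;  /* virtual start address             */
        unsigned long vm_pgoff;  /* file offset of vm_start, in pages */
        unsigned long nr_pages;  /* length of the mapping, in pages   */
};

/* What vma_address() computes once the walk guarantees pgoff is covered. */
static unsigned long vma_address(const struct vma *v, unsigned long pgoff)
{
        return v->vm_start + ((pgoff - v->vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
        struct vma vmas[] = {
                { 0x400000, 0,  16 },
                { 0x800000, 8,  16 },
                { 0xc00000, 64, 16 },
        };
        unsigned long pgoff = 10;   /* page->index of the page being unmapped */
        size_t i;

        /* Models the interval-tree stab: skip VMAs that do not cover pgoff. */
        for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++) {
                const struct vma *v = &vmas[i];

                if (pgoff < v->vm_pgoff || pgoff >= v->vm_pgoff + v->nr_pages)
                        continue;   /* the real tree never even visits these */
                printf("vma %zu maps the page at %#lx\n", i, vma_address(v, pgoff));
        }
        return 0;
}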
diff --git a/mm/shmem.c b/mm/shmem.c index d4e184e2a38e..67afba5117f2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -77,13 +77,6 @@ static struct vfsmount *shm_mnt; | |||
77 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ | 77 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ |
78 | #define SHORT_SYMLINK_LEN 128 | 78 | #define SHORT_SYMLINK_LEN 128 |
79 | 79 | ||
80 | struct shmem_xattr { | ||
81 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ | ||
82 | char *name; /* xattr name */ | ||
83 | size_t size; | ||
84 | char value[0]; | ||
85 | }; | ||
86 | |||
87 | /* | 80 | /* |
88 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | 81 | * shmem_fallocate and shmem_writepage communicate via inode->i_private |
89 | * (with i_mutex making sure that it has only one user at a time): | 82 | * (with i_mutex making sure that it has only one user at a time): |
@@ -636,7 +629,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
636 | static void shmem_evict_inode(struct inode *inode) | 629 | static void shmem_evict_inode(struct inode *inode) |
637 | { | 630 | { |
638 | struct shmem_inode_info *info = SHMEM_I(inode); | 631 | struct shmem_inode_info *info = SHMEM_I(inode); |
639 | struct shmem_xattr *xattr, *nxattr; | ||
640 | 632 | ||
641 | if (inode->i_mapping->a_ops == &shmem_aops) { | 633 | if (inode->i_mapping->a_ops == &shmem_aops) { |
642 | shmem_unacct_size(info->flags, inode->i_size); | 634 | shmem_unacct_size(info->flags, inode->i_size); |
@@ -650,10 +642,7 @@ static void shmem_evict_inode(struct inode *inode) | |||
650 | } else | 642 | } else |
651 | kfree(info->symlink); | 643 | kfree(info->symlink); |
652 | 644 | ||
653 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { | 645 | simple_xattrs_free(&info->xattrs); |
654 | kfree(xattr->name); | ||
655 | kfree(xattr); | ||
656 | } | ||
657 | BUG_ON(inode->i_blocks); | 646 | BUG_ON(inode->i_blocks); |
658 | shmem_free_inode(inode->i_sb); | 647 | shmem_free_inode(inode->i_sb); |
659 | clear_inode(inode); | 648 | clear_inode(inode); |
@@ -1350,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
1350 | { | 1339 | { |
1351 | file_accessed(file); | 1340 | file_accessed(file); |
1352 | vma->vm_ops = &shmem_vm_ops; | 1341 | vma->vm_ops = &shmem_vm_ops; |
1353 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
1354 | return 0; | 1342 | return 0; |
1355 | } | 1343 | } |
1356 | 1344 | ||
@@ -1377,7 +1365,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1377 | spin_lock_init(&info->lock); | 1365 | spin_lock_init(&info->lock); |
1378 | info->flags = flags & VM_NORESERVE; | 1366 | info->flags = flags & VM_NORESERVE; |
1379 | INIT_LIST_HEAD(&info->swaplist); | 1367 | INIT_LIST_HEAD(&info->swaplist); |
1380 | INIT_LIST_HEAD(&info->xattr_list); | 1368 | simple_xattrs_init(&info->xattrs); |
1381 | cache_no_acl(inode); | 1369 | cache_no_acl(inode); |
1382 | 1370 | ||
1383 | switch (mode & S_IFMT) { | 1371 | switch (mode & S_IFMT) { |
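In the shmem hunks above, the private struct shmem_xattr list (anchored in shmem_inode_info and walked under info->lock) gives way to the generic simple_xattr code: the per-inode container is set up with simple_xattrs_init() in shmem_get_inode() and torn down with simple_xattrs_free() in shmem_evict_inode(). A minimal userspace sketch of such a container, a linked list of name/value blobs with init, add and free; the types and names are illustrative, not the fs/xattr.c API:

#include <stdlib.h>
#include <string.h>

/* Illustrative model of one extended attribute and of the container. */
struct xattr_entry {
        struct xattr_entry *next;
        char *name;
        size_t size;
        unsigned char value[];   /* flexible array, like the kernel's value[] */
};

struct xattrs {
        struct xattr_entry *head;
};

static void xattrs_init(struct xattrs *xs)
{
        xs->head = NULL;
}

static int xattrs_add(struct xattrs *xs, const char *name,
                      const void *value, size_t size)
{
        struct xattr_entry *e = malloc(sizeof(*e) + size);

        if (!e)
                return -1;
        e->name = malloc(strlen(name) + 1);
        if (!e->name) {
                free(e);
                return -1;
        }
        strcpy(e->name, name);
        e->size = size;
        memcpy(e->value, value, size);
        e->next = xs->head;
        xs->head = e;
        return 0;
}

/* What simple_xattrs_free() does conceptually at inode eviction time. */
static void xattrs_free(struct xattrs *xs)
{
        while (xs->head) {
                struct xattr_entry *e = xs->head;

                xs->head = e->next;
                free(e->name);
                free(e);
        }
}

int main(void)
{
        struct xattrs xs;

        xattrs_init(&xs);
        xattrs_add(&xs, "user.comment", "hello", 5);
        xattrs_free(&xs);
        return 0;
}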
@@ -2060,28 +2048,6 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co | |||
2060 | */ | 2048 | */ |
2061 | 2049 | ||
2062 | /* | 2050 | /* |
2063 | * Allocate new xattr and copy in the value; but leave the name to callers. | ||
2064 | */ | ||
2065 | static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) | ||
2066 | { | ||
2067 | struct shmem_xattr *new_xattr; | ||
2068 | size_t len; | ||
2069 | |||
2070 | /* wrap around? */ | ||
2071 | len = sizeof(*new_xattr) + size; | ||
2072 | if (len <= sizeof(*new_xattr)) | ||
2073 | return NULL; | ||
2074 | |||
2075 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
2076 | if (!new_xattr) | ||
2077 | return NULL; | ||
2078 | |||
2079 | new_xattr->size = size; | ||
2080 | memcpy(new_xattr->value, value, size); | ||
2081 | return new_xattr; | ||
2082 | } | ||
2083 | |||
2084 | /* | ||
2085 | * Callback for security_inode_init_security() for acquiring xattrs. | 2051 | * Callback for security_inode_init_security() for acquiring xattrs. |
2086 | */ | 2052 | */ |
2087 | static int shmem_initxattrs(struct inode *inode, | 2053 | static int shmem_initxattrs(struct inode *inode, |
@@ -2090,11 +2056,11 @@ static int shmem_initxattrs(struct inode *inode, | |||
2090 | { | 2056 | { |
2091 | struct shmem_inode_info *info = SHMEM_I(inode); | 2057 | struct shmem_inode_info *info = SHMEM_I(inode); |
2092 | const struct xattr *xattr; | 2058 | const struct xattr *xattr; |
2093 | struct shmem_xattr *new_xattr; | 2059 | struct simple_xattr *new_xattr; |
2094 | size_t len; | 2060 | size_t len; |
2095 | 2061 | ||
2096 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | 2062 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { |
2097 | new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); | 2063 | new_xattr = simple_xattr_alloc(xattr->value, xattr->value_len); |
2098 | if (!new_xattr) | 2064 | if (!new_xattr) |
2099 | return -ENOMEM; | 2065 | return -ENOMEM; |
2100 | 2066 | ||
@@ -2111,91 +2077,12 @@ static int shmem_initxattrs(struct inode *inode, | |||
2111 | memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, | 2077 | memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, |
2112 | xattr->name, len); | 2078 | xattr->name, len); |
2113 | 2079 | ||
2114 | spin_lock(&info->lock); | 2080 | simple_xattr_list_add(&info->xattrs, new_xattr); |
2115 | list_add(&new_xattr->list, &info->xattr_list); | ||
2116 | spin_unlock(&info->lock); | ||
2117 | } | 2081 | } |
2118 | 2082 | ||
2119 | return 0; | 2083 | return 0; |
2120 | } | 2084 | } |
2121 | 2085 | ||
2122 | static int shmem_xattr_get(struct dentry *dentry, const char *name, | ||
2123 | void *buffer, size_t size) | ||
2124 | { | ||
2125 | struct shmem_inode_info *info; | ||
2126 | struct shmem_xattr *xattr; | ||
2127 | int ret = -ENODATA; | ||
2128 | |||
2129 | info = SHMEM_I(dentry->d_inode); | ||
2130 | |||
2131 | spin_lock(&info->lock); | ||
2132 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2133 | if (strcmp(name, xattr->name)) | ||
2134 | continue; | ||
2135 | |||
2136 | ret = xattr->size; | ||
2137 | if (buffer) { | ||
2138 | if (size < xattr->size) | ||
2139 | ret = -ERANGE; | ||
2140 | else | ||
2141 | memcpy(buffer, xattr->value, xattr->size); | ||
2142 | } | ||
2143 | break; | ||
2144 | } | ||
2145 | spin_unlock(&info->lock); | ||
2146 | return ret; | ||
2147 | } | ||
2148 | |||
2149 | static int shmem_xattr_set(struct inode *inode, const char *name, | ||
2150 | const void *value, size_t size, int flags) | ||
2151 | { | ||
2152 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
2153 | struct shmem_xattr *xattr; | ||
2154 | struct shmem_xattr *new_xattr = NULL; | ||
2155 | int err = 0; | ||
2156 | |||
2157 | /* value == NULL means remove */ | ||
2158 | if (value) { | ||
2159 | new_xattr = shmem_xattr_alloc(value, size); | ||
2160 | if (!new_xattr) | ||
2161 | return -ENOMEM; | ||
2162 | |||
2163 | new_xattr->name = kstrdup(name, GFP_KERNEL); | ||
2164 | if (!new_xattr->name) { | ||
2165 | kfree(new_xattr); | ||
2166 | return -ENOMEM; | ||
2167 | } | ||
2168 | } | ||
2169 | |||
2170 | spin_lock(&info->lock); | ||
2171 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2172 | if (!strcmp(name, xattr->name)) { | ||
2173 | if (flags & XATTR_CREATE) { | ||
2174 | xattr = new_xattr; | ||
2175 | err = -EEXIST; | ||
2176 | } else if (new_xattr) { | ||
2177 | list_replace(&xattr->list, &new_xattr->list); | ||
2178 | } else { | ||
2179 | list_del(&xattr->list); | ||
2180 | } | ||
2181 | goto out; | ||
2182 | } | ||
2183 | } | ||
2184 | if (flags & XATTR_REPLACE) { | ||
2185 | xattr = new_xattr; | ||
2186 | err = -ENODATA; | ||
2187 | } else { | ||
2188 | list_add(&new_xattr->list, &info->xattr_list); | ||
2189 | xattr = NULL; | ||
2190 | } | ||
2191 | out: | ||
2192 | spin_unlock(&info->lock); | ||
2193 | if (xattr) | ||
2194 | kfree(xattr->name); | ||
2195 | kfree(xattr); | ||
2196 | return err; | ||
2197 | } | ||
2198 | |||
2199 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 2086 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2200 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2087 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2201 | &generic_acl_access_handler, | 2088 | &generic_acl_access_handler, |
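The hunk above deletes the hand-rolled shmem_xattr_get()/shmem_xattr_set() pair and converts shmem_initxattrs() to simple_xattr_alloc() plus simple_xattr_list_add(). The set-side semantics the removed code implemented (XATTR_CREATE fails with -EEXIST if the name already exists, XATTR_REPLACE fails with -ENODATA if it does not, a NULL value removes the attribute) now come from the shared simple_xattr code. A minimal userspace sketch of just that flag logic, using a tiny fixed-size store and illustrative names:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define XATTR_CREATE  0x1   /* fail if the attribute already exists */
#define XATTR_REPLACE 0x2   /* fail if the attribute does not exist */

/* Tiny fixed-size store standing in for the per-inode xattr list. */
static struct { char name[32]; char value[32]; bool used; } store[8];

static int find(const char *name)
{
        int i;

        for (i = 0; i < 8; i++)
                if (store[i].used && strcmp(store[i].name, name) == 0)
                        return i;
        return -1;
}

/* value == NULL means remove, mirroring the removed shmem_xattr_set(). */
static int xattr_set(const char *name, const char *value, int flags)
{
        int i = find(name);

        if (i >= 0 && (flags & XATTR_CREATE))
                return -EEXIST;
        if (i < 0 && (flags & XATTR_REPLACE))
                return -ENODATA;
        if (!value) {                        /* removal */
                if (i < 0)
                        return -ENODATA;
                store[i].used = false;
                return 0;
        }
        if (i < 0) {                         /* create a new slot */
                for (i = 0; i < 8 && store[i].used; i++)
                        ;
                if (i == 8)
                        return -ENOMEM;
                snprintf(store[i].name, sizeof(store[i].name), "%s", name);
                store[i].used = true;
        }
        snprintf(store[i].value, sizeof(store[i].value), "%s", value);
        return 0;
}

int main(void)
{
        printf("create:  %d\n", xattr_set("user.a", "1", XATTR_CREATE));   /* 0        */
        printf("again:   %d\n", xattr_set("user.a", "2", XATTR_CREATE));   /* -EEXIST  */
        printf("replace: %d\n", xattr_set("user.b", "3", XATTR_REPLACE));  /* -ENODATA */
        printf("remove:  %d\n", xattr_set("user.a", NULL, 0));             /* 0        */
        return 0;
}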
@@ -2226,6 +2113,7 @@ static int shmem_xattr_validate(const char *name) | |||
2226 | static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, | 2113 | static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, |
2227 | void *buffer, size_t size) | 2114 | void *buffer, size_t size) |
2228 | { | 2115 | { |
2116 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | ||
2229 | int err; | 2117 | int err; |
2230 | 2118 | ||
2231 | /* | 2119 | /* |
@@ -2240,12 +2128,13 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, | |||
2240 | if (err) | 2128 | if (err) |
2241 | return err; | 2129 | return err; |
2242 | 2130 | ||
2243 | return shmem_xattr_get(dentry, name, buffer, size); | 2131 | return simple_xattr_get(&info->xattrs, name, buffer, size); |
2244 | } | 2132 | } |
2245 | 2133 | ||
2246 | static int shmem_setxattr(struct dentry *dentry, const char *name, | 2134 | static int shmem_setxattr(struct dentry *dentry, const char *name, |
2247 | const void *value, size_t size, int flags) | 2135 | const void *value, size_t size, int flags) |
2248 | { | 2136 | { |
2137 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | ||
2249 | int err; | 2138 | int err; |
2250 | 2139 | ||
2251 | /* | 2140 | /* |
@@ -2260,15 +2149,12 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, | |||
2260 | if (err) | 2149 | if (err) |
2261 | return err; | 2150 | return err; |
2262 | 2151 | ||
2263 | if (size == 0) | 2152 | return simple_xattr_set(&info->xattrs, name, value, size, flags); |
2264 | value = ""; /* empty EA, do not remove */ | ||
2265 | |||
2266 | return shmem_xattr_set(dentry->d_inode, name, value, size, flags); | ||
2267 | |||
2268 | } | 2153 | } |
2269 | 2154 | ||
2270 | static int shmem_removexattr(struct dentry *dentry, const char *name) | 2155 | static int shmem_removexattr(struct dentry *dentry, const char *name) |
2271 | { | 2156 | { |
2157 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | ||
2272 | int err; | 2158 | int err; |
2273 | 2159 | ||
2274 | /* | 2160 | /* |
@@ -2283,45 +2169,13 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) | |||
2283 | if (err) | 2169 | if (err) |
2284 | return err; | 2170 | return err; |
2285 | 2171 | ||
2286 | return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); | 2172 | return simple_xattr_remove(&info->xattrs, name); |
2287 | } | ||
2288 | |||
2289 | static bool xattr_is_trusted(const char *name) | ||
2290 | { | ||
2291 | return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); | ||
2292 | } | 2173 | } |
2293 | 2174 | ||
2294 | static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | 2175 | static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) |
2295 | { | 2176 | { |
2296 | bool trusted = capable(CAP_SYS_ADMIN); | 2177 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); |
2297 | struct shmem_xattr *xattr; | 2178 | return simple_xattr_list(&info->xattrs, buffer, size); |
2298 | struct shmem_inode_info *info; | ||
2299 | size_t used = 0; | ||
2300 | |||
2301 | info = SHMEM_I(dentry->d_inode); | ||
2302 | |||
2303 | spin_lock(&info->lock); | ||
2304 | list_for_each_entry(xattr, &info->xattr_list, list) { | ||
2305 | size_t len; | ||
2306 | |||
2307 | /* skip "trusted." attributes for unprivileged callers */ | ||
2308 | if (!trusted && xattr_is_trusted(xattr->name)) | ||
2309 | continue; | ||
2310 | |||
2311 | len = strlen(xattr->name) + 1; | ||
2312 | used += len; | ||
2313 | if (buffer) { | ||
2314 | if (size < used) { | ||
2315 | used = -ERANGE; | ||
2316 | break; | ||
2317 | } | ||
2318 | memcpy(buffer, xattr->name, len); | ||
2319 | buffer += len; | ||
2320 | } | ||
2321 | } | ||
2322 | spin_unlock(&info->lock); | ||
2323 | |||
2324 | return used; | ||
2325 | } | 2179 | } |
2326 | #endif /* CONFIG_TMPFS_XATTR */ | 2180 | #endif /* CONFIG_TMPFS_XATTR */ |
2327 | 2181 | ||
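In the hunks above, shmem_getxattr(), shmem_setxattr(), shmem_removexattr() and shmem_listxattr() shrink to prefix validation plus a call into simple_xattr_get/set/remove/list on SHMEM_I(dentry->d_inode)->xattrs, and the open-coded listxattr loop disappears. The buffer contract that loop implemented is the usual *listxattr one: with a NULL buffer report how many bytes are needed, otherwise copy the NUL-terminated names and fail with -ERANGE if the buffer is too small. A minimal userspace sketch of that contract (illustrative names):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Models the listxattr buffer contract used by the removed loop. */
static ssize_t list_names(const char *const *names, int n,
                          char *buffer, size_t size)
{
        size_t used = 0;
        int i;

        for (i = 0; i < n; i++) {
                size_t len = strlen(names[i]) + 1;   /* include the NUL */

                if (buffer) {
                        if (used + len > size)
                                return -ERANGE;      /* caller's buffer too small */
                        memcpy(buffer + used, names[i], len);
                }
                used += len;
        }
        return (ssize_t)used;                        /* bytes copied or needed */
}

int main(void)
{
        const char *names[] = { "user.a", "security.selinux" };
        char buf[64];

        printf("need %zd bytes\n", list_names(names, 2, NULL, 0));
        printf("copied %zd bytes\n", list_names(names, 2, buf, sizeof(buf)));
        return 0;
}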
@@ -2366,12 +2220,14 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, | |||
2366 | { | 2220 | { |
2367 | struct inode *inode; | 2221 | struct inode *inode; |
2368 | struct dentry *dentry = NULL; | 2222 | struct dentry *dentry = NULL; |
2369 | u64 inum = fid->raw[2]; | 2223 | u64 inum; |
2370 | inum = (inum << 32) | fid->raw[1]; | ||
2371 | 2224 | ||
2372 | if (fh_len < 3) | 2225 | if (fh_len < 3) |
2373 | return NULL; | 2226 | return NULL; |
2374 | 2227 | ||
2228 | inum = fid->raw[2]; | ||
2229 | inum = (inum << 32) | fid->raw[1]; | ||
2230 | |||
2375 | inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), | 2231 | inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), |
2376 | shmem_match, fid->raw); | 2232 | shmem_match, fid->raw); |
2377 | if (inode) { | 2233 | if (inode) { |
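The shmem_fh_to_dentry() hunk above is an ordering fix: the 64-bit inode number is now assembled from fid->raw[2] and fid->raw[1] only after the fh_len < 3 check, instead of in the declaration before it, so a too-short file handle is rejected before those words are ever read. A minimal userspace sketch of the fixed ordering, with a three-word handle standing in for struct fid (illustrative names):

#include <stdint.h>
#include <stdio.h>

struct handle {
        uint32_t raw[3];   /* raw[0] = generation, raw[1..2] = inode number */
        int len;           /* how many raw[] words the caller supplied      */
};

/* Decode only after the length check, as the fixed code does. */
static int decode(const struct handle *h, uint64_t *inum)
{
        if (h->len < 3)
                return -1;              /* reject short handles up front */

        *inum = h->raw[2];
        *inum = (*inum << 32) | h->raw[1];
        return 0;
}

int main(void)
{
        struct handle good  = { { 7, 0x89abcdef, 0x01234567 }, 3 };
        struct handle small = { { 7, 0, 0 }, 1 };
        uint64_t inum;

        if (decode(&good, &inum) == 0)
                printf("inum = %#llx\n", (unsigned long long)inum);
        printf("short handle rejected: %d\n", decode(&small, &inum));
        return 0;
}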
@@ -2788,6 +2644,7 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2788 | .set_policy = shmem_set_policy, | 2644 | .set_policy = shmem_set_policy, |
2789 | .get_policy = shmem_get_policy, | 2645 | .get_policy = shmem_get_policy, |
2790 | #endif | 2646 | #endif |
2647 | .remap_pages = generic_file_remap_pages, | ||
2791 | }; | 2648 | }; |
2792 | 2649 | ||
2793 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2650 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
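The hunks above stop setting VM_CAN_NONLINEAR in shmem_mmap() (and, further down, in shmem_zero_setup()) and instead give shmem_vm_ops a .remap_pages = generic_file_remap_pages operation: "this mapping supports nonlinear remapping" is now expressed by the presence of the callback rather than by a vm_flags bit. A minimal userspace sketch of that capability-as-callback pattern (illustrative names, not the mm API):

#include <stdio.h>

struct mapping_ops {
        /* Present only if the mapping supports nonlinear remapping. */
        int (*remap_pages)(unsigned long addr, unsigned long size,
                           unsigned long pgoff);
};

static int generic_remap(unsigned long addr, unsigned long size,
                         unsigned long pgoff)
{
        printf("remap %#lx+%#lx to pgoff %lu\n", addr, size, pgoff);
        return 0;
}

static const struct mapping_ops shmem_like_ops = { .remap_pages = generic_remap };
static const struct mapping_ops plain_ops      = { .remap_pages = NULL };

/* Replaces the old "if the capability flag is not set, fail" style check. */
static int do_remap(const struct mapping_ops *ops,
                    unsigned long addr, unsigned long size, unsigned long pgoff)
{
        if (!ops->remap_pages)
                return -1;
        return ops->remap_pages(addr, size, pgoff);
}

int main(void)
{
        printf("shmem-like: %d\n", do_remap(&shmem_like_ops, 0x1000, 0x2000, 5));
        printf("plain:      %d\n", do_remap(&plain_ops, 0x1000, 0x2000, 5));
        return 0;
}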
@@ -2981,7 +2838,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
2981 | fput(vma->vm_file); | 2838 | fput(vma->vm_file); |
2982 | vma->vm_file = file; | 2839 | vma->vm_file = file; |
2983 | vma->vm_ops = &shmem_vm_ops; | 2840 | vma->vm_ops = &shmem_vm_ops; |
2984 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
2985 | return 0; | 2841 | return 0; |
2986 | } | 2842 | } |
2987 | 2843 | ||
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -498,14 +498,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
498 | 498 | ||
499 | #endif | 499 | #endif |
500 | 500 | ||
501 | #ifdef CONFIG_TRACING | ||
502 | size_t slab_buffer_size(struct kmem_cache *cachep) | ||
503 | { | ||
504 | return cachep->size; | ||
505 | } | ||
506 | EXPORT_SYMBOL(slab_buffer_size); | ||
507 | #endif | ||
508 | |||
509 | /* | 501 | /* |
510 | * Do not go above this order unless 0 objects fit into the slab or | 502 | * Do not go above this order unless 0 objects fit into the slab or |
511 | * overridden on the command line. | 503 | * overridden on the command line. |
@@ -515,13 +507,6 @@ EXPORT_SYMBOL(slab_buffer_size); | |||
515 | static int slab_max_order = SLAB_MAX_ORDER_LO; | 507 | static int slab_max_order = SLAB_MAX_ORDER_LO; |
516 | static bool slab_max_order_set __initdata; | 508 | static bool slab_max_order_set __initdata; |
517 | 509 | ||
518 | static inline struct kmem_cache *page_get_cache(struct page *page) | ||
519 | { | ||
520 | page = compound_head(page); | ||
521 | BUG_ON(!PageSlab(page)); | ||
522 | return page->slab_cache; | ||
523 | } | ||
524 | |||
525 | static inline struct kmem_cache *virt_to_cache(const void *obj) | 510 | static inline struct kmem_cache *virt_to_cache(const void *obj) |
526 | { | 511 | { |
527 | struct page *page = virt_to_head_page(obj); | 512 | struct page *page = virt_to_head_page(obj); |
@@ -585,9 +570,9 @@ static struct arraycache_init initarray_generic = | |||
585 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 570 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
586 | 571 | ||
587 | /* internal cache of cache description objs */ | 572 | /* internal cache of cache description objs */ |
588 | static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; | 573 | static struct kmem_list3 *kmem_cache_nodelists[MAX_NUMNODES]; |
589 | static struct kmem_cache cache_cache = { | 574 | static struct kmem_cache kmem_cache_boot = { |
590 | .nodelists = cache_cache_nodelists, | 575 | .nodelists = kmem_cache_nodelists, |
591 | .batchcount = 1, | 576 | .batchcount = 1, |
592 | .limit = BOOT_CPUCACHE_ENTRIES, | 577 | .limit = BOOT_CPUCACHE_ENTRIES, |
593 | .shared = 1, | 578 | .shared = 1, |
@@ -810,6 +795,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, | |||
810 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; | 795 | *left_over = slab_size - nr_objs*buffer_size - mgmt_size; |
811 | } | 796 | } |
812 | 797 | ||
798 | #if DEBUG | ||
813 | #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) | 799 | #define slab_error(cachep, msg) __slab_error(__func__, cachep, msg) |
814 | 800 | ||
815 | static void __slab_error(const char *function, struct kmem_cache *cachep, | 801 | static void __slab_error(const char *function, struct kmem_cache *cachep, |
@@ -818,7 +804,9 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
818 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 804 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
819 | function, cachep->name, msg); | 805 | function, cachep->name, msg); |
820 | dump_stack(); | 806 | dump_stack(); |
807 | add_taint(TAINT_BAD_PAGE); | ||
821 | } | 808 | } |
809 | #endif | ||
822 | 810 | ||
823 | /* | 811 | /* |
824 | * By default on NUMA we use alien caches to stage the freeing of | 812 | * By default on NUMA we use alien caches to stage the freeing of |
@@ -900,7 +888,7 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
900 | */ | 888 | */ |
901 | if (keventd_up() && reap_work->work.func == NULL) { | 889 | if (keventd_up() && reap_work->work.func == NULL) { |
902 | init_reap_node(cpu); | 890 | init_reap_node(cpu); |
903 | INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); | 891 | INIT_DEFERRABLE_WORK(reap_work, cache_reap); |
904 | schedule_delayed_work_on(cpu, reap_work, | 892 | schedule_delayed_work_on(cpu, reap_work, |
905 | __round_jiffies_relative(HZ, cpu)); | 893 | __round_jiffies_relative(HZ, cpu)); |
906 | } | 894 | } |
@@ -1601,15 +1589,17 @@ void __init kmem_cache_init(void) | |||
1601 | int order; | 1589 | int order; |
1602 | int node; | 1590 | int node; |
1603 | 1591 | ||
1592 | kmem_cache = &kmem_cache_boot; | ||
1593 | |||
1604 | if (num_possible_nodes() == 1) | 1594 | if (num_possible_nodes() == 1) |
1605 | use_alien_caches = 0; | 1595 | use_alien_caches = 0; |
1606 | 1596 | ||
1607 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1597 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
1608 | kmem_list3_init(&initkmem_list3[i]); | 1598 | kmem_list3_init(&initkmem_list3[i]); |
1609 | if (i < MAX_NUMNODES) | 1599 | if (i < MAX_NUMNODES) |
1610 | cache_cache.nodelists[i] = NULL; | 1600 | kmem_cache->nodelists[i] = NULL; |
1611 | } | 1601 | } |
1612 | set_up_list3s(&cache_cache, CACHE_CACHE); | 1602 | set_up_list3s(kmem_cache, CACHE_CACHE); |
1613 | 1603 | ||
1614 | /* | 1604 | /* |
1615 | * Fragmentation resistance on low memory - only use bigger | 1605 | * Fragmentation resistance on low memory - only use bigger |
@@ -1621,9 +1611,9 @@ void __init kmem_cache_init(void) | |||
1621 | 1611 | ||
1622 | /* Bootstrap is tricky, because several objects are allocated | 1612 | /* Bootstrap is tricky, because several objects are allocated |
1623 | * from caches that do not exist yet: | 1613 | * from caches that do not exist yet: |
1624 | * 1) initialize the cache_cache cache: it contains the struct | 1614 | * 1) initialize the kmem_cache cache: it contains the struct |
1625 | * kmem_cache structures of all caches, except cache_cache itself: | 1615 | * kmem_cache structures of all caches, except kmem_cache itself: |
1626 | * cache_cache is statically allocated. | 1616 | * kmem_cache is statically allocated. |
1627 | * Initially an __init data area is used for the head array and the | 1617 | * Initially an __init data area is used for the head array and the |
1628 | * kmem_list3 structures, it's replaced with a kmalloc allocated | 1618 | * kmem_list3 structures, it's replaced with a kmalloc allocated |
1629 | * array at the end of the bootstrap. | 1619 | * array at the end of the bootstrap. |
@@ -1632,43 +1622,43 @@ void __init kmem_cache_init(void) | |||
1632 | * An __init data area is used for the head array. | 1622 | * An __init data area is used for the head array. |
1633 | * 3) Create the remaining kmalloc caches, with minimally sized | 1623 | * 3) Create the remaining kmalloc caches, with minimally sized |
1634 | * head arrays. | 1624 | * head arrays. |
1635 | * 4) Replace the __init data head arrays for cache_cache and the first | 1625 | * 4) Replace the __init data head arrays for kmem_cache and the first |
1636 | * kmalloc cache with kmalloc allocated arrays. | 1626 | * kmalloc cache with kmalloc allocated arrays. |
1637 | * 5) Replace the __init data for kmem_list3 for cache_cache and | 1627 | * 5) Replace the __init data for kmem_list3 for kmem_cache and |
1638 | * the other cache's with kmalloc allocated memory. | 1628 | * the other cache's with kmalloc allocated memory. |
1639 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. | 1629 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. |
1640 | */ | 1630 | */ |
1641 | 1631 | ||
1642 | node = numa_mem_id(); | 1632 | node = numa_mem_id(); |
1643 | 1633 | ||
1644 | /* 1) create the cache_cache */ | 1634 | /* 1) create the kmem_cache */ |
1645 | INIT_LIST_HEAD(&slab_caches); | 1635 | INIT_LIST_HEAD(&slab_caches); |
1646 | list_add(&cache_cache.list, &slab_caches); | 1636 | list_add(&kmem_cache->list, &slab_caches); |
1647 | cache_cache.colour_off = cache_line_size(); | 1637 | kmem_cache->colour_off = cache_line_size(); |
1648 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1638 | kmem_cache->array[smp_processor_id()] = &initarray_cache.cache; |
1649 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1639 | kmem_cache->nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
1650 | 1640 | ||
1651 | /* | 1641 | /* |
1652 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1642 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1653 | */ | 1643 | */ |
1654 | cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1644 | kmem_cache->size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1655 | nr_node_ids * sizeof(struct kmem_list3 *); | 1645 | nr_node_ids * sizeof(struct kmem_list3 *); |
1656 | cache_cache.object_size = cache_cache.size; | 1646 | kmem_cache->object_size = kmem_cache->size; |
1657 | cache_cache.size = ALIGN(cache_cache.size, | 1647 | kmem_cache->size = ALIGN(kmem_cache->object_size, |
1658 | cache_line_size()); | 1648 | cache_line_size()); |
1659 | cache_cache.reciprocal_buffer_size = | 1649 | kmem_cache->reciprocal_buffer_size = |
1660 | reciprocal_value(cache_cache.size); | 1650 | reciprocal_value(kmem_cache->size); |
1661 | 1651 | ||
1662 | for (order = 0; order < MAX_ORDER; order++) { | 1652 | for (order = 0; order < MAX_ORDER; order++) { |
1663 | cache_estimate(order, cache_cache.size, | 1653 | cache_estimate(order, kmem_cache->size, |
1664 | cache_line_size(), 0, &left_over, &cache_cache.num); | 1654 | cache_line_size(), 0, &left_over, &kmem_cache->num); |
1665 | if (cache_cache.num) | 1655 | if (kmem_cache->num) |
1666 | break; | 1656 | break; |
1667 | } | 1657 | } |
1668 | BUG_ON(!cache_cache.num); | 1658 | BUG_ON(!kmem_cache->num); |
1669 | cache_cache.gfporder = order; | 1659 | kmem_cache->gfporder = order; |
1670 | cache_cache.colour = left_over / cache_cache.colour_off; | 1660 | kmem_cache->colour = left_over / kmem_cache->colour_off; |
1671 | cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + | 1661 | kmem_cache->slab_size = ALIGN(kmem_cache->num * sizeof(kmem_bufctl_t) + |
1672 | sizeof(struct slab), cache_line_size()); | 1662 | sizeof(struct slab), cache_line_size()); |
1673 | 1663 | ||
1674 | /* 2+3) create the kmalloc caches */ | 1664 | /* 2+3) create the kmalloc caches */ |
@@ -1681,19 +1671,22 @@ void __init kmem_cache_init(void) | |||
1681 | * bug. | 1671 | * bug. |
1682 | */ | 1672 | */ |
1683 | 1673 | ||
1684 | sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, | 1674 | sizes[INDEX_AC].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1685 | sizes[INDEX_AC].cs_size, | 1675 | sizes[INDEX_AC].cs_cachep->name = names[INDEX_AC].name; |
1686 | ARCH_KMALLOC_MINALIGN, | 1676 | sizes[INDEX_AC].cs_cachep->size = sizes[INDEX_AC].cs_size; |
1687 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1677 | sizes[INDEX_AC].cs_cachep->object_size = sizes[INDEX_AC].cs_size; |
1688 | NULL); | 1678 | sizes[INDEX_AC].cs_cachep->align = ARCH_KMALLOC_MINALIGN; |
1679 | __kmem_cache_create(sizes[INDEX_AC].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1680 | list_add(&sizes[INDEX_AC].cs_cachep->list, &slab_caches); | ||
1689 | 1681 | ||
1690 | if (INDEX_AC != INDEX_L3) { | 1682 | if (INDEX_AC != INDEX_L3) { |
1691 | sizes[INDEX_L3].cs_cachep = | 1683 | sizes[INDEX_L3].cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1692 | __kmem_cache_create(names[INDEX_L3].name, | 1684 | sizes[INDEX_L3].cs_cachep->name = names[INDEX_L3].name; |
1693 | sizes[INDEX_L3].cs_size, | 1685 | sizes[INDEX_L3].cs_cachep->size = sizes[INDEX_L3].cs_size; |
1694 | ARCH_KMALLOC_MINALIGN, | 1686 | sizes[INDEX_L3].cs_cachep->object_size = sizes[INDEX_L3].cs_size; |
1695 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1687 | sizes[INDEX_L3].cs_cachep->align = ARCH_KMALLOC_MINALIGN; |
1696 | NULL); | 1688 | __kmem_cache_create(sizes[INDEX_L3].cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); |
1689 | list_add(&sizes[INDEX_L3].cs_cachep->list, &slab_caches); | ||
1697 | } | 1690 | } |
1698 | 1691 | ||
1699 | slab_early_init = 0; | 1692 | slab_early_init = 0; |
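In the kmem_cache_init() hunks above, the statically allocated descriptor cache is renamed from cache_cache to kmem_cache_boot and published through the kmem_cache pointer, and the early kmalloc caches are no longer handed back by __kmem_cache_create(): the boot code allocates each struct kmem_cache from kmem_cache with kmem_cache_zalloc(), fills in name, size, object_size and align itself, calls __kmem_cache_create(cachep, flags) and links the result onto slab_caches. A minimal userspace sketch of that "static cache of caches" bootstrap, with calloc standing in for slab pages and all names illustrative:

#include <stdio.h>
#include <stdlib.h>

struct cache {
        const char *name;
        size_t object_size;
};

/* The one statically allocated descriptor, like kmem_cache_boot. */
static struct cache boot_cache = { "cache-of-caches", sizeof(struct cache) };
static struct cache *cache_of_caches;   /* plays the role of the kmem_cache pointer */

static void *cache_zalloc(struct cache *c)
{
        return calloc(1, c->object_size);
}

/* New-style setup: the descriptor is preallocated and prefilled by the caller. */
static int cache_setup(struct cache *c, unsigned long flags)
{
        (void)flags;
        return c->object_size ? 0 : -1;
}

static struct cache *create_cache(const char *name, size_t size)
{
        struct cache *c = cache_zalloc(cache_of_caches);

        if (!c)
                return NULL;
        c->name = name;
        c->object_size = size;
        if (cache_setup(c, 0)) {
                free(c);
                return NULL;
        }
        return c;
}

int main(void)
{
        struct cache *c;

        cache_of_caches = &boot_cache;   /* kmem_cache = &kmem_cache_boot */
        c = create_cache("kmalloc-32", 32);
        if (!c)
                return 1;
        printf("created %s (%zu bytes per object)\n", c->name, c->object_size);
        free(c);
        return 0;
}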
@@ -1707,20 +1700,23 @@ void __init kmem_cache_init(void) | |||
1707 | * allow tighter packing of the smaller caches. | 1700 | * allow tighter packing of the smaller caches. |
1708 | */ | 1701 | */ |
1709 | if (!sizes->cs_cachep) { | 1702 | if (!sizes->cs_cachep) { |
1710 | sizes->cs_cachep = __kmem_cache_create(names->name, | 1703 | sizes->cs_cachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1711 | sizes->cs_size, | 1704 | sizes->cs_cachep->name = names->name; |
1712 | ARCH_KMALLOC_MINALIGN, | 1705 | sizes->cs_cachep->size = sizes->cs_size; |
1713 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1706 | sizes->cs_cachep->object_size = sizes->cs_size; |
1714 | NULL); | 1707 | sizes->cs_cachep->align = ARCH_KMALLOC_MINALIGN; |
1708 | __kmem_cache_create(sizes->cs_cachep, ARCH_KMALLOC_FLAGS|SLAB_PANIC); | ||
1709 | list_add(&sizes->cs_cachep->list, &slab_caches); | ||
1715 | } | 1710 | } |
1716 | #ifdef CONFIG_ZONE_DMA | 1711 | #ifdef CONFIG_ZONE_DMA |
1717 | sizes->cs_dmacachep = __kmem_cache_create( | 1712 | sizes->cs_dmacachep = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
1718 | names->name_dma, | 1713 | sizes->cs_dmacachep->name = names->name_dma; |
1719 | sizes->cs_size, | 1714 | sizes->cs_dmacachep->size = sizes->cs_size; |
1720 | ARCH_KMALLOC_MINALIGN, | 1715 | sizes->cs_dmacachep->object_size = sizes->cs_size; |
1721 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| | 1716 | sizes->cs_dmacachep->align = ARCH_KMALLOC_MINALIGN; |
1722 | SLAB_PANIC, | 1717 | __kmem_cache_create(sizes->cs_dmacachep, |
1723 | NULL); | 1718 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| SLAB_PANIC); |
1719 | list_add(&sizes->cs_dmacachep->list, &slab_caches); | ||
1724 | #endif | 1720 | #endif |
1725 | sizes++; | 1721 | sizes++; |
1726 | names++; | 1722 | names++; |
@@ -1731,15 +1727,15 @@ void __init kmem_cache_init(void) | |||
1731 | 1727 | ||
1732 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1728 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1733 | 1729 | ||
1734 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); | 1730 | BUG_ON(cpu_cache_get(kmem_cache) != &initarray_cache.cache); |
1735 | memcpy(ptr, cpu_cache_get(&cache_cache), | 1731 | memcpy(ptr, cpu_cache_get(kmem_cache), |
1736 | sizeof(struct arraycache_init)); | 1732 | sizeof(struct arraycache_init)); |
1737 | /* | 1733 | /* |
1738 | * Do not assume that spinlocks can be initialized via memcpy: | 1734 | * Do not assume that spinlocks can be initialized via memcpy: |
1739 | */ | 1735 | */ |
1740 | spin_lock_init(&ptr->lock); | 1736 | spin_lock_init(&ptr->lock); |
1741 | 1737 | ||
1742 | cache_cache.array[smp_processor_id()] = ptr; | 1738 | kmem_cache->array[smp_processor_id()] = ptr; |
1743 | 1739 | ||
1744 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | 1740 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
1745 | 1741 | ||
@@ -1760,7 +1756,7 @@ void __init kmem_cache_init(void) | |||
1760 | int nid; | 1756 | int nid; |
1761 | 1757 | ||
1762 | for_each_online_node(nid) { | 1758 | for_each_online_node(nid) { |
1763 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE + nid], nid); | 1759 | init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid); |
1764 | 1760 | ||
1765 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1761 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
1766 | &initkmem_list3[SIZE_AC + nid], nid); | 1762 | &initkmem_list3[SIZE_AC + nid], nid); |
@@ -1781,9 +1777,6 @@ void __init kmem_cache_init_late(void) | |||
1781 | 1777 | ||
1782 | slab_state = UP; | 1778 | slab_state = UP; |
1783 | 1779 | ||
1784 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1785 | init_lock_keys(); | ||
1786 | |||
1787 | /* 6) resize the head arrays to their final sizes */ | 1780 | /* 6) resize the head arrays to their final sizes */ |
1788 | mutex_lock(&slab_mutex); | 1781 | mutex_lock(&slab_mutex); |
1789 | list_for_each_entry(cachep, &slab_caches, list) | 1782 | list_for_each_entry(cachep, &slab_caches, list) |
@@ -1791,6 +1784,9 @@ void __init kmem_cache_init_late(void) | |||
1791 | BUG(); | 1784 | BUG(); |
1792 | mutex_unlock(&slab_mutex); | 1785 | mutex_unlock(&slab_mutex); |
1793 | 1786 | ||
1787 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1788 | init_lock_keys(); | ||
1789 | |||
1794 | /* Done! */ | 1790 | /* Done! */ |
1795 | slab_state = FULL; | 1791 | slab_state = FULL; |
1796 | 1792 | ||
@@ -2209,27 +2205,6 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | |||
2209 | } | 2205 | } |
2210 | } | 2206 | } |
2211 | 2207 | ||
2212 | static void __kmem_cache_destroy(struct kmem_cache *cachep) | ||
2213 | { | ||
2214 | int i; | ||
2215 | struct kmem_list3 *l3; | ||
2216 | |||
2217 | for_each_online_cpu(i) | ||
2218 | kfree(cachep->array[i]); | ||
2219 | |||
2220 | /* NUMA: free the list3 structures */ | ||
2221 | for_each_online_node(i) { | ||
2222 | l3 = cachep->nodelists[i]; | ||
2223 | if (l3) { | ||
2224 | kfree(l3->shared); | ||
2225 | free_alien_cache(l3->alien); | ||
2226 | kfree(l3); | ||
2227 | } | ||
2228 | } | ||
2229 | kmem_cache_free(&cache_cache, cachep); | ||
2230 | } | ||
2231 | |||
2232 | |||
2233 | /** | 2208 | /** |
2234 | * calculate_slab_order - calculate size (page order) of slabs | 2209 | * calculate_slab_order - calculate size (page order) of slabs |
2235 | * @cachep: pointer to the cache that is being created | 2210 | * @cachep: pointer to the cache that is being created |
@@ -2366,9 +2341,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2366 | * Cannot be called within a int, but can be interrupted. | 2341 | * Cannot be called within a int, but can be interrupted. |
2367 | * The @ctor is run when new pages are allocated by the cache. | 2342 | * The @ctor is run when new pages are allocated by the cache. |
2368 | * | 2343 | * |
2369 | * @name must be valid until the cache is destroyed. This implies that | ||
2370 | * the module calling this has to destroy the cache before getting unloaded. | ||
2371 | * | ||
2372 | * The flags are | 2344 | * The flags are |
2373 | * | 2345 | * |
2374 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | 2346 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) |
@@ -2381,13 +2353,13 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2381 | * cacheline. This can be beneficial if you're counting cycles as closely | 2353 | * cacheline. This can be beneficial if you're counting cycles as closely |
2382 | * as davem. | 2354 | * as davem. |
2383 | */ | 2355 | */ |
2384 | struct kmem_cache * | 2356 | int |
2385 | __kmem_cache_create (const char *name, size_t size, size_t align, | 2357 | __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) |
2386 | unsigned long flags, void (*ctor)(void *)) | ||
2387 | { | 2358 | { |
2388 | size_t left_over, slab_size, ralign; | 2359 | size_t left_over, slab_size, ralign; |
2389 | struct kmem_cache *cachep = NULL; | ||
2390 | gfp_t gfp; | 2360 | gfp_t gfp; |
2361 | int err; | ||
2362 | size_t size = cachep->size; | ||
2391 | 2363 | ||
2392 | #if DEBUG | 2364 | #if DEBUG |
2393 | #if FORCED_DEBUG | 2365 | #if FORCED_DEBUG |
@@ -2459,8 +2431,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2459 | ralign = ARCH_SLAB_MINALIGN; | 2431 | ralign = ARCH_SLAB_MINALIGN; |
2460 | } | 2432 | } |
2461 | /* 3) caller mandated alignment */ | 2433 | /* 3) caller mandated alignment */ |
2462 | if (ralign < align) { | 2434 | if (ralign < cachep->align) { |
2463 | ralign = align; | 2435 | ralign = cachep->align; |
2464 | } | 2436 | } |
2465 | /* disable debug if necessary */ | 2437 | /* disable debug if necessary */ |
2466 | if (ralign > __alignof__(unsigned long long)) | 2438 | if (ralign > __alignof__(unsigned long long)) |
@@ -2468,21 +2440,14 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2468 | /* | 2440 | /* |
2469 | * 4) Store it. | 2441 | * 4) Store it. |
2470 | */ | 2442 | */ |
2471 | align = ralign; | 2443 | cachep->align = ralign; |
2472 | 2444 | ||
2473 | if (slab_is_available()) | 2445 | if (slab_is_available()) |
2474 | gfp = GFP_KERNEL; | 2446 | gfp = GFP_KERNEL; |
2475 | else | 2447 | else |
2476 | gfp = GFP_NOWAIT; | 2448 | gfp = GFP_NOWAIT; |
2477 | 2449 | ||
2478 | /* Get cache's description obj. */ | ||
2479 | cachep = kmem_cache_zalloc(&cache_cache, gfp); | ||
2480 | if (!cachep) | ||
2481 | return NULL; | ||
2482 | |||
2483 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | 2450 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; |
2484 | cachep->object_size = size; | ||
2485 | cachep->align = align; | ||
2486 | #if DEBUG | 2451 | #if DEBUG |
2487 | 2452 | ||
2488 | /* | 2453 | /* |
@@ -2506,8 +2471,9 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2506 | } | 2471 | } |
2507 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2472 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
2508 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size | 2473 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
2509 | && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { | 2474 | && cachep->object_size > cache_line_size() |
2510 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); | 2475 | && ALIGN(size, cachep->align) < PAGE_SIZE) { |
2476 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); | ||
2511 | size = PAGE_SIZE; | 2477 | size = PAGE_SIZE; |
2512 | } | 2478 | } |
2513 | #endif | 2479 | #endif |
@@ -2527,18 +2493,15 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2527 | */ | 2493 | */ |
2528 | flags |= CFLGS_OFF_SLAB; | 2494 | flags |= CFLGS_OFF_SLAB; |
2529 | 2495 | ||
2530 | size = ALIGN(size, align); | 2496 | size = ALIGN(size, cachep->align); |
2531 | 2497 | ||
2532 | left_over = calculate_slab_order(cachep, size, align, flags); | 2498 | left_over = calculate_slab_order(cachep, size, cachep->align, flags); |
2499 | |||
2500 | if (!cachep->num) | ||
2501 | return -E2BIG; | ||
2533 | 2502 | ||
2534 | if (!cachep->num) { | ||
2535 | printk(KERN_ERR | ||
2536 | "kmem_cache_create: couldn't create cache %s.\n", name); | ||
2537 | kmem_cache_free(&cache_cache, cachep); | ||
2538 | return NULL; | ||
2539 | } | ||
2540 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) | 2503 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) |
2541 | + sizeof(struct slab), align); | 2504 | + sizeof(struct slab), cachep->align); |
2542 | 2505 | ||
2543 | /* | 2506 | /* |
2544 | * If the slab has been placed off-slab, and we have enough space then | 2507 | * If the slab has been placed off-slab, and we have enough space then |
@@ -2566,8 +2529,8 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2566 | 2529 | ||
2567 | cachep->colour_off = cache_line_size(); | 2530 | cachep->colour_off = cache_line_size(); |
2568 | /* Offset must be a multiple of the alignment. */ | 2531 | /* Offset must be a multiple of the alignment. */ |
2569 | if (cachep->colour_off < align) | 2532 | if (cachep->colour_off < cachep->align) |
2570 | cachep->colour_off = align; | 2533 | cachep->colour_off = cachep->align; |
2571 | cachep->colour = left_over / cachep->colour_off; | 2534 | cachep->colour = left_over / cachep->colour_off; |
2572 | cachep->slab_size = slab_size; | 2535 | cachep->slab_size = slab_size; |
2573 | cachep->flags = flags; | 2536 | cachep->flags = flags; |
@@ -2588,12 +2551,11 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2588 | */ | 2551 | */ |
2589 | BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); | 2552 | BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); |
2590 | } | 2553 | } |
2591 | cachep->ctor = ctor; | ||
2592 | cachep->name = name; | ||
2593 | 2554 | ||
2594 | if (setup_cpu_cache(cachep, gfp)) { | 2555 | err = setup_cpu_cache(cachep, gfp); |
2595 | __kmem_cache_destroy(cachep); | 2556 | if (err) { |
2596 | return NULL; | 2557 | __kmem_cache_shutdown(cachep); |
2558 | return err; | ||
2597 | } | 2559 | } |
2598 | 2560 | ||
2599 | if (flags & SLAB_DEBUG_OBJECTS) { | 2561 | if (flags & SLAB_DEBUG_OBJECTS) { |
@@ -2606,9 +2568,7 @@ __kmem_cache_create (const char *name, size_t size, size_t align, | |||
2606 | slab_set_debugobj_lock_classes(cachep); | 2568 | slab_set_debugobj_lock_classes(cachep); |
2607 | } | 2569 | } |
2608 | 2570 | ||
2609 | /* cache setup completed, link it into the list */ | 2571 | return 0; |
2610 | list_add(&cachep->list, &slab_caches); | ||
2611 | return cachep; | ||
2612 | } | 2572 | } |
2613 | 2573 | ||
2614 | #if DEBUG | 2574 | #if DEBUG |
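The __kmem_cache_create() hunks above change its contract: instead of taking (name, size, align, flags, ctor), allocating its own descriptor from cache_cache and returning a pointer (or NULL), it now receives an already-filled struct kmem_cache, honours cachep->size and cachep->align, and returns 0 or a negative error: -E2BIG when no objects fit a slab, or the setup_cpu_cache() error after undoing the partial setup via __kmem_cache_shutdown(). A minimal userspace sketch of that return-an-errno convention; the names, the 4096-byte "page" and the failure thresholds are illustrative only:

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct cache { size_t size; size_t align; unsigned int num; };

static int shutdown_cache(struct cache *c)
{
        c->num = 0;      /* undo whatever partial setup happened */
        return 0;
}

/* New-style: operate on a caller-provided descriptor, return 0 or -errno. */
static int cache_create(struct cache *c, unsigned long flags)
{
        (void)flags;
        c->num = (unsigned int)(4096 / c->size);  /* objects per "page", roughly */
        if (!c->num)
                return -E2BIG;          /* nothing fits, as in the diff           */
        if (c->align > 4096) {          /* stand-in for setup_cpu_cache() failing */
                shutdown_cache(c);      /* tear down before propagating the error */
                return -ENOMEM;
        }
        return 0;
}

int main(void)
{
        struct cache ok      = { .size = 128,     .align = 64 };
        struct cache too_big = { .size = 1 << 20, .align = 64 };
        int err = cache_create(&ok, 0);

        printf("ok:      %d (objects per page: %u)\n", err, ok.num);
        printf("too big: %d\n", cache_create(&too_big, 0));
        return 0;
}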
@@ -2767,49 +2727,29 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
2767 | } | 2727 | } |
2768 | EXPORT_SYMBOL(kmem_cache_shrink); | 2728 | EXPORT_SYMBOL(kmem_cache_shrink); |
2769 | 2729 | ||
2770 | /** | 2730 | int __kmem_cache_shutdown(struct kmem_cache *cachep) |
2771 | * kmem_cache_destroy - delete a cache | ||
2772 | * @cachep: the cache to destroy | ||
2773 | * | ||
2774 | * Remove a &struct kmem_cache object from the slab cache. | ||
2775 | * | ||
2776 | * It is expected this function will be called by a module when it is | ||
2777 | * unloaded. This will remove the cache completely, and avoid a duplicate | ||
2778 | * cache being allocated each time a module is loaded and unloaded, if the | ||
2779 | * module doesn't have persistent in-kernel storage across loads and unloads. | ||
2780 | * | ||
2781 | * The cache must be empty before calling this function. | ||
2782 | * | ||
2783 | * The caller must guarantee that no one will allocate memory from the cache | ||
2784 | * during the kmem_cache_destroy(). | ||
2785 | */ | ||
2786 | void kmem_cache_destroy(struct kmem_cache *cachep) | ||
2787 | { | 2731 | { |
2788 | BUG_ON(!cachep || in_interrupt()); | 2732 | int i; |
2733 | struct kmem_list3 *l3; | ||
2734 | int rc = __cache_shrink(cachep); | ||
2789 | 2735 | ||
2790 | /* Find the cache in the chain of caches. */ | 2736 | if (rc) |
2791 | get_online_cpus(); | 2737 | return rc; |
2792 | mutex_lock(&slab_mutex); | ||
2793 | /* | ||
2794 | * the chain is never empty, cache_cache is never destroyed | ||
2795 | */ | ||
2796 | list_del(&cachep->list); | ||
2797 | if (__cache_shrink(cachep)) { | ||
2798 | slab_error(cachep, "Can't free all objects"); | ||
2799 | list_add(&cachep->list, &slab_caches); | ||
2800 | mutex_unlock(&slab_mutex); | ||
2801 | put_online_cpus(); | ||
2802 | return; | ||
2803 | } | ||
2804 | 2738 | ||
2805 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 2739 | for_each_online_cpu(i) |
2806 | rcu_barrier(); | 2740 | kfree(cachep->array[i]); |
2807 | 2741 | ||
2808 | __kmem_cache_destroy(cachep); | 2742 | /* NUMA: free the list3 structures */ |
2809 | mutex_unlock(&slab_mutex); | 2743 | for_each_online_node(i) { |
2810 | put_online_cpus(); | 2744 | l3 = cachep->nodelists[i]; |
2745 | if (l3) { | ||
2746 | kfree(l3->shared); | ||
2747 | free_alien_cache(l3->alien); | ||
2748 | kfree(l3); | ||
2749 | } | ||
2750 | } | ||
2751 | return 0; | ||
2811 | } | 2752 | } |
2812 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
2813 | 2753 | ||
2814 | /* | 2754 | /* |
2815 | * Get the memory for a slab management obj. | 2755 | * Get the memory for a slab management obj. |
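In the hunk above, kmem_cache_destroy() itself (its kerneldoc, the get_online_cpus()/slab_mutex dance, the RCU barrier and the EXPORT_SYMBOL) leaves mm/slab.c for the common slab layer; what stays behind is the allocator hook __kmem_cache_shutdown(), which shrinks the cache and, only when no objects remain, frees the per-CPU array caches and the per-node kmem_list3 structures, returning nonzero otherwise so the caller can keep the cache alive. A minimal userspace sketch of that hook-plus-wrapper split (illustrative names):

#include <stdio.h>
#include <stdlib.h>

struct cache {
        int live_objects;
        void *percpu[4];        /* stands in for the per-CPU array caches */
};

/* Allocator hook: refuse to shut down while objects are still live. */
static int cache_shutdown(struct cache *c)
{
        int i;

        if (c->live_objects)
                return c->live_objects;      /* nonzero: caller must keep the cache */
        for (i = 0; i < 4; i++) {
                free(c->percpu[i]);
                c->percpu[i] = NULL;
        }
        return 0;
}

/* Common-layer wrapper: try the hook, back off if it reports live objects. */
static void cache_destroy(struct cache *c)
{
        if (cache_shutdown(c)) {
                fprintf(stderr, "cache busy, not destroyed\n");
                return;                      /* would be re-linked into the list */
        }
        printf("cache destroyed\n");
}

int main(void)
{
        struct cache busy = { .live_objects = 3 };
        struct cache idle = { .live_objects = 0 };

        cache_destroy(&busy);
        cache_destroy(&idle);
        return 0;
}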
@@ -3098,7 +3038,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | |||
3098 | } | 3038 | } |
3099 | 3039 | ||
3100 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | 3040 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, |
3101 | void *caller) | 3041 | unsigned long caller) |
3102 | { | 3042 | { |
3103 | struct page *page; | 3043 | struct page *page; |
3104 | unsigned int objnr; | 3044 | unsigned int objnr; |
@@ -3118,7 +3058,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3118 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 3058 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
3119 | } | 3059 | } |
3120 | if (cachep->flags & SLAB_STORE_USER) | 3060 | if (cachep->flags & SLAB_STORE_USER) |
3121 | *dbg_userword(cachep, objp) = caller; | 3061 | *dbg_userword(cachep, objp) = (void *)caller; |
3122 | 3062 | ||
3123 | objnr = obj_to_index(cachep, slabp, objp); | 3063 | objnr = obj_to_index(cachep, slabp, objp); |
3124 | 3064 | ||
@@ -3131,7 +3071,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3131 | if (cachep->flags & SLAB_POISON) { | 3071 | if (cachep->flags & SLAB_POISON) { |
3132 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3072 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3133 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 3073 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
3134 | store_stackinfo(cachep, objp, (unsigned long)caller); | 3074 | store_stackinfo(cachep, objp, caller); |
3135 | kernel_map_pages(virt_to_page(objp), | 3075 | kernel_map_pages(virt_to_page(objp), |
3136 | cachep->size / PAGE_SIZE, 0); | 3076 | cachep->size / PAGE_SIZE, 0); |
3137 | } else { | 3077 | } else { |
@@ -3285,7 +3225,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | |||
3285 | 3225 | ||
3286 | #if DEBUG | 3226 | #if DEBUG |
3287 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | 3227 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
3288 | gfp_t flags, void *objp, void *caller) | 3228 | gfp_t flags, void *objp, unsigned long caller) |
3289 | { | 3229 | { |
3290 | if (!objp) | 3230 | if (!objp) |
3291 | return objp; | 3231 | return objp; |
@@ -3302,7 +3242,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3302 | poison_obj(cachep, objp, POISON_INUSE); | 3242 | poison_obj(cachep, objp, POISON_INUSE); |
3303 | } | 3243 | } |
3304 | if (cachep->flags & SLAB_STORE_USER) | 3244 | if (cachep->flags & SLAB_STORE_USER) |
3305 | *dbg_userword(cachep, objp) = caller; | 3245 | *dbg_userword(cachep, objp) = (void *)caller; |
3306 | 3246 | ||
3307 | if (cachep->flags & SLAB_RED_ZONE) { | 3247 | if (cachep->flags & SLAB_RED_ZONE) { |
3308 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || | 3248 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || |
@@ -3343,7 +3283,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3343 | 3283 | ||
3344 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | 3284 | static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) |
3345 | { | 3285 | { |
3346 | if (cachep == &cache_cache) | 3286 | if (cachep == kmem_cache) |
3347 | return false; | 3287 | return false; |
3348 | 3288 | ||
3349 | return should_failslab(cachep->object_size, flags, cachep->flags); | 3289 | return should_failslab(cachep->object_size, flags, cachep->flags); |
@@ -3576,8 +3516,8 @@ done: | |||
3576 | * Fallback to other node is possible if __GFP_THISNODE is not set. | 3516 | * Fallback to other node is possible if __GFP_THISNODE is not set. |
3577 | */ | 3517 | */ |
3578 | static __always_inline void * | 3518 | static __always_inline void * |
3579 | __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | 3519 | slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, |
3580 | void *caller) | 3520 | unsigned long caller) |
3581 | { | 3521 | { |
3582 | unsigned long save_flags; | 3522 | unsigned long save_flags; |
3583 | void *ptr; | 3523 | void *ptr; |
@@ -3663,7 +3603,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3663 | #endif /* CONFIG_NUMA */ | 3603 | #endif /* CONFIG_NUMA */ |
3664 | 3604 | ||
3665 | static __always_inline void * | 3605 | static __always_inline void * |
3666 | __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | 3606 | slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) |
3667 | { | 3607 | { |
3668 | unsigned long save_flags; | 3608 | unsigned long save_flags; |
3669 | void *objp; | 3609 | void *objp; |
@@ -3799,7 +3739,7 @@ free_done: | |||
3799 | * be in this state _before_ it is released. Called with disabled ints. | 3739 | * be in this state _before_ it is released. Called with disabled ints. |
3800 | */ | 3740 | */ |
3801 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, | 3741 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, |
3802 | void *caller) | 3742 | unsigned long caller) |
3803 | { | 3743 | { |
3804 | struct array_cache *ac = cpu_cache_get(cachep); | 3744 | struct array_cache *ac = cpu_cache_get(cachep); |
3805 | 3745 | ||
@@ -3839,7 +3779,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3839 | */ | 3779 | */ |
3840 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3780 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3841 | { | 3781 | { |
3842 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3782 | void *ret = slab_alloc(cachep, flags, _RET_IP_); |
3843 | 3783 | ||
3844 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3784 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3845 | cachep->object_size, cachep->size, flags); | 3785 | cachep->object_size, cachep->size, flags); |
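In the hunks above and below, the debug/trace caller cookie threaded through slab_alloc(), slab_alloc_node(), __cache_free() and the kmalloc wrappers changes type from void * to unsigned long, and the entry points pass _RET_IP_ (the kernel macro for (unsigned long)__builtin_return_address(0)) instead of the raw builtin, so SLAB_STORE_USER and the trace points no longer cast back and forth. A minimal userspace sketch of recording a caller address as an unsigned long; RET_IP below is a local stand-in, not the kernel's _RET_IP_:

#include <stdio.h>
#include <stdlib.h>

/* Local stand-in for the kernel's _RET_IP_ macro. */
#define RET_IP ((unsigned long)__builtin_return_address(0))

static unsigned long last_caller;   /* what SLAB_STORE_USER would record */

static void *traced_alloc(size_t size, unsigned long caller)
{
        last_caller = caller;       /* no void * casts needed any more */
        return malloc(size);
}

void *my_malloc(size_t size)
{
        return traced_alloc(size, RET_IP);   /* records my_malloc's caller */
}

int main(void)
{
        void *p = my_malloc(32);

        printf("allocation requested from %#lx\n", last_caller);
        free(p);
        return 0;
}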
@@ -3850,14 +3790,14 @@ EXPORT_SYMBOL(kmem_cache_alloc); | |||
3850 | 3790 | ||
3851 | #ifdef CONFIG_TRACING | 3791 | #ifdef CONFIG_TRACING |
3852 | void * | 3792 | void * |
3853 | kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) | 3793 | kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) |
3854 | { | 3794 | { |
3855 | void *ret; | 3795 | void *ret; |
3856 | 3796 | ||
3857 | ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3797 | ret = slab_alloc(cachep, flags, _RET_IP_); |
3858 | 3798 | ||
3859 | trace_kmalloc(_RET_IP_, ret, | 3799 | trace_kmalloc(_RET_IP_, ret, |
3860 | size, slab_buffer_size(cachep), flags); | 3800 | size, cachep->size, flags); |
3861 | return ret; | 3801 | return ret; |
3862 | } | 3802 | } |
3863 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 3803 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
@@ -3866,8 +3806,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); | |||
3866 | #ifdef CONFIG_NUMA | 3806 | #ifdef CONFIG_NUMA |
3867 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3807 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
3868 | { | 3808 | { |
3869 | void *ret = __cache_alloc_node(cachep, flags, nodeid, | 3809 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3870 | __builtin_return_address(0)); | ||
3871 | 3810 | ||
3872 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3811 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3873 | cachep->object_size, cachep->size, | 3812 | cachep->object_size, cachep->size, |
@@ -3878,17 +3817,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3878 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3817 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
3879 | 3818 | ||
3880 | #ifdef CONFIG_TRACING | 3819 | #ifdef CONFIG_TRACING |
3881 | void *kmem_cache_alloc_node_trace(size_t size, | 3820 | void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, |
3882 | struct kmem_cache *cachep, | ||
3883 | gfp_t flags, | 3821 | gfp_t flags, |
3884 | int nodeid) | 3822 | int nodeid, |
3823 | size_t size) | ||
3885 | { | 3824 | { |
3886 | void *ret; | 3825 | void *ret; |
3887 | 3826 | ||
3888 | ret = __cache_alloc_node(cachep, flags, nodeid, | 3827 | ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
3889 | __builtin_return_address(0)); | 3828 | |
3890 | trace_kmalloc_node(_RET_IP_, ret, | 3829 | trace_kmalloc_node(_RET_IP_, ret, |
3891 | size, slab_buffer_size(cachep), | 3830 | size, cachep->size, |
3892 | flags, nodeid); | 3831 | flags, nodeid); |
3893 | return ret; | 3832 | return ret; |
3894 | } | 3833 | } |
@@ -3896,34 +3835,33 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | |||
3896 | #endif | 3835 | #endif |
3897 | 3836 | ||
3898 | static __always_inline void * | 3837 | static __always_inline void * |
3899 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | 3838 | __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) |
3900 | { | 3839 | { |
3901 | struct kmem_cache *cachep; | 3840 | struct kmem_cache *cachep; |
3902 | 3841 | ||
3903 | cachep = kmem_find_general_cachep(size, flags); | 3842 | cachep = kmem_find_general_cachep(size, flags); |
3904 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3843 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3905 | return cachep; | 3844 | return cachep; |
3906 | return kmem_cache_alloc_node_trace(size, cachep, flags, node); | 3845 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); |
3907 | } | 3846 | } |
3908 | 3847 | ||
3909 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3848 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3910 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3849 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3911 | { | 3850 | { |
3912 | return __do_kmalloc_node(size, flags, node, | 3851 | return __do_kmalloc_node(size, flags, node, _RET_IP_); |
3913 | __builtin_return_address(0)); | ||
3914 | } | 3852 | } |
3915 | EXPORT_SYMBOL(__kmalloc_node); | 3853 | EXPORT_SYMBOL(__kmalloc_node); |
3916 | 3854 | ||
3917 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | 3855 | void *__kmalloc_node_track_caller(size_t size, gfp_t flags, |
3918 | int node, unsigned long caller) | 3856 | int node, unsigned long caller) |
3919 | { | 3857 | { |
3920 | return __do_kmalloc_node(size, flags, node, (void *)caller); | 3858 | return __do_kmalloc_node(size, flags, node, caller); |
3921 | } | 3859 | } |
3922 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | 3860 | EXPORT_SYMBOL(__kmalloc_node_track_caller); |
3923 | #else | 3861 | #else |
3924 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3862 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3925 | { | 3863 | { |
3926 | return __do_kmalloc_node(size, flags, node, NULL); | 3864 | return __do_kmalloc_node(size, flags, node, 0); |
3927 | } | 3865 | } |
3928 | EXPORT_SYMBOL(__kmalloc_node); | 3866 | EXPORT_SYMBOL(__kmalloc_node); |
3929 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ | 3867 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ |
@@ -3936,7 +3874,7 @@ EXPORT_SYMBOL(__kmalloc_node); | |||
3936 | * @caller: function caller for debug tracking of the caller | 3874 | * @caller: function caller for debug tracking of the caller |
3937 | */ | 3875 | */ |
3938 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | 3876 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, |
3939 | void *caller) | 3877 | unsigned long caller) |
3940 | { | 3878 | { |
3941 | struct kmem_cache *cachep; | 3879 | struct kmem_cache *cachep; |
3942 | void *ret; | 3880 | void *ret; |
@@ -3949,9 +3887,9 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3949 | cachep = __find_general_cachep(size, flags); | 3887 | cachep = __find_general_cachep(size, flags); |
3950 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3888 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
3951 | return cachep; | 3889 | return cachep; |
3952 | ret = __cache_alloc(cachep, flags, caller); | 3890 | ret = slab_alloc(cachep, flags, caller); |
3953 | 3891 | ||
3954 | trace_kmalloc((unsigned long) caller, ret, | 3892 | trace_kmalloc(caller, ret, |
3955 | size, cachep->size, flags); | 3893 | size, cachep->size, flags); |
3956 | 3894 | ||
3957 | return ret; | 3895 | return ret; |
@@ -3961,20 +3899,20 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3961 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | 3899 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
3962 | void *__kmalloc(size_t size, gfp_t flags) | 3900 | void *__kmalloc(size_t size, gfp_t flags) |
3963 | { | 3901 | { |
3964 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | 3902 | return __do_kmalloc(size, flags, _RET_IP_); |
3965 | } | 3903 | } |
3966 | EXPORT_SYMBOL(__kmalloc); | 3904 | EXPORT_SYMBOL(__kmalloc); |
3967 | 3905 | ||
3968 | void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) | 3906 | void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) |
3969 | { | 3907 | { |
3970 | return __do_kmalloc(size, flags, (void *)caller); | 3908 | return __do_kmalloc(size, flags, caller); |
3971 | } | 3909 | } |
3972 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3910 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3973 | 3911 | ||
3974 | #else | 3912 | #else |
3975 | void *__kmalloc(size_t size, gfp_t flags) | 3913 | void *__kmalloc(size_t size, gfp_t flags) |
3976 | { | 3914 | { |
3977 | return __do_kmalloc(size, flags, NULL); | 3915 | return __do_kmalloc(size, flags, 0); |
3978 | } | 3916 | } |
3979 | EXPORT_SYMBOL(__kmalloc); | 3917 | EXPORT_SYMBOL(__kmalloc); |
3980 | #endif | 3918 | #endif |
@@ -3995,7 +3933,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3995 | debug_check_no_locks_freed(objp, cachep->object_size); | 3933 | debug_check_no_locks_freed(objp, cachep->object_size); |
3996 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3934 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3997 | debug_check_no_obj_freed(objp, cachep->object_size); | 3935 | debug_check_no_obj_freed(objp, cachep->object_size); |
3998 | __cache_free(cachep, objp, __builtin_return_address(0)); | 3936 | __cache_free(cachep, objp, _RET_IP_); |
3999 | local_irq_restore(flags); | 3937 | local_irq_restore(flags); |
4000 | 3938 | ||
4001 | trace_kmem_cache_free(_RET_IP_, objp); | 3939 | trace_kmem_cache_free(_RET_IP_, objp); |
@@ -4026,7 +3964,7 @@ void kfree(const void *objp) | |||
4026 | debug_check_no_locks_freed(objp, c->object_size); | 3964 | debug_check_no_locks_freed(objp, c->object_size); |
4027 | 3965 | ||
4028 | debug_check_no_obj_freed(objp, c->object_size); | 3966 | debug_check_no_obj_freed(objp, c->object_size); |
4029 | __cache_free(c, (void *)objp, __builtin_return_address(0)); | 3967 | __cache_free(c, (void *)objp, _RET_IP_); |
4030 | local_irq_restore(flags); | 3968 | local_irq_restore(flags); |
4031 | } | 3969 | } |
4032 | EXPORT_SYMBOL(kfree); | 3970 | EXPORT_SYMBOL(kfree); |
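The mm/slab.c hunks above convert the call-site bookkeeping from passing a void *caller obtained with __builtin_return_address(0) to passing the integer _RET_IP_, with 0 replacing NULL in the non-debug stubs. A minimal userspace sketch of that convention, assuming nothing beyond what the hunks show; traced_malloc, report_alloc and RET_IP are invented names, not kernel APIs:

#include <stdio.h>
#include <stdlib.h>

/* _RET_IP_ in the kernel is (unsigned long)__builtin_return_address(0) */
#define RET_IP ((unsigned long)__builtin_return_address(0))

static void report_alloc(unsigned long caller, void *obj, size_t size)
{
        /* the kernel resolves this to a symbol; here we print the raw address */
        printf("allocated %zu bytes at %p, call site %#lx\n", size, obj, caller);
}

static void *traced_malloc(size_t size)
{
        void *obj = malloc(size);

        /* pass the call site as a plain integer, as _RET_IP_ does */
        report_alloc(RET_IP, obj, size);
        return obj;
}

int main(void)
{
        free(traced_malloc(64));
        return 0;
}

Carrying the address as an unsigned long keeps every trace point cast-free, which is what the 0-instead-of-NULL defaults in the stub paths reflect.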
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -25,9 +25,26 @@ extern enum slab_state slab_state; | |||
25 | 25 | ||
26 | /* The slab cache mutex protects the management structures during changes */ | 26 | /* The slab cache mutex protects the management structures during changes */ |
27 | extern struct mutex slab_mutex; | 27 | extern struct mutex slab_mutex; |
28 | |||
29 | /* The list of all slab caches on the system */ | ||
28 | extern struct list_head slab_caches; | 30 | extern struct list_head slab_caches; |
29 | 31 | ||
30 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | 32 | /* The slab cache that manages slab cache information */ |
33 | extern struct kmem_cache *kmem_cache; | ||
34 | |||
35 | /* Functions provided by the slab allocators */ | ||
36 | extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); | ||
37 | |||
38 | #ifdef CONFIG_SLUB | ||
39 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | ||
31 | size_t align, unsigned long flags, void (*ctor)(void *)); | 40 | size_t align, unsigned long flags, void (*ctor)(void *)); |
41 | #else | ||
42 | static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | ||
43 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
44 | { return NULL; } | ||
45 | #endif | ||
46 | |||
47 | |||
48 | int __kmem_cache_shutdown(struct kmem_cache *); | ||
32 | 49 | ||
33 | #endif | 50 | #endif |
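The mm/slab.h hunk above declares __kmem_cache_alias() only when CONFIG_SLUB is set and otherwise supplies an inline stub that returns NULL. A header-only sketch of that compile-time selection under an assumed HAVE_ALIAS_SUPPORT switch; struct cache and lookup_alias are invented stand-ins:

#include <stddef.h>

struct cache;                   /* opaque, stands in for struct kmem_cache */

#ifdef HAVE_ALIAS_SUPPORT
/* allocators that can merge caches provide a real implementation */
struct cache *lookup_alias(const char *name, size_t size);
#else
/* everyone else compiles the call down to "no alias found" */
static inline struct cache *lookup_alias(const char *name, size_t size)
{
        (void)name;
        (void)size;
        return NULL;
}
#endif

Callers then use lookup_alias() unconditionally and the non-merging allocators pay nothing for it.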
diff --git a/mm/slab_common.c b/mm/slab_common.c index aa3ca5bb01b5..069a24e64403 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -22,6 +22,53 @@ | |||
22 | enum slab_state slab_state; | 22 | enum slab_state slab_state; |
23 | LIST_HEAD(slab_caches); | 23 | LIST_HEAD(slab_caches); |
24 | DEFINE_MUTEX(slab_mutex); | 24 | DEFINE_MUTEX(slab_mutex); |
25 | struct kmem_cache *kmem_cache; | ||
26 | |||
27 | #ifdef CONFIG_DEBUG_VM | ||
28 | static int kmem_cache_sanity_check(const char *name, size_t size) | ||
29 | { | ||
30 | struct kmem_cache *s = NULL; | ||
31 | |||
32 | if (!name || in_interrupt() || size < sizeof(void *) || | ||
33 | size > KMALLOC_MAX_SIZE) { | ||
34 | pr_err("kmem_cache_create(%s) integrity check failed\n", name); | ||
35 | return -EINVAL; | ||
36 | } | ||
37 | |||
38 | list_for_each_entry(s, &slab_caches, list) { | ||
39 | char tmp; | ||
40 | int res; | ||
41 | |||
42 | /* | ||
43 | * This happens when the module gets unloaded and doesn't | ||
44 | * destroy its slab cache and no-one else reuses the vmalloc | ||
45 | * area of the module. Print a warning. | ||
46 | */ | ||
47 | res = probe_kernel_address(s->name, tmp); | ||
48 | if (res) { | ||
49 | pr_err("Slab cache with size %d has lost its name\n", | ||
50 | s->object_size); | ||
51 | continue; | ||
52 | } | ||
53 | |||
54 | if (!strcmp(s->name, name)) { | ||
55 | pr_err("%s (%s): Cache name already exists.\n", | ||
56 | __func__, name); | ||
57 | dump_stack(); | ||
58 | s = NULL; | ||
59 | return -EINVAL; | ||
60 | } | ||
61 | } | ||
62 | |||
63 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
64 | return 0; | ||
65 | } | ||
66 | #else | ||
67 | static inline int kmem_cache_sanity_check(const char *name, size_t size) | ||
68 | { | ||
69 | return 0; | ||
70 | } | ||
71 | #endif | ||
25 | 72 | ||
26 | /* | 73 | /* |
27 | * kmem_cache_create - Create a cache. | 74 | * kmem_cache_create - Create a cache. |
@@ -52,68 +99,95 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
52 | unsigned long flags, void (*ctor)(void *)) | 99 | unsigned long flags, void (*ctor)(void *)) |
53 | { | 100 | { |
54 | struct kmem_cache *s = NULL; | 101 | struct kmem_cache *s = NULL; |
55 | 102 | int err = 0; | |
56 | #ifdef CONFIG_DEBUG_VM | ||
57 | if (!name || in_interrupt() || size < sizeof(void *) || | ||
58 | size > KMALLOC_MAX_SIZE) { | ||
59 | printk(KERN_ERR "kmem_cache_create(%s) integrity check" | ||
60 | " failed\n", name); | ||
61 | goto out; | ||
62 | } | ||
63 | #endif | ||
64 | 103 | ||
65 | get_online_cpus(); | 104 | get_online_cpus(); |
66 | mutex_lock(&slab_mutex); | 105 | mutex_lock(&slab_mutex); |
67 | 106 | ||
68 | #ifdef CONFIG_DEBUG_VM | 107 | if (!kmem_cache_sanity_check(name, size) == 0) |
69 | list_for_each_entry(s, &slab_caches, list) { | 108 | goto out_locked; |
70 | char tmp; | ||
71 | int res; | ||
72 | 109 | ||
73 | /* | ||
74 | * This happens when the module gets unloaded and doesn't | ||
75 | * destroy its slab cache and no-one else reuses the vmalloc | ||
76 | * area of the module. Print a warning. | ||
77 | */ | ||
78 | res = probe_kernel_address(s->name, tmp); | ||
79 | if (res) { | ||
80 | printk(KERN_ERR | ||
81 | "Slab cache with size %d has lost its name\n", | ||
82 | s->object_size); | ||
83 | continue; | ||
84 | } | ||
85 | 110 | ||
86 | if (!strcmp(s->name, name)) { | 111 | s = __kmem_cache_alias(name, size, align, flags, ctor); |
87 | printk(KERN_ERR "kmem_cache_create(%s): Cache name" | 112 | if (s) |
88 | " already exists.\n", | 113 | goto out_locked; |
89 | name); | 114 | |
90 | dump_stack(); | 115 | s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); |
91 | s = NULL; | 116 | if (s) { |
92 | goto oops; | 117 | s->object_size = s->size = size; |
118 | s->align = align; | ||
119 | s->ctor = ctor; | ||
120 | s->name = kstrdup(name, GFP_KERNEL); | ||
121 | if (!s->name) { | ||
122 | kmem_cache_free(kmem_cache, s); | ||
123 | err = -ENOMEM; | ||
124 | goto out_locked; | ||
93 | } | 125 | } |
94 | } | ||
95 | 126 | ||
96 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | 127 | err = __kmem_cache_create(s, flags); |
97 | #endif | 128 | if (!err) { |
98 | 129 | ||
99 | s = __kmem_cache_create(name, size, align, flags, ctor); | 130 | s->refcount = 1; |
131 | list_add(&s->list, &slab_caches); | ||
100 | 132 | ||
101 | #ifdef CONFIG_DEBUG_VM | 133 | } else { |
102 | oops: | 134 | kfree(s->name); |
103 | #endif | 135 | kmem_cache_free(kmem_cache, s); |
136 | } | ||
137 | } else | ||
138 | err = -ENOMEM; | ||
139 | |||
140 | out_locked: | ||
104 | mutex_unlock(&slab_mutex); | 141 | mutex_unlock(&slab_mutex); |
105 | put_online_cpus(); | 142 | put_online_cpus(); |
106 | 143 | ||
107 | #ifdef CONFIG_DEBUG_VM | 144 | if (err) { |
108 | out: | 145 | |
109 | #endif | 146 | if (flags & SLAB_PANIC) |
110 | if (!s && (flags & SLAB_PANIC)) | 147 | panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", |
111 | panic("kmem_cache_create: Failed to create slab '%s'\n", name); | 148 | name, err); |
149 | else { | ||
150 | printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", | ||
151 | name, err); | ||
152 | dump_stack(); | ||
153 | } | ||
154 | |||
155 | return NULL; | ||
156 | } | ||
112 | 157 | ||
113 | return s; | 158 | return s; |
114 | } | 159 | } |
115 | EXPORT_SYMBOL(kmem_cache_create); | 160 | EXPORT_SYMBOL(kmem_cache_create); |
116 | 161 | ||
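With the hunk above, the common kmem_cache_create() owns the descriptor and its name, while the allocator-specific __kmem_cache_create() only initializes the descriptor and reports an error code. A userspace sketch of that ownership split and its unwind path, under invented names (cache_desc, backend_init, create_cache):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct cache_desc {
        char *name;
        size_t object_size;
        int refcount;
};

/* stands in for __kmem_cache_create(): 0 on success, -errno on failure */
static int backend_init(struct cache_desc *c)
{
        return c->object_size ? 0 : -EINVAL;
}

static struct cache_desc *create_cache(const char *name, size_t size)
{
        struct cache_desc *c = calloc(1, sizeof(*c));
        int err = -ENOMEM;

        if (!c)
                goto fail;
        c->name = strdup(name);
        if (!c->name)
                goto free_desc;
        c->object_size = size;

        err = backend_init(c);          /* allocator-specific part */
        if (err)
                goto free_name;

        c->refcount = 1;                /* only now is the cache live */
        return c;

free_name:
        free(c->name);
free_desc:
        free(c);
fail:
        fprintf(stderr, "create_cache(%s) failed with error %d\n", name, err);
        return NULL;
}

int main(void)
{
        struct cache_desc *c = create_cache("demo", 32);

        if (c) {
                free(c->name);
                free(c);
        }
        return 0;
}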
162 | void kmem_cache_destroy(struct kmem_cache *s) | ||
163 | { | ||
164 | get_online_cpus(); | ||
165 | mutex_lock(&slab_mutex); | ||
166 | s->refcount--; | ||
167 | if (!s->refcount) { | ||
168 | list_del(&s->list); | ||
169 | |||
170 | if (!__kmem_cache_shutdown(s)) { | ||
171 | mutex_unlock(&slab_mutex); | ||
172 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
173 | rcu_barrier(); | ||
174 | |||
175 | kfree(s->name); | ||
176 | kmem_cache_free(kmem_cache, s); | ||
177 | } else { | ||
178 | list_add(&s->list, &slab_caches); | ||
179 | mutex_unlock(&slab_mutex); | ||
180 | printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", | ||
181 | s->name); | ||
182 | dump_stack(); | ||
183 | } | ||
184 | } else { | ||
185 | mutex_unlock(&slab_mutex); | ||
186 | } | ||
187 | put_online_cpus(); | ||
188 | } | ||
189 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
190 | |||
117 | int slab_is_available(void) | 191 | int slab_is_available(void) |
118 | { | 192 | { |
119 | return slab_state >= UP; | 193 | return slab_state >= UP; |
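The kmem_cache_destroy() added above is reference counted: only the last put tears the cache down, and teardown is refused when the allocator reports live objects. A simplified userspace sketch of that flow, with invented names (cache_put, backend_shutdown) and no equivalent of the RCU barrier:

#include <stdio.h>
#include <stdlib.h>

struct cache {
        int refcount;
        int live_objects;       /* stands in for "slab still has objects" */
};

/* stands in for __kmem_cache_shutdown(): 0 when teardown is allowed */
static int backend_shutdown(struct cache *c)
{
        return c->live_objects ? -1 : 0;
}

static void cache_put(struct cache *c)
{
        if (--c->refcount)
                return;                 /* someone still holds a reference */

        if (backend_shutdown(c)) {
                fprintf(stderr, "cache still has objects, not destroyed\n");
                c->refcount = 1;        /* keep it usable, like re-listing it */
                return;
        }
        free(c);                        /* the kernel also frees the name here */
}

int main(void)
{
        struct cache *c = calloc(1, sizeof(*c));

        if (!c)
                return 1;
        c->refcount = 2;
        cache_put(c);                   /* drops to 1, nothing else happens */
        cache_put(c);                   /* last reference: shutdown, then freed */
        return 0;
}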
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -194,7 +194,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
194 | void *page; | 194 | void *page; |
195 | 195 | ||
196 | #ifdef CONFIG_NUMA | 196 | #ifdef CONFIG_NUMA |
197 | if (node != -1) | 197 | if (node != NUMA_NO_NODE) |
198 | page = alloc_pages_exact_node(node, gfp, order); | 198 | page = alloc_pages_exact_node(node, gfp, order); |
199 | else | 199 | else |
200 | #endif | 200 | #endif |
@@ -290,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
290 | * If there's a node specification, search for a partial | 290 | * If there's a node specification, search for a partial |
291 | * page with a matching node id in the freelist. | 291 | * page with a matching node id in the freelist. |
292 | */ | 292 | */ |
293 | if (node != -1 && page_to_nid(sp) != node) | 293 | if (node != NUMA_NO_NODE && page_to_nid(sp) != node) |
294 | continue; | 294 | continue; |
295 | #endif | 295 | #endif |
296 | /* Enough room on this page? */ | 296 | /* Enough room on this page? */ |
@@ -425,10 +425,11 @@ out: | |||
425 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. | 425 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. |
426 | */ | 426 | */ |
427 | 427 | ||
428 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) | 428 | static __always_inline void * |
429 | __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) | ||
429 | { | 430 | { |
430 | unsigned int *m; | 431 | unsigned int *m; |
431 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 432 | int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
432 | void *ret; | 433 | void *ret; |
433 | 434 | ||
434 | gfp &= gfp_allowed_mask; | 435 | gfp &= gfp_allowed_mask; |
@@ -446,7 +447,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
446 | *m = size; | 447 | *m = size; |
447 | ret = (void *)m + align; | 448 | ret = (void *)m + align; |
448 | 449 | ||
449 | trace_kmalloc_node(_RET_IP_, ret, | 450 | trace_kmalloc_node(caller, ret, |
450 | size, size + align, gfp, node); | 451 | size, size + align, gfp, node); |
451 | } else { | 452 | } else { |
452 | unsigned int order = get_order(size); | 453 | unsigned int order = get_order(size); |
@@ -460,15 +461,35 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
460 | page->private = size; | 461 | page->private = size; |
461 | } | 462 | } |
462 | 463 | ||
463 | trace_kmalloc_node(_RET_IP_, ret, | 464 | trace_kmalloc_node(caller, ret, |
464 | size, PAGE_SIZE << order, gfp, node); | 465 | size, PAGE_SIZE << order, gfp, node); |
465 | } | 466 | } |
466 | 467 | ||
467 | kmemleak_alloc(ret, size, 1, gfp); | 468 | kmemleak_alloc(ret, size, 1, gfp); |
468 | return ret; | 469 | return ret; |
469 | } | 470 | } |
471 | |||
472 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) | ||
473 | { | ||
474 | return __do_kmalloc_node(size, gfp, node, _RET_IP_); | ||
475 | } | ||
470 | EXPORT_SYMBOL(__kmalloc_node); | 476 | EXPORT_SYMBOL(__kmalloc_node); |
471 | 477 | ||
478 | #ifdef CONFIG_TRACING | ||
479 | void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) | ||
480 | { | ||
481 | return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); | ||
482 | } | ||
483 | |||
484 | #ifdef CONFIG_NUMA | ||
485 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, | ||
486 | int node, unsigned long caller) | ||
487 | { | ||
488 | return __do_kmalloc_node(size, gfp, node, caller); | ||
489 | } | ||
490 | #endif | ||
491 | #endif | ||
492 | |||
472 | void kfree(const void *block) | 493 | void kfree(const void *block) |
473 | { | 494 | { |
474 | struct page *sp; | 495 | struct page *sp; |
@@ -481,7 +502,7 @@ void kfree(const void *block) | |||
481 | 502 | ||
482 | sp = virt_to_page(block); | 503 | sp = virt_to_page(block); |
483 | if (PageSlab(sp)) { | 504 | if (PageSlab(sp)) { |
484 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 505 | int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
485 | unsigned int *m = (unsigned int *)(block - align); | 506 | unsigned int *m = (unsigned int *)(block - align); |
486 | slob_free(m, *m + align); | 507 | slob_free(m, *m + align); |
487 | } else | 508 | } else |
@@ -500,7 +521,7 @@ size_t ksize(const void *block) | |||
500 | 521 | ||
501 | sp = virt_to_page(block); | 522 | sp = virt_to_page(block); |
502 | if (PageSlab(sp)) { | 523 | if (PageSlab(sp)) { |
503 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 524 | int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
504 | unsigned int *m = (unsigned int *)(block - align); | 525 | unsigned int *m = (unsigned int *)(block - align); |
505 | return SLOB_UNITS(*m) * SLOB_UNIT; | 526 | return SLOB_UNITS(*m) * SLOB_UNIT; |
506 | } else | 527 | } else |
@@ -508,44 +529,24 @@ size_t ksize(const void *block) | |||
508 | } | 529 | } |
509 | EXPORT_SYMBOL(ksize); | 530 | EXPORT_SYMBOL(ksize); |
510 | 531 | ||
511 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | 532 | int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) |
512 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
513 | { | 533 | { |
514 | struct kmem_cache *c; | 534 | size_t align = c->size; |
515 | |||
516 | c = slob_alloc(sizeof(struct kmem_cache), | ||
517 | GFP_KERNEL, ARCH_KMALLOC_MINALIGN, -1); | ||
518 | 535 | ||
519 | if (c) { | 536 | if (flags & SLAB_DESTROY_BY_RCU) { |
520 | c->name = name; | 537 | /* leave room for rcu footer at the end of object */ |
521 | c->size = size; | 538 | c->size += sizeof(struct slob_rcu); |
522 | if (flags & SLAB_DESTROY_BY_RCU) { | ||
523 | /* leave room for rcu footer at the end of object */ | ||
524 | c->size += sizeof(struct slob_rcu); | ||
525 | } | ||
526 | c->flags = flags; | ||
527 | c->ctor = ctor; | ||
528 | /* ignore alignment unless it's forced */ | ||
529 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
530 | if (c->align < ARCH_SLAB_MINALIGN) | ||
531 | c->align = ARCH_SLAB_MINALIGN; | ||
532 | if (c->align < align) | ||
533 | c->align = align; | ||
534 | |||
535 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | ||
536 | c->refcount = 1; | ||
537 | } | 539 | } |
538 | return c; | 540 | c->flags = flags; |
539 | } | 541 | /* ignore alignment unless it's forced */ |
542 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
543 | if (c->align < ARCH_SLAB_MINALIGN) | ||
544 | c->align = ARCH_SLAB_MINALIGN; | ||
545 | if (c->align < align) | ||
546 | c->align = align; | ||
540 | 547 | ||
541 | void kmem_cache_destroy(struct kmem_cache *c) | 548 | return 0; |
542 | { | ||
543 | kmemleak_free(c); | ||
544 | if (c->flags & SLAB_DESTROY_BY_RCU) | ||
545 | rcu_barrier(); | ||
546 | slob_free(c, sizeof(struct kmem_cache)); | ||
547 | } | 549 | } |
548 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
549 | 550 | ||
550 | void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | 551 | void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) |
551 | { | 552 | { |
@@ -613,14 +614,28 @@ unsigned int kmem_cache_size(struct kmem_cache *c) | |||
613 | } | 614 | } |
614 | EXPORT_SYMBOL(kmem_cache_size); | 615 | EXPORT_SYMBOL(kmem_cache_size); |
615 | 616 | ||
617 | int __kmem_cache_shutdown(struct kmem_cache *c) | ||
618 | { | ||
619 | /* No way to check for remaining objects */ | ||
620 | return 0; | ||
621 | } | ||
622 | |||
616 | int kmem_cache_shrink(struct kmem_cache *d) | 623 | int kmem_cache_shrink(struct kmem_cache *d) |
617 | { | 624 | { |
618 | return 0; | 625 | return 0; |
619 | } | 626 | } |
620 | EXPORT_SYMBOL(kmem_cache_shrink); | 627 | EXPORT_SYMBOL(kmem_cache_shrink); |
621 | 628 | ||
629 | struct kmem_cache kmem_cache_boot = { | ||
630 | .name = "kmem_cache", | ||
631 | .size = sizeof(struct kmem_cache), | ||
632 | .flags = SLAB_PANIC, | ||
633 | .align = ARCH_KMALLOC_MINALIGN, | ||
634 | }; | ||
635 | |||
622 | void __init kmem_cache_init(void) | 636 | void __init kmem_cache_init(void) |
623 | { | 637 | { |
638 | kmem_cache = &kmem_cache_boot; | ||
624 | slab_state = UP; | 639 | slab_state = UP; |
625 | } | 640 | } |
626 | 641 | ||
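The mm/slob.c changes end by pointing kmem_cache at a statically allocated kmem_cache_boot, so the cache that manages cache descriptors exists before any dynamic allocation is possible. A tiny sketch of that bootstrap trick with invented names (registry, registry_boot, registry_init):

#include <stddef.h>
#include <stdio.h>

struct registry {
        const char *name;
        size_t size;
};

/* statically built descriptor: nothing has to be allocated to create it */
static struct registry registry_boot = {
        .name = "registry",
        .size = sizeof(struct registry),
};

static struct registry *registry;       /* what the rest of the code uses */

static void registry_init(void)
{
        registry = &registry_boot;      /* wire up the boot-time object */
}

int main(void)
{
        registry_init();
        printf("%s manages %zu-byte objects\n", registry->name, registry->size);
        return 0;
}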
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -210,11 +210,7 @@ static void sysfs_slab_remove(struct kmem_cache *); | |||
210 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } | 210 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } |
211 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) | 211 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) |
212 | { return 0; } | 212 | { return 0; } |
213 | static inline void sysfs_slab_remove(struct kmem_cache *s) | 213 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } |
214 | { | ||
215 | kfree(s->name); | ||
216 | kfree(s); | ||
217 | } | ||
218 | 214 | ||
219 | #endif | 215 | #endif |
220 | 216 | ||
@@ -568,6 +564,8 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) | |||
568 | printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); | 564 | printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); |
569 | printk(KERN_ERR "----------------------------------------" | 565 | printk(KERN_ERR "----------------------------------------" |
570 | "-------------------------------------\n\n"); | 566 | "-------------------------------------\n\n"); |
567 | |||
568 | add_taint(TAINT_BAD_PAGE); | ||
571 | } | 569 | } |
572 | 570 | ||
573 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) | 571 | static void slab_fix(struct kmem_cache *s, char *fmt, ...) |
@@ -624,7 +622,7 @@ static void object_err(struct kmem_cache *s, struct page *page, | |||
624 | print_trailer(s, page, object); | 622 | print_trailer(s, page, object); |
625 | } | 623 | } |
626 | 624 | ||
627 | static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) | 625 | static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) |
628 | { | 626 | { |
629 | va_list args; | 627 | va_list args; |
630 | char buf[100]; | 628 | char buf[100]; |
@@ -1069,13 +1067,13 @@ bad: | |||
1069 | return 0; | 1067 | return 0; |
1070 | } | 1068 | } |
1071 | 1069 | ||
1072 | static noinline int free_debug_processing(struct kmem_cache *s, | 1070 | static noinline struct kmem_cache_node *free_debug_processing( |
1073 | struct page *page, void *object, unsigned long addr) | 1071 | struct kmem_cache *s, struct page *page, void *object, |
1072 | unsigned long addr, unsigned long *flags) | ||
1074 | { | 1073 | { |
1075 | unsigned long flags; | 1074 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1076 | int rc = 0; | ||
1077 | 1075 | ||
1078 | local_irq_save(flags); | 1076 | spin_lock_irqsave(&n->list_lock, *flags); |
1079 | slab_lock(page); | 1077 | slab_lock(page); |
1080 | 1078 | ||
1081 | if (!check_slab(s, page)) | 1079 | if (!check_slab(s, page)) |
@@ -1113,15 +1111,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1113 | set_track(s, object, TRACK_FREE, addr); | 1111 | set_track(s, object, TRACK_FREE, addr); |
1114 | trace(s, page, object, 0); | 1112 | trace(s, page, object, 0); |
1115 | init_object(s, object, SLUB_RED_INACTIVE); | 1113 | init_object(s, object, SLUB_RED_INACTIVE); |
1116 | rc = 1; | ||
1117 | out: | 1114 | out: |
1118 | slab_unlock(page); | 1115 | slab_unlock(page); |
1119 | local_irq_restore(flags); | 1116 | /* |
1120 | return rc; | 1117 | * Keep node_lock to preserve integrity |
1118 | * until the object is actually freed | ||
1119 | */ | ||
1120 | return n; | ||
1121 | 1121 | ||
1122 | fail: | 1122 | fail: |
1123 | slab_unlock(page); | ||
1124 | spin_unlock_irqrestore(&n->list_lock, *flags); | ||
1123 | slab_fix(s, "Object at 0x%p not freed", object); | 1125 | slab_fix(s, "Object at 0x%p not freed", object); |
1124 | goto out; | 1126 | return NULL; |
1125 | } | 1127 | } |
1126 | 1128 | ||
1127 | static int __init setup_slub_debug(char *str) | 1129 | static int __init setup_slub_debug(char *str) |
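free_debug_processing() above now returns the kmem_cache_node with its list_lock still held on success and drops it before returning NULL on failure. A userspace sketch of that lock hand-off contract, using pthreads purely for illustration (compile with -pthread; checked_begin_free is an invented name):

#include <pthread.h>
#include <stdio.h>

struct node {
        pthread_mutex_t lock;
        int nr_objects;
};

/* On success the node is returned with its lock still held; on failure
 * the lock is dropped and NULL is returned, mirroring the new contract. */
static struct node *checked_begin_free(struct node *n, int object_ok)
{
        pthread_mutex_lock(&n->lock);
        if (!object_ok) {
                pthread_mutex_unlock(&n->lock);
                return NULL;
        }
        return n;                       /* caller must unlock */
}

int main(void)
{
        struct node n = { PTHREAD_MUTEX_INITIALIZER, 1 };
        struct node *locked = checked_begin_free(&n, 1);

        if (locked) {
                locked->nr_objects--;   /* the actual free happens under the lock */
                pthread_mutex_unlock(&locked->lock);
        }
        printf("objects left: %d\n", n.nr_objects);
        return 0;
}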
@@ -1214,8 +1216,9 @@ static inline void setup_object_debug(struct kmem_cache *s, | |||
1214 | static inline int alloc_debug_processing(struct kmem_cache *s, | 1216 | static inline int alloc_debug_processing(struct kmem_cache *s, |
1215 | struct page *page, void *object, unsigned long addr) { return 0; } | 1217 | struct page *page, void *object, unsigned long addr) { return 0; } |
1216 | 1218 | ||
1217 | static inline int free_debug_processing(struct kmem_cache *s, | 1219 | static inline struct kmem_cache_node *free_debug_processing( |
1218 | struct page *page, void *object, unsigned long addr) { return 0; } | 1220 | struct kmem_cache *s, struct page *page, void *object, |
1221 | unsigned long addr, unsigned long *flags) { return NULL; } | ||
1219 | 1222 | ||
1220 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | 1223 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) |
1221 | { return 1; } | 1224 | { return 1; } |
@@ -1714,7 +1717,7 @@ static inline void note_cmpxchg_failure(const char *n, | |||
1714 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); | 1717 | stat(s, CMPXCHG_DOUBLE_CPU_FAIL); |
1715 | } | 1718 | } |
1716 | 1719 | ||
1717 | void init_kmem_cache_cpus(struct kmem_cache *s) | 1720 | static void init_kmem_cache_cpus(struct kmem_cache *s) |
1718 | { | 1721 | { |
1719 | int cpu; | 1722 | int cpu; |
1720 | 1723 | ||
@@ -1939,7 +1942,7 @@ static void unfreeze_partials(struct kmem_cache *s) | |||
1939 | * If we did not find a slot then simply move all the partials to the | 1942 | * If we did not find a slot then simply move all the partials to the |
1940 | * per node partial list. | 1943 | * per node partial list. |
1941 | */ | 1944 | */ |
1942 | int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | 1945 | static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) |
1943 | { | 1946 | { |
1944 | struct page *oldpage; | 1947 | struct page *oldpage; |
1945 | int pages; | 1948 | int pages; |
@@ -1962,6 +1965,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
1962 | local_irq_save(flags); | 1965 | local_irq_save(flags); |
1963 | unfreeze_partials(s); | 1966 | unfreeze_partials(s); |
1964 | local_irq_restore(flags); | 1967 | local_irq_restore(flags); |
1968 | oldpage = NULL; | ||
1965 | pobjects = 0; | 1969 | pobjects = 0; |
1966 | pages = 0; | 1970 | pages = 0; |
1967 | stat(s, CPU_PARTIAL_DRAIN); | 1971 | stat(s, CPU_PARTIAL_DRAIN); |
@@ -2310,7 +2314,7 @@ new_slab: | |||
2310 | * | 2314 | * |
2311 | * Otherwise we can simply pick the next object from the lockless free list. | 2315 | * Otherwise we can simply pick the next object from the lockless free list. |
2312 | */ | 2316 | */ |
2313 | static __always_inline void *slab_alloc(struct kmem_cache *s, | 2317 | static __always_inline void *slab_alloc_node(struct kmem_cache *s, |
2314 | gfp_t gfpflags, int node, unsigned long addr) | 2318 | gfp_t gfpflags, int node, unsigned long addr) |
2315 | { | 2319 | { |
2316 | void **object; | 2320 | void **object; |
@@ -2380,9 +2384,15 @@ redo: | |||
2380 | return object; | 2384 | return object; |
2381 | } | 2385 | } |
2382 | 2386 | ||
2387 | static __always_inline void *slab_alloc(struct kmem_cache *s, | ||
2388 | gfp_t gfpflags, unsigned long addr) | ||
2389 | { | ||
2390 | return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr); | ||
2391 | } | ||
2392 | |||
2383 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | 2393 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) |
2384 | { | 2394 | { |
2385 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 2395 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
2386 | 2396 | ||
2387 | trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); | 2397 | trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); |
2388 | 2398 | ||
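The hunk above splits the old slab_alloc() into a node-aware slab_alloc_node() worker plus a thin slab_alloc() wrapper that passes NUMA_NO_NODE for callers with no placement preference. A sketch of the same wrapper shape with invented names (alloc_obj_node, alloc_obj, ANY_NODE):

#include <stdio.h>
#include <stdlib.h>

#define ANY_NODE  (-1)          /* stands in for NUMA_NO_NODE */

/* the real worker: node-aware */
static void *alloc_obj_node(size_t size, int node)
{
        (void)node;             /* a userspace sketch has no NUMA policy */
        return malloc(size);
}

/* thin wrapper for callers that do not care about placement */
static inline void *alloc_obj(size_t size)
{
        return alloc_obj_node(size, ANY_NODE);
}

int main(void)
{
        void *p = alloc_obj(128);

        printf("%p\n", p);
        free(p);
        return 0;
}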
@@ -2393,7 +2403,7 @@ EXPORT_SYMBOL(kmem_cache_alloc); | |||
2393 | #ifdef CONFIG_TRACING | 2403 | #ifdef CONFIG_TRACING |
2394 | void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | 2404 | void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) |
2395 | { | 2405 | { |
2396 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 2406 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
2397 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2407 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
2398 | return ret; | 2408 | return ret; |
2399 | } | 2409 | } |
@@ -2411,7 +2421,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); | |||
2411 | #ifdef CONFIG_NUMA | 2421 | #ifdef CONFIG_NUMA |
2412 | void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | 2422 | void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) |
2413 | { | 2423 | { |
2414 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); | 2424 | void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); |
2415 | 2425 | ||
2416 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 2426 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
2417 | s->object_size, s->size, gfpflags, node); | 2427 | s->object_size, s->size, gfpflags, node); |
@@ -2425,7 +2435,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
2425 | gfp_t gfpflags, | 2435 | gfp_t gfpflags, |
2426 | int node, size_t size) | 2436 | int node, size_t size) |
2427 | { | 2437 | { |
2428 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); | 2438 | void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); |
2429 | 2439 | ||
2430 | trace_kmalloc_node(_RET_IP_, ret, | 2440 | trace_kmalloc_node(_RET_IP_, ret, |
2431 | size, s->size, gfpflags, node); | 2441 | size, s->size, gfpflags, node); |
@@ -2457,7 +2467,8 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2457 | 2467 | ||
2458 | stat(s, FREE_SLOWPATH); | 2468 | stat(s, FREE_SLOWPATH); |
2459 | 2469 | ||
2460 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2470 | if (kmem_cache_debug(s) && |
2471 | !(n = free_debug_processing(s, page, x, addr, &flags))) | ||
2461 | return; | 2472 | return; |
2462 | 2473 | ||
2463 | do { | 2474 | do { |
@@ -2612,6 +2623,13 @@ void kmem_cache_free(struct kmem_cache *s, void *x) | |||
2612 | 2623 | ||
2613 | page = virt_to_head_page(x); | 2624 | page = virt_to_head_page(x); |
2614 | 2625 | ||
2626 | if (kmem_cache_debug(s) && page->slab != s) { | ||
2627 | pr_err("kmem_cache_free: Wrong slab cache. %s but object" | ||
2628 | " is from %s\n", page->slab->name, s->name); | ||
2629 | WARN_ON_ONCE(1); | ||
2630 | return; | ||
2631 | } | ||
2632 | |||
2615 | slab_free(s, page, x, _RET_IP_); | 2633 | slab_free(s, page, x, _RET_IP_); |
2616 | 2634 | ||
2617 | trace_kmem_cache_free(_RET_IP_, x); | 2635 | trace_kmem_cache_free(_RET_IP_, x); |
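The debug check added above refuses a kmem_cache_free() whose object belongs to a different cache than the one passed in. A userspace sketch of that ownership check, where owner_of() is an invented stand-in for deriving the cache from the page:

#include <stdio.h>

struct cache { const char *name; };

struct object {
        struct cache *owner;
        /* payload would follow */
};

static struct cache *owner_of(struct object *obj)
{
        return obj->owner;      /* the kernel derives this from page->slab */
}

static int checked_free(struct cache *c, struct object *obj)
{
        if (owner_of(obj) != c) {
                fprintf(stderr, "wrong cache: freeing to %s but object is from %s\n",
                        c->name, owner_of(obj)->name);
                return -1;      /* leak rather than corrupt the wrong cache */
        }
        /* ... normal free path would run here ... */
        return 0;
}

int main(void)
{
        struct cache a = { "cache-a" }, b = { "cache-b" };
        struct object o = { .owner = &a };

        checked_free(&b, &o);   /* triggers the diagnostic */
        return checked_free(&a, &o);
}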
@@ -3026,17 +3044,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
3026 | 3044 | ||
3027 | } | 3045 | } |
3028 | 3046 | ||
3029 | static int kmem_cache_open(struct kmem_cache *s, | 3047 | static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) |
3030 | const char *name, size_t size, | ||
3031 | size_t align, unsigned long flags, | ||
3032 | void (*ctor)(void *)) | ||
3033 | { | 3048 | { |
3034 | memset(s, 0, kmem_size); | 3049 | s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); |
3035 | s->name = name; | ||
3036 | s->ctor = ctor; | ||
3037 | s->object_size = size; | ||
3038 | s->align = align; | ||
3039 | s->flags = kmem_cache_flags(size, flags, name, ctor); | ||
3040 | s->reserved = 0; | 3050 | s->reserved = 0; |
3041 | 3051 | ||
3042 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) | 3052 | if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) |
@@ -3098,7 +3108,6 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3098 | else | 3108 | else |
3099 | s->cpu_partial = 30; | 3109 | s->cpu_partial = 30; |
3100 | 3110 | ||
3101 | s->refcount = 1; | ||
3102 | #ifdef CONFIG_NUMA | 3111 | #ifdef CONFIG_NUMA |
3103 | s->remote_node_defrag_ratio = 1000; | 3112 | s->remote_node_defrag_ratio = 1000; |
3104 | #endif | 3113 | #endif |
@@ -3106,16 +3115,16 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3106 | goto error; | 3115 | goto error; |
3107 | 3116 | ||
3108 | if (alloc_kmem_cache_cpus(s)) | 3117 | if (alloc_kmem_cache_cpus(s)) |
3109 | return 1; | 3118 | return 0; |
3110 | 3119 | ||
3111 | free_kmem_cache_nodes(s); | 3120 | free_kmem_cache_nodes(s); |
3112 | error: | 3121 | error: |
3113 | if (flags & SLAB_PANIC) | 3122 | if (flags & SLAB_PANIC) |
3114 | panic("Cannot create slab %s size=%lu realsize=%u " | 3123 | panic("Cannot create slab %s size=%lu realsize=%u " |
3115 | "order=%u offset=%u flags=%lx\n", | 3124 | "order=%u offset=%u flags=%lx\n", |
3116 | s->name, (unsigned long)size, s->size, oo_order(s->oo), | 3125 | s->name, (unsigned long)s->size, s->size, oo_order(s->oo), |
3117 | s->offset, flags); | 3126 | s->offset, flags); |
3118 | return 0; | 3127 | return -EINVAL; |
3119 | } | 3128 | } |
3120 | 3129 | ||
3121 | /* | 3130 | /* |
@@ -3137,7 +3146,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
3137 | sizeof(long), GFP_ATOMIC); | 3146 | sizeof(long), GFP_ATOMIC); |
3138 | if (!map) | 3147 | if (!map) |
3139 | return; | 3148 | return; |
3140 | slab_err(s, page, "%s", text); | 3149 | slab_err(s, page, text, s->name); |
3141 | slab_lock(page); | 3150 | slab_lock(page); |
3142 | 3151 | ||
3143 | get_map(s, page, map); | 3152 | get_map(s, page, map); |
@@ -3169,7 +3178,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | |||
3169 | discard_slab(s, page); | 3178 | discard_slab(s, page); |
3170 | } else { | 3179 | } else { |
3171 | list_slab_objects(s, page, | 3180 | list_slab_objects(s, page, |
3172 | "Objects remaining on kmem_cache_close()"); | 3181 | "Objects remaining in %s on kmem_cache_close()"); |
3173 | } | 3182 | } |
3174 | } | 3183 | } |
3175 | } | 3184 | } |
@@ -3182,7 +3191,6 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
3182 | int node; | 3191 | int node; |
3183 | 3192 | ||
3184 | flush_all(s); | 3193 | flush_all(s); |
3185 | free_percpu(s->cpu_slab); | ||
3186 | /* Attempt to free all objects */ | 3194 | /* Attempt to free all objects */ |
3187 | for_each_node_state(node, N_NORMAL_MEMORY) { | 3195 | for_each_node_state(node, N_NORMAL_MEMORY) { |
3188 | struct kmem_cache_node *n = get_node(s, node); | 3196 | struct kmem_cache_node *n = get_node(s, node); |
@@ -3191,33 +3199,20 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
3191 | if (n->nr_partial || slabs_node(s, node)) | 3199 | if (n->nr_partial || slabs_node(s, node)) |
3192 | return 1; | 3200 | return 1; |
3193 | } | 3201 | } |
3202 | free_percpu(s->cpu_slab); | ||
3194 | free_kmem_cache_nodes(s); | 3203 | free_kmem_cache_nodes(s); |
3195 | return 0; | 3204 | return 0; |
3196 | } | 3205 | } |
3197 | 3206 | ||
3198 | /* | 3207 | int __kmem_cache_shutdown(struct kmem_cache *s) |
3199 | * Close a cache and release the kmem_cache structure | ||
3200 | * (must be used for caches created using kmem_cache_create) | ||
3201 | */ | ||
3202 | void kmem_cache_destroy(struct kmem_cache *s) | ||
3203 | { | 3208 | { |
3204 | mutex_lock(&slab_mutex); | 3209 | int rc = kmem_cache_close(s); |
3205 | s->refcount--; | 3210 | |
3206 | if (!s->refcount) { | 3211 | if (!rc) |
3207 | list_del(&s->list); | ||
3208 | mutex_unlock(&slab_mutex); | ||
3209 | if (kmem_cache_close(s)) { | ||
3210 | printk(KERN_ERR "SLUB %s: %s called for cache that " | ||
3211 | "still has objects.\n", s->name, __func__); | ||
3212 | dump_stack(); | ||
3213 | } | ||
3214 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
3215 | rcu_barrier(); | ||
3216 | sysfs_slab_remove(s); | 3212 | sysfs_slab_remove(s); |
3217 | } else | 3213 | |
3218 | mutex_unlock(&slab_mutex); | 3214 | return rc; |
3219 | } | 3215 | } |
3220 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
3221 | 3216 | ||
3222 | /******************************************************************** | 3217 | /******************************************************************** |
3223 | * Kmalloc subsystem | 3218 | * Kmalloc subsystem |
@@ -3226,8 +3221,6 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
3226 | struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; | 3221 | struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; |
3227 | EXPORT_SYMBOL(kmalloc_caches); | 3222 | EXPORT_SYMBOL(kmalloc_caches); |
3228 | 3223 | ||
3229 | static struct kmem_cache *kmem_cache; | ||
3230 | |||
3231 | #ifdef CONFIG_ZONE_DMA | 3224 | #ifdef CONFIG_ZONE_DMA |
3232 | static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; | 3225 | static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; |
3233 | #endif | 3226 | #endif |
@@ -3273,14 +3266,17 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, | |||
3273 | { | 3266 | { |
3274 | struct kmem_cache *s; | 3267 | struct kmem_cache *s; |
3275 | 3268 | ||
3276 | s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | 3269 | s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); |
3270 | |||
3271 | s->name = name; | ||
3272 | s->size = s->object_size = size; | ||
3273 | s->align = ARCH_KMALLOC_MINALIGN; | ||
3277 | 3274 | ||
3278 | /* | 3275 | /* |
3279 | * This function is called with IRQs disabled during early-boot on | 3276 | * This function is called with IRQs disabled during early-boot on |
3280 | * single CPU so there's no need to take slab_mutex here. | 3277 | * single CPU so there's no need to take slab_mutex here. |
3281 | */ | 3278 | */ |
3282 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, | 3279 | if (kmem_cache_open(s, flags)) |
3283 | flags, NULL)) | ||
3284 | goto panic; | 3280 | goto panic; |
3285 | 3281 | ||
3286 | list_add(&s->list, &slab_caches); | 3282 | list_add(&s->list, &slab_caches); |
@@ -3362,7 +3358,7 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
3362 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 3358 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
3363 | return s; | 3359 | return s; |
3364 | 3360 | ||
3365 | ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); | 3361 | ret = slab_alloc(s, flags, _RET_IP_); |
3366 | 3362 | ||
3367 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3363 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
3368 | 3364 | ||
@@ -3405,7 +3401,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3405 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 3401 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
3406 | return s; | 3402 | return s; |
3407 | 3403 | ||
3408 | ret = slab_alloc(s, flags, node, _RET_IP_); | 3404 | ret = slab_alloc_node(s, flags, node, _RET_IP_); |
3409 | 3405 | ||
3410 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3406 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
3411 | 3407 | ||
@@ -3482,7 +3478,7 @@ void kfree(const void *x) | |||
3482 | if (unlikely(!PageSlab(page))) { | 3478 | if (unlikely(!PageSlab(page))) { |
3483 | BUG_ON(!PageCompound(page)); | 3479 | BUG_ON(!PageCompound(page)); |
3484 | kmemleak_free(x); | 3480 | kmemleak_free(x); |
3485 | put_page(page); | 3481 | __free_pages(page, compound_order(page)); |
3486 | return; | 3482 | return; |
3487 | } | 3483 | } |
3488 | slab_free(page->slab, page, object, _RET_IP_); | 3484 | slab_free(page->slab, page, object, _RET_IP_); |
@@ -3719,12 +3715,12 @@ void __init kmem_cache_init(void) | |||
3719 | slub_max_order = 0; | 3715 | slub_max_order = 0; |
3720 | 3716 | ||
3721 | kmem_size = offsetof(struct kmem_cache, node) + | 3717 | kmem_size = offsetof(struct kmem_cache, node) + |
3722 | nr_node_ids * sizeof(struct kmem_cache_node *); | 3718 | nr_node_ids * sizeof(struct kmem_cache_node *); |
3723 | 3719 | ||
3724 | /* Allocate two kmem_caches from the page allocator */ | 3720 | /* Allocate two kmem_caches from the page allocator */ |
3725 | kmalloc_size = ALIGN(kmem_size, cache_line_size()); | 3721 | kmalloc_size = ALIGN(kmem_size, cache_line_size()); |
3726 | order = get_order(2 * kmalloc_size); | 3722 | order = get_order(2 * kmalloc_size); |
3727 | kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); | 3723 | kmem_cache = (void *)__get_free_pages(GFP_NOWAIT | __GFP_ZERO, order); |
3728 | 3724 | ||
3729 | /* | 3725 | /* |
3730 | * Must first have the slab cache available for the allocations of the | 3726 | * Must first have the slab cache available for the allocations of the |
@@ -3733,9 +3729,10 @@ void __init kmem_cache_init(void) | |||
3733 | */ | 3729 | */ |
3734 | kmem_cache_node = (void *)kmem_cache + kmalloc_size; | 3730 | kmem_cache_node = (void *)kmem_cache + kmalloc_size; |
3735 | 3731 | ||
3736 | kmem_cache_open(kmem_cache_node, "kmem_cache_node", | 3732 | kmem_cache_node->name = "kmem_cache_node"; |
3737 | sizeof(struct kmem_cache_node), | 3733 | kmem_cache_node->size = kmem_cache_node->object_size = |
3738 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); | 3734 | sizeof(struct kmem_cache_node); |
3735 | kmem_cache_open(kmem_cache_node, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | ||
3739 | 3736 | ||
3740 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); | 3737 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); |
3741 | 3738 | ||
@@ -3743,8 +3740,10 @@ void __init kmem_cache_init(void) | |||
3743 | slab_state = PARTIAL; | 3740 | slab_state = PARTIAL; |
3744 | 3741 | ||
3745 | temp_kmem_cache = kmem_cache; | 3742 | temp_kmem_cache = kmem_cache; |
3746 | kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, | 3743 | kmem_cache->name = "kmem_cache"; |
3747 | 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); | 3744 | kmem_cache->size = kmem_cache->object_size = kmem_size; |
3745 | kmem_cache_open(kmem_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC); | ||
3746 | |||
3748 | kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); | 3747 | kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); |
3749 | memcpy(kmem_cache, temp_kmem_cache, kmem_size); | 3748 | memcpy(kmem_cache, temp_kmem_cache, kmem_size); |
3750 | 3749 | ||
@@ -3933,11 +3932,10 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3933 | return NULL; | 3932 | return NULL; |
3934 | } | 3933 | } |
3935 | 3934 | ||
3936 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | 3935 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, |
3937 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3936 | size_t align, unsigned long flags, void (*ctor)(void *)) |
3938 | { | 3937 | { |
3939 | struct kmem_cache *s; | 3938 | struct kmem_cache *s; |
3940 | char *n; | ||
3941 | 3939 | ||
3942 | s = find_mergeable(size, align, flags, name, ctor); | 3940 | s = find_mergeable(size, align, flags, name, ctor); |
3943 | if (s) { | 3941 | if (s) { |
@@ -3951,36 +3949,29 @@ struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | |||
3951 | 3949 | ||
3952 | if (sysfs_slab_alias(s, name)) { | 3950 | if (sysfs_slab_alias(s, name)) { |
3953 | s->refcount--; | 3951 | s->refcount--; |
3954 | return NULL; | 3952 | s = NULL; |
3955 | } | 3953 | } |
3956 | return s; | ||
3957 | } | 3954 | } |
3958 | 3955 | ||
3959 | n = kstrdup(name, GFP_KERNEL); | 3956 | return s; |
3960 | if (!n) | 3957 | } |
3961 | return NULL; | ||
3962 | 3958 | ||
3963 | s = kmalloc(kmem_size, GFP_KERNEL); | 3959 | int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) |
3964 | if (s) { | 3960 | { |
3965 | if (kmem_cache_open(s, n, | 3961 | int err; |
3966 | size, align, flags, ctor)) { | ||
3967 | int r; | ||
3968 | 3962 | ||
3969 | list_add(&s->list, &slab_caches); | 3963 | err = kmem_cache_open(s, flags); |
3970 | mutex_unlock(&slab_mutex); | 3964 | if (err) |
3971 | r = sysfs_slab_add(s); | 3965 | return err; |
3972 | mutex_lock(&slab_mutex); | ||
3973 | 3966 | ||
3974 | if (!r) | 3967 | mutex_unlock(&slab_mutex); |
3975 | return s; | 3968 | err = sysfs_slab_add(s); |
3969 | mutex_lock(&slab_mutex); | ||
3976 | 3970 | ||
3977 | list_del(&s->list); | 3971 | if (err) |
3978 | kmem_cache_close(s); | 3972 | kmem_cache_close(s); |
3979 | } | 3973 | |
3980 | kfree(s); | 3974 | return err; |
3981 | } | ||
3982 | kfree(n); | ||
3983 | return NULL; | ||
3984 | } | 3975 | } |
3985 | 3976 | ||
3986 | #ifdef CONFIG_SMP | 3977 | #ifdef CONFIG_SMP |
@@ -4033,7 +4024,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) | |||
4033 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 4024 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
4034 | return s; | 4025 | return s; |
4035 | 4026 | ||
4036 | ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); | 4027 | ret = slab_alloc(s, gfpflags, caller); |
4037 | 4028 | ||
4038 | /* Honor the call site pointer we received. */ | 4029 | /* Honor the call site pointer we received. */ |
4039 | trace_kmalloc(caller, ret, size, s->size, gfpflags); | 4030 | trace_kmalloc(caller, ret, size, s->size, gfpflags); |
@@ -4063,7 +4054,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
4063 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 4054 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
4064 | return s; | 4055 | return s; |
4065 | 4056 | ||
4066 | ret = slab_alloc(s, gfpflags, node, caller); | 4057 | ret = slab_alloc_node(s, gfpflags, node, caller); |
4067 | 4058 | ||
4068 | /* Honor the call site pointer we received. */ | 4059 | /* Honor the call site pointer we received. */ |
4069 | trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); | 4060 | trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); |
@@ -5210,14 +5201,6 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
5210 | return err; | 5201 | return err; |
5211 | } | 5202 | } |
5212 | 5203 | ||
5213 | static void kmem_cache_release(struct kobject *kobj) | ||
5214 | { | ||
5215 | struct kmem_cache *s = to_slab(kobj); | ||
5216 | |||
5217 | kfree(s->name); | ||
5218 | kfree(s); | ||
5219 | } | ||
5220 | |||
5221 | static const struct sysfs_ops slab_sysfs_ops = { | 5204 | static const struct sysfs_ops slab_sysfs_ops = { |
5222 | .show = slab_attr_show, | 5205 | .show = slab_attr_show, |
5223 | .store = slab_attr_store, | 5206 | .store = slab_attr_store, |
@@ -5225,7 +5208,6 @@ static const struct sysfs_ops slab_sysfs_ops = { | |||
5225 | 5208 | ||
5226 | static struct kobj_type slab_ktype = { | 5209 | static struct kobj_type slab_ktype = { |
5227 | .sysfs_ops = &slab_sysfs_ops, | 5210 | .sysfs_ops = &slab_sysfs_ops, |
5228 | .release = kmem_cache_release | ||
5229 | }; | 5211 | }; |
5230 | 5212 | ||
5231 | static int uevent_filter(struct kset *kset, struct kobject *kobj) | 5213 | static int uevent_filter(struct kset *kset, struct kobject *kobj) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page) | |||
446 | } | 446 | } |
447 | EXPORT_SYMBOL(mark_page_accessed); | 447 | EXPORT_SYMBOL(mark_page_accessed); |
448 | 448 | ||
449 | /* | ||
450 | * Order of operations is important: flush the pagevec when it's already | ||
451 | * full, not when adding the last page, to make sure that last page is | ||
452 | * not added to the LRU directly when passed to this function. Because | ||
453 | * mark_page_accessed() (called after this when writing) only activates | ||
454 | * pages that are on the LRU, linear writes in subpage chunks would see | ||
455 | * every PAGEVEC_SIZE page activated, which is unexpected. | ||
456 | */ | ||
449 | void __lru_cache_add(struct page *page, enum lru_list lru) | 457 | void __lru_cache_add(struct page *page, enum lru_list lru) |
450 | { | 458 | { |
451 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; | 459 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; |
452 | 460 | ||
453 | page_cache_get(page); | 461 | page_cache_get(page); |
454 | if (!pagevec_add(pvec, page)) | 462 | if (!pagevec_space(pvec)) |
455 | __pagevec_lru_add(pvec, lru); | 463 | __pagevec_lru_add(pvec, lru); |
464 | pagevec_add(pvec, page); | ||
456 | put_cpu_var(lru_add_pvecs); | 465 | put_cpu_var(lru_add_pvecs); |
457 | } | 466 | } |
458 | EXPORT_SYMBOL(__lru_cache_add); | 467 | EXPORT_SYMBOL(__lru_cache_add); |
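The new comment and the pagevec_space()/pagevec_add() reordering above flush the per-CPU batch only when it is already full, before adding the new page, so the page just added always stays batched rather than going straight to the LRU. A userspace sketch of that ordering with an invented fixed-size batch:

#include <stdio.h>

#define BATCH   4

static int batch[BATCH];
static int batch_len;

static void drain(void)
{
        printf("draining %d items\n", batch_len);
        batch_len = 0;
}

static void batch_add(int item)
{
        if (batch_len == BATCH)         /* flush the old items first... */
                drain();
        batch[batch_len++] = item;      /* ...so the new one is always kept */
}

int main(void)
{
        for (int i = 0; i < 10; i++)
                batch_add(i);
        printf("%d items still batched\n", batch_len);
        return 0;
}

Flushing after adding the last item would instead push the newest item out immediately, which is the behaviour the comment warns about.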
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
742 | 751 | ||
743 | SetPageLRU(page_tail); | 752 | SetPageLRU(page_tail); |
744 | 753 | ||
745 | if (page_evictable(page_tail, NULL)) { | 754 | if (page_evictable(page_tail)) { |
746 | if (PageActive(page)) { | 755 | if (PageActive(page)) { |
747 | SetPageActive(page_tail); | 756 | SetPageActive(page_tail); |
748 | active = 1; | 757 | active = 1; |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 14e254c768fc..71cd288b2001 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1483,7 +1483,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1483 | struct file *swap_file, *victim; | 1483 | struct file *swap_file, *victim; |
1484 | struct address_space *mapping; | 1484 | struct address_space *mapping; |
1485 | struct inode *inode; | 1485 | struct inode *inode; |
1486 | char *pathname; | 1486 | struct filename *pathname; |
1487 | int oom_score_adj; | 1487 | int oom_score_adj; |
1488 | int i, type, prev; | 1488 | int i, type, prev; |
1489 | int err; | 1489 | int err; |
@@ -1498,8 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1498 | if (IS_ERR(pathname)) | 1498 | if (IS_ERR(pathname)) |
1499 | goto out; | 1499 | goto out; |
1500 | 1500 | ||
1501 | victim = filp_open(pathname, O_RDWR|O_LARGEFILE, 0); | 1501 | victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); |
1502 | putname(pathname); | ||
1503 | err = PTR_ERR(victim); | 1502 | err = PTR_ERR(victim); |
1504 | if (IS_ERR(victim)) | 1503 | if (IS_ERR(victim)) |
1505 | goto out; | 1504 | goto out; |
@@ -1936,7 +1935,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
1936 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1935 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
1937 | { | 1936 | { |
1938 | struct swap_info_struct *p; | 1937 | struct swap_info_struct *p; |
1939 | char *name; | 1938 | struct filename *name; |
1940 | struct file *swap_file = NULL; | 1939 | struct file *swap_file = NULL; |
1941 | struct address_space *mapping; | 1940 | struct address_space *mapping; |
1942 | int i; | 1941 | int i; |
@@ -1967,7 +1966,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1967 | name = NULL; | 1966 | name = NULL; |
1968 | goto bad_swap; | 1967 | goto bad_swap; |
1969 | } | 1968 | } |
1970 | swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0); | 1969 | swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0); |
1971 | if (IS_ERR(swap_file)) { | 1970 | if (IS_ERR(swap_file)) { |
1972 | error = PTR_ERR(swap_file); | 1971 | error = PTR_ERR(swap_file); |
1973 | swap_file = NULL; | 1972 | swap_file = NULL; |
@@ -2053,7 +2052,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2053 | 2052 | ||
2054 | printk(KERN_INFO "Adding %uk swap on %s. " | 2053 | printk(KERN_INFO "Adding %uk swap on %s. " |
2055 | "Priority:%d extents:%d across:%lluk %s%s%s\n", | 2054 | "Priority:%d extents:%d across:%lluk %s%s%s\n", |
2056 | p->pages<<(PAGE_SHIFT-10), name, p->prio, | 2055 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
2057 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2056 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2058 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2057 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2059 | (p->flags & SWP_DISCARDABLE) ? "D" : "", | 2058 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
diff --git a/mm/truncate.c b/mm/truncate.c index 75801acdaac7..d51ce92d6e83 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
107 | 107 | ||
108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 108 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
109 | 109 | ||
110 | clear_page_mlock(page); | ||
111 | ClearPageMappedToDisk(page); | 110 | ClearPageMappedToDisk(page); |
112 | delete_from_page_cache(page); | 111 | delete_from_page_cache(page); |
113 | return 0; | 112 | return 0; |
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
132 | if (page_has_private(page) && !try_to_release_page(page, 0)) | 131 | if (page_has_private(page) && !try_to_release_page(page, 0)) |
133 | return 0; | 132 | return 0; |
134 | 133 | ||
135 | clear_page_mlock(page); | ||
136 | ret = remove_mapping(mapping, page); | 134 | ret = remove_mapping(mapping, page); |
137 | 135 | ||
138 | return ret; | 136 | return ret; |
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
398 | if (PageDirty(page)) | 396 | if (PageDirty(page)) |
399 | goto failed; | 397 | goto failed; |
400 | 398 | ||
401 | clear_page_mlock(page); | ||
402 | BUG_ON(page_has_private(page)); | 399 | BUG_ON(page_has_private(page)); |
403 | __delete_from_page_cache(page); | 400 | __delete_from_page_cache(page); |
404 | spin_unlock_irq(&mapping->tree_lock); | 401 | spin_unlock_irq(&mapping->tree_lock); |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -105,6 +105,25 @@ void *memdup_user(const void __user *src, size_t len) | |||
105 | } | 105 | } |
106 | EXPORT_SYMBOL(memdup_user); | 106 | EXPORT_SYMBOL(memdup_user); |
107 | 107 | ||
108 | static __always_inline void *__do_krealloc(const void *p, size_t new_size, | ||
109 | gfp_t flags) | ||
110 | { | ||
111 | void *ret; | ||
112 | size_t ks = 0; | ||
113 | |||
114 | if (p) | ||
115 | ks = ksize(p); | ||
116 | |||
117 | if (ks >= new_size) | ||
118 | return (void *)p; | ||
119 | |||
120 | ret = kmalloc_track_caller(new_size, flags); | ||
121 | if (ret && p) | ||
122 | memcpy(ret, p, ks); | ||
123 | |||
124 | return ret; | ||
125 | } | ||
126 | |||
108 | /** | 127 | /** |
109 | * __krealloc - like krealloc() but don't free @p. | 128 | * __krealloc - like krealloc() but don't free @p. |
110 | * @p: object to reallocate memory for. | 129 | * @p: object to reallocate memory for. |
@@ -117,23 +136,11 @@ EXPORT_SYMBOL(memdup_user); | |||
117 | */ | 136 | */ |
118 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) | 137 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) |
119 | { | 138 | { |
120 | void *ret; | ||
121 | size_t ks = 0; | ||
122 | |||
123 | if (unlikely(!new_size)) | 139 | if (unlikely(!new_size)) |
124 | return ZERO_SIZE_PTR; | 140 | return ZERO_SIZE_PTR; |
125 | 141 | ||
126 | if (p) | 142 | return __do_krealloc(p, new_size, flags); |
127 | ks = ksize(p); | ||
128 | 143 | ||
129 | if (ks >= new_size) | ||
130 | return (void *)p; | ||
131 | |||
132 | ret = kmalloc_track_caller(new_size, flags); | ||
133 | if (ret && p) | ||
134 | memcpy(ret, p, ks); | ||
135 | |||
136 | return ret; | ||
137 | } | 144 | } |
138 | EXPORT_SYMBOL(__krealloc); | 145 | EXPORT_SYMBOL(__krealloc); |
139 | 146 | ||
@@ -157,7 +164,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
157 | return ZERO_SIZE_PTR; | 164 | return ZERO_SIZE_PTR; |
158 | } | 165 | } |
159 | 166 | ||
160 | ret = __krealloc(p, new_size, flags); | 167 | ret = __do_krealloc(p, new_size, flags); |
161 | if (ret && p != ret) | 168 | if (ret && p != ret) |
162 | kfree(p); | 169 | kfree(p); |
163 | 170 | ||
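The mm/util.c hunks above move the shared body of __krealloc() and krealloc() into a single __do_krealloc() helper; the two exported functions then differ only in whether the old buffer is freed. A userspace sketch of that factoring, assuming an explicit old_size because ksize() has no portable equivalent (do_grow, grow_keep and grow are invented names):

#include <stdlib.h>
#include <string.h>

static void *do_grow(const void *p, size_t old_size, size_t new_size)
{
        void *ret;

        if (old_size >= new_size)
                return (void *)p;       /* existing buffer is big enough */

        ret = malloc(new_size);
        if (ret && p)
                memcpy(ret, p, old_size);
        return ret;
}

/* grow but never free the old buffer; the caller still owns it */
static void *grow_keep(const void *p, size_t old_size, size_t new_size)
{
        return do_grow(p, old_size, new_size);
}

/* grow and free the old buffer when a new one was actually handed out */
static void *grow(void *p, size_t old_size, size_t new_size)
{
        void *ret = do_grow(p, old_size, new_size);

        if (ret && p != ret)
                free(p);
        return ret;
}

int main(void)
{
        char *a = calloc(1, 8);
        char *b;

        if (!a)
                return 1;
        strcpy(a, "hello");
        b = grow(a, 8, 64);             /* frees a iff a larger block was used */
        b = grow_keep(b, 64, 16);       /* 64 >= 16: same buffer comes back */
        free(b);
        return 0;
}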
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2bb90b1d241c..78e08300db21 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
2163 | usize -= PAGE_SIZE; | 2163 | usize -= PAGE_SIZE; |
2164 | } while (usize > 0); | 2164 | } while (usize > 0); |
2165 | 2165 | ||
2166 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 2166 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
2167 | vma->vm_flags |= VM_RESERVED; | ||
2168 | 2167 | ||
2169 | return 0; | 2168 | return 0; |
2170 | } | 2169 | } |
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p) | |||
2572 | { | 2571 | { |
2573 | struct vm_struct *v = p; | 2572 | struct vm_struct *v = p; |
2574 | 2573 | ||
2575 | seq_printf(m, "0x%p-0x%p %7ld", | 2574 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2576 | v->addr, v->addr + v->size, v->size); | 2575 | v->addr, v->addr + v->size, v->size); |
2577 | 2576 | ||
2578 | if (v->caller) | 2577 | if (v->caller) |
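The first vmalloc.c hunk drops the VM_RESERVED flag, which this merge retires across the tree, and has remap_vmalloc_range() set VM_DONTEXPAND | VM_DONTDUMP on the VMA explicitly. A hedged sketch of a driver mmap handler built on this API; the driver name and the assumption that private_data holds a vmalloc()ed buffer are illustrative, not code from this merge.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Hypothetical character-device mmap: remap_vmalloc_range() now marks the
 * VMA VM_DONTEXPAND | VM_DONTDUMP itself, so the driver sets no extra flags. */
static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	void *buf = file->private_data;		/* assumed: vmalloc()ed at open time */

	return remap_vmalloc_range(vma, buf, 0);
}

The second hunk switches the /proc/vmallocinfo formatting from %p to %pK, which prints zeroed addresses to unprivileged readers when kptr_restrict is enabled rather than exposing raw kernel pointers.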
diff --git a/mm/vmscan.c b/mm/vmscan.c index 99b434b674c0..2624edcfb420 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page) | |||
553 | redo: | 553 | redo: |
554 | ClearPageUnevictable(page); | 554 | ClearPageUnevictable(page); |
555 | 555 | ||
556 | if (page_evictable(page, NULL)) { | 556 | if (page_evictable(page)) { |
557 | /* | 557 | /* |
558 | * For evictable pages, we can use the cache. | 558 | * For evictable pages, we can use the cache. |
559 | * In event of a race, worst case is we end up with an | 559 | * In event of a race, worst case is we end up with an |
@@ -587,7 +587,7 @@ redo: | |||
587 | * page is on unevictable list, it never be freed. To avoid that, | 587 | * page is on unevictable list, it never be freed. To avoid that, |
588 | * check after we added it to the list, again. | 588 | * check after we added it to the list, again. |
589 | */ | 589 | */ |
590 | if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { | 590 | if (lru == LRU_UNEVICTABLE && page_evictable(page)) { |
591 | if (!isolate_lru_page(page)) { | 591 | if (!isolate_lru_page(page)) { |
592 | put_page(page); | 592 | put_page(page); |
593 | goto redo; | 593 | goto redo; |
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page, | |||
674 | static unsigned long shrink_page_list(struct list_head *page_list, | 674 | static unsigned long shrink_page_list(struct list_head *page_list, |
675 | struct zone *zone, | 675 | struct zone *zone, |
676 | struct scan_control *sc, | 676 | struct scan_control *sc, |
677 | enum ttu_flags ttu_flags, | ||
677 | unsigned long *ret_nr_dirty, | 678 | unsigned long *ret_nr_dirty, |
678 | unsigned long *ret_nr_writeback) | 679 | unsigned long *ret_nr_writeback, |
680 | bool force_reclaim) | ||
679 | { | 681 | { |
680 | LIST_HEAD(ret_pages); | 682 | LIST_HEAD(ret_pages); |
681 | LIST_HEAD(free_pages); | 683 | LIST_HEAD(free_pages); |
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
689 | 691 | ||
690 | mem_cgroup_uncharge_start(); | 692 | mem_cgroup_uncharge_start(); |
691 | while (!list_empty(page_list)) { | 693 | while (!list_empty(page_list)) { |
692 | enum page_references references; | ||
693 | struct address_space *mapping; | 694 | struct address_space *mapping; |
694 | struct page *page; | 695 | struct page *page; |
695 | int may_enter_fs; | 696 | int may_enter_fs; |
697 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | ||
696 | 698 | ||
697 | cond_resched(); | 699 | cond_resched(); |
698 | 700 | ||
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
707 | 709 | ||
708 | sc->nr_scanned++; | 710 | sc->nr_scanned++; |
709 | 711 | ||
710 | if (unlikely(!page_evictable(page, NULL))) | 712 | if (unlikely(!page_evictable(page))) |
711 | goto cull_mlocked; | 713 | goto cull_mlocked; |
712 | 714 | ||
713 | if (!sc->may_unmap && page_mapped(page)) | 715 | if (!sc->may_unmap && page_mapped(page)) |
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
758 | wait_on_page_writeback(page); | 760 | wait_on_page_writeback(page); |
759 | } | 761 | } |
760 | 762 | ||
761 | references = page_check_references(page, sc); | 763 | if (!force_reclaim) |
764 | references = page_check_references(page, sc); | ||
765 | |||
762 | switch (references) { | 766 | switch (references) { |
763 | case PAGEREF_ACTIVATE: | 767 | case PAGEREF_ACTIVATE: |
764 | goto activate_locked; | 768 | goto activate_locked; |
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
788 | * processes. Try to unmap it here. | 792 | * processes. Try to unmap it here. |
789 | */ | 793 | */ |
790 | if (page_mapped(page) && mapping) { | 794 | if (page_mapped(page) && mapping) { |
791 | switch (try_to_unmap(page, TTU_UNMAP)) { | 795 | switch (try_to_unmap(page, ttu_flags)) { |
792 | case SWAP_FAIL: | 796 | case SWAP_FAIL: |
793 | goto activate_locked; | 797 | goto activate_locked; |
794 | case SWAP_AGAIN: | 798 | case SWAP_AGAIN: |
@@ -960,6 +964,33 @@ keep: | |||
960 | return nr_reclaimed; | 964 | return nr_reclaimed; |
961 | } | 965 | } |
962 | 966 | ||
967 | unsigned long reclaim_clean_pages_from_list(struct zone *zone, | ||
968 | struct list_head *page_list) | ||
969 | { | ||
970 | struct scan_control sc = { | ||
971 | .gfp_mask = GFP_KERNEL, | ||
972 | .priority = DEF_PRIORITY, | ||
973 | .may_unmap = 1, | ||
974 | }; | ||
975 | unsigned long ret, dummy1, dummy2; | ||
976 | struct page *page, *next; | ||
977 | LIST_HEAD(clean_pages); | ||
978 | |||
979 | list_for_each_entry_safe(page, next, page_list, lru) { | ||
980 | if (page_is_file_cache(page) && !PageDirty(page)) { | ||
981 | ClearPageActive(page); | ||
982 | list_move(&page->lru, &clean_pages); | ||
983 | } | ||
984 | } | ||
985 | |||
986 | ret = shrink_page_list(&clean_pages, zone, &sc, | ||
987 | TTU_UNMAP|TTU_IGNORE_ACCESS, | ||
988 | &dummy1, &dummy2, true); | ||
989 | list_splice(&clean_pages, page_list); | ||
990 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | ||
991 | return ret; | ||
992 | } | ||
993 | |||
963 | /* | 994 | /* |
964 | * Attempt to remove the specified page from its LRU. Only take this page | 995 | * Attempt to remove the specified page from its LRU. Only take this page |
965 | * if it is of the appropriate PageActive status. Pages which are being | 996 | * if it is of the appropriate PageActive status. Pages which are being |
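The shrink_page_list() changes above (an explicit ttu_flags argument and a force_reclaim flag that bypasses page_check_references()) exist to support the new reclaim_clean_pages_from_list(): clean file-backed pages on an isolated list can be reclaimed on the spot, ignoring recent references, rather than being migrated. The caller added by this merge lives outside this excerpt, in the contiguous-allocation path; the sketch below only shows the shape such a caller might take, and both helper names are assumptions.

#include <linux/mm.h>
#include <linux/printk.h>

/* Hedged sketch of a CMA-style user of the new helper. */
static int drain_isolated_range(struct zone *zone, struct list_head *pages)
{
	unsigned long reclaimed;

	/* Clean file pages are simply dropped and accounted... */
	reclaimed = reclaim_clean_pages_from_list(zone, pages);
	pr_debug("dropped %lu clean pages before migration\n", reclaimed);

	/* ...whatever is still on the list has to be migrated instead. */
	if (list_empty(pages))
		return 0;
	return migrate_isolated_pages(pages);	/* hypothetical migration step */
}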
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) | |||
978 | if (!PageLRU(page)) | 1009 | if (!PageLRU(page)) |
979 | return ret; | 1010 | return ret; |
980 | 1011 | ||
981 | /* Do not give back unevictable pages for compaction */ | 1012 | /* Compaction should not handle unevictable pages but CMA can do so */ |
982 | if (PageUnevictable(page)) | 1013 | if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) |
983 | return ret; | 1014 | return ret; |
984 | 1015 | ||
985 | ret = -EBUSY; | 1016 | ret = -EBUSY; |
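The relaxed check above lets a caller opt in to isolating unevictable (e.g. mlocked) pages by setting ISOLATE_UNEVICTABLE in its isolation mode, while compaction, which never sets that bit, keeps skipping them. A hedged sketch of the two call shapes; the predicate and list names are placeholders.

isolate_mode_t mode = 0;			/* default: unevictable pages are skipped */

if (isolating_for_cma)				/* hypothetical: CMA may take anything movable */
	mode |= ISOLATE_UNEVICTABLE;

if (__isolate_lru_page(page, mode) == 0)	/* 0 means the page was taken off the LRU */
	list_add(&page->lru, &migrate_list);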
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) | |||
1186 | 1217 | ||
1187 | VM_BUG_ON(PageLRU(page)); | 1218 | VM_BUG_ON(PageLRU(page)); |
1188 | list_del(&page->lru); | 1219 | list_del(&page->lru); |
1189 | if (unlikely(!page_evictable(page, NULL))) { | 1220 | if (unlikely(!page_evictable(page))) { |
1190 | spin_unlock_irq(&zone->lru_lock); | 1221 | spin_unlock_irq(&zone->lru_lock); |
1191 | putback_lru_page(page); | 1222 | putback_lru_page(page); |
1192 | spin_lock_irq(&zone->lru_lock); | 1223 | spin_lock_irq(&zone->lru_lock); |
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1278 | if (nr_taken == 0) | 1309 | if (nr_taken == 0) |
1279 | return 0; | 1310 | return 0; |
1280 | 1311 | ||
1281 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, | 1312 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, |
1282 | &nr_dirty, &nr_writeback); | 1313 | &nr_dirty, &nr_writeback, false); |
1283 | 1314 | ||
1284 | spin_lock_irq(&zone->lru_lock); | 1315 | spin_lock_irq(&zone->lru_lock); |
1285 | 1316 | ||
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1439 | page = lru_to_page(&l_hold); | 1470 | page = lru_to_page(&l_hold); |
1440 | list_del(&page->lru); | 1471 | list_del(&page->lru); |
1441 | 1472 | ||
1442 | if (unlikely(!page_evictable(page, NULL))) { | 1473 | if (unlikely(!page_evictable(page))) { |
1443 | putback_lru_page(page); | 1474 | putback_lru_page(page); |
1444 | continue; | 1475 | continue; |
1445 | } | 1476 | } |
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
1729 | return false; | 1760 | return false; |
1730 | } | 1761 | } |
1731 | 1762 | ||
1763 | #ifdef CONFIG_COMPACTION | ||
1764 | /* | ||
1765 | * If compaction is deferred for sc->order then scale the number of pages | ||
1766 | * reclaimed based on the number of consecutive allocation failures | ||
1767 | */ | ||
1768 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1769 | struct lruvec *lruvec, struct scan_control *sc) | ||
1770 | { | ||
1771 | struct zone *zone = lruvec_zone(lruvec); | ||
1772 | |||
1773 | if (zone->compact_order_failed <= sc->order) | ||
1774 | pages_for_compaction <<= zone->compact_defer_shift; | ||
1775 | return pages_for_compaction; | ||
1776 | } | ||
1777 | #else | ||
1778 | static unsigned long scale_for_compaction(unsigned long pages_for_compaction, | ||
1779 | struct lruvec *lruvec, struct scan_control *sc) | ||
1780 | { | ||
1781 | return pages_for_compaction; | ||
1782 | } | ||
1783 | #endif | ||
1784 | |||
1732 | /* | 1785 | /* |
1733 | * Reclaim/compaction is used for high-order allocation requests. It reclaims | 1786 | * Reclaim/compaction is used for high-order allocation requests. It reclaims |
1734 | * order-0 pages before compacting the zone. should_continue_reclaim() returns | 1787 | * order-0 pages before compacting the zone. should_continue_reclaim() returns |
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, | |||
1776 | * inactive lists are large enough, continue reclaiming | 1829 | * inactive lists are large enough, continue reclaiming |
1777 | */ | 1830 | */ |
1778 | pages_for_compaction = (2UL << sc->order); | 1831 | pages_for_compaction = (2UL << sc->order); |
1832 | |||
1833 | pages_for_compaction = scale_for_compaction(pages_for_compaction, | ||
1834 | lruvec, sc); | ||
1779 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); | 1835 | inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); |
1780 | if (nr_swap_pages > 0) | 1836 | if (nr_swap_pages > 0) |
1781 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1837 | inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); |
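To make the new scaling concrete: for an order-9 request (a 2 MB huge page on x86-64) the baseline is 2UL << 9 = 1024 pages. If compaction has already failed at this order, so zone->compact_order_failed <= sc->order holds, and repeated deferrals have pushed compact_defer_shift to 2, the target becomes 1024 << 2 = 4096 pages: reclaim keeps going until roughly four times the usual amount of inactive memory is available before compaction is attempted again.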
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2839 | */ | 2895 | */ |
2840 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2896 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2841 | 2897 | ||
2898 | /* | ||
2899 | * Compaction records what page blocks it recently failed to | ||
2900 | * isolate pages from and skips them in the future scanning. | ||
2901 | * When kswapd is going to sleep, it is reasonable to assume | ||
2902 | * that pages and compaction may succeed so reset the cache. | ||
2903 | */ | ||
2904 | reset_isolation_suitable(pgdat); | ||
2905 | |||
2842 | if (!kthread_should_stop()) | 2906 | if (!kthread_should_stop()) |
2843 | schedule(); | 2907 | schedule(); |
2844 | 2908 | ||
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid) | |||
3101 | if (IS_ERR(pgdat->kswapd)) { | 3165 | if (IS_ERR(pgdat->kswapd)) { |
3102 | /* failure at boot is fatal */ | 3166 | /* failure at boot is fatal */ |
3103 | BUG_ON(system_state == SYSTEM_BOOTING); | 3167 | BUG_ON(system_state == SYSTEM_BOOTING); |
3104 | printk("Failed to start kswapd on node %d\n",nid); | ||
3105 | pgdat->kswapd = NULL; | 3168 | pgdat->kswapd = NULL; |
3106 | ret = -1; | 3169 | pr_err("Failed to start kswapd on node %d\n", nid); |
3170 | ret = PTR_ERR(pgdat->kswapd); | ||
3107 | } | 3171 | } |
3108 | return ret; | 3172 | return ret; |
3109 | } | 3173 | } |
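Besides moving to pr_err(), the kswapd_run() hunk above returns the actual error encoded in the ERR_PTR from kthread_run() instead of a bare -1. The same pattern in isolation, as a hedged sketch with a placeholder thread function and name:

struct task_struct *t;
int ret = 0;

t = kthread_run(my_thread_fn, NULL, "mythread/%d", nid);	/* placeholders */
if (IS_ERR(t)) {
	ret = PTR_ERR(t);		/* propagate -ENOMEM and friends, not -1 */
	pr_err("Failed to start mythread on node %d\n", nid);
}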
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3350 | /* | 3414 | /* |
3351 | * page_evictable - test whether a page is evictable | 3415 | * page_evictable - test whether a page is evictable |
3352 | * @page: the page to test | 3416 | * @page: the page to test |
3353 | * @vma: the VMA in which the page is or will be mapped, may be NULL | ||
3354 | * | 3417 | * |
3355 | * Test whether page is evictable--i.e., should be placed on active/inactive | 3418 | * Test whether page is evictable--i.e., should be placed on active/inactive |
3356 | * lists vs unevictable list. The vma argument is !NULL when called from the | 3419 | * lists vs unevictable list. |
3357 | * fault path to determine how to instantate a new page. | ||
3358 | * | 3420 | * |
3359 | * Reasons page might not be evictable: | 3421 | * Reasons page might not be evictable: |
3360 | * (1) page's mapping marked unevictable | 3422 | * (1) page's mapping marked unevictable |
3361 | * (2) page is part of an mlocked VMA | 3423 | * (2) page is part of an mlocked VMA |
3362 | * | 3424 | * |
3363 | */ | 3425 | */ |
3364 | int page_evictable(struct page *page, struct vm_area_struct *vma) | 3426 | int page_evictable(struct page *page) |
3365 | { | 3427 | { |
3366 | 3428 | return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); | |
3367 | if (mapping_unevictable(page_mapping(page))) | ||
3368 | return 0; | ||
3369 | |||
3370 | if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) | ||
3371 | return 0; | ||
3372 | |||
3373 | return 1; | ||
3374 | } | 3429 | } |
3375 | 3430 | ||
3376 | #ifdef CONFIG_SHMEM | 3431 | #ifdef CONFIG_SHMEM |
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3408 | if (!PageLRU(page) || !PageUnevictable(page)) | 3463 | if (!PageLRU(page) || !PageUnevictable(page)) |
3409 | continue; | 3464 | continue; |
3410 | 3465 | ||
3411 | if (page_evictable(page, NULL)) { | 3466 | if (page_evictable(page)) { |
3412 | enum lru_list lru = page_lru_base_type(page); | 3467 | enum lru_list lru = page_lru_base_type(page); |
3413 | 3468 | ||
3414 | VM_BUG_ON(PageActive(page)); | 3469 | VM_BUG_ON(PageActive(page)); |
diff --git a/mm/vmstat.c b/mm/vmstat.c index df7a6748231d..c7370579111b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu) | |||
495 | atomic_long_add(global_diff[i], &vm_stat[i]); | 495 | atomic_long_add(global_diff[i], &vm_stat[i]); |
496 | } | 496 | } |
497 | 497 | ||
498 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) | ||
499 | { | ||
500 | int i; | ||
501 | |||
502 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
503 | if (pset->vm_stat_diff[i]) { | ||
504 | int v = pset->vm_stat_diff[i]; | ||
505 | pset->vm_stat_diff[i] = 0; | ||
506 | atomic_long_add(v, &zone->vm_stat[i]); | ||
507 | atomic_long_add(v, &vm_stat[i]); | ||
508 | } | ||
509 | } | ||
498 | #endif | 510 | #endif |
499 | 511 | ||
500 | #ifdef CONFIG_NUMA | 512 | #ifdef CONFIG_NUMA |
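drain_zonestat() folds one per-cpu pageset's pending vm_stat_diff deltas back into the zone and global counters. A hedged sketch of how a teardown path (for example CPU hot-unplug) might use it; the wrapper below is illustrative, not the call site added elsewhere in this merge.

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/percpu.h>

/* Fold a departing CPU's per-zone counter deltas into the global state
 * before its pagesets are discarded. */
static void fold_cpu_zonestats(int cpu)
{
	struct zone *zone;

	for_each_populated_zone(zone)
		drain_zonestat(zone, per_cpu_ptr(zone->pageset, cpu));
}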
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = { | |||
722 | "numa_other", | 734 | "numa_other", |
723 | #endif | 735 | #endif |
724 | "nr_anon_transparent_hugepages", | 736 | "nr_anon_transparent_hugepages", |
737 | "nr_free_cma", | ||
725 | "nr_dirty_threshold", | 738 | "nr_dirty_threshold", |
726 | "nr_dirty_background_threshold", | 739 | "nr_dirty_background_threshold", |
727 | 740 | ||
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = { | |||
781 | "unevictable_pgs_munlocked", | 794 | "unevictable_pgs_munlocked", |
782 | "unevictable_pgs_cleared", | 795 | "unevictable_pgs_cleared", |
783 | "unevictable_pgs_stranded", | 796 | "unevictable_pgs_stranded", |
784 | "unevictable_pgs_mlockfreed", | ||
785 | 797 | ||
786 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 798 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
787 | "thp_fault_alloc", | 799 | "thp_fault_alloc", |
@@ -1157,7 +1169,7 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
1157 | { | 1169 | { |
1158 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); | 1170 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); |
1159 | 1171 | ||
1160 | INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); | 1172 | INIT_DEFERRABLE_WORK(work, vmstat_update); |
1161 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); | 1173 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
1162 | } | 1174 | } |
1163 | 1175 | ||
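The final hunk is a mechanical rename from the workqueue API cleanup: INIT_DELAYED_WORK_DEFERRABLE() became INIT_DEFERRABLE_WORK() with no change in behaviour; a deferrable delayed work item still avoids waking an idle CPU just to run its timer. A minimal hedged sketch of the same initialise-and-schedule pattern outside vmstat, with a placeholder handler:

static void my_periodic_update(struct work_struct *work);	/* placeholder handler */
static DEFINE_PER_CPU(struct delayed_work, my_work);

static void start_my_timer(int cpu)
{
	struct delayed_work *dw = &per_cpu(my_work, cpu);

	INIT_DEFERRABLE_WORK(dw, my_periodic_update);
	schedule_delayed_work_on(cpu, dw, round_jiffies_relative(HZ));
}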