path: root/mm
author    Linus Torvalds <torvalds@linux-foundation.org>  2012-01-10 19:42:48 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2012-01-10 19:42:48 -0500
commit    40ba587923ae67090d9f141c1d3c951be5c1420e (patch)
tree      342a72fc0ee13a0d2496ef970b64dfeadf1355d2 /mm
parent    54c2c5761febcca46c8037d3a81612991e6c209a (diff)
parent    6b550f9495947fc279d12c38feaf98500e8d0646 (diff)
Merge branch 'akpm' (aka "Andrew's patch-bomb")
Andrew elucidates:

 - First installment of MM. We have a HUGE number of MM patches this time. It's crazy.
 - MAINTAINERS updates
 - backlight updates
 - leds
 - checkpatch updates
 - misc ELF stuff
 - rtc updates
 - reiserfs
 - procfs
 - some misc other bits

* akpm: (124 commits)
  user namespace: make signal.c respect user namespaces
  workqueue: make alloc_workqueue() take printf fmt and args for name
  procfs: add hidepid= and gid= mount options
  procfs: parse mount options
  procfs: introduce the /proc/<pid>/map_files/ directory
  procfs: make proc_get_link to use dentry instead of inode
  signal: add block_sigmask() for adding sigmask to current->blocked
  sparc: make SA_NOMASK a synonym of SA_NODEFER
  reiserfs: don't lock root inode searching
  reiserfs: don't lock journal_init()
  reiserfs: delay reiserfs lock until journal initialization
  reiserfs: delete comments referring to the BKL
  drivers/rtc/interface.c: fix alarm rollover when day or month is out-of-range
  drivers/rtc/rtc-twl.c: add DT support for RTC inside twl4030/twl6030
  drivers/rtc/: remove redundant spi driver bus initialization
  drivers/rtc/rtc-jz4740.c: make jz4740_rtc_driver static
  drivers/rtc/rtc-mc13xxx.c: make mc13xxx_rtc_idtable static
  rtc: convert drivers/rtc/* to use module_platform_driver()
  drivers/rtc/rtc-wm831x.c: convert to devm_kzalloc()
  drivers/rtc/rtc-wm831x.c: remove unused period IRQ handler
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig.debug       5
-rw-r--r--  mm/bootmem.c          24
-rw-r--r--  mm/compaction.c        4
-rw-r--r--  mm/fadvise.c           3
-rw-r--r--  mm/filemap.c           5
-rw-r--r--  mm/hugetlb.c          19
-rw-r--r--  mm/mempolicy.c        14
-rw-r--r--  mm/mempool.c         104
-rw-r--r--  mm/migrate.c          14
-rw-r--r--  mm/mmap.c             60
-rw-r--r--  mm/mremap.c            9
-rw-r--r--  mm/oom_kill.c          6
-rw-r--r--  mm/page-writeback.c  290
-rw-r--r--  mm/page_alloc.c      253
-rw-r--r--  mm/rmap.c             45
-rw-r--r--  mm/slub.c              3
-rw-r--r--  mm/swap.c             14
-rw-r--r--  mm/swapfile.c          6
-rw-r--r--  mm/vmalloc.c           8
-rw-r--r--  mm/vmscan.c           42
20 files changed, 615 insertions, 313 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 8b1a477162dc..4b2443254de2 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC
4 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC 4 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
5 depends on !KMEMCHECK 5 depends on !KMEMCHECK
6 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 6 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
7 select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
7 ---help--- 8 ---help---
8 Unmap pages from the kernel linear mapping after free_pages(). 9 Unmap pages from the kernel linear mapping after free_pages().
9 This results in a large slowdown, but helps to find certain types 10 This results in a large slowdown, but helps to find certain types
@@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS
22config PAGE_POISONING 23config PAGE_POISONING
23 bool 24 bool
24 select WANT_PAGE_DEBUG_FLAGS 25 select WANT_PAGE_DEBUG_FLAGS
26
27config PAGE_GUARD
28 bool
29 select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1a77012ecdb3..668e94df8cf2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
56 56
57static unsigned long __init bootmap_bytes(unsigned long pages) 57static unsigned long __init bootmap_bytes(unsigned long pages)
58{ 58{
59 unsigned long bytes = (pages + 7) / 8; 59 unsigned long bytes = DIV_ROUND_UP(pages, 8);
60 60
61 return ALIGN(bytes, sizeof(long)); 61 return ALIGN(bytes, sizeof(long));
62} 62}
@@ -171,7 +171,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
171 171
172static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 172static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
173{ 173{
174 int aligned;
175 struct page *page; 174 struct page *page;
176 unsigned long start, end, pages, count = 0; 175 unsigned long start, end, pages, count = 0;
177 176
@@ -181,14 +180,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
181 start = bdata->node_min_pfn; 180 start = bdata->node_min_pfn;
182 end = bdata->node_low_pfn; 181 end = bdata->node_low_pfn;
183 182
184 /* 183 bdebug("nid=%td start=%lx end=%lx\n",
185 * If the start is aligned to the machines wordsize, we might 184 bdata - bootmem_node_data, start, end);
186 * be able to free pages in bulks of that order.
187 */
188 aligned = !(start & (BITS_PER_LONG - 1));
189
190 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
191 bdata - bootmem_node_data, start, end, aligned);
192 185
193 while (start < end) { 186 while (start < end) {
194 unsigned long *map, idx, vec; 187 unsigned long *map, idx, vec;
@@ -196,12 +189,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
196 map = bdata->node_bootmem_map; 189 map = bdata->node_bootmem_map;
197 idx = start - bdata->node_min_pfn; 190 idx = start - bdata->node_min_pfn;
198 vec = ~map[idx / BITS_PER_LONG]; 191 vec = ~map[idx / BITS_PER_LONG];
199 192 /*
200 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { 193 * If we have a properly aligned and fully unreserved
194 * BITS_PER_LONG block of pages in front of us, free
195 * it in one go.
196 */
197 if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
201 int order = ilog2(BITS_PER_LONG); 198 int order = ilog2(BITS_PER_LONG);
202 199
203 __free_pages_bootmem(pfn_to_page(start), order); 200 __free_pages_bootmem(pfn_to_page(start), order);
204 count += BITS_PER_LONG; 201 count += BITS_PER_LONG;
202 start += BITS_PER_LONG;
205 } else { 203 } else {
206 unsigned long off = 0; 204 unsigned long off = 0;
207 205
@@ -214,8 +212,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
214 vec >>= 1; 212 vec >>= 1;
215 off++; 213 off++;
216 } 214 }
215 start = ALIGN(start + 1, BITS_PER_LONG);
217 } 216 }
218 start += BITS_PER_LONG;
219 } 217 }
220 218
221 page = virt_to_page(bdata->node_bootmem_map); 219 page = virt_to_page(bdata->node_bootmem_map);
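
The reworked free_all_bootmem_core() above frees a whole BITS_PER_LONG block of pages in a single high-order call whenever the current pfn is word-aligned and the corresponding bootmem-map word is fully unreserved, and falls back to single-page frees otherwise. Below is a minimal user-space sketch of that walk; the bitmap layout and the free_block()/free_page_one() helpers are illustrative stand-ins, not kernel interfaces.

#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG	(8 * (int)sizeof(unsigned long))
#define NPAGES		256	/* must be a multiple of BITS_PER_LONG */

static unsigned long bootmem_map[NPAGES / BITS_PER_LONG];	/* bit set = reserved */

static int ilog2_ul(unsigned long v)
{
	int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

static void free_block(unsigned long pfn, int order)
{
	printf("free pfn %3lu..%3lu as one order-%d block\n",
	       pfn, pfn + (1UL << order) - 1, order);
}

static void free_page_one(unsigned long pfn)
{
	printf("free pfn %3lu as a single page\n", pfn);
}

int main(void)
{
	unsigned long start = 0, end = NPAGES, count = 0;

	memset(bootmem_map, 0, sizeof(bootmem_map));
	bootmem_map[0] = ~0x28UL;	/* only pfn 3 and pfn 5 free in the first word */

	while (start < end) {
		/* one word of the map covers BITS_PER_LONG pages */
		unsigned long vec = ~bootmem_map[start / BITS_PER_LONG];

		if ((start % BITS_PER_LONG) == 0 && vec == ~0UL) {
			/* aligned and fully unreserved: free it in one go */
			free_block(start, ilog2_ul(BITS_PER_LONG));
			count += BITS_PER_LONG;
			start += BITS_PER_LONG;
		} else {
			unsigned long off = 0;

			while (vec && off < (unsigned long)BITS_PER_LONG) {
				if (vec & 1) {
					free_page_one(start + off);
					count++;
				}
				vec >>= 1;
				off++;
			}
			/* continue at the next word boundary */
			start = (start / BITS_PER_LONG + 1) * BITS_PER_LONG;
		}
	}
	printf("freed %lu of %d pages\n", count, NPAGES);
	return 0;
}

Only two pages of the first word are free here, so that word is handed out page by page, while the remaining fully free words go out as order-6 blocks on a 64-bit build.
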
diff --git a/mm/compaction.c b/mm/compaction.c
index 1253d7ac332b..e6670c34eb49 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -365,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
365 nr_isolated++; 365 nr_isolated++;
366 366
367 /* Avoid isolating too much */ 367 /* Avoid isolating too much */
368 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 368 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
369 ++low_pfn;
369 break; 370 break;
371 }
370 } 372 }
371 373
372 acct_isolated(zone, cc); 374 acct_isolated(zone, cc);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 8d723c9e8b75..469491e0af79 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
117 break; 117 break;
118 case POSIX_FADV_DONTNEED: 118 case POSIX_FADV_DONTNEED:
119 if (!bdi_write_congested(mapping->backing_dev_info)) 119 if (!bdi_write_congested(mapping->backing_dev_info))
120 filemap_flush(mapping); 120 __filemap_fdatawrite_range(mapping, offset, endbyte,
121 WB_SYNC_NONE);
121 122
122 /* First and last FULL page! */ 123 /* First and last FULL page! */
123 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 124 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
diff --git a/mm/filemap.c b/mm/filemap.c
index a0701e6eec10..c4ee2e918bea 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2351,8 +2351,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2351 pgoff_t index, unsigned flags) 2351 pgoff_t index, unsigned flags)
2352{ 2352{
2353 int status; 2353 int status;
2354 gfp_t gfp_mask;
2354 struct page *page; 2355 struct page *page;
2355 gfp_t gfp_notmask = 0; 2356 gfp_t gfp_notmask = 0;
2357
2358 gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE;
2356 if (flags & AOP_FLAG_NOFS) 2359 if (flags & AOP_FLAG_NOFS)
2357 gfp_notmask = __GFP_FS; 2360 gfp_notmask = __GFP_FS;
2358repeat: 2361repeat:
@@ -2360,7 +2363,7 @@ repeat:
2360 if (page) 2363 if (page)
2361 goto found; 2364 goto found;
2362 2365
2363 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2366 page = __page_cache_alloc(gfp_mask & ~gfp_notmask);
2364 if (!page) 2367 if (!page)
2365 return NULL; 2368 return NULL;
2366 status = add_to_page_cache_lru(page, mapping, index, 2369 status = add_to_page_cache_lru(page, mapping, index,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7acd12503f73..ea8c3a4cd2ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -800,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
800 800
801 if (page && arch_prepare_hugepage(page)) { 801 if (page && arch_prepare_hugepage(page)) {
802 __free_pages(page, huge_page_order(h)); 802 __free_pages(page, huge_page_order(h));
803 return NULL; 803 page = NULL;
804 } 804 }
805 805
806 spin_lock(&hugetlb_lock); 806 spin_lock(&hugetlb_lock);
@@ -2315,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2315 * from page cache lookup which is in HPAGE_SIZE units. 2315 * from page cache lookup which is in HPAGE_SIZE units.
2316 */ 2316 */
2317 address = address & huge_page_mask(h); 2317 address = address & huge_page_mask(h);
2318 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) 2318 pgoff = vma_hugecache_offset(h, vma, address);
2319 + (vma->vm_pgoff >> PAGE_SHIFT);
2320 mapping = (struct address_space *)page_private(page); 2319 mapping = (struct address_space *)page_private(page);
2321 2320
2322 /* 2321 /*
@@ -2349,6 +2348,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2349 2348
2350/* 2349/*
2351 * Hugetlb_cow() should be called with page lock of the original hugepage held. 2350 * Hugetlb_cow() should be called with page lock of the original hugepage held.
2351 * Called with hugetlb_instantiation_mutex held and pte_page locked so we
2352 * cannot race with other handlers or page migration.
2353 * Keep the pte_same checks anyway to make transition from the mutex easier.
2352 */ 2354 */
2353static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2355static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2354 unsigned long address, pte_t *ptep, pte_t pte, 2356 unsigned long address, pte_t *ptep, pte_t pte,
@@ -2408,7 +2410,14 @@ retry_avoidcopy:
2408 BUG_ON(page_count(old_page) != 1); 2410 BUG_ON(page_count(old_page) != 1);
2409 BUG_ON(huge_pte_none(pte)); 2411 BUG_ON(huge_pte_none(pte));
2410 spin_lock(&mm->page_table_lock); 2412 spin_lock(&mm->page_table_lock);
2411 goto retry_avoidcopy; 2413 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2414 if (likely(pte_same(huge_ptep_get(ptep), pte)))
2415 goto retry_avoidcopy;
2416 /*
2417 * race occurs while re-acquiring page_table_lock, and
2418 * our job is done.
2419 */
2420 return 0;
2412 } 2421 }
2413 WARN_ON_ONCE(1); 2422 WARN_ON_ONCE(1);
2414 } 2423 }
@@ -2630,6 +2639,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2630 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2639 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2631 struct hstate *h = hstate_vma(vma); 2640 struct hstate *h = hstate_vma(vma);
2632 2641
2642 address &= huge_page_mask(h);
2643
2633 ptep = huge_pte_offset(mm, address); 2644 ptep = huge_pte_offset(mm, address);
2634 if (ptep) { 2645 if (ptep) {
2635 entry = huge_ptep_get(ptep); 2646 entry = huge_ptep_get(ptep);
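
The retry_avoidcopy change above is the classic revalidate-after-relock pattern: once page_table_lock has been dropped and re-taken, the pte is looked up again and compared with pte_same() before the copy is retried, and if it changed the fault has already been handled elsewhere. A small single-threaded sketch of the same idea, with a pthread mutex standing in for page_table_lock and a plain integer for the pte (both stand-ins, not kernel APIs):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_table_lock = PTHREAD_MUTEX_INITIALIZER;
static int pte = 42;		/* stand-in for the pte sampled under the lock */

/* returns 1 if the copy-on-write should be retried, 0 if it is already done */
static int cow_slow_path(int snapshot)
{
	/* drop the lock for work that may sleep (allocation, copying, ...) */
	pthread_mutex_unlock(&page_table_lock);
	pte = 7;	/* simulate a concurrent fault handler winning the race */
	pthread_mutex_lock(&page_table_lock);

	/* re-lookup and compare, the moral equivalent of the pte_same() check */
	if (pte == snapshot) {
		printf("pte unchanged, retry the copy\n");
		return 1;
	}
	printf("pte changed while unlocked, our job is done\n");
	return 0;
}

int main(void)
{
	int snapshot, retry;

	pthread_mutex_lock(&page_table_lock);
	snapshot = pte;
	retry = cow_slow_path(snapshot);
	pthread_mutex_unlock(&page_table_lock);

	printf("retry = %d\n", retry);
	return 0;
}
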
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c3fdbcb17658..e3d58f088466 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1983,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1983} 1983}
1984 1984
1985/* Slow path of a mempolicy comparison */ 1985/* Slow path of a mempolicy comparison */
1986int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1986bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1987{ 1987{
1988 if (!a || !b) 1988 if (!a || !b)
1989 return 0; 1989 return false;
1990 if (a->mode != b->mode) 1990 if (a->mode != b->mode)
1991 return 0; 1991 return false;
1992 if (a->flags != b->flags) 1992 if (a->flags != b->flags)
1993 return 0; 1993 return false;
1994 if (mpol_store_user_nodemask(a)) 1994 if (mpol_store_user_nodemask(a))
1995 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) 1995 if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
1996 return 0; 1996 return false;
1997 1997
1998 switch (a->mode) { 1998 switch (a->mode) {
1999 case MPOL_BIND: 1999 case MPOL_BIND:
2000 /* Fall through */ 2000 /* Fall through */
2001 case MPOL_INTERLEAVE: 2001 case MPOL_INTERLEAVE:
2002 return nodes_equal(a->v.nodes, b->v.nodes); 2002 return !!nodes_equal(a->v.nodes, b->v.nodes);
2003 case MPOL_PREFERRED: 2003 case MPOL_PREFERRED:
2004 return a->v.preferred_node == b->v.preferred_node; 2004 return a->v.preferred_node == b->v.preferred_node;
2005 default: 2005 default:
2006 BUG(); 2006 BUG();
2007 return 0; 2007 return false;
2008 } 2008 }
2009} 2009}
2010 2010
diff --git a/mm/mempool.c b/mm/mempool.c
index e73641b79bb5..d9049811f352 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -27,7 +27,15 @@ static void *remove_element(mempool_t *pool)
27 return pool->elements[--pool->curr_nr]; 27 return pool->elements[--pool->curr_nr];
28} 28}
29 29
30static void free_pool(mempool_t *pool) 30/**
31 * mempool_destroy - deallocate a memory pool
32 * @pool: pointer to the memory pool which was allocated via
33 * mempool_create().
34 *
35 * Free all reserved elements in @pool and @pool itself. This function
36 * only sleeps if the free_fn() function sleeps.
37 */
38void mempool_destroy(mempool_t *pool)
31{ 39{
32 while (pool->curr_nr) { 40 while (pool->curr_nr) {
33 void *element = remove_element(pool); 41 void *element = remove_element(pool);
@@ -36,6 +44,7 @@ static void free_pool(mempool_t *pool)
36 kfree(pool->elements); 44 kfree(pool->elements);
37 kfree(pool); 45 kfree(pool);
38} 46}
47EXPORT_SYMBOL(mempool_destroy);
39 48
40/** 49/**
41 * mempool_create - create a memory pool 50 * mempool_create - create a memory pool
@@ -86,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
86 95
87 element = pool->alloc(GFP_KERNEL, pool->pool_data); 96 element = pool->alloc(GFP_KERNEL, pool->pool_data);
88 if (unlikely(!element)) { 97 if (unlikely(!element)) {
89 free_pool(pool); 98 mempool_destroy(pool);
90 return NULL; 99 return NULL;
91 } 100 }
92 add_element(pool, element); 101 add_element(pool, element);
@@ -172,23 +181,6 @@ out:
172EXPORT_SYMBOL(mempool_resize); 181EXPORT_SYMBOL(mempool_resize);
173 182
174/** 183/**
175 * mempool_destroy - deallocate a memory pool
176 * @pool: pointer to the memory pool which was allocated via
177 * mempool_create().
178 *
179 * this function only sleeps if the free_fn() function sleeps. The caller
180 * has to guarantee that all elements have been returned to the pool (ie:
181 * freed) prior to calling mempool_destroy().
182 */
183void mempool_destroy(mempool_t *pool)
184{
185 /* Check for outstanding elements */
186 BUG_ON(pool->curr_nr != pool->min_nr);
187 free_pool(pool);
188}
189EXPORT_SYMBOL(mempool_destroy);
190
191/**
192 * mempool_alloc - allocate an element from a specific memory pool 184 * mempool_alloc - allocate an element from a specific memory pool
193 * @pool: pointer to the memory pool which was allocated via 185 * @pool: pointer to the memory pool which was allocated via
194 * mempool_create(). 186 * mempool_create().
@@ -224,28 +216,40 @@ repeat_alloc:
224 if (likely(pool->curr_nr)) { 216 if (likely(pool->curr_nr)) {
225 element = remove_element(pool); 217 element = remove_element(pool);
226 spin_unlock_irqrestore(&pool->lock, flags); 218 spin_unlock_irqrestore(&pool->lock, flags);
219 /* paired with rmb in mempool_free(), read comment there */
220 smp_wmb();
227 return element; 221 return element;
228 } 222 }
229 spin_unlock_irqrestore(&pool->lock, flags);
230 223
231 /* We must not sleep in the GFP_ATOMIC case */ 224 /*
232 if (!(gfp_mask & __GFP_WAIT)) 225 * We use gfp mask w/o __GFP_WAIT or IO for the first round. If
226 * alloc failed with that and @pool was empty, retry immediately.
227 */
228 if (gfp_temp != gfp_mask) {
229 spin_unlock_irqrestore(&pool->lock, flags);
230 gfp_temp = gfp_mask;
231 goto repeat_alloc;
232 }
233
234 /* We must not sleep if !__GFP_WAIT */
235 if (!(gfp_mask & __GFP_WAIT)) {
236 spin_unlock_irqrestore(&pool->lock, flags);
233 return NULL; 237 return NULL;
238 }
234 239
235 /* Now start performing page reclaim */ 240 /* Let's wait for someone else to return an element to @pool */
236 gfp_temp = gfp_mask;
237 init_wait(&wait); 241 init_wait(&wait);
238 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); 242 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
239 smp_mb();
240 if (!pool->curr_nr) {
241 /*
242 * FIXME: this should be io_schedule(). The timeout is there
243 * as a workaround for some DM problems in 2.6.18.
244 */
245 io_schedule_timeout(5*HZ);
246 }
247 finish_wait(&pool->wait, &wait);
248 243
244 spin_unlock_irqrestore(&pool->lock, flags);
245
246 /*
247 * FIXME: this should be io_schedule(). The timeout is there as a
248 * workaround for some DM problems in 2.6.18.
249 */
250 io_schedule_timeout(5*HZ);
251
252 finish_wait(&pool->wait, &wait);
249 goto repeat_alloc; 253 goto repeat_alloc;
250} 254}
251EXPORT_SYMBOL(mempool_alloc); 255EXPORT_SYMBOL(mempool_alloc);
@@ -265,7 +269,39 @@ void mempool_free(void *element, mempool_t *pool)
265 if (unlikely(element == NULL)) 269 if (unlikely(element == NULL))
266 return; 270 return;
267 271
268 smp_mb(); 272 /*
273 * Paired with the wmb in mempool_alloc(). The preceding read is
274 * for @element and the following @pool->curr_nr. This ensures
275 * that the visible value of @pool->curr_nr is from after the
276 * allocation of @element. This is necessary for fringe cases
277 * where @element was passed to this task without going through
278 * barriers.
279 *
280 * For example, assume @p is %NULL at the beginning and one task
281 * performs "p = mempool_alloc(...);" while another task is doing
282 * "while (!p) cpu_relax(); mempool_free(p, ...);". This function
283 * may end up using curr_nr value which is from before allocation
284 * of @p without the following rmb.
285 */
286 smp_rmb();
287
288 /*
289 * For correctness, we need a test which is guaranteed to trigger
290 * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr
291 * without locking achieves that and refilling as soon as possible
292 * is desirable.
293 *
294 * Because curr_nr visible here is always a value after the
295 * allocation of @element, any task which decremented curr_nr below
296 * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets
297 * incremented to min_nr afterwards. If curr_nr gets incremented
298 * to min_nr after the allocation of @element, the elements
299 * allocated after that are subject to the same guarantee.
300 *
301 * Waiters happen iff curr_nr is 0 and the above guarantee also
302 * ensures that there will be frees which return elements to the
303 * pool waking up the waiters.
304 */
269 if (pool->curr_nr < pool->min_nr) { 305 if (pool->curr_nr < pool->min_nr) {
270 spin_lock_irqsave(&pool->lock, flags); 306 spin_lock_irqsave(&pool->lock, flags);
271 if (pool->curr_nr < pool->min_nr) { 307 if (pool->curr_nr < pool->min_nr) {
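
The restructured mempool_alloc() above tries the backing allocator first (initially without __GFP_WAIT/__GFP_IO, then once more with the caller's full mask), falls back to the reserved elements, and only sleeps on the pool's wait queue when the caller may wait; mempool_free() refills the reserve before handing anything back to the base allocator. Here is a user-space sketch of that fallback order, assuming a pthread mutex and condition variable in place of the pool spinlock and wait queue, and omitting the two-phase gfp retry and the memory barriers; struct pool, pool_alloc() and pool_free() are made-up names for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MIN_NR 4			/* reserved elements, like pool->min_nr */

struct pool {
	pthread_mutex_t lock;
	pthread_cond_t wait;
	void *elements[MIN_NR];
	int curr_nr;
};

static struct pool pool = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wait = PTHREAD_COND_INITIALIZER,
};

static void *pool_alloc(struct pool *p, bool can_wait)
{
	void *el = malloc(64);		/* "backing allocator" fast path */

	if (el)
		return el;

	pthread_mutex_lock(&p->lock);
	while (p->curr_nr == 0) {
		if (!can_wait) {	/* atomic caller: fail instead of sleeping */
			pthread_mutex_unlock(&p->lock);
			return NULL;
		}
		/* wait for someone else to return an element to the pool */
		pthread_cond_wait(&p->wait, &p->lock);
	}
	el = p->elements[--p->curr_nr];
	pthread_mutex_unlock(&p->lock);
	return el;
}

static void pool_free(struct pool *p, void *el)
{
	pthread_mutex_lock(&p->lock);
	if (p->curr_nr < MIN_NR) {	/* top up the reserve first */
		p->elements[p->curr_nr++] = el;
		pthread_mutex_unlock(&p->lock);
		pthread_cond_signal(&p->wait);
		return;
	}
	pthread_mutex_unlock(&p->lock);
	free(el);			/* reserve is full: release for real */
}

int main(void)
{
	void *el = pool_alloc(&pool, true);

	printf("allocated %p\n", el);
	pool_free(&pool, el);
	printf("reserve now holds %d element(s)\n", pool.curr_nr);
	return 0;
}
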
diff --git a/mm/migrate.c b/mm/migrate.c
index 177aca424a06..89ea0854332e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,8 +39,6 @@
39 39
40#include "internal.h" 40#include "internal.h"
41 41
42#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
43
44/* 42/*
45 * migrate_prep() needs to be called before we start compiling a list of pages 43 * migrate_prep() needs to be called before we start compiling a list of pages
46 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is 44 * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
@@ -181,8 +179,6 @@ static void remove_migration_ptes(struct page *old, struct page *new)
181 * Something used the pte of a page under migration. We need to 179 * Something used the pte of a page under migration. We need to
182 * get to the page and wait until migration is finished. 180 * get to the page and wait until migration is finished.
183 * When we return from this function the fault will be retried. 181 * When we return from this function the fault will be retried.
184 *
185 * This function is called from do_swap_page().
186 */ 182 */
187void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 183void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
188 unsigned long address) 184 unsigned long address)
@@ -269,12 +265,12 @@ static int migrate_page_move_mapping(struct address_space *mapping,
269 265
270 radix_tree_replace_slot(pslot, newpage); 266 radix_tree_replace_slot(pslot, newpage);
271 267
272 page_unfreeze_refs(page, expected_count);
273 /* 268 /*
274 * Drop cache reference from old page. 269 * Drop cache reference from old page by unfreezing
270 * to one less reference.
275 * We know this isn't the last reference. 271 * We know this isn't the last reference.
276 */ 272 */
277 __put_page(page); 273 page_unfreeze_refs(page, expected_count - 1);
278 274
279 /* 275 /*
280 * If moved to a different zone then also account 276 * If moved to a different zone then also account
@@ -334,9 +330,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
334 330
335 radix_tree_replace_slot(pslot, newpage); 331 radix_tree_replace_slot(pslot, newpage);
336 332
337 page_unfreeze_refs(page, expected_count); 333 page_unfreeze_refs(page, expected_count - 1);
338
339 __put_page(page);
340 334
341 spin_unlock_irq(&mapping->tree_lock); 335 spin_unlock_irq(&mapping->tree_lock);
342 return 0; 336 return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index eae90af60ea6..3f758c7f4c81 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1603,39 +1603,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1603 1603
1604EXPORT_SYMBOL(find_vma); 1604EXPORT_SYMBOL(find_vma);
1605 1605
1606/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ 1606/*
1607 * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
1608 * Note: pprev is set to NULL when return value is NULL.
1609 */
1607struct vm_area_struct * 1610struct vm_area_struct *
1608find_vma_prev(struct mm_struct *mm, unsigned long addr, 1611find_vma_prev(struct mm_struct *mm, unsigned long addr,
1609 struct vm_area_struct **pprev) 1612 struct vm_area_struct **pprev)
1610{ 1613{
1611 struct vm_area_struct *vma = NULL, *prev = NULL; 1614 struct vm_area_struct *vma;
1612 struct rb_node *rb_node;
1613 if (!mm)
1614 goto out;
1615
1616 /* Guard against addr being lower than the first VMA */
1617 vma = mm->mmap;
1618
1619 /* Go through the RB tree quickly. */
1620 rb_node = mm->mm_rb.rb_node;
1621
1622 while (rb_node) {
1623 struct vm_area_struct *vma_tmp;
1624 vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);
1625
1626 if (addr < vma_tmp->vm_end) {
1627 rb_node = rb_node->rb_left;
1628 } else {
1629 prev = vma_tmp;
1630 if (!prev->vm_next || (addr < prev->vm_next->vm_end))
1631 break;
1632 rb_node = rb_node->rb_right;
1633 }
1634 }
1635 1615
1636out: 1616 vma = find_vma(mm, addr);
1637 *pprev = prev; 1617 *pprev = vma ? vma->vm_prev : NULL;
1638 return prev ? prev->vm_next : vma; 1618 return vma;
1639} 1619}
1640 1620
1641/* 1621/*
@@ -2322,13 +2302,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2322 struct vm_area_struct *new_vma, *prev; 2302 struct vm_area_struct *new_vma, *prev;
2323 struct rb_node **rb_link, *rb_parent; 2303 struct rb_node **rb_link, *rb_parent;
2324 struct mempolicy *pol; 2304 struct mempolicy *pol;
2305 bool faulted_in_anon_vma = true;
2325 2306
2326 /* 2307 /*
2327 * If anonymous vma has not yet been faulted, update new pgoff 2308 * If anonymous vma has not yet been faulted, update new pgoff
2328 * to match new location, to increase its chance of merging. 2309 * to match new location, to increase its chance of merging.
2329 */ 2310 */
2330 if (!vma->vm_file && !vma->anon_vma) 2311 if (unlikely(!vma->vm_file && !vma->anon_vma)) {
2331 pgoff = addr >> PAGE_SHIFT; 2312 pgoff = addr >> PAGE_SHIFT;
2313 faulted_in_anon_vma = false;
2314 }
2332 2315
2333 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2316 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
2334 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2317 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
@@ -2337,9 +2320,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2337 /* 2320 /*
2338 * Source vma may have been merged into new_vma 2321 * Source vma may have been merged into new_vma
2339 */ 2322 */
2340 if (vma_start >= new_vma->vm_start && 2323 if (unlikely(vma_start >= new_vma->vm_start &&
2341 vma_start < new_vma->vm_end) 2324 vma_start < new_vma->vm_end)) {
2325 /*
2326 * The only way we can get a vma_merge with
2327 * self during an mremap is if the vma hasn't
2328 * been faulted in yet and we were allowed to
2329 * reset the dst vma->vm_pgoff to the
2330 * destination address of the mremap to allow
2331 * the merge to happen. mremap must change the
2332 * vm_pgoff linearity between src and dst vmas
2333 * (in turn preventing a vma_merge) to be
2334 * safe. It is only safe to keep the vm_pgoff
2335 * linear if there are no pages mapped yet.
2336 */
2337 VM_BUG_ON(faulted_in_anon_vma);
2342 *vmap = new_vma; 2338 *vmap = new_vma;
2339 } else
2340 anon_vma_moveto_tail(new_vma);
2343 } else { 2341 } else {
2344 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2342 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2345 if (new_vma) { 2343 if (new_vma) {
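
With the vm_prev pointer maintained on every mapping, find_vma_prev() above reduces to a find_vma() lookup plus one pointer dereference, and pprev is NULL exactly when the lookup returns NULL. The sketch below mirrors that shape over a plain linked list of address ranges instead of the kernel's rb-tree; the struct and helpers are invented for the example.

#include <stdio.h>
#include <stddef.h>

struct vma {
	unsigned long start, end;		/* [start, end) */
	struct vma *next, *prev;
};

/* first mapping whose end lies above addr, or NULL: the find_vma() contract */
static struct vma *find_vma(struct vma *mmap, unsigned long addr)
{
	struct vma *v;

	for (v = mmap; v; v = v->next)
		if (addr < v->end)
			return v;
	return NULL;
}

static struct vma *find_vma_prev(struct vma *mmap, unsigned long addr,
				 struct vma **pprev)
{
	struct vma *v = find_vma(mmap, addr);

	/* pprev is NULL when the return value is NULL, as the new comment notes */
	*pprev = v ? v->prev : NULL;
	return v;
}

int main(void)
{
	struct vma a = { 0x1000, 0x2000, NULL, NULL };
	struct vma b = { 0x4000, 0x5000, NULL, &a };
	struct vma *prev, *hit;

	a.next = &b;
	hit = find_vma_prev(&a, 0x4800, &prev);
	if (hit)
		printf("hit [%#lx, %#lx), prev %s\n", hit->start, hit->end,
		       prev ? "present" : "NULL");
	return 0;
}
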
diff --git a/mm/mremap.c b/mm/mremap.c
index d6959cb4df58..87bb8393e7d2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -221,6 +221,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
221 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); 221 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
222 if (moved_len < old_len) { 222 if (moved_len < old_len) {
223 /* 223 /*
224 * Before moving the page tables from the new vma to
225 * the old vma, we need to be sure the old vma is
226 * queued after new vma in the same_anon_vma list to
227 * prevent SMP races with rmap_walk (that could lead
228 * rmap_walk to miss some page table).
229 */
230 anon_vma_moveto_tail(vma);
231
232 /*
224 * On error, move entries back from new area to old, 233 * On error, move entries back from new area to old,
225 * which will succeed since page tables still there, 234 * which will succeed since page tables still there,
226 * and then proceed to unmap new area instead of old. 235 * and then proceed to unmap new area instead of old.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eeb27e27dce3..7c122faa05c5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -33,6 +33,10 @@
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h>
37
38#define CREATE_TRACE_POINTS
39#include <trace/events/oom.h>
36 40
37int sysctl_panic_on_oom; 41int sysctl_panic_on_oom;
38int sysctl_oom_kill_allocating_task; 42int sysctl_oom_kill_allocating_task;
@@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val)
55 spin_lock_irq(&sighand->siglock); 59 spin_lock_irq(&sighand->siglock);
56 if (current->signal->oom_score_adj == old_val) 60 if (current->signal->oom_score_adj == old_val)
57 current->signal->oom_score_adj = new_val; 61 current->signal->oom_score_adj = new_val;
62 trace_oom_score_adj_update(current);
58 spin_unlock_irq(&sighand->siglock); 63 spin_unlock_irq(&sighand->siglock);
59} 64}
60 65
@@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val)
74 spin_lock_irq(&sighand->siglock); 79 spin_lock_irq(&sighand->siglock);
75 old_val = current->signal->oom_score_adj; 80 old_val = current->signal->oom_score_adj;
76 current->signal->oom_score_adj = new_val; 81 current->signal->oom_score_adj = new_val;
82 trace_oom_score_adj_update(current);
77 spin_unlock_irq(&sighand->siglock); 83 spin_unlock_irq(&sighand->siglock);
78 84
79 return old_val; 85 return old_val;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8616ef3025a4..5cdd4f2b0c9d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -130,6 +130,191 @@ unsigned long global_dirty_limit;
130static struct prop_descriptor vm_completions; 130static struct prop_descriptor vm_completions;
131 131
132/* 132/*
133 * Work out the current dirty-memory clamping and background writeout
134 * thresholds.
135 *
136 * The main aim here is to lower them aggressively if there is a lot of mapped
137 * memory around. To avoid stressing page reclaim with lots of unreclaimable
138 * pages. It is better to clamp down on writers than to start swapping, and
139 * performing lots of scanning.
140 *
141 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
142 *
143 * We don't permit the clamping level to fall below 5% - that is getting rather
144 * excessive.
145 *
146 * We make sure that the background writeout level is below the adjusted
147 * clamping level.
148 */
149
150/*
151 * In a memory zone, there is a certain amount of pages we consider
152 * available for the page cache, which is essentially the number of
153 * free and reclaimable pages, minus some zone reserves to protect
154 * lowmem and the ability to uphold the zone's watermarks without
155 * requiring writeback.
156 *
157 * This number of dirtyable pages is the base value of which the
158 * user-configurable dirty ratio is the effictive number of pages that
159 * are allowed to be actually dirtied. Per individual zone, or
160 * globally by using the sum of dirtyable pages over all zones.
161 *
162 * Because the user is allowed to specify the dirty limit globally as
163 * absolute number of bytes, calculating the per-zone dirty limit can
164 * require translating the configured limit into a percentage of
165 * global dirtyable memory first.
166 */
167
168static unsigned long highmem_dirtyable_memory(unsigned long total)
169{
170#ifdef CONFIG_HIGHMEM
171 int node;
172 unsigned long x = 0;
173
174 for_each_node_state(node, N_HIGH_MEMORY) {
175 struct zone *z =
176 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
177
178 x += zone_page_state(z, NR_FREE_PAGES) +
179 zone_reclaimable_pages(z) - z->dirty_balance_reserve;
180 }
181 /*
182 * Make sure that the number of highmem pages is never larger
183 * than the number of the total dirtyable memory. This can only
184 * occur in very strange VM situations but we want to make sure
185 * that this does not occur.
186 */
187 return min(x, total);
188#else
189 return 0;
190#endif
191}
192
193/**
194 * global_dirtyable_memory - number of globally dirtyable pages
195 *
196 * Returns the global number of pages potentially available for dirty
197 * page cache. This is the base value for the global dirty limits.
198 */
199unsigned long global_dirtyable_memory(void)
200{
201 unsigned long x;
202
203 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
204 dirty_balance_reserve;
205
206 if (!vm_highmem_is_dirtyable)
207 x -= highmem_dirtyable_memory(x);
208
209 return x + 1; /* Ensure that we never return 0 */
210}
211
212/*
213 * global_dirty_limits - background-writeback and dirty-throttling thresholds
214 *
215 * Calculate the dirty thresholds based on sysctl parameters
216 * - vm.dirty_background_ratio or vm.dirty_background_bytes
217 * - vm.dirty_ratio or vm.dirty_bytes
218 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
219 * real-time tasks.
220 */
221void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
222{
223 unsigned long background;
224 unsigned long dirty;
225 unsigned long uninitialized_var(available_memory);
226 struct task_struct *tsk;
227
228 if (!vm_dirty_bytes || !dirty_background_bytes)
229 available_memory = global_dirtyable_memory();
230
231 if (vm_dirty_bytes)
232 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
233 else
234 dirty = (vm_dirty_ratio * available_memory) / 100;
235
236 if (dirty_background_bytes)
237 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
238 else
239 background = (dirty_background_ratio * available_memory) / 100;
240
241 if (background >= dirty)
242 background = dirty / 2;
243 tsk = current;
244 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
245 background += background / 4;
246 dirty += dirty / 4;
247 }
248 *pbackground = background;
249 *pdirty = dirty;
250 trace_global_dirty_state(background, dirty);
251}
252
253/**
254 * zone_dirtyable_memory - number of dirtyable pages in a zone
255 * @zone: the zone
256 *
257 * Returns the zone's number of pages potentially available for dirty
258 * page cache. This is the base value for the per-zone dirty limits.
259 */
260static unsigned long zone_dirtyable_memory(struct zone *zone)
261{
262 /*
263 * The effective global number of dirtyable pages may exclude
264 * highmem as a big-picture measure to keep the ratio between
265 * dirty memory and lowmem reasonable.
266 *
267 * But this function is purely about the individual zone and a
268 * highmem zone can hold its share of dirty pages, so we don't
269 * care about vm_highmem_is_dirtyable here.
270 */
271 return zone_page_state(zone, NR_FREE_PAGES) +
272 zone_reclaimable_pages(zone) -
273 zone->dirty_balance_reserve;
274}
275
276/**
277 * zone_dirty_limit - maximum number of dirty pages allowed in a zone
278 * @zone: the zone
279 *
280 * Returns the maximum number of dirty pages allowed in a zone, based
281 * on the zone's dirtyable memory.
282 */
283static unsigned long zone_dirty_limit(struct zone *zone)
284{
285 unsigned long zone_memory = zone_dirtyable_memory(zone);
286 struct task_struct *tsk = current;
287 unsigned long dirty;
288
289 if (vm_dirty_bytes)
290 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
291 zone_memory / global_dirtyable_memory();
292 else
293 dirty = vm_dirty_ratio * zone_memory / 100;
294
295 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
296 dirty += dirty / 4;
297
298 return dirty;
299}
300
301/**
302 * zone_dirty_ok - tells whether a zone is within its dirty limits
303 * @zone: the zone to check
304 *
305 * Returns %true when the dirty pages in @zone are within the zone's
306 * dirty limit, %false if the limit is exceeded.
307 */
308bool zone_dirty_ok(struct zone *zone)
309{
310 unsigned long limit = zone_dirty_limit(zone);
311
312 return zone_page_state(zone, NR_FILE_DIRTY) +
313 zone_page_state(zone, NR_UNSTABLE_NFS) +
314 zone_page_state(zone, NR_WRITEBACK) <= limit;
315}
316
317/*
133 * couple the period to the dirty_ratio: 318 * couple the period to the dirty_ratio:
134 * 319 *
135 * period/2 ~ roundup_pow_of_two(dirty limit) 320 * period/2 ~ roundup_pow_of_two(dirty limit)
@@ -141,7 +326,7 @@ static int calc_period_shift(void)
141 if (vm_dirty_bytes) 326 if (vm_dirty_bytes)
142 dirty_total = vm_dirty_bytes / PAGE_SIZE; 327 dirty_total = vm_dirty_bytes / PAGE_SIZE;
143 else 328 else
144 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 329 dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
145 100; 330 100;
146 return 2 + ilog2(dirty_total - 1); 331 return 2 + ilog2(dirty_total - 1);
147} 332}
@@ -196,7 +381,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
196 return ret; 381 return ret;
197} 382}
198 383
199
200int dirty_bytes_handler(struct ctl_table *table, int write, 384int dirty_bytes_handler(struct ctl_table *table, int write,
201 void __user *buffer, size_t *lenp, 385 void __user *buffer, size_t *lenp,
202 loff_t *ppos) 386 loff_t *ppos)
@@ -291,67 +475,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
291} 475}
292EXPORT_SYMBOL(bdi_set_max_ratio); 476EXPORT_SYMBOL(bdi_set_max_ratio);
293 477
294/*
295 * Work out the current dirty-memory clamping and background writeout
296 * thresholds.
297 *
298 * The main aim here is to lower them aggressively if there is a lot of mapped
299 * memory around. To avoid stressing page reclaim with lots of unreclaimable
300 * pages. It is better to clamp down on writers than to start swapping, and
301 * performing lots of scanning.
302 *
303 * We only allow 1/2 of the currently-unmapped memory to be dirtied.
304 *
305 * We don't permit the clamping level to fall below 5% - that is getting rather
306 * excessive.
307 *
308 * We make sure that the background writeout level is below the adjusted
309 * clamping level.
310 */
311
312static unsigned long highmem_dirtyable_memory(unsigned long total)
313{
314#ifdef CONFIG_HIGHMEM
315 int node;
316 unsigned long x = 0;
317
318 for_each_node_state(node, N_HIGH_MEMORY) {
319 struct zone *z =
320 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
321
322 x += zone_page_state(z, NR_FREE_PAGES) +
323 zone_reclaimable_pages(z);
324 }
325 /*
326 * Make sure that the number of highmem pages is never larger
327 * than the number of the total dirtyable memory. This can only
328 * occur in very strange VM situations but we want to make sure
329 * that this does not occur.
330 */
331 return min(x, total);
332#else
333 return 0;
334#endif
335}
336
337/**
338 * determine_dirtyable_memory - amount of memory that may be used
339 *
340 * Returns the numebr of pages that can currently be freed and used
341 * by the kernel for direct mappings.
342 */
343unsigned long determine_dirtyable_memory(void)
344{
345 unsigned long x;
346
347 x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
348
349 if (!vm_highmem_is_dirtyable)
350 x -= highmem_dirtyable_memory(x);
351
352 return x + 1; /* Ensure that we never return 0 */
353}
354
355static unsigned long dirty_freerun_ceiling(unsigned long thresh, 478static unsigned long dirty_freerun_ceiling(unsigned long thresh,
356 unsigned long bg_thresh) 479 unsigned long bg_thresh)
357{ 480{
@@ -363,47 +486,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
363 return max(thresh, global_dirty_limit); 486 return max(thresh, global_dirty_limit);
364} 487}
365 488
366/*
367 * global_dirty_limits - background-writeback and dirty-throttling thresholds
368 *
369 * Calculate the dirty thresholds based on sysctl parameters
370 * - vm.dirty_background_ratio or vm.dirty_background_bytes
371 * - vm.dirty_ratio or vm.dirty_bytes
372 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
373 * real-time tasks.
374 */
375void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
376{
377 unsigned long background;
378 unsigned long dirty;
379 unsigned long uninitialized_var(available_memory);
380 struct task_struct *tsk;
381
382 if (!vm_dirty_bytes || !dirty_background_bytes)
383 available_memory = determine_dirtyable_memory();
384
385 if (vm_dirty_bytes)
386 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
387 else
388 dirty = (vm_dirty_ratio * available_memory) / 100;
389
390 if (dirty_background_bytes)
391 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
392 else
393 background = (dirty_background_ratio * available_memory) / 100;
394
395 if (background >= dirty)
396 background = dirty / 2;
397 tsk = current;
398 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
399 background += background / 4;
400 dirty += dirty / 4;
401 }
402 *pbackground = background;
403 *pdirty = dirty;
404 trace_global_dirty_state(background, dirty);
405}
406
407/** 489/**
408 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 490 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
409 * @bdi: the backing_dev_info to query 491 * @bdi: the backing_dev_info to query
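
The new helpers above split the dirty accounting into a global base value (global_dirtyable_memory()) and a per-zone one (zone_dirtyable_memory()), and derive the limits from the vm.dirty_* sysctls. A worked example with made-up numbers, following the ratio-based branch of global_dirty_limits() and zone_dirty_limit() and ignoring the PF_LESS_THROTTLE/rt-task boost:

#include <stdio.h>

int main(void)
{
	unsigned long global_dirtyable = 1000000;	/* pages, example value */
	unsigned long zone_dirtyable   = 250000;	/* this zone's dirtyable pages */
	unsigned long dirty_ratio      = 20;		/* vm.dirty_ratio */
	unsigned long background_ratio = 10;		/* vm.dirty_background_ratio */
	unsigned long dirty, background, zone_dirty;

	/* global thresholds, as in global_dirty_limits() */
	dirty = dirty_ratio * global_dirtyable / 100;
	background = background_ratio * global_dirtyable / 100;
	if (background >= dirty)			/* keep background below dirty */
		background = dirty / 2;

	/* per-zone limit, as in zone_dirty_limit(): scale by the zone's own share */
	zone_dirty = dirty_ratio * zone_dirtyable / 100;

	printf("global dirty limit:   %lu pages\n", dirty);		/* 200000 */
	printf("background threshold: %lu pages\n", background);	/* 100000 */
	printf("per-zone dirty limit: %lu pages\n", zone_dirty);	/*  50000 */
	return 0;
}
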
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1b..794e6715c226 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
57#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h> 58#include <linux/memcontrol.h>
59#include <linux/prefetch.h> 59#include <linux/prefetch.h>
60#include <linux/page-debug-flags.h>
60 61
61#include <asm/tlbflush.h> 62#include <asm/tlbflush.h>
62#include <asm/div64.h> 63#include <asm/div64.h>
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states);
96 97
97unsigned long totalram_pages __read_mostly; 98unsigned long totalram_pages __read_mostly;
98unsigned long totalreserve_pages __read_mostly; 99unsigned long totalreserve_pages __read_mostly;
100/*
101 * When calculating the number of globally allowed dirty pages, there
102 * is a certain number of per-zone reserves that should not be
103 * considered dirtyable memory. This is the sum of those reserves
104 * over all existing zones that contribute dirtyable memory.
105 */
106unsigned long dirty_balance_reserve __read_mostly;
107
99int percpu_pagelist_fraction; 108int percpu_pagelist_fraction;
100gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; 109gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
101 110
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void)
127 saved_gfp_mask = gfp_allowed_mask; 136 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS; 137 gfp_allowed_mask &= ~GFP_IOFS;
129} 138}
139
140bool pm_suspended_storage(void)
141{
142 if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
143 return false;
144 return true;
145}
130#endif /* CONFIG_PM_SLEEP */ 146#endif /* CONFIG_PM_SLEEP */
131 147
132#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 148#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
381 clear_highpage(page + i); 397 clear_highpage(page + i);
382} 398}
383 399
400#ifdef CONFIG_DEBUG_PAGEALLOC
401unsigned int _debug_guardpage_minorder;
402
403static int __init debug_guardpage_minorder_setup(char *buf)
404{
405 unsigned long res;
406
407 if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
408 printk(KERN_ERR "Bad debug_guardpage_minorder value\n");
409 return 0;
410 }
411 _debug_guardpage_minorder = res;
412 printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res);
413 return 0;
414}
415__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup);
416
417static inline void set_page_guard_flag(struct page *page)
418{
419 __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
420}
421
422static inline void clear_page_guard_flag(struct page *page)
423{
424 __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags);
425}
426#else
427static inline void set_page_guard_flag(struct page *page) { }
428static inline void clear_page_guard_flag(struct page *page) { }
429#endif
430
384static inline void set_page_order(struct page *page, int order) 431static inline void set_page_order(struct page *page, int order)
385{ 432{
386 set_page_private(page, order); 433 set_page_private(page, order);
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
438 if (page_zone_id(page) != page_zone_id(buddy)) 485 if (page_zone_id(page) != page_zone_id(buddy))
439 return 0; 486 return 0;
440 487
488 if (page_is_guard(buddy) && page_order(buddy) == order) {
489 VM_BUG_ON(page_count(buddy) != 0);
490 return 1;
491 }
492
441 if (PageBuddy(buddy) && page_order(buddy) == order) { 493 if (PageBuddy(buddy) && page_order(buddy) == order) {
442 VM_BUG_ON(page_count(buddy) != 0); 494 VM_BUG_ON(page_count(buddy) != 0);
443 return 1; 495 return 1;
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page,
494 buddy = page + (buddy_idx - page_idx); 546 buddy = page + (buddy_idx - page_idx);
495 if (!page_is_buddy(page, buddy, order)) 547 if (!page_is_buddy(page, buddy, order))
496 break; 548 break;
497 549 /*
498 /* Our buddy is free, merge with it and move up one order. */ 550 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
499 list_del(&buddy->lru); 551 * merge with it and move up one order.
500 zone->free_area[order].nr_free--; 552 */
501 rmv_page_order(buddy); 553 if (page_is_guard(buddy)) {
554 clear_page_guard_flag(buddy);
555 set_page_private(page, 0);
556 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
557 } else {
558 list_del(&buddy->lru);
559 zone->free_area[order].nr_free--;
560 rmv_page_order(buddy);
561 }
502 combined_idx = buddy_idx & page_idx; 562 combined_idx = buddy_idx & page_idx;
503 page = page + (combined_idx - page_idx); 563 page = page + (combined_idx - page_idx);
504 page_idx = combined_idx; 564 page_idx = combined_idx;
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
632 int i; 692 int i;
633 int bad = 0; 693 int bad = 0;
634 694
635 trace_mm_page_free_direct(page, order); 695 trace_mm_page_free(page, order);
636 kmemcheck_free_shadow(page, order); 696 kmemcheck_free_shadow(page, order);
637 697
638 if (PageAnon(page)) 698 if (PageAnon(page))
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order)
670 local_irq_restore(flags); 730 local_irq_restore(flags);
671} 731}
672 732
673/*
674 * permit the bootmem allocator to evade page validation on high-order frees
675 */
676void __meminit __free_pages_bootmem(struct page *page, unsigned int order) 733void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
677{ 734{
678 if (order == 0) { 735 unsigned int nr_pages = 1 << order;
679 __ClearPageReserved(page); 736 unsigned int loop;
680 set_page_count(page, 0);
681 set_page_refcounted(page);
682 __free_page(page);
683 } else {
684 int loop;
685
686 prefetchw(page);
687 for (loop = 0; loop < (1 << order); loop++) {
688 struct page *p = &page[loop];
689 737
690 if (loop + 1 < (1 << order)) 738 prefetchw(page);
691 prefetchw(p + 1); 739 for (loop = 0; loop < nr_pages; loop++) {
692 __ClearPageReserved(p); 740 struct page *p = &page[loop];
693 set_page_count(p, 0);
694 }
695 741
696 set_page_refcounted(page); 742 if (loop + 1 < nr_pages)
697 __free_pages(page, order); 743 prefetchw(p + 1);
744 __ClearPageReserved(p);
745 set_page_count(p, 0);
698 } 746 }
747
748 set_page_refcounted(page);
749 __free_pages(page, order);
699} 750}
700 751
701 752
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page,
724 high--; 775 high--;
725 size >>= 1; 776 size >>= 1;
726 VM_BUG_ON(bad_range(zone, &page[size])); 777 VM_BUG_ON(bad_range(zone, &page[size]));
778
779#ifdef CONFIG_DEBUG_PAGEALLOC
780 if (high < debug_guardpage_minorder()) {
781 /*
782 * Mark as guard pages (or page), that will allow to
783 * merge back to allocator when buddy will be freed.
784 * Corresponding page table entries will not be touched,
785 * pages will stay not present in virtual address space
786 */
787 INIT_LIST_HEAD(&page[size].lru);
788 set_page_guard_flag(&page[size]);
789 set_page_private(&page[size], high);
790 /* Guard pages are not available for any usage */
791 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
792 continue;
793 }
794#endif
727 list_add(&page[size].lru, &area->free_list[migratetype]); 795 list_add(&page[size].lru, &area->free_list[migratetype]);
728 area->nr_free++; 796 area->nr_free++;
729 set_page_order(&page[size], high); 797 set_page_order(&page[size], high);
@@ -1189,6 +1257,19 @@ out:
1189} 1257}
1190 1258
1191/* 1259/*
1260 * Free a list of 0-order pages
1261 */
1262void free_hot_cold_page_list(struct list_head *list, int cold)
1263{
1264 struct page *page, *next;
1265
1266 list_for_each_entry_safe(page, next, list, lru) {
1267 trace_mm_page_free_batched(page, cold);
1268 free_hot_cold_page(page, cold);
1269 }
1270}
1271
1272/*
1192 * split_page takes a non-compound higher-order page, and splits it into 1273 * split_page takes a non-compound higher-order page, and splits it into
1193 * n (1<<order) sub-pages: page[0..n] 1274 * n (1<<order) sub-pages: page[0..n]
1194 * Each sub-page must be freed individually. 1275 * Each sub-page must be freed individually.
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1435 long min = mark; 1516 long min = mark;
1436 int o; 1517 int o;
1437 1518
1438 free_pages -= (1 << order) + 1; 1519 free_pages -= (1 << order) - 1;
1439 if (alloc_flags & ALLOC_HIGH) 1520 if (alloc_flags & ALLOC_HIGH)
1440 min -= min / 2; 1521 min -= min / 2;
1441 if (alloc_flags & ALLOC_HARDER) 1522 if (alloc_flags & ALLOC_HARDER)
@@ -1645,6 +1726,35 @@ zonelist_scan:
1645 if ((alloc_flags & ALLOC_CPUSET) && 1726 if ((alloc_flags & ALLOC_CPUSET) &&
1646 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1727 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1647 continue; 1728 continue;
1729 /*
1730 * When allocating a page cache page for writing, we
1731 * want to get it from a zone that is within its dirty
1732 * limit, such that no single zone holds more than its
1733 * proportional share of globally allowed dirty pages.
1734 * The dirty limits take into account the zone's
1735 * lowmem reserves and high watermark so that kswapd
1736 * should be able to balance it without having to
1737 * write pages from its LRU list.
1738 *
1739 * This may look like it could increase pressure on
1740 * lower zones by failing allocations in higher zones
1741 * before they are full. But the pages that do spill
1742 * over are limited as the lower zones are protected
1743 * by this very same mechanism. It should not become
1744 * a practical burden to them.
1745 *
1746 * XXX: For now, allow allocations to potentially
1747 * exceed the per-zone dirty limit in the slowpath
1748 * (ALLOC_WMARK_LOW unset) before going into reclaim,
1749 * which is important when on a NUMA setup the allowed
1750 * zones are together not big enough to reach the
1751 * global limit. The proper fix for these situations
1752 * will require awareness of zones in the
1753 * dirty-throttling and the flusher threads.
1754 */
1755 if ((alloc_flags & ALLOC_WMARK_LOW) &&
1756 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1757 goto this_zone_full;
1648 1758
1649 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1759 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1650 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1760 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1734{ 1844{
1735 unsigned int filter = SHOW_MEM_FILTER_NODES; 1845 unsigned int filter = SHOW_MEM_FILTER_NODES;
1736 1846
1737 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 1847 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
1848 debug_guardpage_minorder() > 0)
1738 return; 1849 return;
1739 1850
1740 /* 1851 /*
@@ -1773,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1773 1884
1774static inline int 1885static inline int
1775should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1886should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1887 unsigned long did_some_progress,
1776 unsigned long pages_reclaimed) 1888 unsigned long pages_reclaimed)
1777{ 1889{
1778 /* Do not loop if specifically requested */ 1890 /* Do not loop if specifically requested */
1779 if (gfp_mask & __GFP_NORETRY) 1891 if (gfp_mask & __GFP_NORETRY)
1780 return 0; 1892 return 0;
1781 1893
1894 /* Always retry if specifically requested */
1895 if (gfp_mask & __GFP_NOFAIL)
1896 return 1;
1897
1898 /*
1899 * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
1900 * making forward progress without invoking OOM. Suspend also disables
1901 * storage devices so kswapd will not help. Bail if we are suspending.
1902 */
1903 if (!did_some_progress && pm_suspended_storage())
1904 return 0;
1905
1782 /* 1906 /*
1783 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER 1907 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1784 * means __GFP_NOFAIL, but that may not be true in other 1908 * means __GFP_NOFAIL, but that may not be true in other
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1797 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) 1921 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1798 return 1; 1922 return 1;
1799 1923
1800 /*
1801 * Don't let big-order allocations loop unless the caller
1802 * explicitly requests that.
1803 */
1804 if (gfp_mask & __GFP_NOFAIL)
1805 return 1;
1806
1807 return 0; 1924 return 0;
1808} 1925}
1809 1926
@@ -2196,7 +2313,8 @@ rebalance:
2196 2313
2197 /* Check if we should retry the allocation */ 2314 /* Check if we should retry the allocation */
2198 pages_reclaimed += did_some_progress; 2315 pages_reclaimed += did_some_progress;
2199 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2316 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2317 pages_reclaimed)) {
2200 /* Wait for some write requests to complete then retry */ 2318 /* Wait for some write requests to complete then retry */
2201 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2319 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2202 goto rebalance; 2320 goto rebalance;
@@ -2306,16 +2424,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask)
2306} 2424}
2307EXPORT_SYMBOL(get_zeroed_page); 2425EXPORT_SYMBOL(get_zeroed_page);
2308 2426
2309void __pagevec_free(struct pagevec *pvec)
2310{
2311 int i = pagevec_count(pvec);
2312
2313 while (--i >= 0) {
2314 trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
2315 free_hot_cold_page(pvec->pages[i], pvec->cold);
2316 }
2317}
2318
2319void __free_pages(struct page *page, unsigned int order) 2427void __free_pages(struct page *page, unsigned int order)
2320{ 2428{
2321 if (put_page_testzero(page)) { 2429 if (put_page_testzero(page)) {
@@ -3385,25 +3493,33 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3385 if (page_to_nid(page) != zone_to_nid(zone)) 3493 if (page_to_nid(page) != zone_to_nid(zone))
3386 continue; 3494 continue;
3387 3495
3388 /* Blocks with reserved pages will never free, skip them. */
3389 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3390 if (pageblock_is_reserved(pfn, block_end_pfn))
3391 continue;
3392
3393 block_migratetype = get_pageblock_migratetype(page); 3496 block_migratetype = get_pageblock_migratetype(page);
3394 3497
3395 /* If this block is reserved, account for it */ 3498 /* Only test what is necessary when the reserves are not met */
3396 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { 3499 if (reserve > 0) {
3397 reserve--; 3500 /*
3398 continue; 3501 * Blocks with reserved pages will never free, skip
3399 } 3502 * them.
3503 */
3504 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3505 if (pageblock_is_reserved(pfn, block_end_pfn))
3506 continue;
3400 3507
3401 /* Suitable for reserving if this block is movable */ 3508 /* If this block is reserved, account for it */
3402 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { 3509 if (block_migratetype == MIGRATE_RESERVE) {
3403 set_pageblock_migratetype(page, MIGRATE_RESERVE); 3510 reserve--;
3404 move_freepages_block(zone, page, MIGRATE_RESERVE); 3511 continue;
3405 reserve--; 3512 }
3406 continue; 3513
3514 /* Suitable for reserving if this block is movable */
3515 if (block_migratetype == MIGRATE_MOVABLE) {
3516 set_pageblock_migratetype(page,
3517 MIGRATE_RESERVE);
3518 move_freepages_block(zone, page,
3519 MIGRATE_RESERVE);
3520 reserve--;
3521 continue;
3522 }
3407 } 3523 }
3408 3524
3409 /* 3525 /*
@@ -4734,8 +4850,19 @@ static void calculate_totalreserve_pages(void)
4734 if (max > zone->present_pages) 4850 if (max > zone->present_pages)
4735 max = zone->present_pages; 4851 max = zone->present_pages;
4736 reserve_pages += max; 4852 reserve_pages += max;
4853 /*
4854 * Lowmem reserves are not available to
4855 * GFP_HIGHUSER page cache allocations and
4856 * kswapd tries to balance zones to their high
4857 * watermark. As a result, neither should be
4858 * regarded as dirtyable memory, to prevent a
4859 * situation where reclaim has to clean pages
4860 * in order to balance the zones.
4861 */
4862 zone->dirty_balance_reserve = max;
4737 } 4863 }
4738 } 4864 }
4865 dirty_balance_reserve = reserve_pages;
4739 totalreserve_pages = reserve_pages; 4866 totalreserve_pages = reserve_pages;
4740} 4867}
4741 4868
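
Among the page_alloc.c changes above, the watermark check now discounts (1 << order) - 1 pages instead of (1 << order) + 1: an order-N request removes 2^N pages, and the check keeps one of them in the comparison, so the old "+ 1" was two pages too strict. The following standalone illustration runs that arithmetic with example per-order free counts; it leaves out the lowmem_reserve and ALLOC_HIGH/ALLOC_HARDER adjustments of the real __zone_watermark_ok().

#include <stdio.h>

static int watermark_ok(long free_pages, const long *nr_free_at_order,
			int order, long mark)
{
	long min = mark;
	int o;

	free_pages -= (1L << order) - 1;	/* the corrected adjustment */
	if (free_pages <= min)
		return 0;
	for (o = 0; o < order; o++) {
		/* blocks stuck below the requested order cannot serve it */
		free_pages -= nr_free_at_order[o] << o;
		min >>= 1;
		if (free_pages <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	/* 40 order-0 pages, 10 order-1 and 4 order-2 blocks, 2 order-3 blocks */
	long nr_free[3] = { 40, 10, 4 };
	long free_pages = 40 + 10 * 2 + 4 * 4 + 2 * 8;	/* 92 pages in total */

	printf("order-3 request against watermark 32: %s\n",
	       watermark_ok(free_pages, nr_free, 3, 32) ? "ok" : "denied");
	return 0;
}
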
diff --git a/mm/rmap.c b/mm/rmap.c
index a4fd3680038b..a2e5ce1fa081 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -272,6 +272,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
272} 272}
273 273
274/* 274/*
275 * Some rmap walks need to find all ptes/hugepmds without false
276 * negatives (like migrate and split_huge_page) while running
277 * concurrently with operations that copy or move pagetables (like
278 * mremap() and fork()). To be safe, they depend on the anon_vma
279 * "same_anon_vma" list being in a certain order: the dst_vma must be placed after the
280 * src_vma in the list. This is always guaranteed by fork() but
281 * mremap() needs to call this function to enforce it in case the
282 * dst_vma isn't newly allocated and chained with the anon_vma_clone()
283 * function but just an extension of a pre-existing vma through
284 * vma_merge.
285 *
286 * NOTE: the same_anon_vma list can still be changed by other
287 * processes while mremap runs because mremap doesn't hold the
288 * anon_vma mutex to prevent modifications to the list while it
289 * runs. All we need to enforce is that the relative order of this
290 * process vmas isn't changing (we don't care about other vmas
291 * order). Each vma corresponds to an anon_vma_chain structure so
292 * there's no risk that other processes calling anon_vma_moveto_tail()
293 * and changing the same_anon_vma list under mremap() will screw with
294 * the relative order of this process's vmas in the list, because
295 * they can't alter the order of any vma that belongs to this
296 * process. And there can't be another anon_vma_moveto_tail() running
297 * concurrently with mremap() coming from this process because we hold
298 * the mmap_sem for the whole mremap(). fork() ordering dependency
299 * also shouldn't be affected because fork() only cares that the
300 * parent vmas are placed in the list before the child vmas and
301 * anon_vma_moveto_tail() won't reorder vmas from either the fork()
302 * parent or child.
303 */
304void anon_vma_moveto_tail(struct vm_area_struct *dst)
305{
306 struct anon_vma_chain *pavc;
307 struct anon_vma *root = NULL;
308
309 list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
310 struct anon_vma *anon_vma = pavc->anon_vma;
311 VM_BUG_ON(pavc->vma != dst);
312 root = lock_anon_vma_root(root, anon_vma);
313 list_del(&pavc->same_anon_vma);
314 list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
315 }
316 unlock_anon_vma_root(root);
317}
318
319/*
275 * Attach vma to its own anon_vma, as well as to the anon_vmas that 320 * Attach vma to its own anon_vma, as well as to the anon_vmas that
276 * the corresponding VMA in the parent process is attached to. 321 * the corresponding VMA in the parent process is attached to.
277 * Returns 0 on success, non-zero on failure. 322 * Returns 0 on success, non-zero on failure.
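The comment above names the only intended caller of anon_vma_moveto_tail(): mremap(),
when copy_vma()/vma_merge() hands back a pre-existing destination vma instead of a
freshly cloned one. A hypothetical call-site sketch (the real mm/mremap.c hunk is
elsewhere in this series and may differ in its exact condition):

	/* new_vma may be an existing vma extended by vma_merge() */
	if (vma->anon_vma && !new_vma_was_newly_allocated)
		anon_vma_moveto_tail(new_vma);	/* keep dst after src on same_anon_vma */

Here new_vma_was_newly_allocated stands in for whatever bookkeeping the actual patch
uses to detect the vma_merge() case.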
diff --git a/mm/slub.c b/mm/slub.c
index 025f6ac51569..d99acbf14e01 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3654,6 +3654,9 @@ void __init kmem_cache_init(void)
3654 struct kmem_cache *temp_kmem_cache_node; 3654 struct kmem_cache *temp_kmem_cache_node;
3655 unsigned long kmalloc_size; 3655 unsigned long kmalloc_size;
3656 3656
3657 if (debug_guardpage_minorder())
3658 slub_max_order = 0;
3659
3657 kmem_size = offsetof(struct kmem_cache, node) + 3660 kmem_size = offsetof(struct kmem_cache, node) +
3658 nr_node_ids * sizeof(struct kmem_cache_node *); 3661 nr_node_ids * sizeof(struct kmem_cache_node *);
3659 3662
diff --git a/mm/swap.c b/mm/swap.c
index a91caf754d9b..67a09a633a09 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -585,11 +585,10 @@ int lru_add_drain_all(void)
585void release_pages(struct page **pages, int nr, int cold) 585void release_pages(struct page **pages, int nr, int cold)
586{ 586{
587 int i; 587 int i;
588 struct pagevec pages_to_free; 588 LIST_HEAD(pages_to_free);
589 struct zone *zone = NULL; 589 struct zone *zone = NULL;
590 unsigned long uninitialized_var(flags); 590 unsigned long uninitialized_var(flags);
591 591
592 pagevec_init(&pages_to_free, cold);
593 for (i = 0; i < nr; i++) { 592 for (i = 0; i < nr; i++) {
594 struct page *page = pages[i]; 593 struct page *page = pages[i];
595 594
@@ -620,19 +619,12 @@ void release_pages(struct page **pages, int nr, int cold)
620 del_page_from_lru(zone, page); 619 del_page_from_lru(zone, page);
621 } 620 }
622 621
623 if (!pagevec_add(&pages_to_free, page)) { 622 list_add(&page->lru, &pages_to_free);
624 if (zone) {
625 spin_unlock_irqrestore(&zone->lru_lock, flags);
626 zone = NULL;
627 }
628 __pagevec_free(&pages_to_free);
629 pagevec_reinit(&pages_to_free);
630 }
631 } 623 }
632 if (zone) 624 if (zone)
633 spin_unlock_irqrestore(&zone->lru_lock, flags); 625 spin_unlock_irqrestore(&zone->lru_lock, flags);
634 626
635 pagevec_free(&pages_to_free); 627 free_hot_cold_page_list(&pages_to_free, cold);
636} 628}
637EXPORT_SYMBOL(release_pages); 629EXPORT_SYMBOL(release_pages);
638 630
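release_pages() now collects the pages on a plain list and frees them in one pass
through free_hot_cold_page_list(), a helper this series adds to mm/page_alloc.c (its
hunk is not shown above). A minimal sketch of such a helper, assuming it simply
drains the list into free_hot_cold_page():

	void free_hot_cold_page_list(struct list_head *list, int cold)
	{
		struct page *page, *next;

		list_for_each_entry_safe(page, next, list, lru) {
			/*
			 * Tracepoint name assumed; the removed __pagevec_free()
			 * traced each page the same way.
			 */
			trace_mm_pagevec_free(page, cold);
			free_hot_cold_page(page, cold);
		}
	}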
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b1cd12060723..9520592d4231 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -667,10 +667,10 @@ int try_to_free_swap(struct page *page)
667 * original page might be freed under memory pressure, then 667 * original page might be freed under memory pressure, then
668 * later read back in from swap, now with the wrong data. 668 * later read back in from swap, now with the wrong data.
669 * 669 *
 670 * Hibernation clears bits from gfp_allowed_mask to prevent 670 * Hibernation suspends storage while it is writing the image
 671 * memory reclaim from writing to disk, so check that here. 671 * to disk, so check that here.
672 */ 672 */
673 if (!(gfp_allowed_mask & __GFP_IO)) 673 if (pm_suspended_storage())
674 return 0; 674 return 0;
675 675
676 delete_from_swap_cache(page); 676 delete_from_swap_cache(page);
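pm_suspended_storage() replaces the open-coded gfp_allowed_mask test on the old side
of this hunk. include/linux/gfp.h is not part of this diff, so the body below is a
sketch of what the helper presumably checks, not a quote of the actual definition:

	static inline bool pm_suspended_storage(void)
	{
		/*
		 * Hibernation masks __GFP_IO/__GFP_FS off gfp_allowed_mask
		 * while the image is being written, making storage unusable.
		 */
		return (gfp_allowed_mask & (__GFP_IO | __GFP_FS)) !=
		       (__GFP_IO | __GFP_FS);
	}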
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 21fdf46ad5aa..877ca046f43d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -256,7 +256,7 @@ struct vmap_area {
256 struct rb_node rb_node; /* address sorted rbtree */ 256 struct rb_node rb_node; /* address sorted rbtree */
257 struct list_head list; /* address sorted list */ 257 struct list_head list; /* address sorted list */
258 struct list_head purge_list; /* "lazy purge" list */ 258 struct list_head purge_list; /* "lazy purge" list */
259 void *private; 259 struct vm_struct *vm;
260 struct rcu_head rcu_head; 260 struct rcu_head rcu_head;
261}; 261};
262 262
@@ -1285,7 +1285,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1285 vm->addr = (void *)va->va_start; 1285 vm->addr = (void *)va->va_start;
1286 vm->size = va->va_end - va->va_start; 1286 vm->size = va->va_end - va->va_start;
1287 vm->caller = caller; 1287 vm->caller = caller;
1288 va->private = vm; 1288 va->vm = vm;
1289 va->flags |= VM_VM_AREA; 1289 va->flags |= VM_VM_AREA;
1290} 1290}
1291 1291
@@ -1408,7 +1408,7 @@ static struct vm_struct *find_vm_area(const void *addr)
1408 1408
1409 va = find_vmap_area((unsigned long)addr); 1409 va = find_vmap_area((unsigned long)addr);
1410 if (va && va->flags & VM_VM_AREA) 1410 if (va && va->flags & VM_VM_AREA)
1411 return va->private; 1411 return va->vm;
1412 1412
1413 return NULL; 1413 return NULL;
1414} 1414}
@@ -1427,7 +1427,7 @@ struct vm_struct *remove_vm_area(const void *addr)
1427 1427
1428 va = find_vmap_area((unsigned long)addr); 1428 va = find_vmap_area((unsigned long)addr);
1429 if (va && va->flags & VM_VM_AREA) { 1429 if (va && va->flags & VM_VM_AREA) {
1430 struct vm_struct *vm = va->private; 1430 struct vm_struct *vm = va->vm;
1431 1431
1432 if (!(vm->flags & VM_UNLIST)) { 1432 if (!(vm->flags & VM_UNLIST)) {
1433 struct vm_struct *tmp, **p; 1433 struct vm_struct *tmp, **p;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 11adc890ce30..26f4a8a4e0c7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -715,7 +715,13 @@ static enum page_references page_check_references(struct page *page,
715 */ 715 */
716 SetPageReferenced(page); 716 SetPageReferenced(page);
717 717
718 if (referenced_page) 718 if (referenced_page || referenced_ptes > 1)
719 return PAGEREF_ACTIVATE;
720
721 /*
722 * Activate file-backed executable pages after first usage.
723 */
724 if (vm_flags & VM_EXEC)
719 return PAGEREF_ACTIVATE; 725 return PAGEREF_ACTIVATE;
720 726
721 return PAGEREF_KEEP; 727 return PAGEREF_KEEP;
@@ -728,24 +734,6 @@ static enum page_references page_check_references(struct page *page,
728 return PAGEREF_RECLAIM; 734 return PAGEREF_RECLAIM;
729} 735}
730 736
731static noinline_for_stack void free_page_list(struct list_head *free_pages)
732{
733 struct pagevec freed_pvec;
734 struct page *page, *tmp;
735
736 pagevec_init(&freed_pvec, 1);
737
738 list_for_each_entry_safe(page, tmp, free_pages, lru) {
739 list_del(&page->lru);
740 if (!pagevec_add(&freed_pvec, page)) {
741 __pagevec_free(&freed_pvec);
742 pagevec_reinit(&freed_pvec);
743 }
744 }
745
746 pagevec_free(&freed_pvec);
747}
748
749/* 737/*
750 * shrink_page_list() returns the number of reclaimed pages 738 * shrink_page_list() returns the number of reclaimed pages
751 */ 739 */
@@ -1009,7 +997,7 @@ keep_lumpy:
1009 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) 997 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
1010 zone_set_flag(zone, ZONE_CONGESTED); 998 zone_set_flag(zone, ZONE_CONGESTED);
1011 999
1012 free_page_list(&free_pages); 1000 free_hot_cold_page_list(&free_pages, 1);
1013 1001
1014 list_splice(&ret_pages, page_list); 1002 list_splice(&ret_pages, page_list);
1015 count_vm_events(PGACTIVATE, pgactivate); 1003 count_vm_events(PGACTIVATE, pgactivate);
@@ -1178,14 +1166,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1178 * anon page which don't already have a swap slot is 1166 * anon page which don't already have a swap slot is
1179 * pointless. 1167 * pointless.
1180 */ 1168 */
1181 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1169 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1182 !PageSwapCache(cursor_page)) 1170 !PageSwapCache(cursor_page))
1183 break; 1171 break;
1184 1172
1185 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1173 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1186 list_move(&cursor_page->lru, dst); 1174 list_move(&cursor_page->lru, dst);
1187 mem_cgroup_del_lru(cursor_page); 1175 mem_cgroup_del_lru(cursor_page);
1188 nr_taken += hpage_nr_pages(page); 1176 nr_taken += hpage_nr_pages(cursor_page);
1189 nr_lumpy_taken++; 1177 nr_lumpy_taken++;
1190 if (PageDirty(cursor_page)) 1178 if (PageDirty(cursor_page))
1191 nr_lumpy_dirty++; 1179 nr_lumpy_dirty++;
@@ -2012,8 +2000,9 @@ static inline bool should_continue_reclaim(struct zone *zone,
2012 * inactive lists are large enough, continue reclaiming 2000 * inactive lists are large enough, continue reclaiming
2013 */ 2001 */
2014 pages_for_compaction = (2UL << sc->order); 2002 pages_for_compaction = (2UL << sc->order);
2015 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + 2003 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
2016 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 2004 if (nr_swap_pages > 0)
2005 inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
2017 if (sc->nr_reclaimed < pages_for_compaction && 2006 if (sc->nr_reclaimed < pages_for_compaction &&
2018 inactive_lru_pages > pages_for_compaction) 2007 inactive_lru_pages > pages_for_compaction)
2019 return true; 2008 return true;
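Reassembled from the added lines, the check now counts inactive anon pages toward
compaction headroom only when swap is actually available:

	pages_for_compaction = (2UL << sc->order);
	inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
	if (nr_swap_pages > 0)
		inactive_lru_pages += zone_nr_lru_pages(zone, sc,
							LRU_INACTIVE_ANON);
	if (sc->nr_reclaimed < pages_for_compaction &&
	    inactive_lru_pages > pages_for_compaction)
		return true;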
@@ -3448,9 +3437,10 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
3448static void warn_scan_unevictable_pages(void) 3437static void warn_scan_unevictable_pages(void)
3449{ 3438{
3450 printk_once(KERN_WARNING 3439 printk_once(KERN_WARNING
3451 "The scan_unevictable_pages sysctl/node-interface has been " 3440 "%s: The scan_unevictable_pages sysctl/node-interface has been "
3452 "disabled for lack of a legitimate use case. If you have " 3441 "disabled for lack of a legitimate use case. If you have "
3453 "one, please send an email to linux-mm@kvack.org.\n"); 3442 "one, please send an email to linux-mm@kvack.org.\n",
3443 current->comm);
3454} 3444}
3455 3445
3456/* 3446/*