author      Linus Torvalds <torvalds@linux-foundation.org>   2012-01-10 19:42:48 -0500
committer   Linus Torvalds <torvalds@linux-foundation.org>   2012-01-10 19:42:48 -0500
commit      40ba587923ae67090d9f141c1d3c951be5c1420e (patch)
tree        342a72fc0ee13a0d2496ef970b64dfeadf1355d2 /mm
parent      54c2c5761febcca46c8037d3a81612991e6c209a (diff)
parent      6b550f9495947fc279d12c38feaf98500e8d0646 (diff)
Merge branch 'akpm' (aka "Andrew's patch-bomb")
Andrew elucidates:
- First installment of MM. We have a HUGE number of MM patches this
time. It's crazy.
- MAINTAINERS updates
- backlight updates
- leds
- checkpatch updates
- misc ELF stuff
- rtc updates
- reiserfs
- procfs
- some misc other bits
* akpm: (124 commits)
user namespace: make signal.c respect user namespaces
workqueue: make alloc_workqueue() take printf fmt and args for name
procfs: add hidepid= and gid= mount options
procfs: parse mount options
procfs: introduce the /proc/<pid>/map_files/ directory
procfs: make proc_get_link to use dentry instead of inode
signal: add block_sigmask() for adding sigmask to current->blocked
sparc: make SA_NOMASK a synonym of SA_NODEFER
reiserfs: don't lock root inode searching
reiserfs: don't lock journal_init()
reiserfs: delay reiserfs lock until journal initialization
reiserfs: delete comments referring to the BKL
drivers/rtc/interface.c: fix alarm rollover when day or month is out-of-range
drivers/rtc/rtc-twl.c: add DT support for RTC inside twl4030/twl6030
drivers/rtc/: remove redundant spi driver bus initialization
drivers/rtc/rtc-jz4740.c: make jz4740_rtc_driver static
drivers/rtc/rtc-mc13xxx.c: make mc13xxx_rtc_idtable static
rtc: convert drivers/rtc/* to use module_platform_driver()
drivers/rtc/rtc-wm831x.c: convert to devm_kzalloc()
drivers/rtc/rtc-wm831x.c: remove unused period IRQ handler
...
Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig.debug        5
-rw-r--r--   mm/bootmem.c           24
-rw-r--r--   mm/compaction.c         4
-rw-r--r--   mm/fadvise.c            3
-rw-r--r--   mm/filemap.c            5
-rw-r--r--   mm/hugetlb.c           19
-rw-r--r--   mm/mempolicy.c         14
-rw-r--r--   mm/mempool.c          104
-rw-r--r--   mm/migrate.c           14
-rw-r--r--   mm/mmap.c              60
-rw-r--r--   mm/mremap.c             9
-rw-r--r--   mm/oom_kill.c           6
-rw-r--r--   mm/page-writeback.c   290
-rw-r--r--   mm/page_alloc.c       253
-rw-r--r--   mm/rmap.c              45
-rw-r--r--   mm/slub.c               3
-rw-r--r--   mm/swap.c              14
-rw-r--r--   mm/swapfile.c           6
-rw-r--r--   mm/vmalloc.c            8
-rw-r--r--   mm/vmscan.c            42
20 files changed, 615 insertions, 313 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 8b1a477162dc..4b2443254de2 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC | |||
4 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC | 4 | depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC |
5 | depends on !KMEMCHECK | 5 | depends on !KMEMCHECK |
6 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | 6 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC |
7 | select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||
7 | ---help--- | 8 | ---help--- |
8 | Unmap pages from the kernel linear mapping after free_pages(). | 9 | Unmap pages from the kernel linear mapping after free_pages(). |
9 | This results in a large slowdown, but helps to find certain types | 10 | This results in a large slowdown, but helps to find certain types |
@@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS | |||
22 | config PAGE_POISONING | 23 | config PAGE_POISONING |
23 | bool | 24 | bool |
24 | select WANT_PAGE_DEBUG_FLAGS | 25 | select WANT_PAGE_DEBUG_FLAGS |
26 | |||
27 | config PAGE_GUARD | ||
28 | bool | ||
29 | select WANT_PAGE_DEBUG_FLAGS | ||
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1a77012ecdb3..668e94df8cf2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup); | |||
56 | 56 | ||
57 | static unsigned long __init bootmap_bytes(unsigned long pages) | 57 | static unsigned long __init bootmap_bytes(unsigned long pages) |
58 | { | 58 | { |
59 | unsigned long bytes = (pages + 7) / 8; | 59 | unsigned long bytes = DIV_ROUND_UP(pages, 8); |
60 | 60 | ||
61 | return ALIGN(bytes, sizeof(long)); | 61 | return ALIGN(bytes, sizeof(long)); |
62 | } | 62 | } |
@@ -171,7 +171,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
171 | 171 | ||
172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 172 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
173 | { | 173 | { |
174 | int aligned; | ||
175 | struct page *page; | 174 | struct page *page; |
176 | unsigned long start, end, pages, count = 0; | 175 | unsigned long start, end, pages, count = 0; |
177 | 176 | ||
@@ -181,14 +180,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
181 | start = bdata->node_min_pfn; | 180 | start = bdata->node_min_pfn; |
182 | end = bdata->node_low_pfn; | 181 | end = bdata->node_low_pfn; |
183 | 182 | ||
184 | /* | 183 | bdebug("nid=%td start=%lx end=%lx\n", |
185 | * If the start is aligned to the machines wordsize, we might | 184 | bdata - bootmem_node_data, start, end); |
186 | * be able to free pages in bulks of that order. | ||
187 | */ | ||
188 | aligned = !(start & (BITS_PER_LONG - 1)); | ||
189 | |||
190 | bdebug("nid=%td start=%lx end=%lx aligned=%d\n", | ||
191 | bdata - bootmem_node_data, start, end, aligned); | ||
192 | 185 | ||
193 | while (start < end) { | 186 | while (start < end) { |
194 | unsigned long *map, idx, vec; | 187 | unsigned long *map, idx, vec; |
@@ -196,12 +189,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
196 | map = bdata->node_bootmem_map; | 189 | map = bdata->node_bootmem_map; |
197 | idx = start - bdata->node_min_pfn; | 190 | idx = start - bdata->node_min_pfn; |
198 | vec = ~map[idx / BITS_PER_LONG]; | 191 | vec = ~map[idx / BITS_PER_LONG]; |
199 | 192 | /* | |
200 | if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { | 193 | * If we have a properly aligned and fully unreserved |
194 | * BITS_PER_LONG block of pages in front of us, free | ||
195 | * it in one go. | ||
196 | */ | ||
197 | if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { | ||
201 | int order = ilog2(BITS_PER_LONG); | 198 | int order = ilog2(BITS_PER_LONG); |
202 | 199 | ||
203 | __free_pages_bootmem(pfn_to_page(start), order); | 200 | __free_pages_bootmem(pfn_to_page(start), order); |
204 | count += BITS_PER_LONG; | 201 | count += BITS_PER_LONG; |
202 | start += BITS_PER_LONG; | ||
205 | } else { | 203 | } else { |
206 | unsigned long off = 0; | 204 | unsigned long off = 0; |
207 | 205 | ||
@@ -214,8 +212,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
214 | vec >>= 1; | 212 | vec >>= 1; |
215 | off++; | 213 | off++; |
216 | } | 214 | } |
215 | start = ALIGN(start + 1, BITS_PER_LONG); | ||
217 | } | 216 | } |
218 | start += BITS_PER_LONG; | ||
219 | } | 217 | } |
220 | 218 | ||
221 | page = virt_to_page(bdata->node_bootmem_map); | 219 | page = virt_to_page(bdata->node_bootmem_map); |
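For readers skimming the bootmem hunk above, here is a minimal user-space sketch of the reworked bitmap walk, assuming start/end are indices into the node's used-page bitmap; WORD_BITS and count_freeable() are illustrative stand-ins for BITS_PER_LONG and the real __free_pages_bootmem() calls, not kernel code:

#include <limits.h>

#define WORD_BITS (sizeof(unsigned long) * CHAR_BIT)

static unsigned long count_freeable(const unsigned long *used_map,
                                    unsigned long start, unsigned long end)
{
        unsigned long count = 0;

        while (start < end) {
                /* bootmem marks USED pages, so the inverted word is "free" */
                unsigned long vec = ~used_map[start / WORD_BITS];

                if (start % WORD_BITS == 0 && vec == ~0UL) {
                        /* aligned and fully free: take the whole word in one
                         * step (the kernel frees a single page of order
                         * ilog2(BITS_PER_LONG) here) */
                        count += WORD_BITS;
                        start += WORD_BITS;
                } else {
                        unsigned long off = 0;

                        while (vec && off < WORD_BITS) {
                                if (vec & 1)
                                        count++;        /* one 0-order free */
                                vec >>= 1;
                                off++;
                        }
                        /* equivalent of ALIGN(start + 1, BITS_PER_LONG) */
                        start = (start / WORD_BITS + 1) * WORD_BITS;
                }
        }
        return count;
}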
diff --git a/mm/compaction.c b/mm/compaction.c
index 1253d7ac332b..e6670c34eb49 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -365,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
365 | nr_isolated++; | 365 | nr_isolated++; |
366 | 366 | ||
367 | /* Avoid isolating too much */ | 367 | /* Avoid isolating too much */ |
368 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) | 368 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
369 | ++low_pfn; | ||
369 | break; | 370 | break; |
371 | } | ||
370 | } | 372 | } |
371 | 373 | ||
372 | acct_isolated(zone, cc); | 374 | acct_isolated(zone, cc); |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 8d723c9e8b75..469491e0af79 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
117 | break; | 117 | break; |
118 | case POSIX_FADV_DONTNEED: | 118 | case POSIX_FADV_DONTNEED: |
119 | if (!bdi_write_congested(mapping->backing_dev_info)) | 119 | if (!bdi_write_congested(mapping->backing_dev_info)) |
120 | filemap_flush(mapping); | 120 | __filemap_fdatawrite_range(mapping, offset, endbyte, |
121 | WB_SYNC_NONE); | ||
121 | 122 | ||
122 | /* First and last FULL page! */ | 123 | /* First and last FULL page! */ |
123 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; | 124 | start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; |
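The fadvise hunk above only narrows the kernel-side writeback to the advised range. For context, a hedged user-space sketch of the call this affects; drop_cached_range() and the preceding fsync() are an illustrative convention, not part of the patch:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>

/* Flush a byte range to disk, then ask the kernel to drop it from the
 * page cache. posix_fadvise() returns 0 or a positive errno value. */
static int drop_cached_range(int fd, off_t offset, off_t len)
{
        if (fsync(fd) != 0)     /* write dirty pages first */
                return -1;
        return posix_fadvise(fd, offset, len, POSIX_FADV_DONTNEED);
}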
diff --git a/mm/filemap.c b/mm/filemap.c
index a0701e6eec10..c4ee2e918bea 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2351,8 +2351,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
2351 | pgoff_t index, unsigned flags) | 2351 | pgoff_t index, unsigned flags) |
2352 | { | 2352 | { |
2353 | int status; | 2353 | int status; |
2354 | gfp_t gfp_mask; | ||
2354 | struct page *page; | 2355 | struct page *page; |
2355 | gfp_t gfp_notmask = 0; | 2356 | gfp_t gfp_notmask = 0; |
2357 | |||
2358 | gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; | ||
2356 | if (flags & AOP_FLAG_NOFS) | 2359 | if (flags & AOP_FLAG_NOFS) |
2357 | gfp_notmask = __GFP_FS; | 2360 | gfp_notmask = __GFP_FS; |
2358 | repeat: | 2361 | repeat: |
@@ -2360,7 +2363,7 @@ repeat: | |||
2360 | if (page) | 2363 | if (page) |
2361 | goto found; | 2364 | goto found; |
2362 | 2365 | ||
2363 | page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); | 2366 | page = __page_cache_alloc(gfp_mask & ~gfp_notmask); |
2364 | if (!page) | 2367 | if (!page) |
2365 | return NULL; | 2368 | return NULL; |
2366 | status = add_to_page_cache_lru(page, mapping, index, | 2369 | status = add_to_page_cache_lru(page, mapping, index, |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7acd12503f73..ea8c3a4cd2ae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -800,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
800 | 800 | ||
801 | if (page && arch_prepare_hugepage(page)) { | 801 | if (page && arch_prepare_hugepage(page)) { |
802 | __free_pages(page, huge_page_order(h)); | 802 | __free_pages(page, huge_page_order(h)); |
803 | return NULL; | 803 | page = NULL; |
804 | } | 804 | } |
805 | 805 | ||
806 | spin_lock(&hugetlb_lock); | 806 | spin_lock(&hugetlb_lock); |
@@ -2315,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2315 | * from page cache lookup which is in HPAGE_SIZE units. | 2315 | * from page cache lookup which is in HPAGE_SIZE units. |
2316 | */ | 2316 | */ |
2317 | address = address & huge_page_mask(h); | 2317 | address = address & huge_page_mask(h); |
2318 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) | 2318 | pgoff = vma_hugecache_offset(h, vma, address); |
2319 | + (vma->vm_pgoff >> PAGE_SHIFT); | ||
2320 | mapping = (struct address_space *)page_private(page); | 2319 | mapping = (struct address_space *)page_private(page); |
2321 | 2320 | ||
2322 | /* | 2321 | /* |
@@ -2349,6 +2348,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2349 | 2348 | ||
2350 | /* | 2349 | /* |
2351 | * Hugetlb_cow() should be called with page lock of the original hugepage held. | 2350 | * Hugetlb_cow() should be called with page lock of the original hugepage held. |
2351 | * Called with hugetlb_instantiation_mutex held and pte_page locked so we | ||
2352 | * cannot race with other handlers or page migration. | ||
2353 | * Keep the pte_same checks anyway to make transition from the mutex easier. | ||
2352 | */ | 2354 | */ |
2353 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | 2355 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
2354 | unsigned long address, pte_t *ptep, pte_t pte, | 2356 | unsigned long address, pte_t *ptep, pte_t pte, |
@@ -2408,7 +2410,14 @@ retry_avoidcopy: | |||
2408 | BUG_ON(page_count(old_page) != 1); | 2410 | BUG_ON(page_count(old_page) != 1); |
2409 | BUG_ON(huge_pte_none(pte)); | 2411 | BUG_ON(huge_pte_none(pte)); |
2410 | spin_lock(&mm->page_table_lock); | 2412 | spin_lock(&mm->page_table_lock); |
2411 | goto retry_avoidcopy; | 2413 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
2414 | if (likely(pte_same(huge_ptep_get(ptep), pte))) | ||
2415 | goto retry_avoidcopy; | ||
2416 | /* | ||
2417 | * race occurs while re-acquiring page_table_lock, and | ||
2418 | * our job is done. | ||
2419 | */ | ||
2420 | return 0; | ||
2412 | } | 2421 | } |
2413 | WARN_ON_ONCE(1); | 2422 | WARN_ON_ONCE(1); |
2414 | } | 2423 | } |
@@ -2630,6 +2639,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2630 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 2639 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
2631 | struct hstate *h = hstate_vma(vma); | 2640 | struct hstate *h = hstate_vma(vma); |
2632 | 2641 | ||
2642 | address &= huge_page_mask(h); | ||
2643 | |||
2633 | ptep = huge_pte_offset(mm, address); | 2644 | ptep = huge_pte_offset(mm, address); |
2634 | if (ptep) { | 2645 | if (ptep) { |
2635 | entry = huge_ptep_get(ptep); | 2646 | entry = huge_ptep_get(ptep); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c3fdbcb17658..e3d58f088466 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1983,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, | |||
1983 | } | 1983 | } |
1984 | 1984 | ||
1985 | /* Slow path of a mempolicy comparison */ | 1985 | /* Slow path of a mempolicy comparison */ |
1986 | int __mpol_equal(struct mempolicy *a, struct mempolicy *b) | 1986 | bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) |
1987 | { | 1987 | { |
1988 | if (!a || !b) | 1988 | if (!a || !b) |
1989 | return 0; | 1989 | return false; |
1990 | if (a->mode != b->mode) | 1990 | if (a->mode != b->mode) |
1991 | return 0; | 1991 | return false; |
1992 | if (a->flags != b->flags) | 1992 | if (a->flags != b->flags) |
1993 | return 0; | 1993 | return false; |
1994 | if (mpol_store_user_nodemask(a)) | 1994 | if (mpol_store_user_nodemask(a)) |
1995 | if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) | 1995 | if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) |
1996 | return 0; | 1996 | return false; |
1997 | 1997 | ||
1998 | switch (a->mode) { | 1998 | switch (a->mode) { |
1999 | case MPOL_BIND: | 1999 | case MPOL_BIND: |
2000 | /* Fall through */ | 2000 | /* Fall through */ |
2001 | case MPOL_INTERLEAVE: | 2001 | case MPOL_INTERLEAVE: |
2002 | return nodes_equal(a->v.nodes, b->v.nodes); | 2002 | return !!nodes_equal(a->v.nodes, b->v.nodes); |
2003 | case MPOL_PREFERRED: | 2003 | case MPOL_PREFERRED: |
2004 | return a->v.preferred_node == b->v.preferred_node; | 2004 | return a->v.preferred_node == b->v.preferred_node; |
2005 | default: | 2005 | default: |
2006 | BUG(); | 2006 | BUG(); |
2007 | return 0; | 2007 | return false; |
2008 | } | 2008 | } |
2009 | } | 2009 | } |
2010 | 2010 | ||
diff --git a/mm/mempool.c b/mm/mempool.c
index e73641b79bb5..d9049811f352 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -27,7 +27,15 @@ static void *remove_element(mempool_t *pool) | |||
27 | return pool->elements[--pool->curr_nr]; | 27 | return pool->elements[--pool->curr_nr]; |
28 | } | 28 | } |
29 | 29 | ||
30 | static void free_pool(mempool_t *pool) | 30 | /** |
31 | * mempool_destroy - deallocate a memory pool | ||
32 | * @pool: pointer to the memory pool which was allocated via | ||
33 | * mempool_create(). | ||
34 | * | ||
35 | * Free all reserved elements in @pool and @pool itself. This function | ||
36 | * only sleeps if the free_fn() function sleeps. | ||
37 | */ | ||
38 | void mempool_destroy(mempool_t *pool) | ||
31 | { | 39 | { |
32 | while (pool->curr_nr) { | 40 | while (pool->curr_nr) { |
33 | void *element = remove_element(pool); | 41 | void *element = remove_element(pool); |
@@ -36,6 +44,7 @@ static void free_pool(mempool_t *pool) | |||
36 | kfree(pool->elements); | 44 | kfree(pool->elements); |
37 | kfree(pool); | 45 | kfree(pool); |
38 | } | 46 | } |
47 | EXPORT_SYMBOL(mempool_destroy); | ||
39 | 48 | ||
40 | /** | 49 | /** |
41 | * mempool_create - create a memory pool | 50 | * mempool_create - create a memory pool |
@@ -86,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, | |||
86 | 95 | ||
87 | element = pool->alloc(GFP_KERNEL, pool->pool_data); | 96 | element = pool->alloc(GFP_KERNEL, pool->pool_data); |
88 | if (unlikely(!element)) { | 97 | if (unlikely(!element)) { |
89 | free_pool(pool); | 98 | mempool_destroy(pool); |
90 | return NULL; | 99 | return NULL; |
91 | } | 100 | } |
92 | add_element(pool, element); | 101 | add_element(pool, element); |
@@ -172,23 +181,6 @@ out: | |||
172 | EXPORT_SYMBOL(mempool_resize); | 181 | EXPORT_SYMBOL(mempool_resize); |
173 | 182 | ||
174 | /** | 183 | /** |
175 | * mempool_destroy - deallocate a memory pool | ||
176 | * @pool: pointer to the memory pool which was allocated via | ||
177 | * mempool_create(). | ||
178 | * | ||
179 | * this function only sleeps if the free_fn() function sleeps. The caller | ||
180 | * has to guarantee that all elements have been returned to the pool (ie: | ||
181 | * freed) prior to calling mempool_destroy(). | ||
182 | */ | ||
183 | void mempool_destroy(mempool_t *pool) | ||
184 | { | ||
185 | /* Check for outstanding elements */ | ||
186 | BUG_ON(pool->curr_nr != pool->min_nr); | ||
187 | free_pool(pool); | ||
188 | } | ||
189 | EXPORT_SYMBOL(mempool_destroy); | ||
190 | |||
191 | /** | ||
192 | * mempool_alloc - allocate an element from a specific memory pool | 184 | * mempool_alloc - allocate an element from a specific memory pool |
193 | * @pool: pointer to the memory pool which was allocated via | 185 | * @pool: pointer to the memory pool which was allocated via |
194 | * mempool_create(). | 186 | * mempool_create(). |
@@ -224,28 +216,40 @@ repeat_alloc: | |||
224 | if (likely(pool->curr_nr)) { | 216 | if (likely(pool->curr_nr)) { |
225 | element = remove_element(pool); | 217 | element = remove_element(pool); |
226 | spin_unlock_irqrestore(&pool->lock, flags); | 218 | spin_unlock_irqrestore(&pool->lock, flags); |
219 | /* paired with rmb in mempool_free(), read comment there */ | ||
220 | smp_wmb(); | ||
227 | return element; | 221 | return element; |
228 | } | 222 | } |
229 | spin_unlock_irqrestore(&pool->lock, flags); | ||
230 | 223 | ||
231 | /* We must not sleep in the GFP_ATOMIC case */ | 224 | /* |
232 | if (!(gfp_mask & __GFP_WAIT)) | 225 | * We use gfp mask w/o __GFP_WAIT or IO for the first round. If |
226 | * alloc failed with that and @pool was empty, retry immediately. | ||
227 | */ | ||
228 | if (gfp_temp != gfp_mask) { | ||
229 | spin_unlock_irqrestore(&pool->lock, flags); | ||
230 | gfp_temp = gfp_mask; | ||
231 | goto repeat_alloc; | ||
232 | } | ||
233 | |||
234 | /* We must not sleep if !__GFP_WAIT */ | ||
235 | if (!(gfp_mask & __GFP_WAIT)) { | ||
236 | spin_unlock_irqrestore(&pool->lock, flags); | ||
233 | return NULL; | 237 | return NULL; |
238 | } | ||
234 | 239 | ||
235 | /* Now start performing page reclaim */ | 240 | /* Let's wait for someone else to return an element to @pool */ |
236 | gfp_temp = gfp_mask; | ||
237 | init_wait(&wait); | 241 | init_wait(&wait); |
238 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); | 242 | prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); |
239 | smp_mb(); | ||
240 | if (!pool->curr_nr) { | ||
241 | /* | ||
242 | * FIXME: this should be io_schedule(). The timeout is there | ||
243 | * as a workaround for some DM problems in 2.6.18. | ||
244 | */ | ||
245 | io_schedule_timeout(5*HZ); | ||
246 | } | ||
247 | finish_wait(&pool->wait, &wait); | ||
248 | 243 | ||
244 | spin_unlock_irqrestore(&pool->lock, flags); | ||
245 | |||
246 | /* | ||
247 | * FIXME: this should be io_schedule(). The timeout is there as a | ||
248 | * workaround for some DM problems in 2.6.18. | ||
249 | */ | ||
250 | io_schedule_timeout(5*HZ); | ||
251 | |||
252 | finish_wait(&pool->wait, &wait); | ||
249 | goto repeat_alloc; | 253 | goto repeat_alloc; |
250 | } | 254 | } |
251 | EXPORT_SYMBOL(mempool_alloc); | 255 | EXPORT_SYMBOL(mempool_alloc); |
@@ -265,7 +269,39 @@ void mempool_free(void *element, mempool_t *pool) | |||
265 | if (unlikely(element == NULL)) | 269 | if (unlikely(element == NULL)) |
266 | return; | 270 | return; |
267 | 271 | ||
268 | smp_mb(); | 272 | /* |
273 | * Paired with the wmb in mempool_alloc(). The preceding read is | ||
274 | * for @element and the following @pool->curr_nr. This ensures | ||
275 | * that the visible value of @pool->curr_nr is from after the | ||
276 | * allocation of @element. This is necessary for fringe cases | ||
277 | * where @element was passed to this task without going through | ||
278 | * barriers. | ||
279 | * | ||
280 | * For example, assume @p is %NULL at the beginning and one task | ||
281 | * performs "p = mempool_alloc(...);" while another task is doing | ||
282 | * "while (!p) cpu_relax(); mempool_free(p, ...);". This function | ||
283 | * may end up using curr_nr value which is from before allocation | ||
284 | * of @p without the following rmb. | ||
285 | */ | ||
286 | smp_rmb(); | ||
287 | |||
288 | /* | ||
289 | * For correctness, we need a test which is guaranteed to trigger | ||
290 | * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr | ||
291 | * without locking achieves that and refilling as soon as possible | ||
292 | * is desirable. | ||
293 | * | ||
294 | * Because curr_nr visible here is always a value after the | ||
295 | * allocation of @element, any task which decremented curr_nr below | ||
296 | * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets | ||
297 | * incremented to min_nr afterwards. If curr_nr gets incremented | ||
298 | * to min_nr after the allocation of @element, the elements | ||
299 | * allocated after that are subject to the same guarantee. | ||
300 | * | ||
301 | * Waiters happen iff curr_nr is 0 and the above guarantee also | ||
302 | * ensures that there will be frees which return elements to the | ||
303 | * pool waking up the waiters. | ||
304 | */ | ||
269 | if (pool->curr_nr < pool->min_nr) { | 305 | if (pool->curr_nr < pool->min_nr) { |
270 | spin_lock_irqsave(&pool->lock, flags); | 306 | spin_lock_irqsave(&pool->lock, flags); |
271 | if (pool->curr_nr < pool->min_nr) { | 307 | if (pool->curr_nr < pool->min_nr) { |
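The new comments above describe a standard publish/consume barrier pairing: in mempool terms the "payload" is pool->curr_nr and the "flag" is the element pointer handed to another task. Below is a stand-alone C11 sketch of that generic pattern (not the mempool code itself), where the release fence plays the role of smp_wmb() in mempool_alloc() and the acquire fence that of smp_rmb() in mempool_free():

#include <stdatomic.h>

static int payload;                  /* role of pool->curr_nr             */
static atomic_int published;         /* role of the handed-over element   */

static void writer(void)             /* cf. the mempool_alloc() side      */
{
        payload = 42;                                       /* A: data    */
        atomic_thread_fence(memory_order_release);          /* ~smp_wmb() */
        atomic_store_explicit(&published, 1, memory_order_relaxed);
}

static int reader(void)              /* cf. the mempool_free() side       */
{
        while (!atomic_load_explicit(&published, memory_order_relaxed))
                ;                                           /* spin       */
        atomic_thread_fence(memory_order_acquire);          /* ~smp_rmb() */
        return payload;              /* guaranteed to see the store at A  */
}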
diff --git a/mm/migrate.c b/mm/migrate.c
index 177aca424a06..89ea0854332e 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,8 +39,6 @@ | |||
39 | 39 | ||
40 | #include "internal.h" | 40 | #include "internal.h" |
41 | 41 | ||
42 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | ||
43 | |||
44 | /* | 42 | /* |
45 | * migrate_prep() needs to be called before we start compiling a list of pages | 43 | * migrate_prep() needs to be called before we start compiling a list of pages |
46 | * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is | 44 | * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is |
@@ -181,8 +179,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
181 | * Something used the pte of a page under migration. We need to | 179 | * Something used the pte of a page under migration. We need to |
182 | * get to the page and wait until migration is finished. | 180 | * get to the page and wait until migration is finished. |
183 | * When we return from this function the fault will be retried. | 181 | * When we return from this function the fault will be retried. |
184 | * | ||
185 | * This function is called from do_swap_page(). | ||
186 | */ | 182 | */ |
187 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | 183 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, |
188 | unsigned long address) | 184 | unsigned long address) |
@@ -269,12 +265,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
269 | 265 | ||
270 | radix_tree_replace_slot(pslot, newpage); | 266 | radix_tree_replace_slot(pslot, newpage); |
271 | 267 | ||
272 | page_unfreeze_refs(page, expected_count); | ||
273 | /* | 268 | /* |
274 | * Drop cache reference from old page. | 269 | * Drop cache reference from old page by unfreezing |
270 | * to one less reference. | ||
275 | * We know this isn't the last reference. | 271 | * We know this isn't the last reference. |
276 | */ | 272 | */ |
277 | __put_page(page); | 273 | page_unfreeze_refs(page, expected_count - 1); |
278 | 274 | ||
279 | /* | 275 | /* |
280 | * If moved to a different zone then also account | 276 | * If moved to a different zone then also account |
@@ -334,9 +330,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
334 | 330 | ||
335 | radix_tree_replace_slot(pslot, newpage); | 331 | radix_tree_replace_slot(pslot, newpage); |
336 | 332 | ||
337 | page_unfreeze_refs(page, expected_count); | 333 | page_unfreeze_refs(page, expected_count - 1); |
338 | |||
339 | __put_page(page); | ||
340 | 334 | ||
341 | spin_unlock_irq(&mapping->tree_lock); | 335 | spin_unlock_irq(&mapping->tree_lock); |
342 | return 0; | 336 | return 0; |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1603,39 +1603,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
1603 | 1603 | ||
1604 | EXPORT_SYMBOL(find_vma); | 1604 | EXPORT_SYMBOL(find_vma); |
1605 | 1605 | ||
1606 | /* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ | 1606 | /* |
1607 | * Same as find_vma, but also return a pointer to the previous VMA in *pprev. | ||
1608 | * Note: pprev is set to NULL when return value is NULL. | ||
1609 | */ | ||
1607 | struct vm_area_struct * | 1610 | struct vm_area_struct * |
1608 | find_vma_prev(struct mm_struct *mm, unsigned long addr, | 1611 | find_vma_prev(struct mm_struct *mm, unsigned long addr, |
1609 | struct vm_area_struct **pprev) | 1612 | struct vm_area_struct **pprev) |
1610 | { | 1613 | { |
1611 | struct vm_area_struct *vma = NULL, *prev = NULL; | 1614 | struct vm_area_struct *vma; |
1612 | struct rb_node *rb_node; | ||
1613 | if (!mm) | ||
1614 | goto out; | ||
1615 | |||
1616 | /* Guard against addr being lower than the first VMA */ | ||
1617 | vma = mm->mmap; | ||
1618 | |||
1619 | /* Go through the RB tree quickly. */ | ||
1620 | rb_node = mm->mm_rb.rb_node; | ||
1621 | |||
1622 | while (rb_node) { | ||
1623 | struct vm_area_struct *vma_tmp; | ||
1624 | vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); | ||
1625 | |||
1626 | if (addr < vma_tmp->vm_end) { | ||
1627 | rb_node = rb_node->rb_left; | ||
1628 | } else { | ||
1629 | prev = vma_tmp; | ||
1630 | if (!prev->vm_next || (addr < prev->vm_next->vm_end)) | ||
1631 | break; | ||
1632 | rb_node = rb_node->rb_right; | ||
1633 | } | ||
1634 | } | ||
1635 | 1615 | ||
1636 | out: | 1616 | vma = find_vma(mm, addr); |
1637 | *pprev = prev; | 1617 | *pprev = vma ? vma->vm_prev : NULL; |
1638 | return prev ? prev->vm_next : vma; | 1618 | return vma; |
1639 | } | 1619 | } |
1640 | 1620 | ||
1641 | /* | 1621 | /* |
@@ -2322,13 +2302,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2322 | struct vm_area_struct *new_vma, *prev; | 2302 | struct vm_area_struct *new_vma, *prev; |
2323 | struct rb_node **rb_link, *rb_parent; | 2303 | struct rb_node **rb_link, *rb_parent; |
2324 | struct mempolicy *pol; | 2304 | struct mempolicy *pol; |
2305 | bool faulted_in_anon_vma = true; | ||
2325 | 2306 | ||
2326 | /* | 2307 | /* |
2327 | * If anonymous vma has not yet been faulted, update new pgoff | 2308 | * If anonymous vma has not yet been faulted, update new pgoff |
2328 | * to match new location, to increase its chance of merging. | 2309 | * to match new location, to increase its chance of merging. |
2329 | */ | 2310 | */ |
2330 | if (!vma->vm_file && !vma->anon_vma) | 2311 | if (unlikely(!vma->vm_file && !vma->anon_vma)) { |
2331 | pgoff = addr >> PAGE_SHIFT; | 2312 | pgoff = addr >> PAGE_SHIFT; |
2313 | faulted_in_anon_vma = false; | ||
2314 | } | ||
2332 | 2315 | ||
2333 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); | 2316 | find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); |
2334 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, | 2317 | new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, |
@@ -2337,9 +2320,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2337 | /* | 2320 | /* |
2338 | * Source vma may have been merged into new_vma | 2321 | * Source vma may have been merged into new_vma |
2339 | */ | 2322 | */ |
2340 | if (vma_start >= new_vma->vm_start && | 2323 | if (unlikely(vma_start >= new_vma->vm_start && |
2341 | vma_start < new_vma->vm_end) | 2324 | vma_start < new_vma->vm_end)) { |
2325 | /* | ||
2326 | * The only way we can get a vma_merge with | ||
2327 | * self during an mremap is if the vma hasn't | ||
2328 | * been faulted in yet and we were allowed to | ||
2329 | * reset the dst vma->vm_pgoff to the | ||
2330 | * destination address of the mremap to allow | ||
2331 | * the merge to happen. mremap must change the | ||
2332 | * vm_pgoff linearity between src and dst vmas | ||
2333 | * (in turn preventing a vma_merge) to be | ||
2334 | * safe. It is only safe to keep the vm_pgoff | ||
2335 | * linear if there are no pages mapped yet. | ||
2336 | */ | ||
2337 | VM_BUG_ON(faulted_in_anon_vma); | ||
2342 | *vmap = new_vma; | 2338 | *vmap = new_vma; |
2339 | } else | ||
2340 | anon_vma_moveto_tail(new_vma); | ||
2343 | } else { | 2341 | } else { |
2344 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 2342 | new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
2345 | if (new_vma) { | 2343 | if (new_vma) { |
diff --git a/mm/mremap.c b/mm/mremap.c
index d6959cb4df58..87bb8393e7d2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -221,6 +221,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); | 221 | moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); |
222 | if (moved_len < old_len) { | 222 | if (moved_len < old_len) { |
223 | /* | 223 | /* |
224 | * Before moving the page tables from the new vma to | ||
225 | * the old vma, we need to be sure the old vma is | ||
226 | * queued after new vma in the same_anon_vma list to | ||
227 | * prevent SMP races with rmap_walk (that could lead | ||
228 | * rmap_walk to miss some page table). | ||
229 | */ | ||
230 | anon_vma_moveto_tail(vma); | ||
231 | |||
232 | /* | ||
224 | * On error, move entries back from new area to old, | 233 | * On error, move entries back from new area to old, |
225 | * which will succeed since page tables still there, | 234 | * which will succeed since page tables still there, |
226 | * and then proceed to unmap new area instead of old. | 235 | * and then proceed to unmap new area instead of old. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eeb27e27dce3..7c122faa05c5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -33,6 +33,10 @@ | |||
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/ptrace.h> | 34 | #include <linux/ptrace.h> |
35 | #include <linux/freezer.h> | 35 | #include <linux/freezer.h> |
36 | #include <linux/ftrace.h> | ||
37 | |||
38 | #define CREATE_TRACE_POINTS | ||
39 | #include <trace/events/oom.h> | ||
36 | 40 | ||
37 | int sysctl_panic_on_oom; | 41 | int sysctl_panic_on_oom; |
38 | int sysctl_oom_kill_allocating_task; | 42 | int sysctl_oom_kill_allocating_task; |
@@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) | |||
55 | spin_lock_irq(&sighand->siglock); | 59 | spin_lock_irq(&sighand->siglock); |
56 | if (current->signal->oom_score_adj == old_val) | 60 | if (current->signal->oom_score_adj == old_val) |
57 | current->signal->oom_score_adj = new_val; | 61 | current->signal->oom_score_adj = new_val; |
62 | trace_oom_score_adj_update(current); | ||
58 | spin_unlock_irq(&sighand->siglock); | 63 | spin_unlock_irq(&sighand->siglock); |
59 | } | 64 | } |
60 | 65 | ||
@@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) | |||
74 | spin_lock_irq(&sighand->siglock); | 79 | spin_lock_irq(&sighand->siglock); |
75 | old_val = current->signal->oom_score_adj; | 80 | old_val = current->signal->oom_score_adj; |
76 | current->signal->oom_score_adj = new_val; | 81 | current->signal->oom_score_adj = new_val; |
82 | trace_oom_score_adj_update(current); | ||
77 | spin_unlock_irq(&sighand->siglock); | 83 | spin_unlock_irq(&sighand->siglock); |
78 | 84 | ||
79 | return old_val; | 85 | return old_val; |
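The oom_kill.c hunk pulls in trace/events/oom.h and fires trace_oom_score_adj_update() whenever oom_score_adj changes. As orientation only, a hedged sketch of what such a TRACE_EVENT definition typically looks like; the field list and format string here are assumptions, not a copy of the real header:

TRACE_EVENT(oom_score_adj_update,

        TP_PROTO(struct task_struct *task),

        TP_ARGS(task),

        TP_STRUCT__entry(
                __array(char, comm, TASK_COMM_LEN)
                __field(pid_t, pid)
                __field(int, oom_score_adj)
        ),

        TP_fast_assign(
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->pid = task->pid;
                __entry->oom_score_adj = task->signal->oom_score_adj;
        ),

        TP_printk("pid=%d comm=%s oom_score_adj=%d",
                __entry->pid, __entry->comm, __entry->oom_score_adj)
);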
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8616ef3025a4..5cdd4f2b0c9d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -130,6 +130,191 @@ unsigned long global_dirty_limit; | |||
130 | static struct prop_descriptor vm_completions; | 130 | static struct prop_descriptor vm_completions; |
131 | 131 | ||
132 | /* | 132 | /* |
133 | * Work out the current dirty-memory clamping and background writeout | ||
134 | * thresholds. | ||
135 | * | ||
136 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
137 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
138 | * pages. It is better to clamp down on writers than to start swapping, and | ||
139 | * performing lots of scanning. | ||
140 | * | ||
141 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
142 | * | ||
143 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
144 | * excessive. | ||
145 | * | ||
146 | * We make sure that the background writeout level is below the adjusted | ||
147 | * clamping level. | ||
148 | */ | ||
149 | |||
150 | /* | ||
151 | * In a memory zone, there is a certain amount of pages we consider | ||
152 | * available for the page cache, which is essentially the number of | ||
153 | * free and reclaimable pages, minus some zone reserves to protect | ||
154 | * lowmem and the ability to uphold the zone's watermarks without | ||
155 | * requiring writeback. | ||
156 | * | ||
157 | * This number of dirtyable pages is the base value of which the | ||
158 | * user-configurable dirty ratio is the effictive number of pages that | ||
159 | * are allowed to be actually dirtied. Per individual zone, or | ||
160 | * globally by using the sum of dirtyable pages over all zones. | ||
161 | * | ||
162 | * Because the user is allowed to specify the dirty limit globally as | ||
163 | * absolute number of bytes, calculating the per-zone dirty limit can | ||
164 | * require translating the configured limit into a percentage of | ||
165 | * global dirtyable memory first. | ||
166 | */ | ||
167 | |||
168 | static unsigned long highmem_dirtyable_memory(unsigned long total) | ||
169 | { | ||
170 | #ifdef CONFIG_HIGHMEM | ||
171 | int node; | ||
172 | unsigned long x = 0; | ||
173 | |||
174 | for_each_node_state(node, N_HIGH_MEMORY) { | ||
175 | struct zone *z = | ||
176 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | ||
177 | |||
178 | x += zone_page_state(z, NR_FREE_PAGES) + | ||
179 | zone_reclaimable_pages(z) - z->dirty_balance_reserve; | ||
180 | } | ||
181 | /* | ||
182 | * Make sure that the number of highmem pages is never larger | ||
183 | * than the number of the total dirtyable memory. This can only | ||
184 | * occur in very strange VM situations but we want to make sure | ||
185 | * that this does not occur. | ||
186 | */ | ||
187 | return min(x, total); | ||
188 | #else | ||
189 | return 0; | ||
190 | #endif | ||
191 | } | ||
192 | |||
193 | /** | ||
194 | * global_dirtyable_memory - number of globally dirtyable pages | ||
195 | * | ||
196 | * Returns the global number of pages potentially available for dirty | ||
197 | * page cache. This is the base value for the global dirty limits. | ||
198 | */ | ||
199 | unsigned long global_dirtyable_memory(void) | ||
200 | { | ||
201 | unsigned long x; | ||
202 | |||
203 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - | ||
204 | dirty_balance_reserve; | ||
205 | |||
206 | if (!vm_highmem_is_dirtyable) | ||
207 | x -= highmem_dirtyable_memory(x); | ||
208 | |||
209 | return x + 1; /* Ensure that we never return 0 */ | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | ||
214 | * | ||
215 | * Calculate the dirty thresholds based on sysctl parameters | ||
216 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | ||
217 | * - vm.dirty_ratio or vm.dirty_bytes | ||
218 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | ||
219 | * real-time tasks. | ||
220 | */ | ||
221 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | ||
222 | { | ||
223 | unsigned long background; | ||
224 | unsigned long dirty; | ||
225 | unsigned long uninitialized_var(available_memory); | ||
226 | struct task_struct *tsk; | ||
227 | |||
228 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
229 | available_memory = global_dirtyable_memory(); | ||
230 | |||
231 | if (vm_dirty_bytes) | ||
232 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | ||
233 | else | ||
234 | dirty = (vm_dirty_ratio * available_memory) / 100; | ||
235 | |||
236 | if (dirty_background_bytes) | ||
237 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); | ||
238 | else | ||
239 | background = (dirty_background_ratio * available_memory) / 100; | ||
240 | |||
241 | if (background >= dirty) | ||
242 | background = dirty / 2; | ||
243 | tsk = current; | ||
244 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | ||
245 | background += background / 4; | ||
246 | dirty += dirty / 4; | ||
247 | } | ||
248 | *pbackground = background; | ||
249 | *pdirty = dirty; | ||
250 | trace_global_dirty_state(background, dirty); | ||
251 | } | ||
252 | |||
253 | /** | ||
254 | * zone_dirtyable_memory - number of dirtyable pages in a zone | ||
255 | * @zone: the zone | ||
256 | * | ||
257 | * Returns the zone's number of pages potentially available for dirty | ||
258 | * page cache. This is the base value for the per-zone dirty limits. | ||
259 | */ | ||
260 | static unsigned long zone_dirtyable_memory(struct zone *zone) | ||
261 | { | ||
262 | /* | ||
263 | * The effective global number of dirtyable pages may exclude | ||
264 | * highmem as a big-picture measure to keep the ratio between | ||
265 | * dirty memory and lowmem reasonable. | ||
266 | * | ||
267 | * But this function is purely about the individual zone and a | ||
268 | * highmem zone can hold its share of dirty pages, so we don't | ||
269 | * care about vm_highmem_is_dirtyable here. | ||
270 | */ | ||
271 | return zone_page_state(zone, NR_FREE_PAGES) + | ||
272 | zone_reclaimable_pages(zone) - | ||
273 | zone->dirty_balance_reserve; | ||
274 | } | ||
275 | |||
276 | /** | ||
277 | * zone_dirty_limit - maximum number of dirty pages allowed in a zone | ||
278 | * @zone: the zone | ||
279 | * | ||
280 | * Returns the maximum number of dirty pages allowed in a zone, based | ||
281 | * on the zone's dirtyable memory. | ||
282 | */ | ||
283 | static unsigned long zone_dirty_limit(struct zone *zone) | ||
284 | { | ||
285 | unsigned long zone_memory = zone_dirtyable_memory(zone); | ||
286 | struct task_struct *tsk = current; | ||
287 | unsigned long dirty; | ||
288 | |||
289 | if (vm_dirty_bytes) | ||
290 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * | ||
291 | zone_memory / global_dirtyable_memory(); | ||
292 | else | ||
293 | dirty = vm_dirty_ratio * zone_memory / 100; | ||
294 | |||
295 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) | ||
296 | dirty += dirty / 4; | ||
297 | |||
298 | return dirty; | ||
299 | } | ||
300 | |||
301 | /** | ||
302 | * zone_dirty_ok - tells whether a zone is within its dirty limits | ||
303 | * @zone: the zone to check | ||
304 | * | ||
305 | * Returns %true when the dirty pages in @zone are within the zone's | ||
306 | * dirty limit, %false if the limit is exceeded. | ||
307 | */ | ||
308 | bool zone_dirty_ok(struct zone *zone) | ||
309 | { | ||
310 | unsigned long limit = zone_dirty_limit(zone); | ||
311 | |||
312 | return zone_page_state(zone, NR_FILE_DIRTY) + | ||
313 | zone_page_state(zone, NR_UNSTABLE_NFS) + | ||
314 | zone_page_state(zone, NR_WRITEBACK) <= limit; | ||
315 | } | ||
316 | |||
317 | /* | ||
133 | * couple the period to the dirty_ratio: | 318 | * couple the period to the dirty_ratio: |
134 | * | 319 | * |
135 | * period/2 ~ roundup_pow_of_two(dirty limit) | 320 | * period/2 ~ roundup_pow_of_two(dirty limit) |
@@ -141,7 +326,7 @@ static int calc_period_shift(void) | |||
141 | if (vm_dirty_bytes) | 326 | if (vm_dirty_bytes) |
142 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | 327 | dirty_total = vm_dirty_bytes / PAGE_SIZE; |
143 | else | 328 | else |
144 | dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / | 329 | dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / |
145 | 100; | 330 | 100; |
146 | return 2 + ilog2(dirty_total - 1); | 331 | return 2 + ilog2(dirty_total - 1); |
147 | } | 332 | } |
@@ -196,7 +381,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
196 | return ret; | 381 | return ret; |
197 | } | 382 | } |
198 | 383 | ||
199 | |||
200 | int dirty_bytes_handler(struct ctl_table *table, int write, | 384 | int dirty_bytes_handler(struct ctl_table *table, int write, |
201 | void __user *buffer, size_t *lenp, | 385 | void __user *buffer, size_t *lenp, |
202 | loff_t *ppos) | 386 | loff_t *ppos) |
@@ -291,67 +475,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | |||
291 | } | 475 | } |
292 | EXPORT_SYMBOL(bdi_set_max_ratio); | 476 | EXPORT_SYMBOL(bdi_set_max_ratio); |
293 | 477 | ||
294 | /* | ||
295 | * Work out the current dirty-memory clamping and background writeout | ||
296 | * thresholds. | ||
297 | * | ||
298 | * The main aim here is to lower them aggressively if there is a lot of mapped | ||
299 | * memory around. To avoid stressing page reclaim with lots of unreclaimable | ||
300 | * pages. It is better to clamp down on writers than to start swapping, and | ||
301 | * performing lots of scanning. | ||
302 | * | ||
303 | * We only allow 1/2 of the currently-unmapped memory to be dirtied. | ||
304 | * | ||
305 | * We don't permit the clamping level to fall below 5% - that is getting rather | ||
306 | * excessive. | ||
307 | * | ||
308 | * We make sure that the background writeout level is below the adjusted | ||
309 | * clamping level. | ||
310 | */ | ||
311 | |||
312 | static unsigned long highmem_dirtyable_memory(unsigned long total) | ||
313 | { | ||
314 | #ifdef CONFIG_HIGHMEM | ||
315 | int node; | ||
316 | unsigned long x = 0; | ||
317 | |||
318 | for_each_node_state(node, N_HIGH_MEMORY) { | ||
319 | struct zone *z = | ||
320 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | ||
321 | |||
322 | x += zone_page_state(z, NR_FREE_PAGES) + | ||
323 | zone_reclaimable_pages(z); | ||
324 | } | ||
325 | /* | ||
326 | * Make sure that the number of highmem pages is never larger | ||
327 | * than the number of the total dirtyable memory. This can only | ||
328 | * occur in very strange VM situations but we want to make sure | ||
329 | * that this does not occur. | ||
330 | */ | ||
331 | return min(x, total); | ||
332 | #else | ||
333 | return 0; | ||
334 | #endif | ||
335 | } | ||
336 | |||
337 | /** | ||
338 | * determine_dirtyable_memory - amount of memory that may be used | ||
339 | * | ||
340 | * Returns the numebr of pages that can currently be freed and used | ||
341 | * by the kernel for direct mappings. | ||
342 | */ | ||
343 | unsigned long determine_dirtyable_memory(void) | ||
344 | { | ||
345 | unsigned long x; | ||
346 | |||
347 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); | ||
348 | |||
349 | if (!vm_highmem_is_dirtyable) | ||
350 | x -= highmem_dirtyable_memory(x); | ||
351 | |||
352 | return x + 1; /* Ensure that we never return 0 */ | ||
353 | } | ||
354 | |||
355 | static unsigned long dirty_freerun_ceiling(unsigned long thresh, | 478 | static unsigned long dirty_freerun_ceiling(unsigned long thresh, |
356 | unsigned long bg_thresh) | 479 | unsigned long bg_thresh) |
357 | { | 480 | { |
@@ -363,47 +486,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh) | |||
363 | return max(thresh, global_dirty_limit); | 486 | return max(thresh, global_dirty_limit); |
364 | } | 487 | } |
365 | 488 | ||
366 | /* | ||
367 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | ||
368 | * | ||
369 | * Calculate the dirty thresholds based on sysctl parameters | ||
370 | * - vm.dirty_background_ratio or vm.dirty_background_bytes | ||
371 | * - vm.dirty_ratio or vm.dirty_bytes | ||
372 | * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and | ||
373 | * real-time tasks. | ||
374 | */ | ||
375 | void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | ||
376 | { | ||
377 | unsigned long background; | ||
378 | unsigned long dirty; | ||
379 | unsigned long uninitialized_var(available_memory); | ||
380 | struct task_struct *tsk; | ||
381 | |||
382 | if (!vm_dirty_bytes || !dirty_background_bytes) | ||
383 | available_memory = determine_dirtyable_memory(); | ||
384 | |||
385 | if (vm_dirty_bytes) | ||
386 | dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); | ||
387 | else | ||
388 | dirty = (vm_dirty_ratio * available_memory) / 100; | ||
389 | |||
390 | if (dirty_background_bytes) | ||
391 | background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); | ||
392 | else | ||
393 | background = (dirty_background_ratio * available_memory) / 100; | ||
394 | |||
395 | if (background >= dirty) | ||
396 | background = dirty / 2; | ||
397 | tsk = current; | ||
398 | if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { | ||
399 | background += background / 4; | ||
400 | dirty += dirty / 4; | ||
401 | } | ||
402 | *pbackground = background; | ||
403 | *pdirty = dirty; | ||
404 | trace_global_dirty_state(background, dirty); | ||
405 | } | ||
406 | |||
407 | /** | 489 | /** |
408 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 490 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
409 | * @bdi: the backing_dev_info to query | 491 | * @bdi: the backing_dev_info to query |
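As a quick sanity check on global_dirty_limits() above: with the ratio-based sysctls the thresholds are plain percentages of dirtyable memory. A minimal sketch, ignoring the vm.dirty_bytes variants and the PF_LESS_THROTTLE/rt_task boost; dirty_limits_sketch() is illustrative, not a kernel function:

/* Example: 1,000,000 dirtyable pages, vm.dirty_ratio=20 and
 * vm.dirty_background_ratio=10 give dirty=200,000, background=100,000. */
static void dirty_limits_sketch(unsigned long dirtyable_pages,
                                unsigned int dirty_ratio,
                                unsigned int background_ratio,
                                unsigned long *pbackground,
                                unsigned long *pdirty)
{
        unsigned long dirty = dirtyable_pages * dirty_ratio / 100;
        unsigned long background = dirtyable_pages * background_ratio / 100;

        if (background >= dirty)        /* keep background below dirty */
                background = dirty / 2;

        *pbackground = background;
        *pdirty = dirty;
}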
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7990ca154d1b..794e6715c226 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
58 | #include <linux/memcontrol.h> | 58 | #include <linux/memcontrol.h> |
59 | #include <linux/prefetch.h> | 59 | #include <linux/prefetch.h> |
60 | #include <linux/page-debug-flags.h> | ||
60 | 61 | ||
61 | #include <asm/tlbflush.h> | 62 | #include <asm/tlbflush.h> |
62 | #include <asm/div64.h> | 63 | #include <asm/div64.h> |
@@ -96,6 +97,14 @@ EXPORT_SYMBOL(node_states); | |||
96 | 97 | ||
97 | unsigned long totalram_pages __read_mostly; | 98 | unsigned long totalram_pages __read_mostly; |
98 | unsigned long totalreserve_pages __read_mostly; | 99 | unsigned long totalreserve_pages __read_mostly; |
100 | /* | ||
101 | * When calculating the number of globally allowed dirty pages, there | ||
102 | * is a certain number of per-zone reserves that should not be | ||
103 | * considered dirtyable memory. This is the sum of those reserves | ||
104 | * over all existing zones that contribute dirtyable memory. | ||
105 | */ | ||
106 | unsigned long dirty_balance_reserve __read_mostly; | ||
107 | |||
99 | int percpu_pagelist_fraction; | 108 | int percpu_pagelist_fraction; |
100 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 109 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
101 | 110 | ||
@@ -127,6 +136,13 @@ void pm_restrict_gfp_mask(void) | |||
127 | saved_gfp_mask = gfp_allowed_mask; | 136 | saved_gfp_mask = gfp_allowed_mask; |
128 | gfp_allowed_mask &= ~GFP_IOFS; | 137 | gfp_allowed_mask &= ~GFP_IOFS; |
129 | } | 138 | } |
139 | |||
140 | bool pm_suspended_storage(void) | ||
141 | { | ||
142 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) | ||
143 | return false; | ||
144 | return true; | ||
145 | } | ||
130 | #endif /* CONFIG_PM_SLEEP */ | 146 | #endif /* CONFIG_PM_SLEEP */ |
131 | 147 | ||
132 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 148 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
@@ -381,6 +397,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
381 | clear_highpage(page + i); | 397 | clear_highpage(page + i); |
382 | } | 398 | } |
383 | 399 | ||
400 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
401 | unsigned int _debug_guardpage_minorder; | ||
402 | |||
403 | static int __init debug_guardpage_minorder_setup(char *buf) | ||
404 | { | ||
405 | unsigned long res; | ||
406 | |||
407 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { | ||
408 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); | ||
409 | return 0; | ||
410 | } | ||
411 | _debug_guardpage_minorder = res; | ||
412 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); | ||
413 | return 0; | ||
414 | } | ||
415 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | ||
416 | |||
417 | static inline void set_page_guard_flag(struct page *page) | ||
418 | { | ||
419 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | ||
420 | } | ||
421 | |||
422 | static inline void clear_page_guard_flag(struct page *page) | ||
423 | { | ||
424 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | ||
425 | } | ||
426 | #else | ||
427 | static inline void set_page_guard_flag(struct page *page) { } | ||
428 | static inline void clear_page_guard_flag(struct page *page) { } | ||
429 | #endif | ||
430 | |||
384 | static inline void set_page_order(struct page *page, int order) | 431 | static inline void set_page_order(struct page *page, int order) |
385 | { | 432 | { |
386 | set_page_private(page, order); | 433 | set_page_private(page, order); |
@@ -438,6 +485,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
438 | if (page_zone_id(page) != page_zone_id(buddy)) | 485 | if (page_zone_id(page) != page_zone_id(buddy)) |
439 | return 0; | 486 | return 0; |
440 | 487 | ||
488 | if (page_is_guard(buddy) && page_order(buddy) == order) { | ||
489 | VM_BUG_ON(page_count(buddy) != 0); | ||
490 | return 1; | ||
491 | } | ||
492 | |||
441 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 493 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
442 | VM_BUG_ON(page_count(buddy) != 0); | 494 | VM_BUG_ON(page_count(buddy) != 0); |
443 | return 1; | 495 | return 1; |
@@ -494,11 +546,19 @@ static inline void __free_one_page(struct page *page, | |||
494 | buddy = page + (buddy_idx - page_idx); | 546 | buddy = page + (buddy_idx - page_idx); |
495 | if (!page_is_buddy(page, buddy, order)) | 547 | if (!page_is_buddy(page, buddy, order)) |
496 | break; | 548 | break; |
497 | 549 | /* | |
498 | /* Our buddy is free, merge with it and move up one order. */ | 550 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
499 | list_del(&buddy->lru); | 551 | * merge with it and move up one order. |
500 | zone->free_area[order].nr_free--; | 552 | */ |
501 | rmv_page_order(buddy); | 553 | if (page_is_guard(buddy)) { |
554 | clear_page_guard_flag(buddy); | ||
555 | set_page_private(page, 0); | ||
556 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | ||
557 | } else { | ||
558 | list_del(&buddy->lru); | ||
559 | zone->free_area[order].nr_free--; | ||
560 | rmv_page_order(buddy); | ||
561 | } | ||
502 | combined_idx = buddy_idx & page_idx; | 562 | combined_idx = buddy_idx & page_idx; |
503 | page = page + (combined_idx - page_idx); | 563 | page = page + (combined_idx - page_idx); |
504 | page_idx = combined_idx; | 564 | page_idx = combined_idx; |
@@ -632,7 +692,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
632 | int i; | 692 | int i; |
633 | int bad = 0; | 693 | int bad = 0; |
634 | 694 | ||
635 | trace_mm_page_free_direct(page, order); | 695 | trace_mm_page_free(page, order); |
636 | kmemcheck_free_shadow(page, order); | 696 | kmemcheck_free_shadow(page, order); |
637 | 697 | ||
638 | if (PageAnon(page)) | 698 | if (PageAnon(page)) |
@@ -670,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
670 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
671 | } | 731 | } |
672 | 732 | ||
673 | /* | ||
674 | * permit the bootmem allocator to evade page validation on high-order frees | ||
675 | */ | ||
676 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 733 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
677 | { | 734 | { |
678 | if (order == 0) { | 735 | unsigned int nr_pages = 1 << order; |
679 | __ClearPageReserved(page); | 736 | unsigned int loop; |
680 | set_page_count(page, 0); | ||
681 | set_page_refcounted(page); | ||
682 | __free_page(page); | ||
683 | } else { | ||
684 | int loop; | ||
685 | |||
686 | prefetchw(page); | ||
687 | for (loop = 0; loop < (1 << order); loop++) { | ||
688 | struct page *p = &page[loop]; | ||
689 | 737 | ||
690 | if (loop + 1 < (1 << order)) | 738 | prefetchw(page); |
691 | prefetchw(p + 1); | 739 | for (loop = 0; loop < nr_pages; loop++) { |
692 | __ClearPageReserved(p); | 740 | struct page *p = &page[loop]; |
693 | set_page_count(p, 0); | ||
694 | } | ||
695 | 741 | ||
696 | set_page_refcounted(page); | 742 | if (loop + 1 < nr_pages) |
697 | __free_pages(page, order); | 743 | prefetchw(p + 1); |
744 | __ClearPageReserved(p); | ||
745 | set_page_count(p, 0); | ||
698 | } | 746 | } |
747 | |||
748 | set_page_refcounted(page); | ||
749 | __free_pages(page, order); | ||
699 | } | 750 | } |
700 | 751 | ||
701 | 752 | ||
@@ -724,6 +775,23 @@ static inline void expand(struct zone *zone, struct page *page, | |||
724 | high--; | 775 | high--; |
725 | size >>= 1; | 776 | size >>= 1; |
726 | VM_BUG_ON(bad_range(zone, &page[size])); | 777 | VM_BUG_ON(bad_range(zone, &page[size])); |
778 | |||
779 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
780 | if (high < debug_guardpage_minorder()) { | ||
781 | /* | ||
782 | * Mark as guard pages (or page), that will allow to | ||
783 | * merge back to allocator when buddy will be freed. | ||
784 | * Corresponding page table entries will not be touched, | ||
785 | * pages will stay not present in virtual address space | ||
786 | */ | ||
787 | INIT_LIST_HEAD(&page[size].lru); | ||
788 | set_page_guard_flag(&page[size]); | ||
789 | set_page_private(&page[size], high); | ||
790 | /* Guard pages are not available for any usage */ | ||
791 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | ||
792 | continue; | ||
793 | } | ||
794 | #endif | ||
727 | list_add(&page[size].lru, &area->free_list[migratetype]); | 795 | list_add(&page[size].lru, &area->free_list[migratetype]); |
728 | area->nr_free++; | 796 | area->nr_free++; |
729 | set_page_order(&page[size], high); | 797 | set_page_order(&page[size], high); |
@@ -1189,6 +1257,19 @@ out: | |||
1189 | } | 1257 | } |
1190 | 1258 | ||
1191 | /* | 1259 | /* |
1260 | * Free a list of 0-order pages | ||
1261 | */ | ||
1262 | void free_hot_cold_page_list(struct list_head *list, int cold) | ||
1263 | { | ||
1264 | struct page *page, *next; | ||
1265 | |||
1266 | list_for_each_entry_safe(page, next, list, lru) { | ||
1267 | trace_mm_page_free_batched(page, cold); | ||
1268 | free_hot_cold_page(page, cold); | ||
1269 | } | ||
1270 | } | ||
1271 | |||
1272 | /* | ||
1192 | * split_page takes a non-compound higher-order page, and splits it into | 1273 | * split_page takes a non-compound higher-order page, and splits it into |
1193 | * n (1<<order) sub-pages: page[0..n] | 1274 | * n (1<<order) sub-pages: page[0..n] |
1194 | * Each sub-page must be freed individually. | 1275 | * Each sub-page must be freed individually. |
@@ -1435,7 +1516,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1435 | long min = mark; | 1516 | long min = mark; |
1436 | int o; | 1517 | int o; |
1437 | 1518 | ||
1438 | free_pages -= (1 << order) + 1; | 1519 | free_pages -= (1 << order) - 1; |
1439 | if (alloc_flags & ALLOC_HIGH) | 1520 | if (alloc_flags & ALLOC_HIGH) |
1440 | min -= min / 2; | 1521 | min -= min / 2; |
1441 | if (alloc_flags & ALLOC_HARDER) | 1522 | if (alloc_flags & ALLOC_HARDER) |
@@ -1645,6 +1726,35 @@ zonelist_scan: | |||
1645 | if ((alloc_flags & ALLOC_CPUSET) && | 1726 | if ((alloc_flags & ALLOC_CPUSET) && |
1646 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1727 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1647 | continue; | 1728 | continue; |
1729 | /* | ||
1730 | * When allocating a page cache page for writing, we | ||
1731 | * want to get it from a zone that is within its dirty | ||
1732 | * limit, such that no single zone holds more than its | ||
1733 | * proportional share of globally allowed dirty pages. | ||
1734 | * The dirty limits take into account the zone's | ||
1735 | * lowmem reserves and high watermark so that kswapd | ||
1736 | * should be able to balance it without having to | ||
1737 | * write pages from its LRU list. | ||
1738 | * | ||
1739 | * This may look like it could increase pressure on | ||
1740 | * lower zones by failing allocations in higher zones | ||
1741 | * before they are full. But the pages that do spill | ||
1742 | * over are limited as the lower zones are protected | ||
1743 | * by this very same mechanism. It should not become | ||
1744 | * a practical burden to them. | ||
1745 | * | ||
1746 | * XXX: For now, allow allocations to potentially | ||
1747 | * exceed the per-zone dirty limit in the slowpath | ||
1748 | * (ALLOC_WMARK_LOW unset) before going into reclaim, | ||
1749 | * which is important when on a NUMA setup the allowed | ||
1750 | * zones are together not big enough to reach the | ||
1751 | * global limit. The proper fix for these situations | ||
1752 | * will require awareness of zones in the | ||
1753 | * dirty-throttling and the flusher threads. | ||
1754 | */ | ||
1755 | if ((alloc_flags & ALLOC_WMARK_LOW) && | ||
1756 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | ||
1757 | goto this_zone_full; | ||
1648 | 1758 | ||
1649 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1759 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1650 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1760 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
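The new block lets the fast path skip zones that already hold their share of dirty page cache when the caller passed `__GFP_WRITE`. The `zone_dirty_ok()` test it relies on is roughly "this zone's dirtyable memory, scaled by the dirty ratio, compared against its current dirty and writeback counts". A hedged sketch of that comparison follows; the struct, its fields, and the function name are invented for the example, not the kernel's types.

```c
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-in for the per-zone counters the real check consults. */
struct toy_zone {
	unsigned long free_pages;
	unsigned long file_pages;		/* reclaimable page cache */
	unsigned long dirty_balance_reserve;	/* watermark + lowmem reserve */
	unsigned long nr_dirty;
	unsigned long nr_writeback;
};

static unsigned int vm_dirty_ratio = 20;	/* percent, like /proc/sys/vm/dirty_ratio */

/*
 * Roughly: a zone is "dirty ok" while its dirty+writeback pages stay
 * under its proportional share of the dirtyable memory it contributes.
 */
static bool toy_zone_dirty_ok(const struct toy_zone *z)
{
	unsigned long dirtyable = z->free_pages + z->file_pages;

	if (dirtyable > z->dirty_balance_reserve)
		dirtyable -= z->dirty_balance_reserve;
	else
		dirtyable = 0;

	return z->nr_dirty + z->nr_writeback <= dirtyable * vm_dirty_ratio / 100;
}

int main(void)
{
	struct toy_zone z = {
		.free_pages = 10000, .file_pages = 40000,
		.dirty_balance_reserve = 5000,
		.nr_dirty = 9500, .nr_writeback = 500,
	};

	/* 45000 dirtyable pages * 20% = 9000 allowed; 10000 dirty -> over the limit. */
	printf("zone_dirty_ok: %s\n", toy_zone_dirty_ok(&z) ? "yes" : "no");
	return 0;
}
```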
@@ -1734,7 +1844,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
1734 | { | 1844 | { |
1735 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 1845 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
1736 | 1846 | ||
1737 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) | 1847 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || |
1848 | debug_guardpage_minorder() > 0) | ||
1738 | return; | 1849 | return; |
1739 | 1850 | ||
1740 | /* | 1851 | /* |
@@ -1773,12 +1884,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | |||
1773 | 1884 | ||
1774 | static inline int | 1885 | static inline int |
1775 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | 1886 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1887 | unsigned long did_some_progress, | ||
1776 | unsigned long pages_reclaimed) | 1888 | unsigned long pages_reclaimed) |
1777 | { | 1889 | { |
1778 | /* Do not loop if specifically requested */ | 1890 | /* Do not loop if specifically requested */ |
1779 | if (gfp_mask & __GFP_NORETRY) | 1891 | if (gfp_mask & __GFP_NORETRY) |
1780 | return 0; | 1892 | return 0; |
1781 | 1893 | ||
1894 | /* Always retry if specifically requested */ | ||
1895 | if (gfp_mask & __GFP_NOFAIL) | ||
1896 | return 1; | ||
1897 | |||
1898 | /* | ||
1899 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim | ||
1900 | * from making forward progress without invoking OOM. Suspend also disables | ||
1901 | * storage devices so kswapd will not help. Bail if we are suspending. | ||
1902 | */ | ||
1903 | if (!did_some_progress && pm_suspended_storage()) | ||
1904 | return 0; | ||
1905 | |||
1782 | /* | 1906 | /* |
1783 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | 1907 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER |
1784 | * means __GFP_NOFAIL, but that may not be true in other | 1908 | * means __GFP_NOFAIL, but that may not be true in other |
@@ -1797,13 +1921,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
1797 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | 1921 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) |
1798 | return 1; | 1922 | return 1; |
1799 | 1923 | ||
1800 | /* | ||
1801 | * Don't let big-order allocations loop unless the caller | ||
1802 | * explicitly requests that. | ||
1803 | */ | ||
1804 | if (gfp_mask & __GFP_NOFAIL) | ||
1805 | return 1; | ||
1806 | |||
1807 | return 0; | 1924 | return 0; |
1808 | } | 1925 | } |
1809 | 1926 | ||
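Taken together, the two hunks above reorder the retry policy: `__GFP_NOFAIL` now short-circuits to "retry" up front instead of being buried after the costly-order check, and a new bail-out stops looping when reclaim made no progress while storage is suspended. A condensed sketch of the resulting decision ladder, with toy flag values standing in for the real gfp bits:

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy flag bits standing in for the real gfp_t flags. */
#define TOY_GFP_NORETRY	0x1
#define TOY_GFP_NOFAIL	0x2
#define TOY_GFP_REPEAT	0x4

#define TOY_COSTLY_ORDER 3

static bool storage_suspended;	/* stand-in for pm_suspended_storage() */

/* Mirrors the order of checks after this patch; returns "should retry?". */
static bool toy_should_alloc_retry(unsigned int gfp, unsigned int order,
				   unsigned long did_some_progress,
				   unsigned long pages_reclaimed)
{
	if (gfp & TOY_GFP_NORETRY)
		return false;		/* caller asked for a single attempt */

	if (gfp & TOY_GFP_NOFAIL)
		return true;		/* caller must not see failure */

	/* Suspend has frozen the disks; reclaim cannot make progress. */
	if (!did_some_progress && storage_suspended)
		return false;

	/* Cheap orders are retried indefinitely... */
	if (order <= TOY_COSTLY_ORDER)
		return true;

	/* ...costly orders only while reclaim keeps paying off. */
	if ((gfp & TOY_GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return true;

	return false;
}

int main(void)
{
	storage_suspended = true;
	printf("order-0, no progress while suspended: %s\n",
	       toy_should_alloc_retry(0, 0, 0, 0) ? "retry" : "give up");
	printf("order-0 with NOFAIL while suspended:  %s\n",
	       toy_should_alloc_retry(TOY_GFP_NOFAIL, 0, 0, 0) ? "retry" : "give up");
	return 0;
}
```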
@@ -2196,7 +2313,8 @@ rebalance: | |||
2196 | 2313 | ||
2197 | /* Check if we should retry the allocation */ | 2314 | /* Check if we should retry the allocation */ |
2198 | pages_reclaimed += did_some_progress; | 2315 | pages_reclaimed += did_some_progress; |
2199 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { | 2316 | if (should_alloc_retry(gfp_mask, order, did_some_progress, |
2317 | pages_reclaimed)) { | ||
2200 | /* Wait for some write requests to complete then retry */ | 2318 | /* Wait for some write requests to complete then retry */ |
2201 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2319 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2202 | goto rebalance; | 2320 | goto rebalance; |
@@ -2306,16 +2424,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) | |||
2306 | } | 2424 | } |
2307 | EXPORT_SYMBOL(get_zeroed_page); | 2425 | EXPORT_SYMBOL(get_zeroed_page); |
2308 | 2426 | ||
2309 | void __pagevec_free(struct pagevec *pvec) | ||
2310 | { | ||
2311 | int i = pagevec_count(pvec); | ||
2312 | |||
2313 | while (--i >= 0) { | ||
2314 | trace_mm_pagevec_free(pvec->pages[i], pvec->cold); | ||
2315 | free_hot_cold_page(pvec->pages[i], pvec->cold); | ||
2316 | } | ||
2317 | } | ||
2318 | |||
2319 | void __free_pages(struct page *page, unsigned int order) | 2427 | void __free_pages(struct page *page, unsigned int order) |
2320 | { | 2428 | { |
2321 | if (put_page_testzero(page)) { | 2429 | if (put_page_testzero(page)) { |
@@ -3385,25 +3493,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
3385 | if (page_to_nid(page) != zone_to_nid(zone)) | 3493 | if (page_to_nid(page) != zone_to_nid(zone)) |
3386 | continue; | 3494 | continue; |
3387 | 3495 | ||
3388 | /* Blocks with reserved pages will never free, skip them. */ | ||
3389 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3390 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3391 | continue; | ||
3392 | |||
3393 | block_migratetype = get_pageblock_migratetype(page); | 3496 | block_migratetype = get_pageblock_migratetype(page); |
3394 | 3497 | ||
3395 | /* If this block is reserved, account for it */ | 3498 | /* Only test what is necessary when the reserves are not met */ |
3396 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | 3499 | if (reserve > 0) { |
3397 | reserve--; | 3500 | /* |
3398 | continue; | 3501 | * Blocks with reserved pages will never free, skip |
3399 | } | 3502 | * them. |
3503 | */ | ||
3504 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | ||
3505 | if (pageblock_is_reserved(pfn, block_end_pfn)) | ||
3506 | continue; | ||
3400 | 3507 | ||
3401 | /* Suitable for reserving if this block is movable */ | 3508 | /* If this block is reserved, account for it */ |
3402 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | 3509 | if (block_migratetype == MIGRATE_RESERVE) { |
3403 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | 3510 | reserve--; |
3404 | move_freepages_block(zone, page, MIGRATE_RESERVE); | 3511 | continue; |
3405 | reserve--; | 3512 | } |
3406 | continue; | 3513 | |
3514 | /* Suitable for reserving if this block is movable */ | ||
3515 | if (block_migratetype == MIGRATE_MOVABLE) { | ||
3516 | set_pageblock_migratetype(page, | ||
3517 | MIGRATE_RESERVE); | ||
3518 | move_freepages_block(zone, page, | ||
3519 | MIGRATE_RESERVE); | ||
3520 | reserve--; | ||
3521 | continue; | ||
3522 | } | ||
3407 | } | 3523 | } |
3408 | 3524 | ||
3409 | /* | 3525 | /* |
@@ -4734,8 +4850,19 @@ static void calculate_totalreserve_pages(void) | |||
4734 | if (max > zone->present_pages) | 4850 | if (max > zone->present_pages) |
4735 | max = zone->present_pages; | 4851 | max = zone->present_pages; |
4736 | reserve_pages += max; | 4852 | reserve_pages += max; |
4853 | /* | ||
4854 | * Lowmem reserves are not available to | ||
4855 | * GFP_HIGHUSER page cache allocations and | ||
4856 | * kswapd tries to balance zones to their high | ||
4857 | * watermark. As a result, neither should be | ||
4858 | * regarded as dirtyable memory, to prevent a | ||
4859 | * situation where reclaim has to clean pages | ||
4860 | * in order to balance the zones. | ||
4861 | */ | ||
4862 | zone->dirty_balance_reserve = max; | ||
4737 | } | 4863 | } |
4738 | } | 4864 | } |
4865 | dirty_balance_reserve = reserve_pages; | ||
4739 | totalreserve_pages = reserve_pages; | 4866 | totalreserve_pages = reserve_pages; |
4740 | } | 4867 | } |
4741 | 4868 | ||
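The hunk above records, per zone, the reserve computed earlier in this function (essentially the zone's largest lowmem reserve plus its high watermark, capped at its present pages) as `dirty_balance_reserve`, and sums the per-zone values into a global total; dirty-limit calculations can then exclude memory that kswapd must keep free, or that higher zones may not use for page cache, from dirtyable memory. A toy version of the accumulation, with an invented struct in place of the kernel's zone:

```c
#include <stdio.h>

#define NR_TOY_ZONES 3

/* Illustrative per-zone numbers; not the kernel's struct zone. */
struct toy_zone {
	unsigned long present_pages;
	unsigned long max_lowmem_reserve;	/* largest lowmem_reserve[] entry */
	unsigned long high_wmark;
	unsigned long dirty_balance_reserve;	/* filled in below */
};

static unsigned long toy_dirty_balance_reserve;
static unsigned long toy_totalreserve_pages;

static void toy_calculate_totalreserve_pages(struct toy_zone *zones, int nr)
{
	unsigned long reserve_pages = 0;
	int i;

	for (i = 0; i < nr; i++) {
		struct toy_zone *z = &zones[i];
		/* Pages that must stay free (or unused) in this zone. */
		unsigned long max = z->max_lowmem_reserve + z->high_wmark;

		if (max > z->present_pages)
			max = z->present_pages;

		reserve_pages += max;
		/* Not available to page cache: exclude from dirtyable memory. */
		z->dirty_balance_reserve = max;
	}

	toy_dirty_balance_reserve = reserve_pages;
	toy_totalreserve_pages = reserve_pages;
}

int main(void)
{
	struct toy_zone zones[NR_TOY_ZONES] = {
		{ .present_pages = 4096,   .max_lowmem_reserve = 3000, .high_wmark = 128  },
		{ .present_pages = 225280, .max_lowmem_reserve = 1024, .high_wmark = 4096 },
		{ .present_pages = 32768,  .max_lowmem_reserve = 0,    .high_wmark = 512  },
	};

	toy_calculate_totalreserve_pages(zones, NR_TOY_ZONES);
	printf("totalreserve_pages: %lu, dirty_balance_reserve: %lu\n",
	       toy_totalreserve_pages, toy_dirty_balance_reserve);
	return 0;
}
```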
@@ -272,6 +272,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
272 | } | 272 | } |
273 | 273 | ||
274 | /* | 274 | /* |
275 | * Some rmap walks need to find all ptes/hugepmds without false | ||
276 | * negatives (like migrate and split_huge_page) while running | ||
277 | * concurrently with operations that copy or move pagetables (like | ||
278 | * mremap() and fork()). To be safe, they depend on the anon_vma | ||
279 | * "same_anon_vma" list being in a certain order: the dst_vma must | ||
280 | * be placed after the src_vma in the list. This is always guaranteed | ||
281 | * by fork(), but mremap() needs to call this function to enforce it when the | ||
282 | * dst_vma isn't newly allocated and chained with the anon_vma_clone() | ||
283 | * function but just an extension of a pre-existing vma through | ||
284 | * vma_merge. | ||
285 | * | ||
286 | * NOTE: the same_anon_vma list can still be changed by other | ||
287 | * processes while mremap runs because mremap doesn't hold the | ||
288 | * anon_vma mutex to prevent modifications to the list while it | ||
289 | * runs. All we need to enforce is that the relative order of this | ||
290 | process's vmas isn't changing (we don't care about other vmas' | ||
291 | * order). Each vma corresponds to an anon_vma_chain structure so | ||
292 | * there's no risk that other processes calling anon_vma_moveto_tail() | ||
293 | * and changing the same_anon_vma list under mremap() will screw with | ||
294 | the relative order of this process's vmas in the list, because | ||
295 | * they can't alter the order of any vma that belongs to this | ||
296 | * process. And there can't be another anon_vma_moveto_tail() running | ||
297 | * concurrently with mremap() coming from this process because we hold | ||
298 | * the mmap_sem for the whole mremap(). fork() ordering dependency | ||
299 | * also shouldn't be affected because fork() only cares that the | ||
300 | * parent vmas are placed in the list before the child vmas and | ||
301 | * anon_vma_moveto_tail() won't reorder vmas from either the fork() | ||
302 | * parent or child. | ||
303 | */ | ||
304 | void anon_vma_moveto_tail(struct vm_area_struct *dst) | ||
305 | { | ||
306 | struct anon_vma_chain *pavc; | ||
307 | struct anon_vma *root = NULL; | ||
308 | |||
309 | list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { | ||
310 | struct anon_vma *anon_vma = pavc->anon_vma; | ||
311 | VM_BUG_ON(pavc->vma != dst); | ||
312 | root = lock_anon_vma_root(root, anon_vma); | ||
313 | list_del(&pavc->same_anon_vma); | ||
314 | list_add_tail(&pavc->same_anon_vma, &anon_vma->head); | ||
315 | } | ||
316 | unlock_anon_vma_root(root); | ||
317 | } | ||
318 | |||
319 | /* | ||
275 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | 320 | * Attach vma to its own anon_vma, as well as to the anon_vmas that |
276 | * the corresponding VMA in the parent process is attached to. | 321 | * the corresponding VMA in the parent process is attached to. |
277 | * Returns 0 on success, non-zero on failure. | 322 | * Returns 0 on success, non-zero on failure. |
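The new mm/rmap.c helper above only re-links this process's anon_vma_chain entries to the tail of each same_anon_vma list, so that after an mremap()-driven vma_merge the destination vma is guaranteed to sit after the source vma, which is the order the rmap walkers rely on. A standalone sketch of the "move my entry to the tail" operation on a circular doubly linked list follows; the list type and names are hand-rolled for the example, not the kernel's list_head API.

```c
#include <stdio.h>

/* Minimal circular doubly linked list, standing in for struct list_head. */
struct node {
	struct node *prev, *next;
	const char *name;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

static void list_del(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

/* Re-queue one entry at the tail, as the helper does for each chain. */
static void move_to_tail(struct node *n, struct node *head)
{
	list_del(n);
	list_add_tail(n, head);
}

int main(void)
{
	struct node head, src = { .name = "src_vma" }, dst = { .name = "dst_vma" };
	struct node *p;

	list_init(&head);
	/* Wrong order for an mremap walker: dst before src. */
	list_add_tail(&dst, &head);
	list_add_tail(&src, &head);

	move_to_tail(&dst, &head);	/* enforce "dst after src" */

	for (p = head.next; p != &head; p = p->next)
		printf("%s\n", p->name);
	return 0;
}
```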
@@ -3654,6 +3654,9 @@ void __init kmem_cache_init(void) | |||
3654 | struct kmem_cache *temp_kmem_cache_node; | 3654 | struct kmem_cache *temp_kmem_cache_node; |
3655 | unsigned long kmalloc_size; | 3655 | unsigned long kmalloc_size; |
3656 | 3656 | ||
3657 | if (debug_guardpage_minorder()) | ||
3658 | slub_max_order = 0; | ||
3659 | |||
3657 | kmem_size = offsetof(struct kmem_cache, node) + | 3660 | kmem_size = offsetof(struct kmem_cache, node) + |
3658 | nr_node_ids * sizeof(struct kmem_cache_node *); | 3661 | nr_node_ids * sizeof(struct kmem_cache_node *); |
3659 | 3662 | ||
@@ -585,11 +585,10 @@ int lru_add_drain_all(void) | |||
585 | void release_pages(struct page **pages, int nr, int cold) | 585 | void release_pages(struct page **pages, int nr, int cold) |
586 | { | 586 | { |
587 | int i; | 587 | int i; |
588 | struct pagevec pages_to_free; | 588 | LIST_HEAD(pages_to_free); |
589 | struct zone *zone = NULL; | 589 | struct zone *zone = NULL; |
590 | unsigned long uninitialized_var(flags); | 590 | unsigned long uninitialized_var(flags); |
591 | 591 | ||
592 | pagevec_init(&pages_to_free, cold); | ||
593 | for (i = 0; i < nr; i++) { | 592 | for (i = 0; i < nr; i++) { |
594 | struct page *page = pages[i]; | 593 | struct page *page = pages[i]; |
595 | 594 | ||
@@ -620,19 +619,12 @@ void release_pages(struct page **pages, int nr, int cold) | |||
620 | del_page_from_lru(zone, page); | 619 | del_page_from_lru(zone, page); |
621 | } | 620 | } |
622 | 621 | ||
623 | if (!pagevec_add(&pages_to_free, page)) { | 622 | list_add(&page->lru, &pages_to_free); |
624 | if (zone) { | ||
625 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
626 | zone = NULL; | ||
627 | } | ||
628 | __pagevec_free(&pages_to_free); | ||
629 | pagevec_reinit(&pages_to_free); | ||
630 | } | ||
631 | } | 623 | } |
632 | if (zone) | 624 | if (zone) |
633 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 625 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
634 | 626 | ||
635 | pagevec_free(&pages_to_free); | 627 | free_hot_cold_page_list(&pages_to_free, cold); |
636 | } | 628 | } |
637 | EXPORT_SYMBOL(release_pages); | 629 | EXPORT_SYMBOL(release_pages); |
638 | 630 | ||
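With `__pagevec_free()` gone, release_pages() in mm/swap.c now strings the to-be-freed pages onto a local list while it holds the LRU lock and hands the whole batch to the new free_hot_cold_page_list() after dropping the lock, instead of flushing a fixed-size pagevec batch by batch. The same accumulate-then-drain pattern in a self-contained sketch (a singly linked list and toy item type are used here for brevity; the kernel chains through page->lru):

```c
#include <stdio.h>
#include <stdlib.h>

/* Toy object whose final release we want to batch. */
struct item {
	int id;
	struct item *next;	/* local free-list linkage */
};

/* Stand-in for free_hot_cold_page_list(): drain the batch in one pass. */
static void free_item_list(struct item *head)
{
	while (head) {
		struct item *next = head->next;

		printf("freeing item %d\n", head->id);
		free(head);
		head = next;
	}
}

/* Stand-in for release_pages(): decide under "the lock", free after it. */
static void release_items(struct item **items, int nr)
{
	struct item *to_free = NULL;
	int i;

	/* ...a lock would be taken here while refcounts/LRU state change... */
	for (i = 0; i < nr; i++) {
		items[i]->next = to_free;	/* batch instead of freeing now */
		to_free = items[i];
	}
	/* ...and dropped here, before the potentially long free loop. */

	free_item_list(to_free);
}

int main(void)
{
	struct item *items[3];
	int i;

	for (i = 0; i < 3; i++) {
		items[i] = malloc(sizeof(*items[i]));
		items[i]->id = i;
		items[i]->next = NULL;
	}
	release_items(items, 3);
	return 0;
}
```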
diff --git a/mm/swapfile.c b/mm/swapfile.c index b1cd12060723..9520592d4231 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -667,10 +667,10 @@ int try_to_free_swap(struct page *page) | |||
667 | * original page might be freed under memory pressure, then | 667 | * original page might be freed under memory pressure, then |
668 | * later read back in from swap, now with the wrong data. | 668 | * later read back in from swap, now with the wrong data. |
669 | * | 669 | * |
670 | * Hibernation clears bits from gfp_allowed_mask to prevent | 670 | * Hibernation suspends storage while it is writing the image |
671 | * memory reclaim from writing to disk, so check that here. | 671 | * to disk, so check that here. |
672 | */ | 672 | */ |
673 | if (!(gfp_allowed_mask & __GFP_IO)) | 673 | if (pm_suspended_storage()) |
674 | return 0; | 674 | return 0; |
675 | 675 | ||
676 | delete_from_swap_cache(page); | 676 | delete_from_swap_cache(page); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 21fdf46ad5aa..877ca046f43d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -256,7 +256,7 @@ struct vmap_area { | |||
256 | struct rb_node rb_node; /* address sorted rbtree */ | 256 | struct rb_node rb_node; /* address sorted rbtree */ |
257 | struct list_head list; /* address sorted list */ | 257 | struct list_head list; /* address sorted list */ |
258 | struct list_head purge_list; /* "lazy purge" list */ | 258 | struct list_head purge_list; /* "lazy purge" list */ |
259 | void *private; | 259 | struct vm_struct *vm; |
260 | struct rcu_head rcu_head; | 260 | struct rcu_head rcu_head; |
261 | }; | 261 | }; |
262 | 262 | ||
@@ -1285,7 +1285,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1285 | vm->addr = (void *)va->va_start; | 1285 | vm->addr = (void *)va->va_start; |
1286 | vm->size = va->va_end - va->va_start; | 1286 | vm->size = va->va_end - va->va_start; |
1287 | vm->caller = caller; | 1287 | vm->caller = caller; |
1288 | va->private = vm; | 1288 | va->vm = vm; |
1289 | va->flags |= VM_VM_AREA; | 1289 | va->flags |= VM_VM_AREA; |
1290 | } | 1290 | } |
1291 | 1291 | ||
@@ -1408,7 +1408,7 @@ static struct vm_struct *find_vm_area(const void *addr) | |||
1408 | 1408 | ||
1409 | va = find_vmap_area((unsigned long)addr); | 1409 | va = find_vmap_area((unsigned long)addr); |
1410 | if (va && va->flags & VM_VM_AREA) | 1410 | if (va && va->flags & VM_VM_AREA) |
1411 | return va->private; | 1411 | return va->vm; |
1412 | 1412 | ||
1413 | return NULL; | 1413 | return NULL; |
1414 | } | 1414 | } |
@@ -1427,7 +1427,7 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
1427 | 1427 | ||
1428 | va = find_vmap_area((unsigned long)addr); | 1428 | va = find_vmap_area((unsigned long)addr); |
1429 | if (va && va->flags & VM_VM_AREA) { | 1429 | if (va && va->flags & VM_VM_AREA) { |
1430 | struct vm_struct *vm = va->private; | 1430 | struct vm_struct *vm = va->vm; |
1431 | 1431 | ||
1432 | if (!(vm->flags & VM_UNLIST)) { | 1432 | if (!(vm->flags & VM_UNLIST)) { |
1433 | struct vm_struct *tmp, **p; | 1433 | struct vm_struct *tmp, **p; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 11adc890ce30..26f4a8a4e0c7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -715,7 +715,13 @@ static enum page_references page_check_references(struct page *page, | |||
715 | */ | 715 | */ |
716 | SetPageReferenced(page); | 716 | SetPageReferenced(page); |
717 | 717 | ||
718 | if (referenced_page) | 718 | if (referenced_page || referenced_ptes > 1) |
719 | return PAGEREF_ACTIVATE; | ||
720 | |||
721 | /* | ||
722 | * Activate file-backed executable pages after first usage. | ||
723 | */ | ||
724 | if (vm_flags & VM_EXEC) | ||
719 | return PAGEREF_ACTIVATE; | 725 | return PAGEREF_ACTIVATE; |
720 | 726 | ||
721 | return PAGEREF_KEEP; | 727 | return PAGEREF_KEEP; |
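The mm/vmscan.c change above broadens activation: a referenced page is promoted not only when its referenced flag was set but also when more than one pte referenced it, and executable file pages are activated after their first use. A compact sketch of the resulting decision (the enum, parameter names, and the simplified fallthrough are invented for the example; the real function further splits the reclaim case):

```c
#include <stdio.h>
#include <stdbool.h>

enum toy_ref { TOY_ACTIVATE, TOY_KEEP, TOY_RECLAIM };

/*
 * Simplified version of the "was it referenced?" branch after this patch.
 * referenced_ptes: how many ptes had their accessed bit set.
 * referenced_page: the page's referenced flag was set.
 * is_exec:         the mapping is executable (VM_EXEC).
 */
static enum toy_ref toy_page_check_references(int referenced_ptes,
					      bool referenced_page,
					      bool is_exec)
{
	if (referenced_ptes) {
		if (referenced_page || referenced_ptes > 1)
			return TOY_ACTIVATE;

		/* Activate file-backed executable pages after first usage. */
		if (is_exec)
			return TOY_ACTIVATE;

		return TOY_KEEP;	/* give it one more trip around the LRU */
	}

	/* The real code distinguishes clean vs. dirty reclaim here. */
	return TOY_RECLAIM;
}

int main(void)
{
	printf("2 ptes, cold page:      %d\n", toy_page_check_references(2, false, false));
	printf("1 pte, executable page: %d\n", toy_page_check_references(1, false, true));
	printf("unreferenced page:      %d\n", toy_page_check_references(0, false, false));
	return 0;
}
```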
@@ -728,24 +734,6 @@ static enum page_references page_check_references(struct page *page, | |||
728 | return PAGEREF_RECLAIM; | 734 | return PAGEREF_RECLAIM; |
729 | } | 735 | } |
730 | 736 | ||
731 | static noinline_for_stack void free_page_list(struct list_head *free_pages) | ||
732 | { | ||
733 | struct pagevec freed_pvec; | ||
734 | struct page *page, *tmp; | ||
735 | |||
736 | pagevec_init(&freed_pvec, 1); | ||
737 | |||
738 | list_for_each_entry_safe(page, tmp, free_pages, lru) { | ||
739 | list_del(&page->lru); | ||
740 | if (!pagevec_add(&freed_pvec, page)) { | ||
741 | __pagevec_free(&freed_pvec); | ||
742 | pagevec_reinit(&freed_pvec); | ||
743 | } | ||
744 | } | ||
745 | |||
746 | pagevec_free(&freed_pvec); | ||
747 | } | ||
748 | |||
749 | /* | 737 | /* |
750 | * shrink_page_list() returns the number of reclaimed pages | 738 | * shrink_page_list() returns the number of reclaimed pages |
751 | */ | 739 | */ |
@@ -1009,7 +997,7 @@ keep_lumpy: | |||
1009 | if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) | 997 | if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) |
1010 | zone_set_flag(zone, ZONE_CONGESTED); | 998 | zone_set_flag(zone, ZONE_CONGESTED); |
1011 | 999 | ||
1012 | free_page_list(&free_pages); | 1000 | free_hot_cold_page_list(&free_pages, 1); |
1013 | 1001 | ||
1014 | list_splice(&ret_pages, page_list); | 1002 | list_splice(&ret_pages, page_list); |
1015 | count_vm_events(PGACTIVATE, pgactivate); | 1003 | count_vm_events(PGACTIVATE, pgactivate); |
@@ -1178,14 +1166,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1178 | * anon page which don't already have a swap slot is | 1166 | * anon page which don't already have a swap slot is |
1179 | * pointless. | 1167 | * pointless. |
1180 | */ | 1168 | */ |
1181 | if (nr_swap_pages <= 0 && PageAnon(cursor_page) && | 1169 | if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && |
1182 | !PageSwapCache(cursor_page)) | 1170 | !PageSwapCache(cursor_page)) |
1183 | break; | 1171 | break; |
1184 | 1172 | ||
1185 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | 1173 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { |
1186 | list_move(&cursor_page->lru, dst); | 1174 | list_move(&cursor_page->lru, dst); |
1187 | mem_cgroup_del_lru(cursor_page); | 1175 | mem_cgroup_del_lru(cursor_page); |
1188 | nr_taken += hpage_nr_pages(page); | 1176 | nr_taken += hpage_nr_pages(cursor_page); |
1189 | nr_lumpy_taken++; | 1177 | nr_lumpy_taken++; |
1190 | if (PageDirty(cursor_page)) | 1178 | if (PageDirty(cursor_page)) |
1191 | nr_lumpy_dirty++; | 1179 | nr_lumpy_dirty++; |
@@ -2012,8 +2000,9 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2012 | * inactive lists are large enough, continue reclaiming | 2000 | * inactive lists are large enough, continue reclaiming |
2013 | */ | 2001 | */ |
2014 | pages_for_compaction = (2UL << sc->order); | 2002 | pages_for_compaction = (2UL << sc->order); |
2015 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + | 2003 | inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
2016 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | 2004 | if (nr_swap_pages > 0) |
2005 | inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
2017 | if (sc->nr_reclaimed < pages_for_compaction && | 2006 | if (sc->nr_reclaimed < pages_for_compaction && |
2018 | inactive_lru_pages > pages_for_compaction) | 2007 | inactive_lru_pages > pages_for_compaction) |
2019 | return true; | 2008 | return true; |
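The should_continue_reclaim() tweak above stops counting inactive anonymous pages as remaining reclaim candidates when there is no swap space to move them to. A tiny sketch of the adjusted counting (function and parameter names invented):

```c
#include <stdio.h>

/* Pages reclaim could still usefully scan before falling back to compaction. */
static unsigned long toy_reclaimable_inactive(unsigned long inactive_file,
					      unsigned long inactive_anon,
					      long nr_swap_pages)
{
	unsigned long pages = inactive_file;

	/* Anonymous pages only count if there is swap to put them in. */
	if (nr_swap_pages > 0)
		pages += inactive_anon;
	return pages;
}

int main(void)
{
	printf("with swap:    %lu\n", toy_reclaimable_inactive(1000, 4000, 8192));
	printf("without swap: %lu\n", toy_reclaimable_inactive(1000, 4000, 0));
	return 0;
}
```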
@@ -3448,9 +3437,10 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) | |||
3448 | static void warn_scan_unevictable_pages(void) | 3437 | static void warn_scan_unevictable_pages(void) |
3449 | { | 3438 | { |
3450 | printk_once(KERN_WARNING | 3439 | printk_once(KERN_WARNING |
3451 | "The scan_unevictable_pages sysctl/node-interface has been " | 3440 | "%s: The scan_unevictable_pages sysctl/node-interface has been " |
3452 | "disabled for lack of a legitimate use case. If you have " | 3441 | "disabled for lack of a legitimate use case. If you have " |
3453 | "one, please send an email to linux-mm@kvack.org.\n"); | 3442 | "one, please send an email to linux-mm@kvack.org.\n", |
3443 | current->comm); | ||
3454 | } | 3444 | } |
3455 | 3445 | ||
3456 | /* | 3446 | /* |