Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             |   2
-rw-r--r--  mm/backing-dev.c       |   3
-rw-r--r--  mm/compaction.c        |  76
-rw-r--r--  mm/dmapool.c           |   2
-rw-r--r--  mm/filemap.c           |  19
-rw-r--r--  mm/huge_memory.c       |  11
-rw-r--r--  mm/hugetlb.c           |  49
-rw-r--r--  mm/ksm.c               |   6
-rw-r--r--  mm/madvise.c           |   2
-rw-r--r--  mm/memblock.c          |   8
-rw-r--r--  mm/memcontrol.c        | 222
-rw-r--r--  mm/memory-failure.c    |  25
-rw-r--r--  mm/memory.c            | 158
-rw-r--r--  mm/memory_hotplug.c    |  78
-rw-r--r--  mm/migrate.c           |   2
-rw-r--r--  mm/mmap.c              |  46
-rw-r--r--  mm/nommu.c             |  46
-rw-r--r--  mm/oom_kill.c          |   5
-rw-r--r--  mm/page-writeback.c    |  11
-rw-r--r--  mm/page_alloc.c        | 112
-rw-r--r--  mm/page_cgroup.c       |  81
-rw-r--r--  mm/pagewalk.c          |  49
-rw-r--r--  mm/rmap.c              | 118
-rw-r--r--  mm/shmem.c             | 626
-rw-r--r--  mm/slab.c              |  26
-rw-r--r--  mm/slob.c              |   6
-rw-r--r--  mm/slub.c              | 119
-rw-r--r--  mm/sparse.c            |   2
-rw-r--r--  mm/swapfile.c          |  31
-rw-r--r--  mm/thrash.c            | 120
-rw-r--r--  mm/truncate.c          | 163
-rw-r--r--  mm/vmalloc.c           |  18
-rw-r--r--  mm/vmscan.c            | 180
33 files changed, 1570 insertions, 852 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c8..f2f1ca19ed53 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -356,7 +356,7 @@ config CLEANCACHE
356 for clean pages that the kernel's pageframe replacement algorithm 356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough 357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use 358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancacne code to put the data contained in that page into 359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or 360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly 361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled 362 time-varying size. And when a cleancache-enabled
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ddd0345e2e6d..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -513,7 +513,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
513 list_del_rcu(&bdi->bdi_list); 513 list_del_rcu(&bdi->bdi_list);
514 spin_unlock_bh(&bdi_lock); 514 spin_unlock_bh(&bdi_lock);
515 515
516 synchronize_rcu(); 516 synchronize_rcu_expedited();
517} 517}
518 518
519int bdi_register(struct backing_dev_info *bdi, struct device *parent, 519int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -614,6 +614,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
614void bdi_unregister(struct backing_dev_info *bdi) 614void bdi_unregister(struct backing_dev_info *bdi)
615{ 615{
616 if (bdi->dev) { 616 if (bdi->dev) {
617 bdi_set_min_ratio(bdi, 0);
617 trace_writeback_bdi_unregister(bdi); 618 trace_writeback_bdi_unregister(bdi);
618 bdi_prune_sb(bdi); 619 bdi_prune_sb(bdi);
619 del_timer_sync(&bdi->wb.wakeup_timer); 620 del_timer_sync(&bdi->wb.wakeup_timer);
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9e..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone,
144 int nr_freepages = cc->nr_freepages; 144 int nr_freepages = cc->nr_freepages;
145 struct list_head *freelist = &cc->freepages; 145 struct list_head *freelist = &cc->freepages;
146 146
147 /*
148 * Initialise the free scanner. The starting point is where we last
149 * scanned from (or the end of the zone if starting). The low point
150 * is the end of the pageblock the migration scanner is using.
151 */
147 pfn = cc->free_pfn; 152 pfn = cc->free_pfn;
148 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 153 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
149 high_pfn = low_pfn; 154
155 /*
156 * Take care that if the migration scanner is at the end of the zone
157 * that the free scanner does not accidentally move to the next zone
158 * in the next isolation cycle.
159 */
160 high_pfn = min(low_pfn, pfn);
150 161
151 /* 162 /*
152 * Isolate free pages until enough are available to migrate the 163 * Isolate free pages until enough are available to migrate the
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
240 return isolated > (inactive + active) / 2; 251 return isolated > (inactive + active) / 2;
241} 252}
242 253
254/* possible outcome of isolate_migratepages */
255typedef enum {
256 ISOLATE_ABORT, /* Abort compaction now */
257 ISOLATE_NONE, /* No pages isolated, continue scanning */
258 ISOLATE_SUCCESS, /* Pages isolated, migrate */
259} isolate_migrate_t;
260
243/* 261/*
244 * Isolate all pages that can be migrated from the block pointed to by 262 * Isolate all pages that can be migrated from the block pointed to by
245 * the migrate scanner within compact_control. 263 * the migrate scanner within compact_control.
246 */ 264 */
247static unsigned long isolate_migratepages(struct zone *zone, 265static isolate_migrate_t isolate_migratepages(struct zone *zone,
248 struct compact_control *cc) 266 struct compact_control *cc)
249{ 267{
250 unsigned long low_pfn, end_pfn; 268 unsigned long low_pfn, end_pfn;
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
261 /* Do not cross the free scanner or scan within a memory hole */ 279 /* Do not cross the free scanner or scan within a memory hole */
262 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 280 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
263 cc->migrate_pfn = end_pfn; 281 cc->migrate_pfn = end_pfn;
264 return 0; 282 return ISOLATE_NONE;
265 } 283 }
266 284
267 /* 285 /*
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone,
270 * delay for some time until fewer pages are isolated 288 * delay for some time until fewer pages are isolated
271 */ 289 */
272 while (unlikely(too_many_isolated(zone))) { 290 while (unlikely(too_many_isolated(zone))) {
291 /* async migration should just abort */
292 if (!cc->sync)
293 return ISOLATE_ABORT;
294
273 congestion_wait(BLK_RW_ASYNC, HZ/10); 295 congestion_wait(BLK_RW_ASYNC, HZ/10);
274 296
275 if (fatal_signal_pending(current)) 297 if (fatal_signal_pending(current))
276 return 0; 298 return ISOLATE_ABORT;
277 } 299 }
278 300
279 /* Time to isolate some pages for migration */ 301 /* Time to isolate some pages for migration */
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
358 380
359 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 381 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
360 382
361 return cc->nr_migratepages; 383 return ISOLATE_SUCCESS;
362} 384}
363 385
364/* 386/*
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone,
420 if (cc->free_pfn <= cc->migrate_pfn) 442 if (cc->free_pfn <= cc->migrate_pfn)
421 return COMPACT_COMPLETE; 443 return COMPACT_COMPLETE;
422 444
423 /* Compaction run is not finished if the watermark is not met */
424 watermark = low_wmark_pages(zone);
425 watermark += (1 << cc->order);
426
427 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
428 return COMPACT_CONTINUE;
429
430 /* 445 /*
431 * order == -1 is expected when compacting via 446 * order == -1 is expected when compacting via
432 * /proc/sys/vm/compact_memory 447 * /proc/sys/vm/compact_memory
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone,
434 if (cc->order == -1) 449 if (cc->order == -1)
435 return COMPACT_CONTINUE; 450 return COMPACT_CONTINUE;
436 451
452 /* Compaction run is not finished if the watermark is not met */
453 watermark = low_wmark_pages(zone);
454 watermark += (1 << cc->order);
455
456 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
457 return COMPACT_CONTINUE;
458
437 /* Direct compactor: Is a suitable page free? */ 459 /* Direct compactor: Is a suitable page free? */
438 for (order = cc->order; order < MAX_ORDER; order++) { 460 for (order = cc->order; order < MAX_ORDER; order++) {
439 /* Job done if page is free of the right migratetype */ 461 /* Job done if page is free of the right migratetype */
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
461 unsigned long watermark; 483 unsigned long watermark;
462 484
463 /* 485 /*
486 * order == -1 is expected when compacting via
487 * /proc/sys/vm/compact_memory
488 */
489 if (order == -1)
490 return COMPACT_CONTINUE;
491
492 /*
464 * Watermarks for order-0 must be met for compaction. Note the 2UL. 493 * Watermarks for order-0 must be met for compaction. Note the 2UL.
465 * This is because during migration, copies of pages need to be 494 * This is because during migration, copies of pages need to be
466 * allocated and for a short time, the footprint is higher 495 * allocated and for a short time, the footprint is higher
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
470 return COMPACT_SKIPPED; 499 return COMPACT_SKIPPED;
471 500
472 /* 501 /*
473 * order == -1 is expected when compacting via
474 * /proc/sys/vm/compact_memory
475 */
476 if (order == -1)
477 return COMPACT_CONTINUE;
478
479 /*
480 * fragmentation index determines if allocation failures are due to 502 * fragmentation index determines if allocation failures are due to
481 * low memory or external fragmentation 503 * low memory or external fragmentation
482 * 504 *
483 * index of -1 implies allocations might succeed dependingon watermarks 505 * index of -1000 implies allocations might succeed depending on
506 * watermarks
484 * index towards 0 implies failure is due to lack of memory 507 * index towards 0 implies failure is due to lack of memory
485 * index towards 1000 implies failure is due to fragmentation 508 * index towards 1000 implies failure is due to fragmentation
486 * 509 *
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
490 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 513 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
491 return COMPACT_SKIPPED; 514 return COMPACT_SKIPPED;
492 515
493 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) 516 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
517 0, 0))
494 return COMPACT_PARTIAL; 518 return COMPACT_PARTIAL;
495 519
496 return COMPACT_CONTINUE; 520 return COMPACT_CONTINUE;
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
522 unsigned long nr_migrate, nr_remaining; 546 unsigned long nr_migrate, nr_remaining;
523 int err; 547 int err;
524 548
525 if (!isolate_migratepages(zone, cc)) 549 switch (isolate_migratepages(zone, cc)) {
550 case ISOLATE_ABORT:
551 ret = COMPACT_PARTIAL;
552 goto out;
553 case ISOLATE_NONE:
526 continue; 554 continue;
555 case ISOLATE_SUCCESS:
556 ;
557 }
527 558
528 nr_migrate = cc->nr_migratepages; 559 nr_migrate = cc->nr_migratepages;
529 err = migrate_pages(&cc->migratepages, compaction_alloc, 560 err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
547 578
548 } 579 }
549 580
581out:
550 /* Release free pages and check accounting */ 582 /* Release free pages and check accounting */
551 cc->nr_freepages -= release_freepages(&cc->freepages); 583 cc->nr_freepages -= release_freepages(&cc->freepages);
552 VM_BUG_ON(cc->nr_freepages != 0); 584 VM_BUG_ON(cc->nr_freepages != 0);
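Editor's note: the compaction.c hunks above stop overloading isolate_migratepages()'s return value (where 0 meant both "nothing isolated" and "give up") and instead return the new isolate_migrate_t, which compact_zone() dispatches on with a switch. As a minimal stand-alone illustration of that pattern only (step_result, isolate_step and the stub round logic below are invented for this sketch and are not kernel code):

#include <stdio.h>

/* Possible outcomes of one isolation step (mirrors isolate_migrate_t). */
typedef enum {
        STEP_ABORT,     /* give up on the whole run */
        STEP_NONE,      /* nothing done this round, keep scanning */
        STEP_SUCCESS,   /* work isolated, go process it */
} step_result;

/* Stub "scanner": rounds 0-1 find nothing, round 2 succeeds, round 3 aborts. */
static step_result isolate_step(int round)
{
        if (round < 2)
                return STEP_NONE;
        if (round == 2)
                return STEP_SUCCESS;
        return STEP_ABORT;
}

int main(void)
{
        int round;

        for (round = 0; round < 5; round++) {
                switch (isolate_step(round)) {
                case STEP_ABORT:
                        printf("round %d: abort run\n", round);
                        goto out;
                case STEP_NONE:
                        printf("round %d: nothing isolated, continue\n", round);
                        continue;
                case STEP_SUCCESS:
                        printf("round %d: isolated work, processing\n", round);
                        break;
                }
                /* ... process the isolated work here ... */
        }
out:
        return 0;
}

In compact_zone() the ISOLATE_ABORT case additionally sets ret = COMPACT_PARTIAL before jumping to the new out: label, as shown in the hunk above.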
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519a..fbb58e346888 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
500{ 500{
501 struct device *dev = pool->dev; 501 struct device *dev = pool->dev;
502 502
503 dma_pool_destroy(pool);
504 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); 503 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
504 dma_pool_destroy(pool);
505} 505}
506EXPORT_SYMBOL(dmam_pool_destroy); 506EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e492c3dd6f8..867d40222ec7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,9 +78,6 @@
78 * ->i_mutex (generic_file_buffered_write) 78 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 80 *
81 * ->i_mutex
82 * ->i_alloc_sem (various)
83 *
84 * bdi->wb.list_lock 81 * bdi->wb.list_lock
85 * sb_lock (fs/fs-writeback.c) 82 * sb_lock (fs/fs-writeback.c)
86 * ->mapping->tree_lock (__sync_single_inode) 83 * ->mapping->tree_lock (__sync_single_inode)
@@ -131,6 +128,7 @@ void __delete_from_page_cache(struct page *page)
131 128
132 radix_tree_delete(&mapping->page_tree, page->index); 129 radix_tree_delete(&mapping->page_tree, page->index);
133 page->mapping = NULL; 130 page->mapping = NULL;
131 /* Leave page->index set: truncation lookup relies upon it */
134 mapping->nrpages--; 132 mapping->nrpages--;
135 __dec_zone_page_state(page, NR_FILE_PAGES); 133 __dec_zone_page_state(page, NR_FILE_PAGES);
136 if (PageSwapBacked(page)) 134 if (PageSwapBacked(page))
@@ -486,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
486 spin_unlock_irq(&mapping->tree_lock); 484 spin_unlock_irq(&mapping->tree_lock);
487 } else { 485 } else {
488 page->mapping = NULL; 486 page->mapping = NULL;
487 /* Leave page->index set: truncation relies upon it */
489 spin_unlock_irq(&mapping->tree_lock); 488 spin_unlock_irq(&mapping->tree_lock);
490 mem_cgroup_uncharge_cache_page(page); 489 mem_cgroup_uncharge_cache_page(page);
491 page_cache_release(page); 490 page_cache_release(page);
@@ -1795,7 +1794,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1795 1794
1796static struct page *__read_cache_page(struct address_space *mapping, 1795static struct page *__read_cache_page(struct address_space *mapping,
1797 pgoff_t index, 1796 pgoff_t index,
1798 int (*filler)(void *,struct page*), 1797 int (*filler)(void *, struct page *),
1799 void *data, 1798 void *data,
1800 gfp_t gfp) 1799 gfp_t gfp)
1801{ 1800{
@@ -1826,7 +1825,7 @@ repeat:
1826 1825
1827static struct page *do_read_cache_page(struct address_space *mapping, 1826static struct page *do_read_cache_page(struct address_space *mapping,
1828 pgoff_t index, 1827 pgoff_t index,
1829 int (*filler)(void *,struct page*), 1828 int (*filler)(void *, struct page *),
1830 void *data, 1829 void *data,
1831 gfp_t gfp) 1830 gfp_t gfp)
1832 1831
@@ -1866,7 +1865,7 @@ out:
1866 * @mapping: the page's address_space 1865 * @mapping: the page's address_space
1867 * @index: the page index 1866 * @index: the page index
1868 * @filler: function to perform the read 1867 * @filler: function to perform the read
1869 * @data: destination for read data 1868 * @data: first arg to filler(data, page) function, often left as NULL
1870 * 1869 *
1871 * Same as read_cache_page, but don't wait for page to become unlocked 1870 * Same as read_cache_page, but don't wait for page to become unlocked
1872 * after submitting it to the filler. 1871 * after submitting it to the filler.
@@ -1878,7 +1877,7 @@ out:
1878 */ 1877 */
1879struct page *read_cache_page_async(struct address_space *mapping, 1878struct page *read_cache_page_async(struct address_space *mapping,
1880 pgoff_t index, 1879 pgoff_t index,
1881 int (*filler)(void *,struct page*), 1880 int (*filler)(void *, struct page *),
1882 void *data) 1881 void *data)
1883{ 1882{
1884 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 1883 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1926,7 +1925,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1926 * @mapping: the page's address_space 1925 * @mapping: the page's address_space
1927 * @index: the page index 1926 * @index: the page index
1928 * @filler: function to perform the read 1927 * @filler: function to perform the read
1929 * @data: destination for read data 1928 * @data: first arg to filler(data, page) function, often left as NULL
1930 * 1929 *
1931 * Read into the page cache. If a page already exists, and PageUptodate() is 1930 * Read into the page cache. If a page already exists, and PageUptodate() is
1932 * not set, try to fill the page then wait for it to become unlocked. 1931 * not set, try to fill the page then wait for it to become unlocked.
@@ -1935,7 +1934,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1935 */ 1934 */
1936struct page *read_cache_page(struct address_space *mapping, 1935struct page *read_cache_page(struct address_space *mapping,
1937 pgoff_t index, 1936 pgoff_t index,
1938 int (*filler)(void *,struct page*), 1937 int (*filler)(void *, struct page *),
1939 void *data) 1938 void *data)
1940{ 1939{
1941 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); 1940 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
@@ -2000,7 +1999,7 @@ int file_remove_suid(struct file *file)
2000 error = security_inode_killpriv(dentry); 1999 error = security_inode_killpriv(dentry);
2001 if (!error && killsuid) 2000 if (!error && killsuid)
2002 error = __remove_suid(dentry, killsuid); 2001 error = __remove_suid(dentry, killsuid);
2003 if (!error) 2002 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2004 inode->i_flags |= S_NOSEC; 2003 inode->i_flags |= S_NOSEC;
2005 2004
2006 return error; 2005 return error;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3cb..e2d1587be269 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm)
1596 list_del(&mm_slot->mm_node); 1596 list_del(&mm_slot->mm_node);
1597 free = 1; 1597 free = 1;
1598 } 1598 }
1599 spin_unlock(&khugepaged_mm_lock);
1599 1600
1600 if (free) { 1601 if (free) {
1601 spin_unlock(&khugepaged_mm_lock);
1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603 free_mm_slot(mm_slot); 1603 free_mm_slot(mm_slot);
1604 mmdrop(mm); 1604 mmdrop(mm);
1605 } else if (mm_slot) { 1605 } else if (mm_slot) {
1606 spin_unlock(&khugepaged_mm_lock);
1607 /* 1606 /*
1608 * This is required to serialize against 1607 * This is required to serialize against
1609 * khugepaged_test_exit() (which is guaranteed to run 1608 * khugepaged_test_exit() (which is guaranteed to run
@@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm)
1614 */ 1613 */
1615 down_write(&mm->mmap_sem); 1614 down_write(&mm->mmap_sem);
1616 up_write(&mm->mmap_sem); 1615 up_write(&mm->mmap_sem);
1617 } else 1616 }
1618 spin_unlock(&khugepaged_mm_lock);
1619} 1617}
1620 1618
1621static void release_pte_page(struct page *page) 1619static void release_pte_page(struct page *page)
@@ -2234,11 +2232,8 @@ static void khugepaged_loop(void)
2234 while (likely(khugepaged_enabled())) { 2232 while (likely(khugepaged_enabled())) {
2235#ifndef CONFIG_NUMA 2233#ifndef CONFIG_NUMA
2236 hpage = khugepaged_alloc_hugepage(); 2234 hpage = khugepaged_alloc_hugepage();
2237 if (unlikely(!hpage)) { 2235 if (unlikely(!hpage))
2238 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2239 break; 2236 break;
2240 }
2241 count_vm_event(THP_COLLAPSE_ALLOC);
2242#else 2237#else
2243 if (IS_ERR(hpage)) { 2238 if (IS_ERR(hpage)) {
2244 khugepaged_alloc_sleep(); 2239 khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6402458fee38..dae27ba3be2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/io.h> 27#include <linux/io.h>
28 28
29#include <linux/hugetlb.h> 29#include <linux/hugetlb.h>
30#include <linux/node.h> 30#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
62 * must either hold the mmap_sem for write, or the mmap_sem for read and 62 * must either hold the mmap_sem for write, or the mmap_sem for read and
63 * the hugetlb_instantiation mutex: 63 * the hugetlb_instantiation mutex:
64 * 64 *
65 * down_write(&mm->mmap_sem); 65 * down_write(&mm->mmap_sem);
66 * or 66 * or
67 * down_read(&mm->mmap_sem); 67 * down_read(&mm->mmap_sem);
68 * mutex_lock(&hugetlb_instantiation_mutex); 68 * mutex_lock(&hugetlb_instantiation_mutex);
69 */ 69 */
70struct file_region { 70struct file_region {
71 struct list_head link; 71 struct list_head link;
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
503 h->nr_huge_pages--; 503 h->nr_huge_pages--;
504 h->nr_huge_pages_node[page_to_nid(page)]--; 504 h->nr_huge_pages_node[page_to_nid(page)]--;
505 for (i = 0; i < pages_per_huge_page(h); i++) { 505 for (i = 0; i < pages_per_huge_page(h); i++) {
506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
507 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 507 1 << PG_referenced | 1 << PG_dirty |
508 1 << PG_private | 1<< PG_writeback); 508 1 << PG_active | 1 << PG_reserved |
509 1 << PG_private | 1 << PG_writeback);
509 } 510 }
510 set_compound_page_dtor(page, NULL); 511 set_compound_page_dtor(page, NULL);
511 set_page_refcounted(page); 512 set_page_refcounted(page);
@@ -591,7 +592,6 @@ int PageHuge(struct page *page)
591 592
592 return dtor == free_huge_page; 593 return dtor == free_huge_page;
593} 594}
594
595EXPORT_SYMBOL_GPL(PageHuge); 595EXPORT_SYMBOL_GPL(PageHuge);
596 596
597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -1105,12 +1105,28 @@ static void __init gather_bootmem_prealloc(void)
1105 struct huge_bootmem_page *m; 1105 struct huge_bootmem_page *m;
1106 1106
1107 list_for_each_entry(m, &huge_boot_pages, list) { 1107 list_for_each_entry(m, &huge_boot_pages, list) {
1108 struct page *page = virt_to_page(m);
1109 struct hstate *h = m->hstate; 1108 struct hstate *h = m->hstate;
1109 struct page *page;
1110
1111#ifdef CONFIG_HIGHMEM
1112 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1113 free_bootmem_late((unsigned long)m,
1114 sizeof(struct huge_bootmem_page));
1115#else
1116 page = virt_to_page(m);
1117#endif
1110 __ClearPageReserved(page); 1118 __ClearPageReserved(page);
1111 WARN_ON(page_count(page) != 1); 1119 WARN_ON(page_count(page) != 1);
1112 prep_compound_huge_page(page, h->order); 1120 prep_compound_huge_page(page, h->order);
1113 prep_new_huge_page(h, page, page_to_nid(page)); 1121 prep_new_huge_page(h, page, page_to_nid(page));
1122 /*
1123 * If we had gigantic hugepages allocated at boot time, we need
1124 * to restore the 'stolen' pages to totalram_pages in order to
1125 * fix confusing memory reports from free(1) and another
1126 * side-effects, like CommitLimit going negative.
1127 */
1128 if (h->order > (MAX_ORDER - 1))
1129 totalram_pages += 1 << h->order;
1114 } 1130 }
1115} 1131}
1116 1132
@@ -2116,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2116 pte_t entry; 2132 pte_t entry;
2117 2133
2118 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2134 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2119 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2135 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2120 update_mmu_cache(vma, address, ptep); 2136 update_mmu_cache(vma, address, ptep);
2121 }
2122} 2137}
2123 2138
2124 2139
@@ -2173,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
2173 if (huge_pte_none(pte) || pte_present(pte)) 2188 if (huge_pte_none(pte) || pte_present(pte))
2174 return 0; 2189 return 0;
2175 swp = pte_to_swp_entry(pte); 2190 swp = pte_to_swp_entry(pte);
2176 if (non_swap_entry(swp) && is_migration_entry(swp)) { 2191 if (non_swap_entry(swp) && is_migration_entry(swp))
2177 return 1; 2192 return 1;
2178 } else 2193 else
2179 return 0; 2194 return 0;
2180} 2195}
2181 2196
@@ -2186,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2186 if (huge_pte_none(pte) || pte_present(pte)) 2201 if (huge_pte_none(pte) || pte_present(pte))
2187 return 0; 2202 return 0;
2188 swp = pte_to_swp_entry(pte); 2203 swp = pte_to_swp_entry(pte);
2189 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { 2204 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2190 return 1; 2205 return 1;
2191 } else 2206 else
2192 return 0; 2207 return 0;
2193} 2208}
2194 2209
@@ -2551,7 +2566,7 @@ retry:
2551 * So we need to block hugepage fault by PG_hwpoison bit check. 2566 * So we need to block hugepage fault by PG_hwpoison bit check.
2552 */ 2567 */
2553 if (unlikely(PageHWPoison(page))) { 2568 if (unlikely(PageHWPoison(page))) {
2554 ret = VM_FAULT_HWPOISON | 2569 ret = VM_FAULT_HWPOISON |
2555 VM_FAULT_SET_HINDEX(h - hstates); 2570 VM_FAULT_SET_HINDEX(h - hstates);
2556 goto backout_unlocked; 2571 goto backout_unlocked;
2557 } 2572 }
@@ -2619,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2619 migration_entry_wait(mm, (pmd_t *)ptep, address); 2634 migration_entry_wait(mm, (pmd_t *)ptep, address);
2620 return 0; 2635 return 0;
2621 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2636 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2622 return VM_FAULT_HWPOISON_LARGE | 2637 return VM_FAULT_HWPOISON_LARGE |
2623 VM_FAULT_SET_HINDEX(h - hstates); 2638 VM_FAULT_SET_HINDEX(h - hstates);
2624 } 2639 }
2625 2640
diff --git a/mm/ksm.c b/mm/ksm.c
index d708b3ef2260..9a68b0cf0a1c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1302 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1302 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1303 ksm_scan.mm_slot = slot; 1303 ksm_scan.mm_slot = slot;
1304 spin_unlock(&ksm_mmlist_lock); 1304 spin_unlock(&ksm_mmlist_lock);
1305 /*
1306 * Although we tested list_empty() above, a racing __ksm_exit
1307 * of the last mm on the list may have removed it since then.
1308 */
1309 if (slot == &ksm_mm_head)
1310 return NULL;
1305next_mm: 1311next_mm:
1306 ksm_scan.address = 0; 1312 ksm_scan.address = 0;
1307 ksm_scan.rmap_list = &slot->rmap_list; 1313 ksm_scan.rmap_list = &slot->rmap_list;
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
218 endoff = (loff_t)(end - vma->vm_start - 1) 218 endoff = (loff_t)(end - vma->vm_start - 1)
219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220 220
221 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 221 /* vmtruncate_range needs to take i_mutex */
222 up_read(&current->mm->mmap_sem); 222 up_read(&current->mm->mmap_sem);
223 error = vmtruncate_range(mapping->host, offset, endoff); 223 error = vmtruncate_range(mapping->host, offset, endoff);
224 down_read(&current->mm->mmap_sem); 224 down_read(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index a0562d1a6ad4..ccbf97339592 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -758,9 +758,9 @@ void __init memblock_analyze(void)
758 758
759 /* Check marker in the unused last array entry */ 759 /* Check marker in the unused last array entry */
760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base 760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
761 != (phys_addr_t)RED_INACTIVE); 761 != MEMBLOCK_INACTIVE);
762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base 762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
763 != (phys_addr_t)RED_INACTIVE); 763 != MEMBLOCK_INACTIVE);
764 764
765 memblock.memory_size = 0; 765 memblock.memory_size = 0;
766 766
@@ -786,8 +786,8 @@ void __init memblock_init(void)
786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS; 786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
787 787
788 /* Write a marker in the unused last array entry */ 788 /* Write a marker in the unused last array entry */
789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
791 791
792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
793 * This simplifies the memblock_add() code below... 793 * This simplifies the memblock_add() code below...
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/rbtree.h> 37#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
38#include <linux/slab.h> 39#include <linux/slab.h>
39#include <linux/swap.h> 40#include <linux/swap.h>
40#include <linux/swapops.h> 41#include <linux/swapops.h>
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index {
107enum mem_cgroup_events_target { 108enum mem_cgroup_events_target {
108 MEM_CGROUP_TARGET_THRESH, 109 MEM_CGROUP_TARGET_THRESH,
109 MEM_CGROUP_TARGET_SOFTLIMIT, 110 MEM_CGROUP_TARGET_SOFTLIMIT,
111 MEM_CGROUP_TARGET_NUMAINFO,
110 MEM_CGROUP_NTARGETS, 112 MEM_CGROUP_NTARGETS,
111}; 113};
112#define THRESHOLDS_EVENTS_TARGET (128) 114#define THRESHOLDS_EVENTS_TARGET (128)
113#define SOFTLIMIT_EVENTS_TARGET (1024) 115#define SOFTLIMIT_EVENTS_TARGET (1024)
116#define NUMAINFO_EVENTS_TARGET (1024)
114 117
115struct mem_cgroup_stat_cpu { 118struct mem_cgroup_stat_cpu {
116 long count[MEM_CGROUP_STAT_NSTATS]; 119 long count[MEM_CGROUP_STAT_NSTATS];
@@ -236,7 +239,8 @@ struct mem_cgroup {
236 int last_scanned_node; 239 int last_scanned_node;
237#if MAX_NUMNODES > 1 240#if MAX_NUMNODES > 1
238 nodemask_t scan_nodes; 241 nodemask_t scan_nodes;
239 unsigned long next_scan_node_update; 242 atomic_t numainfo_events;
243 atomic_t numainfo_updating;
240#endif 244#endif
241 /* 245 /*
242 * Should the accounting and control be hierarchical, per subtree? 246 * Should the accounting and control be hierarchical, per subtree?
@@ -359,7 +363,7 @@ enum charge_type {
359static void mem_cgroup_get(struct mem_cgroup *mem); 363static void mem_cgroup_get(struct mem_cgroup *mem);
360static void mem_cgroup_put(struct mem_cgroup *mem); 364static void mem_cgroup_put(struct mem_cgroup *mem);
361static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 365static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
362static void drain_all_stock_async(void); 366static void drain_all_stock_async(struct mem_cgroup *mem);
363 367
364static struct mem_cgroup_per_zone * 368static struct mem_cgroup_per_zone *
365mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 369mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
576 return val; 580 return val;
577} 581}
578 582
579static long mem_cgroup_local_usage(struct mem_cgroup *mem)
580{
581 long ret;
582
583 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
584 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
585 return ret;
586}
587
588static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 583static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
589 bool charge) 584 bool charge)
590{ 585{
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
688 case MEM_CGROUP_TARGET_SOFTLIMIT: 683 case MEM_CGROUP_TARGET_SOFTLIMIT:
689 next = val + SOFTLIMIT_EVENTS_TARGET; 684 next = val + SOFTLIMIT_EVENTS_TARGET;
690 break; 685 break;
686 case MEM_CGROUP_TARGET_NUMAINFO:
687 next = val + NUMAINFO_EVENTS_TARGET;
688 break;
691 default: 689 default:
692 return; 690 return;
693 } 691 }
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
706 mem_cgroup_threshold(mem); 704 mem_cgroup_threshold(mem);
707 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 705 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
708 if (unlikely(__memcg_event_check(mem, 706 if (unlikely(__memcg_event_check(mem,
709 MEM_CGROUP_TARGET_SOFTLIMIT))){ 707 MEM_CGROUP_TARGET_SOFTLIMIT))) {
710 mem_cgroup_update_tree(mem, page); 708 mem_cgroup_update_tree(mem, page);
711 __mem_cgroup_target_update(mem, 709 __mem_cgroup_target_update(mem,
712 MEM_CGROUP_TARGET_SOFTLIMIT); 710 MEM_CGROUP_TARGET_SOFTLIMIT);
713 } 711 }
712#if MAX_NUMNODES > 1
713 if (unlikely(__memcg_event_check(mem,
714 MEM_CGROUP_TARGET_NUMAINFO))) {
715 atomic_inc(&mem->numainfo_events);
716 __mem_cgroup_target_update(mem,
717 MEM_CGROUP_TARGET_NUMAINFO);
718 }
719#endif
714 } 720 }
715} 721}
716 722
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
735 struct mem_cgroup, css); 741 struct mem_cgroup, css);
736} 742}
737 743
738static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 744struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
739{ 745{
740 struct mem_cgroup *mem = NULL; 746 struct mem_cgroup *mem = NULL;
741 747
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
1128 return MEM_CGROUP_ZSTAT(mz, lru); 1134 return MEM_CGROUP_ZSTAT(mz, lru);
1129} 1135}
1130 1136
1131#ifdef CONFIG_NUMA
1132static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, 1137static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1133 int nid) 1138 int nid)
1134{ 1139{
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1140 return ret; 1145 return ret;
1141} 1146}
1142 1147
1148static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1149 int nid)
1150{
1151 unsigned long ret;
1152
1153 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1154 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1155 return ret;
1156}
1157
1158#if MAX_NUMNODES > 1
1143static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) 1159static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1144{ 1160{
1145 u64 total = 0; 1161 u64 total = 0;
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1151 return total; 1167 return total;
1152} 1168}
1153 1169
1154static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1155 int nid)
1156{
1157 unsigned long ret;
1158
1159 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1160 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1161
1162 return ret;
1163}
1164
1165static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) 1170static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1166{ 1171{
1167 u64 total = 0; 1172 u64 total = 0;
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1558 return ret; 1563 return ret;
1559} 1564}
1560 1565
1566/**
1567 * test_mem_cgroup_node_reclaimable
1568 * @mem: the target memcg
1569 * @nid: the node ID to be checked.
1570 * @noswap : specify true here if the user wants flle only information.
1571 *
1572 * This function returns whether the specified memcg contains any
1573 * reclaimable pages on a node. Returns true if there are any reclaimable
1574 * pages in the node.
1575 */
1576static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1577 int nid, bool noswap)
1578{
1579 if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
1580 return true;
1581 if (noswap || !total_swap_pages)
1582 return false;
1583 if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
1584 return true;
1585 return false;
1586
1587}
1561#if MAX_NUMNODES > 1 1588#if MAX_NUMNODES > 1
1562 1589
1563/* 1590/*
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1569static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1596static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1570{ 1597{
1571 int nid; 1598 int nid;
1572 1599 /*
1573 if (time_after(mem->next_scan_node_update, jiffies)) 1600 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1601 * pagein/pageout changes since the last update.
1602 */
1603 if (!atomic_read(&mem->numainfo_events))
1604 return;
1605 if (atomic_inc_return(&mem->numainfo_updating) > 1)
1574 return; 1606 return;
1575 1607
1576 mem->next_scan_node_update = jiffies + 10*HZ;
1577 /* make a nodemask where this memcg uses memory from */ 1608 /* make a nodemask where this memcg uses memory from */
1578 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1609 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1579 1610
1580 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1611 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1581 1612
1582 if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || 1613 if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
1583 mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) 1614 node_clear(nid, mem->scan_nodes);
1584 continue;
1585
1586 if (total_swap_pages &&
1587 (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
1588 mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
1589 continue;
1590 node_clear(nid, mem->scan_nodes);
1591 } 1615 }
1616
1617 atomic_set(&mem->numainfo_events, 0);
1618 atomic_set(&mem->numainfo_updating, 0);
1592} 1619}
1593 1620
1594/* 1621/*
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1626 return node; 1653 return node;
1627} 1654}
1628 1655
1656/*
1657 * Check all nodes whether it contains reclaimable pages or not.
1658 * For quick scan, we make use of scan_nodes. This will allow us to skip
1659 * unused nodes. But scan_nodes is lazily updated and may not cotain
1660 * enough new information. We need to do double check.
1661 */
1662bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1663{
1664 int nid;
1665
1666 /*
1667 * quick check...making use of scan_node.
1668 * We can skip unused nodes.
1669 */
1670 if (!nodes_empty(mem->scan_nodes)) {
1671 for (nid = first_node(mem->scan_nodes);
1672 nid < MAX_NUMNODES;
1673 nid = next_node(nid, mem->scan_nodes)) {
1674
1675 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1676 return true;
1677 }
1678 }
1679 /*
1680 * Check rest of nodes.
1681 */
1682 for_each_node_state(nid, N_HIGH_MEMORY) {
1683 if (node_isset(nid, mem->scan_nodes))
1684 continue;
1685 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1686 return true;
1687 }
1688 return false;
1689}
1690
1629#else 1691#else
1630int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1692int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1631{ 1693{
1632 return 0; 1694 return 0;
1633} 1695}
1696
1697bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1698{
1699 return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
1700}
1634#endif 1701#endif
1635 1702
1636/* 1703/*
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1663 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1664 1731
1665 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1732 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1666 if (root_mem->memsw_is_minimum) 1733 if (!check_soft && root_mem->memsw_is_minimum)
1667 noswap = true; 1734 noswap = true;
1668 1735
1669 while (1) { 1736 while (1) {
1670 victim = mem_cgroup_select_victim(root_mem); 1737 victim = mem_cgroup_select_victim(root_mem);
1671 if (victim == root_mem) { 1738 if (victim == root_mem) {
1672 loop++; 1739 loop++;
1673 if (loop >= 1) 1740 /*
1674 drain_all_stock_async(); 1741 * We are not draining per cpu cached charges during
1742 * soft limit reclaim because global reclaim doesn't
1743 * care about charges. It tries to free some memory and
1744 * charges will not give any.
1745 */
1746 if (!check_soft && loop >= 1)
1747 drain_all_stock_async(root_mem);
1675 if (loop >= 2) { 1748 if (loop >= 2) {
1676 /* 1749 /*
1677 * If we have not been able to reclaim 1750 * If we have not been able to reclaim
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1695 } 1768 }
1696 } 1769 }
1697 } 1770 }
1698 if (!mem_cgroup_local_usage(victim)) { 1771 if (!mem_cgroup_reclaimable(victim, noswap)) {
1699 /* this cgroup's local usage == 0 */ 1772 /* this cgroup's local usage == 0 */
1700 css_put(&victim->css); 1773 css_put(&victim->css);
1701 continue; 1774 continue;
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp {
1934 struct mem_cgroup *cached; /* this never be root cgroup */ 2007 struct mem_cgroup *cached; /* this never be root cgroup */
1935 unsigned int nr_pages; 2008 unsigned int nr_pages;
1936 struct work_struct work; 2009 struct work_struct work;
2010 unsigned long flags;
2011#define FLUSHING_CACHED_CHARGE (0)
1937}; 2012};
1938static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2013static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1939static atomic_t memcg_drain_count; 2014static DEFINE_MUTEX(percpu_charge_mutex);
1940 2015
1941/* 2016/*
1942 * Try to consume stocked charge on this cpu. If success, one page is consumed 2017 * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy)
1984{ 2059{
1985 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2060 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1986 drain_stock(stock); 2061 drain_stock(stock);
2062 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1987} 2063}
1988 2064
1989/* 2065/*
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2008 * expects some charges will be back to res_counter later but cannot wait for 2084 * expects some charges will be back to res_counter later but cannot wait for
2009 * it. 2085 * it.
2010 */ 2086 */
2011static void drain_all_stock_async(void) 2087static void drain_all_stock_async(struct mem_cgroup *root_mem)
2012{ 2088{
2013 int cpu; 2089 int cpu, curcpu;
2014 /* This function is for scheduling "drain" in asynchronous way. 2090 /*
2015 * The result of "drain" is not directly handled by callers. Then, 2091 * If someone calls draining, avoid adding more kworker runs.
2016 * if someone is calling drain, we don't have to call drain more.
2017 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
2018 * there is a race. We just do loose check here.
2019 */ 2092 */
2020 if (atomic_read(&memcg_drain_count)) 2093 if (!mutex_trylock(&percpu_charge_mutex))
2021 return; 2094 return;
2022 /* Notify other cpus that system-wide "drain" is running */ 2095 /* Notify other cpus that system-wide "drain" is running */
2023 atomic_inc(&memcg_drain_count);
2024 get_online_cpus(); 2096 get_online_cpus();
2097 /*
2098 * Get a hint for avoiding draining charges on the current cpu,
2099 * which must be exhausted by our charging. It is not required that
2100 * this be a precise check, so we use raw_smp_processor_id() instead of
2101 * getcpu()/putcpu().
2102 */
2103 curcpu = raw_smp_processor_id();
2025 for_each_online_cpu(cpu) { 2104 for_each_online_cpu(cpu) {
2026 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2105 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2027 schedule_work_on(cpu, &stock->work); 2106 struct mem_cgroup *mem;
2107
2108 if (cpu == curcpu)
2109 continue;
2110
2111 mem = stock->cached;
2112 if (!mem)
2113 continue;
2114 if (mem != root_mem) {
2115 if (!root_mem->use_hierarchy)
2116 continue;
2117 /* check whether "mem" is under tree of "root_mem" */
2118 if (!css_is_ancestor(&mem->css, &root_mem->css))
2119 continue;
2120 }
2121 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2122 schedule_work_on(cpu, &stock->work);
2028 } 2123 }
2029 put_online_cpus(); 2124 put_online_cpus();
2030 atomic_dec(&memcg_drain_count); 2125 mutex_unlock(&percpu_charge_mutex);
2031 /* We don't wait for flush_work */ 2126 /* We don't wait for flush_work */
2032} 2127}
2033 2128
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void)
2035static void drain_all_stock_sync(void) 2130static void drain_all_stock_sync(void)
2036{ 2131{
2037 /* called when force_empty is called */ 2132 /* called when force_empty is called */
2038 atomic_inc(&memcg_drain_count); 2133 mutex_lock(&percpu_charge_mutex);
2039 schedule_on_each_cpu(drain_local_stock); 2134 schedule_on_each_cpu(drain_local_stock);
2040 atomic_dec(&memcg_drain_count); 2135 mutex_unlock(&percpu_charge_mutex);
2041} 2136}
2042 2137
2043/* 2138/*
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = {
4640 { 4735 {
4641 .name = "numa_stat", 4736 .name = "numa_stat",
4642 .open = mem_control_numa_stat_open, 4737 .open = mem_control_numa_stat_open,
4738 .mode = S_IRUGO,
4643 }, 4739 },
4644#endif 4740#endif
4645}; 4741};
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5414 struct cgroup *old_cont, 5510 struct cgroup *old_cont,
5415 struct task_struct *p) 5511 struct task_struct *p)
5416{ 5512{
5417 struct mm_struct *mm; 5513 struct mm_struct *mm = get_task_mm(p);
5418 5514
5419 if (!mc.to)
5420 /* no need to move charge */
5421 return;
5422
5423 mm = get_task_mm(p);
5424 if (mm) { 5515 if (mm) {
5425 mem_cgroup_move_charge(mm); 5516 if (mc.to)
5517 mem_cgroup_move_charge(mm);
5518 put_swap_token(mm);
5426 mmput(mm); 5519 mmput(mm);
5427 } 5520 }
5428 mem_cgroup_clear_mc(); 5521 if (mc.to)
5522 mem_cgroup_clear_mc();
5429} 5523}
5430#else /* !CONFIG_MMU */ 5524#else /* !CONFIG_MMU */
5431static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5525static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
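Editor's note: the memcontrol.c rework above serializes drainers with mutex_trylock(&percpu_charge_mutex) and marks each per-cpu stock with a FLUSHING_CACHED_CHARGE bit (test_and_set_bit before scheduling, clear_bit when the work runs) so drain requests are neither duplicated nor piled up. A minimal user-space sketch of that "trylock gate plus per-slot busy flag" idea, assuming invented names (worker_slot, drain_all, drain_slot) and C11 atomics in place of the kernel primitives:

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_SLOTS 4

/* One "stock" per slot (stands in for the per-cpu memcg_stock). */
struct worker_slot {
        atomic_bool flushing;   /* like FLUSHING_CACHED_CHARGE */
        int cached;             /* pretend cached charge */
};

static struct worker_slot slots[NR_SLOTS];
static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

/* The "work item": drain one slot, then clear its busy flag. */
static void drain_slot(struct worker_slot *slot)
{
        slot->cached = 0;
        atomic_store(&slot->flushing, false);
}

/* Ask every slot except 'self' to drain; never pile up requests. */
static void drain_all(int self)
{
        int i;

        /* If someone is already draining, don't add more work. */
        if (pthread_mutex_trylock(&drain_mutex) != 0)
                return;

        for (i = 0; i < NR_SLOTS; i++) {
                if (i == self || slots[i].cached == 0)
                        continue;
                /* Queue work only if this slot isn't already flushing
                 * (the test_and_set_bit() step in the patch). */
                if (!atomic_exchange(&slots[i].flushing, true))
                        drain_slot(&slots[i]);  /* kernel: schedule_work_on() */
        }

        pthread_mutex_unlock(&drain_mutex);
}

int main(void)
{
        int i;

        for (i = 0; i < NR_SLOTS; i++)
                slots[i].cached = 10;

        drain_all(0);

        for (i = 0; i < NR_SLOTS; i++)
                printf("slot %d cached=%d\n", i, slots[i].cached);
        return 0;
}

Here the direct drain_slot() call stands in for schedule_work_on(); in the patch the flag is cleared from the queued work itself (drain_local_stock), and cpus whose cached memcg is outside the target hierarchy are skipped before the flag test.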
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928d..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@
52#include <linux/swapops.h> 52#include <linux/swapops.h>
53#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 54#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h>
55#include "internal.h" 56#include "internal.h"
56 57
57int sysctl_memory_failure_early_kill __read_mostly = 0; 58int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -390,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
390 struct task_struct *tsk; 391 struct task_struct *tsk;
391 struct anon_vma *av; 392 struct anon_vma *av;
392 393
393 read_lock(&tasklist_lock);
394 av = page_lock_anon_vma(page); 394 av = page_lock_anon_vma(page);
395 if (av == NULL) /* Not actually mapped anymore */ 395 if (av == NULL) /* Not actually mapped anymore */
396 goto out; 396 return;
397
398 read_lock(&tasklist_lock);
397 for_each_process (tsk) { 399 for_each_process (tsk) {
398 struct anon_vma_chain *vmac; 400 struct anon_vma_chain *vmac;
399 401
@@ -407,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
407 add_to_kill(tsk, page, vma, to_kill, tkc); 409 add_to_kill(tsk, page, vma, to_kill, tkc);
408 } 410 }
409 } 411 }
410 page_unlock_anon_vma(av);
411out:
412 read_unlock(&tasklist_lock); 412 read_unlock(&tasklist_lock);
413 page_unlock_anon_vma(av);
413} 414}
414 415
415/* 416/*
@@ -423,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
423 struct prio_tree_iter iter; 424 struct prio_tree_iter iter;
424 struct address_space *mapping = page->mapping; 425 struct address_space *mapping = page->mapping;
425 426
426 /*
427 * A note on the locking order between the two locks.
428 * We don't rely on this particular order.
429 * If you have some other code that needs a different order
430 * feel free to switch them around. Or add a reverse link
431 * from mm_struct to task_struct, then this could be all
432 * done without taking tasklist_lock and looping over all tasks.
433 */
434
435 read_lock(&tasklist_lock);
436 mutex_lock(&mapping->i_mmap_mutex); 427 mutex_lock(&mapping->i_mmap_mutex);
428 read_lock(&tasklist_lock);
437 for_each_process(tsk) { 429 for_each_process(tsk) {
438 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 430 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
439 431
@@ -453,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
453 add_to_kill(tsk, page, vma, to_kill, tkc); 445 add_to_kill(tsk, page, vma, to_kill, tkc);
454 } 446 }
455 } 447 }
456 mutex_unlock(&mapping->i_mmap_mutex);
457 read_unlock(&tasklist_lock); 448 read_unlock(&tasklist_lock);
449 mutex_unlock(&mapping->i_mmap_mutex);
458} 450}
459 451
460/* 452/*
@@ -1468,7 +1460,8 @@ int soft_offline_page(struct page *page, int flags)
1468 put_page(page); 1460 put_page(page);
1469 if (!ret) { 1461 if (!ret) {
1470 LIST_HEAD(pagelist); 1462 LIST_HEAD(pagelist);
1471 1463 inc_zone_page_state(page, NR_ISOLATED_ANON +
1464 page_is_file_cache(page));
1472 list_add(&page->lru, &pagelist); 1465 list_add(&page->lru, &pagelist);
1473 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1466 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1474 0, true); 1467 0, true);
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e01..a56e3ba816b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
305 if (batch->nr == batch->max) { 305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb)) 306 if (!tlb_next_batch(tlb))
307 return 0; 307 return 0;
308 batch = tlb->active;
308 } 309 }
309 VM_BUG_ON(batch->nr > batch->max); 310 VM_BUG_ON(batch->nr > batch->max);
310 311
@@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1112 int force_flush = 0; 1113 int force_flush = 0;
1113 int rss[NR_MM_COUNTERS]; 1114 int rss[NR_MM_COUNTERS];
1114 spinlock_t *ptl; 1115 spinlock_t *ptl;
1116 pte_t *start_pte;
1115 pte_t *pte; 1117 pte_t *pte;
1116 1118
1117again: 1119again:
1118 init_rss_vec(rss); 1120 init_rss_vec(rss);
1119 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1121 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
1122 pte = start_pte;
1120 arch_enter_lazy_mmu_mode(); 1123 arch_enter_lazy_mmu_mode();
1121 do { 1124 do {
1122 pte_t ptent = *pte; 1125 pte_t ptent = *pte;
@@ -1196,7 +1199,7 @@ again:
1196 1199
1197 add_mm_rss_vec(mm, rss); 1200 add_mm_rss_vec(mm, rss);
1198 arch_leave_lazy_mmu_mode(); 1201 arch_leave_lazy_mmu_mode();
1199 pte_unmap_unlock(pte - 1, ptl); 1202 pte_unmap_unlock(start_pte, ptl);
1200 1203
1201 /* 1204 /*
1202 * mmu_gather ran out of room to batch pages, we break out of 1205 * mmu_gather ran out of room to batch pages, we break out of
@@ -1287,16 +1290,9 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1287 return addr; 1290 return addr;
1288} 1291}
1289 1292
1290#ifdef CONFIG_PREEMPT
1291# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1292#else
1293/* No preempt: go for improved straight-line efficiency */
1294# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1295#endif
1296
1297/** 1293/**
1298 * unmap_vmas - unmap a range of memory covered by a list of vma's 1294 * unmap_vmas - unmap a range of memory covered by a list of vma's
1299 * @tlbp: address of the caller's struct mmu_gather 1295 * @tlb: address of the caller's struct mmu_gather
1300 * @vma: the starting vma 1296 * @vma: the starting vma
1301 * @start_addr: virtual address at which to start unmapping 1297 * @start_addr: virtual address at which to start unmapping
1302 * @end_addr: virtual address at which to end unmapping 1298 * @end_addr: virtual address at which to end unmapping
@@ -1307,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1307 * 1303 *
1308 * Unmap all pages in the vma list. 1304 * Unmap all pages in the vma list.
1309 * 1305 *
1310 * We aim to not hold locks for too long (for scheduling latency reasons).
1311 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
1312 * return the ending mmu_gather to the caller.
1313 *
1314 * Only addresses between `start' and `end' will be unmapped. 1306 * Only addresses between `start' and `end' will be unmapped.
1315 * 1307 *
1316 * The VMA list must be sorted in ascending virtual address order. 1308 * The VMA list must be sorted in ascending virtual address order.
@@ -1813,7 +1805,63 @@ next_page:
1813} 1805}
1814EXPORT_SYMBOL(__get_user_pages); 1806EXPORT_SYMBOL(__get_user_pages);
1815 1807
1816/** 1808/*
1809 * fixup_user_fault() - manually resolve a user page fault
1810 * @tsk: the task_struct to use for page fault accounting, or
1811 * NULL if faults are not to be recorded.
1812 * @mm: mm_struct of target mm
1813 * @address: user address
1814 * @fault_flags:flags to pass down to handle_mm_fault()
1815 *
1816 * This is meant to be called in the specific scenario where for locking reasons
1817 * we try to access user memory in atomic context (within a pagefault_disable()
1818 * section), this returns -EFAULT, and we want to resolve the user fault before
1819 * trying again.
1820 *
1821 * Typically this is meant to be used by the futex code.
1822 *
1823 * The main difference with get_user_pages() is that this function will
1824 * unconditionally call handle_mm_fault() which will in turn perform all the
1825 * necessary SW fixup of the dirty and young bits in the PTE, while
1826 * handle_mm_fault() only guarantees to update these in the struct page.
1827 *
1828 * This is important for some architectures where those bits also gate the
1829 * access permission to the page because they are maintained in software. On
1830 * such architectures, gup() will not be enough to make a subsequent access
1831 * succeed.
1832 *
1833 * This should be called with the mm_sem held for read.
1834 */
1835int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1836 unsigned long address, unsigned int fault_flags)
1837{
1838 struct vm_area_struct *vma;
1839 int ret;
1840
1841 vma = find_extend_vma(mm, address);
1842 if (!vma || address < vma->vm_start)
1843 return -EFAULT;
1844
1845 ret = handle_mm_fault(mm, vma, address, fault_flags);
1846 if (ret & VM_FAULT_ERROR) {
1847 if (ret & VM_FAULT_OOM)
1848 return -ENOMEM;
1849 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1850 return -EHWPOISON;
1851 if (ret & VM_FAULT_SIGBUS)
1852 return -EFAULT;
1853 BUG();
1854 }
1855 if (tsk) {
1856 if (ret & VM_FAULT_MAJOR)
1857 tsk->maj_flt++;
1858 else
1859 tsk->min_flt++;
1860 }
1861 return 0;
1862}
1863
1864/*
1817 * get_user_pages() - pin user pages in memory 1865 * get_user_pages() - pin user pages in memory
1818 * @tsk: the task_struct to use for page fault accounting, or 1866 * @tsk: the task_struct to use for page fault accounting, or
1819 * NULL if faults are not to be recorded. 1867 * NULL if faults are not to be recorded.
@@ -2796,30 +2844,6 @@ void unmap_mapping_range(struct address_space *mapping,
2796} 2844}
2797EXPORT_SYMBOL(unmap_mapping_range); 2845EXPORT_SYMBOL(unmap_mapping_range);
2798 2846
2799int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2800{
2801 struct address_space *mapping = inode->i_mapping;
2802
2803 /*
2804 * If the underlying filesystem is not going to provide
2805 * a way to truncate a range of blocks (punch a hole) -
2806 * we should return failure right now.
2807 */
2808 if (!inode->i_op->truncate_range)
2809 return -ENOSYS;
2810
2811 mutex_lock(&inode->i_mutex);
2812 down_write(&inode->i_alloc_sem);
2813 unmap_mapping_range(mapping, offset, (end - offset), 1);
2814 truncate_inode_pages_range(mapping, offset, end);
2815 unmap_mapping_range(mapping, offset, (end - offset), 1);
2816 inode->i_op->truncate_range(inode, offset, end);
2817 up_write(&inode->i_alloc_sem);
2818 mutex_unlock(&inode->i_mutex);
2819
2820 return 0;
2821}
2822
2823/* 2847/*
2824 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2848 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2825 * but allow concurrent faults), and pte mapped but not yet locked. 2849 * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3125,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3125 pte_t *page_table; 3149 pte_t *page_table;
3126 spinlock_t *ptl; 3150 spinlock_t *ptl;
3127 struct page *page; 3151 struct page *page;
3152 struct page *cow_page;
3128 pte_t entry; 3153 pte_t entry;
3129 int anon = 0; 3154 int anon = 0;
3130 int charged = 0;
3131 struct page *dirty_page = NULL; 3155 struct page *dirty_page = NULL;
3132 struct vm_fault vmf; 3156 struct vm_fault vmf;
3133 int ret; 3157 int ret;
3134 int page_mkwrite = 0; 3158 int page_mkwrite = 0;
3135 3159
3160 /*
3161 * If we do COW later, allocate the page before taking lock_page()
3162 * on the file cache page. This will reduce lock holding time.
3163 */
3164 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3165
3166 if (unlikely(anon_vma_prepare(vma)))
3167 return VM_FAULT_OOM;
3168
3169 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3170 if (!cow_page)
3171 return VM_FAULT_OOM;
3172
3173 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3174 page_cache_release(cow_page);
3175 return VM_FAULT_OOM;
3176 }
3177 } else
3178 cow_page = NULL;
3179
3136 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3180 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3137 vmf.pgoff = pgoff; 3181 vmf.pgoff = pgoff;
3138 vmf.flags = flags; 3182 vmf.flags = flags;
@@ -3141,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3141 ret = vma->vm_ops->fault(vma, &vmf); 3185 ret = vma->vm_ops->fault(vma, &vmf);
3142 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3186 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3143 VM_FAULT_RETRY))) 3187 VM_FAULT_RETRY)))
3144 return ret; 3188 goto uncharge_out;
3145 3189
3146 if (unlikely(PageHWPoison(vmf.page))) { 3190 if (unlikely(PageHWPoison(vmf.page))) {
3147 if (ret & VM_FAULT_LOCKED) 3191 if (ret & VM_FAULT_LOCKED)
3148 unlock_page(vmf.page); 3192 unlock_page(vmf.page);
3149 return VM_FAULT_HWPOISON; 3193 ret = VM_FAULT_HWPOISON;
3194 goto uncharge_out;
3150 } 3195 }
3151 3196
3152 /* 3197 /*
@@ -3164,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3164 page = vmf.page; 3209 page = vmf.page;
3165 if (flags & FAULT_FLAG_WRITE) { 3210 if (flags & FAULT_FLAG_WRITE) {
3166 if (!(vma->vm_flags & VM_SHARED)) { 3211 if (!(vma->vm_flags & VM_SHARED)) {
3212 page = cow_page;
3167 anon = 1; 3213 anon = 1;
3168 if (unlikely(anon_vma_prepare(vma))) {
3169 ret = VM_FAULT_OOM;
3170 goto out;
3171 }
3172 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3173 vma, address);
3174 if (!page) {
3175 ret = VM_FAULT_OOM;
3176 goto out;
3177 }
3178 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3179 ret = VM_FAULT_OOM;
3180 page_cache_release(page);
3181 goto out;
3182 }
3183 charged = 1;
3184 copy_user_highpage(page, vmf.page, address, vma); 3214 copy_user_highpage(page, vmf.page, address, vma);
3185 __SetPageUptodate(page); 3215 __SetPageUptodate(page);
3186 } else { 3216 } else {
@@ -3249,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3249 /* no need to invalidate: a not-present page won't be cached */ 3279 /* no need to invalidate: a not-present page won't be cached */
3250 update_mmu_cache(vma, address, page_table); 3280 update_mmu_cache(vma, address, page_table);
3251 } else { 3281 } else {
3252 if (charged) 3282 if (cow_page)
3253 mem_cgroup_uncharge_page(page); 3283 mem_cgroup_uncharge_page(cow_page);
3254 if (anon) 3284 if (anon)
3255 page_cache_release(page); 3285 page_cache_release(page);
3256 else 3286 else
@@ -3259,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3259 3289
3260 pte_unmap_unlock(page_table, ptl); 3290 pte_unmap_unlock(page_table, ptl);
3261 3291
3262out:
3263 if (dirty_page) { 3292 if (dirty_page) {
3264 struct address_space *mapping = page->mapping; 3293 struct address_space *mapping = page->mapping;
3265 3294
@@ -3289,6 +3318,13 @@ out:
3289unwritable_page: 3318unwritable_page:
3290 page_cache_release(page); 3319 page_cache_release(page);
3291 return ret; 3320 return ret;
3321uncharge_out:
3322 /* the fs's fault handler returned an error */
3323 if (cow_page) {
3324 mem_cgroup_uncharge_page(cow_page);
3325 page_cache_release(cow_page);
3326 }
3327 return ret;
3292} 3328}
3293 3329
3294static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3330static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32f..6e7d8b21dbfa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,17 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37/*
38 * online_page_callback contains a pointer to the current page-onlining
39 * function. Initially it is generic_online_page(). If required, it can be
40 * changed by calling set_online_page_callback() to register a callback and
41 * restore_online_page_callback() to restore the generic callback.
42 */
43
44static void generic_online_page(struct page *page);
45
46static online_page_callback_t online_page_callback = generic_online_page;
47
37DEFINE_MUTEX(mem_hotplug_mutex); 48DEFINE_MUTEX(mem_hotplug_mutex);
38 49
39void lock_memory_hotplug(void) 50void lock_memory_hotplug(void)
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
361} 372}
362EXPORT_SYMBOL_GPL(__remove_pages); 373EXPORT_SYMBOL_GPL(__remove_pages);
363 374
364void online_page(struct page *page) 375int set_online_page_callback(online_page_callback_t callback)
376{
377 int rc = -EINVAL;
378
379 lock_memory_hotplug();
380
381 if (online_page_callback == generic_online_page) {
382 online_page_callback = callback;
383 rc = 0;
384 }
385
386 unlock_memory_hotplug();
387
388 return rc;
389}
390EXPORT_SYMBOL_GPL(set_online_page_callback);
391
392int restore_online_page_callback(online_page_callback_t callback)
393{
394 int rc = -EINVAL;
395
396 lock_memory_hotplug();
397
398 if (online_page_callback == callback) {
399 online_page_callback = generic_online_page;
400 rc = 0;
401 }
402
403 unlock_memory_hotplug();
404
405 return rc;
406}
407EXPORT_SYMBOL_GPL(restore_online_page_callback);
408
409void __online_page_set_limits(struct page *page)
365{ 410{
366 unsigned long pfn = page_to_pfn(page); 411 unsigned long pfn = page_to_pfn(page);
367 412
368 totalram_pages++;
369 if (pfn >= num_physpages) 413 if (pfn >= num_physpages)
370 num_physpages = pfn + 1; 414 num_physpages = pfn + 1;
415}
416EXPORT_SYMBOL_GPL(__online_page_set_limits);
417
418void __online_page_increment_counters(struct page *page)
419{
420 totalram_pages++;
371 421
372#ifdef CONFIG_HIGHMEM 422#ifdef CONFIG_HIGHMEM
373 if (PageHighMem(page)) 423 if (PageHighMem(page))
374 totalhigh_pages++; 424 totalhigh_pages++;
375#endif 425#endif
426}
427EXPORT_SYMBOL_GPL(__online_page_increment_counters);
376 428
429void __online_page_free(struct page *page)
430{
377 ClearPageReserved(page); 431 ClearPageReserved(page);
378 init_page_count(page); 432 init_page_count(page);
379 __free_page(page); 433 __free_page(page);
380} 434}
435EXPORT_SYMBOL_GPL(__online_page_free);
436
437static void generic_online_page(struct page *page)
438{
439 __online_page_set_limits(page);
440 __online_page_increment_counters(page);
441 __online_page_free(page);
442}
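
A hedged sketch of how a driver could use the callback registration added above; the driver and its my_driver_wants_page() policy helper are invented for illustration, and the prototypes are assumed to be exported via <linux/memory_hotplug.h>.

#include <linux/init.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/module.h>

/* Hypothetical policy hook: should the driver keep this page? */
static bool my_driver_wants_page(struct page *page)
{
	return false;
}

static void my_online_page(struct page *page)
{
	__online_page_set_limits(page);

	/* Keep the page for the driver instead of handing it to the buddy. */
	if (my_driver_wants_page(page))
		return;

	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int __init my_driver_init(void)
{
	return set_online_page_callback(&my_online_page);
}

static void __exit my_driver_exit(void)
{
	restore_online_page_callback(&my_online_page);
}

module_init(my_driver_init);
module_exit(my_driver_exit);
MODULE_LICENSE("GPL");
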
381 443
382static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 444static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
383 void *arg) 445 void *arg)
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
388 if (PageReserved(pfn_to_page(start_pfn))) 450 if (PageReserved(pfn_to_page(start_pfn)))
389 for (i = 0; i < nr_pages; i++) { 451 for (i = 0; i < nr_pages; i++) {
390 page = pfn_to_page(start_pfn + i); 452 page = pfn_to_page(start_pfn + i);
391 online_page(page); 453 (*online_page_callback)(page);
392 onlined_pages++; 454 onlined_pages++;
393 } 455 }
394 *(unsigned long *)arg = onlined_pages; 456 *(unsigned long *)arg = onlined_pages;
@@ -494,6 +556,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
494 /* init node's zones as empty zones, we don't have any present pages.*/ 556 /* init node's zones as empty zones, we don't have any present pages.*/
495 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 557 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
496 558
559 /*
560 * The node we allocated has no zone fallback lists. For avoiding
561 * to access not-initialized zonelist, build here.
562 */
563 mutex_lock(&zonelists_mutex);
564 build_all_zonelists(NULL);
565 mutex_unlock(&zonelists_mutex);
566
497 return pgdat; 567 return pgdat;
498} 568}
499 569
@@ -515,7 +585,7 @@ int mem_online_node(int nid)
515 585
516 lock_memory_hotplug(); 586 lock_memory_hotplug();
517 pgdat = hotadd_new_pgdat(nid, 0); 587 pgdat = hotadd_new_pgdat(nid, 0);
518 if (pgdat) { 588 if (!pgdat) {
519 ret = -ENOMEM; 589 ret = -ENOMEM;
520 goto out; 590 goto out;
521 } 591 }
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983d..666e4e677414 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
288 */ 288 */
289 __dec_zone_page_state(page, NR_FILE_PAGES); 289 __dec_zone_page_state(page, NR_FILE_PAGES);
290 __inc_zone_page_state(newpage, NR_FILE_PAGES); 290 __inc_zone_page_state(newpage, NR_FILE_PAGES);
291 if (PageSwapBacked(page)) { 291 if (!PageSwapCache(page) && PageSwapBacked(page)) {
292 __dec_zone_page_state(page, NR_SHMEM); 292 __dec_zone_page_state(page, NR_SHMEM);
293 __inc_zone_page_state(newpage, NR_SHMEM); 293 __inc_zone_page_state(newpage, NR_SHMEM);
294 } 294 }
diff --git a/mm/mmap.c b/mm/mmap.c
index bbdc9af5e117..a65efd4db3e1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
122 return 0; 122 return 0;
123 123
124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
125 unsigned long n; 125 free = global_page_state(NR_FREE_PAGES);
126 free += global_page_state(NR_FILE_PAGES);
127
128 /*
129 * shmem pages shouldn't be counted as free in this
130 * case: they can't be purged, only swapped out, and
131 * that won't affect the overall amount of available
132 * memory in the system.
133 */
134 free -= global_page_state(NR_SHMEM);
126 135
127 free = global_page_state(NR_FILE_PAGES);
128 free += nr_swap_pages; 136 free += nr_swap_pages;
129 137
130 /* 138 /*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
136 free += global_page_state(NR_SLAB_RECLAIMABLE); 144 free += global_page_state(NR_SLAB_RECLAIMABLE);
137 145
138 /* 146 /*
139 * Leave the last 3% for root
140 */
141 if (!cap_sys_admin)
142 free -= free / 32;
143
144 if (free > pages)
145 return 0;
146
147 /*
148 * nr_free_pages() is very expensive on large systems,
149 * only call if we're about to fail.
150 */
151 n = nr_free_pages();
152
153 /*
154 * Leave reserved pages. The pages are not for anonymous pages. 147 * Leave reserved pages. The pages are not for anonymous pages.
155 */ 148 */
156 if (n <= totalreserve_pages) 149 if (free <= totalreserve_pages)
157 goto error; 150 goto error;
158 else 151 else
159 n -= totalreserve_pages; 152 free -= totalreserve_pages;
160 153
161 /* 154 /*
162 * Leave the last 3% for root 155 * Leave the last 3% for root
163 */ 156 */
164 if (!cap_sys_admin) 157 if (!cap_sys_admin)
165 n -= n / 32; 158 free -= free / 32;
166 free += n;
167 159
168 if (free > pages) 160 if (free > pages)
169 return 0; 161 return 0;
@@ -906,14 +898,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
906 if (anon_vma) 898 if (anon_vma)
907 return anon_vma; 899 return anon_vma;
908try_prev: 900try_prev:
909 /* 901 near = vma->vm_prev;
910 * It is potentially slow to have to call find_vma_prev here.
911 * But it's only on the first write fault on the vma, not
912 * every time, and we could devise a way to avoid it later
913 * (e.g. stash info in next's anon_vma_node when assigning
914 * an anon_vma, or when trying vma_merge). Another time.
915 */
916 BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
917 if (!near) 902 if (!near)
918 goto none; 903 goto none;
919 904
@@ -2044,9 +2029,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2044 return -EINVAL; 2029 return -EINVAL;
2045 2030
2046 /* Find the first overlapping VMA */ 2031 /* Find the first overlapping VMA */
2047 vma = find_vma_prev(mm, start, &prev); 2032 vma = find_vma(mm, start);
2048 if (!vma) 2033 if (!vma)
2049 return 0; 2034 return 0;
2035 prev = vma->vm_prev;
2050 /* we have start < vma->vm_end */ 2036 /* we have start < vma->vm_end */
2051 2037
2052 /* if it doesn't overlap, we have nothing.. */ 2038 /* if it doesn't overlap, we have nothing.. */
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a6..4358032566e9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 25#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
1087 * it's being traced - otherwise breakpoints set in it may interfere 1086 * it's being traced - otherwise breakpoints set in it may interfere
1088 * with another untraced process 1087 * with another untraced process
1089 */ 1088 */
1090 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) 1089 if ((flags & MAP_PRIVATE) && current->ptrace)
1091 vm_flags &= ~VM_MAYSHARE; 1090 vm_flags &= ~VM_MAYSHARE;
1092 1091
1093 return vm_flags; 1092 return vm_flags;
@@ -1813,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1813 return NULL; 1812 return NULL;
1814} 1813}
1815 1814
1816int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1815int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1817 unsigned long to, unsigned long size, pgprot_t prot) 1816 unsigned long pfn, unsigned long size, pgprot_t prot)
1818{ 1817{
1819 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; 1818 if (addr != (pfn << PAGE_SHIFT))
1819 return -EINVAL;
1820
1821 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1820 return 0; 1822 return 0;
1821} 1823}
1822EXPORT_SYMBOL(remap_pfn_range); 1824EXPORT_SYMBOL(remap_pfn_range);
@@ -1882,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1882 return 0; 1884 return 0;
1883 1885
1884 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1886 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1885 unsigned long n; 1887 free = global_page_state(NR_FREE_PAGES);
1888 free += global_page_state(NR_FILE_PAGES);
1889
1890 /*
1891 * shmem pages shouldn't be counted as free in this
1892 * case: they can't be purged, only swapped out, and
1893 * that won't affect the overall amount of available
1894 * memory in the system.
1895 */
1896 free -= global_page_state(NR_SHMEM);
1886 1897
1887 free = global_page_state(NR_FILE_PAGES);
1888 free += nr_swap_pages; 1898 free += nr_swap_pages;
1889 1899
1890 /* 1900 /*
@@ -1896,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1896 free += global_page_state(NR_SLAB_RECLAIMABLE); 1906 free += global_page_state(NR_SLAB_RECLAIMABLE);
1897 1907
1898 /* 1908 /*
1899 * Leave the last 3% for root
1900 */
1901 if (!cap_sys_admin)
1902 free -= free / 32;
1903
1904 if (free > pages)
1905 return 0;
1906
1907 /*
1908 * nr_free_pages() is very expensive on large systems,
1909 * only call if we're about to fail.
1910 */
1911 n = nr_free_pages();
1912
1913 /*
1914 * Leave reserved pages. The pages are not for anonymous pages. 1909 * Leave reserved pages. The pages are not for anonymous pages.
1915 */ 1910 */
1916 if (n <= totalreserve_pages) 1911 if (free <= totalreserve_pages)
1917 goto error; 1912 goto error;
1918 else 1913 else
1919 n -= totalreserve_pages; 1914 free -= totalreserve_pages;
1920 1915
1921 /* 1916 /*
1922 * Leave the last 3% for root 1917 * Leave the last 3% for root
1923 */ 1918 */
1924 if (!cap_sys_admin) 1919 if (!cap_sys_admin)
1925 n -= n / 32; 1920 free -= free / 32;
1926 free += n;
1927 1921
1928 if (free > pages) 1922 if (free > pages)
1929 return 0; 1923 return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..eafff89b3dd6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
339 * then wait for it to finish before killing 339 * then wait for it to finish before killing
340 * some other task unnecessarily. 340 * some other task unnecessarily.
341 */ 341 */
342 if (!(task_ptrace(p->group_leader) & 342 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
343 PT_TRACE_EXIT))
344 return ERR_PTR(-1UL); 343 return ERR_PTR(-1UL);
345 } 344 }
346 } 345 }
@@ -488,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
488 487
489 /* 488 /*
490 * If any of p's children has a different mm and is eligible for kill, 489 * If any of p's children has a different mm and is eligible for kill,
491 * the one with the highest badness() score is sacrificed for its 490 * the one with the highest oom_badness() score is sacrificed for its
492 * parent. This attempts to lose the minimal amount of work done while 491 * parent. This attempts to lose the minimal amount of work done while
493 * still freeing memory. 492 * still freeing memory.
494 */ 493 */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1d781803e629..d1960744f881 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1347,7 +1347,6 @@ EXPORT_SYMBOL(account_page_dirtied);
1347void account_page_writeback(struct page *page) 1347void account_page_writeback(struct page *page)
1348{ 1348{
1349 inc_zone_page_state(page, NR_WRITEBACK); 1349 inc_zone_page_state(page, NR_WRITEBACK);
1350 inc_zone_page_state(page, NR_WRITTEN);
1351} 1350}
1352EXPORT_SYMBOL(account_page_writeback); 1351EXPORT_SYMBOL(account_page_writeback);
1353 1352
@@ -1564,8 +1563,10 @@ int test_clear_page_writeback(struct page *page)
1564 } else { 1563 } else {
1565 ret = TestClearPageWriteback(page); 1564 ret = TestClearPageWriteback(page);
1566 } 1565 }
1567 if (ret) 1566 if (ret) {
1568 dec_zone_page_state(page, NR_WRITEBACK); 1567 dec_zone_page_state(page, NR_WRITEBACK);
1568 inc_zone_page_state(page, NR_WRITTEN);
1569 }
1569 return ret; 1570 return ret;
1570} 1571}
1571 1572
@@ -1611,10 +1612,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
1611 */ 1612 */
1612int mapping_tagged(struct address_space *mapping, int tag) 1613int mapping_tagged(struct address_space *mapping, int tag)
1613{ 1614{
1614 int ret; 1615 return radix_tree_tagged(&mapping->page_tree, tag);
1615 rcu_read_lock();
1616 ret = radix_tree_tagged(&mapping->page_tree, tag);
1617 rcu_read_unlock();
1618 return ret;
1619} 1616}
1620EXPORT_SYMBOL(mapping_tagged); 1617EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..094472377d81 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1616 set_bit(i, zlc->fullzones); 1616 set_bit(i, zlc->fullzones);
1617} 1617}
1618 1618
1619/*
1620 * clear all zones full, called after direct reclaim makes progress so that
1621 * a zone that was recently full is not skipped over for up to a second
1622 */
1623static void zlc_clear_zones_full(struct zonelist *zonelist)
1624{
1625 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1626
1627 zlc = zonelist->zlcache_ptr;
1628 if (!zlc)
1629 return;
1630
1631 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1632}
1633
1619#else /* CONFIG_NUMA */ 1634#else /* CONFIG_NUMA */
1620 1635
1621static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1636static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1632static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1647static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1633{ 1648{
1634} 1649}
1650
1651static void zlc_clear_zones_full(struct zonelist *zonelist)
1652{
1653}
1635#endif /* CONFIG_NUMA */ 1654#endif /* CONFIG_NUMA */
1636 1655
1637/* 1656/*
@@ -1664,7 +1683,7 @@ zonelist_scan:
1664 continue; 1683 continue;
1665 if ((alloc_flags & ALLOC_CPUSET) && 1684 if ((alloc_flags & ALLOC_CPUSET) &&
1666 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1685 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1667 goto try_next_zone; 1686 continue;
1668 1687
1669 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1688 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1670 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1689 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1695,36 @@ zonelist_scan:
1676 classzone_idx, alloc_flags)) 1695 classzone_idx, alloc_flags))
1677 goto try_this_zone; 1696 goto try_this_zone;
1678 1697
1698 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1699 /*
1700 * we do zlc_setup if there are multiple nodes
1701 * and before considering the first zone allowed
1702 * by the cpuset.
1703 */
1704 allowednodes = zlc_setup(zonelist, alloc_flags);
1705 zlc_active = 1;
1706 did_zlc_setup = 1;
1707 }
1708
1679 if (zone_reclaim_mode == 0) 1709 if (zone_reclaim_mode == 0)
1680 goto this_zone_full; 1710 goto this_zone_full;
1681 1711
1712 /*
1713 * As we may have just activated ZLC, check if the first
1714 * eligible zone has failed zone_reclaim recently.
1715 */
1716 if (NUMA_BUILD && zlc_active &&
1717 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1718 continue;
1719
1682 ret = zone_reclaim(zone, gfp_mask, order); 1720 ret = zone_reclaim(zone, gfp_mask, order);
1683 switch (ret) { 1721 switch (ret) {
1684 case ZONE_RECLAIM_NOSCAN: 1722 case ZONE_RECLAIM_NOSCAN:
1685 /* did not scan */ 1723 /* did not scan */
1686 goto try_next_zone; 1724 continue;
1687 case ZONE_RECLAIM_FULL: 1725 case ZONE_RECLAIM_FULL:
1688 /* scanned but unreclaimable */ 1726 /* scanned but unreclaimable */
1689 goto this_zone_full; 1727 continue;
1690 default: 1728 default:
1691 /* did we reclaim enough */ 1729 /* did we reclaim enough */
1692 if (!zone_watermark_ok(zone, order, mark, 1730 if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1741,6 @@ try_this_zone:
1703this_zone_full: 1741this_zone_full:
1704 if (NUMA_BUILD) 1742 if (NUMA_BUILD)
1705 zlc_mark_zone_full(zonelist, z); 1743 zlc_mark_zone_full(zonelist, z);
1706try_next_zone:
1707 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1708 /*
1709 * we do zlc_setup after the first zone is tried but only
1710 * if there are multiple nodes make it worthwhile
1711 */
1712 allowednodes = zlc_setup(zonelist, alloc_flags);
1713 zlc_active = 1;
1714 did_zlc_setup = 1;
1715 }
1716 } 1744 }
1717 1745
1718 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1746 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1954 if (unlikely(!(*did_some_progress))) 1982 if (unlikely(!(*did_some_progress)))
1955 return NULL; 1983 return NULL;
1956 1984
1985 /* After successful reclaim, reconsider all zones for allocation */
1986 if (NUMA_BUILD)
1987 zlc_clear_zones_full(zonelist);
1988
1957retry: 1989retry:
1958 page = get_page_from_freelist(gfp_mask, nodemask, order, 1990 page = get_page_from_freelist(gfp_mask, nodemask, order,
1959 zonelist, high_zoneidx, 1991 zonelist, high_zoneidx,
@@ -4585,6 +4617,60 @@ void __init sort_node_map(void)
4585 cmp_node_active_region, NULL); 4617 cmp_node_active_region, NULL);
4586} 4618}
4587 4619
4620/**
4621 * node_map_pfn_alignment - determine the maximum internode alignment
4622 *
4623 * This function should be called after node map is populated and sorted.
4624 * It calculates the maximum power of two alignment which can distinguish
4625 * all the nodes.
4626 *
4627 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4628 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4629 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4630 * shifted, 1GiB is enough and this function will indicate so.
4631 *
4632 * This is used to test whether pfn -> nid mapping of the chosen memory
4633 * model has fine enough granularity to avoid incorrect mapping for the
4634 * populated node map.
4635 *
4636 * Returns the determined alignment in pfn's. 0 if there is no alignment
4637 * requirement (single node).
4638 */
4639unsigned long __init node_map_pfn_alignment(void)
4640{
4641 unsigned long accl_mask = 0, last_end = 0;
4642 int last_nid = -1;
4643 int i;
4644
4645 for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
4646 int nid = early_node_map[i].nid;
4647 unsigned long start = early_node_map[i].start_pfn;
4648 unsigned long end = early_node_map[i].end_pfn;
4649 unsigned long mask;
4650
4651 if (!start || last_nid < 0 || last_nid == nid) {
4652 last_nid = nid;
4653 last_end = end;
4654 continue;
4655 }
4656
4657 /*
4658 * Start with a mask granular enough to pin-point to the
4659 * start pfn and tick off bits one-by-one until it becomes
4660 * too coarse to separate the current node from the last.
4661 */
4662 mask = ~((1 << __ffs(start)) - 1);
4663 while (mask && last_end <= (start & (mask << 1)))
4664 mask <<= 1;
4665
4666 /* accumulate all internode masks */
4667 accl_mask |= mask;
4668 }
4669
4670 /* convert mask to number of pages */
4671 return ~accl_mask + 1;
4672}
4673
4588/* Find the lowest pfn for a node */ 4674/* Find the lowest pfn for a node */
4589static unsigned long __init find_min_pfn_for_node(int nid) 4675static unsigned long __init find_min_pfn_for_node(int nid)
4590{ 4676{
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1be..39d216d535ea 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
162} 162}
163#endif 163#endif
164 164
165static int __meminit init_section_page_cgroup(unsigned long pfn) 165static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
166{ 166{
167 struct page_cgroup *base, *pc; 167 struct page_cgroup *base, *pc;
168 struct mem_section *section; 168 struct mem_section *section;
169 unsigned long table_size; 169 unsigned long table_size;
170 unsigned long nr; 170 unsigned long nr;
171 int nid, index; 171 int index;
172 172
173 nr = pfn_to_section_nr(pfn); 173 nr = pfn_to_section_nr(pfn);
174 section = __nr_to_section(nr); 174 section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
176 if (section->page_cgroup) 176 if (section->page_cgroup)
177 return 0; 177 return 0;
178 178
179 nid = page_to_nid(pfn_to_page(pfn));
180 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 179 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
181 base = alloc_page_cgroup(table_size, nid); 180 base = alloc_page_cgroup(table_size, nid);
182 181
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
196 pc = base + index; 195 pc = base + index;
197 init_page_cgroup(pc, nr); 196 init_page_cgroup(pc, nr);
198 } 197 }
199 198 /*
199 * The passed "pfn" may not be aligned to a SECTION boundary, so apply
200 * a mask before the calculation.
201 */
202 pfn &= PAGE_SECTION_MASK;
200 section->page_cgroup = base - pfn; 203 section->page_cgroup = base - pfn;
201 total_usage += table_size; 204 total_usage += table_size;
202 return 0; 205 return 0;
@@ -222,13 +225,23 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
222 unsigned long start, end, pfn; 225 unsigned long start, end, pfn;
223 int fail = 0; 226 int fail = 0;
224 227
225 start = start_pfn & ~(PAGES_PER_SECTION - 1); 228 start = SECTION_ALIGN_DOWN(start_pfn);
226 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 229 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
230
231 if (nid == -1) {
232 /*
233 * In this case, "nid" already exists and contains valid memory.
234 * The "start_pfn" passed to us is a pfn which is an argument to
235 * online_pages(), and start_pfn should exist.
236 */
237 nid = pfn_to_nid(start_pfn);
238 VM_BUG_ON(!node_state(nid, N_ONLINE));
239 }
227 240
228 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { 241 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
229 if (!pfn_present(pfn)) 242 if (!pfn_present(pfn))
230 continue; 243 continue;
231 fail = init_section_page_cgroup(pfn); 244 fail = init_section_page_cgroup(pfn, nid);
232 } 245 }
233 if (!fail) 246 if (!fail)
234 return 0; 247 return 0;
@@ -245,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
245{ 258{
246 unsigned long start, end, pfn; 259 unsigned long start, end, pfn;
247 260
248 start = start_pfn & ~(PAGES_PER_SECTION - 1); 261 start = SECTION_ALIGN_DOWN(start_pfn);
249 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 262 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
250 263
251 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) 264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
252 __free_page_cgroup(pfn); 265 __free_page_cgroup(pfn);
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
284void __init page_cgroup_init(void) 297void __init page_cgroup_init(void)
285{ 298{
286 unsigned long pfn; 299 unsigned long pfn;
287 int fail = 0; 300 int nid;
288 301
289 if (mem_cgroup_disabled()) 302 if (mem_cgroup_disabled())
290 return; 303 return;
291 304
292 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { 305 for_each_node_state(nid, N_HIGH_MEMORY) {
293 if (!pfn_present(pfn)) 306 unsigned long start_pfn, end_pfn;
294 continue; 307
295 fail = init_section_page_cgroup(pfn); 308 start_pfn = node_start_pfn(nid);
296 } 309 end_pfn = node_end_pfn(nid);
297 if (fail) { 310 /*
298 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); 311 * start_pfn and end_pfn may not be aligned to SECTION and the
299 panic("Out of memory"); 312 * page->flags of out of node pages are not initialized. So we
300 } else { 313 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
301 hotplug_memory_notifier(page_cgroup_callback, 0); 314 */
315 for (pfn = start_pfn;
316 pfn < end_pfn;
317 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
318
319 if (!pfn_valid(pfn))
320 continue;
321 /*
322 * Nodes' pfns can overlap.
323 * We know some arches can have a node layout such as
324 * -------------pfn-------------->
325 * N0 | N1 | N2 | N0 | N1 | N2|....
326 */
327 if (pfn_to_nid(pfn) != nid)
328 continue;
329 if (init_section_page_cgroup(pfn, nid))
330 goto oom;
331 }
302 } 332 }
333 hotplug_memory_notifier(page_cgroup_callback, 0);
303 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 334 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
304 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" 335 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
305 " want memory cgroups\n"); 336 "don't want memory cgroups\n");
337 return;
338oom:
339 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
340 panic("Out of memory");
306} 341}
307 342
308void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 343void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -502,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
502nomem: 537nomem:
503 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
504 printk(KERN_INFO 539 printk(KERN_INFO
505 "swap_cgroup can be disabled by noswapaccount boot option\n"); 540 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
506 return -ENOMEM; 541 return -ENOMEM;
507} 542}
508 543
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d533611..2f5cf10ff660 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
126 126
127 return 0; 127 return 0;
128} 128}
129#endif 129
130static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
131{
132 struct vm_area_struct *vma;
133
134 /* We don't need vma lookup at all. */
135 if (!walk->hugetlb_entry)
136 return NULL;
137
138 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
139 vma = find_vma(walk->mm, addr);
140 if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
141 return vma;
142
143 return NULL;
144}
145
146#else /* CONFIG_HUGETLB_PAGE */
147static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
148{
149 return NULL;
150}
151
152static int walk_hugetlb_range(struct vm_area_struct *vma,
153 unsigned long addr, unsigned long end,
154 struct mm_walk *walk)
155{
156 return 0;
157}
158
159#endif /* CONFIG_HUGETLB_PAGE */
160
161
130 162
131/** 163/**
132 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
144 * associated range, and a copy of the original mm_walk for access to 176 * associated range, and a copy of the original mm_walk for access to
145 * the ->private or ->mm fields. 177 * the ->private or ->mm fields.
146 * 178 *
147 * No locks are taken, but the bottom level iterator will map PTE 179 * Usually no locks are taken, but splitting a transparent huge page may
180 * take the page table lock. And the bottom level iterator will map PTE
148 * directories from highmem if necessary. 181 * directories from highmem if necessary.
149 * 182 *
150 * If any callback returns a non-zero value, the walk is aborted and 183 * If any callback returns a non-zero value, the walk is aborted and
151 * the return value is propagated back to the caller. Otherwise 0 is returned. 184 * the return value is propagated back to the caller. Otherwise 0 is returned.
185 *
186 * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
187 * is !NULL.
152 */ 188 */
153int walk_page_range(unsigned long addr, unsigned long end, 189int walk_page_range(unsigned long addr, unsigned long end,
154 struct mm_walk *walk) 190 struct mm_walk *walk)
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
165 201
166 pgd = pgd_offset(walk->mm, addr); 202 pgd = pgd_offset(walk->mm, addr);
167 do { 203 do {
168 struct vm_area_struct *uninitialized_var(vma); 204 struct vm_area_struct *vma;
169 205
170 next = pgd_addr_end(addr, end); 206 next = pgd_addr_end(addr, end);
171 207
172#ifdef CONFIG_HUGETLB_PAGE
173 /* 208 /*
174 * handle hugetlb vma individually because pagetable walk for 209 * handle hugetlb vma individually because pagetable walk for
175 * the hugetlb page is dependent on the architecture and 210 * the hugetlb page is dependent on the architecture and
176 * we can't handle it in the same manner as non-huge pages. 211 * we can't handle it in the same manner as non-huge pages.
177 */ 212 */
178 vma = find_vma(walk->mm, addr); 213 vma = hugetlb_vma(addr, walk);
179 if (vma && is_vm_hugetlb_page(vma)) { 214 if (vma) {
180 if (vma->vm_end < next) 215 if (vma->vm_end < next)
181 next = vma->vm_end; 216 next = vma->vm_end;
182 /* 217 /*
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
189 pgd = pgd_offset(walk->mm, next); 224 pgd = pgd_offset(walk->mm, next);
190 continue; 225 continue;
191 } 226 }
192#endif 227
193 if (pgd_none_or_clear_bad(pgd)) { 228 if (pgd_none_or_clear_bad(pgd)) {
194 if (walk->pte_hole) 229 if (walk->pte_hole)
195 err = walk->pte_hole(addr, next, walk); 230 err = walk->pte_hole(addr, next, walk);
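
For context, a hedged sketch of a walk_page_range() caller honouring the locking rule the patch documents (mmap_sem held for read); the counting callback and both function names are invented.

#include <linux/mm.h>
#include <linux/rwsem.h>

/* Count present ptes in [start, end); returning non-zero would abort the walk. */
static int count_pte_entry(pte_t *pte, unsigned long addr,
			   unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr_present = walk->private;

	if (pte_present(*pte))
		(*nr_present)++;
	return 0;
}

static unsigned long count_present_ptes(struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	unsigned long nr_present = 0;
	struct mm_walk walk = {
		.pte_entry = count_pte_entry,
		.mm        = mm,
		.private   = &nr_present,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);

	return nr_present;
}
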
diff --git a/mm/rmap.c b/mm/rmap.c
index d04e36a7cc9f..8005080fb9e3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 24 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
@@ -38,9 +37,8 @@
38 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
39 * within bdi.wb->list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
40 * 39 *
41 * (code doesn't rely on that order so it could be switched around) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
42 * ->tasklist_lock 41 * ->tasklist_lock
43 * anon_vma->mutex (memory_failure, collect_procs_anon)
44 * pte map lock 42 * pte map lock
45 */ 43 */
46 44
@@ -112,9 +110,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
112 kmem_cache_free(anon_vma_cachep, anon_vma); 110 kmem_cache_free(anon_vma_cachep, anon_vma);
113} 111}
114 112
115static inline struct anon_vma_chain *anon_vma_chain_alloc(void) 113static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
116{ 114{
117 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 115 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
118} 116}
119 117
120static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 118static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +157,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
159 struct mm_struct *mm = vma->vm_mm; 157 struct mm_struct *mm = vma->vm_mm;
160 struct anon_vma *allocated; 158 struct anon_vma *allocated;
161 159
162 avc = anon_vma_chain_alloc(); 160 avc = anon_vma_chain_alloc(GFP_KERNEL);
163 if (!avc) 161 if (!avc)
164 goto out_enomem; 162 goto out_enomem;
165 163
@@ -200,6 +198,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
200 return -ENOMEM; 198 return -ENOMEM;
201} 199}
202 200
201/*
202 * This is a useful helper function for locking the anon_vma root as
203 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
204 * have the same vma.
205 *
206 * Such anon_vma's should have the same root, so you'd expect to see
207 * just a single mutex_lock for the whole traversal.
208 */
209static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
210{
211 struct anon_vma *new_root = anon_vma->root;
212 if (new_root != root) {
213 if (WARN_ON_ONCE(root))
214 mutex_unlock(&root->mutex);
215 root = new_root;
216 mutex_lock(&root->mutex);
217 }
218 return root;
219}
220
221static inline void unlock_anon_vma_root(struct anon_vma *root)
222{
223 if (root)
224 mutex_unlock(&root->mutex);
225}
226
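
The batching these helpers enable follows the pattern sketched below; this is not from the patch, it only condenses what anon_vma_clone() and unlink_anon_vmas() do further down.

/* Inside rmap.c: visit every anon_vma on a vma's chain under one root lock. */
static void for_each_anon_vma_locked(struct vm_area_struct *vma)
{
	struct anon_vma_chain *avc;
	struct anon_vma *root = NULL;

	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) {
		/* Re-locks only when the root changes, which it normally never does. */
		root = lock_anon_vma_root(root, avc->anon_vma);
		/* ... operate on avc->anon_vma while root->mutex is held ... */
	}
	unlock_anon_vma_root(root);
}
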
203static void anon_vma_chain_link(struct vm_area_struct *vma, 227static void anon_vma_chain_link(struct vm_area_struct *vma,
204 struct anon_vma_chain *avc, 228 struct anon_vma_chain *avc,
205 struct anon_vma *anon_vma) 229 struct anon_vma *anon_vma)
@@ -208,13 +232,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
208 avc->anon_vma = anon_vma; 232 avc->anon_vma = anon_vma;
209 list_add(&avc->same_vma, &vma->anon_vma_chain); 233 list_add(&avc->same_vma, &vma->anon_vma_chain);
210 234
211 anon_vma_lock(anon_vma);
212 /* 235 /*
213 * It's critical to add new vmas to the tail of the anon_vma, 236 * It's critical to add new vmas to the tail of the anon_vma,
214 * see comment in huge_memory.c:__split_huge_page(). 237 * see comment in huge_memory.c:__split_huge_page().
215 */ 238 */
216 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
217 anon_vma_unlock(anon_vma);
218} 240}
219 241
220/* 242/*
@@ -224,13 +246,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
224int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 246int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
225{ 247{
226 struct anon_vma_chain *avc, *pavc; 248 struct anon_vma_chain *avc, *pavc;
249 struct anon_vma *root = NULL;
227 250
228 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 251 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
229 avc = anon_vma_chain_alloc(); 252 struct anon_vma *anon_vma;
230 if (!avc) 253
231 goto enomem_failure; 254 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
232 anon_vma_chain_link(dst, avc, pavc->anon_vma); 255 if (unlikely(!avc)) {
256 unlock_anon_vma_root(root);
257 root = NULL;
258 avc = anon_vma_chain_alloc(GFP_KERNEL);
259 if (!avc)
260 goto enomem_failure;
261 }
262 anon_vma = pavc->anon_vma;
263 root = lock_anon_vma_root(root, anon_vma);
264 anon_vma_chain_link(dst, avc, anon_vma);
233 } 265 }
266 unlock_anon_vma_root(root);
234 return 0; 267 return 0;
235 268
236 enomem_failure: 269 enomem_failure:
@@ -263,7 +296,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
263 anon_vma = anon_vma_alloc(); 296 anon_vma = anon_vma_alloc();
264 if (!anon_vma) 297 if (!anon_vma)
265 goto out_error; 298 goto out_error;
266 avc = anon_vma_chain_alloc(); 299 avc = anon_vma_chain_alloc(GFP_KERNEL);
267 if (!avc) 300 if (!avc)
268 goto out_error_free_anon_vma; 301 goto out_error_free_anon_vma;
269 302
@@ -280,7 +313,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
280 get_anon_vma(anon_vma->root); 313 get_anon_vma(anon_vma->root);
281 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 314 /* Mark this anon_vma as the one where our new (COWed) pages go. */
282 vma->anon_vma = anon_vma; 315 vma->anon_vma = anon_vma;
316 anon_vma_lock(anon_vma);
283 anon_vma_chain_link(vma, avc, anon_vma); 317 anon_vma_chain_link(vma, avc, anon_vma);
318 anon_vma_unlock(anon_vma);
284 319
285 return 0; 320 return 0;
286 321
@@ -291,36 +326,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
291 return -ENOMEM; 326 return -ENOMEM;
292} 327}
293 328
294static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
295{
296 struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
297 int empty;
298
299 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
300 if (!anon_vma)
301 return;
302
303 anon_vma_lock(anon_vma);
304 list_del(&anon_vma_chain->same_anon_vma);
305
306 /* We must garbage collect the anon_vma if it's empty */
307 empty = list_empty(&anon_vma->head);
308 anon_vma_unlock(anon_vma);
309
310 if (empty)
311 put_anon_vma(anon_vma);
312}
313
314void unlink_anon_vmas(struct vm_area_struct *vma) 329void unlink_anon_vmas(struct vm_area_struct *vma)
315{ 330{
316 struct anon_vma_chain *avc, *next; 331 struct anon_vma_chain *avc, *next;
332 struct anon_vma *root = NULL;
317 333
318 /* 334 /*
319 * Unlink each anon_vma chained to the VMA. This list is ordered 335 * Unlink each anon_vma chained to the VMA. This list is ordered
320 * from newest to oldest, ensuring the root anon_vma gets freed last. 336 * from newest to oldest, ensuring the root anon_vma gets freed last.
321 */ 337 */
322 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 338 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
323 anon_vma_unlink(avc); 339 struct anon_vma *anon_vma = avc->anon_vma;
340
341 root = lock_anon_vma_root(root, anon_vma);
342 list_del(&avc->same_anon_vma);
343
344 /*
345 * Leave empty anon_vmas on the list - we'll need
346 * to free them outside the lock.
347 */
348 if (list_empty(&anon_vma->head))
349 continue;
350
351 list_del(&avc->same_vma);
352 anon_vma_chain_free(avc);
353 }
354 unlock_anon_vma_root(root);
355
356 /*
357 * Iterate the list once more, it now only contains empty and unlinked
358 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
359 * needing to acquire the anon_vma->root->mutex.
360 */
361 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
362 struct anon_vma *anon_vma = avc->anon_vma;
363
364 put_anon_vma(anon_vma);
365
324 list_del(&avc->same_vma); 366 list_del(&avc->same_vma);
325 anon_vma_chain_free(avc); 367 anon_vma_chain_free(avc);
326 } 368 }
@@ -827,11 +869,11 @@ int page_referenced(struct page *page,
827 vm_flags); 869 vm_flags);
828 if (we_locked) 870 if (we_locked)
829 unlock_page(page); 871 unlock_page(page);
872
873 if (page_test_and_clear_young(page_to_pfn(page)))
874 referenced++;
830 } 875 }
831out: 876out:
832 if (page_test_and_clear_young(page_to_pfn(page)))
833 referenced++;
834
835 return referenced; 877 return referenced;
836} 878}
837 879
diff --git a/mm/shmem.c b/mm/shmem.c
index d221a1cfd7b1..5cc21f8b4cd3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/splice.h>
54#include <linux/security.h> 55#include <linux/security.h>
55#include <linux/swapops.h> 56#include <linux/swapops.h>
56#include <linux/mempolicy.h> 57#include <linux/mempolicy.h>
@@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void)
126} 127}
127#endif 128#endif
128 129
129static int shmem_getpage(struct inode *inode, unsigned long idx, 130static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 131 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
132
133static inline int shmem_getpage(struct inode *inode, pgoff_t index,
134 struct page **pagep, enum sgp_type sgp, int *fault_type)
135{
136 return shmem_getpage_gfp(inode, index, pagep, sgp,
137 mapping_gfp_mask(inode->i_mapping), fault_type);
138}
131 139
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) 140static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{ 141{
@@ -241,9 +249,7 @@ static void shmem_free_blocks(struct inode *inode, long pages)
241 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 249 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
242 if (sbinfo->max_blocks) { 250 if (sbinfo->max_blocks) {
243 percpu_counter_add(&sbinfo->used_blocks, -pages); 251 percpu_counter_add(&sbinfo->used_blocks, -pages);
244 spin_lock(&inode->i_lock);
245 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 252 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
246 spin_unlock(&inode->i_lock);
247 } 253 }
248} 254}
249 255
@@ -405,10 +411,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
405 * @info: info structure for the inode 411 * @info: info structure for the inode
406 * @index: index of the page to find 412 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation? 413 * @sgp: check and recheck i_size? skip allocation?
414 * @gfp: gfp mask to use for any page allocation
408 * 415 *
409 * If the entry does not exist, allocate it. 416 * If the entry does not exist, allocate it.
410 */ 417 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 418static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
419 unsigned long index, enum sgp_type sgp, gfp_t gfp)
412{ 420{
413 struct inode *inode = &info->vfs_inode; 421 struct inode *inode = &info->vfs_inode;
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 422 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -432,13 +440,11 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
432 sbinfo->max_blocks - 1) >= 0) 440 sbinfo->max_blocks - 1) >= 0)
433 return ERR_PTR(-ENOSPC); 441 return ERR_PTR(-ENOSPC);
434 percpu_counter_inc(&sbinfo->used_blocks); 442 percpu_counter_inc(&sbinfo->used_blocks);
435 spin_lock(&inode->i_lock);
436 inode->i_blocks += BLOCKS_PER_PAGE; 443 inode->i_blocks += BLOCKS_PER_PAGE;
437 spin_unlock(&inode->i_lock);
438 } 444 }
439 445
440 spin_unlock(&info->lock); 446 spin_unlock(&info->lock);
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 447 page = shmem_dir_alloc(gfp);
442 spin_lock(&info->lock); 448 spin_lock(&info->lock);
443 449
444 if (!page) { 450 if (!page) {
@@ -539,7 +545,7 @@ static void shmem_free_pages(struct list_head *next)
539 } while (next); 545 } while (next);
540} 546}
541 547
542static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 548void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
543{ 549{
544 struct shmem_inode_info *info = SHMEM_I(inode); 550 struct shmem_inode_info *info = SHMEM_I(inode);
545 unsigned long idx; 551 unsigned long idx;
@@ -562,6 +568,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
562 spinlock_t *punch_lock; 568 spinlock_t *punch_lock;
563 unsigned long upper_limit; 569 unsigned long upper_limit;
564 570
571 truncate_inode_pages_range(inode->i_mapping, start, end);
572
565 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 573 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
566 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 574 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
567 if (idx >= info->next_index) 575 if (idx >= info->next_index)
@@ -738,16 +746,8 @@ done2:
738 * lowered next_index. Also, though shmem_getpage checks 746 * lowered next_index. Also, though shmem_getpage checks
739 * i_size before adding to cache, no recheck after: so fix the 747 * i_size before adding to cache, no recheck after: so fix the
740 * narrow window there too. 748 * narrow window there too.
741 *
742 * Recalling truncate_inode_pages_range and unmap_mapping_range
743 * every time for punch_hole (which never got a chance to clear
744 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
745 * yet hardly ever necessary: try to optimize them out later.
746 */ 749 */
747 truncate_inode_pages_range(inode->i_mapping, start, end); 750 truncate_inode_pages_range(inode->i_mapping, start, end);
748 if (punch_hole)
749 unmap_mapping_range(inode->i_mapping, start,
750 end - start, 1);
751 } 751 }
752 752
753 spin_lock(&info->lock); 753 spin_lock(&info->lock);
@@ -766,22 +766,23 @@ done2:
766 shmem_free_pages(pages_to_free.next); 766 shmem_free_pages(pages_to_free.next);
767 } 767 }
768} 768}
769EXPORT_SYMBOL_GPL(shmem_truncate_range);
769 770
770static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 771static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
771{ 772{
772 struct inode *inode = dentry->d_inode; 773 struct inode *inode = dentry->d_inode;
773 loff_t newsize = attr->ia_size;
774 int error; 774 int error;
775 775
776 error = inode_change_ok(inode, attr); 776 error = inode_change_ok(inode, attr);
777 if (error) 777 if (error)
778 return error; 778 return error;
779 779
780 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) 780 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
781 && newsize != inode->i_size) { 781 loff_t oldsize = inode->i_size;
782 loff_t newsize = attr->ia_size;
782 struct page *page = NULL; 783 struct page *page = NULL;
783 784
784 if (newsize < inode->i_size) { 785 if (newsize < oldsize) {
785 /* 786 /*
786 * If truncating down to a partial page, then 787 * If truncating down to a partial page, then
787 * if that page is already allocated, hold it 788 * if that page is already allocated, hold it
@@ -810,12 +811,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
810 spin_unlock(&info->lock); 811 spin_unlock(&info->lock);
811 } 812 }
812 } 813 }
813 814 if (newsize != oldsize) {
814 /* XXX(truncate): truncate_setsize should be called last */ 815 i_size_write(inode, newsize);
815 truncate_setsize(inode, newsize); 816 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
817 }
818 if (newsize < oldsize) {
819 loff_t holebegin = round_up(newsize, PAGE_SIZE);
820 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
821 shmem_truncate_range(inode, newsize, (loff_t)-1);
822 /* unmap again to remove racily COWed private pages */
823 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
824 }
816 if (page) 825 if (page)
817 page_cache_release(page); 826 page_cache_release(page);
818 shmem_truncate_range(inode, newsize, (loff_t)-1);
819 } 827 }
820 828
821 setattr_copy(inode, attr); 829 setattr_copy(inode, attr);
@@ -832,7 +840,6 @@ static void shmem_evict_inode(struct inode *inode)
832 struct shmem_xattr *xattr, *nxattr; 840 struct shmem_xattr *xattr, *nxattr;
833 841
834 if (inode->i_mapping->a_ops == &shmem_aops) { 842 if (inode->i_mapping->a_ops == &shmem_aops) {
835 truncate_inode_pages(inode->i_mapping, 0);
836 shmem_unacct_size(info->flags, inode->i_size); 843 shmem_unacct_size(info->flags, inode->i_size);
837 inode->i_size = 0; 844 inode->i_size = 0;
838 shmem_truncate_range(inode, 0, (loff_t)-1); 845 shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -965,20 +972,7 @@ found:
965 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 972 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
966 /* which does mem_cgroup_uncharge_cache_page on error */ 973 /* which does mem_cgroup_uncharge_cache_page on error */
967 974
968 if (error == -EEXIST) { 975 if (error != -ENOMEM) {
969 struct page *filepage = find_get_page(mapping, idx);
970 error = 1;
971 if (filepage) {
972 /*
973 * There might be a more uptodate page coming down
974 * from a stacked writepage: forget our swappage if so.
975 */
976 if (PageUptodate(filepage))
977 error = 0;
978 page_cache_release(filepage);
979 }
980 }
981 if (!error) {
982 delete_from_swap_cache(page); 976 delete_from_swap_cache(page);
983 set_page_dirty(page); 977 set_page_dirty(page);
984 info->flags |= SHMEM_PAGEIN; 978 info->flags |= SHMEM_PAGEIN;
@@ -1065,16 +1059,17 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1065 /* 1059 /*
1066 * shmem_backing_dev_info's capabilities prevent regular writeback or 1060 * shmem_backing_dev_info's capabilities prevent regular writeback or
1067 * sync from ever calling shmem_writepage; but a stacking filesystem 1061 * sync from ever calling shmem_writepage; but a stacking filesystem
1068 * may use the ->writepage of its underlying filesystem, in which case 1062 * might use ->writepage of its underlying filesystem, in which case
1069 * tmpfs should write out to swap only in response to memory pressure, 1063 * tmpfs should write out to swap only in response to memory pressure,
1070 * and not for the writeback threads or sync. However, in those cases, 1064 * and not for the writeback threads or sync.
1071 * we do still want to check if there's a redundant swappage to be
1072 * discarded.
1073 */ 1065 */
1074 if (wbc->for_reclaim) 1066 if (!wbc->for_reclaim) {
1075 swap = get_swap_page(); 1067 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1076 else 1068 goto redirty;
1077 swap.val = 0; 1069 }
1070 swap = get_swap_page();
1071 if (!swap.val)
1072 goto redirty;
1078 1073
1079 /* 1074 /*
1080 * Add inode to shmem_unuse()'s list of swapped-out inodes, 1075 * Add inode to shmem_unuse()'s list of swapped-out inodes,
@@ -1085,15 +1080,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1085 * we've taken the spinlock, because shmem_unuse_inode() will 1080 * we've taken the spinlock, because shmem_unuse_inode() will
1086 * prune a !swapped inode from the swaplist under both locks. 1081 * prune a !swapped inode from the swaplist under both locks.
1087 */ 1082 */
1088 if (swap.val) { 1083 mutex_lock(&shmem_swaplist_mutex);
1089 mutex_lock(&shmem_swaplist_mutex); 1084 if (list_empty(&info->swaplist))
1090 if (list_empty(&info->swaplist)) 1085 list_add_tail(&info->swaplist, &shmem_swaplist);
1091 list_add_tail(&info->swaplist, &shmem_swaplist);
1092 }
1093 1086
1094 spin_lock(&info->lock); 1087 spin_lock(&info->lock);
1095 if (swap.val) 1088 mutex_unlock(&shmem_swaplist_mutex);
1096 mutex_unlock(&shmem_swaplist_mutex);
1097 1089
1098 if (index >= info->next_index) { 1090 if (index >= info->next_index) {
1099 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1091 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
@@ -1101,16 +1093,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1101 } 1093 }
1102 entry = shmem_swp_entry(info, index, NULL); 1094 entry = shmem_swp_entry(info, index, NULL);
1103 if (entry->val) { 1095 if (entry->val) {
1104 /* 1096 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1105 * The more uptodate page coming down from a stacked
1106 * writepage should replace our old swappage.
1107 */
1108 free_swap_and_cache(*entry); 1097 free_swap_and_cache(*entry);
1109 shmem_swp_set(info, entry, 0); 1098 shmem_swp_set(info, entry, 0);
1110 } 1099 }
1111 shmem_recalc_inode(inode); 1100 shmem_recalc_inode(inode);
1112 1101
1113 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1102 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1114 delete_from_page_cache(page); 1103 delete_from_page_cache(page);
1115 shmem_swp_set(info, entry, swap.val); 1104 shmem_swp_set(info, entry, swap.val);
1116 shmem_swp_unmap(entry); 1105 shmem_swp_unmap(entry);
@@ -1227,92 +1216,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1227#endif 1216#endif
1228 1217
1229/* 1218/*
1230 * shmem_getpage - either get the page from swap or allocate a new one 1219 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1231 * 1220 *
1232 * If we allocate a new one we do not mark it dirty. That's up to the 1221 * If we allocate a new one we do not mark it dirty. That's up to the
1233 * vm. If we swap it in we mark it dirty since we also free the swap 1222 * vm. If we swap it in we mark it dirty since we also free the swap
1234 * entry since a page cannot live in both the swap and page cache 1223 * entry since a page cannot live in both the swap and page cache
1235 */ 1224 */
1236static int shmem_getpage(struct inode *inode, unsigned long idx, 1225static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
1237 struct page **pagep, enum sgp_type sgp, int *type) 1226 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1238{ 1227{
1239 struct address_space *mapping = inode->i_mapping; 1228 struct address_space *mapping = inode->i_mapping;
1240 struct shmem_inode_info *info = SHMEM_I(inode); 1229 struct shmem_inode_info *info = SHMEM_I(inode);
1241 struct shmem_sb_info *sbinfo; 1230 struct shmem_sb_info *sbinfo;
1242 struct page *filepage = *pagep; 1231 struct page *page;
1243 struct page *swappage;
1244 struct page *prealloc_page = NULL; 1232 struct page *prealloc_page = NULL;
1245 swp_entry_t *entry; 1233 swp_entry_t *entry;
1246 swp_entry_t swap; 1234 swp_entry_t swap;
1247 gfp_t gfp;
1248 int error; 1235 int error;
1236 int ret;
1249 1237
1250 if (idx >= SHMEM_MAX_INDEX) 1238 if (idx >= SHMEM_MAX_INDEX)
1251 return -EFBIG; 1239 return -EFBIG;
1252
1253 if (type)
1254 *type = 0;
1255
1256 /*
1257 * Normally, filepage is NULL on entry, and either found
1258 * uptodate immediately, or allocated and zeroed, or read
1259 * in under swappage, which is then assigned to filepage.
1260 * But shmem_readpage (required for splice) passes in a locked
1261 * filepage, which may be found not uptodate by other callers
1262 * too, and may need to be copied from the swappage read in.
1263 */
1264repeat: 1240repeat:
1265 if (!filepage) 1241 page = find_lock_page(mapping, idx);
1266 filepage = find_lock_page(mapping, idx); 1242 if (page) {
1267 if (filepage && PageUptodate(filepage))
1268 goto done;
1269 gfp = mapping_gfp_mask(mapping);
1270 if (!filepage) {
1271 /* 1243 /*
1272 * Try to preload while we can wait, to not make a habit of 1244 * Once we can get the page lock, it must be uptodate:
1273 * draining atomic reserves; but don't latch on to this cpu. 1245 * if there were an error in reading back from swap,
1246 * the page would not be inserted into the filecache.
1274 */ 1247 */
1275 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 1248 BUG_ON(!PageUptodate(page));
1276 if (error) 1249 goto done;
1277 goto failed; 1250 }
1278 radix_tree_preload_end(); 1251
1279 if (sgp != SGP_READ && !prealloc_page) { 1252 /*
1280 /* We don't care if this fails */ 1253 * Try to preload while we can wait, to not make a habit of
1281 prealloc_page = shmem_alloc_page(gfp, info, idx); 1254 * draining atomic reserves; but don't latch on to this cpu.
1282 if (prealloc_page) { 1255 */
1283 if (mem_cgroup_cache_charge(prealloc_page, 1256 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1284 current->mm, GFP_KERNEL)) { 1257 if (error)
1285 page_cache_release(prealloc_page); 1258 goto out;
1286 prealloc_page = NULL; 1259 radix_tree_preload_end();
1287 } 1260
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 prealloc_page = shmem_alloc_page(gfp, info, idx);
1263 if (prealloc_page) {
1264 SetPageSwapBacked(prealloc_page);
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1288 } 1269 }
1289 } 1270 }
1290 } 1271 }
1291 error = 0;
1292 1272
1293 spin_lock(&info->lock); 1273 spin_lock(&info->lock);
1294 shmem_recalc_inode(inode); 1274 shmem_recalc_inode(inode);
1295 entry = shmem_swp_alloc(info, idx, sgp); 1275 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1296 if (IS_ERR(entry)) { 1276 if (IS_ERR(entry)) {
1297 spin_unlock(&info->lock); 1277 spin_unlock(&info->lock);
1298 error = PTR_ERR(entry); 1278 error = PTR_ERR(entry);
1299 goto failed; 1279 goto out;
1300 } 1280 }
1301 swap = *entry; 1281 swap = *entry;
1302 1282
1303 if (swap.val) { 1283 if (swap.val) {
1304 /* Look it up and read it in.. */ 1284 /* Look it up and read it in.. */
1305 swappage = lookup_swap_cache(swap); 1285 page = lookup_swap_cache(swap);
1306 if (!swappage) { 1286 if (!page) {
1307 shmem_swp_unmap(entry); 1287 shmem_swp_unmap(entry);
1308 spin_unlock(&info->lock); 1288 spin_unlock(&info->lock);
1309 /* here we actually do the io */ 1289 /* here we actually do the io */
1310 if (type) 1290 if (fault_type)
1311 *type |= VM_FAULT_MAJOR; 1291 *fault_type |= VM_FAULT_MAJOR;
1312 swappage = shmem_swapin(swap, gfp, info, idx); 1292 page = shmem_swapin(swap, gfp, info, idx);
1313 if (!swappage) { 1293 if (!page) {
1314 spin_lock(&info->lock); 1294 spin_lock(&info->lock);
1315 entry = shmem_swp_alloc(info, idx, sgp); 1295 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1316 if (IS_ERR(entry)) 1296 if (IS_ERR(entry))
1317 error = PTR_ERR(entry); 1297 error = PTR_ERR(entry);
1318 else { 1298 else {
@@ -1322,62 +1302,42 @@ repeat:
1322 } 1302 }
1323 spin_unlock(&info->lock); 1303 spin_unlock(&info->lock);
1324 if (error) 1304 if (error)
1325 goto failed; 1305 goto out;
1326 goto repeat; 1306 goto repeat;
1327 } 1307 }
1328 wait_on_page_locked(swappage); 1308 wait_on_page_locked(page);
1329 page_cache_release(swappage); 1309 page_cache_release(page);
1330 goto repeat; 1310 goto repeat;
1331 } 1311 }
1332 1312
1333 /* We have to do this with page locked to prevent races */ 1313 /* We have to do this with page locked to prevent races */
1334 if (!trylock_page(swappage)) { 1314 if (!trylock_page(page)) {
1335 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1336 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1337 wait_on_page_locked(swappage); 1317 wait_on_page_locked(page);
1338 page_cache_release(swappage); 1318 page_cache_release(page);
1339 goto repeat; 1319 goto repeat;
1340 } 1320 }
1341 if (PageWriteback(swappage)) { 1321 if (PageWriteback(page)) {
1342 shmem_swp_unmap(entry); 1322 shmem_swp_unmap(entry);
1343 spin_unlock(&info->lock); 1323 spin_unlock(&info->lock);
1344 wait_on_page_writeback(swappage); 1324 wait_on_page_writeback(page);
1345 unlock_page(swappage); 1325 unlock_page(page);
1346 page_cache_release(swappage); 1326 page_cache_release(page);
1347 goto repeat; 1327 goto repeat;
1348 } 1328 }
1349 if (!PageUptodate(swappage)) { 1329 if (!PageUptodate(page)) {
1350 shmem_swp_unmap(entry); 1330 shmem_swp_unmap(entry);
1351 spin_unlock(&info->lock); 1331 spin_unlock(&info->lock);
1352 unlock_page(swappage); 1332 unlock_page(page);
1353 page_cache_release(swappage); 1333 page_cache_release(page);
1354 error = -EIO; 1334 error = -EIO;
1355 goto failed; 1335 goto out;
1356 } 1336 }
1357 1337
1358 if (filepage) { 1338 error = add_to_page_cache_locked(page, mapping,
1359 shmem_swp_set(info, entry, 0); 1339 idx, GFP_NOWAIT);
1360 shmem_swp_unmap(entry); 1340 if (error) {
1361 delete_from_swap_cache(swappage);
1362 spin_unlock(&info->lock);
1363 copy_highpage(filepage, swappage);
1364 unlock_page(swappage);
1365 page_cache_release(swappage);
1366 flush_dcache_page(filepage);
1367 SetPageUptodate(filepage);
1368 set_page_dirty(filepage);
1369 swap_free(swap);
1370 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1371 idx, GFP_NOWAIT))) {
1372 info->flags |= SHMEM_PAGEIN;
1373 shmem_swp_set(info, entry, 0);
1374 shmem_swp_unmap(entry);
1375 delete_from_swap_cache(swappage);
1376 spin_unlock(&info->lock);
1377 filepage = swappage;
1378 set_page_dirty(filepage);
1379 swap_free(swap);
1380 } else {
1381 shmem_swp_unmap(entry); 1341 shmem_swp_unmap(entry);
1382 spin_unlock(&info->lock); 1342 spin_unlock(&info->lock);
1383 if (error == -ENOMEM) { 1343 if (error == -ENOMEM) {
@@ -1386,32 +1346,38 @@ repeat:
1386 * call memcg's OOM if needed. 1346 * call memcg's OOM if needed.
1387 */ 1347 */
1388 error = mem_cgroup_shmem_charge_fallback( 1348 error = mem_cgroup_shmem_charge_fallback(
1389 swappage, 1349 page, current->mm, gfp);
1390 current->mm,
1391 gfp);
1392 if (error) { 1350 if (error) {
1393 unlock_page(swappage); 1351 unlock_page(page);
1394 page_cache_release(swappage); 1352 page_cache_release(page);
1395 goto failed; 1353 goto out;
1396 } 1354 }
1397 } 1355 }
1398 unlock_page(swappage); 1356 unlock_page(page);
1399 page_cache_release(swappage); 1357 page_cache_release(page);
1400 goto repeat; 1358 goto repeat;
1401 } 1359 }
1402 } else if (sgp == SGP_READ && !filepage) { 1360
1361 info->flags |= SHMEM_PAGEIN;
1362 shmem_swp_set(info, entry, 0);
1403 shmem_swp_unmap(entry); 1363 shmem_swp_unmap(entry);
1404 filepage = find_get_page(mapping, idx); 1364 delete_from_swap_cache(page);
1405 if (filepage && 1365 spin_unlock(&info->lock);
1406 (!PageUptodate(filepage) || !trylock_page(filepage))) { 1366 set_page_dirty(page);
1367 swap_free(swap);
1368
1369 } else if (sgp == SGP_READ) {
1370 shmem_swp_unmap(entry);
1371 page = find_get_page(mapping, idx);
1372 if (page && !trylock_page(page)) {
1407 spin_unlock(&info->lock); 1373 spin_unlock(&info->lock);
1408 wait_on_page_locked(filepage); 1374 wait_on_page_locked(page);
1409 page_cache_release(filepage); 1375 page_cache_release(page);
1410 filepage = NULL;
1411 goto repeat; 1376 goto repeat;
1412 } 1377 }
1413 spin_unlock(&info->lock); 1378 spin_unlock(&info->lock);
1414 } else { 1379
1380 } else if (prealloc_page) {
1415 shmem_swp_unmap(entry); 1381 shmem_swp_unmap(entry);
1416 sbinfo = SHMEM_SB(inode->i_sb); 1382 sbinfo = SHMEM_SB(inode->i_sb);
1417 if (sbinfo->max_blocks) { 1383 if (sbinfo->max_blocks) {
@@ -1420,126 +1386,86 @@ repeat:
1420 shmem_acct_block(info->flags)) 1386 shmem_acct_block(info->flags))
1421 goto nospace; 1387 goto nospace;
1422 percpu_counter_inc(&sbinfo->used_blocks); 1388 percpu_counter_inc(&sbinfo->used_blocks);
1423 spin_lock(&inode->i_lock);
1424 inode->i_blocks += BLOCKS_PER_PAGE; 1389 inode->i_blocks += BLOCKS_PER_PAGE;
1425 spin_unlock(&inode->i_lock);
1426 } else if (shmem_acct_block(info->flags)) 1390 } else if (shmem_acct_block(info->flags))
1427 goto nospace; 1391 goto nospace;
1428 1392
1429 if (!filepage) { 1393 page = prealloc_page;
1430 int ret; 1394 prealloc_page = NULL;
1431
1432 if (!prealloc_page) {
1433 spin_unlock(&info->lock);
1434 filepage = shmem_alloc_page(gfp, info, idx);
1435 if (!filepage) {
1436 shmem_unacct_blocks(info->flags, 1);
1437 shmem_free_blocks(inode, 1);
1438 error = -ENOMEM;
1439 goto failed;
1440 }
1441 SetPageSwapBacked(filepage);
1442 1395
1443 /* 1396 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1444 * Precharge page while we can wait, compensate 1397 if (IS_ERR(entry))
1445 * after 1398 error = PTR_ERR(entry);
1446 */ 1399 else {
1447 error = mem_cgroup_cache_charge(filepage, 1400 swap = *entry;
1448 current->mm, GFP_KERNEL); 1401 shmem_swp_unmap(entry);
1449 if (error) { 1402 }
1450 page_cache_release(filepage); 1403 ret = error || swap.val;
1451 shmem_unacct_blocks(info->flags, 1); 1404 if (ret)
1452 shmem_free_blocks(inode, 1); 1405 mem_cgroup_uncharge_cache_page(page);
1453 filepage = NULL; 1406 else
1454 goto failed; 1407 ret = add_to_page_cache_lru(page, mapping,
1455 }
1456
1457 spin_lock(&info->lock);
1458 } else {
1459 filepage = prealloc_page;
1460 prealloc_page = NULL;
1461 SetPageSwapBacked(filepage);
1462 }
1463
1464 entry = shmem_swp_alloc(info, idx, sgp);
1465 if (IS_ERR(entry))
1466 error = PTR_ERR(entry);
1467 else {
1468 swap = *entry;
1469 shmem_swp_unmap(entry);
1470 }
1471 ret = error || swap.val;
1472 if (ret)
1473 mem_cgroup_uncharge_cache_page(filepage);
1474 else
1475 ret = add_to_page_cache_lru(filepage, mapping,
1476 idx, GFP_NOWAIT); 1408 idx, GFP_NOWAIT);
1477 /* 1409 /*
1478 * At add_to_page_cache_lru() failure, uncharge will 1410 * At add_to_page_cache_lru() failure,
1479 * be done automatically. 1411 * uncharge will be done automatically.
1480 */ 1412 */
1481 if (ret) { 1413 if (ret) {
1482 spin_unlock(&info->lock); 1414 shmem_unacct_blocks(info->flags, 1);
1483 page_cache_release(filepage); 1415 shmem_free_blocks(inode, 1);
1484 shmem_unacct_blocks(info->flags, 1); 1416 spin_unlock(&info->lock);
1485 shmem_free_blocks(inode, 1); 1417 page_cache_release(page);
1486 filepage = NULL; 1418 if (error)
1487 if (error) 1419 goto out;
1488 goto failed; 1420 goto repeat;
1489 goto repeat;
1490 }
1491 info->flags |= SHMEM_PAGEIN;
1492 } 1421 }
1493 1422
1423 info->flags |= SHMEM_PAGEIN;
1494 info->alloced++; 1424 info->alloced++;
1495 spin_unlock(&info->lock); 1425 spin_unlock(&info->lock);
1496 clear_highpage(filepage); 1426 clear_highpage(page);
1497 flush_dcache_page(filepage); 1427 flush_dcache_page(page);
1498 SetPageUptodate(filepage); 1428 SetPageUptodate(page);
1499 if (sgp == SGP_DIRTY) 1429 if (sgp == SGP_DIRTY)
1500 set_page_dirty(filepage); 1430 set_page_dirty(page);
1431
1432 } else {
1433 spin_unlock(&info->lock);
1434 error = -ENOMEM;
1435 goto out;
1501 } 1436 }
1502done: 1437done:
1503 *pagep = filepage; 1438 *pagep = page;
1504 error = 0; 1439 error = 0;
1505 goto out; 1440out:
1441 if (prealloc_page) {
1442 mem_cgroup_uncharge_cache_page(prealloc_page);
1443 page_cache_release(prealloc_page);
1444 }
1445 return error;
1506 1446
1507nospace: 1447nospace:
1508 /* 1448 /*
1509 * Perhaps the page was brought in from swap between find_lock_page 1449 * Perhaps the page was brought in from swap between find_lock_page
1510 * and taking info->lock? We allow for that at add_to_page_cache_lru, 1450 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1511 * but must also avoid reporting a spurious ENOSPC while working on a 1451 * but must also avoid reporting a spurious ENOSPC while working on a
1512 * full tmpfs. (When filepage has been passed in to shmem_getpage, it 1452 * full tmpfs.
1513 * is already in page cache, which prevents this race from occurring.)
1514 */ 1453 */
1515 if (!filepage) { 1454 page = find_get_page(mapping, idx);
1516 struct page *page = find_get_page(mapping, idx);
1517 if (page) {
1518 spin_unlock(&info->lock);
1519 page_cache_release(page);
1520 goto repeat;
1521 }
1522 }
1523 spin_unlock(&info->lock); 1455 spin_unlock(&info->lock);
1524 error = -ENOSPC; 1456 if (page) {
1525failed: 1457 page_cache_release(page);
1526 if (*pagep != filepage) { 1458 goto repeat;
1527 unlock_page(filepage);
1528 page_cache_release(filepage);
1529 }
1530out:
1531 if (prealloc_page) {
1532 mem_cgroup_uncharge_cache_page(prealloc_page);
1533 page_cache_release(prealloc_page);
1534 } 1459 }
1535 return error; 1460 error = -ENOSPC;
1461 goto out;
1536} 1462}
1537 1463
1538static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1464static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1539{ 1465{
1540 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1466 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1541 int error; 1467 int error;
1542 int ret; 1468 int ret = VM_FAULT_LOCKED;
1543 1469
1544 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1470 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1545 return VM_FAULT_SIGBUS; 1471 return VM_FAULT_SIGBUS;
@@ -1547,11 +1473,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1547 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1473 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1548 if (error) 1474 if (error)
1549 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1475 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1476
1550 if (ret & VM_FAULT_MAJOR) { 1477 if (ret & VM_FAULT_MAJOR) {
1551 count_vm_event(PGMAJFAULT); 1478 count_vm_event(PGMAJFAULT);
1552 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1479 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1553 } 1480 }
1554 return ret | VM_FAULT_LOCKED; 1481 return ret;
1555} 1482}
1556 1483
1557#ifdef CONFIG_NUMA 1484#ifdef CONFIG_NUMA
@@ -1668,19 +1595,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1668static const struct inode_operations shmem_symlink_inode_operations; 1595static const struct inode_operations shmem_symlink_inode_operations;
1669static const struct inode_operations shmem_symlink_inline_operations; 1596static const struct inode_operations shmem_symlink_inline_operations;
1670 1597
1671/*
1672 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1673 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1674 * below the loop driver, in the generic fashion that many filesystems support.
1675 */
1676static int shmem_readpage(struct file *file, struct page *page)
1677{
1678 struct inode *inode = page->mapping->host;
1679 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1680 unlock_page(page);
1681 return error;
1682}
1683
1684static int 1598static int
1685shmem_write_begin(struct file *file, struct address_space *mapping, 1599shmem_write_begin(struct file *file, struct address_space *mapping,
1686 loff_t pos, unsigned len, unsigned flags, 1600 loff_t pos, unsigned len, unsigned flags,
@@ -1688,7 +1602,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1688{ 1602{
1689 struct inode *inode = mapping->host; 1603 struct inode *inode = mapping->host;
1690 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1604 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1691 *pagep = NULL;
1692 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1605 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1693} 1606}
1694 1607
@@ -1845,6 +1758,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1845 return retval; 1758 return retval;
1846} 1759}
1847 1760
1761static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1762 struct pipe_inode_info *pipe, size_t len,
1763 unsigned int flags)
1764{
1765 struct address_space *mapping = in->f_mapping;
1766 struct inode *inode = mapping->host;
1767 unsigned int loff, nr_pages, req_pages;
1768 struct page *pages[PIPE_DEF_BUFFERS];
1769 struct partial_page partial[PIPE_DEF_BUFFERS];
1770 struct page *page;
1771 pgoff_t index, end_index;
1772 loff_t isize, left;
1773 int error, page_nr;
1774 struct splice_pipe_desc spd = {
1775 .pages = pages,
1776 .partial = partial,
1777 .flags = flags,
1778 .ops = &page_cache_pipe_buf_ops,
1779 .spd_release = spd_release_page,
1780 };
1781
1782 isize = i_size_read(inode);
1783 if (unlikely(*ppos >= isize))
1784 return 0;
1785
1786 left = isize - *ppos;
1787 if (unlikely(left < len))
1788 len = left;
1789
1790 if (splice_grow_spd(pipe, &spd))
1791 return -ENOMEM;
1792
1793 index = *ppos >> PAGE_CACHE_SHIFT;
1794 loff = *ppos & ~PAGE_CACHE_MASK;
1795 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1796 nr_pages = min(req_pages, pipe->buffers);
1797
1798 spd.nr_pages = find_get_pages_contig(mapping, index,
1799 nr_pages, spd.pages);
1800 index += spd.nr_pages;
1801 error = 0;
1802
1803 while (spd.nr_pages < nr_pages) {
1804 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1805 if (error)
1806 break;
1807 unlock_page(page);
1808 spd.pages[spd.nr_pages++] = page;
1809 index++;
1810 }
1811
1812 index = *ppos >> PAGE_CACHE_SHIFT;
1813 nr_pages = spd.nr_pages;
1814 spd.nr_pages = 0;
1815
1816 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1817 unsigned int this_len;
1818
1819 if (!len)
1820 break;
1821
1822 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1823 page = spd.pages[page_nr];
1824
1825 if (!PageUptodate(page) || page->mapping != mapping) {
1826 error = shmem_getpage(inode, index, &page,
1827 SGP_CACHE, NULL);
1828 if (error)
1829 break;
1830 unlock_page(page);
1831 page_cache_release(spd.pages[page_nr]);
1832 spd.pages[page_nr] = page;
1833 }
1834
1835 isize = i_size_read(inode);
1836 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1837 if (unlikely(!isize || index > end_index))
1838 break;
1839
1840 if (end_index == index) {
1841 unsigned int plen;
1842
1843 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1844 if (plen <= loff)
1845 break;
1846
1847 this_len = min(this_len, plen - loff);
1848 len = this_len;
1849 }
1850
1851 spd.partial[page_nr].offset = loff;
1852 spd.partial[page_nr].len = this_len;
1853 len -= this_len;
1854 loff = 0;
1855 spd.nr_pages++;
1856 index++;
1857 }
1858
1859 while (page_nr < nr_pages)
1860 page_cache_release(spd.pages[page_nr++]);
1861
1862 if (spd.nr_pages)
1863 error = splice_to_pipe(pipe, &spd);
1864
1865 splice_shrink_spd(pipe, &spd);
1866
1867 if (error > 0) {
1868 *ppos += error;
1869 file_accessed(in);
1870 }
1871 return error;
1872}
1873
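The tail-page arithmetic in shmem_file_splice_read() above (loff, end_index and plen) is easy to check in isolation. A minimal userspace sketch, assuming a 4K page size; the file size and offset in main() are invented for the demonstration:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	long long isize = 10000;	/* file size: two full pages + 1808 bytes */
	long long ppos  = 8192;		/* splice starts at the third page */

	unsigned long index = ppos >> PAGE_CACHE_SHIFT;
	unsigned int loff = ppos & ~PAGE_CACHE_MASK;		/* offset within that page */
	unsigned long end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	unsigned int plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;	/* valid bytes in last page */

	printf("index=%lu end_index=%lu loff=%u plen=%u usable=%u\n",
	       index, end_index, loff, plen, plen > loff ? plen - loff : 0);
	return 0;
}

With these numbers the splice loop reaches the final page and copies only plen - loff bytes from it, exactly as the end_index branch above does.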
1848static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1874static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1849{ 1875{
1850 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1876 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2005,7 +2031,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2005 int error; 2031 int error;
2006 int len; 2032 int len;
2007 struct inode *inode; 2033 struct inode *inode;
2008 struct page *page = NULL; 2034 struct page *page;
2009 char *kaddr; 2035 char *kaddr;
2010 struct shmem_inode_info *info; 2036 struct shmem_inode_info *info;
2011 2037
@@ -2683,7 +2709,6 @@ static const struct address_space_operations shmem_aops = {
2683 .writepage = shmem_writepage, 2709 .writepage = shmem_writepage,
2684 .set_page_dirty = __set_page_dirty_no_writeback, 2710 .set_page_dirty = __set_page_dirty_no_writeback,
2685#ifdef CONFIG_TMPFS 2711#ifdef CONFIG_TMPFS
2686 .readpage = shmem_readpage,
2687 .write_begin = shmem_write_begin, 2712 .write_begin = shmem_write_begin,
2688 .write_end = shmem_write_end, 2713 .write_end = shmem_write_end,
2689#endif 2714#endif
@@ -2700,13 +2725,13 @@ static const struct file_operations shmem_file_operations = {
2700 .aio_read = shmem_file_aio_read, 2725 .aio_read = shmem_file_aio_read,
2701 .aio_write = generic_file_aio_write, 2726 .aio_write = generic_file_aio_write,
2702 .fsync = noop_fsync, 2727 .fsync = noop_fsync,
2703 .splice_read = generic_file_splice_read, 2728 .splice_read = shmem_file_splice_read,
2704 .splice_write = generic_file_splice_write, 2729 .splice_write = generic_file_splice_write,
2705#endif 2730#endif
2706}; 2731};
2707 2732
2708static const struct inode_operations shmem_inode_operations = { 2733static const struct inode_operations shmem_inode_operations = {
2709 .setattr = shmem_notify_change, 2734 .setattr = shmem_setattr,
2710 .truncate_range = shmem_truncate_range, 2735 .truncate_range = shmem_truncate_range,
2711#ifdef CONFIG_TMPFS_XATTR 2736#ifdef CONFIG_TMPFS_XATTR
2712 .setxattr = shmem_setxattr, 2737 .setxattr = shmem_setxattr,
@@ -2714,10 +2739,6 @@ static const struct inode_operations shmem_inode_operations = {
2714 .listxattr = shmem_listxattr, 2739 .listxattr = shmem_listxattr,
2715 .removexattr = shmem_removexattr, 2740 .removexattr = shmem_removexattr,
2716#endif 2741#endif
2717#ifdef CONFIG_TMPFS_POSIX_ACL
2718 .check_acl = generic_check_acl,
2719#endif
2720
2721}; 2742};
2722 2743
2723static const struct inode_operations shmem_dir_inode_operations = { 2744static const struct inode_operations shmem_dir_inode_operations = {
@@ -2739,8 +2760,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2739 .removexattr = shmem_removexattr, 2760 .removexattr = shmem_removexattr,
2740#endif 2761#endif
2741#ifdef CONFIG_TMPFS_POSIX_ACL 2762#ifdef CONFIG_TMPFS_POSIX_ACL
2742 .setattr = shmem_notify_change, 2763 .setattr = shmem_setattr,
2743 .check_acl = generic_check_acl,
2744#endif 2764#endif
2745}; 2765};
2746 2766
@@ -2752,8 +2772,7 @@ static const struct inode_operations shmem_special_inode_operations = {
2752 .removexattr = shmem_removexattr, 2772 .removexattr = shmem_removexattr,
2753#endif 2773#endif
2754#ifdef CONFIG_TMPFS_POSIX_ACL 2774#ifdef CONFIG_TMPFS_POSIX_ACL
2755 .setattr = shmem_notify_change, 2775 .setattr = shmem_setattr,
2756 .check_acl = generic_check_acl,
2757#endif 2776#endif
2758}; 2777};
2759 2778
@@ -2908,6 +2927,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2908 return 0; 2927 return 0;
2909} 2928}
2910 2929
2930void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
2931{
2932 truncate_inode_pages_range(inode->i_mapping, start, end);
2933}
2934EXPORT_SYMBOL_GPL(shmem_truncate_range);
2935
2911#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2936#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2912/** 2937/**
2913 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2938 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
@@ -3028,3 +3053,42 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3028 vma->vm_flags |= VM_CAN_NONLINEAR; 3053 vma->vm_flags |= VM_CAN_NONLINEAR;
3029 return 0; 3054 return 0;
3030} 3055}
3056
3057/**
3058 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3059 * @mapping: the page's address_space
3060 * @index: the page index
3061 * @gfp: the page allocator flags to use if allocating
3062 *
3063 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3064 * with any new page allocations done using the specified allocation flags.
 3065 * But read_cache_page_gfp() uses the ->readpage() method, which does not
3066 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3067 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3068 *
3069 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3070 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3071 */
3072struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3073 pgoff_t index, gfp_t gfp)
3074{
3075#ifdef CONFIG_SHMEM
3076 struct inode *inode = mapping->host;
3077 struct page *page;
3078 int error;
3079
3080 BUG_ON(mapping->a_ops != &shmem_aops);
3081 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
3082 if (error)
3083 page = ERR_PTR(error);
3084 else
3085 unlock_page(page);
3086 return page;
3087#else
3088 /*
3089 * The tiny !SHMEM case uses ramfs without swap
3090 */
3091 return read_cache_page_gfp(mapping, index, gfp);
3092#endif
3093}
3094EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
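As the comment above notes, callers such as the GPU drivers read tmpfs-backed pages with relaxed allocation flags. A minimal sketch of such a caller, assuming kernel context of this release; demo_pin_pages() and its parameters are illustrative and not part of the patch:

#include <linux/err.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

/* Pin the first nr pages of a tmpfs mapping, tolerating allocation failure. */
static int demo_pin_pages(struct address_space *mapping, pgoff_t nr,
			  struct page **pages)
{
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
	pgoff_t i;

	for (i = 0; i < nr; i++) {
		struct page *page;

		page = shmem_read_mapping_page_gfp(mapping, i, gfp);
		if (IS_ERR(page)) {
			while (i--)
				page_cache_release(pages[i]);
			return PTR_ERR(page);
		}
		/* returned unlocked, with a reference held for us */
		pages[i] = page;
	}
	return 0;
}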
diff --git a/mm/slab.c b/mm/slab.c
index bcfa4987c8ae..1e523ed47c61 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
575 575
576/* internal cache of cache description objs */ 576/* internal cache of cache description objs */
577static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
577static struct kmem_cache cache_cache = { 578static struct kmem_cache cache_cache = {
579 .nodelists = cache_cache_nodelists,
578 .batchcount = 1, 580 .batchcount = 1,
579 .limit = BOOT_CPUCACHE_ENTRIES, 581 .limit = BOOT_CPUCACHE_ENTRIES,
580 .shared = 1, 582 .shared = 1,
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void)
1492 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1494 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1493 1495
1494 /* 1496 /*
1495 * struct kmem_cache size depends on nr_node_ids, which 1497 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1496 * can be less than MAX_NUMNODES.
1497 */ 1498 */
1498 cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + 1499 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1499 nr_node_ids * sizeof(struct kmem_list3 *); 1500 nr_node_ids * sizeof(struct kmem_list3 *);
1500#if DEBUG 1501#if DEBUG
1501 cache_cache.obj_size = cache_cache.buffer_size; 1502 cache_cache.obj_size = cache_cache.buffer_size;
1502#endif 1503#endif
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2308 if (!cachep) 2309 if (!cachep)
2309 goto oops; 2310 goto oops;
2310 2311
2312 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2311#if DEBUG 2313#if DEBUG
2312 cachep->obj_size = size; 2314 cachep->obj_size = size;
2313 2315
@@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3153 objp += obj_offset(cachep); 3155 objp += obj_offset(cachep);
3154 if (cachep->ctor && cachep->flags & SLAB_POISON) 3156 if (cachep->ctor && cachep->flags & SLAB_POISON)
3155 cachep->ctor(objp); 3157 cachep->ctor(objp);
3156#if ARCH_SLAB_MINALIGN 3158 if (ARCH_SLAB_MINALIGN &&
3157 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3159 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
3158 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3160 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3159 objp, ARCH_SLAB_MINALIGN); 3161 objp, (int)ARCH_SLAB_MINALIGN);
3160 } 3162 }
3161#endif
3162 return objp; 3163 return objp;
3163} 3164}
3164#else 3165#else
@@ -3604,13 +3605,14 @@ free_done:
3604 * Release an obj back to its cache. If the obj has a constructed state, it must 3605 * Release an obj back to its cache. If the obj has a constructed state, it must
3605 * be in this state _before_ it is released. Called with disabled ints. 3606 * be in this state _before_ it is released. Called with disabled ints.
3606 */ 3607 */
3607static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3608static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3609 void *caller)
3608{ 3610{
3609 struct array_cache *ac = cpu_cache_get(cachep); 3611 struct array_cache *ac = cpu_cache_get(cachep);
3610 3612
3611 check_irq_off(); 3613 check_irq_off();
3612 kmemleak_free_recursive(objp, cachep->flags); 3614 kmemleak_free_recursive(objp, cachep->flags);
3613 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3615 objp = cache_free_debugcheck(cachep, objp, caller);
3614 3616
3615 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3617 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3616 3618
@@ -3801,7 +3803,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3801 debug_check_no_locks_freed(objp, obj_size(cachep)); 3803 debug_check_no_locks_freed(objp, obj_size(cachep));
3802 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3804 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3803 debug_check_no_obj_freed(objp, obj_size(cachep)); 3805 debug_check_no_obj_freed(objp, obj_size(cachep));
3804 __cache_free(cachep, objp); 3806 __cache_free(cachep, objp, __builtin_return_address(0));
3805 local_irq_restore(flags); 3807 local_irq_restore(flags);
3806 3808
3807 trace_kmem_cache_free(_RET_IP_, objp); 3809 trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3833,7 @@ void kfree(const void *objp)
3831 c = virt_to_cache(objp); 3833 c = virt_to_cache(objp);
3832 debug_check_no_locks_freed(objp, obj_size(c)); 3834 debug_check_no_locks_freed(objp, obj_size(c));
3833 debug_check_no_obj_freed(objp, obj_size(c)); 3835 debug_check_no_obj_freed(objp, obj_size(c));
3834 __cache_free(c, (void *)objp); 3836 __cache_free(c, (void *)objp, __builtin_return_address(0));
3835 local_irq_restore(flags); 3837 local_irq_restore(flags);
3836} 3838}
3837EXPORT_SYMBOL(kfree); 3839EXPORT_SYMBOL(kfree);
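The slab change above threads the caller's address into __cache_free() instead of taking __builtin_return_address(0) deep inside the helper, so the debug checks record the real kmem_cache_free()/kfree() call site. The mechanism itself can be seen in a small userspace sketch (function names here are illustrative):

#include <stdio.h>

static void __attribute__((noinline)) record_free(void *obj, void *caller)
{
	/* a debug layer would store 'caller' next to the object */
	printf("free of %p requested from %p\n", obj, caller);
}

static void __attribute__((noinline)) my_free(void *obj)
{
	/* capture the address my_free() will return to, i.e. its caller */
	record_free(obj, __builtin_return_address(0));
}

int main(void)
{
	int x;

	my_free(&x);
	return 0;
}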
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a23..0ae881831ae2 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
483 void *ret; 483 void *ret;
484 484
485 gfp &= gfp_allowed_mask;
486
485 lockdep_trace_alloc(gfp); 487 lockdep_trace_alloc(gfp);
486 488
487 if (size < PAGE_SIZE - align) { 489 if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
608{ 610{
609 void *b; 611 void *b;
610 612
613 flags &= gfp_allowed_mask;
614
615 lockdep_trace_alloc(flags);
616
611 if (c->size < PAGE_SIZE) { 617 if (c->size < PAGE_SIZE) {
612 b = slob_alloc(c->size, flags, c->align, node); 618 b = slob_alloc(c->size, flags, c->align, node);
613 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 619 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 7be0223531b0..f8f5e8efeb88 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
27#include <linux/memory.h> 27#include <linux/memory.h>
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30#include <linux/stacktrace.h>
30 31
31#include <trace/events/kmem.h> 32#include <trace/events/kmem.h>
32 33
@@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches);
191/* 192/*
192 * Tracking user of a slab. 193 * Tracking user of a slab.
193 */ 194 */
195#define TRACK_ADDRS_COUNT 16
194struct track { 196struct track {
195 unsigned long addr; /* Called from address */ 197 unsigned long addr; /* Called from address */
198#ifdef CONFIG_STACKTRACE
199 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
200#endif
196 int cpu; /* Was running on cpu */ 201 int cpu; /* Was running on cpu */
197 int pid; /* Pid context */ 202 int pid; /* Pid context */
198 unsigned long when; /* When did the operation occur */ 203 unsigned long when; /* When did the operation occur */
@@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object,
420 struct track *p = get_track(s, object, alloc); 425 struct track *p = get_track(s, object, alloc);
421 426
422 if (addr) { 427 if (addr) {
428#ifdef CONFIG_STACKTRACE
429 struct stack_trace trace;
430 int i;
431
432 trace.nr_entries = 0;
433 trace.max_entries = TRACK_ADDRS_COUNT;
434 trace.entries = p->addrs;
435 trace.skip = 3;
436 save_stack_trace(&trace);
437
438 /* See rant in lockdep.c */
439 if (trace.nr_entries != 0 &&
440 trace.entries[trace.nr_entries - 1] == ULONG_MAX)
441 trace.nr_entries--;
442
443 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
444 p->addrs[i] = 0;
445#endif
423 p->addr = addr; 446 p->addr = addr;
424 p->cpu = smp_processor_id(); 447 p->cpu = smp_processor_id();
425 p->pid = current->pid; 448 p->pid = current->pid;
@@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t)
444 467
445 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 468 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
446 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 469 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
470#ifdef CONFIG_STACKTRACE
471 {
472 int i;
473 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
474 if (t->addrs[i])
475 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
476 else
477 break;
478 }
479#endif
447} 480}
448 481
449static void print_tracking(struct kmem_cache *s, void *object) 482static void print_tracking(struct kmem_cache *s, void *object)
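The CONFIG_STACKTRACE hunks above store up to TRACK_ADDRS_COUNT return addresses per track and zero-fill the unused slots so print_track() knows where the trace ends. A userspace analogue of the same pattern using glibc's backtrace() (not the kernel stacktrace API):

#include <execinfo.h>
#include <stdio.h>
#include <string.h>

#define TRACK_ADDRS_COUNT 16

struct track {
	void *addrs[TRACK_ADDRS_COUNT];
};

static void set_track(struct track *t)
{
	int n = backtrace(t->addrs, TRACK_ADDRS_COUNT);

	/* zero the tail so the printer can stop at the first empty slot */
	memset(&t->addrs[n], 0, (TRACK_ADDRS_COUNT - n) * sizeof(void *));
}

static void print_track(const struct track *t)
{
	int i;

	for (i = 0; i < TRACK_ADDRS_COUNT && t->addrs[i]; i++)
		printf("\t%p\n", t->addrs[i]);
}

int main(void)
{
	struct track t;

	set_track(&t);
	print_track(&t);
	return 0;
}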
@@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
557 memset(p + s->objsize, val, s->inuse - s->objsize); 590 memset(p + s->objsize, val, s->inuse - s->objsize);
558} 591}
559 592
560static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 593static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
561{ 594{
562 while (bytes) { 595 while (bytes) {
563 if (*start != (u8)value) 596 if (*start != value)
564 return start; 597 return start;
565 start++; 598 start++;
566 bytes--; 599 bytes--;
@@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
568 return NULL; 601 return NULL;
569} 602}
570 603
604static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
605{
606 u64 value64;
607 unsigned int words, prefix;
608
609 if (bytes <= 16)
610 return check_bytes8(start, value, bytes);
611
612 value64 = value | value << 8 | value << 16 | value << 24;
613 value64 = value64 | value64 << 32;
614 prefix = 8 - ((unsigned long)start) % 8;
615
616 if (prefix) {
617 u8 *r = check_bytes8(start, value, prefix);
618 if (r)
619 return r;
620 start += prefix;
621 bytes -= prefix;
622 }
623
624 words = bytes / 8;
625
626 while (words) {
627 if (*(u64 *)start != value64)
628 return check_bytes8(start, value, 8);
629 start += 8;
630 words--;
631 }
632
633 return check_bytes8(start, value, bytes % 8);
634}
635
571static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 636static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
572 void *from, void *to) 637 void *from, void *to)
573{ 638{
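The new check_bytes() above speeds up poison checking by comparing a byte-repeated 64-bit word at a time, falling back to the byte loop for the unaligned head, the tail, and any word that mismatches. A standalone userspace sketch of the same idea; the buffer and poison value in main() are made up for the test, and the prefix computation is written slightly differently but covers the same bytes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t *check_bytes8(uint8_t *start, uint8_t value, size_t bytes)
{
	while (bytes--) {
		if (*start != value)
			return start;
		start++;
	}
	return NULL;
}

/* NULL if every byte equals value, else a pointer to the first mismatch */
static uint8_t *check_bytes(uint8_t *start, uint8_t value, size_t bytes)
{
	uint64_t value64;
	size_t prefix, words;

	if (bytes <= 16)
		return check_bytes8(start, value, bytes);

	value64 = value;
	value64 |= value64 << 8;
	value64 |= value64 << 16;
	value64 |= value64 << 32;		/* value repeated in all 8 bytes */

	prefix = (8 - (uintptr_t)start % 8) % 8;	/* bytes until u64 alignment */
	if (prefix) {
		uint8_t *r = check_bytes8(start, value, prefix);
		if (r)
			return r;
		start += prefix;
		bytes -= prefix;
	}

	words = bytes / 8;
	while (words--) {
		if (*(uint64_t *)start != value64)
			return check_bytes8(start, value, 8);
		start += 8;
	}
	return check_bytes8(start, value, bytes % 8);
}

int main(void)
{
	uint8_t buf[100];

	memset(buf, 0x5a, sizeof(buf));
	buf[73] = 0;				/* inject one corrupted byte */
	printf("mismatch at offset %td\n",
	       check_bytes(buf, 0x5a, sizeof(buf)) - buf);
	return 0;
}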
@@ -2320,16 +2385,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 2385 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); 2386 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2322 2387
2323#ifdef CONFIG_CMPXCHG_LOCAL
2324 /* 2388 /*
2325 * Must align to double word boundary for the double cmpxchg instructions 2389 * Must align to double word boundary for the double cmpxchg
2326 * to work. 2390 * instructions to work; see __pcpu_double_call_return_bool().
2327 */ 2391 */
2328 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); 2392 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2329#else 2393 2 * sizeof(void *));
2330 /* Regular alignment is sufficient */
2331 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2332#endif
2333 2394
2334 if (!s->cpu_slab) 2395 if (!s->cpu_slab)
2335 return 0; 2396 return 0;
@@ -2932,6 +2993,42 @@ size_t ksize(const void *object)
2932} 2993}
2933EXPORT_SYMBOL(ksize); 2994EXPORT_SYMBOL(ksize);
2934 2995
2996#ifdef CONFIG_SLUB_DEBUG
2997bool verify_mem_not_deleted(const void *x)
2998{
2999 struct page *page;
3000 void *object = (void *)x;
3001 unsigned long flags;
3002 bool rv;
3003
3004 if (unlikely(ZERO_OR_NULL_PTR(x)))
3005 return false;
3006
3007 local_irq_save(flags);
3008
3009 page = virt_to_head_page(x);
3010 if (unlikely(!PageSlab(page))) {
3011 /* maybe it was from stack? */
3012 rv = true;
3013 goto out_unlock;
3014 }
3015
3016 slab_lock(page);
3017 if (on_freelist(page->slab, page, object)) {
3018 object_err(page->slab, page, object, "Object is on free-list");
3019 rv = false;
3020 } else {
3021 rv = true;
3022 }
3023 slab_unlock(page);
3024
3025out_unlock:
3026 local_irq_restore(flags);
3027 return rv;
3028}
3029EXPORT_SYMBOL(verify_mem_not_deleted);
3030#endif
3031
2935void kfree(const void *x) 3032void kfree(const void *x)
2936{ 3033{
2937 struct page *page; 3034 struct page *page;
@@ -4062,7 +4159,7 @@ static int any_slab_objects(struct kmem_cache *s)
4062#endif 4159#endif
4063 4160
4064#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4161#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4065#define to_slab(n) container_of(n, struct kmem_cache, kobj); 4162#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4066 4163
4067struct slab_attribute { 4164struct slab_attribute {
4068 struct attribute attr; 4165 struct attribute attr;
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a2..858e1dff9b2a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; 40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
41#endif 41#endif
42 42
43int page_to_nid(struct page *page) 43int page_to_nid(const struct page *page)
44{ 44{
45 return section_to_node_table[page_to_section(page)]; 45 return section_to_node_table[page_to_section(page)];
46} 46}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d537d29e9b7b..1b8c33907242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
@@ -1681,19 +1681,14 @@ out:
1681} 1681}
1682 1682
1683#ifdef CONFIG_PROC_FS 1683#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait) 1684static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{ 1685{
1691 struct proc_swaps *s = file->private_data; 1686 struct seq_file *seq = file->private_data;
1692 1687
1693 poll_wait(file, &proc_poll_wait, wait); 1688 poll_wait(file, &proc_poll_wait, wait);
1694 1689
1695 if (s->event != atomic_read(&proc_poll_event)) { 1690 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event); 1691 seq->poll_event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 1692 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 } 1693 }
1699 1694
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
1783 1778
1784static int swaps_open(struct inode *inode, struct file *file) 1779static int swaps_open(struct inode *inode, struct file *file)
1785{ 1780{
1786 struct proc_swaps *s; 1781 struct seq_file *seq;
1787 int ret; 1782 int ret;
1788 1783
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op); 1784 ret = seq_open(file, &swaps_op);
1796 if (ret) { 1785 if (ret)
1797 kfree(s);
1798 return ret; 1786 return ret;
1799 }
1800 1787
1801 s->seq.private = s; 1788 seq = file->private_data;
1802 s->event = atomic_read(&proc_poll_event); 1789 seq->poll_event = atomic_read(&proc_poll_event);
1803 return ret; 1790 return 0;
1804} 1791}
1805 1792
1806static const struct file_operations proc_swaps_operations = { 1793static const struct file_operations proc_swaps_operations = {
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd8..e53f7d02c17c 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -6,7 +6,7 @@
6 * Released under the GPL, see the file COPYING for details. 6 * Released under the GPL, see the file COPYING for details.
7 * 7 *
8 * Simple token based thrashing protection, using the algorithm 8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf 9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 * 10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> 11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token: 12 * Improved algorithm to pass token:
@@ -21,14 +21,40 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
24 29
25static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
27static unsigned int global_faults; 32struct mem_cgroup *swap_token_memcg;
33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
36{
37 struct mem_cgroup *memcg;
38
39 memcg = try_get_mem_cgroup_from_mm(mm);
40 if (memcg)
41 css_put(mem_cgroup_css(memcg));
42
43 return memcg;
44}
45#else
46static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
47{
48 return NULL;
49}
50#endif
28 51
29void grab_swap_token(struct mm_struct *mm) 52void grab_swap_token(struct mm_struct *mm)
30{ 53{
31 int current_interval; 54 int current_interval;
55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
32 58
33 global_faults++; 59 global_faults++;
34 60
@@ -38,40 +64,92 @@ void grab_swap_token(struct mm_struct *mm)
38 return; 64 return;
39 65
40 /* First come first served */ 66 /* First come first served */
41 if (swap_token_mm == NULL) { 67 if (!swap_token_mm)
42 mm->token_priority = mm->token_priority + 2; 68 goto replace_token;
43 swap_token_mm = mm; 69
44 goto out; 70 /*
 71 * Usually we don't need priority aging, because long fault intervals
 72 * make the priority decrease quickly. But there is one exception: if the
 73 * token owner task is sleeping, it never produces long fault intervals.
 74 * Thus, we need a priority aging mechanism instead. The requirements
 75 * of priority aging are:
 76 * 1) The aging interval is reasonably long. Too short an aging
 77 * interval loses the swap token quickly and hurts performance.
 78 * 2) The swap token owner task gets priority aging even while
 79 * it is asleep.
 80 */
81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
82 swap_token_mm->token_priority /= 2;
83 last_aging = global_faults;
45 } 84 }
46 85
47 if (mm != swap_token_mm) { 86 if (mm == swap_token_mm) {
48 if (current_interval < mm->last_interval)
49 mm->token_priority++;
50 else {
51 if (likely(mm->token_priority > 0))
52 mm->token_priority--;
53 }
54 /* Check if we deserve the token */
55 if (mm->token_priority > swap_token_mm->token_priority) {
56 mm->token_priority += 2;
57 swap_token_mm = mm;
58 }
59 } else {
60 /* Token holder came in again! */
61 mm->token_priority += 2; 87 mm->token_priority += 2;
88 goto update_priority;
89 }
90
91 if (current_interval < mm->last_interval)
92 mm->token_priority++;
93 else {
94 if (likely(mm->token_priority > 0))
95 mm->token_priority--;
62 } 96 }
63 97
98 /* Check if we deserve the token */
99 if (mm->token_priority > swap_token_mm->token_priority)
100 goto replace_token;
101
102update_priority:
103 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
104
64out: 105out:
65 mm->faultstamp = global_faults; 106 mm->faultstamp = global_faults;
66 mm->last_interval = current_interval; 107 mm->last_interval = current_interval;
67 spin_unlock(&swap_token_lock); 108 spin_unlock(&swap_token_lock);
109 return;
110
111replace_token:
112 mm->token_priority += 2;
113 trace_replace_swap_token(swap_token_mm, mm);
114 swap_token_mm = mm;
115 swap_token_memcg = swap_token_memcg_from_mm(mm);
116 last_aging = global_faults;
117 goto out;
68} 118}
69 119
70/* Called on process exit. */ 120/* Called on process exit. */
71void __put_swap_token(struct mm_struct *mm) 121void __put_swap_token(struct mm_struct *mm)
72{ 122{
73 spin_lock(&swap_token_lock); 123 spin_lock(&swap_token_lock);
74 if (likely(mm == swap_token_mm)) 124 if (likely(mm == swap_token_mm)) {
125 trace_put_swap_token(swap_token_mm);
75 swap_token_mm = NULL; 126 swap_token_mm = NULL;
127 swap_token_memcg = NULL;
128 }
76 spin_unlock(&swap_token_lock); 129 spin_unlock(&swap_token_lock);
77} 130}
131
132static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
133{
134 if (!a)
135 return true;
136 if (!b)
137 return true;
138 if (a == b)
139 return true;
140 return false;
141}
142
143void disable_swap_token(struct mem_cgroup *memcg)
144{
 145 /* memcg reclaim doesn't disable an unrelated mm's token. */
146 if (match_memcg(memcg, swap_token_memcg)) {
147 spin_lock(&swap_token_lock);
148 if (match_memcg(memcg, swap_token_memcg)) {
149 trace_disable_swap_token(swap_token_mm);
150 swap_token_mm = NULL;
151 swap_token_memcg = NULL;
152 }
153 spin_unlock(&swap_token_lock);
154 }
155}
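The rewritten grab_swap_token() above keeps three rules: the first faulting mm takes a free token, an mm whose fault interval keeps shrinking builds up priority and can steal the token, and a sleeping holder is aged (priority halved) every TOKEN_AGING_INTERVAL global faults so it cannot hold the token forever. A toy userspace model of those rules; the fault pattern in main() is invented for the demonstration, and memcg and tracing are left out:

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 0xFF

struct mm {
	const char *name;
	unsigned int token_priority;
	unsigned int faultstamp;
	unsigned int last_interval;
};

static struct mm *swap_token_mm;
static unsigned int global_faults;
static unsigned int last_aging;

static void grab_swap_token(struct mm *mm)
{
	unsigned int current_interval;

	global_faults++;
	current_interval = global_faults - mm->faultstamp;

	if (!swap_token_mm)
		goto replace;			/* first come, first served */

	/* a sleeping holder never faults, so age its priority from here */
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		swap_token_mm->token_priority /= 2;
		last_aging = global_faults;
	}

	if (mm == swap_token_mm) {
		mm->token_priority += 2;	/* the holder faulted again */
		goto out;
	}

	/* faulting more often raises priority, faulting less often lowers it */
	if (current_interval < mm->last_interval)
		mm->token_priority++;
	else if (mm->token_priority > 0)
		mm->token_priority--;

	if (mm->token_priority <= swap_token_mm->token_priority)
		goto out;			/* we don't deserve the token yet */
replace:
	mm->token_priority += 2;
	swap_token_mm = mm;
	last_aging = global_faults;
out:
	mm->faultstamp = global_faults;
	mm->last_interval = current_interval;
}

int main(void)
{
	struct mm a = { "a" }, b = { "b" }, idle = { "idle" };
	int gap, j;

	grab_swap_token(&a);			/* a takes the free token, then sleeps */
	for (gap = 10; gap >= 1; gap--) {	/* b faults at ever shorter intervals */
		for (j = 0; j < gap; j++)
			grab_swap_token(&idle);
		grab_swap_token(&b);
	}
	printf("token holder: %s (a=%u, b=%u)\n",
	       swap_token_mm->name, a.token_priority, b.token_priority);
	return 0;
}

Running it shows b ending up with the token once its rising priority overtakes that of the sleeping holder a.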
diff --git a/mm/truncate.c b/mm/truncate.c
index 3a29a6180212..232eb2736a79 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page)
199 * The first pass will remove most pages, so the search cost of the second pass 199 * The first pass will remove most pages, so the search cost of the second pass
200 * is low. 200 * is low.
201 * 201 *
202 * When looking at page->index outside the page lock we need to be careful to
203 * copy it into a local to avoid races (it could change at any time).
204 *
205 * We pass down the cache-hot hint to the page freeing code. Even if the 202 * We pass down the cache-hot hint to the page freeing code. Even if the
206 * mapping is large, it is probably the case that the final pages are the most 203 * mapping is large, it is probably the case that the final pages are the most
207 * recently touched, and freeing happens in ascending file offset order. 204 * recently touched, and freeing happens in ascending file offset order.
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
210 loff_t lstart, loff_t lend) 207 loff_t lstart, loff_t lend)
211{ 208{
212 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 209 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
213 pgoff_t end;
214 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 210 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
215 struct pagevec pvec; 211 struct pagevec pvec;
216 pgoff_t next; 212 pgoff_t index;
213 pgoff_t end;
217 int i; 214 int i;
218 215
219 cleancache_flush_inode(mapping); 216 cleancache_flush_inode(mapping);
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping,
224 end = (lend >> PAGE_CACHE_SHIFT); 221 end = (lend >> PAGE_CACHE_SHIFT);
225 222
226 pagevec_init(&pvec, 0); 223 pagevec_init(&pvec, 0);
227 next = start; 224 index = start;
228 while (next <= end && 225 while (index <= end && pagevec_lookup(&pvec, mapping, index,
229 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 226 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
230 mem_cgroup_uncharge_start(); 227 mem_cgroup_uncharge_start();
231 for (i = 0; i < pagevec_count(&pvec); i++) { 228 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 229 struct page *page = pvec.pages[i];
233 pgoff_t page_index = page->index;
234 230
235 if (page_index > end) { 231 /* We rely upon deletion not changing page->index */
236 next = page_index; 232 index = page->index;
233 if (index > end)
237 break; 234 break;
238 }
239 235
240 if (page_index > next)
241 next = page_index;
242 next++;
243 if (!trylock_page(page)) 236 if (!trylock_page(page))
244 continue; 237 continue;
238 WARN_ON(page->index != index);
245 if (PageWriteback(page)) { 239 if (PageWriteback(page)) {
246 unlock_page(page); 240 unlock_page(page);
247 continue; 241 continue;
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
252 pagevec_release(&pvec); 246 pagevec_release(&pvec);
253 mem_cgroup_uncharge_end(); 247 mem_cgroup_uncharge_end();
254 cond_resched(); 248 cond_resched();
249 index++;
255 } 250 }
256 251
257 if (partial) { 252 if (partial) {
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping,
264 } 259 }
265 } 260 }
266 261
267 next = start; 262 index = start;
268 for ( ; ; ) { 263 for ( ; ; ) {
269 cond_resched(); 264 cond_resched();
270 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 265 if (!pagevec_lookup(&pvec, mapping, index,
271 if (next == start) 266 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
267 if (index == start)
272 break; 268 break;
273 next = start; 269 index = start;
274 continue; 270 continue;
275 } 271 }
276 if (pvec.pages[0]->index > end) { 272 if (index == start && pvec.pages[0]->index > end) {
277 pagevec_release(&pvec); 273 pagevec_release(&pvec);
278 break; 274 break;
279 } 275 }
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping,
281 for (i = 0; i < pagevec_count(&pvec); i++) { 277 for (i = 0; i < pagevec_count(&pvec); i++) {
282 struct page *page = pvec.pages[i]; 278 struct page *page = pvec.pages[i];
283 279
284 if (page->index > end) 280 /* We rely upon deletion not changing page->index */
281 index = page->index;
282 if (index > end)
285 break; 283 break;
284
286 lock_page(page); 285 lock_page(page);
286 WARN_ON(page->index != index);
287 wait_on_page_writeback(page); 287 wait_on_page_writeback(page);
288 truncate_inode_page(mapping, page); 288 truncate_inode_page(mapping, page);
289 if (page->index > next)
290 next = page->index;
291 next++;
292 unlock_page(page); 289 unlock_page(page);
293 } 290 }
294 pagevec_release(&pvec); 291 pagevec_release(&pvec);
295 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++;
296 } 294 }
297 cleancache_flush_inode(mapping); 295 cleancache_flush_inode(mapping);
298} 296}
@@ -304,6 +302,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
304 * @lstart: offset from which to truncate 302 * @lstart: offset from which to truncate
305 * 303 *
306 * Called under (and serialised by) inode->i_mutex. 304 * Called under (and serialised by) inode->i_mutex.
305 *
306 * Note: When this function returns, there can be a page in the process of
307 * deletion (inside __delete_from_page_cache()) in the specified range. Thus
308 * mapping->nrpages can be non-zero when this function returns even after
309 * truncation of the whole mapping.
307 */ 310 */
308void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 311void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
309{ 312{
@@ -328,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
328 pgoff_t start, pgoff_t end) 331 pgoff_t start, pgoff_t end)
329{ 332{
330 struct pagevec pvec; 333 struct pagevec pvec;
331 pgoff_t next = start; 334 pgoff_t index = start;
332 unsigned long ret; 335 unsigned long ret;
333 unsigned long count = 0; 336 unsigned long count = 0;
334 int i; 337 int i;
335 338
336 pagevec_init(&pvec, 0); 339 pagevec_init(&pvec, 0);
337 while (next <= end && 340 while (index <= end && pagevec_lookup(&pvec, mapping, index,
338 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 341 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
339 mem_cgroup_uncharge_start(); 342 mem_cgroup_uncharge_start();
340 for (i = 0; i < pagevec_count(&pvec); i++) { 343 for (i = 0; i < pagevec_count(&pvec); i++) {
341 struct page *page = pvec.pages[i]; 344 struct page *page = pvec.pages[i];
342 pgoff_t index;
343 int lock_failed;
344
345 lock_failed = !trylock_page(page);
346 345
347 /* 346 /* We rely upon deletion not changing page->index */
348 * We really shouldn't be looking at the ->index of an
349 * unlocked page. But we're not allowed to lock these
350 * pages. So we rely upon nobody altering the ->index
351 * of this (pinned-by-us) page.
352 */
353 index = page->index; 347 index = page->index;
354 if (index > next) 348 if (index > end)
355 next = index; 349 break;
356 next++;
357 if (lock_failed)
358 continue;
359 350
351 if (!trylock_page(page))
352 continue;
353 WARN_ON(page->index != index);
360 ret = invalidate_inode_page(page); 354 ret = invalidate_inode_page(page);
361 unlock_page(page); 355 unlock_page(page);
362 /* 356 /*
@@ -366,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
366 if (!ret) 360 if (!ret)
367 deactivate_page(page); 361 deactivate_page(page);
368 count += ret; 362 count += ret;
369 if (next > end)
370 break;
371 } 363 }
372 pagevec_release(&pvec); 364 pagevec_release(&pvec);
373 mem_cgroup_uncharge_end(); 365 mem_cgroup_uncharge_end();
374 cond_resched(); 366 cond_resched();
367 index++;
375 } 368 }
376 return count; 369 return count;
377} 370}
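Both loops above now clamp the lookup count with min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, so each batch asks for at most PAGEVEC_SIZE pages and never for pages beyond end. A small userspace illustration of that arithmetic; the range in main() and the index += want stepping are simplifications, since the real loops resume after the last page actually found:

#include <stdio.h>

#define PAGEVEC_SIZE 14UL	/* the kernel's pagevec capacity */

int main(void)
{
	unsigned long start = 3, end = 40;	/* inclusive range of page indices */
	unsigned long index = start;

	while (index <= end) {
		unsigned long left = end - index;	/* pages beyond 'index' still wanted */
		unsigned long want = (left < PAGEVEC_SIZE - 1 ? left : PAGEVEC_SIZE - 1) + 1;

		printf("lookup from index %lu for up to %lu pages\n", index, want);
		index += want;
	}
	return 0;
}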
@@ -437,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
437 pgoff_t start, pgoff_t end) 430 pgoff_t start, pgoff_t end)
438{ 431{
439 struct pagevec pvec; 432 struct pagevec pvec;
440 pgoff_t next; 433 pgoff_t index;
441 int i; 434 int i;
442 int ret = 0; 435 int ret = 0;
443 int ret2 = 0; 436 int ret2 = 0;
444 int did_range_unmap = 0; 437 int did_range_unmap = 0;
445 int wrapped = 0;
446 438
447 cleancache_flush_inode(mapping); 439 cleancache_flush_inode(mapping);
448 pagevec_init(&pvec, 0); 440 pagevec_init(&pvec, 0);
449 next = start; 441 index = start;
450 while (next <= end && !wrapped && 442 while (index <= end && pagevec_lookup(&pvec, mapping, index,
451 pagevec_lookup(&pvec, mapping, next, 443 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
452 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
453 mem_cgroup_uncharge_start(); 444 mem_cgroup_uncharge_start();
454 for (i = 0; i < pagevec_count(&pvec); i++) { 445 for (i = 0; i < pagevec_count(&pvec); i++) {
455 struct page *page = pvec.pages[i]; 446 struct page *page = pvec.pages[i];
456 pgoff_t page_index; 447
448 /* We rely upon deletion not changing page->index */
449 index = page->index;
450 if (index > end)
451 break;
457 452
458 lock_page(page); 453 lock_page(page);
454 WARN_ON(page->index != index);
459 if (page->mapping != mapping) { 455 if (page->mapping != mapping) {
460 unlock_page(page); 456 unlock_page(page);
461 continue; 457 continue;
462 } 458 }
463 page_index = page->index;
464 next = page_index + 1;
465 if (next == 0)
466 wrapped = 1;
467 if (page_index > end) {
468 unlock_page(page);
469 break;
470 }
471 wait_on_page_writeback(page); 459 wait_on_page_writeback(page);
472 if (page_mapped(page)) { 460 if (page_mapped(page)) {
473 if (!did_range_unmap) { 461 if (!did_range_unmap) {
@@ -475,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
475 * Zap the rest of the file in one hit. 463 * Zap the rest of the file in one hit.
476 */ 464 */
477 unmap_mapping_range(mapping, 465 unmap_mapping_range(mapping,
478 (loff_t)page_index<<PAGE_CACHE_SHIFT, 466 (loff_t)index << PAGE_CACHE_SHIFT,
479 (loff_t)(end - page_index + 1) 467 (loff_t)(1 + end - index)
480 << PAGE_CACHE_SHIFT, 468 << PAGE_CACHE_SHIFT,
481 0); 469 0);
482 did_range_unmap = 1; 470 did_range_unmap = 1;
483 } else { 471 } else {
@@ -485,8 +473,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
485 * Just zap this page 473 * Just zap this page
486 */ 474 */
487 unmap_mapping_range(mapping, 475 unmap_mapping_range(mapping,
488 (loff_t)page_index<<PAGE_CACHE_SHIFT, 476 (loff_t)index << PAGE_CACHE_SHIFT,
489 PAGE_CACHE_SIZE, 0); 477 PAGE_CACHE_SIZE, 0);
490 } 478 }
491 } 479 }
492 BUG_ON(page_mapped(page)); 480 BUG_ON(page_mapped(page));
@@ -502,6 +490,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
502 pagevec_release(&pvec); 490 pagevec_release(&pvec);
503 mem_cgroup_uncharge_end(); 491 mem_cgroup_uncharge_end();
504 cond_resched(); 492 cond_resched();
493 index++;
505 } 494 }
506 cleancache_flush_inode(mapping); 495 cleancache_flush_inode(mapping);
507 return ret; 496 return ret;
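The mapped-page handling above is a "zap wide once, then narrow" strategy: the first mapped page triggers a single unmap_mapping_range() covering everything from that page to the end of the range, and any mapped page seen after that is unmapped individually. The byte arguments are simply page indices shifted up; for illustration, with 4096-byte pages:

	/* illustrative numbers only */
	pgoff_t index = 3, end = 7;
	loff_t zap_start = (loff_t)index << PAGE_CACHE_SHIFT;			/* 12288 */
	loff_t zap_len   = (loff_t)(1 + end - index) << PAGE_CACHE_SHIFT;	/* 20480 */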
@@ -526,8 +515,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
526/** 515/**
527 * truncate_pagecache - unmap and remove pagecache that has been truncated 516 * truncate_pagecache - unmap and remove pagecache that has been truncated
528 * @inode: inode 517 * @inode: inode
529 * @old: old file offset 518 * @oldsize: old file size
530 * @new: new file offset 519 * @newsize: new file size
531 * 520 *
532 * inode's new i_size must already be written before truncate_pagecache 521 * inode's new i_size must already be written before truncate_pagecache
533 * is called. 522 * is called.
@@ -539,9 +528,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
539 * situations such as writepage being called for a page that has already 528 * situations such as writepage being called for a page that has already
540 * had its underlying blocks deallocated. 529 * had its underlying blocks deallocated.
541 */ 530 */
542void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 531void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
543{ 532{
544 struct address_space *mapping = inode->i_mapping; 533 struct address_space *mapping = inode->i_mapping;
534 loff_t holebegin = round_up(newsize, PAGE_SIZE);
545 535
546 /* 536 /*
547 * unmap_mapping_range is called twice, first simply for 537 * unmap_mapping_range is called twice, first simply for
@@ -552,9 +542,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
552 * truncate_inode_pages finishes, hence the second 542 * truncate_inode_pages finishes, hence the second
553 * unmap_mapping_range call must be made for correctness. 543 * unmap_mapping_range call must be made for correctness.
554 */ 544 */
555 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 545 unmap_mapping_range(mapping, holebegin, 0, 1);
556 truncate_inode_pages(mapping, new); 546 truncate_inode_pages(mapping, newsize);
557 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 547 unmap_mapping_range(mapping, holebegin, 0, 1);
558} 548}
559EXPORT_SYMBOL(truncate_pagecache); 549EXPORT_SYMBOL(truncate_pagecache);
560 550
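The round_up() above spells out what the old "new + PAGE_SIZE - 1" relied on unmap_mapping_range() to do implicitly: the unmapped hole starts at the first page boundary at or after the new size, so a partially retained final page is never unmapped. For illustration, with 4096-byte pages:

	/* illustrative numbers only */
	loff_t newsize   = 5000;
	loff_t holebegin = round_up(newsize, PAGE_SIZE);	/* 8192 */
	/* an already aligned size, e.g. 8192, gives holebegin == 8192 */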
@@ -584,22 +574,47 @@ EXPORT_SYMBOL(truncate_setsize);
584/** 574/**
585 * vmtruncate - unmap mappings "freed" by truncate() syscall 575 * vmtruncate - unmap mappings "freed" by truncate() syscall
586 * @inode: inode of the file used 576 * @inode: inode of the file used
587 * @offset: file offset to start truncating 577 * @newsize: file offset to start truncating
588 * 578 *
589 * This function is deprecated and truncate_setsize or truncate_pagecache 579 * This function is deprecated and truncate_setsize or truncate_pagecache
590 * should be used instead, together with filesystem specific block truncation. 580 * should be used instead, together with filesystem specific block truncation.
591 */ 581 */
592int vmtruncate(struct inode *inode, loff_t offset) 582int vmtruncate(struct inode *inode, loff_t newsize)
593{ 583{
594 int error; 584 int error;
595 585
596 error = inode_newsize_ok(inode, offset); 586 error = inode_newsize_ok(inode, newsize);
597 if (error) 587 if (error)
598 return error; 588 return error;
599 589
600 truncate_setsize(inode, offset); 590 truncate_setsize(inode, newsize);
601 if (inode->i_op->truncate) 591 if (inode->i_op->truncate)
602 inode->i_op->truncate(inode); 592 inode->i_op->truncate(inode);
603 return 0; 593 return 0;
604} 594}
605EXPORT_SYMBOL(vmtruncate); 595EXPORT_SYMBOL(vmtruncate);
596
597int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
598{
599 struct address_space *mapping = inode->i_mapping;
600 loff_t holebegin = round_up(lstart, PAGE_SIZE);
601 loff_t holelen = 1 + lend - holebegin;
602
603 /*
604 * If the underlying filesystem is not going to provide
605 * a way to truncate a range of blocks (punch a hole) -
606 * we should return failure right now.
607 */
608 if (!inode->i_op->truncate_range)
609 return -ENOSYS;
610
611 mutex_lock(&inode->i_mutex);
612 inode_dio_wait(inode);
613 unmap_mapping_range(mapping, holebegin, holelen, 1);
614 inode->i_op->truncate_range(inode, lstart, lend);
615 /* unmap again to remove racily COWed private pages */
616 unmap_mapping_range(mapping, holebegin, holelen, 1);
617 mutex_unlock(&inode->i_mutex);
618
619 return 0;
620}
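In vmtruncate_range() above, lstart and lend are inclusive byte offsets: the hole to unmap starts at the first page boundary at or after lstart, and holelen = 1 + lend - holebegin runs through lend; the second unmap_mapping_range() pass then removes private COW copies created while the range was being truncated. A worked example, again assuming 4096-byte pages:

	/* illustrative numbers only */
	loff_t lstart    = 1000, lend = 9999;		/* inclusive range */
	loff_t holebegin = round_up(lstart, PAGE_SIZE);	/* 4096 */
	loff_t holelen   = 1 + lend - holebegin;	/* 5904 bytes */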
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a7..ab8494cde007 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -452,13 +452,6 @@ overflow:
452 return ERR_PTR(-EBUSY); 452 return ERR_PTR(-EBUSY);
453} 453}
454 454
455static void rcu_free_va(struct rcu_head *head)
456{
457 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
458
459 kfree(va);
460}
461
462static void __free_vmap_area(struct vmap_area *va) 455static void __free_vmap_area(struct vmap_area *va)
463{ 456{
464 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 457 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
491 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 484 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
492 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 485 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
493 486
494 call_rcu(&va->rcu_head, rcu_free_va); 487 kfree_rcu(va, rcu_head);
495} 488}
496 489
497/* 490/*
@@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
837 return vb; 830 return vb;
838} 831}
839 832
840static void rcu_free_vb(struct rcu_head *head)
841{
842 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
843
844 kfree(vb);
845}
846
847static void free_vmap_block(struct vmap_block *vb) 833static void free_vmap_block(struct vmap_block *vb)
848{ 834{
849 struct vmap_block *tmp; 835 struct vmap_block *tmp;
@@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb)
856 BUG_ON(tmp != vb); 842 BUG_ON(tmp != vb);
857 843
858 free_vmap_area_noflush(vb->va); 844 free_vmap_area_noflush(vb->va);
859 call_rcu(&vb->rcu_head, rcu_free_vb); 845 kfree_rcu(vb, rcu_head);
860} 846}
861 847
862static void purge_fragmented_blocks(int cpu) 848static void purge_fragmented_blocks(int cpu)
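Both vmalloc.c hunks above are the same mechanical conversion: when an RCU callback does nothing but container_of() plus kfree(), it can be replaced by kfree_rcu(), which takes the object pointer and the name of its embedded struct rcu_head field. Schematically, with a hypothetical struct foo:

	struct foo {
		/* payload ... */
		struct rcu_head rcu_head;
	};

	/* before: a callback exists only to free the object */
	static void foo_rcu_free(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu_head));
	}

	static void foo_free_old(struct foo *f)
	{
		call_rcu(&f->rcu_head, foo_rcu_free);
	}

	/* after: the same deferred free, without the helper */
	static void foo_free_new(struct foo *f)
	{
		kfree_rcu(f, rcu_head);
	}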
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9cc..febbc044e792 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
250 unsigned long long delta; 250 unsigned long long delta;
251 unsigned long total_scan; 251 unsigned long total_scan;
252 unsigned long max_pass; 252 unsigned long max_pass;
253 int shrink_ret = 0;
254 long nr;
255 long new_nr;
256 long batch_size = shrinker->batch ? shrinker->batch
257 : SHRINK_BATCH;
253 258
259 /*
260 * copy the current shrinker scan count into a local variable
261 * and zero it so that other concurrent shrinker invocations
262 * don't also do this scanning work.
263 */
264 do {
265 nr = shrinker->nr;
266 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
267
268 total_scan = nr;
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 269 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 270 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 271 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 272 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 273 total_scan += delta;
259 if (shrinker->nr < 0) { 274 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 275 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 276 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 277 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 278 total_scan = max_pass;
264 } 279 }
265 280
266 /* 281 /*
282 * We need to avoid excessive windup on filesystem shrinkers
283 * due to large numbers of GFP_NOFS allocations causing the
284 * shrinkers to return -1 all the time. This results in a large
285 * nr being built up so when a shrink that can do some work
286 * comes along it empties the entire cache due to nr >>>
287 * max_pass. This is bad for sustaining a working set in
288 * memory.
289 *
290 * Hence only allow the shrinker to scan the entire cache when
291 * a large delta change is calculated directly.
292 */
293 if (delta < max_pass / 4)
294 total_scan = min(total_scan, max_pass / 2);
295
296 /*
267 * Avoid risking looping forever due to too large nr value: 297 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 298 * never try to free more than twice the estimate number of
269 * freeable entries. 299 * freeable entries.
270 */ 300 */
271 if (shrinker->nr > max_pass * 2) 301 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 302 total_scan = max_pass * 2;
273 303
274 total_scan = shrinker->nr; 304 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 305 nr_pages_scanned, lru_pages,
306 max_pass, delta, total_scan);
276 307
277 while (total_scan >= SHRINK_BATCH) { 308 while (total_scan >= batch_size) {
278 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 309 int nr_before;
281 310
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 311 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
283 shrink_ret = do_shrinker_shrink(shrinker, shrink, 312 shrink_ret = do_shrinker_shrink(shrinker, shrink,
284 this_scan); 313 batch_size);
285 if (shrink_ret == -1) 314 if (shrink_ret == -1)
286 break; 315 break;
287 if (shrink_ret < nr_before) 316 if (shrink_ret < nr_before)
288 ret += nr_before - shrink_ret; 317 ret += nr_before - shrink_ret;
289 count_vm_events(SLABS_SCANNED, this_scan); 318 count_vm_events(SLABS_SCANNED, batch_size);
290 total_scan -= this_scan; 319 total_scan -= batch_size;
291 320
292 cond_resched(); 321 cond_resched();
293 } 322 }
294 323
295 shrinker->nr += total_scan; 324 /*
325 * move the unused scan count back into the shrinker in a
326 * manner that handles concurrent updates. If we exhausted the
327 * scan, there is no need to do an update.
328 */
329 do {
330 nr = shrinker->nr;
331 new_nr = total_scan + nr;
332 if (total_scan <= 0)
333 break;
334 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
335
336 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 337 }
297 up_read(&shrinker_rwsem); 338 up_read(&shrinker_rwsem);
298out: 339out:
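The shrink_slab() rework above combines three things: claim the pending scan count atomically so concurrent callers do not repeat each other's work, clamp the total so GFP_NOFS-induced -1 returns cannot wind nr up far beyond max_pass, and process the work in shrinker->batch sized chunks. The claim-and-return part is a standard cmpxchg() pattern; stripped to its skeleton (the scan work itself elided):

	long nr, new_nr, total_scan;

	/* take the pending count for ourselves, leaving zero behind */
	do {
		nr = shrinker->nr;
	} while (cmpxchg(&shrinker->nr, nr, 0) != nr);

	total_scan = nr + delta;	/* add this call's share of the work */

	while (total_scan >= batch_size) {
		/* ... scan one batch_size chunk of objects ... */
		total_scan -= batch_size;
	}

	/* hand back any unscanned remainder without losing concurrent updates */
	do {
		nr = shrinker->nr;
		new_nr = total_scan + nr;
		if (total_scan <= 0)
			break;
	} while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);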
@@ -1124,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1124 nr_lumpy_dirty++; 1165 nr_lumpy_dirty++;
1125 scan++; 1166 scan++;
1126 } else { 1167 } else {
1127 /* the page is freed already. */ 1168 /*
1128 if (!page_count(cursor_page)) 1169 * Check if the page is freed already.
1170 *
1171 * We can't use page_count() as that
1172 * requires compound_head and we don't
1173 * have a pin on the page here. If a
1174 * page is tail, we may or may not
1175 * have isolated the head, so assume
1176 * it's not free; it'd be tricky to
1177 * track the head status without a
1178 * page pin.
1179 */
1180 if (!PageTail(cursor_page) &&
1181 !atomic_read(&cursor_page->_count))
1129 continue; 1182 continue;
1130 break; 1183 break;
1131 } 1184 }
@@ -1983,14 +2036,13 @@ restart:
1983 * If a zone is deemed to be full of pinned pages then just give it a light 2036 * If a zone is deemed to be full of pinned pages then just give it a light
1984 * scan then give up on it. 2037 * scan then give up on it.
1985 */ 2038 */
1986static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 2039static void shrink_zones(int priority, struct zonelist *zonelist,
1987 struct scan_control *sc) 2040 struct scan_control *sc)
1988{ 2041{
1989 struct zoneref *z; 2042 struct zoneref *z;
1990 struct zone *zone; 2043 struct zone *zone;
1991 unsigned long nr_soft_reclaimed; 2044 unsigned long nr_soft_reclaimed;
1992 unsigned long nr_soft_scanned; 2045 unsigned long nr_soft_scanned;
1993 unsigned long total_scanned = 0;
1994 2046
1995 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2047 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1996 gfp_zone(sc->gfp_mask), sc->nodemask) { 2048 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2005,19 +2057,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
2005 continue; 2057 continue;
2006 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2058 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2059 continue; /* Let kswapd poll it */
2060 /*
2061 * This steals pages from memory cgroups over softlimit
2062 * and returns the number of reclaimed pages and
2063 * scanned pages. This works for global memory pressure
2064 * and balancing, not for a memcg's limit.
2065 */
2066 nr_soft_scanned = 0;
2067 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2068 sc->order, sc->gfp_mask,
2069 &nr_soft_scanned);
2070 sc->nr_reclaimed += nr_soft_reclaimed;
2071 sc->nr_scanned += nr_soft_scanned;
2072 /* need some check to avoid more shrink_zone() */
2008 } 2073 }
2009 2074
2010 nr_soft_scanned = 0;
2011 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2012 sc->order, sc->gfp_mask,
2013 &nr_soft_scanned);
2014 sc->nr_reclaimed += nr_soft_reclaimed;
2015 total_scanned += nr_soft_scanned;
2016
2017 shrink_zone(priority, zone, sc); 2075 shrink_zone(priority, zone, sc);
2018 } 2076 }
2019
2020 return total_scanned;
2021} 2077}
2022 2078
2023static bool zone_reclaimable(struct zone *zone) 2079static bool zone_reclaimable(struct zone *zone)
@@ -2081,8 +2137,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2081 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2137 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2082 sc->nr_scanned = 0; 2138 sc->nr_scanned = 0;
2083 if (!priority) 2139 if (!priority)
2084 disable_swap_token(); 2140 disable_swap_token(sc->mem_cgroup);
2085 total_scanned += shrink_zones(priority, zonelist, sc); 2141 shrink_zones(priority, zonelist, sc);
2086 /* 2142 /*
2087 * Don't shrink slabs when reclaiming memory from 2143 * Don't shrink slabs when reclaiming memory from
2088 * over limit cgroups 2144 * over limit cgroups
@@ -2295,7 +2351,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2295 for (i = 0; i <= classzone_idx; i++) 2351 for (i = 0; i <= classzone_idx; i++)
2296 present_pages += pgdat->node_zones[i].present_pages; 2352 present_pages += pgdat->node_zones[i].present_pages;
2297 2353
2298 return balanced_pages > (present_pages >> 2); 2354 /* A special case here: if the zone has no pages, we consider it balanced */
2355 return balanced_pages >= (present_pages >> 2);
2299} 2356}
2300 2357
2301/* is kswapd sleeping prematurely? */ 2358/* is kswapd sleeping prematurely? */
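The pgdat_balanced() change above keeps the same 25% rule but tolerates the degenerate case: a node counts as balanced once at least a quarter of the present pages in the zones up to classzone_idx belong to balanced zones, and with '>=' a node whose relevant zones hold no pages at all passes trivially (0 >= 0). For illustration:

	/* illustrative numbers only */
	unsigned long present_pages = 1000000;
	unsigned long quarter = present_pages >> 2;	/* 250000 */
	/* balanced_pages >= 250000  -> balanced
	 * present_pages == 0        -> 0 >= 0, balanced by definition */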
@@ -2311,7 +2368,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2311 return true; 2368 return true;
2312 2369
2313 /* Check the watermark levels */ 2370 /* Check the watermark levels */
2314 for (i = 0; i < pgdat->nr_zones; i++) { 2371 for (i = 0; i <= classzone_idx; i++) {
2315 struct zone *zone = pgdat->node_zones + i; 2372 struct zone *zone = pgdat->node_zones + i;
2316 2373
2317 if (!populated_zone(zone)) 2374 if (!populated_zone(zone))
@@ -2329,7 +2386,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2329 } 2386 }
2330 2387
2331 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 2388 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2332 classzone_idx, 0)) 2389 i, 0))
2333 all_zones_ok = false; 2390 all_zones_ok = false;
2334 else 2391 else
2335 balanced += zone->present_pages; 2392 balanced += zone->present_pages;
@@ -2407,7 +2464,7 @@ loop_again:
2407 2464
2408 /* The swap token gets in the way of swapout... */ 2465 /* The swap token gets in the way of swapout... */
2409 if (!priority) 2466 if (!priority)
2410 disable_swap_token(); 2467 disable_swap_token(NULL);
2411 2468
2412 all_zones_ok = 1; 2469 all_zones_ok = 1;
2413 balanced = 0; 2470 balanced = 0;
@@ -2436,7 +2493,6 @@ loop_again:
2436 if (!zone_watermark_ok_safe(zone, order, 2493 if (!zone_watermark_ok_safe(zone, order,
2437 high_wmark_pages(zone), 0, 0)) { 2494 high_wmark_pages(zone), 0, 0)) {
2438 end_zone = i; 2495 end_zone = i;
2439 *classzone_idx = i;
2440 break; 2496 break;
2441 } 2497 }
2442 } 2498 }
@@ -2495,18 +2551,18 @@ loop_again:
2495 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2551 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2496 if (!zone_watermark_ok_safe(zone, order, 2552 if (!zone_watermark_ok_safe(zone, order,
2497 high_wmark_pages(zone) + balance_gap, 2553 high_wmark_pages(zone) + balance_gap,
2498 end_zone, 0)) 2554 end_zone, 0)) {
2499 shrink_zone(priority, zone, &sc); 2555 shrink_zone(priority, zone, &sc);
2500 reclaim_state->reclaimed_slab = 0;
2501 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2502 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2503 total_scanned += sc.nr_scanned;
2504 2556
2505 if (zone->all_unreclaimable) 2557 reclaim_state->reclaimed_slab = 0;
2506 continue; 2558 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2507 if (nr_slab == 0 && 2559 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2508 !zone_reclaimable(zone)) 2560 total_scanned += sc.nr_scanned;
2509 zone->all_unreclaimable = 1; 2561
2562 if (nr_slab == 0 && !zone_reclaimable(zone))
2563 zone->all_unreclaimable = 1;
2564 }
2565
2510 /* 2566 /*
2511 * If we've done a decent amount of scanning and 2567 * If we've done a decent amount of scanning and
2512 * the reclaim ratio is low, start doing writepage 2568 * the reclaim ratio is low, start doing writepage
@@ -2516,6 +2572,12 @@ loop_again:
2516 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2572 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2517 sc.may_writepage = 1; 2573 sc.may_writepage = 1;
2518 2574
2575 if (zone->all_unreclaimable) {
2576 if (end_zone && end_zone == i)
2577 end_zone--;
2578 continue;
2579 }
2580
2519 if (!zone_watermark_ok_safe(zone, order, 2581 if (!zone_watermark_ok_safe(zone, order,
2520 high_wmark_pages(zone), end_zone, 0)) { 2582 high_wmark_pages(zone), end_zone, 0)) {
2521 all_zones_ok = 0; 2583 all_zones_ok = 0;
@@ -2694,8 +2756,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2694 */ 2756 */
2695static int kswapd(void *p) 2757static int kswapd(void *p)
2696{ 2758{
2697 unsigned long order; 2759 unsigned long order, new_order;
2698 int classzone_idx; 2760 int classzone_idx, new_classzone_idx;
2699 pg_data_t *pgdat = (pg_data_t*)p; 2761 pg_data_t *pgdat = (pg_data_t*)p;
2700 struct task_struct *tsk = current; 2762 struct task_struct *tsk = current;
2701 2763
@@ -2725,17 +2787,23 @@ static int kswapd(void *p)
2725 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2787 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2726 set_freezable(); 2788 set_freezable();
2727 2789
2728 order = 0; 2790 order = new_order = 0;
2729 classzone_idx = MAX_NR_ZONES - 1; 2791 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2730 for ( ; ; ) { 2792 for ( ; ; ) {
2731 unsigned long new_order;
2732 int new_classzone_idx;
2733 int ret; 2793 int ret;
2734 2794
2735 new_order = pgdat->kswapd_max_order; 2795 /*
2736 new_classzone_idx = pgdat->classzone_idx; 2796 * If the last balance_pgdat was unsuccessful it's unlikely a
2737 pgdat->kswapd_max_order = 0; 2797 * new request of a similar or harder type will succeed soon
2738 pgdat->classzone_idx = MAX_NR_ZONES - 1; 2798 * so consider going to sleep on the basis we reclaimed at
2799 */
2800 if (classzone_idx >= new_classzone_idx && order == new_order) {
2801 new_order = pgdat->kswapd_max_order;
2802 new_classzone_idx = pgdat->classzone_idx;
2803 pgdat->kswapd_max_order = 0;
2804 pgdat->classzone_idx = pgdat->nr_zones - 1;
2805 }
2806
2739 if (order < new_order || classzone_idx > new_classzone_idx) { 2807 if (order < new_order || classzone_idx > new_classzone_idx) {
2740 /* 2808 /*
2741 * Don't sleep if someone wants a larger 'order' 2809 * Don't sleep if someone wants a larger 'order'
@@ -2748,7 +2816,7 @@ static int kswapd(void *p)
2748 order = pgdat->kswapd_max_order; 2816 order = pgdat->kswapd_max_order;
2749 classzone_idx = pgdat->classzone_idx; 2817 classzone_idx = pgdat->classzone_idx;
2750 pgdat->kswapd_max_order = 0; 2818 pgdat->kswapd_max_order = 0;
2751 pgdat->classzone_idx = MAX_NR_ZONES - 1; 2819 pgdat->classzone_idx = pgdat->nr_zones - 1;
2752 } 2820 }
2753 2821
2754 ret = try_to_freeze(); 2822 ret = try_to_freeze();
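The kswapd() changes above make the daemon hold on to its previous order/classzone_idx request and pull a fresh one from pgdat only when the last balance_pgdat() call met what was asked; a request arriving right after a failed attempt is deferred so kswapd can sleep rather than immediately retry work that just failed. Reduced to its control flow, the idea is roughly (a sketch, not the exact code):

	order = new_order = 0;
	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;

	for ( ; ; ) {
		/* accept a new request only if the previous one was satisfied */
		if (classzone_idx >= new_classzone_idx && order == new_order) {
			new_order = pgdat->kswapd_max_order;
			new_classzone_idx = pgdat->classzone_idx;
			pgdat->kswapd_max_order = 0;
			pgdat->classzone_idx = pgdat->nr_zones - 1;
		}

		if (order < new_order || classzone_idx > new_classzone_idx) {
			/* someone wants a harder target: skip sleeping */
			order = new_order;
			classzone_idx = new_classzone_idx;
		} else {
			/* consider sleeping, then re-read the request */
		}

		/* ... balance_pgdat(pgdat, order, classzone_idx) ... */
	}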