Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 2
-rw-r--r-- | mm/backing-dev.c | 3
-rw-r--r-- | mm/compaction.c | 76
-rw-r--r-- | mm/dmapool.c | 2
-rw-r--r-- | mm/filemap.c | 19
-rw-r--r-- | mm/huge_memory.c | 11
-rw-r--r-- | mm/hugetlb.c | 49
-rw-r--r-- | mm/ksm.c | 6
-rw-r--r-- | mm/madvise.c | 2
-rw-r--r-- | mm/memblock.c | 8
-rw-r--r-- | mm/memcontrol.c | 222
-rw-r--r-- | mm/memory-failure.c | 25
-rw-r--r-- | mm/memory.c | 158
-rw-r--r-- | mm/memory_hotplug.c | 78
-rw-r--r-- | mm/migrate.c | 2
-rw-r--r-- | mm/mmap.c | 46
-rw-r--r-- | mm/nommu.c | 46
-rw-r--r-- | mm/oom_kill.c | 5
-rw-r--r-- | mm/page-writeback.c | 11
-rw-r--r-- | mm/page_alloc.c | 112
-rw-r--r-- | mm/page_cgroup.c | 81
-rw-r--r-- | mm/pagewalk.c | 49
-rw-r--r-- | mm/rmap.c | 118
-rw-r--r-- | mm/shmem.c | 626
-rw-r--r-- | mm/slab.c | 26
-rw-r--r-- | mm/slob.c | 6
-rw-r--r-- | mm/slub.c | 119
-rw-r--r-- | mm/sparse.c | 2
-rw-r--r-- | mm/swapfile.c | 31
-rw-r--r-- | mm/thrash.c | 120
-rw-r--r-- | mm/truncate.c | 163
-rw-r--r-- | mm/vmalloc.c | 18
-rw-r--r-- | mm/vmscan.c | 180
33 files changed, 1570 insertions, 852 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c8..f2f1ca19ed53 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -356,7 +356,7 @@ config CLEANCACHE | |||
356 | for clean pages that the kernel's pageframe replacement algorithm | 356 | for clean pages that the kernel's pageframe replacement algorithm |
357 | (PFRA) would like to keep around, but can't since there isn't enough | 357 | (PFRA) would like to keep around, but can't since there isn't enough |
358 | memory. So when the PFRA "evicts" a page, it first attempts to use | 358 | memory. So when the PFRA "evicts" a page, it first attempts to use |
359 | cleancacne code to put the data contained in that page into | 359 | cleancache code to put the data contained in that page into |
360 | "transcendent memory", memory that is not directly accessible or | 360 | "transcendent memory", memory that is not directly accessible or |
361 | addressable by the kernel and is of unknown and possibly | 361 | addressable by the kernel and is of unknown and possibly |
362 | time-varying size. And when a cleancache-enabled | 362 | time-varying size. And when a cleancache-enabled |
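A note on the help text above: it describes the cleancache put/get flow only in prose. The toy program below is a minimal sketch of that flow under invented names; the tiny fixed-size store, tmem_put() and tmem_get() are illustrative stand-ins for "transcendent memory", not the real cleancache API.

#include <stdio.h>
#include <string.h>

/* Toy "ephemeral store" standing in for transcendent memory.  All names
 * and sizes are invented for illustration; this is not the cleancache API. */
#define STORE_SLOTS 4
#define PAGE_SZ 16

struct slot { int used; long key; char data[PAGE_SZ]; };
static struct slot store[STORE_SLOTS];

/* Eviction path: offer a clean page; the store may silently drop it. */
static void tmem_put(long key, const char *data)
{
	int i;

	for (i = 0; i < STORE_SLOTS; i++) {
		if (!store[i].used) {
			store[i].used = 1;
			store[i].key = key;
			memcpy(store[i].data, data, PAGE_SZ);
			return;
		}
	}
	/* No room: drop silently, which the help text says is allowed. */
}

/* Read-miss path: a hit avoids disk I/O, a miss falls back to the filesystem. */
static int tmem_get(long key, char *data)
{
	int i;

	for (i = 0; i < STORE_SLOTS; i++) {
		if (store[i].used && store[i].key == key) {
			memcpy(data, store[i].data, PAGE_SZ);
			store[i].used = 0;	/* this toy consumes the entry on a hit */
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	char page[PAGE_SZ] = "clean page";
	char back[PAGE_SZ];

	tmem_put(42, page);			/* reclaim evicts a clean page      */
	if (tmem_get(42, back) == 0)		/* a later read miss tries the store */
		printf("refilled from store: %s\n", back);
	else
		printf("miss, read from disk\n");
	return 0;
}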
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index ddd0345e2e6d..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -513,7 +513,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) | |||
513 | list_del_rcu(&bdi->bdi_list); | 513 | list_del_rcu(&bdi->bdi_list); |
514 | spin_unlock_bh(&bdi_lock); | 514 | spin_unlock_bh(&bdi_lock); |
515 | 515 | ||
516 | synchronize_rcu(); | 516 | synchronize_rcu_expedited(); |
517 | } | 517 | } |
518 | 518 | ||
519 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 519 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
@@ -614,6 +614,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) | |||
614 | void bdi_unregister(struct backing_dev_info *bdi) | 614 | void bdi_unregister(struct backing_dev_info *bdi) |
615 | { | 615 | { |
616 | if (bdi->dev) { | 616 | if (bdi->dev) { |
617 | bdi_set_min_ratio(bdi, 0); | ||
617 | trace_writeback_bdi_unregister(bdi); | 618 | trace_writeback_bdi_unregister(bdi); |
618 | bdi_prune_sb(bdi); | 619 | bdi_prune_sb(bdi); |
619 | del_timer_sync(&bdi->wb.wakeup_timer); | 620 | del_timer_sync(&bdi->wb.wakeup_timer); |
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9e..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone, | |||
144 | int nr_freepages = cc->nr_freepages; | 144 | int nr_freepages = cc->nr_freepages; |
145 | struct list_head *freelist = &cc->freepages; | 145 | struct list_head *freelist = &cc->freepages; |
146 | 146 | ||
147 | /* | ||
148 | * Initialise the free scanner. The starting point is where we last | ||
149 | * scanned from (or the end of the zone if starting). The low point | ||
150 | * is the end of the pageblock the migration scanner is using. | ||
151 | */ | ||
147 | pfn = cc->free_pfn; | 152 | pfn = cc->free_pfn; |
148 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | 153 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; |
149 | high_pfn = low_pfn; | 154 | |
155 | /* | ||
156 | * Take care that if the migration scanner is at the end of the zone | ||
157 | * that the free scanner does not accidentally move to the next zone | ||
158 | * in the next isolation cycle. | ||
159 | */ | ||
160 | high_pfn = min(low_pfn, pfn); | ||
150 | 161 | ||
151 | /* | 162 | /* |
152 | * Isolate free pages until enough are available to migrate the | 163 | * Isolate free pages until enough are available to migrate the |
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone) | |||
240 | return isolated > (inactive + active) / 2; | 251 | return isolated > (inactive + active) / 2; |
241 | } | 252 | } |
242 | 253 | ||
254 | /* possible outcome of isolate_migratepages */ | ||
255 | typedef enum { | ||
256 | ISOLATE_ABORT, /* Abort compaction now */ | ||
257 | ISOLATE_NONE, /* No pages isolated, continue scanning */ | ||
258 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ | ||
259 | } isolate_migrate_t; | ||
260 | |||
243 | /* | 261 | /* |
244 | * Isolate all pages that can be migrated from the block pointed to by | 262 | * Isolate all pages that can be migrated from the block pointed to by |
245 | * the migrate scanner within compact_control. | 263 | * the migrate scanner within compact_control. |
246 | */ | 264 | */ |
247 | static unsigned long isolate_migratepages(struct zone *zone, | 265 | static isolate_migrate_t isolate_migratepages(struct zone *zone, |
248 | struct compact_control *cc) | 266 | struct compact_control *cc) |
249 | { | 267 | { |
250 | unsigned long low_pfn, end_pfn; | 268 | unsigned long low_pfn, end_pfn; |
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
261 | /* Do not cross the free scanner or scan within a memory hole */ | 279 | /* Do not cross the free scanner or scan within a memory hole */ |
262 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 280 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { |
263 | cc->migrate_pfn = end_pfn; | 281 | cc->migrate_pfn = end_pfn; |
264 | return 0; | 282 | return ISOLATE_NONE; |
265 | } | 283 | } |
266 | 284 | ||
267 | /* | 285 | /* |
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
270 | * delay for some time until fewer pages are isolated | 288 | * delay for some time until fewer pages are isolated |
271 | */ | 289 | */ |
272 | while (unlikely(too_many_isolated(zone))) { | 290 | while (unlikely(too_many_isolated(zone))) { |
291 | /* async migration should just abort */ | ||
292 | if (!cc->sync) | ||
293 | return ISOLATE_ABORT; | ||
294 | |||
273 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 295 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
274 | 296 | ||
275 | if (fatal_signal_pending(current)) | 297 | if (fatal_signal_pending(current)) |
276 | return 0; | 298 | return ISOLATE_ABORT; |
277 | } | 299 | } |
278 | 300 | ||
279 | /* Time to isolate some pages for migration */ | 301 | /* Time to isolate some pages for migration */ |
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
358 | 380 | ||
359 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 381 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
360 | 382 | ||
361 | return cc->nr_migratepages; | 383 | return ISOLATE_SUCCESS; |
362 | } | 384 | } |
363 | 385 | ||
364 | /* | 386 | /* |
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone, | |||
420 | if (cc->free_pfn <= cc->migrate_pfn) | 442 | if (cc->free_pfn <= cc->migrate_pfn) |
421 | return COMPACT_COMPLETE; | 443 | return COMPACT_COMPLETE; |
422 | 444 | ||
423 | /* Compaction run is not finished if the watermark is not met */ | ||
424 | watermark = low_wmark_pages(zone); | ||
425 | watermark += (1 << cc->order); | ||
426 | |||
427 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | ||
428 | return COMPACT_CONTINUE; | ||
429 | |||
430 | /* | 445 | /* |
431 | * order == -1 is expected when compacting via | 446 | * order == -1 is expected when compacting via |
432 | * /proc/sys/vm/compact_memory | 447 | * /proc/sys/vm/compact_memory |
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone, | |||
434 | if (cc->order == -1) | 449 | if (cc->order == -1) |
435 | return COMPACT_CONTINUE; | 450 | return COMPACT_CONTINUE; |
436 | 451 | ||
452 | /* Compaction run is not finished if the watermark is not met */ | ||
453 | watermark = low_wmark_pages(zone); | ||
454 | watermark += (1 << cc->order); | ||
455 | |||
456 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | ||
457 | return COMPACT_CONTINUE; | ||
458 | |||
437 | /* Direct compactor: Is a suitable page free? */ | 459 | /* Direct compactor: Is a suitable page free? */ |
438 | for (order = cc->order; order < MAX_ORDER; order++) { | 460 | for (order = cc->order; order < MAX_ORDER; order++) { |
439 | /* Job done if page is free of the right migratetype */ | 461 | /* Job done if page is free of the right migratetype */ |
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
461 | unsigned long watermark; | 483 | unsigned long watermark; |
462 | 484 | ||
463 | /* | 485 | /* |
486 | * order == -1 is expected when compacting via | ||
487 | * /proc/sys/vm/compact_memory | ||
488 | */ | ||
489 | if (order == -1) | ||
490 | return COMPACT_CONTINUE; | ||
491 | |||
492 | /* | ||
464 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | 493 | * Watermarks for order-0 must be met for compaction. Note the 2UL. |
465 | * This is because during migration, copies of pages need to be | 494 | * This is because during migration, copies of pages need to be |
466 | * allocated and for a short time, the footprint is higher | 495 | * allocated and for a short time, the footprint is higher |
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
470 | return COMPACT_SKIPPED; | 499 | return COMPACT_SKIPPED; |
471 | 500 | ||
472 | /* | 501 | /* |
473 | * order == -1 is expected when compacting via | ||
474 | * /proc/sys/vm/compact_memory | ||
475 | */ | ||
476 | if (order == -1) | ||
477 | return COMPACT_CONTINUE; | ||
478 | |||
479 | /* | ||
480 | * fragmentation index determines if allocation failures are due to | 502 | * fragmentation index determines if allocation failures are due to |
481 | * low memory or external fragmentation | 503 | * low memory or external fragmentation |
482 | * | 504 | * |
483 | * index of -1 implies allocations might succeed dependingon watermarks | 505 | * index of -1000 implies allocations might succeed depending on |
506 | * watermarks | ||
484 | * index towards 0 implies failure is due to lack of memory | 507 | * index towards 0 implies failure is due to lack of memory |
485 | * index towards 1000 implies failure is due to fragmentation | 508 | * index towards 1000 implies failure is due to fragmentation |
486 | * | 509 | * |
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
490 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 513 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
491 | return COMPACT_SKIPPED; | 514 | return COMPACT_SKIPPED; |
492 | 515 | ||
493 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) | 516 | if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, |
517 | 0, 0)) | ||
494 | return COMPACT_PARTIAL; | 518 | return COMPACT_PARTIAL; |
495 | 519 | ||
496 | return COMPACT_CONTINUE; | 520 | return COMPACT_CONTINUE; |
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
522 | unsigned long nr_migrate, nr_remaining; | 546 | unsigned long nr_migrate, nr_remaining; |
523 | int err; | 547 | int err; |
524 | 548 | ||
525 | if (!isolate_migratepages(zone, cc)) | 549 | switch (isolate_migratepages(zone, cc)) { |
550 | case ISOLATE_ABORT: | ||
551 | ret = COMPACT_PARTIAL; | ||
552 | goto out; | ||
553 | case ISOLATE_NONE: | ||
526 | continue; | 554 | continue; |
555 | case ISOLATE_SUCCESS: | ||
556 | ; | ||
557 | } | ||
527 | 558 | ||
528 | nr_migrate = cc->nr_migratepages; | 559 | nr_migrate = cc->nr_migratepages; |
529 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 560 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
547 | 578 | ||
548 | } | 579 | } |
549 | 580 | ||
581 | out: | ||
550 | /* Release free pages and check accounting */ | 582 | /* Release free pages and check accounting */ |
551 | cc->nr_freepages -= release_freepages(&cc->freepages); | 583 | cc->nr_freepages -= release_freepages(&cc->freepages); |
552 | VM_BUG_ON(cc->nr_freepages != 0); | 584 | VM_BUG_ON(cc->nr_freepages != 0); |
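A note on the high_pfn = min(low_pfn, pfn) hunk above: when the migration scanner has reached the end of the zone, low_pfn (migrate_pfn plus one pageblock) already points past the zone, so clamping against the free scanner's own position keeps the free scanner from wandering into the next zone on the next cycle. A tiny arithmetic sketch with made-up pfn values, not kernel code:

#include <stdio.h>

/* Made-up pfn values, purely to show the effect of the min() clamp. */
int main(void)
{
	unsigned long zone_end_pfn = 0x40000;		/* one past the last pfn of the zone   */
	unsigned long free_pfn = 0x3f800;		/* where the free scanner last stopped */
	unsigned long migrate_pfn = zone_end_pfn;	/* migration scanner hit the zone end  */
	unsigned long pageblock_nr_pages = 512;

	unsigned long low_pfn = migrate_pfn + pageblock_nr_pages;
	unsigned long old_high = low_pfn;				  /* pre-patch */
	unsigned long new_high = low_pfn < free_pfn ? low_pfn : free_pfn; /* min()    */

	printf("old high_pfn %#lx lies beyond the zone end %#lx\n", old_high, zone_end_pfn);
	printf("new high_pfn %#lx stays at the free scanner position\n", new_high);
	return 0;
}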
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519a..fbb58e346888 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool) | |||
500 | { | 500 | { |
501 | struct device *dev = pool->dev; | 501 | struct device *dev = pool->dev; |
502 | 502 | ||
503 | dma_pool_destroy(pool); | ||
504 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); | 503 | WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); |
504 | dma_pool_destroy(pool); | ||
505 | } | 505 | } |
506 | EXPORT_SYMBOL(dmam_pool_destroy); | 506 | EXPORT_SYMBOL(dmam_pool_destroy); |
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e492c3dd6f8..867d40222ec7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,9 +78,6 @@ | |||
78 | * ->i_mutex (generic_file_buffered_write) | 78 | * ->i_mutex (generic_file_buffered_write) |
79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) | 79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | * | 80 | * |
81 | * ->i_mutex | ||
82 | * ->i_alloc_sem (various) | ||
83 | * | ||
84 | * bdi->wb.list_lock | 81 | * bdi->wb.list_lock |
85 | * sb_lock (fs/fs-writeback.c) | 82 | * sb_lock (fs/fs-writeback.c) |
86 | * ->mapping->tree_lock (__sync_single_inode) | 83 | * ->mapping->tree_lock (__sync_single_inode) |
@@ -131,6 +128,7 @@ void __delete_from_page_cache(struct page *page) | |||
131 | 128 | ||
132 | radix_tree_delete(&mapping->page_tree, page->index); | 129 | radix_tree_delete(&mapping->page_tree, page->index); |
133 | page->mapping = NULL; | 130 | page->mapping = NULL; |
131 | /* Leave page->index set: truncation lookup relies upon it */ | ||
134 | mapping->nrpages--; | 132 | mapping->nrpages--; |
135 | __dec_zone_page_state(page, NR_FILE_PAGES); | 133 | __dec_zone_page_state(page, NR_FILE_PAGES); |
136 | if (PageSwapBacked(page)) | 134 | if (PageSwapBacked(page)) |
@@ -486,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
486 | spin_unlock_irq(&mapping->tree_lock); | 484 | spin_unlock_irq(&mapping->tree_lock); |
487 | } else { | 485 | } else { |
488 | page->mapping = NULL; | 486 | page->mapping = NULL; |
487 | /* Leave page->index set: truncation relies upon it */ | ||
489 | spin_unlock_irq(&mapping->tree_lock); | 488 | spin_unlock_irq(&mapping->tree_lock); |
490 | mem_cgroup_uncharge_cache_page(page); | 489 | mem_cgroup_uncharge_cache_page(page); |
491 | page_cache_release(page); | 490 | page_cache_release(page); |
@@ -1795,7 +1794,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); | |||
1795 | 1794 | ||
1796 | static struct page *__read_cache_page(struct address_space *mapping, | 1795 | static struct page *__read_cache_page(struct address_space *mapping, |
1797 | pgoff_t index, | 1796 | pgoff_t index, |
1798 | int (*filler)(void *,struct page*), | 1797 | int (*filler)(void *, struct page *), |
1799 | void *data, | 1798 | void *data, |
1800 | gfp_t gfp) | 1799 | gfp_t gfp) |
1801 | { | 1800 | { |
@@ -1826,7 +1825,7 @@ repeat: | |||
1826 | 1825 | ||
1827 | static struct page *do_read_cache_page(struct address_space *mapping, | 1826 | static struct page *do_read_cache_page(struct address_space *mapping, |
1828 | pgoff_t index, | 1827 | pgoff_t index, |
1829 | int (*filler)(void *,struct page*), | 1828 | int (*filler)(void *, struct page *), |
1830 | void *data, | 1829 | void *data, |
1831 | gfp_t gfp) | 1830 | gfp_t gfp) |
1832 | 1831 | ||
@@ -1866,7 +1865,7 @@ out: | |||
1866 | * @mapping: the page's address_space | 1865 | * @mapping: the page's address_space |
1867 | * @index: the page index | 1866 | * @index: the page index |
1868 | * @filler: function to perform the read | 1867 | * @filler: function to perform the read |
1869 | * @data: destination for read data | 1868 | * @data: first arg to filler(data, page) function, often left as NULL |
1870 | * | 1869 | * |
1871 | * Same as read_cache_page, but don't wait for page to become unlocked | 1870 | * Same as read_cache_page, but don't wait for page to become unlocked |
1872 | * after submitting it to the filler. | 1871 | * after submitting it to the filler. |
@@ -1878,7 +1877,7 @@ out: | |||
1878 | */ | 1877 | */ |
1879 | struct page *read_cache_page_async(struct address_space *mapping, | 1878 | struct page *read_cache_page_async(struct address_space *mapping, |
1880 | pgoff_t index, | 1879 | pgoff_t index, |
1881 | int (*filler)(void *,struct page*), | 1880 | int (*filler)(void *, struct page *), |
1882 | void *data) | 1881 | void *data) |
1883 | { | 1882 | { |
1884 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); | 1883 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); |
@@ -1926,7 +1925,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); | |||
1926 | * @mapping: the page's address_space | 1925 | * @mapping: the page's address_space |
1927 | * @index: the page index | 1926 | * @index: the page index |
1928 | * @filler: function to perform the read | 1927 | * @filler: function to perform the read |
1929 | * @data: destination for read data | 1928 | * @data: first arg to filler(data, page) function, often left as NULL |
1930 | * | 1929 | * |
1931 | * Read into the page cache. If a page already exists, and PageUptodate() is | 1930 | * Read into the page cache. If a page already exists, and PageUptodate() is |
1932 | * not set, try to fill the page then wait for it to become unlocked. | 1931 | * not set, try to fill the page then wait for it to become unlocked. |
@@ -1935,7 +1934,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); | |||
1935 | */ | 1934 | */ |
1936 | struct page *read_cache_page(struct address_space *mapping, | 1935 | struct page *read_cache_page(struct address_space *mapping, |
1937 | pgoff_t index, | 1936 | pgoff_t index, |
1938 | int (*filler)(void *,struct page*), | 1937 | int (*filler)(void *, struct page *), |
1939 | void *data) | 1938 | void *data) |
1940 | { | 1939 | { |
1941 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); | 1940 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); |
@@ -2000,7 +1999,7 @@ int file_remove_suid(struct file *file) | |||
2000 | error = security_inode_killpriv(dentry); | 1999 | error = security_inode_killpriv(dentry); |
2001 | if (!error && killsuid) | 2000 | if (!error && killsuid) |
2002 | error = __remove_suid(dentry, killsuid); | 2001 | error = __remove_suid(dentry, killsuid); |
2003 | if (!error) | 2002 | if (!error && (inode->i_sb->s_flags & MS_NOSEC)) |
2004 | inode->i_flags |= S_NOSEC; | 2003 | inode->i_flags |= S_NOSEC; |
2005 | 2004 | ||
2006 | return error; | 2005 | return error; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3cb..e2d1587be269 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
1596 | list_del(&mm_slot->mm_node); | 1596 | list_del(&mm_slot->mm_node); |
1597 | free = 1; | 1597 | free = 1; |
1598 | } | 1598 | } |
1599 | spin_unlock(&khugepaged_mm_lock); | ||
1599 | 1600 | ||
1600 | if (free) { | 1601 | if (free) { |
1601 | spin_unlock(&khugepaged_mm_lock); | ||
1602 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | 1602 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); |
1603 | free_mm_slot(mm_slot); | 1603 | free_mm_slot(mm_slot); |
1604 | mmdrop(mm); | 1604 | mmdrop(mm); |
1605 | } else if (mm_slot) { | 1605 | } else if (mm_slot) { |
1606 | spin_unlock(&khugepaged_mm_lock); | ||
1607 | /* | 1606 | /* |
1608 | * This is required to serialize against | 1607 | * This is required to serialize against |
1609 | * khugepaged_test_exit() (which is guaranteed to run | 1608 | * khugepaged_test_exit() (which is guaranteed to run |
@@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
1614 | */ | 1613 | */ |
1615 | down_write(&mm->mmap_sem); | 1614 | down_write(&mm->mmap_sem); |
1616 | up_write(&mm->mmap_sem); | 1615 | up_write(&mm->mmap_sem); |
1617 | } else | 1616 | } |
1618 | spin_unlock(&khugepaged_mm_lock); | ||
1619 | } | 1617 | } |
1620 | 1618 | ||
1621 | static void release_pte_page(struct page *page) | 1619 | static void release_pte_page(struct page *page) |
@@ -2234,11 +2232,8 @@ static void khugepaged_loop(void) | |||
2234 | while (likely(khugepaged_enabled())) { | 2232 | while (likely(khugepaged_enabled())) { |
2235 | #ifndef CONFIG_NUMA | 2233 | #ifndef CONFIG_NUMA |
2236 | hpage = khugepaged_alloc_hugepage(); | 2234 | hpage = khugepaged_alloc_hugepage(); |
2237 | if (unlikely(!hpage)) { | 2235 | if (unlikely(!hpage)) |
2238 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2239 | break; | 2236 | break; |
2240 | } | ||
2241 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2242 | #else | 2237 | #else |
2243 | if (IS_ERR(hpage)) { | 2238 | if (IS_ERR(hpage)) { |
2244 | khugepaged_alloc_sleep(); | 2239 | khugepaged_alloc_sleep(); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6402458fee38..dae27ba3be2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <asm/io.h> | 27 | #include <linux/io.h> |
28 | 28 | ||
29 | #include <linux/hugetlb.h> | 29 | #include <linux/hugetlb.h> |
30 | #include <linux/node.h> | 30 | #include <linux/node.h> |
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock); | |||
62 | * must either hold the mmap_sem for write, or the mmap_sem for read and | 62 | * must either hold the mmap_sem for write, or the mmap_sem for read and |
63 | * the hugetlb_instantiation mutex: | 63 | * the hugetlb_instantiation mutex: |
64 | * | 64 | * |
65 | * down_write(&mm->mmap_sem); | 65 | * down_write(&mm->mmap_sem); |
66 | * or | 66 | * or |
67 | * down_read(&mm->mmap_sem); | 67 | * down_read(&mm->mmap_sem); |
68 | * mutex_lock(&hugetlb_instantiation_mutex); | 68 | * mutex_lock(&hugetlb_instantiation_mutex); |
69 | */ | 69 | */ |
70 | struct file_region { | 70 | struct file_region { |
71 | struct list_head link; | 71 | struct list_head link; |
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
503 | h->nr_huge_pages--; | 503 | h->nr_huge_pages--; |
504 | h->nr_huge_pages_node[page_to_nid(page)]--; | 504 | h->nr_huge_pages_node[page_to_nid(page)]--; |
505 | for (i = 0; i < pages_per_huge_page(h); i++) { | 505 | for (i = 0; i < pages_per_huge_page(h); i++) { |
506 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 506 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | |
507 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 507 | 1 << PG_referenced | 1 << PG_dirty | |
508 | 1 << PG_private | 1<< PG_writeback); | 508 | 1 << PG_active | 1 << PG_reserved | |
509 | 1 << PG_private | 1 << PG_writeback); | ||
509 | } | 510 | } |
510 | set_compound_page_dtor(page, NULL); | 511 | set_compound_page_dtor(page, NULL); |
511 | set_page_refcounted(page); | 512 | set_page_refcounted(page); |
@@ -591,7 +592,6 @@ int PageHuge(struct page *page) | |||
591 | 592 | ||
592 | return dtor == free_huge_page; | 593 | return dtor == free_huge_page; |
593 | } | 594 | } |
594 | |||
595 | EXPORT_SYMBOL_GPL(PageHuge); | 595 | EXPORT_SYMBOL_GPL(PageHuge); |
596 | 596 | ||
597 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 597 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
@@ -1105,12 +1105,28 @@ static void __init gather_bootmem_prealloc(void) | |||
1105 | struct huge_bootmem_page *m; | 1105 | struct huge_bootmem_page *m; |
1106 | 1106 | ||
1107 | list_for_each_entry(m, &huge_boot_pages, list) { | 1107 | list_for_each_entry(m, &huge_boot_pages, list) { |
1108 | struct page *page = virt_to_page(m); | ||
1109 | struct hstate *h = m->hstate; | 1108 | struct hstate *h = m->hstate; |
1109 | struct page *page; | ||
1110 | |||
1111 | #ifdef CONFIG_HIGHMEM | ||
1112 | page = pfn_to_page(m->phys >> PAGE_SHIFT); | ||
1113 | free_bootmem_late((unsigned long)m, | ||
1114 | sizeof(struct huge_bootmem_page)); | ||
1115 | #else | ||
1116 | page = virt_to_page(m); | ||
1117 | #endif | ||
1110 | __ClearPageReserved(page); | 1118 | __ClearPageReserved(page); |
1111 | WARN_ON(page_count(page) != 1); | 1119 | WARN_ON(page_count(page) != 1); |
1112 | prep_compound_huge_page(page, h->order); | 1120 | prep_compound_huge_page(page, h->order); |
1113 | prep_new_huge_page(h, page, page_to_nid(page)); | 1121 | prep_new_huge_page(h, page, page_to_nid(page)); |
1122 | /* | ||
1123 | * If we had gigantic hugepages allocated at boot time, we need | ||
1124 | * to restore the 'stolen' pages to totalram_pages in order to | ||
1125 | * fix confusing memory reports from free(1) and another | ||
1126 | * side-effects, like CommitLimit going negative. | ||
1127 | */ | ||
1128 | if (h->order > (MAX_ORDER - 1)) | ||
1129 | totalram_pages += 1 << h->order; | ||
1114 | } | 1130 | } |
1115 | } | 1131 | } |
1116 | 1132 | ||
@@ -2116,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
2116 | pte_t entry; | 2132 | pte_t entry; |
2117 | 2133 | ||
2118 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2134 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
2119 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2135 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) |
2120 | update_mmu_cache(vma, address, ptep); | 2136 | update_mmu_cache(vma, address, ptep); |
2121 | } | ||
2122 | } | 2137 | } |
2123 | 2138 | ||
2124 | 2139 | ||
@@ -2173,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte) | |||
2173 | if (huge_pte_none(pte) || pte_present(pte)) | 2188 | if (huge_pte_none(pte) || pte_present(pte)) |
2174 | return 0; | 2189 | return 0; |
2175 | swp = pte_to_swp_entry(pte); | 2190 | swp = pte_to_swp_entry(pte); |
2176 | if (non_swap_entry(swp) && is_migration_entry(swp)) { | 2191 | if (non_swap_entry(swp) && is_migration_entry(swp)) |
2177 | return 1; | 2192 | return 1; |
2178 | } else | 2193 | else |
2179 | return 0; | 2194 | return 0; |
2180 | } | 2195 | } |
2181 | 2196 | ||
@@ -2186,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2186 | if (huge_pte_none(pte) || pte_present(pte)) | 2201 | if (huge_pte_none(pte) || pte_present(pte)) |
2187 | return 0; | 2202 | return 0; |
2188 | swp = pte_to_swp_entry(pte); | 2203 | swp = pte_to_swp_entry(pte); |
2189 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { | 2204 | if (non_swap_entry(swp) && is_hwpoison_entry(swp)) |
2190 | return 1; | 2205 | return 1; |
2191 | } else | 2206 | else |
2192 | return 0; | 2207 | return 0; |
2193 | } | 2208 | } |
2194 | 2209 | ||
@@ -2551,7 +2566,7 @@ retry: | |||
2551 | * So we need to block hugepage fault by PG_hwpoison bit check. | 2566 | * So we need to block hugepage fault by PG_hwpoison bit check. |
2552 | */ | 2567 | */ |
2553 | if (unlikely(PageHWPoison(page))) { | 2568 | if (unlikely(PageHWPoison(page))) { |
2554 | ret = VM_FAULT_HWPOISON | | 2569 | ret = VM_FAULT_HWPOISON | |
2555 | VM_FAULT_SET_HINDEX(h - hstates); | 2570 | VM_FAULT_SET_HINDEX(h - hstates); |
2556 | goto backout_unlocked; | 2571 | goto backout_unlocked; |
2557 | } | 2572 | } |
@@ -2619,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2619 | migration_entry_wait(mm, (pmd_t *)ptep, address); | 2634 | migration_entry_wait(mm, (pmd_t *)ptep, address); |
2620 | return 0; | 2635 | return 0; |
2621 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2636 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2622 | return VM_FAULT_HWPOISON_LARGE | | 2637 | return VM_FAULT_HWPOISON_LARGE | |
2623 | VM_FAULT_SET_HINDEX(h - hstates); | 2638 | VM_FAULT_SET_HINDEX(h - hstates); |
2624 | } | 2639 | } |
2625 | 2640 | ||
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1302 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); | 1302 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); |
1303 | ksm_scan.mm_slot = slot; | 1303 | ksm_scan.mm_slot = slot; |
1304 | spin_unlock(&ksm_mmlist_lock); | 1304 | spin_unlock(&ksm_mmlist_lock); |
1305 | /* | ||
1306 | * Although we tested list_empty() above, a racing __ksm_exit | ||
1307 | * of the last mm on the list may have removed it since then. | ||
1308 | */ | ||
1309 | if (slot == &ksm_mm_head) | ||
1310 | return NULL; | ||
1305 | next_mm: | 1311 | next_mm: |
1306 | ksm_scan.address = 0; | 1312 | ksm_scan.address = 0; |
1307 | ksm_scan.rmap_list = &slot->rmap_list; | 1313 | ksm_scan.rmap_list = &slot->rmap_list; |
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
218 | endoff = (loff_t)(end - vma->vm_start - 1) | 218 | endoff = (loff_t)(end - vma->vm_start - 1) |
219 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 219 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
220 | 220 | ||
221 | /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ | 221 | /* vmtruncate_range needs to take i_mutex */ |
222 | up_read(¤t->mm->mmap_sem); | 222 | up_read(¤t->mm->mmap_sem); |
223 | error = vmtruncate_range(mapping->host, offset, endoff); | 223 | error = vmtruncate_range(mapping->host, offset, endoff); |
224 | down_read(¤t->mm->mmap_sem); | 224 | down_read(¤t->mm->mmap_sem); |
diff --git a/mm/memblock.c b/mm/memblock.c
index a0562d1a6ad4..ccbf97339592 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -758,9 +758,9 @@ void __init memblock_analyze(void) | |||
758 | 758 | ||
759 | /* Check marker in the unused last array entry */ | 759 | /* Check marker in the unused last array entry */ |
760 | WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base | 760 | WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base |
761 | != (phys_addr_t)RED_INACTIVE); | 761 | != MEMBLOCK_INACTIVE); |
762 | WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base | 762 | WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base |
763 | != (phys_addr_t)RED_INACTIVE); | 763 | != MEMBLOCK_INACTIVE); |
764 | 764 | ||
765 | memblock.memory_size = 0; | 765 | memblock.memory_size = 0; |
766 | 766 | ||
@@ -786,8 +786,8 @@ void __init memblock_init(void) | |||
786 | memblock.reserved.max = INIT_MEMBLOCK_REGIONS; | 786 | memblock.reserved.max = INIT_MEMBLOCK_REGIONS; |
787 | 787 | ||
788 | /* Write a marker in the unused last array entry */ | 788 | /* Write a marker in the unused last array entry */ |
789 | memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; | 789 | memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; |
790 | memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; | 790 | memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; |
791 | 791 | ||
792 | /* Create a dummy zero size MEMBLOCK which will get coalesced away later. | 792 | /* Create a dummy zero size MEMBLOCK which will get coalesced away later. |
793 | * This simplifies the memblock_add() code below... | 793 | * This simplifies the memblock_add() code below... |
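The two hunks above only swap which poison value is planted in the spare last array entry. The general trick they rely on is sketched below as a self-contained program; the marker constant, array layout, and names are invented for illustration, and the kernel uses its own MEMBLOCK_INACTIVE constant.

#include <stdio.h>

/* Canary-in-the-spare-slot sketch: the entry one past the usable array is
 * filled with a poison value and checked later to catch accidental overflow.
 * The poison value and sizes below are invented for illustration. */
#define NREGIONS 4
#define INACTIVE_MARKER 0x5a5a5a5a5a5a5a5aULL

struct region { unsigned long long base, size; };
static struct region regions[NREGIONS + 1];	/* last slot is the canary */

static void init_regions(void)
{
	regions[NREGIONS].base = INACTIVE_MARKER;	/* plant the marker */
}

static void check_regions(void)
{
	/* if any code wrote past regions[NREGIONS - 1], the marker is gone */
	if (regions[NREGIONS].base != INACTIVE_MARKER)
		fprintf(stderr, "WARN: region array overflowed\n");
}

int main(void)
{
	init_regions();
	regions[0].base = 0x1000;
	regions[0].size = 0x1000;
	check_regions();		/* stays silent while the marker is intact */
	return 0;
}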
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/limits.h> | 35 | #include <linux/limits.h> |
36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
37 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
38 | #include <linux/shmem_fs.h> | ||
38 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
39 | #include <linux/swap.h> | 40 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index { | |||
107 | enum mem_cgroup_events_target { | 108 | enum mem_cgroup_events_target { |
108 | MEM_CGROUP_TARGET_THRESH, | 109 | MEM_CGROUP_TARGET_THRESH, |
109 | MEM_CGROUP_TARGET_SOFTLIMIT, | 110 | MEM_CGROUP_TARGET_SOFTLIMIT, |
111 | MEM_CGROUP_TARGET_NUMAINFO, | ||
110 | MEM_CGROUP_NTARGETS, | 112 | MEM_CGROUP_NTARGETS, |
111 | }; | 113 | }; |
112 | #define THRESHOLDS_EVENTS_TARGET (128) | 114 | #define THRESHOLDS_EVENTS_TARGET (128) |
113 | #define SOFTLIMIT_EVENTS_TARGET (1024) | 115 | #define SOFTLIMIT_EVENTS_TARGET (1024) |
116 | #define NUMAINFO_EVENTS_TARGET (1024) | ||
114 | 117 | ||
115 | struct mem_cgroup_stat_cpu { | 118 | struct mem_cgroup_stat_cpu { |
116 | long count[MEM_CGROUP_STAT_NSTATS]; | 119 | long count[MEM_CGROUP_STAT_NSTATS]; |
@@ -236,7 +239,8 @@ struct mem_cgroup { | |||
236 | int last_scanned_node; | 239 | int last_scanned_node; |
237 | #if MAX_NUMNODES > 1 | 240 | #if MAX_NUMNODES > 1 |
238 | nodemask_t scan_nodes; | 241 | nodemask_t scan_nodes; |
239 | unsigned long next_scan_node_update; | 242 | atomic_t numainfo_events; |
243 | atomic_t numainfo_updating; | ||
240 | #endif | 244 | #endif |
241 | /* | 245 | /* |
242 | * Should the accounting and control be hierarchical, per subtree? | 246 | * Should the accounting and control be hierarchical, per subtree? |
@@ -359,7 +363,7 @@ enum charge_type { | |||
359 | static void mem_cgroup_get(struct mem_cgroup *mem); | 363 | static void mem_cgroup_get(struct mem_cgroup *mem); |
360 | static void mem_cgroup_put(struct mem_cgroup *mem); | 364 | static void mem_cgroup_put(struct mem_cgroup *mem); |
361 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 365 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
362 | static void drain_all_stock_async(void); | 366 | static void drain_all_stock_async(struct mem_cgroup *mem); |
363 | 367 | ||
364 | static struct mem_cgroup_per_zone * | 368 | static struct mem_cgroup_per_zone * |
365 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 369 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, | |||
576 | return val; | 580 | return val; |
577 | } | 581 | } |
578 | 582 | ||
579 | static long mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
580 | { | ||
581 | long ret; | ||
582 | |||
583 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
584 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
585 | return ret; | ||
586 | } | ||
587 | |||
588 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 583 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
589 | bool charge) | 584 | bool charge) |
590 | { | 585 | { |
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | |||
688 | case MEM_CGROUP_TARGET_SOFTLIMIT: | 683 | case MEM_CGROUP_TARGET_SOFTLIMIT: |
689 | next = val + SOFTLIMIT_EVENTS_TARGET; | 684 | next = val + SOFTLIMIT_EVENTS_TARGET; |
690 | break; | 685 | break; |
686 | case MEM_CGROUP_TARGET_NUMAINFO: | ||
687 | next = val + NUMAINFO_EVENTS_TARGET; | ||
688 | break; | ||
691 | default: | 689 | default: |
692 | return; | 690 | return; |
693 | } | 691 | } |
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | |||
706 | mem_cgroup_threshold(mem); | 704 | mem_cgroup_threshold(mem); |
707 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); | 705 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); |
708 | if (unlikely(__memcg_event_check(mem, | 706 | if (unlikely(__memcg_event_check(mem, |
709 | MEM_CGROUP_TARGET_SOFTLIMIT))){ | 707 | MEM_CGROUP_TARGET_SOFTLIMIT))) { |
710 | mem_cgroup_update_tree(mem, page); | 708 | mem_cgroup_update_tree(mem, page); |
711 | __mem_cgroup_target_update(mem, | 709 | __mem_cgroup_target_update(mem, |
712 | MEM_CGROUP_TARGET_SOFTLIMIT); | 710 | MEM_CGROUP_TARGET_SOFTLIMIT); |
713 | } | 711 | } |
712 | #if MAX_NUMNODES > 1 | ||
713 | if (unlikely(__memcg_event_check(mem, | ||
714 | MEM_CGROUP_TARGET_NUMAINFO))) { | ||
715 | atomic_inc(&mem->numainfo_events); | ||
716 | __mem_cgroup_target_update(mem, | ||
717 | MEM_CGROUP_TARGET_NUMAINFO); | ||
718 | } | ||
719 | #endif | ||
714 | } | 720 | } |
715 | } | 721 | } |
716 | 722 | ||
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
735 | struct mem_cgroup, css); | 741 | struct mem_cgroup, css); |
736 | } | 742 | } |
737 | 743 | ||
738 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 744 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
739 | { | 745 | { |
740 | struct mem_cgroup *mem = NULL; | 746 | struct mem_cgroup *mem = NULL; |
741 | 747 | ||
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, | |||
1128 | return MEM_CGROUP_ZSTAT(mz, lru); | 1134 | return MEM_CGROUP_ZSTAT(mz, lru); |
1129 | } | 1135 | } |
1130 | 1136 | ||
1131 | #ifdef CONFIG_NUMA | ||
1132 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | 1137 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, |
1133 | int nid) | 1138 | int nid) |
1134 | { | 1139 | { |
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | |||
1140 | return ret; | 1145 | return ret; |
1141 | } | 1146 | } |
1142 | 1147 | ||
1148 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1149 | int nid) | ||
1150 | { | ||
1151 | unsigned long ret; | ||
1152 | |||
1153 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1154 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1155 | return ret; | ||
1156 | } | ||
1157 | |||
1158 | #if MAX_NUMNODES > 1 | ||
1143 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | 1159 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) |
1144 | { | 1160 | { |
1145 | u64 total = 0; | 1161 | u64 total = 0; |
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | |||
1151 | return total; | 1167 | return total; |
1152 | } | 1168 | } |
1153 | 1169 | ||
1154 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1155 | int nid) | ||
1156 | { | ||
1157 | unsigned long ret; | ||
1158 | |||
1159 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1160 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1161 | |||
1162 | return ret; | ||
1163 | } | ||
1164 | |||
1165 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) | 1170 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) |
1166 | { | 1171 | { |
1167 | u64 total = 0; | 1172 | u64 total = 0; |
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1558 | return ret; | 1563 | return ret; |
1559 | } | 1564 | } |
1560 | 1565 | ||
1566 | /** | ||
1567 | * test_mem_cgroup_node_reclaimable | ||
1568 | * @mem: the target memcg | ||
1569 | * @nid: the node ID to be checked. | ||
1570 | * @noswap : specify true here if the user wants flle only information. | ||
1571 | * | ||
1572 | * This function returns whether the specified memcg contains any | ||
1573 | * reclaimable pages on a node. Returns true if there are any reclaimable | ||
1574 | * pages in the node. | ||
1575 | */ | ||
1576 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | ||
1577 | int nid, bool noswap) | ||
1578 | { | ||
1579 | if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) | ||
1580 | return true; | ||
1581 | if (noswap || !total_swap_pages) | ||
1582 | return false; | ||
1583 | if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) | ||
1584 | return true; | ||
1585 | return false; | ||
1586 | |||
1587 | } | ||
1561 | #if MAX_NUMNODES > 1 | 1588 | #if MAX_NUMNODES > 1 |
1562 | 1589 | ||
1563 | /* | 1590 | /* |
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1569 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | 1596 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) |
1570 | { | 1597 | { |
1571 | int nid; | 1598 | int nid; |
1572 | 1599 | /* | |
1573 | if (time_after(mem->next_scan_node_update, jiffies)) | 1600 | * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET |
1601 | * pagein/pageout changes since the last update. | ||
1602 | */ | ||
1603 | if (!atomic_read(&mem->numainfo_events)) | ||
1604 | return; | ||
1605 | if (atomic_inc_return(&mem->numainfo_updating) > 1) | ||
1574 | return; | 1606 | return; |
1575 | 1607 | ||
1576 | mem->next_scan_node_update = jiffies + 10*HZ; | ||
1577 | /* make a nodemask where this memcg uses memory from */ | 1608 | /* make a nodemask where this memcg uses memory from */ |
1578 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; | 1609 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; |
1579 | 1610 | ||
1580 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1611 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { |
1581 | 1612 | ||
1582 | if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || | 1613 | if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) |
1583 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) | 1614 | node_clear(nid, mem->scan_nodes); |
1584 | continue; | ||
1585 | |||
1586 | if (total_swap_pages && | ||
1587 | (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || | ||
1588 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) | ||
1589 | continue; | ||
1590 | node_clear(nid, mem->scan_nodes); | ||
1591 | } | 1615 | } |
1616 | |||
1617 | atomic_set(&mem->numainfo_events, 0); | ||
1618 | atomic_set(&mem->numainfo_updating, 0); | ||
1592 | } | 1619 | } |
1593 | 1620 | ||
1594 | /* | 1621 | /* |
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | |||
1626 | return node; | 1653 | return node; |
1627 | } | 1654 | } |
1628 | 1655 | ||
1656 | /* | ||
1657 | * Check all nodes whether it contains reclaimable pages or not. | ||
1658 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1659 | * unused nodes. But scan_nodes is lazily updated and may not cotain | ||
1660 | * enough new information. We need to do double check. | ||
1661 | */ | ||
1662 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | ||
1663 | { | ||
1664 | int nid; | ||
1665 | |||
1666 | /* | ||
1667 | * quick check...making use of scan_node. | ||
1668 | * We can skip unused nodes. | ||
1669 | */ | ||
1670 | if (!nodes_empty(mem->scan_nodes)) { | ||
1671 | for (nid = first_node(mem->scan_nodes); | ||
1672 | nid < MAX_NUMNODES; | ||
1673 | nid = next_node(nid, mem->scan_nodes)) { | ||
1674 | |||
1675 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | ||
1676 | return true; | ||
1677 | } | ||
1678 | } | ||
1679 | /* | ||
1680 | * Check rest of nodes. | ||
1681 | */ | ||
1682 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
1683 | if (node_isset(nid, mem->scan_nodes)) | ||
1684 | continue; | ||
1685 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | ||
1686 | return true; | ||
1687 | } | ||
1688 | return false; | ||
1689 | } | ||
1690 | |||
1629 | #else | 1691 | #else |
1630 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | 1692 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) |
1631 | { | 1693 | { |
1632 | return 0; | 1694 | return 0; |
1633 | } | 1695 | } |
1696 | |||
1697 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | ||
1698 | { | ||
1699 | return test_mem_cgroup_node_reclaimable(mem, 0, noswap); | ||
1700 | } | ||
1634 | #endif | 1701 | #endif |
1635 | 1702 | ||
1636 | /* | 1703 | /* |
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1663 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1730 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1664 | 1731 | ||
1665 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1732 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1666 | if (root_mem->memsw_is_minimum) | 1733 | if (!check_soft && root_mem->memsw_is_minimum) |
1667 | noswap = true; | 1734 | noswap = true; |
1668 | 1735 | ||
1669 | while (1) { | 1736 | while (1) { |
1670 | victim = mem_cgroup_select_victim(root_mem); | 1737 | victim = mem_cgroup_select_victim(root_mem); |
1671 | if (victim == root_mem) { | 1738 | if (victim == root_mem) { |
1672 | loop++; | 1739 | loop++; |
1673 | if (loop >= 1) | 1740 | /* |
1674 | drain_all_stock_async(); | 1741 | * We are not draining per cpu cached charges during |
1742 | * soft limit reclaim because global reclaim doesn't | ||
1743 | * care about charges. It tries to free some memory and | ||
1744 | * charges will not give any. | ||
1745 | */ | ||
1746 | if (!check_soft && loop >= 1) | ||
1747 | drain_all_stock_async(root_mem); | ||
1675 | if (loop >= 2) { | 1748 | if (loop >= 2) { |
1676 | /* | 1749 | /* |
1677 | * If we have not been able to reclaim | 1750 | * If we have not been able to reclaim |
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1695 | } | 1768 | } |
1696 | } | 1769 | } |
1697 | } | 1770 | } |
1698 | if (!mem_cgroup_local_usage(victim)) { | 1771 | if (!mem_cgroup_reclaimable(victim, noswap)) { |
1699 | /* this cgroup's local usage == 0 */ | 1772 | /* this cgroup's local usage == 0 */ |
1700 | css_put(&victim->css); | 1773 | css_put(&victim->css); |
1701 | continue; | 1774 | continue; |
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp { | |||
1934 | struct mem_cgroup *cached; /* this never be root cgroup */ | 2007 | struct mem_cgroup *cached; /* this never be root cgroup */ |
1935 | unsigned int nr_pages; | 2008 | unsigned int nr_pages; |
1936 | struct work_struct work; | 2009 | struct work_struct work; |
2010 | unsigned long flags; | ||
2011 | #define FLUSHING_CACHED_CHARGE (0) | ||
1937 | }; | 2012 | }; |
1938 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 2013 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1939 | static atomic_t memcg_drain_count; | 2014 | static DEFINE_MUTEX(percpu_charge_mutex); |
1940 | 2015 | ||
1941 | /* | 2016 | /* |
1942 | * Try to consume stocked charge on this cpu. If success, one page is consumed | 2017 | * Try to consume stocked charge on this cpu. If success, one page is consumed |
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy) | |||
1984 | { | 2059 | { |
1985 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | 2060 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); |
1986 | drain_stock(stock); | 2061 | drain_stock(stock); |
2062 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | ||
1987 | } | 2063 | } |
1988 | 2064 | ||
1989 | /* | 2065 | /* |
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) | |||
2008 | * expects some charges will be back to res_counter later but cannot wait for | 2084 | * expects some charges will be back to res_counter later but cannot wait for |
2009 | * it. | 2085 | * it. |
2010 | */ | 2086 | */ |
2011 | static void drain_all_stock_async(void) | 2087 | static void drain_all_stock_async(struct mem_cgroup *root_mem) |
2012 | { | 2088 | { |
2013 | int cpu; | 2089 | int cpu, curcpu; |
2014 | /* This function is for scheduling "drain" in asynchronous way. | 2090 | /* |
2015 | * The result of "drain" is not directly handled by callers. Then, | 2091 | * If someone calls draining, avoid adding more kworker runs. |
2016 | * if someone is calling drain, we don't have to call drain more. | ||
2017 | * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if | ||
2018 | * there is a race. We just do loose check here. | ||
2019 | */ | 2092 | */ |
2020 | if (atomic_read(&memcg_drain_count)) | 2093 | if (!mutex_trylock(&percpu_charge_mutex)) |
2021 | return; | 2094 | return; |
2022 | /* Notify other cpus that system-wide "drain" is running */ | 2095 | /* Notify other cpus that system-wide "drain" is running */ |
2023 | atomic_inc(&memcg_drain_count); | ||
2024 | get_online_cpus(); | 2096 | get_online_cpus(); |
2097 | /* | ||
2098 | * Get a hint for avoiding draining charges on the current cpu, | ||
2099 | * which must be exhausted by our charging. It is not required that | ||
2100 | * this be a precise check, so we use raw_smp_processor_id() instead of | ||
2101 | * getcpu()/putcpu(). | ||
2102 | */ | ||
2103 | curcpu = raw_smp_processor_id(); | ||
2025 | for_each_online_cpu(cpu) { | 2104 | for_each_online_cpu(cpu) { |
2026 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2105 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2027 | schedule_work_on(cpu, &stock->work); | 2106 | struct mem_cgroup *mem; |
2107 | |||
2108 | if (cpu == curcpu) | ||
2109 | continue; | ||
2110 | |||
2111 | mem = stock->cached; | ||
2112 | if (!mem) | ||
2113 | continue; | ||
2114 | if (mem != root_mem) { | ||
2115 | if (!root_mem->use_hierarchy) | ||
2116 | continue; | ||
2117 | /* check whether "mem" is under tree of "root_mem" */ | ||
2118 | if (!css_is_ancestor(&mem->css, &root_mem->css)) | ||
2119 | continue; | ||
2120 | } | ||
2121 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2122 | schedule_work_on(cpu, &stock->work); | ||
2028 | } | 2123 | } |
2029 | put_online_cpus(); | 2124 | put_online_cpus(); |
2030 | atomic_dec(&memcg_drain_count); | 2125 | mutex_unlock(&percpu_charge_mutex); |
2031 | /* We don't wait for flush_work */ | 2126 | /* We don't wait for flush_work */ |
2032 | } | 2127 | } |
2033 | 2128 | ||
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void) | |||
2035 | static void drain_all_stock_sync(void) | 2130 | static void drain_all_stock_sync(void) |
2036 | { | 2131 | { |
2037 | /* called when force_empty is called */ | 2132 | /* called when force_empty is called */ |
2038 | atomic_inc(&memcg_drain_count); | 2133 | mutex_lock(&percpu_charge_mutex); |
2039 | schedule_on_each_cpu(drain_local_stock); | 2134 | schedule_on_each_cpu(drain_local_stock); |
2040 | atomic_dec(&memcg_drain_count); | 2135 | mutex_unlock(&percpu_charge_mutex); |
2041 | } | 2136 | } |
2042 | 2137 | ||
2043 | /* | 2138 | /* |
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4640 | { | 4735 | { |
4641 | .name = "numa_stat", | 4736 | .name = "numa_stat", |
4642 | .open = mem_control_numa_stat_open, | 4737 | .open = mem_control_numa_stat_open, |
4738 | .mode = S_IRUGO, | ||
4643 | }, | 4739 | }, |
4644 | #endif | 4740 | #endif |
4645 | }; | 4741 | }; |
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
5414 | struct cgroup *old_cont, | 5510 | struct cgroup *old_cont, |
5415 | struct task_struct *p) | 5511 | struct task_struct *p) |
5416 | { | 5512 | { |
5417 | struct mm_struct *mm; | 5513 | struct mm_struct *mm = get_task_mm(p); |
5418 | 5514 | ||
5419 | if (!mc.to) | ||
5420 | /* no need to move charge */ | ||
5421 | return; | ||
5422 | |||
5423 | mm = get_task_mm(p); | ||
5424 | if (mm) { | 5515 | if (mm) { |
5425 | mem_cgroup_move_charge(mm); | 5516 | if (mc.to) |
5517 | mem_cgroup_move_charge(mm); | ||
5518 | put_swap_token(mm); | ||
5426 | mmput(mm); | 5519 | mmput(mm); |
5427 | } | 5520 | } |
5428 | mem_cgroup_clear_mc(); | 5521 | if (mc.to) |
5522 | mem_cgroup_clear_mc(); | ||
5429 | } | 5523 | } |
5430 | #else /* !CONFIG_MMU */ | 5524 | #else /* !CONFIG_MMU */ |
5431 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 5525 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
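One pattern worth calling out from the hunks above is the switch from time-based throttling (next_scan_node_update) to event-count throttling: the cached nodemask is refreshed only after roughly NUMAINFO_EVENTS_TARGET page events, with a second atomic acting as a loose "someone is already refreshing" guard. A stripped-down userspace sketch of that shape follows; the names are invented and this is not the kernel code.

#include <stdatomic.h>
#include <stdio.h>

/* Invented names; this only mirrors the shape of the refresh throttle above. */
static atomic_int numainfo_events;	/* bumped once per ~EVENTS_TARGET page events */
static atomic_int numainfo_updating;	/* loose guard: one refresher at a time       */

static void rebuild_scan_nodes(void)
{
	printf("rebuilding cached node mask\n");
}

static void may_update_nodemask(void)
{
	if (atomic_load(&numainfo_events) == 0)
		return;				/* nothing changed since the last refresh */
	if (atomic_fetch_add(&numainfo_updating, 1) > 0)
		return;				/* someone else is already refreshing     */

	rebuild_scan_nodes();
	atomic_store(&numainfo_events, 0);	/* the winner resets both counters */
	atomic_store(&numainfo_updating, 0);
}

int main(void)
{
	may_update_nodemask();			/* no events yet: returns immediately  */
	atomic_fetch_add(&numainfo_events, 1);	/* event threshold crossed elsewhere   */
	may_update_nodemask();			/* now the rebuild actually runs       */
	return 0;
}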
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928d..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@ | |||
52 | #include <linux/swapops.h> | 52 | #include <linux/swapops.h> |
53 | #include <linux/hugetlb.h> | 53 | #include <linux/hugetlb.h> |
54 | #include <linux/memory_hotplug.h> | 54 | #include <linux/memory_hotplug.h> |
55 | #include <linux/mm_inline.h> | ||
55 | #include "internal.h" | 56 | #include "internal.h" |
56 | 57 | ||
57 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 58 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -390,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
390 | struct task_struct *tsk; | 391 | struct task_struct *tsk; |
391 | struct anon_vma *av; | 392 | struct anon_vma *av; |
392 | 393 | ||
393 | read_lock(&tasklist_lock); | ||
394 | av = page_lock_anon_vma(page); | 394 | av = page_lock_anon_vma(page); |
395 | if (av == NULL) /* Not actually mapped anymore */ | 395 | if (av == NULL) /* Not actually mapped anymore */ |
396 | goto out; | 396 | return; |
397 | |||
398 | read_lock(&tasklist_lock); | ||
397 | for_each_process (tsk) { | 399 | for_each_process (tsk) { |
398 | struct anon_vma_chain *vmac; | 400 | struct anon_vma_chain *vmac; |
399 | 401 | ||
@@ -407,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
407 | add_to_kill(tsk, page, vma, to_kill, tkc); | 409 | add_to_kill(tsk, page, vma, to_kill, tkc); |
408 | } | 410 | } |
409 | } | 411 | } |
410 | page_unlock_anon_vma(av); | ||
411 | out: | ||
412 | read_unlock(&tasklist_lock); | 412 | read_unlock(&tasklist_lock); |
413 | page_unlock_anon_vma(av); | ||
413 | } | 414 | } |
414 | 415 | ||
415 | /* | 416 | /* |
@@ -423,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
423 | struct prio_tree_iter iter; | 424 | struct prio_tree_iter iter; |
424 | struct address_space *mapping = page->mapping; | 425 | struct address_space *mapping = page->mapping; |
425 | 426 | ||
426 | /* | ||
427 | * A note on the locking order between the two locks. | ||
428 | * We don't rely on this particular order. | ||
429 | * If you have some other code that needs a different order | ||
430 | * feel free to switch them around. Or add a reverse link | ||
431 | * from mm_struct to task_struct, then this could be all | ||
432 | * done without taking tasklist_lock and looping over all tasks. | ||
433 | */ | ||
434 | |||
435 | read_lock(&tasklist_lock); | ||
436 | mutex_lock(&mapping->i_mmap_mutex); | 427 | mutex_lock(&mapping->i_mmap_mutex); |
428 | read_lock(&tasklist_lock); | ||
437 | for_each_process(tsk) { | 429 | for_each_process(tsk) { |
438 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 430 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
439 | 431 | ||
@@ -453,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
453 | add_to_kill(tsk, page, vma, to_kill, tkc); | 445 | add_to_kill(tsk, page, vma, to_kill, tkc); |
454 | } | 446 | } |
455 | } | 447 | } |
456 | mutex_unlock(&mapping->i_mmap_mutex); | ||
457 | read_unlock(&tasklist_lock); | 448 | read_unlock(&tasklist_lock); |
449 | mutex_unlock(&mapping->i_mmap_mutex); | ||
458 | } | 450 | } |
459 | 451 | ||
460 | /* | 452 | /* |
@@ -1468,7 +1460,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1468 | put_page(page); | 1460 | put_page(page); |
1469 | if (!ret) { | 1461 | if (!ret) { |
1470 | LIST_HEAD(pagelist); | 1462 | LIST_HEAD(pagelist); |
1471 | 1463 | inc_zone_page_state(page, NR_ISOLATED_ANON + | |
1464 | page_is_file_cache(page)); | ||
1472 | list_add(&page->lru, &pagelist); | 1465 | list_add(&page->lru, &pagelist); |
1473 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1466 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1474 | 0, true); | 1467 | 0, true); |
diff --git a/mm/memory.c b/mm/memory.c index 6953d3926e01..a56e3ba816b2 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
305 | if (batch->nr == batch->max) { | 305 | if (batch->nr == batch->max) { |
306 | if (!tlb_next_batch(tlb)) | 306 | if (!tlb_next_batch(tlb)) |
307 | return 0; | 307 | return 0; |
308 | batch = tlb->active; | ||
308 | } | 309 | } |
309 | VM_BUG_ON(batch->nr > batch->max); | 310 | VM_BUG_ON(batch->nr > batch->max); |
310 | 311 | ||
@@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
1112 | int force_flush = 0; | 1113 | int force_flush = 0; |
1113 | int rss[NR_MM_COUNTERS]; | 1114 | int rss[NR_MM_COUNTERS]; |
1114 | spinlock_t *ptl; | 1115 | spinlock_t *ptl; |
1116 | pte_t *start_pte; | ||
1115 | pte_t *pte; | 1117 | pte_t *pte; |
1116 | 1118 | ||
1117 | again: | 1119 | again: |
1118 | init_rss_vec(rss); | 1120 | init_rss_vec(rss); |
1119 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 1121 | start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
1122 | pte = start_pte; | ||
1120 | arch_enter_lazy_mmu_mode(); | 1123 | arch_enter_lazy_mmu_mode(); |
1121 | do { | 1124 | do { |
1122 | pte_t ptent = *pte; | 1125 | pte_t ptent = *pte; |
@@ -1196,7 +1199,7 @@ again: | |||
1196 | 1199 | ||
1197 | add_mm_rss_vec(mm, rss); | 1200 | add_mm_rss_vec(mm, rss); |
1198 | arch_leave_lazy_mmu_mode(); | 1201 | arch_leave_lazy_mmu_mode(); |
1199 | pte_unmap_unlock(pte - 1, ptl); | 1202 | pte_unmap_unlock(start_pte, ptl); |
1200 | 1203 | ||
1201 | /* | 1204 | /* |
1202 | * mmu_gather ran out of room to batch pages, we break out of | 1205 | * mmu_gather ran out of room to batch pages, we break out of |
@@ -1287,16 +1290,9 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1287 | return addr; | 1290 | return addr; |
1288 | } | 1291 | } |
1289 | 1292 | ||
1290 | #ifdef CONFIG_PREEMPT | ||
1291 | # define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) | ||
1292 | #else | ||
1293 | /* No preempt: go for improved straight-line efficiency */ | ||
1294 | # define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) | ||
1295 | #endif | ||
1296 | |||
1297 | /** | 1293 | /** |
1298 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 1294 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
1299 | * @tlbp: address of the caller's struct mmu_gather | 1295 | * @tlb: address of the caller's struct mmu_gather |
1300 | * @vma: the starting vma | 1296 | * @vma: the starting vma |
1301 | * @start_addr: virtual address at which to start unmapping | 1297 | * @start_addr: virtual address at which to start unmapping |
1302 | * @end_addr: virtual address at which to end unmapping | 1298 | * @end_addr: virtual address at which to end unmapping |
@@ -1307,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1307 | * | 1303 | * |
1308 | * Unmap all pages in the vma list. | 1304 | * Unmap all pages in the vma list. |
1309 | * | 1305 | * |
1310 | * We aim to not hold locks for too long (for scheduling latency reasons). | ||
1311 | * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to | ||
1312 | * return the ending mmu_gather to the caller. | ||
1313 | * | ||
1314 | * Only addresses between `start' and `end' will be unmapped. | 1306 | * Only addresses between `start' and `end' will be unmapped. |
1315 | * | 1307 | * |
1316 | * The VMA list must be sorted in ascending virtual address order. | 1308 | * The VMA list must be sorted in ascending virtual address order. |
@@ -1813,7 +1805,63 @@ next_page: | |||
1813 | } | 1805 | } |
1814 | EXPORT_SYMBOL(__get_user_pages); | 1806 | EXPORT_SYMBOL(__get_user_pages); |
1815 | 1807 | ||
1816 | /** | 1808 | /* |
1809 | * fixup_user_fault() - manually resolve a user page fault | ||
1810 | * @tsk: the task_struct to use for page fault accounting, or | ||
1811 | * NULL if faults are not to be recorded. | ||
1812 | * @mm: mm_struct of target mm | ||
1813 | * @address: user address | ||
1814 | * @fault_flags: flags to pass down to handle_mm_fault() | ||
1815 | * | ||
1816 | * This is meant to be called in the specific scenario where for locking reasons | ||
1817 | * we try to access user memory in atomic context (within a pagefault_disable() | ||
1818 | * section); the access then fails with -EFAULT, and we want to resolve the | ||
1819 | * user fault before trying again. | ||
1820 | * | ||
1821 | * Typically this is meant to be used by the futex code. | ||
1822 | * | ||
1823 | * The main difference with get_user_pages() is that this function will | ||
1824 | * unconditionally call handle_mm_fault() which will in turn perform all the | ||
1825 | * necessary SW fixup of the dirty and young bits in the PTE, while | ||
1826 | * get_user_pages() only guarantees to update these in the struct page. | ||
1827 | * | ||
1828 | * This is important for some architectures where those bits also gate the | ||
1829 | * access permission to the page because they are maintained in software. On | ||
1830 | * such architectures, gup() will not be enough to make a subsequent access | ||
1831 | * succeed. | ||
1832 | * | ||
1833 | * This should be called with the mmap_sem held for read. | ||
1834 | */ | ||
1835 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | ||
1836 | unsigned long address, unsigned int fault_flags) | ||
1837 | { | ||
1838 | struct vm_area_struct *vma; | ||
1839 | int ret; | ||
1840 | |||
1841 | vma = find_extend_vma(mm, address); | ||
1842 | if (!vma || address < vma->vm_start) | ||
1843 | return -EFAULT; | ||
1844 | |||
1845 | ret = handle_mm_fault(mm, vma, address, fault_flags); | ||
1846 | if (ret & VM_FAULT_ERROR) { | ||
1847 | if (ret & VM_FAULT_OOM) | ||
1848 | return -ENOMEM; | ||
1849 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | ||
1850 | return -EHWPOISON; | ||
1851 | if (ret & VM_FAULT_SIGBUS) | ||
1852 | return -EFAULT; | ||
1853 | BUG(); | ||
1854 | } | ||
1855 | if (tsk) { | ||
1856 | if (ret & VM_FAULT_MAJOR) | ||
1857 | tsk->maj_flt++; | ||
1858 | else | ||
1859 | tsk->min_flt++; | ||
1860 | } | ||
1861 | return 0; | ||
1862 | } | ||
1863 | |||
1864 | /* | ||
1817 | * get_user_pages() - pin user pages in memory | 1865 | * get_user_pages() - pin user pages in memory |
1818 | * @tsk: the task_struct to use for page fault accounting, or | 1866 | * @tsk: the task_struct to use for page fault accounting, or |
1819 | * NULL if faults are not to be recorded. | 1867 | * NULL if faults are not to be recorded. |
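The new helper is easiest to understand from a caller's point of view. Below is a minimal sketch, not taken from this diff, of how a user such as the futex code (named by the kerneldoc above as the typical user) might resolve a fault after an access under pagefault_disable() has failed; the function name and error handling are illustrative assumptions.

#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical caller: make a user address writable after an atomic access
 * failed with -EFAULT, then let the caller retry that access. */
static int resolve_user_write_fault(unsigned long uaddr)
{
        struct mm_struct *mm = current->mm;
        int ret;

        down_read(&mm->mmap_sem);
        ret = fixup_user_fault(current, mm, uaddr, FAULT_FLAG_WRITE);
        up_read(&mm->mmap_sem);

        /* 0 on success; -EFAULT, -ENOMEM or -EHWPOISON otherwise */
        return ret;
}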
@@ -2796,30 +2844,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2796 | } | 2844 | } |
2797 | EXPORT_SYMBOL(unmap_mapping_range); | 2845 | EXPORT_SYMBOL(unmap_mapping_range); |
2798 | 2846 | ||
2799 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | ||
2800 | { | ||
2801 | struct address_space *mapping = inode->i_mapping; | ||
2802 | |||
2803 | /* | ||
2804 | * If the underlying filesystem is not going to provide | ||
2805 | * a way to truncate a range of blocks (punch a hole) - | ||
2806 | * we should return failure right now. | ||
2807 | */ | ||
2808 | if (!inode->i_op->truncate_range) | ||
2809 | return -ENOSYS; | ||
2810 | |||
2811 | mutex_lock(&inode->i_mutex); | ||
2812 | down_write(&inode->i_alloc_sem); | ||
2813 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
2814 | truncate_inode_pages_range(mapping, offset, end); | ||
2815 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
2816 | inode->i_op->truncate_range(inode, offset, end); | ||
2817 | up_write(&inode->i_alloc_sem); | ||
2818 | mutex_unlock(&inode->i_mutex); | ||
2819 | |||
2820 | return 0; | ||
2821 | } | ||
2822 | |||
2823 | /* | 2847 | /* |
2824 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2848 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2825 | * but allow concurrent faults), and pte mapped but not yet locked. | 2849 | * but allow concurrent faults), and pte mapped but not yet locked. |
@@ -3125,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3125 | pte_t *page_table; | 3149 | pte_t *page_table; |
3126 | spinlock_t *ptl; | 3150 | spinlock_t *ptl; |
3127 | struct page *page; | 3151 | struct page *page; |
3152 | struct page *cow_page; | ||
3128 | pte_t entry; | 3153 | pte_t entry; |
3129 | int anon = 0; | 3154 | int anon = 0; |
3130 | int charged = 0; | ||
3131 | struct page *dirty_page = NULL; | 3155 | struct page *dirty_page = NULL; |
3132 | struct vm_fault vmf; | 3156 | struct vm_fault vmf; |
3133 | int ret; | 3157 | int ret; |
3134 | int page_mkwrite = 0; | 3158 | int page_mkwrite = 0; |
3135 | 3159 | ||
3160 | /* | ||
3161 | * If we do COW later, allocate the page before taking lock_page() | ||
3162 | * on the file cache page. This will reduce lock holding time. | ||
3163 | */ | ||
3164 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { | ||
3165 | |||
3166 | if (unlikely(anon_vma_prepare(vma))) | ||
3167 | return VM_FAULT_OOM; | ||
3168 | |||
3169 | cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
3170 | if (!cow_page) | ||
3171 | return VM_FAULT_OOM; | ||
3172 | |||
3173 | if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { | ||
3174 | page_cache_release(cow_page); | ||
3175 | return VM_FAULT_OOM; | ||
3176 | } | ||
3177 | } else | ||
3178 | cow_page = NULL; | ||
3179 | |||
3136 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 3180 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); |
3137 | vmf.pgoff = pgoff; | 3181 | vmf.pgoff = pgoff; |
3138 | vmf.flags = flags; | 3182 | vmf.flags = flags; |
@@ -3141,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3141 | ret = vma->vm_ops->fault(vma, &vmf); | 3185 | ret = vma->vm_ops->fault(vma, &vmf); |
3142 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | 3186 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | |
3143 | VM_FAULT_RETRY))) | 3187 | VM_FAULT_RETRY))) |
3144 | return ret; | 3188 | goto uncharge_out; |
3145 | 3189 | ||
3146 | if (unlikely(PageHWPoison(vmf.page))) { | 3190 | if (unlikely(PageHWPoison(vmf.page))) { |
3147 | if (ret & VM_FAULT_LOCKED) | 3191 | if (ret & VM_FAULT_LOCKED) |
3148 | unlock_page(vmf.page); | 3192 | unlock_page(vmf.page); |
3149 | return VM_FAULT_HWPOISON; | 3193 | ret = VM_FAULT_HWPOISON; |
3194 | goto uncharge_out; | ||
3150 | } | 3195 | } |
3151 | 3196 | ||
3152 | /* | 3197 | /* |
@@ -3164,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3164 | page = vmf.page; | 3209 | page = vmf.page; |
3165 | if (flags & FAULT_FLAG_WRITE) { | 3210 | if (flags & FAULT_FLAG_WRITE) { |
3166 | if (!(vma->vm_flags & VM_SHARED)) { | 3211 | if (!(vma->vm_flags & VM_SHARED)) { |
3212 | page = cow_page; | ||
3167 | anon = 1; | 3213 | anon = 1; |
3168 | if (unlikely(anon_vma_prepare(vma))) { | ||
3169 | ret = VM_FAULT_OOM; | ||
3170 | goto out; | ||
3171 | } | ||
3172 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, | ||
3173 | vma, address); | ||
3174 | if (!page) { | ||
3175 | ret = VM_FAULT_OOM; | ||
3176 | goto out; | ||
3177 | } | ||
3178 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | ||
3179 | ret = VM_FAULT_OOM; | ||
3180 | page_cache_release(page); | ||
3181 | goto out; | ||
3182 | } | ||
3183 | charged = 1; | ||
3184 | copy_user_highpage(page, vmf.page, address, vma); | 3214 | copy_user_highpage(page, vmf.page, address, vma); |
3185 | __SetPageUptodate(page); | 3215 | __SetPageUptodate(page); |
3186 | } else { | 3216 | } else { |
@@ -3249,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3249 | /* no need to invalidate: a not-present page won't be cached */ | 3279 | /* no need to invalidate: a not-present page won't be cached */ |
3250 | update_mmu_cache(vma, address, page_table); | 3280 | update_mmu_cache(vma, address, page_table); |
3251 | } else { | 3281 | } else { |
3252 | if (charged) | 3282 | if (cow_page) |
3253 | mem_cgroup_uncharge_page(page); | 3283 | mem_cgroup_uncharge_page(cow_page); |
3254 | if (anon) | 3284 | if (anon) |
3255 | page_cache_release(page); | 3285 | page_cache_release(page); |
3256 | else | 3286 | else |
@@ -3259,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3259 | 3289 | ||
3260 | pte_unmap_unlock(page_table, ptl); | 3290 | pte_unmap_unlock(page_table, ptl); |
3261 | 3291 | ||
3262 | out: | ||
3263 | if (dirty_page) { | 3292 | if (dirty_page) { |
3264 | struct address_space *mapping = page->mapping; | 3293 | struct address_space *mapping = page->mapping; |
3265 | 3294 | ||
@@ -3289,6 +3318,13 @@ out: | |||
3289 | unwritable_page: | 3318 | unwritable_page: |
3290 | page_cache_release(page); | 3319 | page_cache_release(page); |
3291 | return ret; | 3320 | return ret; |
3321 | uncharge_out: | ||
3322 | /* the fs's fault handler returned an error */ | ||
3323 | if (cow_page) { | ||
3324 | mem_cgroup_uncharge_page(cow_page); | ||
3325 | page_cache_release(cow_page); | ||
3326 | } | ||
3327 | return ret; | ||
3292 | } | 3328 | } |
3293 | 3329 | ||
3294 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3330 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9f646374e32f..6e7d8b21dbfa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -34,6 +34,17 @@ | |||
34 | 34 | ||
35 | #include "internal.h" | 35 | #include "internal.h" |
36 | 36 | ||
37 | /* | ||
38 | * online_page_callback contains pointer to current page onlining function. | ||
39 | * Initially it is generic_online_page(). If required, it can be changed | ||
40 | * by calling set_online_page_callback() to register a new callback and | ||
41 | * restore_online_page_callback() to restore the generic callback. | ||
42 | */ | ||
43 | |||
44 | static void generic_online_page(struct page *page); | ||
45 | |||
46 | static online_page_callback_t online_page_callback = generic_online_page; | ||
47 | |||
37 | DEFINE_MUTEX(mem_hotplug_mutex); | 48 | DEFINE_MUTEX(mem_hotplug_mutex); |
38 | 49 | ||
39 | void lock_memory_hotplug(void) | 50 | void lock_memory_hotplug(void) |
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
361 | } | 372 | } |
362 | EXPORT_SYMBOL_GPL(__remove_pages); | 373 | EXPORT_SYMBOL_GPL(__remove_pages); |
363 | 374 | ||
364 | void online_page(struct page *page) | 375 | int set_online_page_callback(online_page_callback_t callback) |
376 | { | ||
377 | int rc = -EINVAL; | ||
378 | |||
379 | lock_memory_hotplug(); | ||
380 | |||
381 | if (online_page_callback == generic_online_page) { | ||
382 | online_page_callback = callback; | ||
383 | rc = 0; | ||
384 | } | ||
385 | |||
386 | unlock_memory_hotplug(); | ||
387 | |||
388 | return rc; | ||
389 | } | ||
390 | EXPORT_SYMBOL_GPL(set_online_page_callback); | ||
391 | |||
392 | int restore_online_page_callback(online_page_callback_t callback) | ||
393 | { | ||
394 | int rc = -EINVAL; | ||
395 | |||
396 | lock_memory_hotplug(); | ||
397 | |||
398 | if (online_page_callback == callback) { | ||
399 | online_page_callback = generic_online_page; | ||
400 | rc = 0; | ||
401 | } | ||
402 | |||
403 | unlock_memory_hotplug(); | ||
404 | |||
405 | return rc; | ||
406 | } | ||
407 | EXPORT_SYMBOL_GPL(restore_online_page_callback); | ||
408 | |||
409 | void __online_page_set_limits(struct page *page) | ||
365 | { | 410 | { |
366 | unsigned long pfn = page_to_pfn(page); | 411 | unsigned long pfn = page_to_pfn(page); |
367 | 412 | ||
368 | totalram_pages++; | ||
369 | if (pfn >= num_physpages) | 413 | if (pfn >= num_physpages) |
370 | num_physpages = pfn + 1; | 414 | num_physpages = pfn + 1; |
415 | } | ||
416 | EXPORT_SYMBOL_GPL(__online_page_set_limits); | ||
417 | |||
418 | void __online_page_increment_counters(struct page *page) | ||
419 | { | ||
420 | totalram_pages++; | ||
371 | 421 | ||
372 | #ifdef CONFIG_HIGHMEM | 422 | #ifdef CONFIG_HIGHMEM |
373 | if (PageHighMem(page)) | 423 | if (PageHighMem(page)) |
374 | totalhigh_pages++; | 424 | totalhigh_pages++; |
375 | #endif | 425 | #endif |
426 | } | ||
427 | EXPORT_SYMBOL_GPL(__online_page_increment_counters); | ||
376 | 428 | ||
429 | void __online_page_free(struct page *page) | ||
430 | { | ||
377 | ClearPageReserved(page); | 431 | ClearPageReserved(page); |
378 | init_page_count(page); | 432 | init_page_count(page); |
379 | __free_page(page); | 433 | __free_page(page); |
380 | } | 434 | } |
435 | EXPORT_SYMBOL_GPL(__online_page_free); | ||
436 | |||
437 | static void generic_online_page(struct page *page) | ||
438 | { | ||
439 | __online_page_set_limits(page); | ||
440 | __online_page_increment_counters(page); | ||
441 | __online_page_free(page); | ||
442 | } | ||
381 | 443 | ||
382 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | 444 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
383 | void *arg) | 445 | void *arg) |
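For context, here is a sketch of how a consumer (for example, a ballooning driver) might plug into the new hook; the driver name and its page-claiming policy are hypothetical, only the callback API and the __online_page_*() helpers come from this patch.

#include <linux/mm.h>
#include <linux/memory_hotplug.h>

/* Hypothetical policy: the driver keeps every fourth onlined page. */
static bool mydrv_wants_page(struct page *page)
{
        return (page_to_pfn(page) & 3) == 0;
}

static void mydrv_online_page(struct page *page)
{
        __online_page_set_limits(page);

        if (mydrv_wants_page(page)) {
                /* keep the page in a driver-private pool instead of
                 * handing it to the page allocator (list handling omitted) */
                return;
        }

        __online_page_increment_counters(page);
        __online_page_free(page);
}

static int mydrv_start(void)
{
        /* fails with -EINVAL if another callback is already registered */
        return set_online_page_callback(&mydrv_online_page);
}

static void mydrv_stop(void)
{
        restore_online_page_callback(&mydrv_online_page);
}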
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
388 | if (PageReserved(pfn_to_page(start_pfn))) | 450 | if (PageReserved(pfn_to_page(start_pfn))) |
389 | for (i = 0; i < nr_pages; i++) { | 451 | for (i = 0; i < nr_pages; i++) { |
390 | page = pfn_to_page(start_pfn + i); | 452 | page = pfn_to_page(start_pfn + i); |
391 | online_page(page); | 453 | (*online_page_callback)(page); |
392 | onlined_pages++; | 454 | onlined_pages++; |
393 | } | 455 | } |
394 | *(unsigned long *)arg = onlined_pages; | 456 | *(unsigned long *)arg = onlined_pages; |
@@ -494,6 +556,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
494 | /* init node's zones as empty zones, we don't have any present pages.*/ | 556 | /* init node's zones as empty zones, we don't have any present pages.*/ |
495 | free_area_init_node(nid, zones_size, start_pfn, zholes_size); | 557 | free_area_init_node(nid, zones_size, start_pfn, zholes_size); |
496 | 558 | ||
559 | /* | ||
560 | * The node we allocated has no zone fallback lists. For avoiding | ||
561 | * to access not-initialized zonelist, build here. | ||
562 | */ | ||
563 | mutex_lock(&zonelists_mutex); | ||
564 | build_all_zonelists(NULL); | ||
565 | mutex_unlock(&zonelists_mutex); | ||
566 | |||
497 | return pgdat; | 567 | return pgdat; |
498 | } | 568 | } |
499 | 569 | ||
@@ -515,7 +585,7 @@ int mem_online_node(int nid) | |||
515 | 585 | ||
516 | lock_memory_hotplug(); | 586 | lock_memory_hotplug(); |
517 | pgdat = hotadd_new_pgdat(nid, 0); | 587 | pgdat = hotadd_new_pgdat(nid, 0); |
518 | if (pgdat) { | 588 | if (!pgdat) { |
519 | ret = -ENOMEM; | 589 | ret = -ENOMEM; |
520 | goto out; | 590 | goto out; |
521 | } | 591 | } |
diff --git a/mm/migrate.c b/mm/migrate.c index e4a5c912983d..666e4e677414 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
288 | */ | 288 | */ |
289 | __dec_zone_page_state(page, NR_FILE_PAGES); | 289 | __dec_zone_page_state(page, NR_FILE_PAGES); |
290 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 290 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
291 | if (PageSwapBacked(page)) { | 291 | if (!PageSwapCache(page) && PageSwapBacked(page)) { |
292 | __dec_zone_page_state(page, NR_SHMEM); | 292 | __dec_zone_page_state(page, NR_SHMEM); |
293 | __inc_zone_page_state(newpage, NR_SHMEM); | 293 | __inc_zone_page_state(newpage, NR_SHMEM); |
294 | } | 294 | } |
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
122 | return 0; | 122 | return 0; |
123 | 123 | ||
124 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 124 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
125 | unsigned long n; | 125 | free = global_page_state(NR_FREE_PAGES); |
126 | free += global_page_state(NR_FILE_PAGES); | ||
127 | |||
128 | /* | ||
129 | * shmem pages shouldn't be counted as free in this | ||
130 | * case, they can't be purged, only swapped out, and | ||
131 | * that won't affect the overall amount of available | ||
132 | * memory in the system. | ||
133 | */ | ||
134 | free -= global_page_state(NR_SHMEM); | ||
126 | 135 | ||
127 | free = global_page_state(NR_FILE_PAGES); | ||
128 | free += nr_swap_pages; | 136 | free += nr_swap_pages; |
129 | 137 | ||
130 | /* | 138 | /* |
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
136 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 144 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
137 | 145 | ||
138 | /* | 146 | /* |
139 | * Leave the last 3% for root | ||
140 | */ | ||
141 | if (!cap_sys_admin) | ||
142 | free -= free / 32; | ||
143 | |||
144 | if (free > pages) | ||
145 | return 0; | ||
146 | |||
147 | /* | ||
148 | * nr_free_pages() is very expensive on large systems, | ||
149 | * only call if we're about to fail. | ||
150 | */ | ||
151 | n = nr_free_pages(); | ||
152 | |||
153 | /* | ||
154 | * Leave reserved pages. The pages are not for anonymous pages. | 147 | * Leave reserved pages. The pages are not for anonymous pages. |
155 | */ | 148 | */ |
156 | if (n <= totalreserve_pages) | 149 | if (free <= totalreserve_pages) |
157 | goto error; | 150 | goto error; |
158 | else | 151 | else |
159 | n -= totalreserve_pages; | 152 | free -= totalreserve_pages; |
160 | 153 | ||
161 | /* | 154 | /* |
162 | * Leave the last 3% for root | 155 | * Leave the last 3% for root |
163 | */ | 156 | */ |
164 | if (!cap_sys_admin) | 157 | if (!cap_sys_admin) |
165 | n -= n / 32; | 158 | free -= free / 32; |
166 | free += n; | ||
167 | 159 | ||
168 | if (free > pages) | 160 | if (free > pages) |
169 | return 0; | 161 | return 0; |
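To make the reworked OVERCOMMIT_GUESS arithmetic concrete, here is a small userspace replay of the estimate with made-up page counts; all values are hypothetical and the snippet is not part of the patch.

#include <stdio.h>

int main(void)
{
        /* hypothetical counters, all in pages */
        long nr_free = 50000, nr_file = 200000, nr_shmem = 30000;
        long nr_swap_pages = 100000, slab_reclaimable = 10000;
        long totalreserve_pages = 5000;
        int cap_sys_admin = 0;
        long free;

        free  = nr_free + nr_file;
        free -= nr_shmem;               /* shmem can only be swapped, not purged */
        free += nr_swap_pages;
        free += slab_reclaimable;
        free -= totalreserve_pages;     /* reserved pages are not for anon memory */
        if (!cap_sys_admin)
                free -= free / 32;      /* leave the last 3% for root */

        printf("pages considered available: %ld\n", free);
        return 0;
}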
@@ -906,14 +898,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | |||
906 | if (anon_vma) | 898 | if (anon_vma) |
907 | return anon_vma; | 899 | return anon_vma; |
908 | try_prev: | 900 | try_prev: |
909 | /* | 901 | near = vma->vm_prev; |
910 | * It is potentially slow to have to call find_vma_prev here. | ||
911 | * But it's only on the first write fault on the vma, not | ||
912 | * every time, and we could devise a way to avoid it later | ||
913 | * (e.g. stash info in next's anon_vma_node when assigning | ||
914 | * an anon_vma, or when trying vma_merge). Another time. | ||
915 | */ | ||
916 | BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); | ||
917 | if (!near) | 902 | if (!near) |
918 | goto none; | 903 | goto none; |
919 | 904 | ||
@@ -2044,9 +2029,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2044 | return -EINVAL; | 2029 | return -EINVAL; |
2045 | 2030 | ||
2046 | /* Find the first overlapping VMA */ | 2031 | /* Find the first overlapping VMA */ |
2047 | vma = find_vma_prev(mm, start, &prev); | 2032 | vma = find_vma(mm, start); |
2048 | if (!vma) | 2033 | if (!vma) |
2049 | return 0; | 2034 | return 0; |
2035 | prev = vma->vm_prev; | ||
2050 | /* we have start < vma->vm_end */ | 2036 | /* we have start < vma->vm_end */ |
2051 | 2037 | ||
2052 | /* if it doesn't overlap, we have nothing.. */ | 2038 | /* if it doesn't overlap, we have nothing.. */ |
diff --git a/mm/nommu.c b/mm/nommu.c index 1fd0c51b10a6..4358032566e9 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -22,7 +22,6 @@ | |||
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
25 | #include <linux/tracehook.h> | ||
26 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
27 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
28 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
1087 | * it's being traced - otherwise breakpoints set in it may interfere | 1086 | * it's being traced - otherwise breakpoints set in it may interfere |
1088 | * with another untraced process | 1087 | * with another untraced process |
1089 | */ | 1088 | */ |
1090 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) | 1089 | if ((flags & MAP_PRIVATE) && current->ptrace) |
1091 | vm_flags &= ~VM_MAYSHARE; | 1090 | vm_flags &= ~VM_MAYSHARE; |
1092 | 1091 | ||
1093 | return vm_flags; | 1092 | return vm_flags; |
@@ -1813,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1813 | return NULL; | 1812 | return NULL; |
1814 | } | 1813 | } |
1815 | 1814 | ||
1816 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | 1815 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, |
1817 | unsigned long to, unsigned long size, pgprot_t prot) | 1816 | unsigned long pfn, unsigned long size, pgprot_t prot) |
1818 | { | 1817 | { |
1819 | vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; | 1818 | if (addr != (pfn << PAGE_SHIFT)) |
1819 | return -EINVAL; | ||
1820 | |||
1821 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | ||
1820 | return 0; | 1822 | return 0; |
1821 | } | 1823 | } |
1822 | EXPORT_SYMBOL(remap_pfn_range); | 1824 | EXPORT_SYMBOL(remap_pfn_range); |
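As a reminder of the calling convention the new check enforces, a driver's ->mmap method would forward to remap_pfn_range() roughly as below; the driver name and the use of vm_pgoff as the target pfn are illustrative assumptions.

#include <linux/fs.h>
#include <linux/mm.h>

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long pfn = vma->vm_pgoff;

        /*
         * On nommu this now succeeds only for an identity mapping,
         * i.e. vma->vm_start == pfn << PAGE_SHIFT.
         */
        return remap_pfn_range(vma, vma->vm_start, pfn,
                               vma->vm_end - vma->vm_start,
                               vma->vm_page_prot);
}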
@@ -1882,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1882 | return 0; | 1884 | return 0; |
1883 | 1885 | ||
1884 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 1886 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
1885 | unsigned long n; | 1887 | free = global_page_state(NR_FREE_PAGES); |
1888 | free += global_page_state(NR_FILE_PAGES); | ||
1889 | |||
1890 | /* | ||
1891 | * shmem pages shouldn't be counted as free in this | ||
1892 | * case, they can't be purged, only swapped out, and | ||
1893 | * that won't affect the overall amount of available | ||
1894 | * memory in the system. | ||
1895 | */ | ||
1896 | free -= global_page_state(NR_SHMEM); | ||
1886 | 1897 | ||
1887 | free = global_page_state(NR_FILE_PAGES); | ||
1888 | free += nr_swap_pages; | 1898 | free += nr_swap_pages; |
1889 | 1899 | ||
1890 | /* | 1900 | /* |
@@ -1896,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1896 | free += global_page_state(NR_SLAB_RECLAIMABLE); | 1906 | free += global_page_state(NR_SLAB_RECLAIMABLE); |
1897 | 1907 | ||
1898 | /* | 1908 | /* |
1899 | * Leave the last 3% for root | ||
1900 | */ | ||
1901 | if (!cap_sys_admin) | ||
1902 | free -= free / 32; | ||
1903 | |||
1904 | if (free > pages) | ||
1905 | return 0; | ||
1906 | |||
1907 | /* | ||
1908 | * nr_free_pages() is very expensive on large systems, | ||
1909 | * only call if we're about to fail. | ||
1910 | */ | ||
1911 | n = nr_free_pages(); | ||
1912 | |||
1913 | /* | ||
1914 | * Leave reserved pages. The pages are not for anonymous pages. | 1909 | * Leave reserved pages. The pages are not for anonymous pages. |
1915 | */ | 1910 | */ |
1916 | if (n <= totalreserve_pages) | 1911 | if (free <= totalreserve_pages) |
1917 | goto error; | 1912 | goto error; |
1918 | else | 1913 | else |
1919 | n -= totalreserve_pages; | 1914 | free -= totalreserve_pages; |
1920 | 1915 | ||
1921 | /* | 1916 | /* |
1922 | * Leave the last 3% for root | 1917 | * Leave the last 3% for root |
1923 | */ | 1918 | */ |
1924 | if (!cap_sys_admin) | 1919 | if (!cap_sys_admin) |
1925 | n -= n / 32; | 1920 | free -= free / 32; |
1926 | free += n; | ||
1927 | 1921 | ||
1928 | if (free > pages) | 1922 | if (free > pages) |
1929 | return 0; | 1923 | return 0; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4b0991ca351..eafff89b3dd6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
339 | * then wait for it to finish before killing | 339 | * then wait for it to finish before killing |
340 | * some other task unnecessarily. | 340 | * some other task unnecessarily. |
341 | */ | 341 | */ |
342 | if (!(task_ptrace(p->group_leader) & | 342 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) |
343 | PT_TRACE_EXIT)) | ||
344 | return ERR_PTR(-1UL); | 343 | return ERR_PTR(-1UL); |
345 | } | 344 | } |
346 | } | 345 | } |
@@ -488,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
488 | 487 | ||
489 | /* | 488 | /* |
490 | * If any of p's children has a different mm and is eligible for kill, | 489 | * If any of p's children has a different mm and is eligible for kill, |
491 | * the one with the highest badness() score is sacrificed for its | 490 | * the one with the highest oom_badness() score is sacrificed for its |
492 | * parent. This attempts to lose the minimal amount of work done while | 491 | * parent. This attempts to lose the minimal amount of work done while |
493 | * still freeing memory. | 492 | * still freeing memory. |
494 | */ | 493 | */ |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1d781803e629..d1960744f881 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1347,7 +1347,6 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
1347 | void account_page_writeback(struct page *page) | 1347 | void account_page_writeback(struct page *page) |
1348 | { | 1348 | { |
1349 | inc_zone_page_state(page, NR_WRITEBACK); | 1349 | inc_zone_page_state(page, NR_WRITEBACK); |
1350 | inc_zone_page_state(page, NR_WRITTEN); | ||
1351 | } | 1350 | } |
1352 | EXPORT_SYMBOL(account_page_writeback); | 1351 | EXPORT_SYMBOL(account_page_writeback); |
1353 | 1352 | ||
@@ -1564,8 +1563,10 @@ int test_clear_page_writeback(struct page *page) | |||
1564 | } else { | 1563 | } else { |
1565 | ret = TestClearPageWriteback(page); | 1564 | ret = TestClearPageWriteback(page); |
1566 | } | 1565 | } |
1567 | if (ret) | 1566 | if (ret) { |
1568 | dec_zone_page_state(page, NR_WRITEBACK); | 1567 | dec_zone_page_state(page, NR_WRITEBACK); |
1568 | inc_zone_page_state(page, NR_WRITTEN); | ||
1569 | } | ||
1569 | return ret; | 1570 | return ret; |
1570 | } | 1571 | } |
1571 | 1572 | ||
@@ -1611,10 +1612,6 @@ EXPORT_SYMBOL(test_set_page_writeback); | |||
1611 | */ | 1612 | */ |
1612 | int mapping_tagged(struct address_space *mapping, int tag) | 1613 | int mapping_tagged(struct address_space *mapping, int tag) |
1613 | { | 1614 | { |
1614 | int ret; | 1615 | return radix_tree_tagged(&mapping->page_tree, tag); |
1615 | rcu_read_lock(); | ||
1616 | ret = radix_tree_tagged(&mapping->page_tree, tag); | ||
1617 | rcu_read_unlock(); | ||
1618 | return ret; | ||
1619 | } | 1616 | } |
1620 | EXPORT_SYMBOL(mapping_tagged); | 1617 | EXPORT_SYMBOL(mapping_tagged); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8985acdab8..094472377d81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
1616 | set_bit(i, zlc->fullzones); | 1616 | set_bit(i, zlc->fullzones); |
1617 | } | 1617 | } |
1618 | 1618 | ||
1619 | /* | ||
1620 | * clear all zones full, called after direct reclaim makes progress so that | ||
1621 | * a zone that was recently full is not skipped over for up to a second | ||
1622 | */ | ||
1623 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1624 | { | ||
1625 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | ||
1626 | |||
1627 | zlc = zonelist->zlcache_ptr; | ||
1628 | if (!zlc) | ||
1629 | return; | ||
1630 | |||
1631 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | ||
1632 | } | ||
1633 | |||
1619 | #else /* CONFIG_NUMA */ | 1634 | #else /* CONFIG_NUMA */ |
1620 | 1635 | ||
1621 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1636 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
@@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | |||
1632 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1647 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1633 | { | 1648 | { |
1634 | } | 1649 | } |
1650 | |||
1651 | static void zlc_clear_zones_full(struct zonelist *zonelist) | ||
1652 | { | ||
1653 | } | ||
1635 | #endif /* CONFIG_NUMA */ | 1654 | #endif /* CONFIG_NUMA */ |
1636 | 1655 | ||
1637 | /* | 1656 | /* |
@@ -1664,7 +1683,7 @@ zonelist_scan: | |||
1664 | continue; | 1683 | continue; |
1665 | if ((alloc_flags & ALLOC_CPUSET) && | 1684 | if ((alloc_flags & ALLOC_CPUSET) && |
1666 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1685 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1667 | goto try_next_zone; | 1686 | continue; |
1668 | 1687 | ||
1669 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1688 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1670 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1689 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
@@ -1676,17 +1695,36 @@ zonelist_scan: | |||
1676 | classzone_idx, alloc_flags)) | 1695 | classzone_idx, alloc_flags)) |
1677 | goto try_this_zone; | 1696 | goto try_this_zone; |
1678 | 1697 | ||
1698 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1699 | /* | ||
1700 | * we do zlc_setup if there are multiple nodes | ||
1701 | * and before considering the first zone allowed | ||
1702 | * by the cpuset. | ||
1703 | */ | ||
1704 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1705 | zlc_active = 1; | ||
1706 | did_zlc_setup = 1; | ||
1707 | } | ||
1708 | |||
1679 | if (zone_reclaim_mode == 0) | 1709 | if (zone_reclaim_mode == 0) |
1680 | goto this_zone_full; | 1710 | goto this_zone_full; |
1681 | 1711 | ||
1712 | /* | ||
1713 | * As we may have just activated ZLC, check if the first | ||
1714 | * eligible zone has failed zone_reclaim recently. | ||
1715 | */ | ||
1716 | if (NUMA_BUILD && zlc_active && | ||
1717 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | ||
1718 | continue; | ||
1719 | |||
1682 | ret = zone_reclaim(zone, gfp_mask, order); | 1720 | ret = zone_reclaim(zone, gfp_mask, order); |
1683 | switch (ret) { | 1721 | switch (ret) { |
1684 | case ZONE_RECLAIM_NOSCAN: | 1722 | case ZONE_RECLAIM_NOSCAN: |
1685 | /* did not scan */ | 1723 | /* did not scan */ |
1686 | goto try_next_zone; | 1724 | continue; |
1687 | case ZONE_RECLAIM_FULL: | 1725 | case ZONE_RECLAIM_FULL: |
1688 | /* scanned but unreclaimable */ | 1726 | /* scanned but unreclaimable */ |
1689 | goto this_zone_full; | 1727 | continue; |
1690 | default: | 1728 | default: |
1691 | /* did we reclaim enough */ | 1729 | /* did we reclaim enough */ |
1692 | if (!zone_watermark_ok(zone, order, mark, | 1730 | if (!zone_watermark_ok(zone, order, mark, |
@@ -1703,16 +1741,6 @@ try_this_zone: | |||
1703 | this_zone_full: | 1741 | this_zone_full: |
1704 | if (NUMA_BUILD) | 1742 | if (NUMA_BUILD) |
1705 | zlc_mark_zone_full(zonelist, z); | 1743 | zlc_mark_zone_full(zonelist, z); |
1706 | try_next_zone: | ||
1707 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | ||
1708 | /* | ||
1709 | * we do zlc_setup after the first zone is tried but only | ||
1710 | * if there are multiple nodes make it worthwhile | ||
1711 | */ | ||
1712 | allowednodes = zlc_setup(zonelist, alloc_flags); | ||
1713 | zlc_active = 1; | ||
1714 | did_zlc_setup = 1; | ||
1715 | } | ||
1716 | } | 1744 | } |
1717 | 1745 | ||
1718 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1746 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
@@ -1954,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
1954 | if (unlikely(!(*did_some_progress))) | 1982 | if (unlikely(!(*did_some_progress))) |
1955 | return NULL; | 1983 | return NULL; |
1956 | 1984 | ||
1985 | /* After successful reclaim, reconsider all zones for allocation */ | ||
1986 | if (NUMA_BUILD) | ||
1987 | zlc_clear_zones_full(zonelist); | ||
1988 | |||
1957 | retry: | 1989 | retry: |
1958 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1990 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
1959 | zonelist, high_zoneidx, | 1991 | zonelist, high_zoneidx, |
@@ -4585,6 +4617,60 @@ void __init sort_node_map(void) | |||
4585 | cmp_node_active_region, NULL); | 4617 | cmp_node_active_region, NULL); |
4586 | } | 4618 | } |
4587 | 4619 | ||
4620 | /** | ||
4621 | * node_map_pfn_alignment - determine the maximum internode alignment | ||
4622 | * | ||
4623 | * This function should be called after node map is populated and sorted. | ||
4624 | * It calculates the maximum power of two alignment which can distinguish | ||
4625 | * all the nodes. | ||
4626 | * | ||
4627 | * For example, if all nodes are 1GiB and aligned to 1GiB, the return value | ||
4628 | * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the | ||
4629 | * nodes are shifted by 256MiB, 256MiB is returned. Note that if only the | ||
4630 | * last node is shifted, 1GiB is enough and this function will indicate so. | ||
4631 | * | ||
4632 | * This is used to test whether pfn -> nid mapping of the chosen memory | ||
4633 | * model has fine enough granularity to avoid incorrect mapping for the | ||
4634 | * populated node map. | ||
4635 | * | ||
4636 | * Returns the determined alignment in pfn's. 0 if there is no alignment | ||
4637 | * requirement (single node). | ||
4638 | */ | ||
4639 | unsigned long __init node_map_pfn_alignment(void) | ||
4640 | { | ||
4641 | unsigned long accl_mask = 0, last_end = 0; | ||
4642 | int last_nid = -1; | ||
4643 | int i; | ||
4644 | |||
4645 | for_each_active_range_index_in_nid(i, MAX_NUMNODES) { | ||
4646 | int nid = early_node_map[i].nid; | ||
4647 | unsigned long start = early_node_map[i].start_pfn; | ||
4648 | unsigned long end = early_node_map[i].end_pfn; | ||
4649 | unsigned long mask; | ||
4650 | |||
4651 | if (!start || last_nid < 0 || last_nid == nid) { | ||
4652 | last_nid = nid; | ||
4653 | last_end = end; | ||
4654 | continue; | ||
4655 | } | ||
4656 | |||
4657 | /* | ||
4658 | * Start with a mask granular enough to pin-point to the | ||
4659 | * start pfn and tick off bits one-by-one until it becomes | ||
4660 | * too coarse to separate the current node from the last. | ||
4661 | */ | ||
4662 | mask = ~((1 << __ffs(start)) - 1); | ||
4663 | while (mask && last_end <= (start & (mask << 1))) | ||
4664 | mask <<= 1; | ||
4665 | |||
4666 | /* accumulate all internode masks */ | ||
4667 | accl_mask |= mask; | ||
4668 | } | ||
4669 | |||
4670 | /* convert mask to number of pages */ | ||
4671 | return ~accl_mask + 1; | ||
4672 | } | ||
4673 | |||
4588 | /* Find the lowest pfn for a node */ | 4674 | /* Find the lowest pfn for a node */ |
4589 | static unsigned long __init find_min_pfn_for_node(int nid) | 4675 | static unsigned long __init find_min_pfn_for_node(int nid) |
4590 | { | 4676 | { |
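To see what the helper computes, here is a standalone userspace replay of the same mask arithmetic on a toy two-node layout; the layout and the use of __builtin_ctzl() in place of __ffs() are assumptions for illustration, not part of the patch.

#include <stdio.h>

struct toy_range { int nid; unsigned long start_pfn, end_pfn; };

int main(void)
{
        /* with 4KiB pages, 0x40000 pfns == 1GiB */
        struct toy_range map[] = {
                { 0, 0x00000, 0x40000 },        /* node 0: [0, 1GiB)         */
                { 1, 0x50000, 0x90000 },        /* node 1: starts at 1.25GiB */
        };
        unsigned long accl_mask = 0, last_end = 0;
        int last_nid = -1;
        unsigned int i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
                unsigned long start = map[i].start_pfn, mask;

                if (!start || last_nid < 0 || last_nid == map[i].nid) {
                        last_nid = map[i].nid;
                        last_end = map[i].end_pfn;
                        continue;
                }

                /* same coarsening loop as node_map_pfn_alignment() */
                mask = ~((1UL << __builtin_ctzl(start)) - 1);
                while (mask && last_end <= (start & (mask << 1)))
                        mask <<= 1;

                accl_mask |= mask;
        }

        /* prints 0x40000: 1GiB is enough when only the last node is shifted */
        printf("alignment: %#lx pfns\n", ~accl_mask + 1);
        return 0;
}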
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 74ccff61d1be..39d216d535ea 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr) | |||
162 | } | 162 | } |
163 | #endif | 163 | #endif |
164 | 164 | ||
165 | static int __meminit init_section_page_cgroup(unsigned long pfn) | 165 | static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) |
166 | { | 166 | { |
167 | struct page_cgroup *base, *pc; | 167 | struct page_cgroup *base, *pc; |
168 | struct mem_section *section; | 168 | struct mem_section *section; |
169 | unsigned long table_size; | 169 | unsigned long table_size; |
170 | unsigned long nr; | 170 | unsigned long nr; |
171 | int nid, index; | 171 | int index; |
172 | 172 | ||
173 | nr = pfn_to_section_nr(pfn); | 173 | nr = pfn_to_section_nr(pfn); |
174 | section = __nr_to_section(nr); | 174 | section = __nr_to_section(nr); |
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn) | |||
176 | if (section->page_cgroup) | 176 | if (section->page_cgroup) |
177 | return 0; | 177 | return 0; |
178 | 178 | ||
179 | nid = page_to_nid(pfn_to_page(pfn)); | ||
180 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 179 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
181 | base = alloc_page_cgroup(table_size, nid); | 180 | base = alloc_page_cgroup(table_size, nid); |
182 | 181 | ||
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn) | |||
196 | pc = base + index; | 195 | pc = base + index; |
197 | init_page_cgroup(pc, nr); | 196 | init_page_cgroup(pc, nr); |
198 | } | 197 | } |
199 | 198 | /* | |
199 | * The passed "pfn" may not be aligned to SECTION. For the calculation | ||
200 | * we need to apply a mask. | ||
201 | */ | ||
202 | pfn &= PAGE_SECTION_MASK; | ||
200 | section->page_cgroup = base - pfn; | 203 | section->page_cgroup = base - pfn; |
201 | total_usage += table_size; | 204 | total_usage += table_size; |
202 | return 0; | 205 | return 0; |
@@ -222,13 +225,23 @@ int __meminit online_page_cgroup(unsigned long start_pfn, | |||
222 | unsigned long start, end, pfn; | 225 | unsigned long start, end, pfn; |
223 | int fail = 0; | 226 | int fail = 0; |
224 | 227 | ||
225 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | 228 | start = SECTION_ALIGN_DOWN(start_pfn); |
226 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | 229 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); |
230 | |||
231 | if (nid == -1) { | ||
232 | /* | ||
233 | * In this case, "nid" already exists and contains valid memory. | ||
234 | * "start_pfn" passed to us is a pfn which is an arg for | ||
235 | * online__pages(), and start_pfn should exist. | ||
236 | */ | ||
237 | nid = pfn_to_nid(start_pfn); | ||
238 | VM_BUG_ON(!node_state(nid, N_ONLINE)); | ||
239 | } | ||
227 | 240 | ||
228 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | 241 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { |
229 | if (!pfn_present(pfn)) | 242 | if (!pfn_present(pfn)) |
230 | continue; | 243 | continue; |
231 | fail = init_section_page_cgroup(pfn); | 244 | fail = init_section_page_cgroup(pfn, nid); |
232 | } | 245 | } |
233 | if (!fail) | 246 | if (!fail) |
234 | return 0; | 247 | return 0; |
@@ -245,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn, | |||
245 | { | 258 | { |
246 | unsigned long start, end, pfn; | 259 | unsigned long start, end, pfn; |
247 | 260 | ||
248 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | 261 | start = SECTION_ALIGN_DOWN(start_pfn); |
249 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | 262 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); |
250 | 263 | ||
251 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | 264 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) |
252 | __free_page_cgroup(pfn); | 265 | __free_page_cgroup(pfn); |
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, | |||
284 | void __init page_cgroup_init(void) | 297 | void __init page_cgroup_init(void) |
285 | { | 298 | { |
286 | unsigned long pfn; | 299 | unsigned long pfn; |
287 | int fail = 0; | 300 | int nid; |
288 | 301 | ||
289 | if (mem_cgroup_disabled()) | 302 | if (mem_cgroup_disabled()) |
290 | return; | 303 | return; |
291 | 304 | ||
292 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { | 305 | for_each_node_state(nid, N_HIGH_MEMORY) { |
293 | if (!pfn_present(pfn)) | 306 | unsigned long start_pfn, end_pfn; |
294 | continue; | 307 | |
295 | fail = init_section_page_cgroup(pfn); | 308 | start_pfn = node_start_pfn(nid); |
296 | } | 309 | end_pfn = node_end_pfn(nid); |
297 | if (fail) { | 310 | /* |
298 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); | 311 | * start_pfn and end_pfn may not be aligned to SECTION and the |
299 | panic("Out of memory"); | 312 | * page->flags of out of node pages are not initialized. So we |
300 | } else { | 313 | * scan [start_pfn, the biggest section's pfn < end_pfn) here. |
301 | hotplug_memory_notifier(page_cgroup_callback, 0); | 314 | */ |
315 | for (pfn = start_pfn; | ||
316 | pfn < end_pfn; | ||
317 | pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { | ||
318 | |||
319 | if (!pfn_valid(pfn)) | ||
320 | continue; | ||
321 | /* | ||
322 | * Nodes' pfns can overlap. | ||
323 | * We know some arches can have a node layout such as | ||
324 | * -------------pfn--------------> | ||
325 | * N0 | N1 | N2 | N0 | N1 | N2|.... | ||
326 | */ | ||
327 | if (pfn_to_nid(pfn) != nid) | ||
328 | continue; | ||
329 | if (init_section_page_cgroup(pfn, nid)) | ||
330 | goto oom; | ||
331 | } | ||
302 | } | 332 | } |
333 | hotplug_memory_notifier(page_cgroup_callback, 0); | ||
303 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | 334 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); |
304 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" | 335 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " |
305 | " want memory cgroups\n"); | 336 | "don't want memory cgroups\n"); |
337 | return; | ||
338 | oom: | ||
339 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); | ||
340 | panic("Out of memory"); | ||
306 | } | 341 | } |
307 | 342 | ||
308 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | 343 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) |
@@ -502,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
502 | nomem: | 537 | nomem: |
503 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | 538 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); |
504 | printk(KERN_INFO | 539 | printk(KERN_INFO |
505 | "swap_cgroup can be disabled by noswapaccount boot option\n"); | 540 | "swap_cgroup can be disabled by swapaccount=0 boot option\n"); |
506 | return -ENOMEM; | 541 | return -ENOMEM; |
507 | } | 542 | } |
508 | 543 | ||
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d533611..2f5cf10ff660 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
126 | 126 | ||
127 | return 0; | 127 | return 0; |
128 | } | 128 | } |
129 | #endif | 129 | |
130 | static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) | ||
131 | { | ||
132 | struct vm_area_struct *vma; | ||
133 | |||
134 | /* We don't need vma lookup at all. */ | ||
135 | if (!walk->hugetlb_entry) | ||
136 | return NULL; | ||
137 | |||
138 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | ||
139 | vma = find_vma(walk->mm, addr); | ||
140 | if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) | ||
141 | return vma; | ||
142 | |||
143 | return NULL; | ||
144 | } | ||
145 | |||
146 | #else /* CONFIG_HUGETLB_PAGE */ | ||
147 | static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) | ||
148 | { | ||
149 | return NULL; | ||
150 | } | ||
151 | |||
152 | static int walk_hugetlb_range(struct vm_area_struct *vma, | ||
153 | unsigned long addr, unsigned long end, | ||
154 | struct mm_walk *walk) | ||
155 | { | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | #endif /* CONFIG_HUGETLB_PAGE */ | ||
160 | |||
161 | |||
130 | 162 | ||
131 | /** | 163 | /** |
132 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
144 | * associated range, and a copy of the original mm_walk for access to | 176 | * associated range, and a copy of the original mm_walk for access to |
145 | * the ->private or ->mm fields. | 177 | * the ->private or ->mm fields. |
146 | * | 178 | * |
147 | * No locks are taken, but the bottom level iterator will map PTE | 179 | * Usually no locks are taken, but splitting a transparent huge page may |
180 | * take the page table lock. And the bottom level iterator will map PTE | ||
148 | * directories from highmem if necessary. | 181 | * directories from highmem if necessary. |
149 | * | 182 | * |
150 | * If any callback returns a non-zero value, the walk is aborted and | 183 | * If any callback returns a non-zero value, the walk is aborted and |
151 | * the return value is propagated back to the caller. Otherwise 0 is returned. | 184 | * the return value is propagated back to the caller. Otherwise 0 is returned. |
185 | * | ||
186 | * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry | ||
187 | * is !NULL. | ||
152 | */ | 188 | */ |
153 | int walk_page_range(unsigned long addr, unsigned long end, | 189 | int walk_page_range(unsigned long addr, unsigned long end, |
154 | struct mm_walk *walk) | 190 | struct mm_walk *walk) |
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
165 | 201 | ||
166 | pgd = pgd_offset(walk->mm, addr); | 202 | pgd = pgd_offset(walk->mm, addr); |
167 | do { | 203 | do { |
168 | struct vm_area_struct *uninitialized_var(vma); | 204 | struct vm_area_struct *vma; |
169 | 205 | ||
170 | next = pgd_addr_end(addr, end); | 206 | next = pgd_addr_end(addr, end); |
171 | 207 | ||
172 | #ifdef CONFIG_HUGETLB_PAGE | ||
173 | /* | 208 | /* |
174 | * handle hugetlb vma individually because pagetable walk for | 209 | * handle hugetlb vma individually because pagetable walk for |
175 | * the hugetlb page is dependent on the architecture and | 210 | * the hugetlb page is dependent on the architecture and |
176 | * we can't handled it in the same manner as non-huge pages. | 211 | * we can't handled it in the same manner as non-huge pages. |
177 | */ | 212 | */ |
178 | vma = find_vma(walk->mm, addr); | 213 | vma = hugetlb_vma(addr, walk); |
179 | if (vma && is_vm_hugetlb_page(vma)) { | 214 | if (vma) { |
180 | if (vma->vm_end < next) | 215 | if (vma->vm_end < next) |
181 | next = vma->vm_end; | 216 | next = vma->vm_end; |
182 | /* | 217 | /* |
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
189 | pgd = pgd_offset(walk->mm, next); | 224 | pgd = pgd_offset(walk->mm, next); |
190 | continue; | 225 | continue; |
191 | } | 226 | } |
192 | #endif | 227 | |
193 | if (pgd_none_or_clear_bad(pgd)) { | 228 | if (pgd_none_or_clear_bad(pgd)) { |
194 | if (walk->pte_hole) | 229 | if (walk->pte_hole) |
195 | err = walk->pte_hole(addr, next, walk); | 230 | err = walk->pte_hole(addr, next, walk); |
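A minimal sketch of the contract described above: a caller that sets ->hugetlb_entry and therefore holds mmap_sem for read around the walk. The callback and the counter it fills are hypothetical; only walk_page_range() and struct mm_walk are from the kernel.

#include <linux/mm.h>
#include <linux/sched.h>

static int count_huge_entry(pte_t *pte, unsigned long hmask,
                            unsigned long addr, unsigned long end,
                            struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        if (pte_present(*pte))
                (*count)++;
        return 0;
}

static unsigned long count_hugetlb_entries(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        unsigned long count = 0;
        struct mm_walk walk = {
                .hugetlb_entry  = count_huge_entry,
                .mm             = mm,
                .private        = &count,
        };

        /* mmap_sem must be held because ->hugetlb_entry is set */
        down_read(&mm->mmap_sem);
        walk_page_range(start, end, &walk);
        up_read(&mm->mmap_sem);

        return count;
}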
@@ -21,7 +21,6 @@ | |||
21 | * Lock ordering in mm: | 21 | * Lock ordering in mm: |
22 | * | 22 | * |
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * inode->i_alloc_sem (vmtruncate_range) | ||
25 | * mm->mmap_sem | 24 | * mm->mmap_sem |
26 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
27 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
@@ -38,9 +37,8 @@ | |||
38 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
39 | * within bdi.wb->list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
40 | * | 39 | * |
41 | * (code doesn't rely on that order so it could be switched around) | 40 | * anon_vma->mutex,mapping->i_mmap_mutex (memory_failure, collect_procs_anon) |
42 | * ->tasklist_lock | 41 | * ->tasklist_lock |
43 | * anon_vma->mutex (memory_failure, collect_procs_anon) | ||
44 | * pte map lock | 42 | * pte map lock |
45 | */ | 43 | */ |
46 | 44 | ||
@@ -112,9 +110,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
112 | kmem_cache_free(anon_vma_cachep, anon_vma); | 110 | kmem_cache_free(anon_vma_cachep, anon_vma); |
113 | } | 111 | } |
114 | 112 | ||
115 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | 113 | static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) |
116 | { | 114 | { |
117 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | 115 | return kmem_cache_alloc(anon_vma_chain_cachep, gfp); |
118 | } | 116 | } |
119 | 117 | ||
120 | static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | 118 | static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) |
@@ -159,7 +157,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
159 | struct mm_struct *mm = vma->vm_mm; | 157 | struct mm_struct *mm = vma->vm_mm; |
160 | struct anon_vma *allocated; | 158 | struct anon_vma *allocated; |
161 | 159 | ||
162 | avc = anon_vma_chain_alloc(); | 160 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
163 | if (!avc) | 161 | if (!avc) |
164 | goto out_enomem; | 162 | goto out_enomem; |
165 | 163 | ||
@@ -200,6 +198,32 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
200 | return -ENOMEM; | 198 | return -ENOMEM; |
201 | } | 199 | } |
202 | 200 | ||
201 | /* | ||
202 | * This is a useful helper function for locking the anon_vma root as | ||
203 | * we traverse the vma->anon_vma_chain, looping over anon_vma's that | ||
204 | * have the same vma. | ||
205 | * | ||
206 | * Such anon_vma's should have the same root, so you'd expect to see | ||
207 | * just a single mutex_lock for the whole traversal. | ||
208 | */ | ||
209 | static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) | ||
210 | { | ||
211 | struct anon_vma *new_root = anon_vma->root; | ||
212 | if (new_root != root) { | ||
213 | if (WARN_ON_ONCE(root)) | ||
214 | mutex_unlock(&root->mutex); | ||
215 | root = new_root; | ||
216 | mutex_lock(&root->mutex); | ||
217 | } | ||
218 | return root; | ||
219 | } | ||
220 | |||
221 | static inline void unlock_anon_vma_root(struct anon_vma *root) | ||
222 | { | ||
223 | if (root) | ||
224 | mutex_unlock(&root->mutex); | ||
225 | } | ||
226 | |||
203 | static void anon_vma_chain_link(struct vm_area_struct *vma, | 227 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
204 | struct anon_vma_chain *avc, | 228 | struct anon_vma_chain *avc, |
205 | struct anon_vma *anon_vma) | 229 | struct anon_vma *anon_vma) |
@@ -208,13 +232,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
208 | avc->anon_vma = anon_vma; | 232 | avc->anon_vma = anon_vma; |
209 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 233 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
210 | 234 | ||
211 | anon_vma_lock(anon_vma); | ||
212 | /* | 235 | /* |
213 | * It's critical to add new vmas to the tail of the anon_vma, | 236 | * It's critical to add new vmas to the tail of the anon_vma, |
214 | * see comment in huge_memory.c:__split_huge_page(). | 237 | * see comment in huge_memory.c:__split_huge_page(). |
215 | */ | 238 | */ |
216 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 239 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
217 | anon_vma_unlock(anon_vma); | ||
218 | } | 240 | } |
219 | 241 | ||
220 | /* | 242 | /* |
@@ -224,13 +246,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
224 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | 246 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) |
225 | { | 247 | { |
226 | struct anon_vma_chain *avc, *pavc; | 248 | struct anon_vma_chain *avc, *pavc; |
249 | struct anon_vma *root = NULL; | ||
227 | 250 | ||
228 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { | 251 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { |
229 | avc = anon_vma_chain_alloc(); | 252 | struct anon_vma *anon_vma; |
230 | if (!avc) | 253 | |
231 | goto enomem_failure; | 254 | avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); |
232 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | 255 | if (unlikely(!avc)) { |
256 | unlock_anon_vma_root(root); | ||
257 | root = NULL; | ||
258 | avc = anon_vma_chain_alloc(GFP_KERNEL); | ||
259 | if (!avc) | ||
260 | goto enomem_failure; | ||
261 | } | ||
262 | anon_vma = pavc->anon_vma; | ||
263 | root = lock_anon_vma_root(root, anon_vma); | ||
264 | anon_vma_chain_link(dst, avc, anon_vma); | ||
233 | } | 265 | } |
266 | unlock_anon_vma_root(root); | ||
234 | return 0; | 267 | return 0; |
235 | 268 | ||
236 | enomem_failure: | 269 | enomem_failure: |
@@ -263,7 +296,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
263 | anon_vma = anon_vma_alloc(); | 296 | anon_vma = anon_vma_alloc(); |
264 | if (!anon_vma) | 297 | if (!anon_vma) |
265 | goto out_error; | 298 | goto out_error; |
266 | avc = anon_vma_chain_alloc(); | 299 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
267 | if (!avc) | 300 | if (!avc) |
268 | goto out_error_free_anon_vma; | 301 | goto out_error_free_anon_vma; |
269 | 302 | ||
@@ -280,7 +313,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
280 | get_anon_vma(anon_vma->root); | 313 | get_anon_vma(anon_vma->root); |
281 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 314 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
282 | vma->anon_vma = anon_vma; | 315 | vma->anon_vma = anon_vma; |
316 | anon_vma_lock(anon_vma); | ||
283 | anon_vma_chain_link(vma, avc, anon_vma); | 317 | anon_vma_chain_link(vma, avc, anon_vma); |
318 | anon_vma_unlock(anon_vma); | ||
284 | 319 | ||
285 | return 0; | 320 | return 0; |
286 | 321 | ||
@@ -291,36 +326,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
291 | return -ENOMEM; | 326 | return -ENOMEM; |
292 | } | 327 | } |
293 | 328 | ||
294 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) | ||
295 | { | ||
296 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; | ||
297 | int empty; | ||
298 | |||
299 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
300 | if (!anon_vma) | ||
301 | return; | ||
302 | |||
303 | anon_vma_lock(anon_vma); | ||
304 | list_del(&anon_vma_chain->same_anon_vma); | ||
305 | |||
306 | /* We must garbage collect the anon_vma if it's empty */ | ||
307 | empty = list_empty(&anon_vma->head); | ||
308 | anon_vma_unlock(anon_vma); | ||
309 | |||
310 | if (empty) | ||
311 | put_anon_vma(anon_vma); | ||
312 | } | ||
313 | |||
314 | void unlink_anon_vmas(struct vm_area_struct *vma) | 329 | void unlink_anon_vmas(struct vm_area_struct *vma) |
315 | { | 330 | { |
316 | struct anon_vma_chain *avc, *next; | 331 | struct anon_vma_chain *avc, *next; |
332 | struct anon_vma *root = NULL; | ||
317 | 333 | ||
318 | /* | 334 | /* |
319 | * Unlink each anon_vma chained to the VMA. This list is ordered | 335 | * Unlink each anon_vma chained to the VMA. This list is ordered |
320 | * from newest to oldest, ensuring the root anon_vma gets freed last. | 336 | * from newest to oldest, ensuring the root anon_vma gets freed last. |
321 | */ | 337 | */ |
322 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 338 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
323 | anon_vma_unlink(avc); | 339 | struct anon_vma *anon_vma = avc->anon_vma; |
340 | |||
341 | root = lock_anon_vma_root(root, anon_vma); | ||
342 | list_del(&avc->same_anon_vma); | ||
343 | |||
344 | /* | ||
345 | * Leave empty anon_vmas on the list - we'll need | ||
346 | * to free them outside the lock. | ||
347 | */ | ||
348 | if (list_empty(&anon_vma->head)) | ||
349 | continue; | ||
350 | |||
351 | list_del(&avc->same_vma); | ||
352 | anon_vma_chain_free(avc); | ||
353 | } | ||
354 | unlock_anon_vma_root(root); | ||
355 | |||
356 | /* | ||
357 | * Iterate the list once more, it now only contains empty and unlinked | ||
358 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | ||
359 | * needing to acquire the anon_vma->root->mutex. | ||
360 | */ | ||
361 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
362 | struct anon_vma *anon_vma = avc->anon_vma; | ||
363 | |||
364 | put_anon_vma(anon_vma); | ||
365 | |||
324 | list_del(&avc->same_vma); | 366 | list_del(&avc->same_vma); |
325 | anon_vma_chain_free(avc); | 367 | anon_vma_chain_free(avc); |
326 | } | 368 | } |
@@ -827,11 +869,11 @@ int page_referenced(struct page *page, | |||
827 | vm_flags); | 869 | vm_flags); |
828 | if (we_locked) | 870 | if (we_locked) |
829 | unlock_page(page); | 871 | unlock_page(page); |
872 | |||
873 | if (page_test_and_clear_young(page_to_pfn(page))) | ||
874 | referenced++; | ||
830 | } | 875 | } |
831 | out: | 876 | out: |
832 | if (page_test_and_clear_young(page_to_pfn(page))) | ||
833 | referenced++; | ||
834 | |||
835 | return referenced; | 877 | return referenced; |
836 | } | 878 | } |
837 | 879 | ||
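The lock_anon_vma_root()/unlock_anon_vma_root() helpers added above take the root mutex once per distinct root rather than once per anon_vma in the chain, which is why anon_vma_clone() and unlink_anon_vmas() can walk a whole list under a single lock/unlock pair. A rough userspace sketch of the same batching idiom (the item/root structs and pthread mutexes below are stand-ins for illustration, not kernel types):

#include <pthread.h>
#include <stdio.h>

struct root { pthread_mutex_t mutex; };
struct item { struct root *root; int payload; };

/* Lock item->root only when it differs from the root we already hold. */
static struct root *lock_root(struct root *held, struct item *it)
{
	struct root *new_root = it->root;

	if (new_root != held) {
		if (held)
			pthread_mutex_unlock(&held->mutex);
		held = new_root;
		pthread_mutex_lock(&held->mutex);
	}
	return held;
}

static void unlock_root(struct root *held)
{
	if (held)
		pthread_mutex_unlock(&held->mutex);
}

int main(void)
{
	struct root shared = { PTHREAD_MUTEX_INITIALIZER };
	struct item items[3] = { { &shared, 1 }, { &shared, 2 }, { &shared, 3 } };
	struct root *held = NULL;

	for (int i = 0; i < 3; i++) {
		held = lock_root(held, &items[i]);
		printf("touch item %d under its root lock\n", items[i].payload);
	}
	unlock_root(held);
	return 0;
}

Build with gcc -pthread. Because every item shares one root here, the loop pays for a single lock/unlock round trip, which is the common case the comment above ("you'd expect to see just a single mutex_lock for the whole traversal") is describing.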
diff --git a/mm/shmem.c b/mm/shmem.c index d221a1cfd7b1..5cc21f8b4cd3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt; | |||
51 | #include <linux/shmem_fs.h> | 51 | #include <linux/shmem_fs.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/splice.h> | ||
54 | #include <linux/security.h> | 55 | #include <linux/security.h> |
55 | #include <linux/swapops.h> | 56 | #include <linux/swapops.h> |
56 | #include <linux/mempolicy.h> | 57 | #include <linux/mempolicy.h> |
@@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void) | |||
126 | } | 127 | } |
127 | #endif | 128 | #endif |
128 | 129 | ||
129 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 130 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
130 | struct page **pagep, enum sgp_type sgp, int *type); | 131 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
132 | |||
133 | static inline int shmem_getpage(struct inode *inode, pgoff_t index, | ||
134 | struct page **pagep, enum sgp_type sgp, int *fault_type) | ||
135 | { | ||
136 | return shmem_getpage_gfp(inode, index, pagep, sgp, | ||
137 | mapping_gfp_mask(inode->i_mapping), fault_type); | ||
138 | } | ||
131 | 139 | ||
132 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | 140 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) |
133 | { | 141 | { |
@@ -241,9 +249,7 @@ static void shmem_free_blocks(struct inode *inode, long pages) | |||
241 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 249 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); |
242 | if (sbinfo->max_blocks) { | 250 | if (sbinfo->max_blocks) { |
243 | percpu_counter_add(&sbinfo->used_blocks, -pages); | 251 | percpu_counter_add(&sbinfo->used_blocks, -pages); |
244 | spin_lock(&inode->i_lock); | ||
245 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; | 252 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; |
246 | spin_unlock(&inode->i_lock); | ||
247 | } | 253 | } |
248 | } | 254 | } |
249 | 255 | ||
@@ -405,10 +411,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns | |||
405 | * @info: info structure for the inode | 411 | * @info: info structure for the inode |
406 | * @index: index of the page to find | 412 | * @index: index of the page to find |
407 | * @sgp: check and recheck i_size? skip allocation? | 413 | * @sgp: check and recheck i_size? skip allocation? |
414 | * @gfp: gfp mask to use for any page allocation | ||
408 | * | 415 | * |
409 | * If the entry does not exist, allocate it. | 416 | * If the entry does not exist, allocate it. |
410 | */ | 417 | */ |
411 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) | 418 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, |
419 | unsigned long index, enum sgp_type sgp, gfp_t gfp) | ||
412 | { | 420 | { |
413 | struct inode *inode = &info->vfs_inode; | 421 | struct inode *inode = &info->vfs_inode; |
414 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 422 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); |
@@ -432,13 +440,11 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long | |||
432 | sbinfo->max_blocks - 1) >= 0) | 440 | sbinfo->max_blocks - 1) >= 0) |
433 | return ERR_PTR(-ENOSPC); | 441 | return ERR_PTR(-ENOSPC); |
434 | percpu_counter_inc(&sbinfo->used_blocks); | 442 | percpu_counter_inc(&sbinfo->used_blocks); |
435 | spin_lock(&inode->i_lock); | ||
436 | inode->i_blocks += BLOCKS_PER_PAGE; | 443 | inode->i_blocks += BLOCKS_PER_PAGE; |
437 | spin_unlock(&inode->i_lock); | ||
438 | } | 444 | } |
439 | 445 | ||
440 | spin_unlock(&info->lock); | 446 | spin_unlock(&info->lock); |
441 | page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); | 447 | page = shmem_dir_alloc(gfp); |
442 | spin_lock(&info->lock); | 448 | spin_lock(&info->lock); |
443 | 449 | ||
444 | if (!page) { | 450 | if (!page) { |
@@ -539,7 +545,7 @@ static void shmem_free_pages(struct list_head *next) | |||
539 | } while (next); | 545 | } while (next); |
540 | } | 546 | } |
541 | 547 | ||
542 | static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 548 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) |
543 | { | 549 | { |
544 | struct shmem_inode_info *info = SHMEM_I(inode); | 550 | struct shmem_inode_info *info = SHMEM_I(inode); |
545 | unsigned long idx; | 551 | unsigned long idx; |
@@ -562,6 +568,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | |||
562 | spinlock_t *punch_lock; | 568 | spinlock_t *punch_lock; |
563 | unsigned long upper_limit; | 569 | unsigned long upper_limit; |
564 | 570 | ||
571 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
572 | |||
565 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 573 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
566 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 574 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
567 | if (idx >= info->next_index) | 575 | if (idx >= info->next_index) |
@@ -738,16 +746,8 @@ done2: | |||
738 | * lowered next_index. Also, though shmem_getpage checks | 746 | * lowered next_index. Also, though shmem_getpage checks |
739 | * i_size before adding to cache, no recheck after: so fix the | 747 | * i_size before adding to cache, no recheck after: so fix the |
740 | * narrow window there too. | 748 | * narrow window there too. |
741 | * | ||
742 | * Recalling truncate_inode_pages_range and unmap_mapping_range | ||
743 | * every time for punch_hole (which never got a chance to clear | ||
744 | * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive, | ||
745 | * yet hardly ever necessary: try to optimize them out later. | ||
746 | */ | 749 | */ |
747 | truncate_inode_pages_range(inode->i_mapping, start, end); | 750 | truncate_inode_pages_range(inode->i_mapping, start, end); |
748 | if (punch_hole) | ||
749 | unmap_mapping_range(inode->i_mapping, start, | ||
750 | end - start, 1); | ||
751 | } | 751 | } |
752 | 752 | ||
753 | spin_lock(&info->lock); | 753 | spin_lock(&info->lock); |
@@ -766,22 +766,23 @@ done2: | |||
766 | shmem_free_pages(pages_to_free.next); | 766 | shmem_free_pages(pages_to_free.next); |
767 | } | 767 | } |
768 | } | 768 | } |
769 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | ||
769 | 770 | ||
770 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | 771 | static int shmem_setattr(struct dentry *dentry, struct iattr *attr) |
771 | { | 772 | { |
772 | struct inode *inode = dentry->d_inode; | 773 | struct inode *inode = dentry->d_inode; |
773 | loff_t newsize = attr->ia_size; | ||
774 | int error; | 774 | int error; |
775 | 775 | ||
776 | error = inode_change_ok(inode, attr); | 776 | error = inode_change_ok(inode, attr); |
777 | if (error) | 777 | if (error) |
778 | return error; | 778 | return error; |
779 | 779 | ||
780 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) | 780 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
781 | && newsize != inode->i_size) { | 781 | loff_t oldsize = inode->i_size; |
782 | loff_t newsize = attr->ia_size; | ||
782 | struct page *page = NULL; | 783 | struct page *page = NULL; |
783 | 784 | ||
784 | if (newsize < inode->i_size) { | 785 | if (newsize < oldsize) { |
785 | /* | 786 | /* |
786 | * If truncating down to a partial page, then | 787 | * If truncating down to a partial page, then |
787 | * if that page is already allocated, hold it | 788 | * if that page is already allocated, hold it |
@@ -810,12 +811,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
810 | spin_unlock(&info->lock); | 811 | spin_unlock(&info->lock); |
811 | } | 812 | } |
812 | } | 813 | } |
813 | 814 | if (newsize != oldsize) { | |
814 | /* XXX(truncate): truncate_setsize should be called last */ | 815 | i_size_write(inode, newsize); |
815 | truncate_setsize(inode, newsize); | 816 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
817 | } | ||
818 | if (newsize < oldsize) { | ||
819 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | ||
820 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | ||
821 | shmem_truncate_range(inode, newsize, (loff_t)-1); | ||
822 | /* unmap again to remove racily COWed private pages */ | ||
823 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | ||
824 | } | ||
816 | if (page) | 825 | if (page) |
817 | page_cache_release(page); | 826 | page_cache_release(page); |
818 | shmem_truncate_range(inode, newsize, (loff_t)-1); | ||
819 | } | 827 | } |
820 | 828 | ||
821 | setattr_copy(inode, attr); | 829 | setattr_copy(inode, attr); |
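In the reworked shmem_setattr() above, only whole pages beyond the new size get unmapped: holebegin is the new size rounded up to a page boundary, and unmap_mapping_range(mapping, holebegin, 0, 1) uses length 0 to mean "to the end of the file"; the second unmap after shmem_truncate_range() catches private pages COWed in the race window. A small standalone check of the rounding (PAGE_SIZE hard-coded to 4096 purely for illustration):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* round_up() as shmem_setattr() uses it: next multiple of the page size */
static unsigned long round_up(unsigned long x, unsigned long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	unsigned long newsize = 5000;	/* lands inside the second page */
	unsigned long holebegin = round_up(newsize, PAGE_SIZE);

	assert(holebegin == 8192);
	/*
	 * The page containing offset 5000 stays mapped (its tail past EOF is
	 * zeroed by the truncation); everything from 8192 onwards is unmapped.
	 */
	printf("newsize=%lu -> holebegin=%lu\n", newsize, holebegin);
	return 0;
}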
@@ -832,7 +840,6 @@ static void shmem_evict_inode(struct inode *inode) | |||
832 | struct shmem_xattr *xattr, *nxattr; | 840 | struct shmem_xattr *xattr, *nxattr; |
833 | 841 | ||
834 | if (inode->i_mapping->a_ops == &shmem_aops) { | 842 | if (inode->i_mapping->a_ops == &shmem_aops) { |
835 | truncate_inode_pages(inode->i_mapping, 0); | ||
836 | shmem_unacct_size(info->flags, inode->i_size); | 843 | shmem_unacct_size(info->flags, inode->i_size); |
837 | inode->i_size = 0; | 844 | inode->i_size = 0; |
838 | shmem_truncate_range(inode, 0, (loff_t)-1); | 845 | shmem_truncate_range(inode, 0, (loff_t)-1); |
@@ -965,20 +972,7 @@ found: | |||
965 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); | 972 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); |
966 | /* which does mem_cgroup_uncharge_cache_page on error */ | 973 | /* which does mem_cgroup_uncharge_cache_page on error */ |
967 | 974 | ||
968 | if (error == -EEXIST) { | 975 | if (error != -ENOMEM) { |
969 | struct page *filepage = find_get_page(mapping, idx); | ||
970 | error = 1; | ||
971 | if (filepage) { | ||
972 | /* | ||
973 | * There might be a more uptodate page coming down | ||
974 | * from a stacked writepage: forget our swappage if so. | ||
975 | */ | ||
976 | if (PageUptodate(filepage)) | ||
977 | error = 0; | ||
978 | page_cache_release(filepage); | ||
979 | } | ||
980 | } | ||
981 | if (!error) { | ||
982 | delete_from_swap_cache(page); | 976 | delete_from_swap_cache(page); |
983 | set_page_dirty(page); | 977 | set_page_dirty(page); |
984 | info->flags |= SHMEM_PAGEIN; | 978 | info->flags |= SHMEM_PAGEIN; |
@@ -1065,16 +1059,17 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1065 | /* | 1059 | /* |
1066 | * shmem_backing_dev_info's capabilities prevent regular writeback or | 1060 | * shmem_backing_dev_info's capabilities prevent regular writeback or |
1067 | * sync from ever calling shmem_writepage; but a stacking filesystem | 1061 | * sync from ever calling shmem_writepage; but a stacking filesystem |
1068 | * may use the ->writepage of its underlying filesystem, in which case | 1062 | * might use ->writepage of its underlying filesystem, in which case |
1069 | * tmpfs should write out to swap only in response to memory pressure, | 1063 | * tmpfs should write out to swap only in response to memory pressure, |
1070 | * and not for the writeback threads or sync. However, in those cases, | 1064 | * and not for the writeback threads or sync. |
1071 | * we do still want to check if there's a redundant swappage to be | ||
1072 | * discarded. | ||
1073 | */ | 1065 | */ |
1074 | if (wbc->for_reclaim) | 1066 | if (!wbc->for_reclaim) { |
1075 | swap = get_swap_page(); | 1067 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
1076 | else | 1068 | goto redirty; |
1077 | swap.val = 0; | 1069 | } |
1070 | swap = get_swap_page(); | ||
1071 | if (!swap.val) | ||
1072 | goto redirty; | ||
1078 | 1073 | ||
1079 | /* | 1074 | /* |
1080 | * Add inode to shmem_unuse()'s list of swapped-out inodes, | 1075 | * Add inode to shmem_unuse()'s list of swapped-out inodes, |
@@ -1085,15 +1080,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1085 | * we've taken the spinlock, because shmem_unuse_inode() will | 1080 | * we've taken the spinlock, because shmem_unuse_inode() will |
1086 | * prune a !swapped inode from the swaplist under both locks. | 1081 | * prune a !swapped inode from the swaplist under both locks. |
1087 | */ | 1082 | */ |
1088 | if (swap.val) { | 1083 | mutex_lock(&shmem_swaplist_mutex); |
1089 | mutex_lock(&shmem_swaplist_mutex); | 1084 | if (list_empty(&info->swaplist)) |
1090 | if (list_empty(&info->swaplist)) | 1085 | list_add_tail(&info->swaplist, &shmem_swaplist); |
1091 | list_add_tail(&info->swaplist, &shmem_swaplist); | ||
1092 | } | ||
1093 | 1086 | ||
1094 | spin_lock(&info->lock); | 1087 | spin_lock(&info->lock); |
1095 | if (swap.val) | 1088 | mutex_unlock(&shmem_swaplist_mutex); |
1096 | mutex_unlock(&shmem_swaplist_mutex); | ||
1097 | 1089 | ||
1098 | if (index >= info->next_index) { | 1090 | if (index >= info->next_index) { |
1099 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | 1091 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); |
@@ -1101,16 +1093,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1101 | } | 1093 | } |
1102 | entry = shmem_swp_entry(info, index, NULL); | 1094 | entry = shmem_swp_entry(info, index, NULL); |
1103 | if (entry->val) { | 1095 | if (entry->val) { |
1104 | /* | 1096 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
1105 | * The more uptodate page coming down from a stacked | ||
1106 | * writepage should replace our old swappage. | ||
1107 | */ | ||
1108 | free_swap_and_cache(*entry); | 1097 | free_swap_and_cache(*entry); |
1109 | shmem_swp_set(info, entry, 0); | 1098 | shmem_swp_set(info, entry, 0); |
1110 | } | 1099 | } |
1111 | shmem_recalc_inode(inode); | 1100 | shmem_recalc_inode(inode); |
1112 | 1101 | ||
1113 | if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 1102 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1114 | delete_from_page_cache(page); | 1103 | delete_from_page_cache(page); |
1115 | shmem_swp_set(info, entry, swap.val); | 1104 | shmem_swp_set(info, entry, swap.val); |
1116 | shmem_swp_unmap(entry); | 1105 | shmem_swp_unmap(entry); |
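The reordered shmem_writepage() above adds the inode to shmem_swaplist under shmem_swaplist_mutex and drops that mutex only after taking info->lock, so there is no window where the inode sits on the list while shmem_unuse_inode(), which holds both locks, could prune it. The hand-over idiom reduced to a userspace sketch, with two pthread mutexes standing in for the kernel mutex and spinlock (struct and function names are invented):

#include <pthread.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ shmem_swaplist_mutex */

struct object {
	pthread_mutex_t lock;	/* ~ info->lock */
	int on_list;
};

static void publish(struct object *obj)
{
	pthread_mutex_lock(&list_lock);
	obj->on_list = 1;			/* add to the shared list */

	/* Take the object lock *before* dropping the list lock... */
	pthread_mutex_lock(&obj->lock);
	pthread_mutex_unlock(&list_lock);

	/* ...so nobody can find-and-prune the object before we are ready. */
	pthread_mutex_unlock(&obj->lock);
}

int main(void)
{
	struct object o = { PTHREAD_MUTEX_INITIALIZER, 0 };
	publish(&o);
	return 0;
}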
@@ -1227,92 +1216,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1227 | #endif | 1216 | #endif |
1228 | 1217 | ||
1229 | /* | 1218 | /* |
1230 | * shmem_getpage - either get the page from swap or allocate a new one | 1219 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
1231 | * | 1220 | * |
1232 | * If we allocate a new one we do not mark it dirty. That's up to the | 1221 | * If we allocate a new one we do not mark it dirty. That's up to the |
1233 | * vm. If we swap it in we mark it dirty since we also free the swap | 1222 | * vm. If we swap it in we mark it dirty since we also free the swap |
1234 | * entry since a page cannot live in both the swap and page cache | 1223 | * entry since a page cannot live in both the swap and page cache |
1235 | */ | 1224 | */ |
1236 | static int shmem_getpage(struct inode *inode, unsigned long idx, | 1225 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, |
1237 | struct page **pagep, enum sgp_type sgp, int *type) | 1226 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) |
1238 | { | 1227 | { |
1239 | struct address_space *mapping = inode->i_mapping; | 1228 | struct address_space *mapping = inode->i_mapping; |
1240 | struct shmem_inode_info *info = SHMEM_I(inode); | 1229 | struct shmem_inode_info *info = SHMEM_I(inode); |
1241 | struct shmem_sb_info *sbinfo; | 1230 | struct shmem_sb_info *sbinfo; |
1242 | struct page *filepage = *pagep; | 1231 | struct page *page; |
1243 | struct page *swappage; | ||
1244 | struct page *prealloc_page = NULL; | 1232 | struct page *prealloc_page = NULL; |
1245 | swp_entry_t *entry; | 1233 | swp_entry_t *entry; |
1246 | swp_entry_t swap; | 1234 | swp_entry_t swap; |
1247 | gfp_t gfp; | ||
1248 | int error; | 1235 | int error; |
1236 | int ret; | ||
1249 | 1237 | ||
1250 | if (idx >= SHMEM_MAX_INDEX) | 1238 | if (idx >= SHMEM_MAX_INDEX) |
1251 | return -EFBIG; | 1239 | return -EFBIG; |
1252 | |||
1253 | if (type) | ||
1254 | *type = 0; | ||
1255 | |||
1256 | /* | ||
1257 | * Normally, filepage is NULL on entry, and either found | ||
1258 | * uptodate immediately, or allocated and zeroed, or read | ||
1259 | * in under swappage, which is then assigned to filepage. | ||
1260 | * But shmem_readpage (required for splice) passes in a locked | ||
1261 | * filepage, which may be found not uptodate by other callers | ||
1262 | * too, and may need to be copied from the swappage read in. | ||
1263 | */ | ||
1264 | repeat: | 1240 | repeat: |
1265 | if (!filepage) | 1241 | page = find_lock_page(mapping, idx); |
1266 | filepage = find_lock_page(mapping, idx); | 1242 | if (page) { |
1267 | if (filepage && PageUptodate(filepage)) | ||
1268 | goto done; | ||
1269 | gfp = mapping_gfp_mask(mapping); | ||
1270 | if (!filepage) { | ||
1271 | /* | 1243 | /* |
1272 | * Try to preload while we can wait, to not make a habit of | 1244 | * Once we can get the page lock, it must be uptodate: |
1273 | * draining atomic reserves; but don't latch on to this cpu. | 1245 | * if there were an error in reading back from swap, |
1246 | * the page would not be inserted into the filecache. | ||
1274 | */ | 1247 | */ |
1275 | error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); | 1248 | BUG_ON(!PageUptodate(page)); |
1276 | if (error) | 1249 | goto done; |
1277 | goto failed; | 1250 | } |
1278 | radix_tree_preload_end(); | 1251 | |
1279 | if (sgp != SGP_READ && !prealloc_page) { | 1252 | /* |
1280 | /* We don't care if this fails */ | 1253 | * Try to preload while we can wait, to not make a habit of |
1281 | prealloc_page = shmem_alloc_page(gfp, info, idx); | 1254 | * draining atomic reserves; but don't latch on to this cpu. |
1282 | if (prealloc_page) { | 1255 | */ |
1283 | if (mem_cgroup_cache_charge(prealloc_page, | 1256 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); |
1284 | current->mm, GFP_KERNEL)) { | 1257 | if (error) |
1285 | page_cache_release(prealloc_page); | 1258 | goto out; |
1286 | prealloc_page = NULL; | 1259 | radix_tree_preload_end(); |
1287 | } | 1260 | |
1261 | if (sgp != SGP_READ && !prealloc_page) { | ||
1262 | prealloc_page = shmem_alloc_page(gfp, info, idx); | ||
1263 | if (prealloc_page) { | ||
1264 | SetPageSwapBacked(prealloc_page); | ||
1265 | if (mem_cgroup_cache_charge(prealloc_page, | ||
1266 | current->mm, GFP_KERNEL)) { | ||
1267 | page_cache_release(prealloc_page); | ||
1268 | prealloc_page = NULL; | ||
1288 | } | 1269 | } |
1289 | } | 1270 | } |
1290 | } | 1271 | } |
1291 | error = 0; | ||
1292 | 1272 | ||
1293 | spin_lock(&info->lock); | 1273 | spin_lock(&info->lock); |
1294 | shmem_recalc_inode(inode); | 1274 | shmem_recalc_inode(inode); |
1295 | entry = shmem_swp_alloc(info, idx, sgp); | 1275 | entry = shmem_swp_alloc(info, idx, sgp, gfp); |
1296 | if (IS_ERR(entry)) { | 1276 | if (IS_ERR(entry)) { |
1297 | spin_unlock(&info->lock); | 1277 | spin_unlock(&info->lock); |
1298 | error = PTR_ERR(entry); | 1278 | error = PTR_ERR(entry); |
1299 | goto failed; | 1279 | goto out; |
1300 | } | 1280 | } |
1301 | swap = *entry; | 1281 | swap = *entry; |
1302 | 1282 | ||
1303 | if (swap.val) { | 1283 | if (swap.val) { |
1304 | /* Look it up and read it in.. */ | 1284 | /* Look it up and read it in.. */ |
1305 | swappage = lookup_swap_cache(swap); | 1285 | page = lookup_swap_cache(swap); |
1306 | if (!swappage) { | 1286 | if (!page) { |
1307 | shmem_swp_unmap(entry); | 1287 | shmem_swp_unmap(entry); |
1308 | spin_unlock(&info->lock); | 1288 | spin_unlock(&info->lock); |
1309 | /* here we actually do the io */ | 1289 | /* here we actually do the io */ |
1310 | if (type) | 1290 | if (fault_type) |
1311 | *type |= VM_FAULT_MAJOR; | 1291 | *fault_type |= VM_FAULT_MAJOR; |
1312 | swappage = shmem_swapin(swap, gfp, info, idx); | 1292 | page = shmem_swapin(swap, gfp, info, idx); |
1313 | if (!swappage) { | 1293 | if (!page) { |
1314 | spin_lock(&info->lock); | 1294 | spin_lock(&info->lock); |
1315 | entry = shmem_swp_alloc(info, idx, sgp); | 1295 | entry = shmem_swp_alloc(info, idx, sgp, gfp); |
1316 | if (IS_ERR(entry)) | 1296 | if (IS_ERR(entry)) |
1317 | error = PTR_ERR(entry); | 1297 | error = PTR_ERR(entry); |
1318 | else { | 1298 | else { |
@@ -1322,62 +1302,42 @@ repeat: | |||
1322 | } | 1302 | } |
1323 | spin_unlock(&info->lock); | 1303 | spin_unlock(&info->lock); |
1324 | if (error) | 1304 | if (error) |
1325 | goto failed; | 1305 | goto out; |
1326 | goto repeat; | 1306 | goto repeat; |
1327 | } | 1307 | } |
1328 | wait_on_page_locked(swappage); | 1308 | wait_on_page_locked(page); |
1329 | page_cache_release(swappage); | 1309 | page_cache_release(page); |
1330 | goto repeat; | 1310 | goto repeat; |
1331 | } | 1311 | } |
1332 | 1312 | ||
1333 | /* We have to do this with page locked to prevent races */ | 1313 | /* We have to do this with page locked to prevent races */ |
1334 | if (!trylock_page(swappage)) { | 1314 | if (!trylock_page(page)) { |
1335 | shmem_swp_unmap(entry); | 1315 | shmem_swp_unmap(entry); |
1336 | spin_unlock(&info->lock); | 1316 | spin_unlock(&info->lock); |
1337 | wait_on_page_locked(swappage); | 1317 | wait_on_page_locked(page); |
1338 | page_cache_release(swappage); | 1318 | page_cache_release(page); |
1339 | goto repeat; | 1319 | goto repeat; |
1340 | } | 1320 | } |
1341 | if (PageWriteback(swappage)) { | 1321 | if (PageWriteback(page)) { |
1342 | shmem_swp_unmap(entry); | 1322 | shmem_swp_unmap(entry); |
1343 | spin_unlock(&info->lock); | 1323 | spin_unlock(&info->lock); |
1344 | wait_on_page_writeback(swappage); | 1324 | wait_on_page_writeback(page); |
1345 | unlock_page(swappage); | 1325 | unlock_page(page); |
1346 | page_cache_release(swappage); | 1326 | page_cache_release(page); |
1347 | goto repeat; | 1327 | goto repeat; |
1348 | } | 1328 | } |
1349 | if (!PageUptodate(swappage)) { | 1329 | if (!PageUptodate(page)) { |
1350 | shmem_swp_unmap(entry); | 1330 | shmem_swp_unmap(entry); |
1351 | spin_unlock(&info->lock); | 1331 | spin_unlock(&info->lock); |
1352 | unlock_page(swappage); | 1332 | unlock_page(page); |
1353 | page_cache_release(swappage); | 1333 | page_cache_release(page); |
1354 | error = -EIO; | 1334 | error = -EIO; |
1355 | goto failed; | 1335 | goto out; |
1356 | } | 1336 | } |
1357 | 1337 | ||
1358 | if (filepage) { | 1338 | error = add_to_page_cache_locked(page, mapping, |
1359 | shmem_swp_set(info, entry, 0); | 1339 | idx, GFP_NOWAIT); |
1360 | shmem_swp_unmap(entry); | 1340 | if (error) { |
1361 | delete_from_swap_cache(swappage); | ||
1362 | spin_unlock(&info->lock); | ||
1363 | copy_highpage(filepage, swappage); | ||
1364 | unlock_page(swappage); | ||
1365 | page_cache_release(swappage); | ||
1366 | flush_dcache_page(filepage); | ||
1367 | SetPageUptodate(filepage); | ||
1368 | set_page_dirty(filepage); | ||
1369 | swap_free(swap); | ||
1370 | } else if (!(error = add_to_page_cache_locked(swappage, mapping, | ||
1371 | idx, GFP_NOWAIT))) { | ||
1372 | info->flags |= SHMEM_PAGEIN; | ||
1373 | shmem_swp_set(info, entry, 0); | ||
1374 | shmem_swp_unmap(entry); | ||
1375 | delete_from_swap_cache(swappage); | ||
1376 | spin_unlock(&info->lock); | ||
1377 | filepage = swappage; | ||
1378 | set_page_dirty(filepage); | ||
1379 | swap_free(swap); | ||
1380 | } else { | ||
1381 | shmem_swp_unmap(entry); | 1341 | shmem_swp_unmap(entry); |
1382 | spin_unlock(&info->lock); | 1342 | spin_unlock(&info->lock); |
1383 | if (error == -ENOMEM) { | 1343 | if (error == -ENOMEM) { |
@@ -1386,32 +1346,38 @@ repeat: | |||
1386 | * call memcg's OOM if needed. | 1346 | * call memcg's OOM if needed. |
1387 | */ | 1347 | */ |
1388 | error = mem_cgroup_shmem_charge_fallback( | 1348 | error = mem_cgroup_shmem_charge_fallback( |
1389 | swappage, | 1349 | page, current->mm, gfp); |
1390 | current->mm, | ||
1391 | gfp); | ||
1392 | if (error) { | 1350 | if (error) { |
1393 | unlock_page(swappage); | 1351 | unlock_page(page); |
1394 | page_cache_release(swappage); | 1352 | page_cache_release(page); |
1395 | goto failed; | 1353 | goto out; |
1396 | } | 1354 | } |
1397 | } | 1355 | } |
1398 | unlock_page(swappage); | 1356 | unlock_page(page); |
1399 | page_cache_release(swappage); | 1357 | page_cache_release(page); |
1400 | goto repeat; | 1358 | goto repeat; |
1401 | } | 1359 | } |
1402 | } else if (sgp == SGP_READ && !filepage) { | 1360 | |
1361 | info->flags |= SHMEM_PAGEIN; | ||
1362 | shmem_swp_set(info, entry, 0); | ||
1403 | shmem_swp_unmap(entry); | 1363 | shmem_swp_unmap(entry); |
1404 | filepage = find_get_page(mapping, idx); | 1364 | delete_from_swap_cache(page); |
1405 | if (filepage && | 1365 | spin_unlock(&info->lock); |
1406 | (!PageUptodate(filepage) || !trylock_page(filepage))) { | 1366 | set_page_dirty(page); |
1367 | swap_free(swap); | ||
1368 | |||
1369 | } else if (sgp == SGP_READ) { | ||
1370 | shmem_swp_unmap(entry); | ||
1371 | page = find_get_page(mapping, idx); | ||
1372 | if (page && !trylock_page(page)) { | ||
1407 | spin_unlock(&info->lock); | 1373 | spin_unlock(&info->lock); |
1408 | wait_on_page_locked(filepage); | 1374 | wait_on_page_locked(page); |
1409 | page_cache_release(filepage); | 1375 | page_cache_release(page); |
1410 | filepage = NULL; | ||
1411 | goto repeat; | 1376 | goto repeat; |
1412 | } | 1377 | } |
1413 | spin_unlock(&info->lock); | 1378 | spin_unlock(&info->lock); |
1414 | } else { | 1379 | |
1380 | } else if (prealloc_page) { | ||
1415 | shmem_swp_unmap(entry); | 1381 | shmem_swp_unmap(entry); |
1416 | sbinfo = SHMEM_SB(inode->i_sb); | 1382 | sbinfo = SHMEM_SB(inode->i_sb); |
1417 | if (sbinfo->max_blocks) { | 1383 | if (sbinfo->max_blocks) { |
@@ -1420,126 +1386,86 @@ repeat: | |||
1420 | shmem_acct_block(info->flags)) | 1386 | shmem_acct_block(info->flags)) |
1421 | goto nospace; | 1387 | goto nospace; |
1422 | percpu_counter_inc(&sbinfo->used_blocks); | 1388 | percpu_counter_inc(&sbinfo->used_blocks); |
1423 | spin_lock(&inode->i_lock); | ||
1424 | inode->i_blocks += BLOCKS_PER_PAGE; | 1389 | inode->i_blocks += BLOCKS_PER_PAGE; |
1425 | spin_unlock(&inode->i_lock); | ||
1426 | } else if (shmem_acct_block(info->flags)) | 1390 | } else if (shmem_acct_block(info->flags)) |
1427 | goto nospace; | 1391 | goto nospace; |
1428 | 1392 | ||
1429 | if (!filepage) { | 1393 | page = prealloc_page; |
1430 | int ret; | 1394 | prealloc_page = NULL; |
1431 | |||
1432 | if (!prealloc_page) { | ||
1433 | spin_unlock(&info->lock); | ||
1434 | filepage = shmem_alloc_page(gfp, info, idx); | ||
1435 | if (!filepage) { | ||
1436 | shmem_unacct_blocks(info->flags, 1); | ||
1437 | shmem_free_blocks(inode, 1); | ||
1438 | error = -ENOMEM; | ||
1439 | goto failed; | ||
1440 | } | ||
1441 | SetPageSwapBacked(filepage); | ||
1442 | 1395 | ||
1443 | /* | 1396 | entry = shmem_swp_alloc(info, idx, sgp, gfp); |
1444 | * Precharge page while we can wait, compensate | 1397 | if (IS_ERR(entry)) |
1445 | * after | 1398 | error = PTR_ERR(entry); |
1446 | */ | 1399 | else { |
1447 | error = mem_cgroup_cache_charge(filepage, | 1400 | swap = *entry; |
1448 | current->mm, GFP_KERNEL); | 1401 | shmem_swp_unmap(entry); |
1449 | if (error) { | 1402 | } |
1450 | page_cache_release(filepage); | 1403 | ret = error || swap.val; |
1451 | shmem_unacct_blocks(info->flags, 1); | 1404 | if (ret) |
1452 | shmem_free_blocks(inode, 1); | 1405 | mem_cgroup_uncharge_cache_page(page); |
1453 | filepage = NULL; | 1406 | else |
1454 | goto failed; | 1407 | ret = add_to_page_cache_lru(page, mapping, |
1455 | } | ||
1456 | |||
1457 | spin_lock(&info->lock); | ||
1458 | } else { | ||
1459 | filepage = prealloc_page; | ||
1460 | prealloc_page = NULL; | ||
1461 | SetPageSwapBacked(filepage); | ||
1462 | } | ||
1463 | |||
1464 | entry = shmem_swp_alloc(info, idx, sgp); | ||
1465 | if (IS_ERR(entry)) | ||
1466 | error = PTR_ERR(entry); | ||
1467 | else { | ||
1468 | swap = *entry; | ||
1469 | shmem_swp_unmap(entry); | ||
1470 | } | ||
1471 | ret = error || swap.val; | ||
1472 | if (ret) | ||
1473 | mem_cgroup_uncharge_cache_page(filepage); | ||
1474 | else | ||
1475 | ret = add_to_page_cache_lru(filepage, mapping, | ||
1476 | idx, GFP_NOWAIT); | 1408 | idx, GFP_NOWAIT); |
1477 | /* | 1409 | /* |
1478 | * At add_to_page_cache_lru() failure, uncharge will | 1410 | * At add_to_page_cache_lru() failure, |
1479 | * be done automatically. | 1411 | * uncharge will be done automatically. |
1480 | */ | 1412 | */ |
1481 | if (ret) { | 1413 | if (ret) { |
1482 | spin_unlock(&info->lock); | 1414 | shmem_unacct_blocks(info->flags, 1); |
1483 | page_cache_release(filepage); | 1415 | shmem_free_blocks(inode, 1); |
1484 | shmem_unacct_blocks(info->flags, 1); | 1416 | spin_unlock(&info->lock); |
1485 | shmem_free_blocks(inode, 1); | 1417 | page_cache_release(page); |
1486 | filepage = NULL; | 1418 | if (error) |
1487 | if (error) | 1419 | goto out; |
1488 | goto failed; | 1420 | goto repeat; |
1489 | goto repeat; | ||
1490 | } | ||
1491 | info->flags |= SHMEM_PAGEIN; | ||
1492 | } | 1421 | } |
1493 | 1422 | ||
1423 | info->flags |= SHMEM_PAGEIN; | ||
1494 | info->alloced++; | 1424 | info->alloced++; |
1495 | spin_unlock(&info->lock); | 1425 | spin_unlock(&info->lock); |
1496 | clear_highpage(filepage); | 1426 | clear_highpage(page); |
1497 | flush_dcache_page(filepage); | 1427 | flush_dcache_page(page); |
1498 | SetPageUptodate(filepage); | 1428 | SetPageUptodate(page); |
1499 | if (sgp == SGP_DIRTY) | 1429 | if (sgp == SGP_DIRTY) |
1500 | set_page_dirty(filepage); | 1430 | set_page_dirty(page); |
1431 | |||
1432 | } else { | ||
1433 | spin_unlock(&info->lock); | ||
1434 | error = -ENOMEM; | ||
1435 | goto out; | ||
1501 | } | 1436 | } |
1502 | done: | 1437 | done: |
1503 | *pagep = filepage; | 1438 | *pagep = page; |
1504 | error = 0; | 1439 | error = 0; |
1505 | goto out; | 1440 | out: |
1441 | if (prealloc_page) { | ||
1442 | mem_cgroup_uncharge_cache_page(prealloc_page); | ||
1443 | page_cache_release(prealloc_page); | ||
1444 | } | ||
1445 | return error; | ||
1506 | 1446 | ||
1507 | nospace: | 1447 | nospace: |
1508 | /* | 1448 | /* |
1509 | * Perhaps the page was brought in from swap between find_lock_page | 1449 | * Perhaps the page was brought in from swap between find_lock_page |
1510 | * and taking info->lock? We allow for that at add_to_page_cache_lru, | 1450 | * and taking info->lock? We allow for that at add_to_page_cache_lru, |
1511 | * but must also avoid reporting a spurious ENOSPC while working on a | 1451 | * but must also avoid reporting a spurious ENOSPC while working on a |
1512 | * full tmpfs. (When filepage has been passed in to shmem_getpage, it | 1452 | * full tmpfs. |
1513 | * is already in page cache, which prevents this race from occurring.) | ||
1514 | */ | 1453 | */ |
1515 | if (!filepage) { | 1454 | page = find_get_page(mapping, idx); |
1516 | struct page *page = find_get_page(mapping, idx); | ||
1517 | if (page) { | ||
1518 | spin_unlock(&info->lock); | ||
1519 | page_cache_release(page); | ||
1520 | goto repeat; | ||
1521 | } | ||
1522 | } | ||
1523 | spin_unlock(&info->lock); | 1455 | spin_unlock(&info->lock); |
1524 | error = -ENOSPC; | 1456 | if (page) { |
1525 | failed: | 1457 | page_cache_release(page); |
1526 | if (*pagep != filepage) { | 1458 | goto repeat; |
1527 | unlock_page(filepage); | ||
1528 | page_cache_release(filepage); | ||
1529 | } | ||
1530 | out: | ||
1531 | if (prealloc_page) { | ||
1532 | mem_cgroup_uncharge_cache_page(prealloc_page); | ||
1533 | page_cache_release(prealloc_page); | ||
1534 | } | 1459 | } |
1535 | return error; | 1460 | error = -ENOSPC; |
1461 | goto out; | ||
1536 | } | 1462 | } |
1537 | 1463 | ||
1538 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1464 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
1539 | { | 1465 | { |
1540 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; | 1466 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1541 | int error; | 1467 | int error; |
1542 | int ret; | 1468 | int ret = VM_FAULT_LOCKED; |
1543 | 1469 | ||
1544 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | 1470 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) |
1545 | return VM_FAULT_SIGBUS; | 1471 | return VM_FAULT_SIGBUS; |
@@ -1547,11 +1473,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1547 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1473 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1548 | if (error) | 1474 | if (error) |
1549 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1475 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1476 | |||
1550 | if (ret & VM_FAULT_MAJOR) { | 1477 | if (ret & VM_FAULT_MAJOR) { |
1551 | count_vm_event(PGMAJFAULT); | 1478 | count_vm_event(PGMAJFAULT); |
1552 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1479 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
1553 | } | 1480 | } |
1554 | return ret | VM_FAULT_LOCKED; | 1481 | return ret; |
1555 | } | 1482 | } |
1556 | 1483 | ||
1557 | #ifdef CONFIG_NUMA | 1484 | #ifdef CONFIG_NUMA |
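The rewritten shmem_getpage_gfp() above preallocates a page before taking info->lock and, if a racing task already installed one (or the slot gained a swap entry), drops everything and jumps back to repeat:, freeing the unused preallocation on the way out. Stripped of the swap handling, that optimistic-allocation-plus-retry shape looks roughly like this userspace sketch (the toy_cache struct and slot array are invented for illustration):

#include <pthread.h>
#include <stdlib.h>

#define NSLOTS 64

struct toy_cache {
	pthread_mutex_t lock;
	void *slot[NSLOTS];
};

static void *lookup_or_create(struct toy_cache *c, unsigned int idx)
{
	void *fresh = NULL;
repeat:
	pthread_mutex_lock(&c->lock);
	if (c->slot[idx]) {			/* fast path: already present */
		void *hit = c->slot[idx];
		pthread_mutex_unlock(&c->lock);
		free(fresh);			/* drop an unused preallocation */
		return hit;
	}
	pthread_mutex_unlock(&c->lock);

	if (!fresh) {
		fresh = malloc(64);		/* allocate outside the lock */
		if (!fresh)
			return NULL;
		goto repeat;			/* re-check under the lock */
	}

	pthread_mutex_lock(&c->lock);
	if (c->slot[idx]) {			/* lost the race: retry from the top */
		pthread_mutex_unlock(&c->lock);
		goto repeat;
	}
	c->slot[idx] = fresh;
	pthread_mutex_unlock(&c->lock);
	return fresh;
}

int main(void)
{
	struct toy_cache c = { PTHREAD_MUTEX_INITIALIZER, { 0 } };
	return lookup_or_create(&c, 7) ? 0 : 1;
}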
@@ -1668,19 +1595,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1668 | static const struct inode_operations shmem_symlink_inode_operations; | 1595 | static const struct inode_operations shmem_symlink_inode_operations; |
1669 | static const struct inode_operations shmem_symlink_inline_operations; | 1596 | static const struct inode_operations shmem_symlink_inline_operations; |
1670 | 1597 | ||
1671 | /* | ||
1672 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; | ||
1673 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | ||
1674 | * below the loop driver, in the generic fashion that many filesystems support. | ||
1675 | */ | ||
1676 | static int shmem_readpage(struct file *file, struct page *page) | ||
1677 | { | ||
1678 | struct inode *inode = page->mapping->host; | ||
1679 | int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); | ||
1680 | unlock_page(page); | ||
1681 | return error; | ||
1682 | } | ||
1683 | |||
1684 | static int | 1598 | static int |
1685 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1599 | shmem_write_begin(struct file *file, struct address_space *mapping, |
1686 | loff_t pos, unsigned len, unsigned flags, | 1600 | loff_t pos, unsigned len, unsigned flags, |
@@ -1688,7 +1602,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
1688 | { | 1602 | { |
1689 | struct inode *inode = mapping->host; | 1603 | struct inode *inode = mapping->host; |
1690 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1604 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1691 | *pagep = NULL; | ||
1692 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | 1605 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); |
1693 | } | 1606 | } |
1694 | 1607 | ||
@@ -1845,6 +1758,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, | |||
1845 | return retval; | 1758 | return retval; |
1846 | } | 1759 | } |
1847 | 1760 | ||
1761 | static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | ||
1762 | struct pipe_inode_info *pipe, size_t len, | ||
1763 | unsigned int flags) | ||
1764 | { | ||
1765 | struct address_space *mapping = in->f_mapping; | ||
1766 | struct inode *inode = mapping->host; | ||
1767 | unsigned int loff, nr_pages, req_pages; | ||
1768 | struct page *pages[PIPE_DEF_BUFFERS]; | ||
1769 | struct partial_page partial[PIPE_DEF_BUFFERS]; | ||
1770 | struct page *page; | ||
1771 | pgoff_t index, end_index; | ||
1772 | loff_t isize, left; | ||
1773 | int error, page_nr; | ||
1774 | struct splice_pipe_desc spd = { | ||
1775 | .pages = pages, | ||
1776 | .partial = partial, | ||
1777 | .flags = flags, | ||
1778 | .ops = &page_cache_pipe_buf_ops, | ||
1779 | .spd_release = spd_release_page, | ||
1780 | }; | ||
1781 | |||
1782 | isize = i_size_read(inode); | ||
1783 | if (unlikely(*ppos >= isize)) | ||
1784 | return 0; | ||
1785 | |||
1786 | left = isize - *ppos; | ||
1787 | if (unlikely(left < len)) | ||
1788 | len = left; | ||
1789 | |||
1790 | if (splice_grow_spd(pipe, &spd)) | ||
1791 | return -ENOMEM; | ||
1792 | |||
1793 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1794 | loff = *ppos & ~PAGE_CACHE_MASK; | ||
1795 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1796 | nr_pages = min(req_pages, pipe->buffers); | ||
1797 | |||
1798 | spd.nr_pages = find_get_pages_contig(mapping, index, | ||
1799 | nr_pages, spd.pages); | ||
1800 | index += spd.nr_pages; | ||
1801 | error = 0; | ||
1802 | |||
1803 | while (spd.nr_pages < nr_pages) { | ||
1804 | error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); | ||
1805 | if (error) | ||
1806 | break; | ||
1807 | unlock_page(page); | ||
1808 | spd.pages[spd.nr_pages++] = page; | ||
1809 | index++; | ||
1810 | } | ||
1811 | |||
1812 | index = *ppos >> PAGE_CACHE_SHIFT; | ||
1813 | nr_pages = spd.nr_pages; | ||
1814 | spd.nr_pages = 0; | ||
1815 | |||
1816 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { | ||
1817 | unsigned int this_len; | ||
1818 | |||
1819 | if (!len) | ||
1820 | break; | ||
1821 | |||
1822 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); | ||
1823 | page = spd.pages[page_nr]; | ||
1824 | |||
1825 | if (!PageUptodate(page) || page->mapping != mapping) { | ||
1826 | error = shmem_getpage(inode, index, &page, | ||
1827 | SGP_CACHE, NULL); | ||
1828 | if (error) | ||
1829 | break; | ||
1830 | unlock_page(page); | ||
1831 | page_cache_release(spd.pages[page_nr]); | ||
1832 | spd.pages[page_nr] = page; | ||
1833 | } | ||
1834 | |||
1835 | isize = i_size_read(inode); | ||
1836 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | ||
1837 | if (unlikely(!isize || index > end_index)) | ||
1838 | break; | ||
1839 | |||
1840 | if (end_index == index) { | ||
1841 | unsigned int plen; | ||
1842 | |||
1843 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | ||
1844 | if (plen <= loff) | ||
1845 | break; | ||
1846 | |||
1847 | this_len = min(this_len, plen - loff); | ||
1848 | len = this_len; | ||
1849 | } | ||
1850 | |||
1851 | spd.partial[page_nr].offset = loff; | ||
1852 | spd.partial[page_nr].len = this_len; | ||
1853 | len -= this_len; | ||
1854 | loff = 0; | ||
1855 | spd.nr_pages++; | ||
1856 | index++; | ||
1857 | } | ||
1858 | |||
1859 | while (page_nr < nr_pages) | ||
1860 | page_cache_release(spd.pages[page_nr++]); | ||
1861 | |||
1862 | if (spd.nr_pages) | ||
1863 | error = splice_to_pipe(pipe, &spd); | ||
1864 | |||
1865 | splice_shrink_spd(pipe, &spd); | ||
1866 | |||
1867 | if (error > 0) { | ||
1868 | *ppos += error; | ||
1869 | file_accessed(in); | ||
1870 | } | ||
1871 | return error; | ||
1872 | } | ||
1873 | |||
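shmem_file_splice_read() above lets tmpfs feed its own pages into a pipe without relying on ->readpage. Nothing changes for userspace except that splice(2) on a tmpfs file now takes this path; a minimal demonstration (the /dev/shm/demo path is just an assumed pre-existing file on a tmpfs mount):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int pipefd[2];
	int fd = open("/dev/shm/demo", O_RDONLY);	/* any file on a tmpfs mount */
	ssize_t n;

	if (fd < 0 || pipe(pipefd) < 0) {
		perror("setup");
		return 1;
	}

	/* Move up to 64KiB from the tmpfs file into the pipe without copying
	 * through a userspace buffer; this exercises shmem_file_splice_read(). */
	n = splice(fd, NULL, pipefd[1], NULL, 65536, SPLICE_F_MOVE);
	if (n < 0) {
		perror("splice");
		return 1;
	}

	/* Drain the pipe so the program is self-contained. */
	while ((n = read(pipefd[0], buf, sizeof(buf))) > 0)
		fwrite(buf, 1, (size_t)n, stdout);
	return 0;
}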
1848 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1874 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1849 | { | 1875 | { |
1850 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1876 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
@@ -2005,7 +2031,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2005 | int error; | 2031 | int error; |
2006 | int len; | 2032 | int len; |
2007 | struct inode *inode; | 2033 | struct inode *inode; |
2008 | struct page *page = NULL; | 2034 | struct page *page; |
2009 | char *kaddr; | 2035 | char *kaddr; |
2010 | struct shmem_inode_info *info; | 2036 | struct shmem_inode_info *info; |
2011 | 2037 | ||
@@ -2683,7 +2709,6 @@ static const struct address_space_operations shmem_aops = { | |||
2683 | .writepage = shmem_writepage, | 2709 | .writepage = shmem_writepage, |
2684 | .set_page_dirty = __set_page_dirty_no_writeback, | 2710 | .set_page_dirty = __set_page_dirty_no_writeback, |
2685 | #ifdef CONFIG_TMPFS | 2711 | #ifdef CONFIG_TMPFS |
2686 | .readpage = shmem_readpage, | ||
2687 | .write_begin = shmem_write_begin, | 2712 | .write_begin = shmem_write_begin, |
2688 | .write_end = shmem_write_end, | 2713 | .write_end = shmem_write_end, |
2689 | #endif | 2714 | #endif |
@@ -2700,13 +2725,13 @@ static const struct file_operations shmem_file_operations = { | |||
2700 | .aio_read = shmem_file_aio_read, | 2725 | .aio_read = shmem_file_aio_read, |
2701 | .aio_write = generic_file_aio_write, | 2726 | .aio_write = generic_file_aio_write, |
2702 | .fsync = noop_fsync, | 2727 | .fsync = noop_fsync, |
2703 | .splice_read = generic_file_splice_read, | 2728 | .splice_read = shmem_file_splice_read, |
2704 | .splice_write = generic_file_splice_write, | 2729 | .splice_write = generic_file_splice_write, |
2705 | #endif | 2730 | #endif |
2706 | }; | 2731 | }; |
2707 | 2732 | ||
2708 | static const struct inode_operations shmem_inode_operations = { | 2733 | static const struct inode_operations shmem_inode_operations = { |
2709 | .setattr = shmem_notify_change, | 2734 | .setattr = shmem_setattr, |
2710 | .truncate_range = shmem_truncate_range, | 2735 | .truncate_range = shmem_truncate_range, |
2711 | #ifdef CONFIG_TMPFS_XATTR | 2736 | #ifdef CONFIG_TMPFS_XATTR |
2712 | .setxattr = shmem_setxattr, | 2737 | .setxattr = shmem_setxattr, |
@@ -2714,10 +2739,6 @@ static const struct inode_operations shmem_inode_operations = { | |||
2714 | .listxattr = shmem_listxattr, | 2739 | .listxattr = shmem_listxattr, |
2715 | .removexattr = shmem_removexattr, | 2740 | .removexattr = shmem_removexattr, |
2716 | #endif | 2741 | #endif |
2717 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
2718 | .check_acl = generic_check_acl, | ||
2719 | #endif | ||
2720 | |||
2721 | }; | 2742 | }; |
2722 | 2743 | ||
2723 | static const struct inode_operations shmem_dir_inode_operations = { | 2744 | static const struct inode_operations shmem_dir_inode_operations = { |
@@ -2739,8 +2760,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2739 | .removexattr = shmem_removexattr, | 2760 | .removexattr = shmem_removexattr, |
2740 | #endif | 2761 | #endif |
2741 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2762 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2742 | .setattr = shmem_notify_change, | 2763 | .setattr = shmem_setattr, |
2743 | .check_acl = generic_check_acl, | ||
2744 | #endif | 2764 | #endif |
2745 | }; | 2765 | }; |
2746 | 2766 | ||
@@ -2752,8 +2772,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2752 | .removexattr = shmem_removexattr, | 2772 | .removexattr = shmem_removexattr, |
2753 | #endif | 2773 | #endif |
2754 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2774 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2755 | .setattr = shmem_notify_change, | 2775 | .setattr = shmem_setattr, |
2756 | .check_acl = generic_check_acl, | ||
2757 | #endif | 2776 | #endif |
2758 | }; | 2777 | }; |
2759 | 2778 | ||
@@ -2908,6 +2927,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2908 | return 0; | 2927 | return 0; |
2909 | } | 2928 | } |
2910 | 2929 | ||
2930 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | ||
2931 | { | ||
2932 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
2933 | } | ||
2934 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | ||
2935 | |||
2911 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2936 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
2912 | /** | 2937 | /** |
2913 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | 2938 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file |
@@ -3028,3 +3053,42 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3028 | vma->vm_flags |= VM_CAN_NONLINEAR; | 3053 | vma->vm_flags |= VM_CAN_NONLINEAR; |
3029 | return 0; | 3054 | return 0; |
3030 | } | 3055 | } |
3056 | |||
3057 | /** | ||
3058 | * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. | ||
3059 | * @mapping: the page's address_space | ||
3060 | * @index: the page index | ||
3061 | * @gfp: the page allocator flags to use if allocating | ||
3062 | * | ||
3063 | * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", | ||
3064 | * with any new page allocations done using the specified allocation flags. | ||
3065 | * But read_cache_page_gfp() uses the ->readpage() method: which does not | ||
3066 | * suit tmpfs, since it may have pages in swapcache, and needs to find those | ||
3067 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. | ||
3068 | * | ||
3069 | * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in | ||
3070 | * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. | ||
3071 | */ | ||
3072 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | ||
3073 | pgoff_t index, gfp_t gfp) | ||
3074 | { | ||
3075 | #ifdef CONFIG_SHMEM | ||
3076 | struct inode *inode = mapping->host; | ||
3077 | struct page *page; | ||
3078 | int error; | ||
3079 | |||
3080 | BUG_ON(mapping->a_ops != &shmem_aops); | ||
3081 | error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); | ||
3082 | if (error) | ||
3083 | page = ERR_PTR(error); | ||
3084 | else | ||
3085 | unlock_page(page); | ||
3086 | return page; | ||
3087 | #else | ||
3088 | /* | ||
3089 | * The tiny !SHMEM case uses ramfs without swap | ||
3090 | */ | ||
3091 | return read_cache_page_gfp(mapping, index, gfp); | ||
3092 | #endif | ||
3093 | } | ||
3094 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); | ||
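Per the kernel-doc above, a GPU driver that keeps its buffers in shmem would call the new export roughly like this (a fragment, not buildable standalone; filp is assumed to be a tmpfs-backed struct file, and the extra gfp bits mirror what the comment says i915 mixes in):

	struct address_space *mapping = filp->f_mapping;
	gfp_t gfp = mapping_gfp_mask(mapping) | __GFP_NORETRY | __GFP_NOWARN;
	struct page *page = shmem_read_mapping_page_gfp(mapping, index, gfp);

	if (IS_ERR(page))
		return PTR_ERR(page);
	/* ... use the page contents ... */
	page_cache_release(page);

Unlike read_cache_page_gfp(), this goes through shmem_getpage_gfp() and therefore also finds pages that currently live in swapcache.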
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic = | |||
574 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 574 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
575 | 575 | ||
576 | /* internal cache of cache description objs */ | 576 | /* internal cache of cache description objs */ |
577 | static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; | ||
577 | static struct kmem_cache cache_cache = { | 578 | static struct kmem_cache cache_cache = { |
579 | .nodelists = cache_cache_nodelists, | ||
578 | .batchcount = 1, | 580 | .batchcount = 1, |
579 | .limit = BOOT_CPUCACHE_ENTRIES, | 581 | .limit = BOOT_CPUCACHE_ENTRIES, |
580 | .shared = 1, | 582 | .shared = 1, |
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void) | |||
1492 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1494 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
1493 | 1495 | ||
1494 | /* | 1496 | /* |
1495 | * struct kmem_cache size depends on nr_node_ids, which | 1497 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1496 | * can be less than MAX_NUMNODES. | ||
1497 | */ | 1498 | */ |
1498 | cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + | 1499 | cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1499 | nr_node_ids * sizeof(struct kmem_list3 *); | 1500 | nr_node_ids * sizeof(struct kmem_list3 *); |
1500 | #if DEBUG | 1501 | #if DEBUG |
1501 | cache_cache.obj_size = cache_cache.buffer_size; | 1502 | cache_cache.obj_size = cache_cache.buffer_size; |
1502 | #endif | 1503 | #endif |
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2308 | if (!cachep) | 2309 | if (!cachep) |
2309 | goto oops; | 2310 | goto oops; |
2310 | 2311 | ||
2312 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | ||
2311 | #if DEBUG | 2313 | #if DEBUG |
2312 | cachep->obj_size = size; | 2314 | cachep->obj_size = size; |
2313 | 2315 | ||
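The slab change above sizes struct kmem_cache as offsetof(struct kmem_cache, array[nr_cpu_ids]) plus room for nr_node_ids pointers, then points cachep->nodelists just past the per-cpu array so both tables live in one allocation. The same layout trick in a standalone program (field and struct names are made up; offsetof with a runtime index is a GNU C extension, as used by the kernel):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct node_list { int dummy; };

struct cache {
	int batchcount;
	struct node_list **nodelists;	/* points into the same allocation */
	void *array[];			/* per-cpu part, sized at runtime */
};

int main(void)
{
	unsigned int nr_cpu_ids = 4, nr_node_ids = 2;

	/* per-cpu array first, then the per-node pointer table right after it */
	size_t size = offsetof(struct cache, array[nr_cpu_ids]) +
		      nr_node_ids * sizeof(struct node_list *);

	struct cache *c = calloc(1, size);
	if (!c)
		return 1;

	c->nodelists = (struct node_list **)&c->array[nr_cpu_ids];
	c->nodelists[0] = NULL;		/* safely inside the allocation */

	printf("one allocation of %zu bytes covers cpu and node tables\n", size);
	free(c);
	return 0;
}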
@@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3153 | objp += obj_offset(cachep); | 3155 | objp += obj_offset(cachep); |
3154 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3156 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3155 | cachep->ctor(objp); | 3157 | cachep->ctor(objp); |
3156 | #if ARCH_SLAB_MINALIGN | 3158 | if (ARCH_SLAB_MINALIGN && |
3157 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3159 | ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { |
3158 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3160 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
3159 | objp, ARCH_SLAB_MINALIGN); | 3161 | objp, (int)ARCH_SLAB_MINALIGN); |
3160 | } | 3162 | } |
3161 | #endif | ||
3162 | return objp; | 3163 | return objp; |
3163 | } | 3164 | } |
3164 | #else | 3165 | #else |
@@ -3604,13 +3605,14 @@ free_done: | |||
3604 | * Release an obj back to its cache. If the obj has a constructed state, it must | 3605 | * Release an obj back to its cache. If the obj has a constructed state, it must |
3605 | * be in this state _before_ it is released. Called with disabled ints. | 3606 | * be in this state _before_ it is released. Called with disabled ints. |
3606 | */ | 3607 | */ |
3607 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) | 3608 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, |
3609 | void *caller) | ||
3608 | { | 3610 | { |
3609 | struct array_cache *ac = cpu_cache_get(cachep); | 3611 | struct array_cache *ac = cpu_cache_get(cachep); |
3610 | 3612 | ||
3611 | check_irq_off(); | 3613 | check_irq_off(); |
3612 | kmemleak_free_recursive(objp, cachep->flags); | 3614 | kmemleak_free_recursive(objp, cachep->flags); |
3613 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3615 | objp = cache_free_debugcheck(cachep, objp, caller); |
3614 | 3616 | ||
3615 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | 3617 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); |
3616 | 3618 | ||
@@ -3801,7 +3803,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3801 | debug_check_no_locks_freed(objp, obj_size(cachep)); | 3803 | debug_check_no_locks_freed(objp, obj_size(cachep)); |
3802 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3804 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3803 | debug_check_no_obj_freed(objp, obj_size(cachep)); | 3805 | debug_check_no_obj_freed(objp, obj_size(cachep)); |
3804 | __cache_free(cachep, objp); | 3806 | __cache_free(cachep, objp, __builtin_return_address(0)); |
3805 | local_irq_restore(flags); | 3807 | local_irq_restore(flags); |
3806 | 3808 | ||
3807 | trace_kmem_cache_free(_RET_IP_, objp); | 3809 | trace_kmem_cache_free(_RET_IP_, objp); |
@@ -3831,7 +3833,7 @@ void kfree(const void *objp) | |||
3831 | c = virt_to_cache(objp); | 3833 | c = virt_to_cache(objp); |
3832 | debug_check_no_locks_freed(objp, obj_size(c)); | 3834 | debug_check_no_locks_freed(objp, obj_size(c)); |
3833 | debug_check_no_obj_freed(objp, obj_size(c)); | 3835 | debug_check_no_obj_freed(objp, obj_size(c)); |
3834 | __cache_free(c, (void *)objp); | 3836 | __cache_free(c, (void *)objp, __builtin_return_address(0)); |
3835 | local_irq_restore(flags); | 3837 | local_irq_restore(flags); |
3836 | } | 3838 | } |
3837 | EXPORT_SYMBOL(kfree); | 3839 | EXPORT_SYMBOL(kfree); |
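Passing __builtin_return_address(0) down from kmem_cache_free()/kfree() means the debug hooks record the real caller rather than the internal __cache_free() frame. A trivial standalone illustration of that builtin (GCC/Clang-specific; the wrapper names are invented):

#include <stdio.h>
#include <stdlib.h>

/* Record who asked for the free, the way __cache_free() now receives it. */
static void traced_free(void *p, void *caller)
{
	printf("free %p called from %p\n", p, caller);
	free(p);
}

static __attribute__((noinline)) void my_free(void *p)
{
	traced_free(p, __builtin_return_address(0));
}

int main(void)
{
	my_free(malloc(32));
	return 0;
}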
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
482 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 482 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
483 | void *ret; | 483 | void *ret; |
484 | 484 | ||
485 | gfp &= gfp_allowed_mask; | ||
486 | |||
485 | lockdep_trace_alloc(gfp); | 487 | lockdep_trace_alloc(gfp); |
486 | 488 | ||
487 | if (size < PAGE_SIZE - align) { | 489 | if (size < PAGE_SIZE - align) { |
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
608 | { | 610 | { |
609 | void *b; | 611 | void *b; |
610 | 612 | ||
613 | flags &= gfp_allowed_mask; | ||
614 | |||
615 | lockdep_trace_alloc(flags); | ||
616 | |||
611 | if (c->size < PAGE_SIZE) { | 617 | if (c->size < PAGE_SIZE) { |
612 | b = slob_alloc(c->size, flags, c->align, node); | 618 | b = slob_alloc(c->size, flags, c->align, node); |
613 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | 619 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/memory.h> | 27 | #include <linux/memory.h> |
28 | #include <linux/math64.h> | 28 | #include <linux/math64.h> |
29 | #include <linux/fault-inject.h> | 29 | #include <linux/fault-inject.h> |
30 | #include <linux/stacktrace.h> | ||
30 | 31 | ||
31 | #include <trace/events/kmem.h> | 32 | #include <trace/events/kmem.h> |
32 | 33 | ||
@@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches); | |||
191 | /* | 192 | /* |
192 | * Tracking user of a slab. | 193 | * Tracking user of a slab. |
193 | */ | 194 | */ |
195 | #define TRACK_ADDRS_COUNT 16 | ||
194 | struct track { | 196 | struct track { |
195 | unsigned long addr; /* Called from address */ | 197 | unsigned long addr; /* Called from address */ |
198 | #ifdef CONFIG_STACKTRACE | ||
199 | unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ | ||
200 | #endif | ||
196 | int cpu; /* Was running on cpu */ | 201 | int cpu; /* Was running on cpu */ |
197 | int pid; /* Pid context */ | 202 | int pid; /* Pid context */ |
198 | unsigned long when; /* When did the operation occur */ | 203 | unsigned long when; /* When did the operation occur */ |
@@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object, | |||
420 | struct track *p = get_track(s, object, alloc); | 425 | struct track *p = get_track(s, object, alloc); |
421 | 426 | ||
422 | if (addr) { | 427 | if (addr) { |
428 | #ifdef CONFIG_STACKTRACE | ||
429 | struct stack_trace trace; | ||
430 | int i; | ||
431 | |||
432 | trace.nr_entries = 0; | ||
433 | trace.max_entries = TRACK_ADDRS_COUNT; | ||
434 | trace.entries = p->addrs; | ||
435 | trace.skip = 3; | ||
436 | save_stack_trace(&trace); | ||
437 | |||
438 | /* See rant in lockdep.c */ | ||
439 | if (trace.nr_entries != 0 && | ||
440 | trace.entries[trace.nr_entries - 1] == ULONG_MAX) | ||
441 | trace.nr_entries--; | ||
442 | |||
443 | for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) | ||
444 | p->addrs[i] = 0; | ||
445 | #endif | ||
423 | p->addr = addr; | 446 | p->addr = addr; |
424 | p->cpu = smp_processor_id(); | 447 | p->cpu = smp_processor_id(); |
425 | p->pid = current->pid; | 448 | p->pid = current->pid; |
@@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t) | |||
444 | 467 | ||
445 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", | 468 | printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", |
446 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); | 469 | s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); |
470 | #ifdef CONFIG_STACKTRACE | ||
471 | { | ||
472 | int i; | ||
473 | for (i = 0; i < TRACK_ADDRS_COUNT; i++) | ||
474 | if (t->addrs[i]) | ||
475 | printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); | ||
476 | else | ||
477 | break; | ||
478 | } | ||
479 | #endif | ||
447 | } | 480 | } |
448 | 481 | ||
449 | static void print_tracking(struct kmem_cache *s, void *object) | 482 | static void print_tracking(struct kmem_cache *s, void *object) |
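The set_track()/print_track() changes above record up to TRACK_ADDRS_COUNT return addresses per alloc/free (via save_stack_trace(), skipping the allocator's own frames) and print them until the first zero entry. A rough userspace sketch of the same store-then-print idea, using glibc's backtrace()/backtrace_symbols() in place of the kernel stacktrace API (all names here are illustrative, not from the patch):

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TRACK_ADDRS_COUNT 16

struct track {
	void *addrs[TRACK_ADDRS_COUNT];	/* call chain, zero-padded */
};

static void set_track(struct track *t)
{
	int n = backtrace(t->addrs, TRACK_ADDRS_COUNT);

	/* zero the unused slots so print_track() knows where to stop */
	memset(&t->addrs[n], 0, (TRACK_ADDRS_COUNT - n) * sizeof(void *));
}

static void print_track(const struct track *t)
{
	int n = 0;

	while (n < TRACK_ADDRS_COUNT && t->addrs[n])
		n++;
	char **syms = backtrace_symbols(t->addrs, n);
	for (int i = 0; i < n; i++)
		printf("\t%s\n", syms ? syms[i] : "?");
	free(syms);
}

int main(void)
{
	struct track t;

	set_track(&t);
	print_track(&t);
	return 0;
}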
@@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
557 | memset(p + s->objsize, val, s->inuse - s->objsize); | 590 | memset(p + s->objsize, val, s->inuse - s->objsize); |
558 | } | 591 | } |
559 | 592 | ||
560 | static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) | 593 | static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) |
561 | { | 594 | { |
562 | while (bytes) { | 595 | while (bytes) { |
563 | if (*start != (u8)value) | 596 | if (*start != value) |
564 | return start; | 597 | return start; |
565 | start++; | 598 | start++; |
566 | bytes--; | 599 | bytes--; |
@@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) | |||
568 | return NULL; | 601 | return NULL; |
569 | } | 602 | } |
570 | 603 | ||
604 | static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) | ||
605 | { | ||
606 | u64 value64; | ||
607 | unsigned int words, prefix; | ||
608 | |||
609 | if (bytes <= 16) | ||
610 | return check_bytes8(start, value, bytes); | ||
611 | |||
612 | value64 = value | value << 8 | value << 16 | value << 24; | ||
613 | value64 = value64 | value64 << 32; | ||
614 | prefix = 8 - ((unsigned long)start) % 8; | ||
615 | |||
616 | if (prefix) { | ||
617 | u8 *r = check_bytes8(start, value, prefix); | ||
618 | if (r) | ||
619 | return r; | ||
620 | start += prefix; | ||
621 | bytes -= prefix; | ||
622 | } | ||
623 | |||
624 | words = bytes / 8; | ||
625 | |||
626 | while (words) { | ||
627 | if (*(u64 *)start != value64) | ||
628 | return check_bytes8(start, value, 8); | ||
629 | start += 8; | ||
630 | words--; | ||
631 | } | ||
632 | |||
633 | return check_bytes8(start, value, bytes % 8); | ||
634 | } | ||
635 | |||
571 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | 636 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, |
572 | void *from, void *to) | 637 | void *from, void *to) |
573 | { | 638 | { |
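check_bytes() above is a word-at-a-time rewrite of the old byte loop: it replicates the pattern byte into a 64-bit word, walks any unaligned prefix with check_bytes8(), then compares eight bytes per step and drops back to the byte version only to pinpoint the exact mismatch. A standalone userspace sketch of the same technique (simplified: the already-aligned case is normalized and the byte is replicated by multiplication rather than shifts):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t *check_bytes8(uint8_t *start, uint8_t value, size_t bytes)
{
	for (; bytes; start++, bytes--)
		if (*start != value)
			return start;	/* first byte that differs from the pattern */
	return NULL;
}

static uint8_t *check_bytes(uint8_t *start, uint8_t value, size_t bytes)
{
	uint64_t value64 = 0x0101010101010101ULL * value;	/* pattern in every byte */
	size_t prefix = (8 - (uintptr_t)start % 8) % 8;

	if (bytes <= 16)
		return check_bytes8(start, value, bytes);

	if (prefix) {		/* align to an 8-byte boundary first */
		uint8_t *r = check_bytes8(start, value, prefix);
		if (r)
			return r;
		start += prefix;
		bytes -= prefix;
	}

	for (; bytes >= 8; start += 8, bytes -= 8)
		if (*(uint64_t *)start != value64)
			return check_bytes8(start, value, 8);	/* locate the byte */

	return check_bytes8(start, value, bytes);
}

int main(void)
{
	uint8_t buf[64];

	memset(buf, 0x5a, sizeof(buf));
	buf[41] = 0;	/* simulate one corrupted poison byte */
	printf("mismatch at offset %td\n", check_bytes(buf, 0x5a, sizeof(buf)) - buf);
	return 0;
}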
@@ -2320,16 +2385,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) | |||
2320 | BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < | 2385 | BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < |
2321 | SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); | 2386 | SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); |
2322 | 2387 | ||
2323 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2324 | /* | 2388 | /* |
2325 | * Must align to double word boundary for the double cmpxchg instructions | 2389 | * Must align to double word boundary for the double cmpxchg |
2326 | * to work. | 2390 | * instructions to work; see __pcpu_double_call_return_bool(). |
2327 | */ | 2391 | */ |
2328 | s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); | 2392 | s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), |
2329 | #else | 2393 | 2 * sizeof(void *)); |
2330 | /* Regular alignment is sufficient */ | ||
2331 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | ||
2332 | #endif | ||
2333 | 2394 | ||
2334 | if (!s->cpu_slab) | 2395 | if (!s->cpu_slab) |
2335 | return 0; | 2396 | return 0; |
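The hunk above drops the CONFIG_CMPXCHG_LOCAL split and always allocates the per-cpu kmem_cache_cpu structures aligned to 2 * sizeof(void *), since the double-word cmpxchg used by the lockless fastpath needs its operand aligned to twice the pointer size. A tiny userspace illustration of requesting and checking that alignment (the struct layout is illustrative; __alloc_percpu itself is kernel-only):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Two machine words that a cmpxchg-double would update as one unit. */
struct percpu_slot {
	void *freelist;
	unsigned long tid;
};

int main(void)
{
	size_t align = 2 * sizeof(void *);
	/* aligned_alloc() requires the size to be a multiple of the alignment */
	struct percpu_slot *s = aligned_alloc(align, sizeof(*s));

	assert(s && (uintptr_t)s % align == 0);
	printf("slot at %p, aligned to %zu bytes\n", (void *)s, align);
	free(s);
	return 0;
}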
@@ -2932,6 +2993,42 @@ size_t ksize(const void *object) | |||
2932 | } | 2993 | } |
2933 | EXPORT_SYMBOL(ksize); | 2994 | EXPORT_SYMBOL(ksize); |
2934 | 2995 | ||
2996 | #ifdef CONFIG_SLUB_DEBUG | ||
2997 | bool verify_mem_not_deleted(const void *x) | ||
2998 | { | ||
2999 | struct page *page; | ||
3000 | void *object = (void *)x; | ||
3001 | unsigned long flags; | ||
3002 | bool rv; | ||
3003 | |||
3004 | if (unlikely(ZERO_OR_NULL_PTR(x))) | ||
3005 | return false; | ||
3006 | |||
3007 | local_irq_save(flags); | ||
3008 | |||
3009 | page = virt_to_head_page(x); | ||
3010 | if (unlikely(!PageSlab(page))) { | ||
3011 | /* maybe it was from stack? */ | ||
3012 | rv = true; | ||
3013 | goto out_unlock; | ||
3014 | } | ||
3015 | |||
3016 | slab_lock(page); | ||
3017 | if (on_freelist(page->slab, page, object)) { | ||
3018 | object_err(page->slab, page, object, "Object is on free-list"); | ||
3019 | rv = false; | ||
3020 | } else { | ||
3021 | rv = true; | ||
3022 | } | ||
3023 | slab_unlock(page); | ||
3024 | |||
3025 | out_unlock: | ||
3026 | local_irq_restore(flags); | ||
3027 | return rv; | ||
3028 | } | ||
3029 | EXPORT_SYMBOL(verify_mem_not_deleted); | ||
3030 | #endif | ||
3031 | |||
2935 | void kfree(const void *x) | 3032 | void kfree(const void *x) |
2936 | { | 3033 | { |
2937 | struct page *page; | 3034 | struct page *page; |
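verify_mem_not_deleted() above lets SLUB debug builds check whether a pointer still refers to a live slab object: non-slab memory (e.g. a stack address) passes, while an object found on a freelist is reported via object_err() and fails. A hypothetical caller sketch, not part of the patch (struct foo and foo_still_valid() are made up for illustration; the helper exists only under CONFIG_SLUB_DEBUG):

/* Bail out before touching an object that may already have been freed. */
static bool foo_still_valid(const struct foo *obj)
{
	if (!verify_mem_not_deleted(obj)) {
		pr_err("foo: suspected use-after-free at %p\n", obj);
		return false;
	}
	return true;
}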
@@ -4062,7 +4159,7 @@ static int any_slab_objects(struct kmem_cache *s) | |||
4062 | #endif | 4159 | #endif |
4063 | 4160 | ||
4064 | #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) | 4161 | #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) |
4065 | #define to_slab(n) container_of(n, struct kmem_cache, kobj); | 4162 | #define to_slab(n) container_of(n, struct kmem_cache, kobj) |
4066 | 4163 | ||
4067 | struct slab_attribute { | 4164 | struct slab_attribute { |
4068 | struct attribute attr; | 4165 | struct attribute attr; |
diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12831a2..858e1dff9b2a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | |||
40 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; | 40 | static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; |
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | int page_to_nid(struct page *page) | 43 | int page_to_nid(const struct page *page) |
44 | { | 44 | { |
45 | return section_to_node_table[page_to_section(page)]; | 45 | return section_to_node_table[page_to_section(page)]; |
46 | } | 46 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index d537d29e9b7b..1b8c33907242 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/vmalloc.h> | 14 | #include <linux/vmalloc.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
17 | #include <linux/shm.h> | 17 | #include <linux/shmem_fs.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/random.h> | 19 | #include <linux/random.h> |
20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
@@ -1681,19 +1681,14 @@ out: | |||
1681 | } | 1681 | } |
1682 | 1682 | ||
1683 | #ifdef CONFIG_PROC_FS | 1683 | #ifdef CONFIG_PROC_FS |
1684 | struct proc_swaps { | ||
1685 | struct seq_file seq; | ||
1686 | int event; | ||
1687 | }; | ||
1688 | |||
1689 | static unsigned swaps_poll(struct file *file, poll_table *wait) | 1684 | static unsigned swaps_poll(struct file *file, poll_table *wait) |
1690 | { | 1685 | { |
1691 | struct proc_swaps *s = file->private_data; | 1686 | struct seq_file *seq = file->private_data; |
1692 | 1687 | ||
1693 | poll_wait(file, &proc_poll_wait, wait); | 1688 | poll_wait(file, &proc_poll_wait, wait); |
1694 | 1689 | ||
1695 | if (s->event != atomic_read(&proc_poll_event)) { | 1690 | if (seq->poll_event != atomic_read(&proc_poll_event)) { |
1696 | s->event = atomic_read(&proc_poll_event); | 1691 | seq->poll_event = atomic_read(&proc_poll_event); |
1697 | return POLLIN | POLLRDNORM | POLLERR | POLLPRI; | 1692 | return POLLIN | POLLRDNORM | POLLERR | POLLPRI; |
1698 | } | 1693 | } |
1699 | 1694 | ||
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = { | |||
1783 | 1778 | ||
1784 | static int swaps_open(struct inode *inode, struct file *file) | 1779 | static int swaps_open(struct inode *inode, struct file *file) |
1785 | { | 1780 | { |
1786 | struct proc_swaps *s; | 1781 | struct seq_file *seq; |
1787 | int ret; | 1782 | int ret; |
1788 | 1783 | ||
1789 | s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); | ||
1790 | if (!s) | ||
1791 | return -ENOMEM; | ||
1792 | |||
1793 | file->private_data = s; | ||
1794 | |||
1795 | ret = seq_open(file, &swaps_op); | 1784 | ret = seq_open(file, &swaps_op); |
1796 | if (ret) { | 1785 | if (ret) |
1797 | kfree(s); | ||
1798 | return ret; | 1786 | return ret; |
1799 | } | ||
1800 | 1787 | ||
1801 | s->seq.private = s; | 1788 | seq = file->private_data; |
1802 | s->event = atomic_read(&proc_poll_event); | 1789 | seq->poll_event = atomic_read(&proc_poll_event); |
1803 | return ret; | 1790 | return 0; |
1804 | } | 1791 | } |
1805 | 1792 | ||
1806 | static const struct file_operations proc_swaps_operations = { | 1793 | static const struct file_operations proc_swaps_operations = { |
diff --git a/mm/thrash.c b/mm/thrash.c index 2372d4ed5dd8..e53f7d02c17c 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -6,7 +6,7 @@ | |||
6 | * Released under the GPL, see the file COPYING for details. | 6 | * Released under the GPL, see the file COPYING for details. |
7 | * | 7 | * |
8 | * Simple token based thrashing protection, using the algorithm | 8 | * Simple token based thrashing protection, using the algorithm |
9 | * described in: http://www.cs.wm.edu/~sjiang/token.pdf | 9 | * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html |
10 | * | 10 | * |
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | 11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> |
12 | * Improved algorithm to pass token: | 12 | * Improved algorithm to pass token: |
@@ -21,14 +21,40 @@ | |||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
24 | #include <linux/memcontrol.h> | ||
25 | |||
26 | #include <trace/events/vmscan.h> | ||
27 | |||
28 | #define TOKEN_AGING_INTERVAL (0xFF) | ||
24 | 29 | ||
25 | static DEFINE_SPINLOCK(swap_token_lock); | 30 | static DEFINE_SPINLOCK(swap_token_lock); |
26 | struct mm_struct *swap_token_mm; | 31 | struct mm_struct *swap_token_mm; |
27 | static unsigned int global_faults; | 32 | struct mem_cgroup *swap_token_memcg; |
33 | |||
34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
36 | { | ||
37 | struct mem_cgroup *memcg; | ||
38 | |||
39 | memcg = try_get_mem_cgroup_from_mm(mm); | ||
40 | if (memcg) | ||
41 | css_put(mem_cgroup_css(memcg)); | ||
42 | |||
43 | return memcg; | ||
44 | } | ||
45 | #else | ||
46 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
47 | { | ||
48 | return NULL; | ||
49 | } | ||
50 | #endif | ||
28 | 51 | ||
29 | void grab_swap_token(struct mm_struct *mm) | 52 | void grab_swap_token(struct mm_struct *mm) |
30 | { | 53 | { |
31 | int current_interval; | 54 | int current_interval; |
55 | unsigned int old_prio = mm->token_priority; | ||
56 | static unsigned int global_faults; | ||
57 | static unsigned int last_aging; | ||
32 | 58 | ||
33 | global_faults++; | 59 | global_faults++; |
34 | 60 | ||
@@ -38,40 +64,92 @@ void grab_swap_token(struct mm_struct *mm) | |||
38 | return; | 64 | return; |
39 | 65 | ||
40 | /* First come first served */ | 66 | /* First come first served */ |
41 | if (swap_token_mm == NULL) { | 67 | if (!swap_token_mm) |
42 | mm->token_priority = mm->token_priority + 2; | 68 | goto replace_token; |
43 | swap_token_mm = mm; | 69 | |
44 | goto out; | 70 | /* |
71 | * Usually we don't need priority aging because long-interval faults | ||
72 | * make the priority decrease quickly. But there is one exception: if the | ||
73 | * token owner task is sleeping, it never takes long-interval faults. | ||
74 | * Thus, we need a priority aging mechanism instead. The requirements | ||
75 | * for priority aging are: | ||
76 | * 1) The aging interval must be reasonably long; too short an aging | ||
77 | * interval loses the swap token quickly and decreases performance. | ||
78 | * 2) The swap token owner task has to get priority aging even while | ||
79 | * it is asleep. | ||
80 | */ | ||
81 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { | ||
82 | swap_token_mm->token_priority /= 2; | ||
83 | last_aging = global_faults; | ||
45 | } | 84 | } |
46 | 85 | ||
47 | if (mm != swap_token_mm) { | 86 | if (mm == swap_token_mm) { |
48 | if (current_interval < mm->last_interval) | ||
49 | mm->token_priority++; | ||
50 | else { | ||
51 | if (likely(mm->token_priority > 0)) | ||
52 | mm->token_priority--; | ||
53 | } | ||
54 | /* Check if we deserve the token */ | ||
55 | if (mm->token_priority > swap_token_mm->token_priority) { | ||
56 | mm->token_priority += 2; | ||
57 | swap_token_mm = mm; | ||
58 | } | ||
59 | } else { | ||
60 | /* Token holder came in again! */ | ||
61 | mm->token_priority += 2; | 87 | mm->token_priority += 2; |
88 | goto update_priority; | ||
89 | } | ||
90 | |||
91 | if (current_interval < mm->last_interval) | ||
92 | mm->token_priority++; | ||
93 | else { | ||
94 | if (likely(mm->token_priority > 0)) | ||
95 | mm->token_priority--; | ||
62 | } | 96 | } |
63 | 97 | ||
98 | /* Check if we deserve the token */ | ||
99 | if (mm->token_priority > swap_token_mm->token_priority) | ||
100 | goto replace_token; | ||
101 | |||
102 | update_priority: | ||
103 | trace_update_swap_token_priority(mm, old_prio, swap_token_mm); | ||
104 | |||
64 | out: | 105 | out: |
65 | mm->faultstamp = global_faults; | 106 | mm->faultstamp = global_faults; |
66 | mm->last_interval = current_interval; | 107 | mm->last_interval = current_interval; |
67 | spin_unlock(&swap_token_lock); | 108 | spin_unlock(&swap_token_lock); |
109 | return; | ||
110 | |||
111 | replace_token: | ||
112 | mm->token_priority += 2; | ||
113 | trace_replace_swap_token(swap_token_mm, mm); | ||
114 | swap_token_mm = mm; | ||
115 | swap_token_memcg = swap_token_memcg_from_mm(mm); | ||
116 | last_aging = global_faults; | ||
117 | goto out; | ||
68 | } | 118 | } |
69 | 119 | ||
70 | /* Called on process exit. */ | 120 | /* Called on process exit. */ |
71 | void __put_swap_token(struct mm_struct *mm) | 121 | void __put_swap_token(struct mm_struct *mm) |
72 | { | 122 | { |
73 | spin_lock(&swap_token_lock); | 123 | spin_lock(&swap_token_lock); |
74 | if (likely(mm == swap_token_mm)) | 124 | if (likely(mm == swap_token_mm)) { |
125 | trace_put_swap_token(swap_token_mm); | ||
75 | swap_token_mm = NULL; | 126 | swap_token_mm = NULL; |
127 | swap_token_memcg = NULL; | ||
128 | } | ||
76 | spin_unlock(&swap_token_lock); | 129 | spin_unlock(&swap_token_lock); |
77 | } | 130 | } |
131 | |||
132 | static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) | ||
133 | { | ||
134 | if (!a) | ||
135 | return true; | ||
136 | if (!b) | ||
137 | return true; | ||
138 | if (a == b) | ||
139 | return true; | ||
140 | return false; | ||
141 | } | ||
142 | |||
143 | void disable_swap_token(struct mem_cgroup *memcg) | ||
144 | { | ||
145 | /* memcg reclaim doesn't disable an unrelated mm's token. */ | ||
146 | if (match_memcg(memcg, swap_token_memcg)) { | ||
147 | spin_lock(&swap_token_lock); | ||
148 | if (match_memcg(memcg, swap_token_memcg)) { | ||
149 | trace_disable_swap_token(swap_token_mm); | ||
150 | swap_token_mm = NULL; | ||
151 | swap_token_memcg = NULL; | ||
152 | } | ||
153 | spin_unlock(&swap_token_lock); | ||
154 | } | ||
155 | } | ||
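The reworked grab_swap_token() above adds a priority-aging path: whenever more than TOKEN_AGING_INTERVAL global faults pass since the last aging, the current token owner's priority is halved, so a sleeping owner (which never takes long-interval faults itself) can still lose the token. A minimal userspace sketch of just that aging rule, with the locking, tracing, and memcg handling stripped out:

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 0xFF

static unsigned int token_priority = 64;	/* pretend token owner's priority */
static unsigned int global_faults, last_aging;

/* Called on every (simulated) page fault taken by any task. */
static void fault(void)
{
	global_faults++;
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		token_priority /= 2;	/* decay even if the owner is asleep */
		last_aging = global_faults;
	}
}

int main(void)
{
	for (int i = 0; i < 4 * (TOKEN_AGING_INTERVAL + 1); i++)
		fault();
	/* four aging intervals have elapsed, so the priority was halved 4 times */
	printf("token_priority = %u\n", token_priority);	/* prints 4 */
	return 0;
}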
diff --git a/mm/truncate.c b/mm/truncate.c index 3a29a6180212..232eb2736a79 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) | |||
199 | * The first pass will remove most pages, so the search cost of the second pass | 199 | * The first pass will remove most pages, so the search cost of the second pass |
200 | * is low. | 200 | * is low. |
201 | * | 201 | * |
202 | * When looking at page->index outside the page lock we need to be careful to | ||
203 | * copy it into a local to avoid races (it could change at any time). | ||
204 | * | ||
205 | * We pass down the cache-hot hint to the page freeing code. Even if the | 202 | * We pass down the cache-hot hint to the page freeing code. Even if the |
206 | * mapping is large, it is probably the case that the final pages are the most | 203 | * mapping is large, it is probably the case that the final pages are the most |
207 | * recently touched, and freeing happens in ascending file offset order. | 204 | * recently touched, and freeing happens in ascending file offset order. |
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
210 | loff_t lstart, loff_t lend) | 207 | loff_t lstart, loff_t lend) |
211 | { | 208 | { |
212 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 209 | const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
213 | pgoff_t end; | ||
214 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 210 | const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
215 | struct pagevec pvec; | 211 | struct pagevec pvec; |
216 | pgoff_t next; | 212 | pgoff_t index; |
213 | pgoff_t end; | ||
217 | int i; | 214 | int i; |
218 | 215 | ||
219 | cleancache_flush_inode(mapping); | 216 | cleancache_flush_inode(mapping); |
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
224 | end = (lend >> PAGE_CACHE_SHIFT); | 221 | end = (lend >> PAGE_CACHE_SHIFT); |
225 | 222 | ||
226 | pagevec_init(&pvec, 0); | 223 | pagevec_init(&pvec, 0); |
227 | next = start; | 224 | index = start; |
228 | while (next <= end && | 225 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
229 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 226 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
230 | mem_cgroup_uncharge_start(); | 227 | mem_cgroup_uncharge_start(); |
231 | for (i = 0; i < pagevec_count(&pvec); i++) { | 228 | for (i = 0; i < pagevec_count(&pvec); i++) { |
232 | struct page *page = pvec.pages[i]; | 229 | struct page *page = pvec.pages[i]; |
233 | pgoff_t page_index = page->index; | ||
234 | 230 | ||
235 | if (page_index > end) { | 231 | /* We rely upon deletion not changing page->index */ |
236 | next = page_index; | 232 | index = page->index; |
233 | if (index > end) | ||
237 | break; | 234 | break; |
238 | } | ||
239 | 235 | ||
240 | if (page_index > next) | ||
241 | next = page_index; | ||
242 | next++; | ||
243 | if (!trylock_page(page)) | 236 | if (!trylock_page(page)) |
244 | continue; | 237 | continue; |
238 | WARN_ON(page->index != index); | ||
245 | if (PageWriteback(page)) { | 239 | if (PageWriteback(page)) { |
246 | unlock_page(page); | 240 | unlock_page(page); |
247 | continue; | 241 | continue; |
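The rewritten loop above keeps a single 'index' cursor, clamps each pagevec_lookup() to min(end - index, PAGEVEC_SIZE - 1) + 1 entries so the lookup cannot overshoot the range, and advances the cursor from the page->index values the batch actually returned. A userspace sketch of that bounded, batched iteration pattern over a sorted set of indices (PAGEVEC_SIZE and the sample data are illustrative):

#include <stddef.h>
#include <stdio.h>

#define PAGEVEC_SIZE 14

/* Return up to 'nr' indices >= 'start' from a sorted "page cache". */
static size_t lookup(const unsigned long *cache, size_t n,
		     unsigned long start, unsigned long *out, size_t nr)
{
	size_t found = 0;

	for (size_t i = 0; i < n && found < nr; i++)
		if (cache[i] >= start)
			out[found++] = cache[i];
	return found;
}

int main(void)
{
	unsigned long cache[] = { 3, 4, 7, 20, 21, 22, 40, 90, 91, 200 };
	unsigned long index = 4, end = 95;	/* inclusive range to visit */
	unsigned long batch[PAGEVEC_SIZE];

	while (index <= end) {
		/* never ask for more entries than remain in [index, end] */
		size_t want = (end - index < PAGEVEC_SIZE - 1 ?
			       end - index : PAGEVEC_SIZE - 1) + 1;
		size_t got = lookup(cache, sizeof(cache) / sizeof(cache[0]),
				    index, batch, want);
		if (!got)
			break;
		for (size_t i = 0; i < got; i++) {
			index = batch[i];	/* cursor follows what was found */
			if (index > end)
				goto done;
			printf("visit index %lu\n", index);
		}
		index++;	/* next batch starts just past the last hit */
	}
done:
	return 0;
}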
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
252 | pagevec_release(&pvec); | 246 | pagevec_release(&pvec); |
253 | mem_cgroup_uncharge_end(); | 247 | mem_cgroup_uncharge_end(); |
254 | cond_resched(); | 248 | cond_resched(); |
249 | index++; | ||
255 | } | 250 | } |
256 | 251 | ||
257 | if (partial) { | 252 | if (partial) { |
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
264 | } | 259 | } |
265 | } | 260 | } |
266 | 261 | ||
267 | next = start; | 262 | index = start; |
268 | for ( ; ; ) { | 263 | for ( ; ; ) { |
269 | cond_resched(); | 264 | cond_resched(); |
270 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 265 | if (!pagevec_lookup(&pvec, mapping, index, |
271 | if (next == start) | 266 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
267 | if (index == start) | ||
272 | break; | 268 | break; |
273 | next = start; | 269 | index = start; |
274 | continue; | 270 | continue; |
275 | } | 271 | } |
276 | if (pvec.pages[0]->index > end) { | 272 | if (index == start && pvec.pages[0]->index > end) { |
277 | pagevec_release(&pvec); | 273 | pagevec_release(&pvec); |
278 | break; | 274 | break; |
279 | } | 275 | } |
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
281 | for (i = 0; i < pagevec_count(&pvec); i++) { | 277 | for (i = 0; i < pagevec_count(&pvec); i++) { |
282 | struct page *page = pvec.pages[i]; | 278 | struct page *page = pvec.pages[i]; |
283 | 279 | ||
284 | if (page->index > end) | 280 | /* We rely upon deletion not changing page->index */ |
281 | index = page->index; | ||
282 | if (index > end) | ||
285 | break; | 283 | break; |
284 | |||
286 | lock_page(page); | 285 | lock_page(page); |
286 | WARN_ON(page->index != index); | ||
287 | wait_on_page_writeback(page); | 287 | wait_on_page_writeback(page); |
288 | truncate_inode_page(mapping, page); | 288 | truncate_inode_page(mapping, page); |
289 | if (page->index > next) | ||
290 | next = page->index; | ||
291 | next++; | ||
292 | unlock_page(page); | 289 | unlock_page(page); |
293 | } | 290 | } |
294 | pagevec_release(&pvec); | 291 | pagevec_release(&pvec); |
295 | mem_cgroup_uncharge_end(); | 292 | mem_cgroup_uncharge_end(); |
293 | index++; | ||
296 | } | 294 | } |
297 | cleancache_flush_inode(mapping); | 295 | cleancache_flush_inode(mapping); |
298 | } | 296 | } |
@@ -304,6 +302,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range); | |||
304 | * @lstart: offset from which to truncate | 302 | * @lstart: offset from which to truncate |
305 | * | 303 | * |
306 | * Called under (and serialised by) inode->i_mutex. | 304 | * Called under (and serialised by) inode->i_mutex. |
305 | * | ||
306 | * Note: When this function returns, there can be a page in the process of | ||
307 | * deletion (inside __delete_from_page_cache()) in the specified range. Thus | ||
308 | * mapping->nrpages can be non-zero when this function returns even after | ||
309 | * truncation of the whole mapping. | ||
307 | */ | 310 | */ |
308 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | 311 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) |
309 | { | 312 | { |
@@ -328,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
328 | pgoff_t start, pgoff_t end) | 331 | pgoff_t start, pgoff_t end) |
329 | { | 332 | { |
330 | struct pagevec pvec; | 333 | struct pagevec pvec; |
331 | pgoff_t next = start; | 334 | pgoff_t index = start; |
332 | unsigned long ret; | 335 | unsigned long ret; |
333 | unsigned long count = 0; | 336 | unsigned long count = 0; |
334 | int i; | 337 | int i; |
335 | 338 | ||
336 | pagevec_init(&pvec, 0); | 339 | pagevec_init(&pvec, 0); |
337 | while (next <= end && | 340 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
338 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 341 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
339 | mem_cgroup_uncharge_start(); | 342 | mem_cgroup_uncharge_start(); |
340 | for (i = 0; i < pagevec_count(&pvec); i++) { | 343 | for (i = 0; i < pagevec_count(&pvec); i++) { |
341 | struct page *page = pvec.pages[i]; | 344 | struct page *page = pvec.pages[i]; |
342 | pgoff_t index; | ||
343 | int lock_failed; | ||
344 | |||
345 | lock_failed = !trylock_page(page); | ||
346 | 345 | ||
347 | /* | 346 | /* We rely upon deletion not changing page->index */ |
348 | * We really shouldn't be looking at the ->index of an | ||
349 | * unlocked page. But we're not allowed to lock these | ||
350 | * pages. So we rely upon nobody altering the ->index | ||
351 | * of this (pinned-by-us) page. | ||
352 | */ | ||
353 | index = page->index; | 347 | index = page->index; |
354 | if (index > next) | 348 | if (index > end) |
355 | next = index; | 349 | break; |
356 | next++; | ||
357 | if (lock_failed) | ||
358 | continue; | ||
359 | 350 | ||
351 | if (!trylock_page(page)) | ||
352 | continue; | ||
353 | WARN_ON(page->index != index); | ||
360 | ret = invalidate_inode_page(page); | 354 | ret = invalidate_inode_page(page); |
361 | unlock_page(page); | 355 | unlock_page(page); |
362 | /* | 356 | /* |
@@ -366,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
366 | if (!ret) | 360 | if (!ret) |
367 | deactivate_page(page); | 361 | deactivate_page(page); |
368 | count += ret; | 362 | count += ret; |
369 | if (next > end) | ||
370 | break; | ||
371 | } | 363 | } |
372 | pagevec_release(&pvec); | 364 | pagevec_release(&pvec); |
373 | mem_cgroup_uncharge_end(); | 365 | mem_cgroup_uncharge_end(); |
374 | cond_resched(); | 366 | cond_resched(); |
367 | index++; | ||
375 | } | 368 | } |
376 | return count; | 369 | return count; |
377 | } | 370 | } |
@@ -437,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
437 | pgoff_t start, pgoff_t end) | 430 | pgoff_t start, pgoff_t end) |
438 | { | 431 | { |
439 | struct pagevec pvec; | 432 | struct pagevec pvec; |
440 | pgoff_t next; | 433 | pgoff_t index; |
441 | int i; | 434 | int i; |
442 | int ret = 0; | 435 | int ret = 0; |
443 | int ret2 = 0; | 436 | int ret2 = 0; |
444 | int did_range_unmap = 0; | 437 | int did_range_unmap = 0; |
445 | int wrapped = 0; | ||
446 | 438 | ||
447 | cleancache_flush_inode(mapping); | 439 | cleancache_flush_inode(mapping); |
448 | pagevec_init(&pvec, 0); | 440 | pagevec_init(&pvec, 0); |
449 | next = start; | 441 | index = start; |
450 | while (next <= end && !wrapped && | 442 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
451 | pagevec_lookup(&pvec, mapping, next, | 443 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
452 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | ||
453 | mem_cgroup_uncharge_start(); | 444 | mem_cgroup_uncharge_start(); |
454 | for (i = 0; i < pagevec_count(&pvec); i++) { | 445 | for (i = 0; i < pagevec_count(&pvec); i++) { |
455 | struct page *page = pvec.pages[i]; | 446 | struct page *page = pvec.pages[i]; |
456 | pgoff_t page_index; | 447 | |
448 | /* We rely upon deletion not changing page->index */ | ||
449 | index = page->index; | ||
450 | if (index > end) | ||
451 | break; | ||
457 | 452 | ||
458 | lock_page(page); | 453 | lock_page(page); |
454 | WARN_ON(page->index != index); | ||
459 | if (page->mapping != mapping) { | 455 | if (page->mapping != mapping) { |
460 | unlock_page(page); | 456 | unlock_page(page); |
461 | continue; | 457 | continue; |
462 | } | 458 | } |
463 | page_index = page->index; | ||
464 | next = page_index + 1; | ||
465 | if (next == 0) | ||
466 | wrapped = 1; | ||
467 | if (page_index > end) { | ||
468 | unlock_page(page); | ||
469 | break; | ||
470 | } | ||
471 | wait_on_page_writeback(page); | 459 | wait_on_page_writeback(page); |
472 | if (page_mapped(page)) { | 460 | if (page_mapped(page)) { |
473 | if (!did_range_unmap) { | 461 | if (!did_range_unmap) { |
@@ -475,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
475 | * Zap the rest of the file in one hit. | 463 | * Zap the rest of the file in one hit. |
476 | */ | 464 | */ |
477 | unmap_mapping_range(mapping, | 465 | unmap_mapping_range(mapping, |
478 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | 466 | (loff_t)index << PAGE_CACHE_SHIFT, |
479 | (loff_t)(end - page_index + 1) | 467 | (loff_t)(1 + end - index) |
480 | << PAGE_CACHE_SHIFT, | 468 | << PAGE_CACHE_SHIFT, |
481 | 0); | 469 | 0); |
482 | did_range_unmap = 1; | 470 | did_range_unmap = 1; |
483 | } else { | 471 | } else { |
@@ -485,8 +473,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
485 | * Just zap this page | 473 | * Just zap this page |
486 | */ | 474 | */ |
487 | unmap_mapping_range(mapping, | 475 | unmap_mapping_range(mapping, |
488 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | 476 | (loff_t)index << PAGE_CACHE_SHIFT, |
489 | PAGE_CACHE_SIZE, 0); | 477 | PAGE_CACHE_SIZE, 0); |
490 | } | 478 | } |
491 | } | 479 | } |
492 | BUG_ON(page_mapped(page)); | 480 | BUG_ON(page_mapped(page)); |
@@ -502,6 +490,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
502 | pagevec_release(&pvec); | 490 | pagevec_release(&pvec); |
503 | mem_cgroup_uncharge_end(); | 491 | mem_cgroup_uncharge_end(); |
504 | cond_resched(); | 492 | cond_resched(); |
493 | index++; | ||
505 | } | 494 | } |
506 | cleancache_flush_inode(mapping); | 495 | cleancache_flush_inode(mapping); |
507 | return ret; | 496 | return ret; |
@@ -526,8 +515,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
526 | /** | 515 | /** |
527 | * truncate_pagecache - unmap and remove pagecache that has been truncated | 516 | * truncate_pagecache - unmap and remove pagecache that has been truncated |
528 | * @inode: inode | 517 | * @inode: inode |
529 | * @old: old file offset | 518 | * @oldsize: old file size |
530 | * @new: new file offset | 519 | * @newsize: new file size |
531 | * | 520 | * |
532 | * inode's new i_size must already be written before truncate_pagecache | 521 | * inode's new i_size must already be written before truncate_pagecache |
533 | * is called. | 522 | * is called. |
@@ -539,9 +528,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
539 | * situations such as writepage being called for a page that has already | 528 | * situations such as writepage being called for a page that has already |
540 | * had its underlying blocks deallocated. | 529 | * had its underlying blocks deallocated. |
541 | */ | 530 | */ |
542 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | 531 | void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) |
543 | { | 532 | { |
544 | struct address_space *mapping = inode->i_mapping; | 533 | struct address_space *mapping = inode->i_mapping; |
534 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | ||
545 | 535 | ||
546 | /* | 536 | /* |
547 | * unmap_mapping_range is called twice, first simply for | 537 | * unmap_mapping_range is called twice, first simply for |
@@ -552,9 +542,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | |||
552 | * truncate_inode_pages finishes, hence the second | 542 | * truncate_inode_pages finishes, hence the second |
553 | * unmap_mapping_range call must be made for correctness. | 543 | * unmap_mapping_range call must be made for correctness. |
554 | */ | 544 | */ |
555 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 545 | unmap_mapping_range(mapping, holebegin, 0, 1); |
556 | truncate_inode_pages(mapping, new); | 546 | truncate_inode_pages(mapping, newsize); |
557 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 547 | unmap_mapping_range(mapping, holebegin, 0, 1); |
558 | } | 548 | } |
559 | EXPORT_SYMBOL(truncate_pagecache); | 549 | EXPORT_SYMBOL(truncate_pagecache); |
560 | 550 | ||
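truncate_pagecache() now computes 'holebegin' by rounding the new size up to a page boundary and passes that to unmap_mapping_range(), which makes the start of the unmapped hole explicit instead of relying on the old 'new + PAGE_SIZE - 1' expression. A quick userspace check of that rounding arithmetic (the macro below has the usual power-of-two round_up shape and is shown only for illustration):

#include <stdio.h>

#define PAGE_SIZE	4096UL
/* power-of-two round up, same shape as the kernel's round_up() */
#define round_up(x, y)	((((x) - 1) | ((y) - 1)) + 1)

int main(void)
{
	unsigned long sizes[] = { 0, 1, 4095, 4096, 4097, 12345 };

	for (int i = 0; i < 6; i++)
		printf("newsize %5lu -> holebegin %5lu\n",
		       sizes[i], round_up(sizes[i], PAGE_SIZE));
	return 0;
}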
@@ -584,22 +574,47 @@ EXPORT_SYMBOL(truncate_setsize); | |||
584 | /** | 574 | /** |
585 | * vmtruncate - unmap mappings "freed" by truncate() syscall | 575 | * vmtruncate - unmap mappings "freed" by truncate() syscall |
586 | * @inode: inode of the file used | 576 | * @inode: inode of the file used |
587 | * @offset: file offset to start truncating | 577 | * @newsize: file offset to start truncating |
588 | * | 578 | * |
589 | * This function is deprecated and truncate_setsize or truncate_pagecache | 579 | * This function is deprecated and truncate_setsize or truncate_pagecache |
590 | * should be used instead, together with filesystem specific block truncation. | 580 | * should be used instead, together with filesystem specific block truncation. |
591 | */ | 581 | */ |
592 | int vmtruncate(struct inode *inode, loff_t offset) | 582 | int vmtruncate(struct inode *inode, loff_t newsize) |
593 | { | 583 | { |
594 | int error; | 584 | int error; |
595 | 585 | ||
596 | error = inode_newsize_ok(inode, offset); | 586 | error = inode_newsize_ok(inode, newsize); |
597 | if (error) | 587 | if (error) |
598 | return error; | 588 | return error; |
599 | 589 | ||
600 | truncate_setsize(inode, offset); | 590 | truncate_setsize(inode, newsize); |
601 | if (inode->i_op->truncate) | 591 | if (inode->i_op->truncate) |
602 | inode->i_op->truncate(inode); | 592 | inode->i_op->truncate(inode); |
603 | return 0; | 593 | return 0; |
604 | } | 594 | } |
605 | EXPORT_SYMBOL(vmtruncate); | 595 | EXPORT_SYMBOL(vmtruncate); |
596 | |||
597 | int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
598 | { | ||
599 | struct address_space *mapping = inode->i_mapping; | ||
600 | loff_t holebegin = round_up(lstart, PAGE_SIZE); | ||
601 | loff_t holelen = 1 + lend - holebegin; | ||
602 | |||
603 | /* | ||
604 | * If the underlying filesystem is not going to provide | ||
605 | * a way to truncate a range of blocks (punch a hole) - | ||
606 | * we should return failure right now. | ||
607 | */ | ||
608 | if (!inode->i_op->truncate_range) | ||
609 | return -ENOSYS; | ||
610 | |||
611 | mutex_lock(&inode->i_mutex); | ||
612 | inode_dio_wait(inode); | ||
613 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
614 | inode->i_op->truncate_range(inode, lstart, lend); | ||
615 | /* unmap again to remove racily COWed private pages */ | ||
616 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
617 | mutex_unlock(&inode->i_mutex); | ||
618 | |||
619 | return 0; | ||
620 | } | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d34d75366a7..ab8494cde007 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -452,13 +452,6 @@ overflow: | |||
452 | return ERR_PTR(-EBUSY); | 452 | return ERR_PTR(-EBUSY); |
453 | } | 453 | } |
454 | 454 | ||
455 | static void rcu_free_va(struct rcu_head *head) | ||
456 | { | ||
457 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
458 | |||
459 | kfree(va); | ||
460 | } | ||
461 | |||
462 | static void __free_vmap_area(struct vmap_area *va) | 455 | static void __free_vmap_area(struct vmap_area *va) |
463 | { | 456 | { |
464 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | 457 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); |
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va) | |||
491 | if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) | 484 | if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) |
492 | vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); | 485 | vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); |
493 | 486 | ||
494 | call_rcu(&va->rcu_head, rcu_free_va); | 487 | kfree_rcu(va, rcu_head); |
495 | } | 488 | } |
496 | 489 | ||
497 | /* | 490 | /* |
@@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
837 | return vb; | 830 | return vb; |
838 | } | 831 | } |
839 | 832 | ||
840 | static void rcu_free_vb(struct rcu_head *head) | ||
841 | { | ||
842 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
843 | |||
844 | kfree(vb); | ||
845 | } | ||
846 | |||
847 | static void free_vmap_block(struct vmap_block *vb) | 833 | static void free_vmap_block(struct vmap_block *vb) |
848 | { | 834 | { |
849 | struct vmap_block *tmp; | 835 | struct vmap_block *tmp; |
@@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb) | |||
856 | BUG_ON(tmp != vb); | 842 | BUG_ON(tmp != vb); |
857 | 843 | ||
858 | free_vmap_area_noflush(vb->va); | 844 | free_vmap_area_noflush(vb->va); |
859 | call_rcu(&vb->rcu_head, rcu_free_vb); | 845 | kfree_rcu(vb, rcu_head); |
860 | } | 846 | } |
861 | 847 | ||
862 | static void purge_fragmented_blocks(int cpu) | 848 | static void purge_fragmented_blocks(int cpu) |
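Both vmalloc.c hunks above replace a call_rcu() callback whose only job was to kfree() the containing structure with kfree_rcu(), which takes the pointer and the name of its rcu_head member. A kernel-style before/after sketch of that conversion ('struct thing' is illustrative):

struct thing {
	struct rcu_head rcu_head;
	/* ... payload ... */
};

/* Before: a dedicated callback existed only to call kfree(). */
static void thing_rcu_free(struct rcu_head *head)
{
	kfree(container_of(head, struct thing, rcu_head));
}
/* ... call_rcu(&t->rcu_head, thing_rcu_free); ... */

/* After: kfree_rcu() frees after a grace period, no callback needed. */
static void thing_free(struct thing *t)
{
	kfree_rcu(t, rcu_head);
}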
diff --git a/mm/vmscan.c b/mm/vmscan.c index faa0a088f9cc..febbc044e792 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink, | |||
250 | unsigned long long delta; | 250 | unsigned long long delta; |
251 | unsigned long total_scan; | 251 | unsigned long total_scan; |
252 | unsigned long max_pass; | 252 | unsigned long max_pass; |
253 | int shrink_ret = 0; | ||
254 | long nr; | ||
255 | long new_nr; | ||
256 | long batch_size = shrinker->batch ? shrinker->batch | ||
257 | : SHRINK_BATCH; | ||
253 | 258 | ||
259 | /* | ||
260 | * copy the current shrinker scan count into a local variable | ||
261 | * and zero it so that other concurrent shrinker invocations | ||
262 | * don't also do this scanning work. | ||
263 | */ | ||
264 | do { | ||
265 | nr = shrinker->nr; | ||
266 | } while (cmpxchg(&shrinker->nr, nr, 0) != nr); | ||
267 | |||
268 | total_scan = nr; | ||
254 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); | 269 | max_pass = do_shrinker_shrink(shrinker, shrink, 0); |
255 | delta = (4 * nr_pages_scanned) / shrinker->seeks; | 270 | delta = (4 * nr_pages_scanned) / shrinker->seeks; |
256 | delta *= max_pass; | 271 | delta *= max_pass; |
257 | do_div(delta, lru_pages + 1); | 272 | do_div(delta, lru_pages + 1); |
258 | shrinker->nr += delta; | 273 | total_scan += delta; |
259 | if (shrinker->nr < 0) { | 274 | if (total_scan < 0) { |
260 | printk(KERN_ERR "shrink_slab: %pF negative objects to " | 275 | printk(KERN_ERR "shrink_slab: %pF negative objects to " |
261 | "delete nr=%ld\n", | 276 | "delete nr=%ld\n", |
262 | shrinker->shrink, shrinker->nr); | 277 | shrinker->shrink, total_scan); |
263 | shrinker->nr = max_pass; | 278 | total_scan = max_pass; |
264 | } | 279 | } |
265 | 280 | ||
266 | /* | 281 | /* |
282 | * We need to avoid excessive windup on filesystem shrinkers | ||
283 | * due to large numbers of GFP_NOFS allocations causing the | ||
284 | * shrinkers to return -1 all the time. This results in a large | ||
285 | * nr being built up, so when a shrink that can do some work | ||
286 | * comes along it empties the entire cache due to nr >>> | ||
287 | * max_pass. This is bad for sustaining a working set in | ||
288 | * memory. | ||
289 | * | ||
290 | * Hence only allow the shrinker to scan the entire cache when | ||
291 | * a large delta change is calculated directly. | ||
292 | */ | ||
293 | if (delta < max_pass / 4) | ||
294 | total_scan = min(total_scan, max_pass / 2); | ||
295 | |||
296 | /* | ||
267 | * Avoid risking looping forever due to too large nr value: | 297 | * Avoid risking looping forever due to too large nr value: |
268 | * never try to free more than twice the estimate number of | 298 | * never try to free more than twice the estimate number of |
269 | * freeable entries. | 299 | * freeable entries. |
270 | */ | 300 | */ |
271 | if (shrinker->nr > max_pass * 2) | 301 | if (total_scan > max_pass * 2) |
272 | shrinker->nr = max_pass * 2; | 302 | total_scan = max_pass * 2; |
273 | 303 | ||
274 | total_scan = shrinker->nr; | 304 | trace_mm_shrink_slab_start(shrinker, shrink, nr, |
275 | shrinker->nr = 0; | 305 | nr_pages_scanned, lru_pages, |
306 | max_pass, delta, total_scan); | ||
276 | 307 | ||
277 | while (total_scan >= SHRINK_BATCH) { | 308 | while (total_scan >= batch_size) { |
278 | long this_scan = SHRINK_BATCH; | ||
279 | int shrink_ret; | ||
280 | int nr_before; | 309 | int nr_before; |
281 | 310 | ||
282 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); | 311 | nr_before = do_shrinker_shrink(shrinker, shrink, 0); |
283 | shrink_ret = do_shrinker_shrink(shrinker, shrink, | 312 | shrink_ret = do_shrinker_shrink(shrinker, shrink, |
284 | this_scan); | 313 | batch_size); |
285 | if (shrink_ret == -1) | 314 | if (shrink_ret == -1) |
286 | break; | 315 | break; |
287 | if (shrink_ret < nr_before) | 316 | if (shrink_ret < nr_before) |
288 | ret += nr_before - shrink_ret; | 317 | ret += nr_before - shrink_ret; |
289 | count_vm_events(SLABS_SCANNED, this_scan); | 318 | count_vm_events(SLABS_SCANNED, batch_size); |
290 | total_scan -= this_scan; | 319 | total_scan -= batch_size; |
291 | 320 | ||
292 | cond_resched(); | 321 | cond_resched(); |
293 | } | 322 | } |
294 | 323 | ||
295 | shrinker->nr += total_scan; | 324 | /* |
325 | * move the unused scan count back into the shrinker in a | ||
326 | * manner that handles concurrent updates. If we exhausted the | ||
327 | * scan, there is no need to do an update. | ||
328 | */ | ||
329 | do { | ||
330 | nr = shrinker->nr; | ||
331 | new_nr = total_scan + nr; | ||
332 | if (total_scan <= 0) | ||
333 | break; | ||
334 | } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); | ||
335 | |||
336 | trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); | ||
296 | } | 337 | } |
297 | up_read(&shrinker_rwsem); | 338 | up_read(&shrinker_rwsem); |
298 | out: | 339 | out: |
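The reworked shrink_slab() above atomically claims shrinker->nr with a cmpxchg() loop, scans in shrinker->batch-sized chunks, and then merges any unscanned remainder back with a second cmpxchg() loop, so concurrent callers neither lose deferred work nor scan the same count twice. A userspace sketch of that claim/work/return-remainder pattern using C11 atomics (single-threaded demo; the delta and windup-clamping logic is omitted):

#include <stdatomic.h>
#include <stdio.h>

#define SHRINK_BATCH 128

static _Atomic long shrinker_nr = 1000;	/* deferred work shared by all callers */

/* Pretend to scan 'nr' objects and report how many were processed. */
static long do_scan(long nr)
{
	return nr;
}

static void shrink(void)
{
	long nr, total_scan;

	/* claim the whole deferred count so concurrent callers don't rescan it */
	do {
		nr = atomic_load(&shrinker_nr);
	} while (!atomic_compare_exchange_weak(&shrinker_nr, &nr, 0));

	total_scan = nr;
	while (total_scan >= SHRINK_BATCH)
		total_scan -= do_scan(SHRINK_BATCH);

	/* hand the unscanned remainder back for the next caller */
	if (total_scan > 0) {
		long old;
		do {
			old = atomic_load(&shrinker_nr);
		} while (!atomic_compare_exchange_weak(&shrinker_nr, &old,
						       old + total_scan));
	}
	printf("claimed %ld, leftover %ld, shared counter now %ld\n",
	       nr, total_scan, atomic_load(&shrinker_nr));
}

int main(void)
{
	shrink();
	return 0;
}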
@@ -1124,8 +1165,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1124 | nr_lumpy_dirty++; | 1165 | nr_lumpy_dirty++; |
1125 | scan++; | 1166 | scan++; |
1126 | } else { | 1167 | } else { |
1127 | /* the page is freed already. */ | 1168 | /* |
1128 | if (!page_count(cursor_page)) | 1169 | * Check if the page is freed already. |
1170 | * | ||
1171 | * We can't use page_count() as that | ||
1172 | * requires compound_head and we don't | ||
1173 | * have a pin on the page here. If a | ||
1174 | * page is tail, we may or may not | ||
1175 | * have isolated the head, so assume | ||
1176 | * it's not free; it'd be tricky to | ||
1177 | * track the head status without a | ||
1178 | * page pin. | ||
1179 | */ | ||
1180 | if (!PageTail(cursor_page) && | ||
1181 | !atomic_read(&cursor_page->_count)) | ||
1129 | continue; | 1182 | continue; |
1130 | break; | 1183 | break; |
1131 | } | 1184 | } |
@@ -1983,14 +2036,13 @@ restart: | |||
1983 | * If a zone is deemed to be full of pinned pages then just give it a light | 2036 | * If a zone is deemed to be full of pinned pages then just give it a light |
1984 | * scan then give up on it. | 2037 | * scan then give up on it. |
1985 | */ | 2038 | */ |
1986 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | 2039 | static void shrink_zones(int priority, struct zonelist *zonelist, |
1987 | struct scan_control *sc) | 2040 | struct scan_control *sc) |
1988 | { | 2041 | { |
1989 | struct zoneref *z; | 2042 | struct zoneref *z; |
1990 | struct zone *zone; | 2043 | struct zone *zone; |
1991 | unsigned long nr_soft_reclaimed; | 2044 | unsigned long nr_soft_reclaimed; |
1992 | unsigned long nr_soft_scanned; | 2045 | unsigned long nr_soft_scanned; |
1993 | unsigned long total_scanned = 0; | ||
1994 | 2046 | ||
1995 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2047 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1996 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2048 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2005,19 +2057,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
2005 | continue; | 2057 | continue; |
2006 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2058 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2007 | continue; /* Let kswapd poll it */ | 2059 | continue; /* Let kswapd poll it */ |
2060 | /* | ||
2061 | * This steals pages from memory cgroups over softlimit | ||
2062 | * and returns the number of reclaimed pages and | ||
2063 | * scanned pages. This works for global memory pressure | ||
2064 | * and balancing, not for a memcg's limit. | ||
2065 | */ | ||
2066 | nr_soft_scanned = 0; | ||
2067 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2068 | sc->order, sc->gfp_mask, | ||
2069 | &nr_soft_scanned); | ||
2070 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2071 | sc->nr_scanned += nr_soft_scanned; | ||
2072 | /* need some check to avoid more shrink_zone() calls */ | ||
2008 | } | 2073 | } |
2009 | 2074 | ||
2010 | nr_soft_scanned = 0; | ||
2011 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2012 | sc->order, sc->gfp_mask, | ||
2013 | &nr_soft_scanned); | ||
2014 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2015 | total_scanned += nr_soft_scanned; | ||
2016 | |||
2017 | shrink_zone(priority, zone, sc); | 2075 | shrink_zone(priority, zone, sc); |
2018 | } | 2076 | } |
2019 | |||
2020 | return total_scanned; | ||
2021 | } | 2077 | } |
2022 | 2078 | ||
2023 | static bool zone_reclaimable(struct zone *zone) | 2079 | static bool zone_reclaimable(struct zone *zone) |
@@ -2081,8 +2137,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2081 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2137 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2082 | sc->nr_scanned = 0; | 2138 | sc->nr_scanned = 0; |
2083 | if (!priority) | 2139 | if (!priority) |
2084 | disable_swap_token(); | 2140 | disable_swap_token(sc->mem_cgroup); |
2085 | total_scanned += shrink_zones(priority, zonelist, sc); | 2141 | shrink_zones(priority, zonelist, sc); |
2086 | /* | 2142 | /* |
2087 | * Don't shrink slabs when reclaiming memory from | 2143 | * Don't shrink slabs when reclaiming memory from |
2088 | * over limit cgroups | 2144 | * over limit cgroups |
@@ -2295,7 +2351,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2295 | for (i = 0; i <= classzone_idx; i++) | 2351 | for (i = 0; i <= classzone_idx; i++) |
2296 | present_pages += pgdat->node_zones[i].present_pages; | 2352 | present_pages += pgdat->node_zones[i].present_pages; |
2297 | 2353 | ||
2298 | return balanced_pages > (present_pages >> 2); | 2354 | /* A special case here: if the zone has no pages, we consider it balanced */
2355 | return balanced_pages >= (present_pages >> 2); | ||
2299 | } | 2356 | } |
2300 | 2357 | ||
2301 | /* is kswapd sleeping prematurely? */ | 2358 | /* is kswapd sleeping prematurely? */ |
@@ -2311,7 +2368,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2311 | return true; | 2368 | return true; |
2312 | 2369 | ||
2313 | /* Check the watermark levels */ | 2370 | /* Check the watermark levels */ |
2314 | for (i = 0; i < pgdat->nr_zones; i++) { | 2371 | for (i = 0; i <= classzone_idx; i++) { |
2315 | struct zone *zone = pgdat->node_zones + i; | 2372 | struct zone *zone = pgdat->node_zones + i; |
2316 | 2373 | ||
2317 | if (!populated_zone(zone)) | 2374 | if (!populated_zone(zone)) |
@@ -2329,7 +2386,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2329 | } | 2386 | } |
2330 | 2387 | ||
2331 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), | 2388 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2332 | classzone_idx, 0)) | 2389 | i, 0)) |
2333 | all_zones_ok = false; | 2390 | all_zones_ok = false; |
2334 | else | 2391 | else |
2335 | balanced += zone->present_pages; | 2392 | balanced += zone->present_pages; |
@@ -2407,7 +2464,7 @@ loop_again: | |||
2407 | 2464 | ||
2408 | /* The swap token gets in the way of swapout... */ | 2465 | /* The swap token gets in the way of swapout... */ |
2409 | if (!priority) | 2466 | if (!priority) |
2410 | disable_swap_token(); | 2467 | disable_swap_token(NULL); |
2411 | 2468 | ||
2412 | all_zones_ok = 1; | 2469 | all_zones_ok = 1; |
2413 | balanced = 0; | 2470 | balanced = 0; |
@@ -2436,7 +2493,6 @@ loop_again: | |||
2436 | if (!zone_watermark_ok_safe(zone, order, | 2493 | if (!zone_watermark_ok_safe(zone, order, |
2437 | high_wmark_pages(zone), 0, 0)) { | 2494 | high_wmark_pages(zone), 0, 0)) { |
2438 | end_zone = i; | 2495 | end_zone = i; |
2439 | *classzone_idx = i; | ||
2440 | break; | 2496 | break; |
2441 | } | 2497 | } |
2442 | } | 2498 | } |
@@ -2495,18 +2551,18 @@ loop_again: | |||
2495 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2551 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2496 | if (!zone_watermark_ok_safe(zone, order, | 2552 | if (!zone_watermark_ok_safe(zone, order, |
2497 | high_wmark_pages(zone) + balance_gap, | 2553 | high_wmark_pages(zone) + balance_gap, |
2498 | end_zone, 0)) | 2554 | end_zone, 0)) { |
2499 | shrink_zone(priority, zone, &sc); | 2555 | shrink_zone(priority, zone, &sc); |
2500 | reclaim_state->reclaimed_slab = 0; | ||
2501 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | ||
2502 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2503 | total_scanned += sc.nr_scanned; | ||
2504 | 2556 | ||
2505 | if (zone->all_unreclaimable) | 2557 | reclaim_state->reclaimed_slab = 0; |
2506 | continue; | 2558 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
2507 | if (nr_slab == 0 && | 2559 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2508 | !zone_reclaimable(zone)) | 2560 | total_scanned += sc.nr_scanned; |
2509 | zone->all_unreclaimable = 1; | 2561 | |
2562 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2563 | zone->all_unreclaimable = 1; | ||
2564 | } | ||
2565 | |||
2510 | /* | 2566 | /* |
2511 | * If we've done a decent amount of scanning and | 2567 | * If we've done a decent amount of scanning and |
2512 | * the reclaim ratio is low, start doing writepage | 2568 | * the reclaim ratio is low, start doing writepage |
@@ -2516,6 +2572,12 @@ loop_again: | |||
2516 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2572 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2517 | sc.may_writepage = 1; | 2573 | sc.may_writepage = 1; |
2518 | 2574 | ||
2575 | if (zone->all_unreclaimable) { | ||
2576 | if (end_zone && end_zone == i) | ||
2577 | end_zone--; | ||
2578 | continue; | ||
2579 | } | ||
2580 | |||
2519 | if (!zone_watermark_ok_safe(zone, order, | 2581 | if (!zone_watermark_ok_safe(zone, order, |
2520 | high_wmark_pages(zone), end_zone, 0)) { | 2582 | high_wmark_pages(zone), end_zone, 0)) { |
2521 | all_zones_ok = 0; | 2583 | all_zones_ok = 0; |
@@ -2694,8 +2756,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2694 | */ | 2756 | */ |
2695 | static int kswapd(void *p) | 2757 | static int kswapd(void *p) |
2696 | { | 2758 | { |
2697 | unsigned long order; | 2759 | unsigned long order, new_order; |
2698 | int classzone_idx; | 2760 | int classzone_idx, new_classzone_idx; |
2699 | pg_data_t *pgdat = (pg_data_t*)p; | 2761 | pg_data_t *pgdat = (pg_data_t*)p; |
2700 | struct task_struct *tsk = current; | 2762 | struct task_struct *tsk = current; |
2701 | 2763 | ||
@@ -2725,17 +2787,23 @@ static int kswapd(void *p) | |||
2725 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 2787 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
2726 | set_freezable(); | 2788 | set_freezable(); |
2727 | 2789 | ||
2728 | order = 0; | 2790 | order = new_order = 0; |
2729 | classzone_idx = MAX_NR_ZONES - 1; | 2791 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2730 | for ( ; ; ) { | 2792 | for ( ; ; ) { |
2731 | unsigned long new_order; | ||
2732 | int new_classzone_idx; | ||
2733 | int ret; | 2793 | int ret; |
2734 | 2794 | ||
2735 | new_order = pgdat->kswapd_max_order; | 2795 | /* |
2736 | new_classzone_idx = pgdat->classzone_idx; | 2796 | * If the last balance_pgdat was unsuccessful it's unlikely a |
2737 | pgdat->kswapd_max_order = 0; | 2797 | * new request of a similar or harder type will succeed soon |
2738 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | 2798 | * so consider going to sleep on the basis we reclaimed at |
2799 | */ | ||
2800 | if (classzone_idx >= new_classzone_idx && order == new_order) { | ||
2801 | new_order = pgdat->kswapd_max_order; | ||
2802 | new_classzone_idx = pgdat->classzone_idx; | ||
2803 | pgdat->kswapd_max_order = 0; | ||
2804 | pgdat->classzone_idx = pgdat->nr_zones - 1; | ||
2805 | } | ||
2806 | |||
2739 | if (order < new_order || classzone_idx > new_classzone_idx) { | 2807 | if (order < new_order || classzone_idx > new_classzone_idx) { |
2740 | /* | 2808 | /* |
2741 | * Don't sleep if someone wants a larger 'order' | 2809 | * Don't sleep if someone wants a larger 'order' |
@@ -2748,7 +2816,7 @@ static int kswapd(void *p) | |||
2748 | order = pgdat->kswapd_max_order; | 2816 | order = pgdat->kswapd_max_order; |
2749 | classzone_idx = pgdat->classzone_idx; | 2817 | classzone_idx = pgdat->classzone_idx; |
2750 | pgdat->kswapd_max_order = 0; | 2818 | pgdat->kswapd_max_order = 0; |
2751 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | 2819 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2752 | } | 2820 | } |
2753 | 2821 | ||
2754 | ret = try_to_freeze(); | 2822 | ret = try_to_freeze(); |