Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   2
-rw-r--r--  mm/backing-dev.c     |  85
-rw-r--r--  mm/dmapool.c         |   2
-rw-r--r--  mm/failslab.c        |  31
-rw-r--r--  mm/filemap.c         |  23
-rw-r--r--  mm/huge_memory.c     |   6
-rw-r--r--  mm/hugetlb.c         |  41
-rw-r--r--  mm/init-mm.c         |   2
-rw-r--r--  mm/kmemleak.c        |   2
-rw-r--r--  mm/madvise.c         |   2
-rw-r--r--  mm/memblock.c        |   8
-rw-r--r--  mm/memcontrol.c      | 573
-rw-r--r--  mm/memory.c          | 125
-rw-r--r--  mm/memory_hotplug.c  |  68
-rw-r--r--  mm/mempolicy.c       |  16
-rw-r--r--  mm/mmap.c            |  34
-rw-r--r--  mm/nommu.c           |  37
-rw-r--r--  mm/oom_kill.c        |   5
-rw-r--r--  mm/page-writeback.c  | 291
-rw-r--r--  mm/page_alloc.c      | 163
-rw-r--r--  mm/page_cgroup.c     |  10
-rw-r--r--  mm/pagewalk.c        |  49
-rw-r--r--  mm/rmap.c            |  11
-rw-r--r--  mm/shmem.c           | 558
-rw-r--r--  mm/slab.c            |  24
-rw-r--r--  mm/slob.c            |   8
-rw-r--r--  mm/slub.c            | 871
-rw-r--r--  mm/sparse.c          |   2
-rw-r--r--  mm/swapfile.c        |  29
-rw-r--r--  mm/thrash.c          |  17
-rw-r--r--  mm/truncate.c        | 146
-rw-r--r--  mm/vmalloc.c         |  20
-rw-r--r--  mm/vmscan.c          | 150
33 files changed, 2224 insertions, 1187 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ca47a5ee9c8..f2f1ca19ed53 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -356,7 +356,7 @@ config CLEANCACHE
356 for clean pages that the kernel's pageframe replacement algorithm 356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough 357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use 358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancacne code to put the data contained in that page into 359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or 360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly 361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled 362 time-varying size. And when a cleancache-enabled
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09a..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
45static int bdi_sync_supers(void *); 45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long); 46static void sync_supers_timer_fn(unsigned long);
47 47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{
50 if (wb1 < wb2) {
51 spin_lock(&wb1->list_lock);
52 spin_lock_nested(&wb2->list_lock, 1);
53 } else {
54 spin_lock(&wb2->list_lock);
55 spin_lock_nested(&wb1->list_lock, 1);
56 }
57}
58
48#ifdef CONFIG_DEBUG_FS 59#ifdef CONFIG_DEBUG_FS
49#include <linux/debugfs.h> 60#include <linux/debugfs.h>
50#include <linux/seq_file.h> 61#include <linux/seq_file.h>
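The bdi_lock_two() helper added above takes the two per-wb list_locks in address order, so two tasks locking the same pair from opposite ends can never deadlock in an ABBA pattern (spin_lock_nested() only tells lockdep the second acquisition of the same class is intentional). A minimal userspace sketch of the same ordering idiom, with pthread mutexes standing in for the kernel spinlocks and the helper names invented here:

#include <pthread.h>

/*
 * Lock two mutexes of the same class without risking ABBA deadlock:
 * every caller takes the lower-addressed lock first, so all callers
 * agree on the acquisition order.
 */
static void lock_two_by_address(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void unlock_two(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_two_by_address(&m1, &m2);
	unlock_two(&m1, &m2);
	return 0;
}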
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
67 struct inode *inode; 78 struct inode *inode;
68 79
69 nr_dirty = nr_io = nr_more_io = 0; 80 nr_dirty = nr_io = nr_more_io = 0;
70 spin_lock(&inode_wb_list_lock); 81 spin_lock(&wb->list_lock);
71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list) 82 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
72 nr_dirty++; 83 nr_dirty++;
73 list_for_each_entry(inode, &wb->b_io, i_wb_list) 84 list_for_each_entry(inode, &wb->b_io, i_wb_list)
74 nr_io++; 85 nr_io++;
75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list) 86 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
76 nr_more_io++; 87 nr_more_io++;
77 spin_unlock(&inode_wb_list_lock); 88 spin_unlock(&wb->list_lock);
78 89
79 global_dirty_limits(&background_thresh, &dirty_thresh); 90 global_dirty_limits(&background_thresh, &dirty_thresh);
80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 91 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
81 92
82#define K(x) ((x) << (PAGE_SHIFT - 10)) 93#define K(x) ((x) << (PAGE_SHIFT - 10))
83 seq_printf(m, 94 seq_printf(m,
84 "BdiWriteback: %8lu kB\n" 95 "BdiWriteback: %10lu kB\n"
85 "BdiReclaimable: %8lu kB\n" 96 "BdiReclaimable: %10lu kB\n"
86 "BdiDirtyThresh: %8lu kB\n" 97 "BdiDirtyThresh: %10lu kB\n"
87 "DirtyThresh: %8lu kB\n" 98 "DirtyThresh: %10lu kB\n"
88 "BackgroundThresh: %8lu kB\n" 99 "BackgroundThresh: %10lu kB\n"
89 "b_dirty: %8lu\n" 100 "BdiWritten: %10lu kB\n"
90 "b_io: %8lu\n" 101 "BdiWriteBandwidth: %10lu kBps\n"
91 "b_more_io: %8lu\n" 102 "b_dirty: %10lu\n"
92 "bdi_list: %8u\n" 103 "b_io: %10lu\n"
93 "state: %8lx\n", 104 "b_more_io: %10lu\n"
105 "bdi_list: %10u\n"
106 "state: %10lx\n",
94 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 107 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
95 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 108 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
96 K(bdi_thresh), K(dirty_thresh), 109 K(bdi_thresh),
97 K(background_thresh), nr_dirty, nr_io, nr_more_io, 110 K(dirty_thresh),
111 K(background_thresh),
112 (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
113 (unsigned long) K(bdi->write_bandwidth),
114 nr_dirty,
115 nr_io,
116 nr_more_io,
98 !list_empty(&bdi->bdi_list), bdi->state); 117 !list_empty(&bdi->bdi_list), bdi->state);
99#undef K 118#undef K
100 119
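The K() macro used in the statistics above converts a page count to kilobytes: each page is (1 << PAGE_SHIFT) bytes, so shifting left by (PAGE_SHIFT - 10) multiplies by the number of kilobytes per page. A quick standalone check, assuming 4 KB pages purely for the printed numbers:

#include <stdio.h>

#define PAGE_SHIFT	12				/* 4 KB pages, assumed for the example */
#define K(x)		((x) << (PAGE_SHIFT - 10))	/* pages -> kB, as in bdi_debug_stats_show() */

int main(void)
{
	unsigned long pages = 256;

	/* 256 pages * 4 kB/page = 1024 kB */
	printf("%lu pages = %lu kB\n", pages, K(pages));
	return 0;
}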
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
249 return wb_has_dirty_io(&bdi->wb); 268 return wb_has_dirty_io(&bdi->wb);
250} 269}
251 270
252static void bdi_flush_io(struct backing_dev_info *bdi)
253{
254 struct writeback_control wbc = {
255 .sync_mode = WB_SYNC_NONE,
256 .older_than_this = NULL,
257 .range_cyclic = 1,
258 .nr_to_write = 1024,
259 };
260
261 writeback_inodes_wb(&bdi->wb, &wbc);
262}
263
264/* 271/*
265 * kupdated() used to do this. We cannot do it from the bdi_forker_thread() 272 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
266 * or we risk deadlocking on ->s_umount. The longer term solution would be 273 * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr)
446 if (IS_ERR(task)) { 453 if (IS_ERR(task)) {
447 /* 454 /*
448 * If thread creation fails, force writeout of 455 * If thread creation fails, force writeout of
449 * the bdi from the thread. 456 * the bdi from the thread. Hopefully 1024 is
457 * large enough for efficient IO.
450 */ 458 */
451 bdi_flush_io(bdi); 459 writeback_inodes_wb(&bdi->wb, 1024);
452 } else { 460 } else {
453 /* 461 /*
454 * The spinlock makes sure we do not lose 462 * The spinlock makes sure we do not lose
@@ -505,7 +513,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
505 list_del_rcu(&bdi->bdi_list); 513 list_del_rcu(&bdi->bdi_list);
506 spin_unlock_bh(&bdi_lock); 514 spin_unlock_bh(&bdi_lock);
507 515
508 synchronize_rcu(); 516 synchronize_rcu_expedited();
509} 517}
510 518
511int bdi_register(struct backing_dev_info *bdi, struct device *parent, 519int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -606,6 +614,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
606void bdi_unregister(struct backing_dev_info *bdi) 614void bdi_unregister(struct backing_dev_info *bdi)
607{ 615{
608 if (bdi->dev) { 616 if (bdi->dev) {
617 bdi_set_min_ratio(bdi, 0);
609 trace_writeback_bdi_unregister(bdi); 618 trace_writeback_bdi_unregister(bdi);
610 bdi_prune_sb(bdi); 619 bdi_prune_sb(bdi);
611 del_timer_sync(&bdi->wb.wakeup_timer); 620 del_timer_sync(&bdi->wb.wakeup_timer);
@@ -628,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
628 INIT_LIST_HEAD(&wb->b_dirty); 637 INIT_LIST_HEAD(&wb->b_dirty);
629 INIT_LIST_HEAD(&wb->b_io); 638 INIT_LIST_HEAD(&wb->b_io);
630 INIT_LIST_HEAD(&wb->b_more_io); 639 INIT_LIST_HEAD(&wb->b_more_io);
640 spin_lock_init(&wb->list_lock);
631 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); 641 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
632} 642}
633 643
644/*
645 * Initial write bandwidth: 100 MB/s
646 */
647#define INIT_BW (100 << (20 - PAGE_SHIFT))
648
634int bdi_init(struct backing_dev_info *bdi) 649int bdi_init(struct backing_dev_info *bdi)
635{ 650{
636 int i, err; 651 int i, err;
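INIT_BW seeds each bdi's write bandwidth estimate at 100 MB/s, but stored in pages per second: (100 << 20) bytes divided by a (1 << PAGE_SHIFT)-byte page folds into 100 << (20 - PAGE_SHIFT). A quick check of the arithmetic, again assuming 4 KB pages:

#include <stdio.h>

#define PAGE_SHIFT	12				/* assumed 4 KB pages */
#define INIT_BW		(100 << (20 - PAGE_SHIFT))	/* 100 MB/s expressed in pages/s */

int main(void)
{
	/* 100 << 8 = 25600 pages/s; 25600 pages * 4 kB = 102400 kB/s = 100 MB/s */
	printf("INIT_BW = %d pages/s = %d kB/s\n",
	       INIT_BW, INIT_BW << (PAGE_SHIFT - 10));
	return 0;
}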
@@ -653,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi)
653 } 668 }
654 669
655 bdi->dirty_exceeded = 0; 670 bdi->dirty_exceeded = 0;
671
672 bdi->bw_time_stamp = jiffies;
673 bdi->written_stamp = 0;
674
675 bdi->write_bandwidth = INIT_BW;
676 bdi->avg_write_bandwidth = INIT_BW;
677
656 err = prop_local_init_percpu(&bdi->completions); 678 err = prop_local_init_percpu(&bdi->completions);
657 679
658 if (err) { 680 if (err) {
@@ -676,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
676 if (bdi_has_dirty_io(bdi)) { 698 if (bdi_has_dirty_io(bdi)) {
677 struct bdi_writeback *dst = &default_backing_dev_info.wb; 699 struct bdi_writeback *dst = &default_backing_dev_info.wb;
678 700
679 spin_lock(&inode_wb_list_lock); 701 bdi_lock_two(&bdi->wb, dst);
680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 702 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
681 list_splice(&bdi->wb.b_io, &dst->b_io); 703 list_splice(&bdi->wb.b_io, &dst->b_io);
682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 704 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
683 spin_unlock(&inode_wb_list_lock); 705 spin_unlock(&bdi->wb.list_lock);
706 spin_unlock(&dst->list_lock);
684 } 707 }
685 708
686 bdi_unregister(bdi); 709 bdi_unregister(bdi);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 03bf3bb4519a..fbb58e346888 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool)
500{ 500{
501 struct device *dev = pool->dev; 501 struct device *dev = pool->dev;
502 502
503 dma_pool_destroy(pool);
504 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); 503 WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool));
504 dma_pool_destroy(pool);
505} 505}
506EXPORT_SYMBOL(dmam_pool_destroy); 506EXPORT_SYMBOL(dmam_pool_destroy);
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240ddc..1ce58c201dca 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct {
5 struct fault_attr attr; 5 struct fault_attr attr;
6 u32 ignore_gfp_wait; 6 u32 ignore_gfp_wait;
7 int cache_filter; 7 int cache_filter;
8#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
9 struct dentry *ignore_gfp_wait_file;
10 struct dentry *cache_filter_file;
11#endif
12} failslab = { 8} failslab = {
13 .attr = FAULT_ATTR_INITIALIZER, 9 .attr = FAULT_ATTR_INITIALIZER,
14 .ignore_gfp_wait = 1, 10 .ignore_gfp_wait = 1,
@@ -39,31 +35,24 @@ __setup("failslab=", setup_failslab);
39static int __init failslab_debugfs_init(void) 35static int __init failslab_debugfs_init(void)
40{ 36{
41 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 37 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
42 struct dentry *dir;
43 int err; 38 int err;
44 39
45 err = init_fault_attr_dentries(&failslab.attr, "failslab"); 40 err = init_fault_attr_dentries(&failslab.attr, "failslab");
46 if (err) 41 if (err)
47 return err; 42 return err;
48 dir = failslab.attr.dentries.dir;
49
50 failslab.ignore_gfp_wait_file =
51 debugfs_create_bool("ignore-gfp-wait", mode, dir,
52 &failslab.ignore_gfp_wait);
53 43
54 failslab.cache_filter_file = 44 if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir,
55 debugfs_create_bool("cache-filter", mode, dir, 45 &failslab.ignore_gfp_wait))
56 &failslab.cache_filter); 46 goto fail;
47 if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir,
48 &failslab.cache_filter))
49 goto fail;
57 50
58 if (!failslab.ignore_gfp_wait_file || 51 return 0;
59 !failslab.cache_filter_file) { 52fail:
60 err = -ENOMEM; 53 cleanup_fault_attr_dentries(&failslab.attr);
61 debugfs_remove(failslab.cache_filter_file);
62 debugfs_remove(failslab.ignore_gfp_wait_file);
63 cleanup_fault_attr_dentries(&failslab.attr);
64 }
65 54
66 return err; 55 return -ENOMEM;
67} 56}
68 57
69late_initcall(failslab_debugfs_init); 58late_initcall(failslab_debugfs_init);
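The reworked failslab_debugfs_init() no longer keeps the dentry pointers around: each debugfs_create_bool() result is checked as it is made, and a single fail label tears everything down via cleanup_fault_attr_dentries(). The same goto-cleanup idiom in a standalone sketch (the two allocations and all names are made up for the example):

#include <stdlib.h>

/* Acquire two resources; on any failure, undo everything in one place. */
static int setup_pair(void **a, void **b)
{
	*a = NULL;
	*b = NULL;

	*a = malloc(64);
	if (!*a)
		goto fail;
	*b = malloc(64);
	if (!*b)
		goto fail;
	return 0;

fail:
	free(*b);	/* free(NULL) is a no-op, so partial setup is fine */
	free(*a);
	*a = *b = NULL;
	return -1;
}

int main(void)
{
	void *a, *b;

	if (setup_pair(&a, &b))
		return 1;
	free(b);
	free(a);
	return 0;
}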
diff --git a/mm/filemap.c b/mm/filemap.c
index a8251a8d3457..867d40222ec7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -78,10 +78,7 @@
78 * ->i_mutex (generic_file_buffered_write) 78 * ->i_mutex (generic_file_buffered_write)
79 * ->mmap_sem (fault_in_pages_readable->do_page_fault) 79 * ->mmap_sem (fault_in_pages_readable->do_page_fault)
80 * 80 *
81 * ->i_mutex 81 * bdi->wb.list_lock
82 * ->i_alloc_sem (various)
83 *
84 * inode_wb_list_lock
85 * sb_lock (fs/fs-writeback.c) 82 * sb_lock (fs/fs-writeback.c)
86 * ->mapping->tree_lock (__sync_single_inode) 83 * ->mapping->tree_lock (__sync_single_inode)
87 * 84 *
@@ -99,9 +96,9 @@
99 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 96 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
100 * ->private_lock (page_remove_rmap->set_page_dirty) 97 * ->private_lock (page_remove_rmap->set_page_dirty)
101 * ->tree_lock (page_remove_rmap->set_page_dirty) 98 * ->tree_lock (page_remove_rmap->set_page_dirty)
102 * inode_wb_list_lock (page_remove_rmap->set_page_dirty) 99 * bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
103 * ->inode->i_lock (page_remove_rmap->set_page_dirty) 100 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
104 * inode_wb_list_lock (zap_pte_range->set_page_dirty) 101 * bdi.wb->list_lock (zap_pte_range->set_page_dirty)
105 * ->inode->i_lock (zap_pte_range->set_page_dirty) 102 * ->inode->i_lock (zap_pte_range->set_page_dirty)
106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
107 * 104 *
@@ -131,6 +128,7 @@ void __delete_from_page_cache(struct page *page)
131 128
132 radix_tree_delete(&mapping->page_tree, page->index); 129 radix_tree_delete(&mapping->page_tree, page->index);
133 page->mapping = NULL; 130 page->mapping = NULL;
131 /* Leave page->index set: truncation lookup relies upon it */
134 mapping->nrpages--; 132 mapping->nrpages--;
135 __dec_zone_page_state(page, NR_FILE_PAGES); 133 __dec_zone_page_state(page, NR_FILE_PAGES);
136 if (PageSwapBacked(page)) 134 if (PageSwapBacked(page))
@@ -486,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
486 spin_unlock_irq(&mapping->tree_lock); 484 spin_unlock_irq(&mapping->tree_lock);
487 } else { 485 } else {
488 page->mapping = NULL; 486 page->mapping = NULL;
487 /* Leave page->index set: truncation relies upon it */
489 spin_unlock_irq(&mapping->tree_lock); 488 spin_unlock_irq(&mapping->tree_lock);
490 mem_cgroup_uncharge_cache_page(page); 489 mem_cgroup_uncharge_cache_page(page);
491 page_cache_release(page); 490 page_cache_release(page);
@@ -1795,7 +1794,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap);
1795 1794
1796static struct page *__read_cache_page(struct address_space *mapping, 1795static struct page *__read_cache_page(struct address_space *mapping,
1797 pgoff_t index, 1796 pgoff_t index,
1798 int (*filler)(void *,struct page*), 1797 int (*filler)(void *, struct page *),
1799 void *data, 1798 void *data,
1800 gfp_t gfp) 1799 gfp_t gfp)
1801{ 1800{
@@ -1826,7 +1825,7 @@ repeat:
1826 1825
1827static struct page *do_read_cache_page(struct address_space *mapping, 1826static struct page *do_read_cache_page(struct address_space *mapping,
1828 pgoff_t index, 1827 pgoff_t index,
1829 int (*filler)(void *,struct page*), 1828 int (*filler)(void *, struct page *),
1830 void *data, 1829 void *data,
1831 gfp_t gfp) 1830 gfp_t gfp)
1832 1831
@@ -1866,7 +1865,7 @@ out:
1866 * @mapping: the page's address_space 1865 * @mapping: the page's address_space
1867 * @index: the page index 1866 * @index: the page index
1868 * @filler: function to perform the read 1867 * @filler: function to perform the read
1869 * @data: destination for read data 1868 * @data: first arg to filler(data, page) function, often left as NULL
1870 * 1869 *
1871 * Same as read_cache_page, but don't wait for page to become unlocked 1870 * Same as read_cache_page, but don't wait for page to become unlocked
1872 * after submitting it to the filler. 1871 * after submitting it to the filler.
@@ -1878,7 +1877,7 @@ out:
1878 */ 1877 */
1879struct page *read_cache_page_async(struct address_space *mapping, 1878struct page *read_cache_page_async(struct address_space *mapping,
1880 pgoff_t index, 1879 pgoff_t index,
1881 int (*filler)(void *,struct page*), 1880 int (*filler)(void *, struct page *),
1882 void *data) 1881 void *data)
1883{ 1882{
1884 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); 1883 return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
@@ -1926,7 +1925,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1926 * @mapping: the page's address_space 1925 * @mapping: the page's address_space
1927 * @index: the page index 1926 * @index: the page index
1928 * @filler: function to perform the read 1927 * @filler: function to perform the read
1929 * @data: destination for read data 1928 * @data: first arg to filler(data, page) function, often left as NULL
1930 * 1929 *
1931 * Read into the page cache. If a page already exists, and PageUptodate() is 1930 * Read into the page cache. If a page already exists, and PageUptodate() is
1932 * not set, try to fill the page then wait for it to become unlocked. 1931 * not set, try to fill the page then wait for it to become unlocked.
@@ -1935,7 +1934,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
1935 */ 1934 */
1936struct page *read_cache_page(struct address_space *mapping, 1935struct page *read_cache_page(struct address_space *mapping,
1937 pgoff_t index, 1936 pgoff_t index,
1938 int (*filler)(void *,struct page*), 1937 int (*filler)(void *, struct page *),
1939 void *data) 1938 void *data)
1940{ 1939{
1941 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); 1940 return wait_on_page_read(read_cache_page_async(mapping, index, filler, data));
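The filemap hunks above tidy the filler callback signature and its kernel-doc: @data is not a destination buffer but the opaque first argument handed back to filler(data, page). A small userspace sketch of that callback-plus-cookie convention; all types and names here are invented stand-ins, not the kernel API:

#include <stdio.h>

struct page { long index; };			/* illustrative stand-in only */

typedef int (*filler_t)(void *data, struct page *page);

/* The cache helper just forwards the opaque cookie to the filler. */
static int read_cache_page_demo(struct page *page, filler_t filler, void *data)
{
	return filler(data, page);
}

static int my_filler(void *data, struct page *page)
{
	const char *who = data;			/* the cookie, often just NULL */

	printf("%s fills page %ld\n", who, page->index);
	return 0;
}

int main(void)
{
	struct page pg = { .index = 7 };

	return read_cache_page_demo(&pg, my_filler, "demo");
}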
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 81532f297fd2..e2d1587be269 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm)
1596 list_del(&mm_slot->mm_node); 1596 list_del(&mm_slot->mm_node);
1597 free = 1; 1597 free = 1;
1598 } 1598 }
1599 spin_unlock(&khugepaged_mm_lock);
1599 1600
1600 if (free) { 1601 if (free) {
1601 spin_unlock(&khugepaged_mm_lock);
1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags); 1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603 free_mm_slot(mm_slot); 1603 free_mm_slot(mm_slot);
1604 mmdrop(mm); 1604 mmdrop(mm);
1605 } else if (mm_slot) { 1605 } else if (mm_slot) {
1606 spin_unlock(&khugepaged_mm_lock);
1607 /* 1606 /*
1608 * This is required to serialize against 1607 * This is required to serialize against
1609 * khugepaged_test_exit() (which is guaranteed to run 1608 * khugepaged_test_exit() (which is guaranteed to run
@@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm)
1614 */ 1613 */
1615 down_write(&mm->mmap_sem); 1614 down_write(&mm->mmap_sem);
1616 up_write(&mm->mmap_sem); 1615 up_write(&mm->mmap_sem);
1617 } else 1616 }
1618 spin_unlock(&khugepaged_mm_lock);
1619} 1617}
1620 1618
1621static void release_pte_page(struct page *page) 1619static void release_pte_page(struct page *page)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bfcf153bc829..dae27ba3be2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,7 +24,7 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/io.h> 27#include <linux/io.h>
28 28
29#include <linux/hugetlb.h> 29#include <linux/hugetlb.h>
30#include <linux/node.h> 30#include <linux/node.h>
@@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock);
62 * must either hold the mmap_sem for write, or the mmap_sem for read and 62 * must either hold the mmap_sem for write, or the mmap_sem for read and
63 * the hugetlb_instantiation mutex: 63 * the hugetlb_instantiation mutex:
64 * 64 *
65 * down_write(&mm->mmap_sem); 65 * down_write(&mm->mmap_sem);
66 * or 66 * or
67 * down_read(&mm->mmap_sem); 67 * down_read(&mm->mmap_sem);
68 * mutex_lock(&hugetlb_instantiation_mutex); 68 * mutex_lock(&hugetlb_instantiation_mutex);
69 */ 69 */
70struct file_region { 70struct file_region {
71 struct list_head link; 71 struct list_head link;
@@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page)
503 h->nr_huge_pages--; 503 h->nr_huge_pages--;
504 h->nr_huge_pages_node[page_to_nid(page)]--; 504 h->nr_huge_pages_node[page_to_nid(page)]--;
505 for (i = 0; i < pages_per_huge_page(h); i++) { 505 for (i = 0; i < pages_per_huge_page(h); i++) {
506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 506 page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
507 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 507 1 << PG_referenced | 1 << PG_dirty |
508 1 << PG_private | 1<< PG_writeback); 508 1 << PG_active | 1 << PG_reserved |
509 1 << PG_private | 1 << PG_writeback);
509 } 510 }
510 set_compound_page_dtor(page, NULL); 511 set_compound_page_dtor(page, NULL);
511 set_page_refcounted(page); 512 set_page_refcounted(page);
@@ -591,7 +592,6 @@ int PageHuge(struct page *page)
591 592
592 return dtor == free_huge_page; 593 return dtor == free_huge_page;
593} 594}
594
595EXPORT_SYMBOL_GPL(PageHuge); 595EXPORT_SYMBOL_GPL(PageHuge);
596 596
597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 597static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
@@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void)
1105 struct huge_bootmem_page *m; 1105 struct huge_bootmem_page *m;
1106 1106
1107 list_for_each_entry(m, &huge_boot_pages, list) { 1107 list_for_each_entry(m, &huge_boot_pages, list) {
1108 struct page *page = virt_to_page(m);
1109 struct hstate *h = m->hstate; 1108 struct hstate *h = m->hstate;
1109 struct page *page;
1110
1111#ifdef CONFIG_HIGHMEM
1112 page = pfn_to_page(m->phys >> PAGE_SHIFT);
1113 free_bootmem_late((unsigned long)m,
1114 sizeof(struct huge_bootmem_page));
1115#else
1116 page = virt_to_page(m);
1117#endif
1110 __ClearPageReserved(page); 1118 __ClearPageReserved(page);
1111 WARN_ON(page_count(page) != 1); 1119 WARN_ON(page_count(page) != 1);
1112 prep_compound_huge_page(page, h->order); 1120 prep_compound_huge_page(page, h->order);
@@ -2124,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
2124 pte_t entry; 2132 pte_t entry;
2125 2133
2126 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); 2134 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
2127 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { 2135 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
2128 update_mmu_cache(vma, address, ptep); 2136 update_mmu_cache(vma, address, ptep);
2129 }
2130} 2137}
2131 2138
2132 2139
@@ -2181,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte)
2181 if (huge_pte_none(pte) || pte_present(pte)) 2188 if (huge_pte_none(pte) || pte_present(pte))
2182 return 0; 2189 return 0;
2183 swp = pte_to_swp_entry(pte); 2190 swp = pte_to_swp_entry(pte);
2184 if (non_swap_entry(swp) && is_migration_entry(swp)) { 2191 if (non_swap_entry(swp) && is_migration_entry(swp))
2185 return 1; 2192 return 1;
2186 } else 2193 else
2187 return 0; 2194 return 0;
2188} 2195}
2189 2196
@@ -2194,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2194 if (huge_pte_none(pte) || pte_present(pte)) 2201 if (huge_pte_none(pte) || pte_present(pte))
2195 return 0; 2202 return 0;
2196 swp = pte_to_swp_entry(pte); 2203 swp = pte_to_swp_entry(pte);
2197 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { 2204 if (non_swap_entry(swp) && is_hwpoison_entry(swp))
2198 return 1; 2205 return 1;
2199 } else 2206 else
2200 return 0; 2207 return 0;
2201} 2208}
2202 2209
@@ -2559,7 +2566,7 @@ retry:
2559 * So we need to block hugepage fault by PG_hwpoison bit check. 2566 * So we need to block hugepage fault by PG_hwpoison bit check.
2560 */ 2567 */
2561 if (unlikely(PageHWPoison(page))) { 2568 if (unlikely(PageHWPoison(page))) {
2562 ret = VM_FAULT_HWPOISON | 2569 ret = VM_FAULT_HWPOISON |
2563 VM_FAULT_SET_HINDEX(h - hstates); 2570 VM_FAULT_SET_HINDEX(h - hstates);
2564 goto backout_unlocked; 2571 goto backout_unlocked;
2565 } 2572 }
@@ -2627,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2627 migration_entry_wait(mm, (pmd_t *)ptep, address); 2634 migration_entry_wait(mm, (pmd_t *)ptep, address);
2628 return 0; 2635 return 0;
2629 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2636 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2630 return VM_FAULT_HWPOISON_LARGE | 2637 return VM_FAULT_HWPOISON_LARGE |
2631 VM_FAULT_SET_HINDEX(h - hstates); 2638 VM_FAULT_SET_HINDEX(h - hstates);
2632 } 2639 }
2633 2640
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4019979b2637..a56a851908d2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@
5#include <linux/list.h> 5#include <linux/list.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7 7
8#include <asm/atomic.h> 8#include <linux/atomic.h>
9#include <asm/pgtable.h> 9#include <asm/pgtable.h>
10#include <asm/mmu.h> 10#include <asm/mmu.h>
11 11
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index aacee45616fc..d6880f542f95 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -96,7 +96,7 @@
96 96
97#include <asm/sections.h> 97#include <asm/sections.h>
98#include <asm/processor.h> 98#include <asm/processor.h>
99#include <asm/atomic.h> 99#include <linux/atomic.h>
100 100
101#include <linux/kmemcheck.h> 101#include <linux/kmemcheck.h>
102#include <linux/kmemleak.h> 102#include <linux/kmemleak.h>
diff --git a/mm/madvise.c b/mm/madvise.c
index 2221491ed503..74bf193eff04 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma,
218 endoff = (loff_t)(end - vma->vm_start - 1) 218 endoff = (loff_t)(end - vma->vm_start - 1)
219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 219 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
220 220
221 /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ 221 /* vmtruncate_range needs to take i_mutex */
222 up_read(&current->mm->mmap_sem); 222 up_read(&current->mm->mmap_sem);
223 error = vmtruncate_range(mapping->host, offset, endoff); 223 error = vmtruncate_range(mapping->host, offset, endoff);
224 down_read(&current->mm->mmap_sem); 224 down_read(&current->mm->mmap_sem);
diff --git a/mm/memblock.c b/mm/memblock.c
index a0562d1a6ad4..ccbf97339592 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -758,9 +758,9 @@ void __init memblock_analyze(void)
758 758
759 /* Check marker in the unused last array entry */ 759 /* Check marker in the unused last array entry */
760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base 760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
761 != (phys_addr_t)RED_INACTIVE); 761 != MEMBLOCK_INACTIVE);
762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base 762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
763 != (phys_addr_t)RED_INACTIVE); 763 != MEMBLOCK_INACTIVE);
764 764
765 memblock.memory_size = 0; 765 memblock.memory_size = 0;
766 766
@@ -786,8 +786,8 @@ void __init memblock_init(void)
786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS; 786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
787 787
788 /* Write a marker in the unused last array entry */ 788 /* Write a marker in the unused last array entry */
789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; 790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE;
791 791
792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
793 * This simplifies the memblock_add() code below... 793 * This simplifies the memblock_add() code below...
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d25..5f84d2351ddb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -205,6 +205,50 @@ struct mem_cgroup_eventfd_list {
205static void mem_cgroup_threshold(struct mem_cgroup *mem); 205static void mem_cgroup_threshold(struct mem_cgroup *mem);
206static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 206static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
207 207
208enum {
209 SCAN_BY_LIMIT,
210 SCAN_BY_SYSTEM,
211 NR_SCAN_CONTEXT,
212 SCAN_BY_SHRINK, /* not recorded now */
213};
214
215enum {
216 SCAN,
217 SCAN_ANON,
218 SCAN_FILE,
219 ROTATE,
220 ROTATE_ANON,
221 ROTATE_FILE,
222 FREED,
223 FREED_ANON,
224 FREED_FILE,
225 ELAPSED,
226 NR_SCANSTATS,
227};
228
229struct scanstat {
230 spinlock_t lock;
231 unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
232 unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
233};
234
235const char *scanstat_string[NR_SCANSTATS] = {
236 "scanned_pages",
237 "scanned_anon_pages",
238 "scanned_file_pages",
239 "rotated_pages",
240 "rotated_anon_pages",
241 "rotated_file_pages",
242 "freed_pages",
243 "freed_anon_pages",
244 "freed_file_pages",
245 "elapsed_ns",
246};
247#define SCANSTAT_WORD_LIMIT "_by_limit"
248#define SCANSTAT_WORD_SYSTEM "_by_system"
249#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
250
251
208/* 252/*
209 * The memory controller data structure. The memory controller controls both 253 * The memory controller data structure. The memory controller controls both
210 * page cache and RSS per cgroup. We would eventually like to provide 254 * page cache and RSS per cgroup. We would eventually like to provide
@@ -246,10 +290,13 @@ struct mem_cgroup {
246 * Should the accounting and control be hierarchical, per subtree? 290 * Should the accounting and control be hierarchical, per subtree?
247 */ 291 */
248 bool use_hierarchy; 292 bool use_hierarchy;
249 atomic_t oom_lock; 293
294 bool oom_lock;
295 atomic_t under_oom;
296
250 atomic_t refcnt; 297 atomic_t refcnt;
251 298
252 unsigned int swappiness; 299 int swappiness;
253 /* OOM-Killer disable */ 300 /* OOM-Killer disable */
254 int oom_kill_disable; 301 int oom_kill_disable;
255 302
@@ -267,7 +314,8 @@ struct mem_cgroup {
267 314
268 /* For oom notifier event fd */ 315 /* For oom notifier event fd */
269 struct list_head oom_notify; 316 struct list_head oom_notify;
270 317 /* For recording LRU-scan statistics */
318 struct scanstat scanstat;
271 /* 319 /*
272 * Should we move charges of a task when a task is moved into this 320 * Should we move charges of a task when a task is moved into this
273 * mem_cgroup ? And what type of charges should we move ? 321 * mem_cgroup ? And what type of charges should we move ?
@@ -636,27 +684,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
636 preempt_enable(); 684 preempt_enable();
637} 685}
638 686
639static unsigned long 687unsigned long
640mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) 688mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
689 unsigned int lru_mask)
641{ 690{
642 struct mem_cgroup_per_zone *mz; 691 struct mem_cgroup_per_zone *mz;
692 enum lru_list l;
693 unsigned long ret = 0;
694
695 mz = mem_cgroup_zoneinfo(mem, nid, zid);
696
697 for_each_lru(l) {
698 if (BIT(l) & lru_mask)
699 ret += MEM_CGROUP_ZSTAT(mz, l);
700 }
701 return ret;
702}
703
704static unsigned long
705mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
706 int nid, unsigned int lru_mask)
707{
643 u64 total = 0; 708 u64 total = 0;
644 int zid; 709 int zid;
645 710
646 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 711 for (zid = 0; zid < MAX_NR_ZONES; zid++)
647 mz = mem_cgroup_zoneinfo(mem, nid, zid); 712 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
648 total += MEM_CGROUP_ZSTAT(mz, idx); 713
649 }
650 return total; 714 return total;
651} 715}
652static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 716
653 enum lru_list idx) 717static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
718 unsigned int lru_mask)
654{ 719{
655 int nid; 720 int nid;
656 u64 total = 0; 721 u64 total = 0;
657 722
658 for_each_online_node(nid) 723 for_each_node_state(nid, N_HIGH_MEMORY)
659 total += mem_cgroup_get_zonestat_node(mem, nid, idx); 724 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
660 return total; 725 return total;
661} 726}
662 727
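mem_cgroup_zone_nr_lru_pages() and mem_cgroup_node_nr_lru_pages() now take an lru_mask, so a single helper can sum any combination of per-zone LRU counters (the dedicated file/anon/unevictable helpers are removed further down). A userspace sketch of the bitmask convention; the LRU_ALL_* macros mirror what the memcg headers are assumed to provide, and counts[] is invented:

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
		LRU_INACTIVE_FILE, LRU_ACTIVE_FILE,
		LRU_UNEVICTABLE, NR_LRU_LISTS };

#define BIT(nr)		(1UL << (nr))
#define LRU_ALL_ANON	(BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL_FILE	(BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL		((1UL << NR_LRU_LISTS) - 1)

/* Sum only the LRU lists whose bits are set in lru_mask. */
static unsigned long nr_lru_pages(const unsigned long *counts,
				  unsigned long lru_mask)
{
	unsigned long ret = 0;
	int l;

	for (l = 0; l < NR_LRU_LISTS; l++)
		if (BIT(l) & lru_mask)
			ret += counts[l];
	return ret;
}

int main(void)
{
	unsigned long counts[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

	printf("file: %lu\n", nr_lru_pages(counts, LRU_ALL_FILE));	/* 70 */
	printf("anon: %lu\n", nr_lru_pages(counts, LRU_ALL_ANON));	/* 30 */
	printf("all:  %lu\n", nr_lru_pages(counts, LRU_ALL));		/* 105 */
	return 0;
}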
@@ -1043,6 +1108,21 @@ void mem_cgroup_move_lists(struct page *page,
1043 mem_cgroup_add_lru_list(page, to); 1108 mem_cgroup_add_lru_list(page, to);
1044} 1109}
1045 1110
1111/*
1112 * Checks whether given mem is same or in the root_mem's
1113 * hierarchy subtree
1114 */
1115static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
1116 struct mem_cgroup *mem)
1117{
1118 if (root_mem != mem) {
1119 return (root_mem->use_hierarchy &&
1120 css_is_ancestor(&mem->css, &root_mem->css));
1121 }
1122
1123 return true;
1124}
1125
1046int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1126int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1047{ 1127{
1048 int ret; 1128 int ret;
@@ -1062,10 +1142,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1062 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1142 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
1063 * hierarchy(even if use_hierarchy is disabled in "mem"). 1143 * hierarchy(even if use_hierarchy is disabled in "mem").
1064 */ 1144 */
1065 if (mem->use_hierarchy) 1145 ret = mem_cgroup_same_or_subtree(mem, curr);
1066 ret = css_is_ancestor(&curr->css, &mem->css);
1067 else
1068 ret = (curr == mem);
1069 css_put(&curr->css); 1146 css_put(&curr->css);
1070 return ret; 1147 return ret;
1071} 1148}
@@ -1077,8 +1154,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
1077 unsigned long gb; 1154 unsigned long gb;
1078 unsigned long inactive_ratio; 1155 unsigned long inactive_ratio;
1079 1156
1080 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 1157 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
1081 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 1158 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1082 1159
1083 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1160 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1084 if (gb) 1161 if (gb)
@@ -1117,109 +1194,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1117 unsigned long active; 1194 unsigned long active;
1118 unsigned long inactive; 1195 unsigned long inactive;
1119 1196
1120 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 1197 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
1121 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 1198 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
1122 1199
1123 return (active > inactive); 1200 return (active > inactive);
1124} 1201}
1125 1202
1126unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
1127 struct zone *zone,
1128 enum lru_list lru)
1129{
1130 int nid = zone_to_nid(zone);
1131 int zid = zone_idx(zone);
1132 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1133
1134 return MEM_CGROUP_ZSTAT(mz, lru);
1135}
1136
1137static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1138 int nid)
1139{
1140 unsigned long ret;
1141
1142 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1143 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1144
1145 return ret;
1146}
1147
1148static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1149 int nid)
1150{
1151 unsigned long ret;
1152
1153 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1154 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1155 return ret;
1156}
1157
1158#if MAX_NUMNODES > 1
1159static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1160{
1161 u64 total = 0;
1162 int nid;
1163
1164 for_each_node_state(nid, N_HIGH_MEMORY)
1165 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1166
1167 return total;
1168}
1169
1170static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1171{
1172 u64 total = 0;
1173 int nid;
1174
1175 for_each_node_state(nid, N_HIGH_MEMORY)
1176 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1177
1178 return total;
1179}
1180
1181static unsigned long
1182mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1183{
1184 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1185}
1186
1187static unsigned long
1188mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1189{
1190 u64 total = 0;
1191 int nid;
1192
1193 for_each_node_state(nid, N_HIGH_MEMORY)
1194 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1195
1196 return total;
1197}
1198
1199static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1200 int nid)
1201{
1202 enum lru_list l;
1203 u64 total = 0;
1204
1205 for_each_lru(l)
1206 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1207
1208 return total;
1209}
1210
1211static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1212{
1213 u64 total = 0;
1214 int nid;
1215
1216 for_each_node_state(nid, N_HIGH_MEMORY)
1217 total += mem_cgroup_node_nr_lru_pages(memcg, nid);
1218
1219 return total;
1220}
1221#endif /* CONFIG_NUMA */
1222
1223struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1203struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1224 struct zone *zone) 1204 struct zone *zone)
1225{ 1205{
@@ -1329,7 +1309,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1329 return margin >> PAGE_SHIFT; 1309 return margin >> PAGE_SHIFT;
1330} 1310}
1331 1311
1332static unsigned int get_swappiness(struct mem_cgroup *memcg) 1312int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1333{ 1313{
1334 struct cgroup *cgrp = memcg->css.cgroup; 1314 struct cgroup *cgrp = memcg->css.cgroup;
1335 1315
@@ -1401,10 +1381,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1401 to = mc.to; 1381 to = mc.to;
1402 if (!from) 1382 if (!from)
1403 goto unlock; 1383 goto unlock;
1404 if (from == mem || to == mem 1384
1405 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1385 ret = mem_cgroup_same_or_subtree(mem, from)
1406 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1386 || mem_cgroup_same_or_subtree(mem, to);
1407 ret = true;
1408unlock: 1387unlock:
1409 spin_unlock(&mc.lock); 1388 spin_unlock(&mc.lock);
1410 return ret; 1389 return ret;
@@ -1576,11 +1555,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1576static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1555static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1577 int nid, bool noswap) 1556 int nid, bool noswap)
1578{ 1557{
1579 if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) 1558 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
1580 return true; 1559 return true;
1581 if (noswap || !total_swap_pages) 1560 if (noswap || !total_swap_pages)
1582 return false; 1561 return false;
1583 if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) 1562 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
1584 return true; 1563 return true;
1585 return false; 1564 return false;
1586 1565
@@ -1700,6 +1679,44 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1700} 1679}
1701#endif 1680#endif
1702 1681
1682static void __mem_cgroup_record_scanstat(unsigned long *stats,
1683 struct memcg_scanrecord *rec)
1684{
1685
1686 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
1687 stats[SCAN_ANON] += rec->nr_scanned[0];
1688 stats[SCAN_FILE] += rec->nr_scanned[1];
1689
1690 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
1691 stats[ROTATE_ANON] += rec->nr_rotated[0];
1692 stats[ROTATE_FILE] += rec->nr_rotated[1];
1693
1694 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
1695 stats[FREED_ANON] += rec->nr_freed[0];
1696 stats[FREED_FILE] += rec->nr_freed[1];
1697
1698 stats[ELAPSED] += rec->elapsed;
1699}
1700
1701static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
1702{
1703 struct mem_cgroup *mem;
1704 int context = rec->context;
1705
1706 if (context >= NR_SCAN_CONTEXT)
1707 return;
1708
1709 mem = rec->mem;
1710 spin_lock(&mem->scanstat.lock);
1711 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
1712 spin_unlock(&mem->scanstat.lock);
1713
1714 mem = rec->root;
1715 spin_lock(&mem->scanstat.lock);
1716 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
1717 spin_unlock(&mem->scanstat.lock);
1718}
1719
1703/* 1720/*
1704 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1721 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1705 * we reclaimed from, so that we don't end up penalizing one child extensively 1722 * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1724,15 +1741,25 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1724 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1741 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1725 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1742 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1726 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1743 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1744 struct memcg_scanrecord rec;
1727 unsigned long excess; 1745 unsigned long excess;
1728 unsigned long nr_scanned; 1746 unsigned long scanned;
1729 1747
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1748 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1731 1749
1732 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1750 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1733 if (!check_soft && root_mem->memsw_is_minimum) 1751 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1734 noswap = true; 1752 noswap = true;
1735 1753
1754 if (shrink)
1755 rec.context = SCAN_BY_SHRINK;
1756 else if (check_soft)
1757 rec.context = SCAN_BY_SYSTEM;
1758 else
1759 rec.context = SCAN_BY_LIMIT;
1760
1761 rec.root = root_mem;
1762
1736 while (1) { 1763 while (1) {
1737 victim = mem_cgroup_select_victim(root_mem); 1764 victim = mem_cgroup_select_victim(root_mem);
1738 if (victim == root_mem) { 1765 if (victim == root_mem) {
@@ -1773,15 +1800,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1773 css_put(&victim->css); 1800 css_put(&victim->css);
1774 continue; 1801 continue;
1775 } 1802 }
1803 rec.mem = victim;
1804 rec.nr_scanned[0] = 0;
1805 rec.nr_scanned[1] = 0;
1806 rec.nr_rotated[0] = 0;
1807 rec.nr_rotated[1] = 0;
1808 rec.nr_freed[0] = 0;
1809 rec.nr_freed[1] = 0;
1810 rec.elapsed = 0;
1776 /* we use swappiness of local cgroup */ 1811 /* we use swappiness of local cgroup */
1777 if (check_soft) { 1812 if (check_soft) {
1778 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1813 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1779 noswap, get_swappiness(victim), zone, 1814 noswap, zone, &rec, &scanned);
1780 &nr_scanned); 1815 *total_scanned += scanned;
1781 *total_scanned += nr_scanned;
1782 } else 1816 } else
1783 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1817 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1784 noswap, get_swappiness(victim)); 1818 noswap, &rec);
1819 mem_cgroup_record_scanstat(&rec);
1785 css_put(&victim->css); 1820 css_put(&victim->css);
1786 /* 1821 /*
1787 * At shrinking usage, we can't check we should stop here or 1822 * At shrinking usage, we can't check we should stop here or
@@ -1803,38 +1838,84 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1803/* 1838/*
1804 * Check OOM-Killer is already running under our hierarchy. 1839 * Check OOM-Killer is already running under our hierarchy.
1805 * If someone is running, return false. 1840 * If someone is running, return false.
1841 * Has to be called with memcg_oom_lock
1806 */ 1842 */
1807static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1843static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1808{ 1844{
1809 int x, lock_count = 0; 1845 int lock_count = -1;
1810 struct mem_cgroup *iter; 1846 struct mem_cgroup *iter, *failed = NULL;
1847 bool cond = true;
1848
1849 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1850 bool locked = iter->oom_lock;
1811 1851
1812 for_each_mem_cgroup_tree(iter, mem) { 1852 iter->oom_lock = true;
1813 x = atomic_inc_return(&iter->oom_lock); 1853 if (lock_count == -1)
1814 lock_count = max(x, lock_count); 1854 lock_count = iter->oom_lock;
1855 else if (lock_count != locked) {
1856 /*
1857 * this subtree of our hierarchy is already locked
1858 * so we cannot give a lock.
1859 */
1860 lock_count = 0;
1861 failed = iter;
1862 cond = false;
1863 }
1815 } 1864 }
1816 1865
1817 if (lock_count == 1) 1866 if (!failed)
1818 return true; 1867 goto done;
1819 return false; 1868
1869 /*
1870 * OK, we failed to lock the whole subtree so we have to clean up
1871 * what we set up to the failing subtree
1872 */
1873 cond = true;
1874 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1875 if (iter == failed) {
1876 cond = false;
1877 continue;
1878 }
1879 iter->oom_lock = false;
1880 }
1881done:
1882 return lock_count;
1820} 1883}
1821 1884
1885/*
1886 * Has to be called with memcg_oom_lock
1887 */
1822static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1888static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1823{ 1889{
1824 struct mem_cgroup *iter; 1890 struct mem_cgroup *iter;
1825 1891
1892 for_each_mem_cgroup_tree(iter, mem)
1893 iter->oom_lock = false;
1894 return 0;
1895}
1896
1897static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1898{
1899 struct mem_cgroup *iter;
1900
1901 for_each_mem_cgroup_tree(iter, mem)
1902 atomic_inc(&iter->under_oom);
1903}
1904
1905static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1906{
1907 struct mem_cgroup *iter;
1908
1826 /* 1909 /*
1827 * When a new child is created while the hierarchy is under oom, 1910 * When a new child is created while the hierarchy is under oom,
1828 * mem_cgroup_oom_lock() may not be called. We have to use 1911 * mem_cgroup_oom_lock() may not be called. We have to use
1829 * atomic_add_unless() here. 1912 * atomic_add_unless() here.
1830 */ 1913 */
1831 for_each_mem_cgroup_tree(iter, mem) 1914 for_each_mem_cgroup_tree(iter, mem)
1832 atomic_add_unless(&iter->oom_lock, -1, 0); 1915 atomic_add_unless(&iter->under_oom, -1, 0);
1833 return 0;
1834} 1916}
1835 1917
1836 1918static DEFINE_SPINLOCK(memcg_oom_lock);
1837static DEFINE_MUTEX(memcg_oom_mutex);
1838static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1919static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1839 1920
1840struct oom_wait_info { 1921struct oom_wait_info {
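With oom_lock now a plain flag, mem_cgroup_oom_lock() walks the subtree setting it, and if it meets a cgroup that is already locked it rolls back every flag it set before the failing node (the whole walk runs under the new memcg_oom_lock spinlock). A standalone sketch of that try-then-roll-back pattern, with a flat array standing in for the hierarchy walk and every name local to the example:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node { bool oom_lock; };

/* Try to flag every node; on conflict, clear what we set and fail. */
static bool lock_hierarchy(struct node *nodes, size_t n)
{
	struct node *failed = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		if (nodes[i].oom_lock) {	/* someone else got here first */
			failed = &nodes[i];
			break;
		}
		nodes[i].oom_lock = true;
	}
	if (!failed)
		return true;

	/* Roll back only the nodes flagged before the conflicting one. */
	for (i = 0; &nodes[i] != failed; i++)
		nodes[i].oom_lock = false;
	return false;
}

int main(void)
{
	struct node tree[3] = { { false }, { true }, { false } };

	printf("locked: %d\n", lock_hierarchy(tree, 3));	/* 0: tree[1] was busy */
	printf("rolled back: %d\n", !tree[0].oom_lock);		/* 1 */
	return 0;
}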
@@ -1845,25 +1926,20 @@ struct oom_wait_info {
1845static int memcg_oom_wake_function(wait_queue_t *wait, 1926static int memcg_oom_wake_function(wait_queue_t *wait,
1846 unsigned mode, int sync, void *arg) 1927 unsigned mode, int sync, void *arg)
1847{ 1928{
1848 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1929 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
1930 *oom_wait_mem;
1849 struct oom_wait_info *oom_wait_info; 1931 struct oom_wait_info *oom_wait_info;
1850 1932
1851 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1933 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1934 oom_wait_mem = oom_wait_info->mem;
1852 1935
1853 if (oom_wait_info->mem == wake_mem)
1854 goto wakeup;
1855 /* if no hierarchy, no match */
1856 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1857 return 0;
1858 /* 1936 /*
1859 * Both of oom_wait_info->mem and wake_mem are stable under us. 1937 * Both of oom_wait_info->mem and wake_mem are stable under us.
1860 * Then we can use css_is_ancestor without taking care of RCU. 1938 * Then we can use css_is_ancestor without taking care of RCU.
1861 */ 1939 */
1862 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1940 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
1863 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1941 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
1864 return 0; 1942 return 0;
1865
1866wakeup:
1867 return autoremove_wake_function(wait, mode, sync, arg); 1943 return autoremove_wake_function(wait, mode, sync, arg);
1868} 1944}
1869 1945
@@ -1875,7 +1951,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1875 1951
1876static void memcg_oom_recover(struct mem_cgroup *mem) 1952static void memcg_oom_recover(struct mem_cgroup *mem)
1877{ 1953{
1878 if (mem && atomic_read(&mem->oom_lock)) 1954 if (mem && atomic_read(&mem->under_oom))
1879 memcg_wakeup_oom(mem); 1955 memcg_wakeup_oom(mem);
1880} 1956}
1881 1957
@@ -1893,8 +1969,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1893 owait.wait.private = current; 1969 owait.wait.private = current;
1894 INIT_LIST_HEAD(&owait.wait.task_list); 1970 INIT_LIST_HEAD(&owait.wait.task_list);
1895 need_to_kill = true; 1971 need_to_kill = true;
1972 mem_cgroup_mark_under_oom(mem);
1973
1896 /* At first, try to OOM lock hierarchy under mem.*/ 1974 /* At first, try to OOM lock hierarchy under mem.*/
1897 mutex_lock(&memcg_oom_mutex); 1975 spin_lock(&memcg_oom_lock);
1898 locked = mem_cgroup_oom_lock(mem); 1976 locked = mem_cgroup_oom_lock(mem);
1899 /* 1977 /*
1900 * Even if signal_pending(), we can't quit charge() loop without 1978 * Even if signal_pending(), we can't quit charge() loop without
@@ -1906,7 +1984,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1906 need_to_kill = false; 1984 need_to_kill = false;
1907 if (locked) 1985 if (locked)
1908 mem_cgroup_oom_notify(mem); 1986 mem_cgroup_oom_notify(mem);
1909 mutex_unlock(&memcg_oom_mutex); 1987 spin_unlock(&memcg_oom_lock);
1910 1988
1911 if (need_to_kill) { 1989 if (need_to_kill) {
1912 finish_wait(&memcg_oom_waitq, &owait.wait); 1990 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1915,10 +1993,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1915 schedule(); 1993 schedule();
1916 finish_wait(&memcg_oom_waitq, &owait.wait); 1994 finish_wait(&memcg_oom_waitq, &owait.wait);
1917 } 1995 }
1918 mutex_lock(&memcg_oom_mutex); 1996 spin_lock(&memcg_oom_lock);
1919 mem_cgroup_oom_unlock(mem); 1997 if (locked)
1998 mem_cgroup_oom_unlock(mem);
1920 memcg_wakeup_oom(mem); 1999 memcg_wakeup_oom(mem);
1921 mutex_unlock(&memcg_oom_mutex); 2000 spin_unlock(&memcg_oom_lock);
2001
2002 mem_cgroup_unmark_under_oom(mem);
1922 2003
1923 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2004 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1924 return false; 2005 return false;
@@ -2011,7 +2092,6 @@ struct memcg_stock_pcp {
2011#define FLUSHING_CACHED_CHARGE (0) 2092#define FLUSHING_CACHED_CHARGE (0)
2012}; 2093};
2013static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2094static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2014static DEFINE_MUTEX(percpu_charge_mutex);
2015 2095
2016/* 2096/*
2017 * Try to consume stocked charge on this cpu. If success, one page is consumed 2097 * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -2079,19 +2159,14 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2079} 2159}
2080 2160
2081/* 2161/*
2082 * Tries to drain stocked charges in other cpus. This function is asynchronous 2162 * Drains all per-CPU charge caches for given root_mem resp. subtree
2083 * and just put a work per cpu for draining localy on each cpu. Caller can 2163 * of the hierarchy under it. sync flag says whether we should block
2084 * expects some charges will be back to res_counter later but cannot wait for 2164 * until the work is done.
2085 * it.
2086 */ 2165 */
2087static void drain_all_stock_async(struct mem_cgroup *root_mem) 2166static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2088{ 2167{
2089 int cpu, curcpu; 2168 int cpu, curcpu;
2090 /* 2169
2091 * If someone calls draining, avoid adding more kworker runs.
2092 */
2093 if (!mutex_trylock(&percpu_charge_mutex))
2094 return;
2095 /* Notify other cpus that system-wide "drain" is running */ 2170 /* Notify other cpus that system-wide "drain" is running */
2096 get_online_cpus(); 2171 get_online_cpus();
2097 /* 2172 /*
@@ -2105,34 +2180,48 @@ static void drain_all_stock_async(struct mem_cgroup *root_mem)
2105 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2180 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2106 struct mem_cgroup *mem; 2181 struct mem_cgroup *mem;
2107 2182
2108 if (cpu == curcpu)
2109 continue;
2110
2111 mem = stock->cached; 2183 mem = stock->cached;
2112 if (!mem) 2184 if (!mem || !stock->nr_pages)
2185 continue;
2186 if (!mem_cgroup_same_or_subtree(root_mem, mem))
2113 continue; 2187 continue;
2114 if (mem != root_mem) { 2188 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2115 if (!root_mem->use_hierarchy) 2189 if (cpu == curcpu)
2116 continue; 2190 drain_local_stock(&stock->work);
2117 /* check whether "mem" is under tree of "root_mem" */ 2191 else
2118 if (!css_is_ancestor(&mem->css, &root_mem->css)) 2192 schedule_work_on(cpu, &stock->work);
2119 continue;
2120 } 2193 }
2121 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2122 schedule_work_on(cpu, &stock->work);
2123 } 2194 }
2195
2196 if (!sync)
2197 goto out;
2198
2199 for_each_online_cpu(cpu) {
2200 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2201 if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
2202 test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2203 flush_work(&stock->work);
2204 }
2205out:
2124 put_online_cpus(); 2206 put_online_cpus();
2125 mutex_unlock(&percpu_charge_mutex); 2207}
2126 /* We don't wait for flush_work */ 2208
2209/*
2210 * Tries to drain stocked charges in other cpus. This function is asynchronous
2211 * and just put a work per cpu for draining localy on each cpu. Caller can
2212 * expects some charges will be back to res_counter later but cannot wait for
2213 * it.
2214 */
2215static void drain_all_stock_async(struct mem_cgroup *root_mem)
2216{
2217 drain_all_stock(root_mem, false);
2127} 2218}
2128 2219
2129/* This is a synchronous drain interface. */ 2220/* This is a synchronous drain interface. */
2130static void drain_all_stock_sync(void) 2221static void drain_all_stock_sync(struct mem_cgroup *root_mem)
2131{ 2222{
2132 /* called when force_empty is called */ 2223 /* called when force_empty is called */
2133 mutex_lock(&percpu_charge_mutex); 2224 drain_all_stock(root_mem, true);
2134 schedule_on_each_cpu(drain_local_stock);
2135 mutex_unlock(&percpu_charge_mutex);
2136} 2225}
2137 2226
2138/* 2227/*
@@ -3780,7 +3869,7 @@ move_account:
3780 goto out; 3869 goto out;
3781 /* This is for making all *used* pages to be on LRU. */ 3870 /* This is for making all *used* pages to be on LRU. */
3782 lru_add_drain_all(); 3871 lru_add_drain_all();
3783 drain_all_stock_sync(); 3872 drain_all_stock_sync(mem);
3784 ret = 0; 3873 ret = 0;
3785 mem_cgroup_start_move(mem); 3874 mem_cgroup_start_move(mem);
3786 for_each_node_state(node, N_HIGH_MEMORY) { 3875 for_each_node_state(node, N_HIGH_MEMORY) {
@@ -3819,14 +3908,18 @@ try_to_free:
3819 /* try to free all pages in this cgroup */ 3908 /* try to free all pages in this cgroup */
3820 shrink = 1; 3909 shrink = 1;
3821 while (nr_retries && mem->res.usage > 0) { 3910 while (nr_retries && mem->res.usage > 0) {
3911 struct memcg_scanrecord rec;
3822 int progress; 3912 int progress;
3823 3913
3824 if (signal_pending(current)) { 3914 if (signal_pending(current)) {
3825 ret = -EINTR; 3915 ret = -EINTR;
3826 goto out; 3916 goto out;
3827 } 3917 }
3918 rec.context = SCAN_BY_SHRINK;
3919 rec.mem = mem;
3920 rec.root = mem;
3828 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3921 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3829 false, get_swappiness(mem)); 3922 false, &rec);
3830 if (!progress) { 3923 if (!progress) {
3831 nr_retries--; 3924 nr_retries--;
3832 /* maybe some writeback is necessary */ 3925 /* maybe some writeback is necessary */
@@ -4152,15 +4245,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4152 s->stat[MCS_PGMAJFAULT] += val; 4245 s->stat[MCS_PGMAJFAULT] += val;
4153 4246
4154 /* per zone stat */ 4247 /* per zone stat */
4155 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4248 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
4156 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4249 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4157 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 4250 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
4158 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4251 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4159 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 4252 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
4160 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4253 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4161 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 4254 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
4162 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4255 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4163 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 4256 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
4164 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4257 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4165} 4258}
4166 4259
@@ -4182,35 +4275,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4182 struct cgroup *cont = m->private; 4275 struct cgroup *cont = m->private;
4183 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4276 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4184 4277
4185 total_nr = mem_cgroup_nr_lru_pages(mem_cont); 4278 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4186 seq_printf(m, "total=%lu", total_nr); 4279 seq_printf(m, "total=%lu", total_nr);
4187 for_each_node_state(nid, N_HIGH_MEMORY) { 4280 for_each_node_state(nid, N_HIGH_MEMORY) {
4188 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); 4281 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4189 seq_printf(m, " N%d=%lu", nid, node_nr); 4282 seq_printf(m, " N%d=%lu", nid, node_nr);
4190 } 4283 }
4191 seq_putc(m, '\n'); 4284 seq_putc(m, '\n');
4192 4285
4193 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); 4286 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4194 seq_printf(m, "file=%lu", file_nr); 4287 seq_printf(m, "file=%lu", file_nr);
4195 for_each_node_state(nid, N_HIGH_MEMORY) { 4288 for_each_node_state(nid, N_HIGH_MEMORY) {
4196 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); 4289 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4290 LRU_ALL_FILE);
4197 seq_printf(m, " N%d=%lu", nid, node_nr); 4291 seq_printf(m, " N%d=%lu", nid, node_nr);
4198 } 4292 }
4199 seq_putc(m, '\n'); 4293 seq_putc(m, '\n');
4200 4294
4201 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); 4295 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4202 seq_printf(m, "anon=%lu", anon_nr); 4296 seq_printf(m, "anon=%lu", anon_nr);
4203 for_each_node_state(nid, N_HIGH_MEMORY) { 4297 for_each_node_state(nid, N_HIGH_MEMORY) {
4204 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); 4298 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4299 LRU_ALL_ANON);
4205 seq_printf(m, " N%d=%lu", nid, node_nr); 4300 seq_printf(m, " N%d=%lu", nid, node_nr);
4206 } 4301 }
4207 seq_putc(m, '\n'); 4302 seq_putc(m, '\n');
4208 4303
4209 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); 4304 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4210 seq_printf(m, "unevictable=%lu", unevictable_nr); 4305 seq_printf(m, "unevictable=%lu", unevictable_nr);
4211 for_each_node_state(nid, N_HIGH_MEMORY) { 4306 for_each_node_state(nid, N_HIGH_MEMORY) {
4212 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, 4307 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4213 nid); 4308 BIT(LRU_UNEVICTABLE));
4214 seq_printf(m, " N%d=%lu", nid, node_nr); 4309 seq_printf(m, " N%d=%lu", nid, node_nr);
4215 } 4310 }
4216 seq_putc(m, '\n'); 4311 seq_putc(m, '\n');
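The new lru_mask argument packs LRU list ids into a bitmask. The convenience masks used above come from the memcg headers of this series; presumably they reduce to something like the following (shown for reference only):

#define LRU_ALL_FILE	(BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON	(BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL		((1 << NR_LRU_LISTS) - 1)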
@@ -4288,7 +4383,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4288{ 4383{
4289 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4384 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4290 4385
4291 return get_swappiness(memcg); 4386 return mem_cgroup_swappiness(memcg);
4292} 4387}
4293 4388
4294static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4389static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -4578,15 +4673,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4578 if (!event) 4673 if (!event)
4579 return -ENOMEM; 4674 return -ENOMEM;
4580 4675
4581 mutex_lock(&memcg_oom_mutex); 4676 spin_lock(&memcg_oom_lock);
4582 4677
4583 event->eventfd = eventfd; 4678 event->eventfd = eventfd;
4584 list_add(&event->list, &memcg->oom_notify); 4679 list_add(&event->list, &memcg->oom_notify);
4585 4680
4586 /* already in OOM ? */ 4681 /* already in OOM ? */
4587 if (atomic_read(&memcg->oom_lock)) 4682 if (atomic_read(&memcg->under_oom))
4588 eventfd_signal(eventfd, 1); 4683 eventfd_signal(eventfd, 1);
4589 mutex_unlock(&memcg_oom_mutex); 4684 spin_unlock(&memcg_oom_lock);
4590 4685
4591 return 0; 4686 return 0;
4592} 4687}
@@ -4600,7 +4695,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4600 4695
4601 BUG_ON(type != _OOM_TYPE); 4696 BUG_ON(type != _OOM_TYPE);
4602 4697
4603 mutex_lock(&memcg_oom_mutex); 4698 spin_lock(&memcg_oom_lock);
4604 4699
4605 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4700 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4606 if (ev->eventfd == eventfd) { 4701 if (ev->eventfd == eventfd) {
@@ -4609,7 +4704,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4609 } 4704 }
4610 } 4705 }
4611 4706
4612 mutex_unlock(&memcg_oom_mutex); 4707 spin_unlock(&memcg_oom_lock);
4613} 4708}
4614 4709
4615static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4710static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -4619,7 +4714,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4619 4714
4620 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4715 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4621 4716
4622 if (atomic_read(&mem->oom_lock)) 4717 if (atomic_read(&mem->under_oom))
4623 cb->fill(cb, "under_oom", 1); 4718 cb->fill(cb, "under_oom", 1);
4624 else 4719 else
4625 cb->fill(cb, "under_oom", 0); 4720 cb->fill(cb, "under_oom", 0);
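For context on under_oom: userspace consumes these notifications through the cgroup v1 event interface. A minimal, hypothetical sketch (the mount point and group name are assumptions, error handling trimmed):

#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int ofd = open("/sys/fs/cgroup/memory/mygroup/memory.oom_control", O_RDONLY);
	int cfd = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control", O_WRONLY);
	char buf[32];
	uint64_t cnt;

	if (efd < 0 || ofd < 0 || cfd < 0)
		return 1;
	/* "<eventfd> <fd of memory.oom_control>" registers the listener */
	snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
	if (write(cfd, buf, strlen(buf)) < 0)
		return 1;
	/* blocks until the group enters OOM and under_oom is signalled */
	if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))
		printf("memcg hit OOM %llu time(s)\n", (unsigned long long)cnt);
	return 0;
}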
@@ -4668,6 +4763,54 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4668} 4763}
4669#endif /* CONFIG_NUMA */ 4764#endif /* CONFIG_NUMA */
4670 4765
4766static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
4767 struct cftype *cft,
4768 struct cgroup_map_cb *cb)
4769{
4770 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4771 char string[64];
4772 int i;
4773
4774 for (i = 0; i < NR_SCANSTATS; i++) {
4775 strcpy(string, scanstat_string[i]);
4776 strcat(string, SCANSTAT_WORD_LIMIT);
4777 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
4778 }
4779
4780 for (i = 0; i < NR_SCANSTATS; i++) {
4781 strcpy(string, scanstat_string[i]);
4782 strcat(string, SCANSTAT_WORD_SYSTEM);
4783 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
4784 }
4785
4786 for (i = 0; i < NR_SCANSTATS; i++) {
4787 strcpy(string, scanstat_string[i]);
4788 strcat(string, SCANSTAT_WORD_LIMIT);
4789 strcat(string, SCANSTAT_WORD_HIERARCHY);
4790 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
4791 }
4792 for (i = 0; i < NR_SCANSTATS; i++) {
4793 strcpy(string, scanstat_string[i]);
4794 strcat(string, SCANSTAT_WORD_SYSTEM);
4795 strcat(string, SCANSTAT_WORD_HIERARCHY);
4796 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
4797 }
4798 return 0;
4799}
4800
4801static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
4802 unsigned int event)
4803{
4804 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4805
4806 spin_lock(&mem->scanstat.lock);
4807 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
4808 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
4809 spin_unlock(&mem->scanstat.lock);
4810 return 0;
4811}
4812
4813
4671static struct cftype mem_cgroup_files[] = { 4814static struct cftype mem_cgroup_files[] = {
4672 { 4815 {
4673 .name = "usage_in_bytes", 4816 .name = "usage_in_bytes",
@@ -4738,6 +4881,11 @@ static struct cftype mem_cgroup_files[] = {
4738 .mode = S_IRUGO, 4881 .mode = S_IRUGO,
4739 }, 4882 },
4740#endif 4883#endif
4884 {
4885 .name = "vmscan_stat",
4886 .read_map = mem_cgroup_vmscan_stat_read,
4887 .trigger = mem_cgroup_reset_vmscan_stat,
4888 },
4741}; 4889};
4742 4890
4743#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4891#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4997,10 +5145,11 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4997 INIT_LIST_HEAD(&mem->oom_notify); 5145 INIT_LIST_HEAD(&mem->oom_notify);
4998 5146
4999 if (parent) 5147 if (parent)
5000 mem->swappiness = get_swappiness(parent); 5148 mem->swappiness = mem_cgroup_swappiness(parent);
5001 atomic_set(&mem->refcnt, 1); 5149 atomic_set(&mem->refcnt, 1);
5002 mem->move_charge_at_immigrate = 0; 5150 mem->move_charge_at_immigrate = 0;
5003 mutex_init(&mem->thresholds_lock); 5151 mutex_init(&mem->thresholds_lock);
5152 spin_lock_init(&mem->scanstat.lock);
5004 return &mem->css; 5153 return &mem->css;
5005free_out: 5154free_out:
5006 __mem_cgroup_free(mem); 5155 __mem_cgroup_free(mem);
diff --git a/mm/memory.c b/mm/memory.c
index 9b8a01d941cb..a56e3ba816b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1290 return addr; 1290 return addr;
1291} 1291}
1292 1292
1293#ifdef CONFIG_PREEMPT
1294# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE)
1295#else
1296/* No preempt: go for improved straight-line efficiency */
1297# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE)
1298#endif
1299
1300/** 1293/**
1301 * unmap_vmas - unmap a range of memory covered by a list of vma's 1294 * unmap_vmas - unmap a range of memory covered by a list of vma's
1302 * @tlb: address of the caller's struct mmu_gather 1295 * @tlb: address of the caller's struct mmu_gather
@@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1310 * 1303 *
1311 * Unmap all pages in the vma list. 1304 * Unmap all pages in the vma list.
1312 * 1305 *
1313 * We aim to not hold locks for too long (for scheduling latency reasons).
1314 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
1315 * return the ending mmu_gather to the caller.
1316 *
1317 * Only addresses between `start' and `end' will be unmapped. 1306 * Only addresses between `start' and `end' will be unmapped.
1318 * 1307 *
1319 * The VMA list must be sorted in ascending virtual address order. 1308 * The VMA list must be sorted in ascending virtual address order.
@@ -1816,7 +1805,63 @@ next_page:
1816} 1805}
1817EXPORT_SYMBOL(__get_user_pages); 1806EXPORT_SYMBOL(__get_user_pages);
1818 1807
1819/** 1808/*
1809 * fixup_user_fault() - manually resolve a user page fault
1810 * @tsk: the task_struct to use for page fault accounting, or
1811 * NULL if faults are not to be recorded.
1812 * @mm: mm_struct of target mm
1813 * @address: user address
1814 * @fault_flags: flags to pass down to handle_mm_fault()
1815 *
1816 * This is meant to be called in the specific scenario where, for locking reasons,
1817 * we try to access user memory in atomic context (within a pagefault_disable()
1818 * section), the access returns -EFAULT, and we want to resolve the user fault before
1819 * trying again.
1820 *
1821 * Typically this is meant to be used by the futex code.
1822 *
1823 * The main difference with get_user_pages() is that this function will
1824 * unconditionally call handle_mm_fault() which will in turn perform all the
1825 * necessary SW fixup of the dirty and young bits in the PTE, while
1826 * get_user_pages() only guarantees to update these in the struct page.
1827 *
1828 * This is important for some architectures where those bits also gate the
1829 * access permission to the page because they are maintained in software. On
1830 * such architectures, gup() will not be enough to make a subsequent access
1831 * succeed.
1832 *
1833 * This should be called with the mmap_sem held for read.
1834 */
1835int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1836 unsigned long address, unsigned int fault_flags)
1837{
1838 struct vm_area_struct *vma;
1839 int ret;
1840
1841 vma = find_extend_vma(mm, address);
1842 if (!vma || address < vma->vm_start)
1843 return -EFAULT;
1844
1845 ret = handle_mm_fault(mm, vma, address, fault_flags);
1846 if (ret & VM_FAULT_ERROR) {
1847 if (ret & VM_FAULT_OOM)
1848 return -ENOMEM;
1849 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
1850 return -EHWPOISON;
1851 if (ret & VM_FAULT_SIGBUS)
1852 return -EFAULT;
1853 BUG();
1854 }
1855 if (tsk) {
1856 if (ret & VM_FAULT_MAJOR)
1857 tsk->maj_flt++;
1858 else
1859 tsk->min_flt++;
1860 }
1861 return 0;
1862}
1863
1864/*
1820 * get_user_pages() - pin user pages in memory 1865 * get_user_pages() - pin user pages in memory
1821 * @tsk: the task_struct to use for page fault accounting, or 1866 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded. 1867 * NULL if faults are not to be recorded.
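A minimal sketch of the futex-style calling pattern the comment above describes; example_read_user() is a hypothetical caller, while fixup_user_fault() is the new helper added in this hunk:

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

static int example_read_user(u32 __user *uaddr, u32 *val)
{
	struct mm_struct *mm = current->mm;
	int ret;

retry:
	/* fast path: page faults disabled, the copy must not sleep */
	pagefault_disable();
	ret = __copy_from_user_inatomic(val, uaddr, sizeof(*val)) ? -EFAULT : 0;
	pagefault_enable();
	if (!ret)
		return 0;

	/* slow path: resolve the fault explicitly, then retry the fast path */
	down_read(&mm->mmap_sem);
	ret = fixup_user_fault(current, mm, (unsigned long)uaddr, 0);
	up_read(&mm->mmap_sem);
	if (ret)
		return ret;
	goto retry;
}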
@@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3104 pte_t *page_table; 3149 pte_t *page_table;
3105 spinlock_t *ptl; 3150 spinlock_t *ptl;
3106 struct page *page; 3151 struct page *page;
3152 struct page *cow_page;
3107 pte_t entry; 3153 pte_t entry;
3108 int anon = 0; 3154 int anon = 0;
3109 int charged = 0;
3110 struct page *dirty_page = NULL; 3155 struct page *dirty_page = NULL;
3111 struct vm_fault vmf; 3156 struct vm_fault vmf;
3112 int ret; 3157 int ret;
3113 int page_mkwrite = 0; 3158 int page_mkwrite = 0;
3114 3159
3160 /*
3161 * If we do COW later, allocate the page before taking lock_page()
3162 * on the file cache page. This will reduce lock holding time.
3163 */
3164 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3165
3166 if (unlikely(anon_vma_prepare(vma)))
3167 return VM_FAULT_OOM;
3168
3169 cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
3170 if (!cow_page)
3171 return VM_FAULT_OOM;
3172
3173 if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
3174 page_cache_release(cow_page);
3175 return VM_FAULT_OOM;
3176 }
3177 } else
3178 cow_page = NULL;
3179
3115 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 3180 vmf.virtual_address = (void __user *)(address & PAGE_MASK);
3116 vmf.pgoff = pgoff; 3181 vmf.pgoff = pgoff;
3117 vmf.flags = flags; 3182 vmf.flags = flags;
@@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3120 ret = vma->vm_ops->fault(vma, &vmf); 3185 ret = vma->vm_ops->fault(vma, &vmf);
3121 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3186 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3122 VM_FAULT_RETRY))) 3187 VM_FAULT_RETRY)))
3123 return ret; 3188 goto uncharge_out;
3124 3189
3125 if (unlikely(PageHWPoison(vmf.page))) { 3190 if (unlikely(PageHWPoison(vmf.page))) {
3126 if (ret & VM_FAULT_LOCKED) 3191 if (ret & VM_FAULT_LOCKED)
3127 unlock_page(vmf.page); 3192 unlock_page(vmf.page);
3128 return VM_FAULT_HWPOISON; 3193 ret = VM_FAULT_HWPOISON;
3194 goto uncharge_out;
3129 } 3195 }
3130 3196
3131 /* 3197 /*
@@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3143 page = vmf.page; 3209 page = vmf.page;
3144 if (flags & FAULT_FLAG_WRITE) { 3210 if (flags & FAULT_FLAG_WRITE) {
3145 if (!(vma->vm_flags & VM_SHARED)) { 3211 if (!(vma->vm_flags & VM_SHARED)) {
3212 page = cow_page;
3146 anon = 1; 3213 anon = 1;
3147 if (unlikely(anon_vma_prepare(vma))) {
3148 ret = VM_FAULT_OOM;
3149 goto out;
3150 }
3151 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
3152 vma, address);
3153 if (!page) {
3154 ret = VM_FAULT_OOM;
3155 goto out;
3156 }
3157 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
3158 ret = VM_FAULT_OOM;
3159 page_cache_release(page);
3160 goto out;
3161 }
3162 charged = 1;
3163 copy_user_highpage(page, vmf.page, address, vma); 3214 copy_user_highpage(page, vmf.page, address, vma);
3164 __SetPageUptodate(page); 3215 __SetPageUptodate(page);
3165 } else { 3216 } else {
@@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 /* no need to invalidate: a not-present page won't be cached */ 3279 /* no need to invalidate: a not-present page won't be cached */
3229 update_mmu_cache(vma, address, page_table); 3280 update_mmu_cache(vma, address, page_table);
3230 } else { 3281 } else {
3231 if (charged) 3282 if (cow_page)
3232 mem_cgroup_uncharge_page(page); 3283 mem_cgroup_uncharge_page(cow_page);
3233 if (anon) 3284 if (anon)
3234 page_cache_release(page); 3285 page_cache_release(page);
3235 else 3286 else
@@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3238 3289
3239 pte_unmap_unlock(page_table, ptl); 3290 pte_unmap_unlock(page_table, ptl);
3240 3291
3241out:
3242 if (dirty_page) { 3292 if (dirty_page) {
3243 struct address_space *mapping = page->mapping; 3293 struct address_space *mapping = page->mapping;
3244 3294
@@ -3268,6 +3318,13 @@ out:
3268unwritable_page: 3318unwritable_page:
3269 page_cache_release(page); 3319 page_cache_release(page);
3270 return ret; 3320 return ret;
3321uncharge_out:
3322	/* the fs fault handler returned an error */
3323 if (cow_page) {
3324 mem_cgroup_uncharge_page(cow_page);
3325 page_cache_release(cow_page);
3326 }
3327 return ret;
3271} 3328}
3272 3329
3273static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3330static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c46887b5a11e..6e7d8b21dbfa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,17 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37/*
38 * online_page_callback contains a pointer to the current page onlining function.
39 * Initially it is generic_online_page(). If required, it can be changed by
40 * calling set_online_page_callback() to register a callback and
41 * restore_online_page_callback() to restore the generic callback.
42 */
43
44static void generic_online_page(struct page *page);
45
46static online_page_callback_t online_page_callback = generic_online_page;
47
37DEFINE_MUTEX(mem_hotplug_mutex); 48DEFINE_MUTEX(mem_hotplug_mutex);
38 49
39void lock_memory_hotplug(void) 50void lock_memory_hotplug(void)
@@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
361} 372}
362EXPORT_SYMBOL_GPL(__remove_pages); 373EXPORT_SYMBOL_GPL(__remove_pages);
363 374
364void online_page(struct page *page) 375int set_online_page_callback(online_page_callback_t callback)
376{
377 int rc = -EINVAL;
378
379 lock_memory_hotplug();
380
381 if (online_page_callback == generic_online_page) {
382 online_page_callback = callback;
383 rc = 0;
384 }
385
386 unlock_memory_hotplug();
387
388 return rc;
389}
390EXPORT_SYMBOL_GPL(set_online_page_callback);
391
392int restore_online_page_callback(online_page_callback_t callback)
393{
394 int rc = -EINVAL;
395
396 lock_memory_hotplug();
397
398 if (online_page_callback == callback) {
399 online_page_callback = generic_online_page;
400 rc = 0;
401 }
402
403 unlock_memory_hotplug();
404
405 return rc;
406}
407EXPORT_SYMBOL_GPL(restore_online_page_callback);
408
409void __online_page_set_limits(struct page *page)
365{ 410{
366 unsigned long pfn = page_to_pfn(page); 411 unsigned long pfn = page_to_pfn(page);
367 412
368 totalram_pages++;
369 if (pfn >= num_physpages) 413 if (pfn >= num_physpages)
370 num_physpages = pfn + 1; 414 num_physpages = pfn + 1;
415}
416EXPORT_SYMBOL_GPL(__online_page_set_limits);
417
418void __online_page_increment_counters(struct page *page)
419{
420 totalram_pages++;
371 421
372#ifdef CONFIG_HIGHMEM 422#ifdef CONFIG_HIGHMEM
373 if (PageHighMem(page)) 423 if (PageHighMem(page))
374 totalhigh_pages++; 424 totalhigh_pages++;
375#endif 425#endif
426}
427EXPORT_SYMBOL_GPL(__online_page_increment_counters);
376 428
429void __online_page_free(struct page *page)
430{
377 ClearPageReserved(page); 431 ClearPageReserved(page);
378 init_page_count(page); 432 init_page_count(page);
379 __free_page(page); 433 __free_page(page);
380} 434}
435EXPORT_SYMBOL_GPL(__online_page_free);
436
437static void generic_online_page(struct page *page)
438{
439 __online_page_set_limits(page);
440 __online_page_increment_counters(page);
441 __online_page_free(page);
442}
381 443
382static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 444static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
383 void *arg) 445 void *arg)
@@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
388 if (PageReserved(pfn_to_page(start_pfn))) 450 if (PageReserved(pfn_to_page(start_pfn)))
389 for (i = 0; i < nr_pages; i++) { 451 for (i = 0; i < nr_pages; i++) {
390 page = pfn_to_page(start_pfn + i); 452 page = pfn_to_page(start_pfn + i);
391 online_page(page); 453 (*online_page_callback)(page);
392 onlined_pages++; 454 onlined_pages++;
393 } 455 }
394 *(unsigned long *)arg = onlined_pages; 456 *(unsigned long *)arg = onlined_pages;
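The split of generic_online_page() into exported __online_page_*() helpers exists so a driver (a ballooning driver, say) can intercept pages as they come online. A hypothetical registration follows; my_claim_page() and the module hooks are illustrative only:

#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/module.h>

static bool my_claim_page(struct page *page)
{
	return false;	/* hypothetical policy: keep nothing, pass everything on */
}

static void my_online_page(struct page *page)
{
	__online_page_set_limits(page);
	if (my_claim_page(page))
		return;			/* the page stays with the driver */
	__online_page_increment_counters(page);
	__online_page_free(page);	/* otherwise hand it to the buddy allocator */
}

static int __init my_init(void)
{
	/* returns -EINVAL if another callback is already registered */
	return set_online_page_callback(&my_online_page);
}

static void __exit my_exit(void)
{
	restore_online_page_callback(&my_online_page);
}

module_init(my_init);
module_exit(my_exit);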
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7fb9d25c54e..8b57173c1dd5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,7 @@
93 93
94#include <asm/tlbflush.h> 94#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 95#include <asm/uaccess.h>
96#include <linux/random.h>
96 97
97#include "internal.h" 98#include "internal.h"
98 99
@@ -1645,6 +1646,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1645 return interleave_nodes(pol); 1646 return interleave_nodes(pol);
1646} 1647}
1647 1648
1649/*
1650 * Return the bit number of a random bit set in the nodemask.
1651 * (returns -1 if nodemask is empty)
1652 */
1653int node_random(const nodemask_t *maskp)
1654{
1655 int w, bit = -1;
1656
1657 w = nodes_weight(*maskp);
1658 if (w)
1659 bit = bitmap_ord_to_pos(maskp->bits,
1660 get_random_int() % w, MAX_NUMNODES);
1661 return bit;
1662}
1663
1648#ifdef CONFIG_HUGETLBFS 1664#ifdef CONFIG_HUGETLBFS
1649/* 1665/*
1650 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1666 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
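node_random() gives callers a stateless way to spread allocations across a nodemask. A hypothetical wrapper, falling back to the local node when the mask is empty:

#include <linux/nodemask.h>
#include <linux/topology.h>

static int pick_allocation_node(const nodemask_t *allowed)
{
	int nid = node_random(allowed);	/* -1 if the nodemask is empty */

	if (nid < 0)
		nid = numa_node_id();
	return nid;
}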
diff --git a/mm/mmap.c b/mm/mmap.c
index d49736ff8a8d..a65efd4db3e1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
122 return 0; 122 return 0;
123 123
124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 124 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
125 unsigned long n; 125 free = global_page_state(NR_FREE_PAGES);
126 free += global_page_state(NR_FILE_PAGES);
127
128 /*
129 * shmem pages shouldn't be counted as free in this
130 * case, they can't be purged, only swapped out, and
131 * that won't affect the overall amount of available
132 * memory in the system.
133 */
134 free -= global_page_state(NR_SHMEM);
126 135
127 free = global_page_state(NR_FILE_PAGES);
128 free += nr_swap_pages; 136 free += nr_swap_pages;
129 137
130 /* 138 /*
@@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
136 free += global_page_state(NR_SLAB_RECLAIMABLE); 144 free += global_page_state(NR_SLAB_RECLAIMABLE);
137 145
138 /* 146 /*
139 * Leave the last 3% for root
140 */
141 if (!cap_sys_admin)
142 free -= free / 32;
143
144 if (free > pages)
145 return 0;
146
147 /*
148 * nr_free_pages() is very expensive on large systems,
149 * only call if we're about to fail.
150 */
151 n = nr_free_pages();
152
153 /*
154 * Leave reserved pages. The pages are not for anonymous pages. 147 * Leave reserved pages. The pages are not for anonymous pages.
155 */ 148 */
156 if (n <= totalreserve_pages) 149 if (free <= totalreserve_pages)
157 goto error; 150 goto error;
158 else 151 else
159 n -= totalreserve_pages; 152 free -= totalreserve_pages;
160 153
161 /* 154 /*
162 * Leave the last 3% for root 155 * Leave the last 3% for root
163 */ 156 */
164 if (!cap_sys_admin) 157 if (!cap_sys_admin)
165 n -= n / 32; 158 free -= free / 32;
166 free += n;
167 159
168 if (free > pages) 160 if (free > pages)
169 return 0; 161 return 0;
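Condensed view of what the OVERCOMMIT_GUESS branch computes after this change (the identical rework is applied to the nommu variant below); a sketch of the arithmetic, not the literal function:

static int overcommit_guess(long pages, int cap_sys_admin)
{
	unsigned long free;

	free  = global_page_state(NR_FREE_PAGES);
	free += global_page_state(NR_FILE_PAGES);
	/* shmem can only be swapped out, not dropped, so it is not "free" */
	free -= global_page_state(NR_SHMEM);
	free += nr_swap_pages;
	free += global_page_state(NR_SLAB_RECLAIMABLE);

	/* reserved pages are not available for anonymous allocations */
	if (free <= totalreserve_pages)
		return -ENOMEM;
	free -= totalreserve_pages;

	if (!cap_sys_admin)
		free -= free / 32;	/* leave the last 3% for root */

	return free > pages ? 0 : -ENOMEM;
}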
diff --git a/mm/nommu.c b/mm/nommu.c
index 9edc897a3970..4358032566e9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,6 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 25#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
@@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file,
1087 * it's being traced - otherwise breakpoints set in it may interfere 1086 * it's being traced - otherwise breakpoints set in it may interfere
1088 * with another untraced process 1087 * with another untraced process
1089 */ 1088 */
1090 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) 1089 if ((flags & MAP_PRIVATE) && current->ptrace)
1091 vm_flags &= ~VM_MAYSHARE; 1090 vm_flags &= ~VM_MAYSHARE;
1092 1091
1093 return vm_flags; 1092 return vm_flags;
@@ -1885,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1885 return 0; 1884 return 0;
1886 1885
1887 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { 1886 if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
1888 unsigned long n; 1887 free = global_page_state(NR_FREE_PAGES);
1888 free += global_page_state(NR_FILE_PAGES);
1889
1890 /*
1891 * shmem pages shouldn't be counted as free in this
1892 * case, they can't be purged, only swapped out, and
1893 * that won't affect the overall amount of available
1894 * memory in the system.
1895 */
1896 free -= global_page_state(NR_SHMEM);
1889 1897
1890 free = global_page_state(NR_FILE_PAGES);
1891 free += nr_swap_pages; 1898 free += nr_swap_pages;
1892 1899
1893 /* 1900 /*
@@ -1899,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1899 free += global_page_state(NR_SLAB_RECLAIMABLE); 1906 free += global_page_state(NR_SLAB_RECLAIMABLE);
1900 1907
1901 /* 1908 /*
1902 * Leave the last 3% for root
1903 */
1904 if (!cap_sys_admin)
1905 free -= free / 32;
1906
1907 if (free > pages)
1908 return 0;
1909
1910 /*
1911 * nr_free_pages() is very expensive on large systems,
1912 * only call if we're about to fail.
1913 */
1914 n = nr_free_pages();
1915
1916 /*
1917 * Leave reserved pages. The pages are not for anonymous pages. 1909 * Leave reserved pages. The pages are not for anonymous pages.
1918 */ 1910 */
1919 if (n <= totalreserve_pages) 1911 if (free <= totalreserve_pages)
1920 goto error; 1912 goto error;
1921 else 1913 else
1922 n -= totalreserve_pages; 1914 free -= totalreserve_pages;
1923 1915
1924 /* 1916 /*
1925 * Leave the last 3% for root 1917 * Leave the last 3% for root
1926 */ 1918 */
1927 if (!cap_sys_admin) 1919 if (!cap_sys_admin)
1928 n -= n / 32; 1920 free -= free / 32;
1929 free += n;
1930 1921
1931 if (free > pages) 1922 if (free > pages)
1932 return 0; 1923 return 0;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index e4b0991ca351..eafff89b3dd6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
339 * then wait for it to finish before killing 339 * then wait for it to finish before killing
340 * some other task unnecessarily. 340 * some other task unnecessarily.
341 */ 341 */
342 if (!(task_ptrace(p->group_leader) & 342 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
343 PT_TRACE_EXIT))
344 return ERR_PTR(-1UL); 343 return ERR_PTR(-1UL);
345 } 344 }
346 } 345 }
@@ -488,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
488 487
489 /* 488 /*
490 * If any of p's children has a different mm and is eligible for kill, 489 * If any of p's children has a different mm and is eligible for kill,
491 * the one with the highest badness() score is sacrificed for its 490 * the one with the highest oom_badness() score is sacrificed for its
492 * parent. This attempts to lose the minimal amount of work done while 491 * parent. This attempts to lose the minimal amount of work done while
493 * still freeing memory. 492 * still freeing memory.
494 */ 493 */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f698862420..d1960744f881 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 */
42#define MAX_PAUSE max(HZ/5, 1)
43
44/*
45 * Estimate write bandwidth at 200ms intervals.
46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48
49/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 * will look to see if it needs to force writeback or throttling. 51 * will look to see if it needs to force writeback or throttling.
42 */ 52 */
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
111 121
112/* End of sysctl-exported parameters */ 122/* End of sysctl-exported parameters */
113 123
124unsigned long global_dirty_limit;
114 125
115/* 126/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 127 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 230 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 231static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 232{
233 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 235 bdi->max_prop_frac);
224} 236}
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 256static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 257 long *numerator, long *denominator)
246{ 258{
247 if (bdi_cap_writeback_dirty(bdi)) { 259 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 260 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254} 261}
255 262
256static inline void task_dirties_fraction(struct task_struct *tsk, 263static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled. 282 * dirty threshold may never get throttled.
276 */ 283 */
284#define TASK_LIMIT_FRACTION 8
277static unsigned long task_dirty_limit(struct task_struct *tsk, 285static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty) 286 unsigned long bdi_dirty)
279{ 287{
280 long numerator, denominator; 288 long numerator, denominator;
281 unsigned long dirty = bdi_dirty; 289 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3; 290 u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291
284 task_dirties_fraction(tsk, &numerator, &denominator); 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator; 293 inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
290 return max(dirty, bdi_dirty/2); 298 return max(dirty, bdi_dirty/2);
291} 299}
292 300
301/* Minimum limit for any task */
302static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303{
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305}
306
293/* 307/*
294 * 308 *
295 */ 309 */
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 411 return x + 1; /* Ensure that we never return 0 */
398} 412}
399 413
414static unsigned long hard_dirty_limit(unsigned long thresh)
415{
416 return max(thresh, global_dirty_limit);
417}
418
400/* 419/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 421 *
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 454 }
436 *pbackground = background; 455 *pbackground = background;
437 *pdirty = dirty; 456 *pdirty = dirty;
457 trace_global_dirty_state(background, dirty);
438} 458}
439 459
440/* 460/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
462 * @bdi: the backing_dev_info to query
463 * @dirty: global dirty limit in pages
464 *
465 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
466 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
467 * And the "limit" in the name is not seriously taken as hard limit in
468 * balance_dirty_pages().
442 * 469 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 470 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 471 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 473 *
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
468 return bdi_dirty; 495 return bdi_dirty;
469} 496}
470 497
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed,
500 unsigned long written)
501{
502 const unsigned long period = roundup_pow_of_two(3 * HZ);
503 unsigned long avg = bdi->avg_write_bandwidth;
504 unsigned long old = bdi->write_bandwidth;
505 u64 bw;
506
507 /*
508 * bw = written * HZ / elapsed
509 *
510 * bw * elapsed + write_bandwidth * (period - elapsed)
511 * write_bandwidth = ---------------------------------------------------
512 * period
513 */
514 bw = written - bdi->written_stamp;
515 bw *= HZ;
516 if (unlikely(elapsed > period)) {
517 do_div(bw, elapsed);
518 avg = bw;
519 goto out;
520 }
521 bw += (u64)bdi->write_bandwidth * (period - elapsed);
522 bw >>= ilog2(period);
523
524 /*
525 * one more level of smoothing, for filtering out sudden spikes
526 */
527 if (avg > old && old >= (unsigned long)bw)
528 avg -= (avg - old) >> 3;
529
530 if (avg < old && old <= (unsigned long)bw)
531 avg += (old - avg) >> 3;
532
533out:
534 bdi->write_bandwidth = bw;
535 bdi->avg_write_bandwidth = avg;
536}
537
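A worked instance of the interpolation in bdi_update_write_bandwidth() above, using assumed numbers (HZ = 1000, so period = roundup_pow_of_two(3 * HZ) = 4096 jiffies); written as a standalone program so the arithmetic can be checked directly:

#include <stdio.h>

int main(void)
{
	const unsigned long HZ = 1000, period = 4096;
	unsigned long elapsed = 256;	/* jiffies since bw_time_stamp */
	unsigned long written = 25600;	/* pages completed in that window */
	unsigned long old_bw = 30000;	/* previous write_bandwidth, pages/s */
	unsigned long long bw;

	/* instantaneous rate: 25600 * 1000 / 256 = 100000 pages/s */
	bw = (unsigned long long)written * HZ;
	/* blend it with the old estimate, weighted by elapsed/period */
	bw += (unsigned long long)old_bw * (period - elapsed);
	bw >>= 12;			/* ilog2(4096) */

	/* prints 34375: only elapsed/period of the spike is folded in */
	printf("new write_bandwidth = %llu pages/s\n", bw);
	return 0;
}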
538/*
539 * The global dirtyable memory and dirty threshold could be suddenly knocked
540 * down by a large amount (eg. on the startup of KVM in a swapless system).
541 * This may throw the system into deep dirty exceeded state and throttle
542 * heavy/light dirtiers alike. To retain good responsiveness, maintain
543 * global_dirty_limit, which tracks slowly down to the knocked-down dirty
544 * threshold.
545 */
546static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
547{
548 unsigned long limit = global_dirty_limit;
549
550 /*
551 * Follow up in one step.
552 */
553 if (limit < thresh) {
554 limit = thresh;
555 goto update;
556 }
557
558 /*
559 * Follow down slowly. Use the higher one as the target, because thresh
560 * may drop below dirty. This is exactly the reason to introduce
561 * global_dirty_limit which is guaranteed to lie above the dirty pages.
562 */
563 thresh = max(thresh, dirty);
564 if (limit > thresh) {
565 limit -= (limit - thresh) >> 5;
566 goto update;
567 }
568 return;
569update:
570 global_dirty_limit = limit;
571}
572
573static void global_update_bandwidth(unsigned long thresh,
574 unsigned long dirty,
575 unsigned long now)
576{
577 static DEFINE_SPINLOCK(dirty_lock);
578 static unsigned long update_time;
579
580 /*
581 * check locklessly first to optimize away locking for the most time
582 */
583 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
584 return;
585
586 spin_lock(&dirty_lock);
587 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
588 update_dirty_limit(thresh, dirty);
589 update_time = now;
590 }
591 spin_unlock(&dirty_lock);
592}
593
594void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh,
596 unsigned long dirty,
597 unsigned long bdi_thresh,
598 unsigned long bdi_dirty,
599 unsigned long start_time)
600{
601 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp;
603 unsigned long written;
604
605 /*
606 * rate-limit, only update once every 200ms.
607 */
608 if (elapsed < BANDWIDTH_INTERVAL)
609 return;
610
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612
613 /*
614 * Skip quiet periods when disk bandwidth is under-utilized.
615 * (at least 1s idle time between two flusher runs)
616 */
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot;
619
620 if (thresh)
621 global_update_bandwidth(thresh, dirty, now);
622
623 bdi_update_write_bandwidth(bdi, elapsed, written);
624
625snapshot:
626 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now;
628}
629
630static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh,
632 unsigned long dirty,
633 unsigned long bdi_thresh,
634 unsigned long bdi_dirty,
635 unsigned long start_time)
636{
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return;
639 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
641 start_time);
642 spin_unlock(&bdi->wb.list_lock);
643}
644
471/* 645/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 647 * data. It looks at the number of dirty pages in the machine and will force
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
478static void balance_dirty_pages(struct address_space *mapping, 652static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 653 unsigned long write_chunk)
480{ 654{
481 long nr_reclaimable, bdi_nr_reclaimable; 655 unsigned long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty;
483 unsigned long background_thresh; 658 unsigned long background_thresh;
484 unsigned long dirty_thresh; 659 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 660 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh;
662 unsigned long min_task_bdi_thresh;
486 unsigned long pages_written = 0; 663 unsigned long pages_written = 0;
487 unsigned long pause = 1; 664 unsigned long pause = 1;
488 bool dirty_exceeded = false; 665 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies;
490 669
491 for (;;) { 670 for (;;) {
492 struct writeback_control wbc = {
493 .sync_mode = WB_SYNC_NONE,
494 .older_than_this = NULL,
495 .nr_to_write = write_chunk,
496 .range_cyclic = 1,
497 };
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 672 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674
503 global_dirty_limits(&background_thresh, &dirty_thresh); 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 679 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 680 * when the bdi limits are ramping up.
509 */ 681 */
510 if (nr_reclaimable + nr_writeback <= 682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
511 (background_thresh + dirty_thresh) / 2)
512 break; 683 break;
513 684
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh); 686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688
517 /* 689 /*
518 * In order to avoid the stacked BDI deadlock we need 690 * In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 696 * actually dirty; with m+n sitting in the percpu
525 * deltas. 697 * deltas.
526 */ 698 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 701 bdi_dirty = bdi_nr_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 703 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 705 bdi_dirty = bdi_nr_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK);
533 } 707 }
534 708
535 /* 709 /*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
538 * bdi or process from holding back light ones; The latter is 712 * bdi or process from holding back light ones; The latter is
539 * the last resort safeguard. 713 * the last resort safeguard.
540 */ 714 */
541 dirty_exceeded = 715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) 716 (nr_dirty > dirty_thresh);
543 || (nr_reclaimable + nr_writeback > dirty_thresh); 717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
718 (nr_dirty <= dirty_thresh);
544 719
545 if (!dirty_exceeded) 720 if (!dirty_exceeded)
546 break; 721 break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
548 if (!bdi->dirty_exceeded) 723 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 724 bdi->dirty_exceeded = 1;
550 725
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
727 bdi_thresh, bdi_dirty, start_time);
728
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 * Unstable writes are a feature of certain networked 730 * Unstable writes are a feature of certain networked
553 * filesystems (i.e. NFS) in which data may have been 731 * filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,40 @@ static void balance_dirty_pages(struct address_space *mapping,
557 * threshold otherwise wait until the disk writes catch 735 * threshold otherwise wait until the disk writes catch
558 * up. 736 * up.
559 */ 737 */
560 trace_wbc_balance_dirty_start(&wbc, bdi); 738 trace_balance_dirty_start(bdi);
561 if (bdi_nr_reclaimable > bdi_thresh) { 739 if (bdi_nr_reclaimable > task_bdi_thresh) {
562 writeback_inodes_wb(&bdi->wb, &wbc); 740 pages_written += writeback_inodes_wb(&bdi->wb,
563 pages_written += write_chunk - wbc.nr_to_write; 741 write_chunk);
564 trace_wbc_balance_dirty_written(&wbc, bdi); 742 trace_balance_dirty_written(bdi, pages_written);
565 if (pages_written >= write_chunk) 743 if (pages_written >= write_chunk)
566 break; /* We've done our duty */ 744 break; /* We've done our duty */
567 } 745 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 747 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
749
750 dirty_thresh = hard_dirty_limit(dirty_thresh);
751 /*
752 * max-pause area. If dirty exceeded but still within this
753 * area, no need to sleep for more than 200ms: (a) 8 pages per
754 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive.
756 */
757 if (nr_dirty < dirty_thresh +
758 dirty_thresh / DIRTY_MAXPAUSE_AREA &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
761 /*
762 * pass-good area. When some bdi gets blocked (eg. NFS server
763 * not responding), or write bandwidth dropped dramatically due
764 * to concurrent reads, or dirty threshold suddenly dropped and
765 * the dirty pages cannot be brought down anytime soon (eg. on
766 * slow USB stick), at least let go of the good bdi's.
767 */
768 if (nr_dirty < dirty_thresh +
769 dirty_thresh / DIRTY_PASSGOOD_AREA &&
770 bdi_dirty < bdi_thresh)
771 break;
571 772
572 /* 773 /*
573 * Increase the delay for each loop, up to our previous 774 * Increase the delay for each loop, up to our previous
@@ -578,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping,
578 pause = HZ / 10; 779 pause = HZ / 10;
579 } 780 }
580 781
581 if (!dirty_exceeded && bdi->dirty_exceeded) 782 /* Clear dirty_exceeded flag only when no task can exceed the limit */
783 if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 784 bdi->dirty_exceeded = 0;
583 785
584 if (writeback_in_progress(bdi)) 786 if (writeback_in_progress(bdi))
@@ -626,9 +828,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 828void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 829 unsigned long nr_pages_dirtied)
628{ 830{
831 struct backing_dev_info *bdi = mapping->backing_dev_info;
629 unsigned long ratelimit; 832 unsigned long ratelimit;
630 unsigned long *p; 833 unsigned long *p;
631 834
835 if (!bdi_cap_account_dirty(bdi))
836 return;
837
632 ratelimit = ratelimit_pages; 838 ratelimit = ratelimit_pages;
633 if (mapping->backing_dev_info->dirty_exceeded) 839 if (mapping->backing_dev_info->dirty_exceeded)
634 ratelimit = 8; 840 ratelimit = 8;
@@ -892,12 +1098,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1098 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1099 cycled = 1; /* ignore range_cyclic tests */
894 } 1100 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1101 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1102 tag = PAGECACHE_TAG_TOWRITE;
897 else 1103 else
898 tag = PAGECACHE_TAG_DIRTY; 1104 tag = PAGECACHE_TAG_DIRTY;
899retry: 1105retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1106 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1107 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1108 done_index = index;
903 while (!done && (index <= end)) { 1109 while (!done && (index <= end)) {
@@ -1141,7 +1347,6 @@ EXPORT_SYMBOL(account_page_dirtied);
1141void account_page_writeback(struct page *page) 1347void account_page_writeback(struct page *page)
1142{ 1348{
1143 inc_zone_page_state(page, NR_WRITEBACK); 1349 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145} 1350}
1146EXPORT_SYMBOL(account_page_writeback); 1351EXPORT_SYMBOL(account_page_writeback);
1147 1352
@@ -1358,8 +1563,10 @@ int test_clear_page_writeback(struct page *page)
1358 } else { 1563 } else {
1359 ret = TestClearPageWriteback(page); 1564 ret = TestClearPageWriteback(page);
1360 } 1565 }
1361 if (ret) 1566 if (ret) {
1362 dec_zone_page_state(page, NR_WRITEBACK); 1567 dec_zone_page_state(page, NR_WRITEBACK);
1568 inc_zone_page_state(page, NR_WRITTEN);
1569 }
1363 return ret; 1570 return ret;
1364} 1571}
1365 1572
@@ -1405,10 +1612,6 @@ EXPORT_SYMBOL(test_set_page_writeback);
1405 */ 1612 */
1406int mapping_tagged(struct address_space *mapping, int tag) 1613int mapping_tagged(struct address_space *mapping, int tag)
1407{ 1614{
1408 int ret; 1615 return radix_tree_tagged(&mapping->page_tree, tag);
1409 rcu_read_lock();
1410 ret = radix_tree_tagged(&mapping->page_tree, tag);
1411 rcu_read_unlock();
1412 return ret;
1413} 1616}
1414EXPORT_SYMBOL(mapping_tagged); 1617EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e8985acdab8..1dbcf8888f14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1370,21 +1370,12 @@ failed:
1370 1370
1371#ifdef CONFIG_FAIL_PAGE_ALLOC 1371#ifdef CONFIG_FAIL_PAGE_ALLOC
1372 1372
1373static struct fail_page_alloc_attr { 1373static struct {
1374 struct fault_attr attr; 1374 struct fault_attr attr;
1375 1375
1376 u32 ignore_gfp_highmem; 1376 u32 ignore_gfp_highmem;
1377 u32 ignore_gfp_wait; 1377 u32 ignore_gfp_wait;
1378 u32 min_order; 1378 u32 min_order;
1379
1380#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1381
1382 struct dentry *ignore_gfp_highmem_file;
1383 struct dentry *ignore_gfp_wait_file;
1384 struct dentry *min_order_file;
1385
1386#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1387
1388} fail_page_alloc = { 1379} fail_page_alloc = {
1389 .attr = FAULT_ATTR_INITIALIZER, 1380 .attr = FAULT_ATTR_INITIALIZER,
1390 .ignore_gfp_wait = 1, 1381 .ignore_gfp_wait = 1,
@@ -1424,30 +1415,24 @@ static int __init fail_page_alloc_debugfs(void)
1424 "fail_page_alloc"); 1415 "fail_page_alloc");
1425 if (err) 1416 if (err)
1426 return err; 1417 return err;
1427 dir = fail_page_alloc.attr.dentries.dir;
1428
1429 fail_page_alloc.ignore_gfp_wait_file =
1430 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1431 &fail_page_alloc.ignore_gfp_wait);
1432
1433 fail_page_alloc.ignore_gfp_highmem_file =
1434 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1435 &fail_page_alloc.ignore_gfp_highmem);
1436 fail_page_alloc.min_order_file =
1437 debugfs_create_u32("min-order", mode, dir,
1438 &fail_page_alloc.min_order);
1439
1440 if (!fail_page_alloc.ignore_gfp_wait_file ||
1441 !fail_page_alloc.ignore_gfp_highmem_file ||
1442 !fail_page_alloc.min_order_file) {
1443 err = -ENOMEM;
1444 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1445 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1446 debugfs_remove(fail_page_alloc.min_order_file);
1447 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1448 }
1449 1418
1450 return err; 1419 dir = fail_page_alloc.attr.dir;
1420
1421 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1422 &fail_page_alloc.ignore_gfp_wait))
1423 goto fail;
1424 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1425 &fail_page_alloc.ignore_gfp_highmem))
1426 goto fail;
1427 if (!debugfs_create_u32("min-order", mode, dir,
1428 &fail_page_alloc.min_order))
1429 goto fail;
1430
1431 return 0;
1432fail:
1433 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1434
1435 return -ENOMEM;
1451} 1436}
1452 1437
1453late_initcall(fail_page_alloc_debugfs); 1438late_initcall(fail_page_alloc_debugfs);
@@ -1616,6 +1601,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1616 set_bit(i, zlc->fullzones); 1601 set_bit(i, zlc->fullzones);
1617} 1602}
1618 1603
1604/*
1605 * clear all zones full, called after direct reclaim makes progress so that
1606 * a zone that was recently full is not skipped over for up to a second
1607 */
1608static void zlc_clear_zones_full(struct zonelist *zonelist)
1609{
1610 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1611
1612 zlc = zonelist->zlcache_ptr;
1613 if (!zlc)
1614 return;
1615
1616 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1617}
1618
1619#else /* CONFIG_NUMA */ 1619#else /* CONFIG_NUMA */
1620 1620
1621static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1621static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
@@ -1632,6 +1632,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1632static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) 1632static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1633{ 1633{
1634} 1634}
1635
1636static void zlc_clear_zones_full(struct zonelist *zonelist)
1637{
1638}
1635#endif /* CONFIG_NUMA */ 1639#endif /* CONFIG_NUMA */
1636 1640
1637/* 1641/*
@@ -1664,7 +1668,7 @@ zonelist_scan:
1664 continue; 1668 continue;
1665 if ((alloc_flags & ALLOC_CPUSET) && 1669 if ((alloc_flags & ALLOC_CPUSET) &&
1666 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1670 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1667 goto try_next_zone; 1671 continue;
1668 1672
1669 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1673 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1670 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1674 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
@@ -1676,17 +1680,36 @@ zonelist_scan:
1676 classzone_idx, alloc_flags)) 1680 classzone_idx, alloc_flags))
1677 goto try_this_zone; 1681 goto try_this_zone;
1678 1682
1683 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1684 /*
1685 * we do zlc_setup if there are multiple nodes
1686 * and before considering the first zone allowed
1687 * by the cpuset.
1688 */
1689 allowednodes = zlc_setup(zonelist, alloc_flags);
1690 zlc_active = 1;
1691 did_zlc_setup = 1;
1692 }
1693
1679 if (zone_reclaim_mode == 0) 1694 if (zone_reclaim_mode == 0)
1680 goto this_zone_full; 1695 goto this_zone_full;
1681 1696
1697 /*
1698 * As we may have just activated ZLC, check if the first
1699 * eligible zone has failed zone_reclaim recently.
1700 */
1701 if (NUMA_BUILD && zlc_active &&
1702 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1703 continue;
1704
1682 ret = zone_reclaim(zone, gfp_mask, order); 1705 ret = zone_reclaim(zone, gfp_mask, order);
1683 switch (ret) { 1706 switch (ret) {
1684 case ZONE_RECLAIM_NOSCAN: 1707 case ZONE_RECLAIM_NOSCAN:
1685 /* did not scan */ 1708 /* did not scan */
1686 goto try_next_zone; 1709 continue;
1687 case ZONE_RECLAIM_FULL: 1710 case ZONE_RECLAIM_FULL:
1688 /* scanned but unreclaimable */ 1711 /* scanned but unreclaimable */
1689 goto this_zone_full; 1712 continue;
1690 default: 1713 default:
1691 /* did we reclaim enough */ 1714 /* did we reclaim enough */
1692 if (!zone_watermark_ok(zone, order, mark, 1715 if (!zone_watermark_ok(zone, order, mark,
@@ -1703,16 +1726,6 @@ try_this_zone:
1703this_zone_full: 1726this_zone_full:
1704 if (NUMA_BUILD) 1727 if (NUMA_BUILD)
1705 zlc_mark_zone_full(zonelist, z); 1728 zlc_mark_zone_full(zonelist, z);
1706try_next_zone:
1707 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1708 /*
1709 * we do zlc_setup after the first zone is tried but only
1710 * if there are multiple nodes make it worthwhile
1711 */
1712 allowednodes = zlc_setup(zonelist, alloc_flags);
1713 zlc_active = 1;
1714 did_zlc_setup = 1;
1715 }
1716 } 1729 }
1717 1730
1718 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1731 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
@@ -1954,6 +1967,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1954 if (unlikely(!(*did_some_progress))) 1967 if (unlikely(!(*did_some_progress)))
1955 return NULL; 1968 return NULL;
1956 1969
1970 /* After successful reclaim, reconsider all zones for allocation */
1971 if (NUMA_BUILD)
1972 zlc_clear_zones_full(zonelist);
1973
1957retry: 1974retry:
1958 page = get_page_from_freelist(gfp_mask, nodemask, order, 1975 page = get_page_from_freelist(gfp_mask, nodemask, order,
1959 zonelist, high_zoneidx, 1976 zonelist, high_zoneidx,
@@ -4585,6 +4602,60 @@ void __init sort_node_map(void)
4585 cmp_node_active_region, NULL); 4602 cmp_node_active_region, NULL);
4586} 4603}
4587 4604
4605/**
4606 * node_map_pfn_alignment - determine the maximum internode alignment
4607 *
4608 * This function should be called after node map is populated and sorted.
4609 * It calculates the maximum power of two alignment which can distinguish
4610 * all the nodes.
4611 *
4612 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
4613 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the
4614 * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is
4615 * shifted, 1GiB is enough and this function will indicate so.
4616 *
4617 * This is used to test whether pfn -> nid mapping of the chosen memory
4618 * model has fine enough granularity to avoid incorrect mapping for the
4619 * populated node map.
4620 *
4621 * Returns the determined alignment in pfn's. 0 if there is no alignment
4622 * requirement (single node).
4623 */
4624unsigned long __init node_map_pfn_alignment(void)
4625{
4626 unsigned long accl_mask = 0, last_end = 0;
4627 int last_nid = -1;
4628 int i;
4629
4630 for_each_active_range_index_in_nid(i, MAX_NUMNODES) {
4631 int nid = early_node_map[i].nid;
4632 unsigned long start = early_node_map[i].start_pfn;
4633 unsigned long end = early_node_map[i].end_pfn;
4634 unsigned long mask;
4635
4636 if (!start || last_nid < 0 || last_nid == nid) {
4637 last_nid = nid;
4638 last_end = end;
4639 continue;
4640 }
4641
4642 /*
4643 * Start with a mask granular enough to pin-point to the
4644 * start pfn and tick off bits one-by-one until it becomes
4645 * too coarse to separate the current node from the last.
4646 */
4647 mask = ~((1 << __ffs(start)) - 1);
4648 while (mask && last_end <= (start & (mask << 1)))
4649 mask <<= 1;
4650
4651 /* accumulate all internode masks */
4652 accl_mask |= mask;
4653 }
4654
4655 /* convert mask to number of pages */
4656 return ~accl_mask + 1;
4657}
4658
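
A userspace sketch of the mask accumulation performed by node_map_pfn_alignment() above, run over a made-up two-node map (node 0 at [256MiB, 1.25GiB), node 1 at [1.25GiB, 2.25GiB), 4KiB pages); __builtin_ctzl stands in for the kernel's __ffs().

#include <stdio.h>

struct range { int nid; unsigned long start_pfn, end_pfn; };

static struct range map[] = {
	{ 0, 0x10000, 0x50000 },	/* node 0: [256MiB, 1.25GiB) */
	{ 1, 0x50000, 0x90000 },	/* node 1: [1.25GiB, 2.25GiB) */
};

static unsigned long pfn_alignment(void)
{
	unsigned long accl_mask = 0, last_end = 0;
	int last_nid = -1;

	for (unsigned int i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		unsigned long start = map[i].start_pfn, mask;

		if (!start || last_nid < 0 || last_nid == map[i].nid) {
			last_nid = map[i].nid;
			last_end = map[i].end_pfn;
			continue;
		}

		/* finest mask that pin-points start ... */
		mask = ~((1UL << __builtin_ctzl(start)) - 1);
		/* ... coarsened while it still keeps start past the last node */
		while (mask && last_end <= (start & (mask << 1)))
			mask <<= 1;

		accl_mask |= mask;	/* accumulate all internode masks */
	}

	return ~accl_mask + 1;		/* mask -> number of pages */
}

int main(void)
{
	/* prints 0x10000 pfns, i.e. 256MiB with 4KiB pages */
	printf("alignment = %#lx pfns\n", pfn_alignment());
	return 0;
}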
4588/* Find the lowest pfn for a node */ 4659/* Find the lowest pfn for a node */
4589static unsigned long __init find_min_pfn_for_node(int nid) 4660static unsigned long __init find_min_pfn_for_node(int nid)
4590{ 4661{
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 53bffc6c293e..39d216d535ea 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
225 unsigned long start, end, pfn; 225 unsigned long start, end, pfn;
226 int fail = 0; 226 int fail = 0;
227 227
228 start = start_pfn & ~(PAGES_PER_SECTION - 1); 228 start = SECTION_ALIGN_DOWN(start_pfn);
229 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 229 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
230 230
231 if (nid == -1) { 231 if (nid == -1) {
232 /* 232 /*
@@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn,
258{ 258{
259 unsigned long start, end, pfn; 259 unsigned long start, end, pfn;
260 260
261 start = start_pfn & ~(PAGES_PER_SECTION - 1); 261 start = SECTION_ALIGN_DOWN(start_pfn);
262 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 262 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
263 263
264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) 264 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
265 __free_page_cgroup(pfn); 265 __free_page_cgroup(pfn);
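
The page_cgroup.c hunks above replace the open-coded rounding with SECTION_ALIGN_DOWN()/SECTION_ALIGN_UP(). A self-contained sketch with illustrative macro bodies (not copied from mmzone.h) shows why the masking is equivalent when PAGES_PER_SECTION is a power of two.

#include <stdio.h>

#define PAGES_PER_SECTION	(1UL << 15)		/* example value */
#define SECTION_ALIGN_DOWN(pfn)	((pfn) & ~(PAGES_PER_SECTION - 1))
#define SECTION_ALIGN_UP(pfn)	(((pfn) + PAGES_PER_SECTION - 1) & \
				 ~(PAGES_PER_SECTION - 1))

int main(void)
{
	unsigned long start_pfn = 0x12345, nr_pages = 0x400;
	unsigned long start = SECTION_ALIGN_DOWN(start_pfn);
	unsigned long end = SECTION_ALIGN_UP(start_pfn + nr_pages);

	/* same shape as the online/offline loops in the hunks above */
	for (unsigned long pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
		printf("section at pfn %#lx\n", pfn);
	return 0;
}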
@@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
537nomem: 537nomem:
538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 538 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
539 printk(KERN_INFO 539 printk(KERN_INFO
540 "swap_cgroup can be disabled by noswapaccount boot option\n"); 540 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
541 return -ENOMEM; 541 return -ENOMEM;
542} 542}
543 543
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3450d533611..2f5cf10ff660 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
126 126
127 return 0; 127 return 0;
128} 128}
129#endif 129
130static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
131{
132 struct vm_area_struct *vma;
133
134 /* We don't need vma lookup at all. */
135 if (!walk->hugetlb_entry)
136 return NULL;
137
138 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
139 vma = find_vma(walk->mm, addr);
140 if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma))
141 return vma;
142
143 return NULL;
144}
145
146#else /* CONFIG_HUGETLB_PAGE */
147static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk)
148{
149 return NULL;
150}
151
152static int walk_hugetlb_range(struct vm_area_struct *vma,
153 unsigned long addr, unsigned long end,
154 struct mm_walk *walk)
155{
156 return 0;
157}
158
159#endif /* CONFIG_HUGETLB_PAGE */
160
161
130 162
131/** 163/**
132 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
@@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
144 * associated range, and a copy of the original mm_walk for access to 176 * associated range, and a copy of the original mm_walk for access to
145 * the ->private or ->mm fields. 177 * the ->private or ->mm fields.
146 * 178 *
147 * No locks are taken, but the bottom level iterator will map PTE 179 * Usually no locks are taken, but splitting transparent huge page may
180 * take page table lock. And the bottom level iterator will map PTE
148 * directories from highmem if necessary. 181 * directories from highmem if necessary.
149 * 182 *
150 * If any callback returns a non-zero value, the walk is aborted and 183 * If any callback returns a non-zero value, the walk is aborted and
151 * the return value is propagated back to the caller. Otherwise 0 is returned. 184 * the return value is propagated back to the caller. Otherwise 0 is returned.
185 *
186 * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
187 * is !NULL.
152 */ 188 */
153int walk_page_range(unsigned long addr, unsigned long end, 189int walk_page_range(unsigned long addr, unsigned long end,
154 struct mm_walk *walk) 190 struct mm_walk *walk)
@@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
165 201
166 pgd = pgd_offset(walk->mm, addr); 202 pgd = pgd_offset(walk->mm, addr);
167 do { 203 do {
168 struct vm_area_struct *uninitialized_var(vma); 204 struct vm_area_struct *vma;
169 205
170 next = pgd_addr_end(addr, end); 206 next = pgd_addr_end(addr, end);
171 207
172#ifdef CONFIG_HUGETLB_PAGE
173 /* 208 /*
174 * handle hugetlb vma individually because pagetable walk for 209 * handle hugetlb vma individually because pagetable walk for
175 * the hugetlb page is dependent on the architecture and 210 * the hugetlb page is dependent on the architecture and
176 * we can't handled it in the same manner as non-huge pages. 211 * we can't handled it in the same manner as non-huge pages.
177 */ 212 */
178 vma = find_vma(walk->mm, addr); 213 vma = hugetlb_vma(addr, walk);
179 if (vma && is_vm_hugetlb_page(vma)) { 214 if (vma) {
180 if (vma->vm_end < next) 215 if (vma->vm_end < next)
181 next = vma->vm_end; 216 next = vma->vm_end;
182 /* 217 /*
@@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
189 pgd = pgd_offset(walk->mm, next); 224 pgd = pgd_offset(walk->mm, next);
190 continue; 225 continue;
191 } 226 }
192#endif 227
193 if (pgd_none_or_clear_bad(pgd)) { 228 if (pgd_none_or_clear_bad(pgd)) {
194 if (walk->pte_hole) 229 if (walk->pte_hole)
195 err = walk->pte_hole(addr, next, walk); 230 err = walk->pte_hole(addr, next, walk);
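
The pagewalk.c change trades an #ifdef CONFIG_HUGETLB_PAGE block inside walk_page_range() for a hugetlb_vma() helper that has a no-op stub in the !CONFIG_HUGETLB_PAGE build, so the caller stays ifdef-free. A minimal illustration of that stub-pair idiom, with invented names (CONFIG_FEATURE, feature_vma):

#include <stdio.h>

struct vma { int is_special; };

#ifdef CONFIG_FEATURE
static struct vma *feature_vma(struct vma *v)
{
	/* real lookup would live here */
	return (v && v->is_special) ? v : NULL;
}
#else
static struct vma *feature_vma(struct vma *v)
{
	(void)v;
	return NULL;		/* compiled-out case: never a hit */
}
#endif

static void walk(struct vma *v)
{
	struct vma *special = feature_vma(v);	/* no #ifdef at the call site */

	if (special)
		printf("special-case path\n");
	else
		printf("generic path\n");
}

int main(void)
{
	struct vma v = { .is_special = 1 };

	walk(&v);	/* build with -DCONFIG_FEATURE to take the special path */
	return 0;
}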
diff --git a/mm/rmap.c b/mm/rmap.c
index 23295f65ae43..8005080fb9e3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,6 @@
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_mutex (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 24 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 25 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_mutex 26 * mapping->i_mmap_mutex
@@ -32,11 +31,11 @@
32 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
36 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
37 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
38 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
39 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
40 * 39 *
41 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
42 * ->tasklist_lock 41 * ->tasklist_lock
@@ -870,11 +869,11 @@ int page_referenced(struct page *page,
870 vm_flags); 869 vm_flags);
871 if (we_locked) 870 if (we_locked)
872 unlock_page(page); 871 unlock_page(page);
872
873 if (page_test_and_clear_young(page_to_pfn(page)))
874 referenced++;
873 } 875 }
874out: 876out:
875 if (page_test_and_clear_young(page_to_pfn(page)))
876 referenced++;
877
878 return referenced; 877 return referenced;
879} 878}
880 879
diff --git a/mm/shmem.c b/mm/shmem.c
index fcedf5464eb7..5cc21f8b4cd3 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/splice.h>
54#include <linux/security.h> 55#include <linux/security.h>
55#include <linux/swapops.h> 56#include <linux/swapops.h>
56#include <linux/mempolicy.h> 57#include <linux/mempolicy.h>
@@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void)
126} 127}
127#endif 128#endif
128 129
129static int shmem_getpage(struct inode *inode, unsigned long idx, 130static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
130 struct page **pagep, enum sgp_type sgp, int *type); 131 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
132
133static inline int shmem_getpage(struct inode *inode, pgoff_t index,
134 struct page **pagep, enum sgp_type sgp, int *fault_type)
135{
136 return shmem_getpage_gfp(inode, index, pagep, sgp,
137 mapping_gfp_mask(inode->i_mapping), fault_type);
138}
131 139
132static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) 140static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
133{ 141{
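
The hunk above turns shmem_getpage() into an inline wrapper that feeds mapping_gfp_mask() to the new shmem_getpage_gfp(). A toy sketch of that wrapper-supplies-the-default pattern; the types and mask values here are invented.

#include <stdio.h>

typedef unsigned int gfp_t;

struct mapping { gfp_t gfp_mask; };

static int getpage_gfp(struct mapping *m, unsigned long index, gfp_t gfp)
{
	(void)m;
	printf("page %lu fetched with gfp %#x\n", index, gfp);
	return 0;
}

/* the old name keeps its old behaviour by plugging in the mapping's default */
static inline int getpage(struct mapping *m, unsigned long index)
{
	return getpage_gfp(m, index, m->gfp_mask);
}

int main(void)
{
	struct mapping m = { .gfp_mask = 0x10u };

	getpage(&m, 3);			/* common callers: default mask  */
	getpage_gfp(&m, 4, 0x20u);	/* special callers: explicit one */
	return 0;
}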
@@ -241,9 +249,7 @@ static void shmem_free_blocks(struct inode *inode, long pages)
241 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 249 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
242 if (sbinfo->max_blocks) { 250 if (sbinfo->max_blocks) {
243 percpu_counter_add(&sbinfo->used_blocks, -pages); 251 percpu_counter_add(&sbinfo->used_blocks, -pages);
244 spin_lock(&inode->i_lock);
245 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 252 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
246 spin_unlock(&inode->i_lock);
247 } 253 }
248} 254}
249 255
@@ -405,10 +411,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
405 * @info: info structure for the inode 411 * @info: info structure for the inode
406 * @index: index of the page to find 412 * @index: index of the page to find
407 * @sgp: check and recheck i_size? skip allocation? 413 * @sgp: check and recheck i_size? skip allocation?
414 * @gfp: gfp mask to use for any page allocation
408 * 415 *
409 * If the entry does not exist, allocate it. 416 * If the entry does not exist, allocate it.
410 */ 417 */
411static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) 418static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info,
419 unsigned long index, enum sgp_type sgp, gfp_t gfp)
412{ 420{
413 struct inode *inode = &info->vfs_inode; 421 struct inode *inode = &info->vfs_inode;
414 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 422 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
@@ -432,13 +440,11 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
432 sbinfo->max_blocks - 1) >= 0) 440 sbinfo->max_blocks - 1) >= 0)
433 return ERR_PTR(-ENOSPC); 441 return ERR_PTR(-ENOSPC);
434 percpu_counter_inc(&sbinfo->used_blocks); 442 percpu_counter_inc(&sbinfo->used_blocks);
435 spin_lock(&inode->i_lock);
436 inode->i_blocks += BLOCKS_PER_PAGE; 443 inode->i_blocks += BLOCKS_PER_PAGE;
437 spin_unlock(&inode->i_lock);
438 } 444 }
439 445
440 spin_unlock(&info->lock); 446 spin_unlock(&info->lock);
441 page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); 447 page = shmem_dir_alloc(gfp);
442 spin_lock(&info->lock); 448 spin_lock(&info->lock);
443 449
444 if (!page) { 450 if (!page) {
@@ -966,20 +972,7 @@ found:
966 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 972 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
967 /* which does mem_cgroup_uncharge_cache_page on error */ 973 /* which does mem_cgroup_uncharge_cache_page on error */
968 974
969 if (error == -EEXIST) { 975 if (error != -ENOMEM) {
970 struct page *filepage = find_get_page(mapping, idx);
971 error = 1;
972 if (filepage) {
973 /*
974 * There might be a more uptodate page coming down
975 * from a stacked writepage: forget our swappage if so.
976 */
977 if (PageUptodate(filepage))
978 error = 0;
979 page_cache_release(filepage);
980 }
981 }
982 if (!error) {
983 delete_from_swap_cache(page); 976 delete_from_swap_cache(page);
984 set_page_dirty(page); 977 set_page_dirty(page);
985 info->flags |= SHMEM_PAGEIN; 978 info->flags |= SHMEM_PAGEIN;
@@ -1066,16 +1059,17 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1066 /* 1059 /*
1067 * shmem_backing_dev_info's capabilities prevent regular writeback or 1060 * shmem_backing_dev_info's capabilities prevent regular writeback or
1068 * sync from ever calling shmem_writepage; but a stacking filesystem 1061 * sync from ever calling shmem_writepage; but a stacking filesystem
1069 * may use the ->writepage of its underlying filesystem, in which case 1062 * might use ->writepage of its underlying filesystem, in which case
1070 * tmpfs should write out to swap only in response to memory pressure, 1063 * tmpfs should write out to swap only in response to memory pressure,
1071 * and not for the writeback threads or sync. However, in those cases, 1064 * and not for the writeback threads or sync.
1072 * we do still want to check if there's a redundant swappage to be
1073 * discarded.
1074 */ 1065 */
1075 if (wbc->for_reclaim) 1066 if (!wbc->for_reclaim) {
1076 swap = get_swap_page(); 1067 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1077 else 1068 goto redirty;
1078 swap.val = 0; 1069 }
1070 swap = get_swap_page();
1071 if (!swap.val)
1072 goto redirty;
1079 1073
1080 /* 1074 /*
1081 * Add inode to shmem_unuse()'s list of swapped-out inodes, 1075 * Add inode to shmem_unuse()'s list of swapped-out inodes,
@@ -1086,15 +1080,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1086 * we've taken the spinlock, because shmem_unuse_inode() will 1080 * we've taken the spinlock, because shmem_unuse_inode() will
1087 * prune a !swapped inode from the swaplist under both locks. 1081 * prune a !swapped inode from the swaplist under both locks.
1088 */ 1082 */
1089 if (swap.val) { 1083 mutex_lock(&shmem_swaplist_mutex);
1090 mutex_lock(&shmem_swaplist_mutex); 1084 if (list_empty(&info->swaplist))
1091 if (list_empty(&info->swaplist)) 1085 list_add_tail(&info->swaplist, &shmem_swaplist);
1092 list_add_tail(&info->swaplist, &shmem_swaplist);
1093 }
1094 1086
1095 spin_lock(&info->lock); 1087 spin_lock(&info->lock);
1096 if (swap.val) 1088 mutex_unlock(&shmem_swaplist_mutex);
1097 mutex_unlock(&shmem_swaplist_mutex);
1098 1089
1099 if (index >= info->next_index) { 1090 if (index >= info->next_index) {
1100 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1091 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
@@ -1102,16 +1093,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1102 } 1093 }
1103 entry = shmem_swp_entry(info, index, NULL); 1094 entry = shmem_swp_entry(info, index, NULL);
1104 if (entry->val) { 1095 if (entry->val) {
1105 /* 1096 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1106 * The more uptodate page coming down from a stacked
1107 * writepage should replace our old swappage.
1108 */
1109 free_swap_and_cache(*entry); 1097 free_swap_and_cache(*entry);
1110 shmem_swp_set(info, entry, 0); 1098 shmem_swp_set(info, entry, 0);
1111 } 1099 }
1112 shmem_recalc_inode(inode); 1100 shmem_recalc_inode(inode);
1113 1101
1114 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1102 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1115 delete_from_page_cache(page); 1103 delete_from_page_cache(page);
1116 shmem_swp_set(info, entry, swap.val); 1104 shmem_swp_set(info, entry, swap.val);
1117 shmem_swp_unmap(entry); 1105 shmem_swp_unmap(entry);
@@ -1228,92 +1216,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1228#endif 1216#endif
1229 1217
1230/* 1218/*
1231 * shmem_getpage - either get the page from swap or allocate a new one 1219 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
1232 * 1220 *
1233 * If we allocate a new one we do not mark it dirty. That's up to the 1221 * If we allocate a new one we do not mark it dirty. That's up to the
1234 * vm. If we swap it in we mark it dirty since we also free the swap 1222 * vm. If we swap it in we mark it dirty since we also free the swap
1235 * entry since a page cannot live in both the swap and page cache 1223 * entry since a page cannot live in both the swap and page cache
1236 */ 1224 */
1237static int shmem_getpage(struct inode *inode, unsigned long idx, 1225static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx,
1238 struct page **pagep, enum sgp_type sgp, int *type) 1226 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1239{ 1227{
1240 struct address_space *mapping = inode->i_mapping; 1228 struct address_space *mapping = inode->i_mapping;
1241 struct shmem_inode_info *info = SHMEM_I(inode); 1229 struct shmem_inode_info *info = SHMEM_I(inode);
1242 struct shmem_sb_info *sbinfo; 1230 struct shmem_sb_info *sbinfo;
1243 struct page *filepage = *pagep; 1231 struct page *page;
1244 struct page *swappage;
1245 struct page *prealloc_page = NULL; 1232 struct page *prealloc_page = NULL;
1246 swp_entry_t *entry; 1233 swp_entry_t *entry;
1247 swp_entry_t swap; 1234 swp_entry_t swap;
1248 gfp_t gfp;
1249 int error; 1235 int error;
1236 int ret;
1250 1237
1251 if (idx >= SHMEM_MAX_INDEX) 1238 if (idx >= SHMEM_MAX_INDEX)
1252 return -EFBIG; 1239 return -EFBIG;
1253
1254 if (type)
1255 *type = 0;
1256
1257 /*
1258 * Normally, filepage is NULL on entry, and either found
1259 * uptodate immediately, or allocated and zeroed, or read
1260 * in under swappage, which is then assigned to filepage.
1261 * But shmem_readpage (required for splice) passes in a locked
1262 * filepage, which may be found not uptodate by other callers
1263 * too, and may need to be copied from the swappage read in.
1264 */
1265repeat: 1240repeat:
1266 if (!filepage) 1241 page = find_lock_page(mapping, idx);
1267 filepage = find_lock_page(mapping, idx); 1242 if (page) {
1268 if (filepage && PageUptodate(filepage))
1269 goto done;
1270 gfp = mapping_gfp_mask(mapping);
1271 if (!filepage) {
1272 /* 1243 /*
1273 * Try to preload while we can wait, to not make a habit of 1244 * Once we can get the page lock, it must be uptodate:
1274 * draining atomic reserves; but don't latch on to this cpu. 1245 * if there were an error in reading back from swap,
1246 * the page would not be inserted into the filecache.
1275 */ 1247 */
1276 error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); 1248 BUG_ON(!PageUptodate(page));
1277 if (error) 1249 goto done;
1278 goto failed; 1250 }
1279 radix_tree_preload_end(); 1251
1280 if (sgp != SGP_READ && !prealloc_page) { 1252 /*
1281 /* We don't care if this fails */ 1253 * Try to preload while we can wait, to not make a habit of
1282 prealloc_page = shmem_alloc_page(gfp, info, idx); 1254 * draining atomic reserves; but don't latch on to this cpu.
1283 if (prealloc_page) { 1255 */
1284 if (mem_cgroup_cache_charge(prealloc_page, 1256 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1285 current->mm, GFP_KERNEL)) { 1257 if (error)
1286 page_cache_release(prealloc_page); 1258 goto out;
1287 prealloc_page = NULL; 1259 radix_tree_preload_end();
1288 } 1260
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 prealloc_page = shmem_alloc_page(gfp, info, idx);
1263 if (prealloc_page) {
1264 SetPageSwapBacked(prealloc_page);
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1289 } 1269 }
1290 } 1270 }
1291 } 1271 }
1292 error = 0;
1293 1272
1294 spin_lock(&info->lock); 1273 spin_lock(&info->lock);
1295 shmem_recalc_inode(inode); 1274 shmem_recalc_inode(inode);
1296 entry = shmem_swp_alloc(info, idx, sgp); 1275 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1297 if (IS_ERR(entry)) { 1276 if (IS_ERR(entry)) {
1298 spin_unlock(&info->lock); 1277 spin_unlock(&info->lock);
1299 error = PTR_ERR(entry); 1278 error = PTR_ERR(entry);
1300 goto failed; 1279 goto out;
1301 } 1280 }
1302 swap = *entry; 1281 swap = *entry;
1303 1282
1304 if (swap.val) { 1283 if (swap.val) {
1305 /* Look it up and read it in.. */ 1284 /* Look it up and read it in.. */
1306 swappage = lookup_swap_cache(swap); 1285 page = lookup_swap_cache(swap);
1307 if (!swappage) { 1286 if (!page) {
1308 shmem_swp_unmap(entry); 1287 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock); 1288 spin_unlock(&info->lock);
1310 /* here we actually do the io */ 1289 /* here we actually do the io */
1311 if (type) 1290 if (fault_type)
1312 *type |= VM_FAULT_MAJOR; 1291 *fault_type |= VM_FAULT_MAJOR;
1313 swappage = shmem_swapin(swap, gfp, info, idx); 1292 page = shmem_swapin(swap, gfp, info, idx);
1314 if (!swappage) { 1293 if (!page) {
1315 spin_lock(&info->lock); 1294 spin_lock(&info->lock);
1316 entry = shmem_swp_alloc(info, idx, sgp); 1295 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1317 if (IS_ERR(entry)) 1296 if (IS_ERR(entry))
1318 error = PTR_ERR(entry); 1297 error = PTR_ERR(entry);
1319 else { 1298 else {
@@ -1323,62 +1302,42 @@ repeat:
1323 } 1302 }
1324 spin_unlock(&info->lock); 1303 spin_unlock(&info->lock);
1325 if (error) 1304 if (error)
1326 goto failed; 1305 goto out;
1327 goto repeat; 1306 goto repeat;
1328 } 1307 }
1329 wait_on_page_locked(swappage); 1308 wait_on_page_locked(page);
1330 page_cache_release(swappage); 1309 page_cache_release(page);
1331 goto repeat; 1310 goto repeat;
1332 } 1311 }
1333 1312
1334 /* We have to do this with page locked to prevent races */ 1313 /* We have to do this with page locked to prevent races */
1335 if (!trylock_page(swappage)) { 1314 if (!trylock_page(page)) {
1336 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1337 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1338 wait_on_page_locked(swappage); 1317 wait_on_page_locked(page);
1339 page_cache_release(swappage); 1318 page_cache_release(page);
1340 goto repeat; 1319 goto repeat;
1341 } 1320 }
1342 if (PageWriteback(swappage)) { 1321 if (PageWriteback(page)) {
1343 shmem_swp_unmap(entry); 1322 shmem_swp_unmap(entry);
1344 spin_unlock(&info->lock); 1323 spin_unlock(&info->lock);
1345 wait_on_page_writeback(swappage); 1324 wait_on_page_writeback(page);
1346 unlock_page(swappage); 1325 unlock_page(page);
1347 page_cache_release(swappage); 1326 page_cache_release(page);
1348 goto repeat; 1327 goto repeat;
1349 } 1328 }
1350 if (!PageUptodate(swappage)) { 1329 if (!PageUptodate(page)) {
1351 shmem_swp_unmap(entry); 1330 shmem_swp_unmap(entry);
1352 spin_unlock(&info->lock); 1331 spin_unlock(&info->lock);
1353 unlock_page(swappage); 1332 unlock_page(page);
1354 page_cache_release(swappage); 1333 page_cache_release(page);
1355 error = -EIO; 1334 error = -EIO;
1356 goto failed; 1335 goto out;
1357 } 1336 }
1358 1337
1359 if (filepage) { 1338 error = add_to_page_cache_locked(page, mapping,
1360 shmem_swp_set(info, entry, 0); 1339 idx, GFP_NOWAIT);
1361 shmem_swp_unmap(entry); 1340 if (error) {
1362 delete_from_swap_cache(swappage);
1363 spin_unlock(&info->lock);
1364 copy_highpage(filepage, swappage);
1365 unlock_page(swappage);
1366 page_cache_release(swappage);
1367 flush_dcache_page(filepage);
1368 SetPageUptodate(filepage);
1369 set_page_dirty(filepage);
1370 swap_free(swap);
1371 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1372 idx, GFP_NOWAIT))) {
1373 info->flags |= SHMEM_PAGEIN;
1374 shmem_swp_set(info, entry, 0);
1375 shmem_swp_unmap(entry);
1376 delete_from_swap_cache(swappage);
1377 spin_unlock(&info->lock);
1378 filepage = swappage;
1379 set_page_dirty(filepage);
1380 swap_free(swap);
1381 } else {
1382 shmem_swp_unmap(entry); 1341 shmem_swp_unmap(entry);
1383 spin_unlock(&info->lock); 1342 spin_unlock(&info->lock);
1384 if (error == -ENOMEM) { 1343 if (error == -ENOMEM) {
@@ -1387,32 +1346,38 @@ repeat:
1387 * call memcg's OOM if needed. 1346 * call memcg's OOM if needed.
1388 */ 1347 */
1389 error = mem_cgroup_shmem_charge_fallback( 1348 error = mem_cgroup_shmem_charge_fallback(
1390 swappage, 1349 page, current->mm, gfp);
1391 current->mm,
1392 gfp);
1393 if (error) { 1350 if (error) {
1394 unlock_page(swappage); 1351 unlock_page(page);
1395 page_cache_release(swappage); 1352 page_cache_release(page);
1396 goto failed; 1353 goto out;
1397 } 1354 }
1398 } 1355 }
1399 unlock_page(swappage); 1356 unlock_page(page);
1400 page_cache_release(swappage); 1357 page_cache_release(page);
1401 goto repeat; 1358 goto repeat;
1402 } 1359 }
1403 } else if (sgp == SGP_READ && !filepage) { 1360
1361 info->flags |= SHMEM_PAGEIN;
1362 shmem_swp_set(info, entry, 0);
1404 shmem_swp_unmap(entry); 1363 shmem_swp_unmap(entry);
1405 filepage = find_get_page(mapping, idx); 1364 delete_from_swap_cache(page);
1406 if (filepage && 1365 spin_unlock(&info->lock);
1407 (!PageUptodate(filepage) || !trylock_page(filepage))) { 1366 set_page_dirty(page);
1367 swap_free(swap);
1368
1369 } else if (sgp == SGP_READ) {
1370 shmem_swp_unmap(entry);
1371 page = find_get_page(mapping, idx);
1372 if (page && !trylock_page(page)) {
1408 spin_unlock(&info->lock); 1373 spin_unlock(&info->lock);
1409 wait_on_page_locked(filepage); 1374 wait_on_page_locked(page);
1410 page_cache_release(filepage); 1375 page_cache_release(page);
1411 filepage = NULL;
1412 goto repeat; 1376 goto repeat;
1413 } 1377 }
1414 spin_unlock(&info->lock); 1378 spin_unlock(&info->lock);
1415 } else { 1379
1380 } else if (prealloc_page) {
1416 shmem_swp_unmap(entry); 1381 shmem_swp_unmap(entry);
1417 sbinfo = SHMEM_SB(inode->i_sb); 1382 sbinfo = SHMEM_SB(inode->i_sb);
1418 if (sbinfo->max_blocks) { 1383 if (sbinfo->max_blocks) {
@@ -1421,126 +1386,86 @@ repeat:
1421 shmem_acct_block(info->flags)) 1386 shmem_acct_block(info->flags))
1422 goto nospace; 1387 goto nospace;
1423 percpu_counter_inc(&sbinfo->used_blocks); 1388 percpu_counter_inc(&sbinfo->used_blocks);
1424 spin_lock(&inode->i_lock);
1425 inode->i_blocks += BLOCKS_PER_PAGE; 1389 inode->i_blocks += BLOCKS_PER_PAGE;
1426 spin_unlock(&inode->i_lock);
1427 } else if (shmem_acct_block(info->flags)) 1390 } else if (shmem_acct_block(info->flags))
1428 goto nospace; 1391 goto nospace;
1429 1392
1430 if (!filepage) { 1393 page = prealloc_page;
1431 int ret; 1394 prealloc_page = NULL;
1432
1433 if (!prealloc_page) {
1434 spin_unlock(&info->lock);
1435 filepage = shmem_alloc_page(gfp, info, idx);
1436 if (!filepage) {
1437 shmem_unacct_blocks(info->flags, 1);
1438 shmem_free_blocks(inode, 1);
1439 error = -ENOMEM;
1440 goto failed;
1441 }
1442 SetPageSwapBacked(filepage);
1443 1395
1444 /* 1396 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1445 * Precharge page while we can wait, compensate 1397 if (IS_ERR(entry))
1446 * after 1398 error = PTR_ERR(entry);
1447 */ 1399 else {
1448 error = mem_cgroup_cache_charge(filepage, 1400 swap = *entry;
1449 current->mm, GFP_KERNEL); 1401 shmem_swp_unmap(entry);
1450 if (error) { 1402 }
1451 page_cache_release(filepage); 1403 ret = error || swap.val;
1452 shmem_unacct_blocks(info->flags, 1); 1404 if (ret)
1453 shmem_free_blocks(inode, 1); 1405 mem_cgroup_uncharge_cache_page(page);
1454 filepage = NULL; 1406 else
1455 goto failed; 1407 ret = add_to_page_cache_lru(page, mapping,
1456 }
1457
1458 spin_lock(&info->lock);
1459 } else {
1460 filepage = prealloc_page;
1461 prealloc_page = NULL;
1462 SetPageSwapBacked(filepage);
1463 }
1464
1465 entry = shmem_swp_alloc(info, idx, sgp);
1466 if (IS_ERR(entry))
1467 error = PTR_ERR(entry);
1468 else {
1469 swap = *entry;
1470 shmem_swp_unmap(entry);
1471 }
1472 ret = error || swap.val;
1473 if (ret)
1474 mem_cgroup_uncharge_cache_page(filepage);
1475 else
1476 ret = add_to_page_cache_lru(filepage, mapping,
1477 idx, GFP_NOWAIT); 1408 idx, GFP_NOWAIT);
1478 /* 1409 /*
1479 * At add_to_page_cache_lru() failure, uncharge will 1410 * At add_to_page_cache_lru() failure,
1480 * be done automatically. 1411 * uncharge will be done automatically.
1481 */ 1412 */
1482 if (ret) { 1413 if (ret) {
1483 spin_unlock(&info->lock); 1414 shmem_unacct_blocks(info->flags, 1);
1484 page_cache_release(filepage); 1415 shmem_free_blocks(inode, 1);
1485 shmem_unacct_blocks(info->flags, 1); 1416 spin_unlock(&info->lock);
1486 shmem_free_blocks(inode, 1); 1417 page_cache_release(page);
1487 filepage = NULL; 1418 if (error)
1488 if (error) 1419 goto out;
1489 goto failed; 1420 goto repeat;
1490 goto repeat;
1491 }
1492 info->flags |= SHMEM_PAGEIN;
1493 } 1421 }
1494 1422
1423 info->flags |= SHMEM_PAGEIN;
1495 info->alloced++; 1424 info->alloced++;
1496 spin_unlock(&info->lock); 1425 spin_unlock(&info->lock);
1497 clear_highpage(filepage); 1426 clear_highpage(page);
1498 flush_dcache_page(filepage); 1427 flush_dcache_page(page);
1499 SetPageUptodate(filepage); 1428 SetPageUptodate(page);
1500 if (sgp == SGP_DIRTY) 1429 if (sgp == SGP_DIRTY)
1501 set_page_dirty(filepage); 1430 set_page_dirty(page);
1431
1432 } else {
1433 spin_unlock(&info->lock);
1434 error = -ENOMEM;
1435 goto out;
1502 } 1436 }
1503done: 1437done:
1504 *pagep = filepage; 1438 *pagep = page;
1505 error = 0; 1439 error = 0;
1506 goto out; 1440out:
1441 if (prealloc_page) {
1442 mem_cgroup_uncharge_cache_page(prealloc_page);
1443 page_cache_release(prealloc_page);
1444 }
1445 return error;
1507 1446
1508nospace: 1447nospace:
1509 /* 1448 /*
1510 * Perhaps the page was brought in from swap between find_lock_page 1449 * Perhaps the page was brought in from swap between find_lock_page
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru, 1450 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a 1451 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it 1452 * full tmpfs.
1514 * is already in page cache, which prevents this race from occurring.)
1515 */ 1453 */
1516 if (!filepage) { 1454 page = find_get_page(mapping, idx);
1517 struct page *page = find_get_page(mapping, idx);
1518 if (page) {
1519 spin_unlock(&info->lock);
1520 page_cache_release(page);
1521 goto repeat;
1522 }
1523 }
1524 spin_unlock(&info->lock); 1455 spin_unlock(&info->lock);
1525 error = -ENOSPC; 1456 if (page) {
1526failed: 1457 page_cache_release(page);
1527 if (*pagep != filepage) { 1458 goto repeat;
1528 unlock_page(filepage);
1529 page_cache_release(filepage);
1530 }
1531out:
1532 if (prealloc_page) {
1533 mem_cgroup_uncharge_cache_page(prealloc_page);
1534 page_cache_release(prealloc_page);
1535 } 1459 }
1536 return error; 1460 error = -ENOSPC;
1461 goto out;
1537} 1462}
1538 1463
1539static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1464static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1540{ 1465{
1541 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 1466 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1542 int error; 1467 int error;
1543 int ret; 1468 int ret = VM_FAULT_LOCKED;
1544 1469
1545 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 1470 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1546 return VM_FAULT_SIGBUS; 1471 return VM_FAULT_SIGBUS;
@@ -1548,11 +1473,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1473 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1549 if (error) 1474 if (error)
1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1475 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1476
1551 if (ret & VM_FAULT_MAJOR) { 1477 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT); 1478 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); 1479 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 } 1480 }
1555 return ret | VM_FAULT_LOCKED; 1481 return ret;
1556} 1482}
1557 1483
1558#ifdef CONFIG_NUMA 1484#ifdef CONFIG_NUMA
@@ -1669,19 +1595,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1669static const struct inode_operations shmem_symlink_inode_operations; 1595static const struct inode_operations shmem_symlink_inode_operations;
1670static const struct inode_operations shmem_symlink_inline_operations; 1596static const struct inode_operations shmem_symlink_inline_operations;
1671 1597
1672/*
1673 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1674 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1675 * below the loop driver, in the generic fashion that many filesystems support.
1676 */
1677static int shmem_readpage(struct file *file, struct page *page)
1678{
1679 struct inode *inode = page->mapping->host;
1680 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1681 unlock_page(page);
1682 return error;
1683}
1684
1685static int 1598static int
1686shmem_write_begin(struct file *file, struct address_space *mapping, 1599shmem_write_begin(struct file *file, struct address_space *mapping,
1687 loff_t pos, unsigned len, unsigned flags, 1600 loff_t pos, unsigned len, unsigned flags,
@@ -1689,7 +1602,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
1689{ 1602{
1690 struct inode *inode = mapping->host; 1603 struct inode *inode = mapping->host;
1691 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1604 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1692 *pagep = NULL;
1693 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); 1605 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1694} 1606}
1695 1607
@@ -1846,6 +1758,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1846 return retval; 1758 return retval;
1847} 1759}
1848 1760
1761static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1762 struct pipe_inode_info *pipe, size_t len,
1763 unsigned int flags)
1764{
1765 struct address_space *mapping = in->f_mapping;
1766 struct inode *inode = mapping->host;
1767 unsigned int loff, nr_pages, req_pages;
1768 struct page *pages[PIPE_DEF_BUFFERS];
1769 struct partial_page partial[PIPE_DEF_BUFFERS];
1770 struct page *page;
1771 pgoff_t index, end_index;
1772 loff_t isize, left;
1773 int error, page_nr;
1774 struct splice_pipe_desc spd = {
1775 .pages = pages,
1776 .partial = partial,
1777 .flags = flags,
1778 .ops = &page_cache_pipe_buf_ops,
1779 .spd_release = spd_release_page,
1780 };
1781
1782 isize = i_size_read(inode);
1783 if (unlikely(*ppos >= isize))
1784 return 0;
1785
1786 left = isize - *ppos;
1787 if (unlikely(left < len))
1788 len = left;
1789
1790 if (splice_grow_spd(pipe, &spd))
1791 return -ENOMEM;
1792
1793 index = *ppos >> PAGE_CACHE_SHIFT;
1794 loff = *ppos & ~PAGE_CACHE_MASK;
1795 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1796 nr_pages = min(req_pages, pipe->buffers);
1797
1798 spd.nr_pages = find_get_pages_contig(mapping, index,
1799 nr_pages, spd.pages);
1800 index += spd.nr_pages;
1801 error = 0;
1802
1803 while (spd.nr_pages < nr_pages) {
1804 error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
1805 if (error)
1806 break;
1807 unlock_page(page);
1808 spd.pages[spd.nr_pages++] = page;
1809 index++;
1810 }
1811
1812 index = *ppos >> PAGE_CACHE_SHIFT;
1813 nr_pages = spd.nr_pages;
1814 spd.nr_pages = 0;
1815
1816 for (page_nr = 0; page_nr < nr_pages; page_nr++) {
1817 unsigned int this_len;
1818
1819 if (!len)
1820 break;
1821
1822 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
1823 page = spd.pages[page_nr];
1824
1825 if (!PageUptodate(page) || page->mapping != mapping) {
1826 error = shmem_getpage(inode, index, &page,
1827 SGP_CACHE, NULL);
1828 if (error)
1829 break;
1830 unlock_page(page);
1831 page_cache_release(spd.pages[page_nr]);
1832 spd.pages[page_nr] = page;
1833 }
1834
1835 isize = i_size_read(inode);
1836 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1837 if (unlikely(!isize || index > end_index))
1838 break;
1839
1840 if (end_index == index) {
1841 unsigned int plen;
1842
1843 plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
1844 if (plen <= loff)
1845 break;
1846
1847 this_len = min(this_len, plen - loff);
1848 len = this_len;
1849 }
1850
1851 spd.partial[page_nr].offset = loff;
1852 spd.partial[page_nr].len = this_len;
1853 len -= this_len;
1854 loff = 0;
1855 spd.nr_pages++;
1856 index++;
1857 }
1858
1859 while (page_nr < nr_pages)
1860 page_cache_release(spd.pages[page_nr++]);
1861
1862 if (spd.nr_pages)
1863 error = splice_to_pipe(pipe, &spd);
1864
1865 splice_shrink_spd(pipe, &spd);
1866
1867 if (error > 0) {
1868 *ppos += error;
1869 file_accessed(in);
1870 }
1871 return error;
1872}
1873
1849static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1874static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1850{ 1875{
1851 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1876 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
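
The new shmem_file_splice_read() above lets tmpfs back splice(2) without keeping shmem_readpage(). A small userspace demonstration of what it serves, splicing from a tmpfs file into a pipe; the /dev/shm path is an assumption about where tmpfs is mounted, and error handling is kept minimal.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int filefd = open("/dev/shm/splice-demo", O_RDWR | O_CREAT, 0600);
	int pipefd[2];
	loff_t off = 0;
	char buf[64];
	ssize_t n;

	if (filefd < 0 || pipe(pipefd) < 0)
		return 1;
	if (write(filefd, "hello from tmpfs\n", 17) != 17)
		return 1;

	/* kernel-side, this read path lands in shmem_file_splice_read() */
	n = splice(filefd, &off, pipefd[1], NULL, 17, 0);
	if (n > 0) {
		n = read(pipefd[0], buf, sizeof(buf));
		if (n > 0)
			fwrite(buf, 1, (size_t)n, stdout);
	}

	close(filefd);
	unlink("/dev/shm/splice-demo");
	return n > 0 ? 0 : 1;
}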
@@ -2006,7 +2031,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2006 int error; 2031 int error;
2007 int len; 2032 int len;
2008 struct inode *inode; 2033 struct inode *inode;
2009 struct page *page = NULL; 2034 struct page *page;
2010 char *kaddr; 2035 char *kaddr;
2011 struct shmem_inode_info *info; 2036 struct shmem_inode_info *info;
2012 2037
@@ -2684,7 +2709,6 @@ static const struct address_space_operations shmem_aops = {
2684 .writepage = shmem_writepage, 2709 .writepage = shmem_writepage,
2685 .set_page_dirty = __set_page_dirty_no_writeback, 2710 .set_page_dirty = __set_page_dirty_no_writeback,
2686#ifdef CONFIG_TMPFS 2711#ifdef CONFIG_TMPFS
2687 .readpage = shmem_readpage,
2688 .write_begin = shmem_write_begin, 2712 .write_begin = shmem_write_begin,
2689 .write_end = shmem_write_end, 2713 .write_end = shmem_write_end,
2690#endif 2714#endif
@@ -2701,7 +2725,7 @@ static const struct file_operations shmem_file_operations = {
2701 .aio_read = shmem_file_aio_read, 2725 .aio_read = shmem_file_aio_read,
2702 .aio_write = generic_file_aio_write, 2726 .aio_write = generic_file_aio_write,
2703 .fsync = noop_fsync, 2727 .fsync = noop_fsync,
2704 .splice_read = generic_file_splice_read, 2728 .splice_read = shmem_file_splice_read,
2705 .splice_write = generic_file_splice_write, 2729 .splice_write = generic_file_splice_write,
2706#endif 2730#endif
2707}; 2731};
@@ -2715,10 +2739,6 @@ static const struct inode_operations shmem_inode_operations = {
2715 .listxattr = shmem_listxattr, 2739 .listxattr = shmem_listxattr,
2716 .removexattr = shmem_removexattr, 2740 .removexattr = shmem_removexattr,
2717#endif 2741#endif
2718#ifdef CONFIG_TMPFS_POSIX_ACL
2719 .check_acl = generic_check_acl,
2720#endif
2721
2722}; 2742};
2723 2743
2724static const struct inode_operations shmem_dir_inode_operations = { 2744static const struct inode_operations shmem_dir_inode_operations = {
@@ -2741,7 +2761,6 @@ static const struct inode_operations shmem_dir_inode_operations = {
2741#endif 2761#endif
2742#ifdef CONFIG_TMPFS_POSIX_ACL 2762#ifdef CONFIG_TMPFS_POSIX_ACL
2743 .setattr = shmem_setattr, 2763 .setattr = shmem_setattr,
2744 .check_acl = generic_check_acl,
2745#endif 2764#endif
2746}; 2765};
2747 2766
@@ -2754,7 +2773,6 @@ static const struct inode_operations shmem_special_inode_operations = {
2754#endif 2773#endif
2755#ifdef CONFIG_TMPFS_POSIX_ACL 2774#ifdef CONFIG_TMPFS_POSIX_ACL
2756 .setattr = shmem_setattr, 2775 .setattr = shmem_setattr,
2757 .check_acl = generic_check_acl,
2758#endif 2776#endif
2759}; 2777};
2760 2778
@@ -3048,13 +3066,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those 3066 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. 3067 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 * 3068 *
3051 * Provide a stub for those callers to start using now, then later 3069 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when 3070 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */ 3071 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, 3072struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp) 3073 pgoff_t index, gfp_t gfp)
3057{ 3074{
3075#ifdef CONFIG_SHMEM
3076 struct inode *inode = mapping->host;
3077 struct page *page;
3078 int error;
3079
3080 BUG_ON(mapping->a_ops != &shmem_aops);
3081 error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
3082 if (error)
3083 page = ERR_PTR(error);
3084 else
3085 unlock_page(page);
3086 return page;
3087#else
3088 /*
3089 * The tiny !SHMEM case uses ramfs without swap
3090 */
3058 return read_cache_page_gfp(mapping, index, gfp); 3091 return read_cache_page_gfp(mapping, index, gfp);
3092#endif
3059} 3093}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); 3094EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
diff --git a/mm/slab.c b/mm/slab.c
index d96e223de775..95947400702b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic =
574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 574 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
575 575
576/* internal cache of cache description objs */ 576/* internal cache of cache description objs */
577static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES];
577static struct kmem_cache cache_cache = { 578static struct kmem_cache cache_cache = {
579 .nodelists = cache_cache_nodelists,
578 .batchcount = 1, 580 .batchcount = 1,
579 .limit = BOOT_CPUCACHE_ENTRIES, 581 .limit = BOOT_CPUCACHE_ENTRIES,
580 .shared = 1, 582 .shared = 1,
@@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void)
1492 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1494 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
1493 1495
1494 /* 1496 /*
1495 * struct kmem_cache size depends on nr_node_ids, which 1497 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1496 * can be less than MAX_NUMNODES.
1497 */ 1498 */
1498 cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + 1499 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1499 nr_node_ids * sizeof(struct kmem_list3 *); 1500 nr_node_ids * sizeof(struct kmem_list3 *);
1500#if DEBUG 1501#if DEBUG
1501 cache_cache.obj_size = cache_cache.buffer_size; 1502 cache_cache.obj_size = cache_cache.buffer_size;
1502#endif 1503#endif
@@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2308 if (!cachep) 2309 if (!cachep)
2309 goto oops; 2310 goto oops;
2310 2311
2312 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2311#if DEBUG 2313#if DEBUG
2312 cachep->obj_size = size; 2314 cachep->obj_size = size;
2313 2315
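
The slab.c hunks above size struct kmem_cache by the number of possible CPUs and place the per-node list pointers straight after the per-cpu array[] inside the same allocation (cachep->nodelists = &cachep->array[nr_cpu_ids]). A toy sketch of that carve-out, with invented toy_cache/toy_node types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_node { int nr_objs; };

struct toy_cache {
	int batchcount;
	struct toy_node **nodelists;	/* points into the same allocation */
	void *array[];			/* one slot per possible cpu */
};

int main(void)
{
	int nr_cpu_ids = 4, nr_node_ids = 2;
	size_t size = offsetof(struct toy_cache, array) +
		      nr_cpu_ids * sizeof(void *) +		/* per-cpu slots */
		      nr_node_ids * sizeof(struct toy_node *);	/* per-node ptrs */
	struct toy_cache *c = calloc(1, size);

	if (!c)
		return 1;
	/* the carve-out mirrored from the kmem_cache_create() hunk above */
	c->nodelists = (struct toy_node **)&c->array[nr_cpu_ids];
	printf("one %zu-byte allocation, nodelists at offset %zu\n",
	       size, (size_t)((char *)c->nodelists - (char *)c));
	free(c);
	return 0;
}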
@@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3153 objp += obj_offset(cachep); 3155 objp += obj_offset(cachep);
3154 if (cachep->ctor && cachep->flags & SLAB_POISON) 3156 if (cachep->ctor && cachep->flags & SLAB_POISON)
3155 cachep->ctor(objp); 3157 cachep->ctor(objp);
3156#if ARCH_SLAB_MINALIGN 3158 if (ARCH_SLAB_MINALIGN &&
3157 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3159 ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) {
3158 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3160 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
3159 objp, ARCH_SLAB_MINALIGN); 3161 objp, (int)ARCH_SLAB_MINALIGN);
3160 } 3162 }
3161#endif
3162 return objp; 3163 return objp;
3163} 3164}
3164#else 3165#else
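
The cache_alloc_debugcheck_after() hunk above swaps '#if ARCH_SLAB_MINALIGN' for a plain 'if', so the check is always compiled and type-checked while the optimizer drops it when the constant is zero, and it widens the cast from u32 to unsigned long so the test is valid for 64-bit pointers. A sketch of the idiom, with MINALIGN standing in for ARCH_SLAB_MINALIGN:

#include <stdio.h>

#define MINALIGN 8	/* stands in for ARCH_SLAB_MINALIGN; try 0 as well */

static void check_alignment(void *objp)
{
	/* always compiled, optimized out entirely when MINALIGN is 0 */
	if (MINALIGN && ((unsigned long)objp & (MINALIGN - 1)))
		printf("%p: not aligned to %d\n", objp, (int)MINALIGN);
}

int main(void)
{
	long buf[4];

	check_alignment(buf);			/* aligned */
	check_alignment((char *)buf + 1);	/* deliberately not */
	return 0;
}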
@@ -3402,7 +3403,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3402 cache_alloc_debugcheck_before(cachep, flags); 3403 cache_alloc_debugcheck_before(cachep, flags);
3403 local_irq_save(save_flags); 3404 local_irq_save(save_flags);
3404 3405
3405 if (nodeid == -1) 3406 if (nodeid == NUMA_NO_NODE)
3406 nodeid = slab_node; 3407 nodeid = slab_node;
3407 3408
3408 if (unlikely(!cachep->nodelists[nodeid])) { 3409 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3933,7 +3934,7 @@ fail:
3933 3934
3934struct ccupdate_struct { 3935struct ccupdate_struct {
3935 struct kmem_cache *cachep; 3936 struct kmem_cache *cachep;
3936 struct array_cache *new[NR_CPUS]; 3937 struct array_cache *new[0];
3937}; 3938};
3938 3939
3939static void do_ccupdate_local(void *info) 3940static void do_ccupdate_local(void *info)
@@ -3955,7 +3956,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3955 struct ccupdate_struct *new; 3956 struct ccupdate_struct *new;
3956 int i; 3957 int i;
3957 3958
3958 new = kzalloc(sizeof(*new), gfp); 3959 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
3960 gfp);
3959 if (!new) 3961 if (!new)
3960 return -ENOMEM; 3962 return -ENOMEM;
3961 3963
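
The ccupdate_struct hunk above turns the embedded new[NR_CPUS] array into a trailing array allocated for nr_cpu_ids entries. A sketch of the saving; the NR_CPUS and nr_cpu_ids values are illustrative, and the kernel spells the trailing member new[0] rather than the C99 new[] used here.

#include <stdio.h>

#define NR_CPUS 4096	/* compile-time maximum, distro-kernel style */

struct ccupdate_fixed    { void *cachep; void *new[NR_CPUS]; };
struct ccupdate_flexible { void *cachep; void *new[]; };

int main(void)
{
	int nr_cpu_ids = 8;	/* CPUs actually possible on this machine */

	printf("fixed layout:    %zu bytes\n", sizeof(struct ccupdate_fixed));
	printf("flexible layout: %zu bytes\n",
	       sizeof(struct ccupdate_flexible) + nr_cpu_ids * sizeof(void *));
	return 0;
}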
diff --git a/mm/slob.c b/mm/slob.c
index 46e0aee33a23..bf3918187165 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -70,7 +70,7 @@
70 70
71#include <trace/events/kmem.h> 71#include <trace/events/kmem.h>
72 72
73#include <asm/atomic.h> 73#include <linux/atomic.h>
74 74
75/* 75/*
76 * slob_block has a field 'units', which indicates size of block if +ve, 76 * slob_block has a field 'units', which indicates size of block if +ve,
@@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 482 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
483 void *ret; 483 void *ret;
484 484
485 gfp &= gfp_allowed_mask;
486
485 lockdep_trace_alloc(gfp); 487 lockdep_trace_alloc(gfp);
486 488
487 if (size < PAGE_SIZE - align) { 489 if (size < PAGE_SIZE - align) {
@@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
608{ 610{
609 void *b; 611 void *b;
610 612
613 flags &= gfp_allowed_mask;
614
615 lockdep_trace_alloc(flags);
616
611 if (c->size < PAGE_SIZE) { 617 if (c->size < PAGE_SIZE) {
612 b = slob_alloc(c->size, flags, c->align, node); 618 b = slob_alloc(c->size, flags, c->align, node);
613 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, 619 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
diff --git a/mm/slub.c b/mm/slub.c
index 35f351f26193..eb5a8f93338a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
2 * SLUB: A slab allocator that limits cache line use instead of queuing 2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists. 3 * objects in per cpu and per node lists.
4 * 4 *
 5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks or atomic operations
6 * uses a centralized lock to manage a pool of partial slabs. 6 * and only uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter 8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -27,20 +28,33 @@
27#include <linux/memory.h> 28#include <linux/memory.h>
28#include <linux/math64.h> 29#include <linux/math64.h>
29#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
31#include <linux/stacktrace.h>
30 32
31#include <trace/events/kmem.h> 33#include <trace/events/kmem.h>
32 34
33/* 35/*
34 * Lock order: 36 * Lock order:
35 * 1. slab_lock(page) 37 * 1. slub_lock (Global Semaphore)
36 * 2. slab->list_lock 38 * 2. node->list_lock
39 * 3. slab_lock(page) (Only on some arches and for debugging)
37 * 40 *
38 * The slab_lock protects operations on the object of a particular 41 * slub_lock
39 * slab and its metadata in the page struct. If the slab lock 42 *
40 * has been taken then no allocations nor frees can be performed 43 * The role of the slub_lock is to protect the list of all the slabs
41 * on the objects in the slab nor can the slab be added or removed 44 * and to synchronize major metadata changes to slab cache structures.
42 * from the partial or full lists since this would mean modifying 45 *
43 * the page_struct of the slab. 46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning
49 * A. page->freelist -> List of object free in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
52 *
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
44 * 58 *
45 * The list_lock protects the partial and full list on each node and 59 * The list_lock protects the partial and full list on each node and
46 * the partial slab counter. If taken then no new slabs may be added or 60 * the partial slab counter. If taken then no new slabs may be added or
@@ -53,20 +67,6 @@
53 * slabs, operations can continue without any centralized lock. F.e. 67 * slabs, operations can continue without any centralized lock. F.e.
54 * allocating a long series of objects that fill up slabs does not require 68 * allocating a long series of objects that fill up slabs does not require
55 * the list lock. 69 * the list lock.
56 *
57 * The lock order is sometimes inverted when we are trying to get a slab
58 * off a list. We take the list_lock and then look for a page on the list
59 * to use. While we do that objects in the slabs may be freed. We can
60 * only operate on the slab if we have also taken the slab_lock. So we use
61 * a slab_trylock() on the slab. If trylock was successful then no frees
62 * can occur anymore and we can use the slab for allocations etc. If the
63 * slab_trylock() does not succeed then frees are in progress in the slab and
64 * we must stay away from it for a while since we may cause a bouncing
65 * cacheline if we try to acquire the lock. So go onto the next slab.
66 * If all pages are busy then we may allocate a new slab instead of reusing
67 * a partial slab. A new slab has no one operating on it and thus there is
68 * no danger of cacheline contention.
69 *
70 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
71 * make the slab allocator safe to use in the context of an irq. In addition 71 * make the slab allocator safe to use in the context of an irq. In addition
72 * interrupts are disabled to ensure that the processor does not change 72 * interrupts are disabled to ensure that the processor does not change
@@ -131,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
131/* Enable to test recovery from slab corruption on boot */ 131/* Enable to test recovery from slab corruption on boot */
132#undef SLUB_RESILIENCY_TEST 132#undef SLUB_RESILIENCY_TEST
133 133
134/* Enable to log cmpxchg failures */
135#undef SLUB_DEBUG_CMPXCHG
136
134/* 137/*
135 * Mininum number of partial slabs. These will be left on the partial 138 * Mininum number of partial slabs. These will be left on the partial
136 * lists even if they are empty. kmem_cache_shrink may reclaim them. 139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -166,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
166 169
167#define OO_SHIFT 16 170#define OO_SHIFT 16
168#define OO_MASK ((1 << OO_SHIFT) - 1) 171#define OO_MASK ((1 << OO_SHIFT) - 1)
169#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 172#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
170 173
171/* Internal SLUB flags */ 174/* Internal SLUB flags */
172#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
173 177
174static int kmem_size = sizeof(struct kmem_cache); 178static int kmem_size = sizeof(struct kmem_cache);
175 179
@@ -191,8 +195,12 @@ static LIST_HEAD(slab_caches);
191/* 195/*
192 * Tracking user of a slab. 196 * Tracking user of a slab.
193 */ 197 */
198#define TRACK_ADDRS_COUNT 16
194struct track { 199struct track {
195 unsigned long addr; /* Called from address */ 200 unsigned long addr; /* Called from address */
201#ifdef CONFIG_STACKTRACE
202 unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
203#endif
196 int cpu; /* Was running on cpu */ 204 int cpu; /* Was running on cpu */
197 int pid; /* Pid context */ 205 int pid; /* Pid context */
198 unsigned long when; /* When did the operation occur */ 206 unsigned long when; /* When did the operation occur */
@@ -338,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
338 return x.x & OO_MASK; 346 return x.x & OO_MASK;
339} 347}
340 348
349/*
350 * Per slab locking using the pagelock
351 */
352static __always_inline void slab_lock(struct page *page)
353{
354 bit_spin_lock(PG_locked, &page->flags);
355}
356
357static __always_inline void slab_unlock(struct page *page)
358{
359 __bit_spin_unlock(PG_locked, &page->flags);
360}
361
362/* Interrupts must be disabled (for the fallback code to work right) */
363static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
366 const char *n)
367{
368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
374 return 1;
375 } else
376#endif
377 {
378 slab_lock(page);
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
382 slab_unlock(page);
383 return 1;
384 }
385 slab_unlock(page);
386 }
387
388 cpu_relax();
389 stat(s, CMPXCHG_DOUBLE_FAIL);
390
391#ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
393#endif
394
395 return 0;
396}
397
398static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
401 const char *n)
402{
403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
408 return 1;
409 } else
410#endif
411 {
412 unsigned long flags;
413
414 local_irq_save(flags);
415 slab_lock(page);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
419 slab_unlock(page);
420 local_irq_restore(flags);
421 return 1;
422 }
423 slab_unlock(page);
424 local_irq_restore(flags);
425 }
426
427 cpu_relax();
428 stat(s, CMPXCHG_DOUBLE_FAIL);
429
430#ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
432#endif
433
434 return 0;
435}
436
341#ifdef CONFIG_SLUB_DEBUG 437#ifdef CONFIG_SLUB_DEBUG
342/* 438/*
343 * Determine a map of objects in use on a page. 439 * Determine a map of objects in use on a page.
344 * 440 *
345 * Slab lock or node listlock must be held to guarantee that the page does 441 * Node listlock must be held to guarantee that the page does
346 * not vanish from under us. 442 * not vanish from under us.
347 */ 443 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
@@ -420,6 +516,24 @@ static void set_track(struct kmem_cache *s, void *object,
420 struct track *p = get_track(s, object, alloc); 516 struct track *p = get_track(s, object, alloc);
421 517
422 if (addr) { 518 if (addr) {
519#ifdef CONFIG_STACKTRACE
520 struct stack_trace trace;
521 int i;
522
523 trace.nr_entries = 0;
524 trace.max_entries = TRACK_ADDRS_COUNT;
525 trace.entries = p->addrs;
526 trace.skip = 3;
527 save_stack_trace(&trace);
528
529 /* See rant in lockdep.c */
530 if (trace.nr_entries != 0 &&
531 trace.entries[trace.nr_entries - 1] == ULONG_MAX)
532 trace.nr_entries--;
533
534 for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
535 p->addrs[i] = 0;
536#endif
423 p->addr = addr; 537 p->addr = addr;
424 p->cpu = smp_processor_id(); 538 p->cpu = smp_processor_id();
425 p->pid = current->pid; 539 p->pid = current->pid;
@@ -444,6 +558,16 @@ static void print_track(const char *s, struct track *t)
444 558
445 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 559 printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
446 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 560 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
561#ifdef CONFIG_STACKTRACE
562 {
563 int i;
564 for (i = 0; i < TRACK_ADDRS_COUNT; i++)
565 if (t->addrs[i])
566 printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
567 else
568 break;
569 }
570#endif
447} 571}
448 572
449static void print_tracking(struct kmem_cache *s, void *object) 573static void print_tracking(struct kmem_cache *s, void *object)
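The new addrs[] array in struct track stores a truncated call chain and print_track() walks it until the first zero entry. A rough user-space analogue of that record-and-print pattern, using glibc's backtrace(3) rather than the kernel stacktrace API (the _sketch names are invented here):

#include <execinfo.h>
#include <stdio.h>
#include <string.h>

#define TRACK_ADDRS_COUNT 16

struct track_sketch {
        void *addrs[TRACK_ADDRS_COUNT];
};

static void set_track_sketch(struct track_sketch *t)
{
        int n = backtrace(t->addrs, TRACK_ADDRS_COUNT);

        /* zero the unused tail so the printer can stop at the first 0 */
        memset(&t->addrs[n], 0, (TRACK_ADDRS_COUNT - n) * sizeof(void *));
}

static void print_track_sketch(const struct track_sketch *t)
{
        for (int i = 0; i < TRACK_ADDRS_COUNT && t->addrs[i]; i++)
                printf("\t%p\n", t->addrs[i]);
}

int main(void)
{
        struct track_sketch t;

        set_track_sketch(&t);
        print_track_sketch(&t);
        return 0;
}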
@@ -557,10 +681,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
557 memset(p + s->objsize, val, s->inuse - s->objsize); 681 memset(p + s->objsize, val, s->inuse - s->objsize);
558} 682}
559 683
560static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 684static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
561{ 685{
562 while (bytes) { 686 while (bytes) {
563 if (*start != (u8)value) 687 if (*start != value)
564 return start; 688 return start;
565 start++; 689 start++;
566 bytes--; 690 bytes--;
@@ -568,6 +692,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
568 return NULL; 692 return NULL;
569} 693}
570 694
695static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
696{
697 u64 value64;
698 unsigned int words, prefix;
699
700 if (bytes <= 16)
701 return check_bytes8(start, value, bytes);
702
703 value64 = value | value << 8 | value << 16 | value << 24;
704 value64 = value64 | value64 << 32;
705 prefix = 8 - ((unsigned long)start) % 8;
706
707 if (prefix) {
708 u8 *r = check_bytes8(start, value, prefix);
709 if (r)
710 return r;
711 start += prefix;
712 bytes -= prefix;
713 }
714
715 words = bytes / 8;
716
717 while (words) {
718 if (*(u64 *)start != value64)
719 return check_bytes8(start, value, 8);
720 start += 8;
721 words--;
722 }
723
724 return check_bytes8(start, value, bytes % 8);
725}
726
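check_bytes() above looks for the first byte that does not match a poison value, handling an unaligned prefix byte-wise, then whole 64-bit words, then the tail. A stand-alone sketch of the same idea, simplified: the repeated-byte pattern is built by multiplication and the word load goes through memcpy, which is not how the kernel does it.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static const uint8_t *check_bytes8(const uint8_t *p, uint8_t value, size_t n)
{
        for (; n; p++, n--)
                if (*p != value)
                        return p;               /* first mismatching byte */
        return NULL;
}

static const uint8_t *check_bytes_sketch(const uint8_t *p, uint8_t value, size_t n)
{
        uint64_t pattern = value * 0x0101010101010101ULL;  /* byte repeated 8x */
        size_t prefix;

        if (n <= 16)
                return check_bytes8(p, value, n);

        prefix = (8 - (uintptr_t)p % 8) % 8;               /* bytes up to alignment */
        if (prefix) {
                const uint8_t *r = check_bytes8(p, value, prefix);
                if (r)
                        return r;
                p += prefix;
                n -= prefix;
        }

        for (; n >= 8; p += 8, n -= 8) {
                uint64_t word;

                memcpy(&word, p, sizeof(word));            /* aligned 64-bit load */
                if (word != pattern)
                        return check_bytes8(p, value, 8);  /* pinpoint the byte */
        }

        return check_bytes8(p, value, n);                  /* remaining tail */
}

int main(void)
{
        uint8_t buf[64];

        memset(buf, 0x5a, sizeof(buf));
        buf[41] = 0;                                       /* inject one bad byte */
        printf("mismatch at offset %td\n",
               check_bytes_sketch(buf, 0x5a, sizeof(buf)) - buf);
        return 0;
}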
571static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 727static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
572 void *from, void *to) 728 void *from, void *to)
573{ 729{
@@ -773,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
773static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 929static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
774{ 930{
775 int nr = 0; 931 int nr = 0;
776 void *fp = page->freelist; 932 void *fp;
777 void *object = NULL; 933 void *object = NULL;
778 unsigned long max_objects; 934 unsigned long max_objects;
779 935
936 fp = page->freelist;
780 while (fp && nr <= page->objects) { 937 while (fp && nr <= page->objects) {
781 if (fp == search) 938 if (fp == search)
782 return 1; 939 return 1;
@@ -881,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
881 1038
882/* 1039/*
883 * Tracking of fully allocated slabs for debugging purposes. 1040 * Tracking of fully allocated slabs for debugging purposes.
1041 *
1042 * list_lock must be held.
884 */ 1043 */
885static void add_full(struct kmem_cache_node *n, struct page *page) 1044static void add_full(struct kmem_cache *s,
1045 struct kmem_cache_node *n, struct page *page)
886{ 1046{
887 spin_lock(&n->list_lock); 1047 if (!(s->flags & SLAB_STORE_USER))
1048 return;
1049
888 list_add(&page->lru, &n->full); 1050 list_add(&page->lru, &n->full);
889 spin_unlock(&n->list_lock);
890} 1051}
891 1052
1053/*
1054 * list_lock must be held.
1055 */
892static void remove_full(struct kmem_cache *s, struct page *page) 1056static void remove_full(struct kmem_cache *s, struct page *page)
893{ 1057{
894 struct kmem_cache_node *n;
895
896 if (!(s->flags & SLAB_STORE_USER)) 1058 if (!(s->flags & SLAB_STORE_USER))
897 return; 1059 return;
898 1060
899 n = get_node(s, page_to_nid(page));
900
901 spin_lock(&n->list_lock);
902 list_del(&page->lru); 1061 list_del(&page->lru);
903 spin_unlock(&n->list_lock);
904} 1062}
905 1063
906/* Tracking of the number of slabs for debugging purposes */ 1064/* Tracking of the number of slabs for debugging purposes */
@@ -956,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
956 if (!check_slab(s, page)) 1114 if (!check_slab(s, page))
957 goto bad; 1115 goto bad;
958 1116
959 if (!on_freelist(s, page, object)) {
960 object_err(s, page, object, "Object already allocated");
961 goto bad;
962 }
963
964 if (!check_valid_pointer(s, page, object)) { 1117 if (!check_valid_pointer(s, page, object)) {
965 object_err(s, page, object, "Freelist Pointer check fails"); 1118 object_err(s, page, object, "Freelist Pointer check fails");
966 goto bad; 1119 goto bad;
@@ -993,6 +1146,12 @@ bad:
993static noinline int free_debug_processing(struct kmem_cache *s, 1146static noinline int free_debug_processing(struct kmem_cache *s,
994 struct page *page, void *object, unsigned long addr) 1147 struct page *page, void *object, unsigned long addr)
995{ 1148{
1149 unsigned long flags;
1150 int rc = 0;
1151
1152 local_irq_save(flags);
1153 slab_lock(page);
1154
996 if (!check_slab(s, page)) 1155 if (!check_slab(s, page))
997 goto fail; 1156 goto fail;
998 1157
@@ -1007,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1007 } 1166 }
1008 1167
1009 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1168 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1010 return 0; 1169 goto out;
1011 1170
1012 if (unlikely(s != page->slab)) { 1171 if (unlikely(s != page->slab)) {
1013 if (!PageSlab(page)) { 1172 if (!PageSlab(page)) {
@@ -1024,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1024 goto fail; 1183 goto fail;
1025 } 1184 }
1026 1185
1027 /* Special debug activities for freeing objects */
1028 if (!PageSlubFrozen(page) && !page->freelist)
1029 remove_full(s, page);
1030 if (s->flags & SLAB_STORE_USER) 1186 if (s->flags & SLAB_STORE_USER)
1031 set_track(s, object, TRACK_FREE, addr); 1187 set_track(s, object, TRACK_FREE, addr);
1032 trace(s, page, object, 0); 1188 trace(s, page, object, 0);
1033 init_object(s, object, SLUB_RED_INACTIVE); 1189 init_object(s, object, SLUB_RED_INACTIVE);
1034 return 1; 1190 rc = 1;
1191out:
1192 slab_unlock(page);
1193 local_irq_restore(flags);
1194 return rc;
1035 1195
1036fail: 1196fail:
1037 slab_fix(s, "Object at 0x%p not freed", object); 1197 slab_fix(s, "Object at 0x%p not freed", object);
1038 return 0; 1198 goto out;
1039} 1199}
1040 1200
1041static int __init setup_slub_debug(char *str) 1201static int __init setup_slub_debug(char *str)
@@ -1135,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1135 { return 1; } 1295 { return 1; }
1136static inline int check_object(struct kmem_cache *s, struct page *page, 1296static inline int check_object(struct kmem_cache *s, struct page *page,
1137 void *object, u8 val) { return 1; } 1297 void *object, u8 val) { return 1; }
1138static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1298static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1299 struct page *page) {}
1300static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1139static inline unsigned long kmem_cache_flags(unsigned long objsize, 1301static inline unsigned long kmem_cache_flags(unsigned long objsize,
1140 unsigned long flags, const char *name, 1302 unsigned long flags, const char *name,
1141 void (*ctor)(void *)) 1303 void (*ctor)(void *))
@@ -1187,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1187 struct kmem_cache_order_objects oo = s->oo; 1349 struct kmem_cache_order_objects oo = s->oo;
1188 gfp_t alloc_gfp; 1350 gfp_t alloc_gfp;
1189 1351
1352 flags &= gfp_allowed_mask;
1353
1354 if (flags & __GFP_WAIT)
1355 local_irq_enable();
1356
1190 flags |= s->allocflags; 1357 flags |= s->allocflags;
1191 1358
1192 /* 1359 /*
@@ -1203,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1203 * Try a lower order alloc if possible 1370 * Try a lower order alloc if possible
1204 */ 1371 */
1205 page = alloc_slab_page(flags, node, oo); 1372 page = alloc_slab_page(flags, node, oo);
1206 if (!page)
1207 return NULL;
1208 1373
1209 stat(s, ORDER_FALLBACK); 1374 if (page)
1375 stat(s, ORDER_FALLBACK);
1210 } 1376 }
1211 1377
1378 if (flags & __GFP_WAIT)
1379 local_irq_disable();
1380
1381 if (!page)
1382 return NULL;
1383
1212 if (kmemcheck_enabled 1384 if (kmemcheck_enabled
1213 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1385 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1214 int pages = 1 << oo_order(oo); 1386 int pages = 1 << oo_order(oo);
@@ -1276,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1276 1448
1277 page->freelist = start; 1449 page->freelist = start;
1278 page->inuse = 0; 1450 page->inuse = 0;
1451 page->frozen = 1;
1279out: 1452out:
1280 return page; 1453 return page;
1281} 1454}
@@ -1353,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1353} 1526}
1354 1527
1355/* 1528/*
1356 * Per slab locking using the pagelock 1529 * Management of partially allocated slabs.
1357 */ 1530 *
1358static __always_inline void slab_lock(struct page *page) 1531 * list_lock must be held.
1359{
1360 bit_spin_lock(PG_locked, &page->flags);
1361}
1362
1363static __always_inline void slab_unlock(struct page *page)
1364{
1365 __bit_spin_unlock(PG_locked, &page->flags);
1366}
1367
1368static __always_inline int slab_trylock(struct page *page)
1369{
1370 int rc = 1;
1371
1372 rc = bit_spin_trylock(PG_locked, &page->flags);
1373 return rc;
1374}
1375
1376/*
1377 * Management of partially allocated slabs
1378 */ 1532 */
1379static void add_partial(struct kmem_cache_node *n, 1533static inline void add_partial(struct kmem_cache_node *n,
1380 struct page *page, int tail) 1534 struct page *page, int tail)
1381{ 1535{
1382 spin_lock(&n->list_lock);
1383 n->nr_partial++; 1536 n->nr_partial++;
1384 if (tail) 1537 if (tail)
1385 list_add_tail(&page->lru, &n->partial); 1538 list_add_tail(&page->lru, &n->partial);
1386 else 1539 else
1387 list_add(&page->lru, &n->partial); 1540 list_add(&page->lru, &n->partial);
1388 spin_unlock(&n->list_lock);
1389} 1541}
1390 1542
1391static inline void __remove_partial(struct kmem_cache_node *n, 1543/*
1544 * list_lock must be held.
1545 */
1546static inline void remove_partial(struct kmem_cache_node *n,
1392 struct page *page) 1547 struct page *page)
1393{ 1548{
1394 list_del(&page->lru); 1549 list_del(&page->lru);
1395 n->nr_partial--; 1550 n->nr_partial--;
1396} 1551}
1397 1552
1398static void remove_partial(struct kmem_cache *s, struct page *page)
1399{
1400 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1401
1402 spin_lock(&n->list_lock);
1403 __remove_partial(n, page);
1404 spin_unlock(&n->list_lock);
1405}
1406
1407/* 1553/*
1408 * Lock slab and remove from the partial list. 1554 * Lock slab, remove from the partial list and put the object into the
1555 * per cpu freelist.
1409 * 1556 *
1410 * Must hold list_lock. 1557 * Must hold list_lock.
1411 */ 1558 */
1412static inline int lock_and_freeze_slab(struct kmem_cache_node *n, 1559static inline int acquire_slab(struct kmem_cache *s,
1413 struct page *page) 1560 struct kmem_cache_node *n, struct page *page)
1414{ 1561{
1415 if (slab_trylock(page)) { 1562 void *freelist;
1416 __remove_partial(n, page); 1563 unsigned long counters;
1417 __SetPageSlubFrozen(page); 1564 struct page new;
1565
1566 /*
1567 * Zap the freelist and set the frozen bit.
1568 * The old freelist is the list of objects for the
1569 * per cpu allocation list.
1570 */
1571 do {
1572 freelist = page->freelist;
1573 counters = page->counters;
1574 new.counters = counters;
1575 new.inuse = page->objects;
1576
1577 VM_BUG_ON(new.frozen);
1578 new.frozen = 1;
1579
1580 } while (!__cmpxchg_double_slab(s, page,
1581 freelist, counters,
1582 NULL, new.counters,
1583 "lock and freeze"));
1584
1585 remove_partial(n, page);
1586
1587 if (freelist) {
1588 /* Populate the per cpu freelist */
1589 this_cpu_write(s->cpu_slab->freelist, freelist);
1590 this_cpu_write(s->cpu_slab->page, page);
1591 this_cpu_write(s->cpu_slab->node, page_to_nid(page));
1418 return 1; 1592 return 1;
1593 } else {
1594 /*
1595 * Slab page came from the wrong list. No object to allocate
1596 * from. Put it onto the correct list and continue partial
1597 * scan.
1598 */
1599 printk(KERN_ERR "SLUB: %s : Page without available objects on"
1600 " partial list\n", s->name);
1601 return 0;
1419 } 1602 }
1420 return 0;
1421} 1603}
1422 1604
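acquire_slab() above detaches the slab's entire freelist and marks the page frozen in one cmpxchg_double, so the taker ends up owning every free object on it. A user-space sketch of just that detach step; the frozen bit, inuse accounting and the per cpu writes are elided, and a single atomic pointer exchange stands in for the pair cmpxchg.

#include <stdatomic.h>
#include <stdio.h>

struct obj {
        struct obj *next;
        int id;
};

static _Atomic(struct obj *) slab_freelist;     /* stands in for page->freelist */

/* Take every free object in one shot; NULL afterwards means nothing is left. */
static struct obj *take_whole_freelist(void)
{
        return atomic_exchange(&slab_freelist, NULL);
}

int main(void)
{
        static struct obj o[3];

        for (int i = 0; i < 3; i++) {           /* build a small free list */
                o[i].id = i;
                o[i].next = atomic_load(&slab_freelist);
                atomic_store(&slab_freelist, &o[i]);
        }

        for (struct obj *p = take_whole_freelist(); p; p = p->next)
                printf("acquired object %d\n", p->id);      /* 2, 1, 0 */
        printf("second take is empty: %d\n", take_whole_freelist() == NULL);
        return 0;
}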
1423/* 1605/*
1424 * Try to allocate a partial slab from a specific node. 1606 * Try to allocate a partial slab from a specific node.
1425 */ 1607 */
1426static struct page *get_partial_node(struct kmem_cache_node *n) 1608static struct page *get_partial_node(struct kmem_cache *s,
1609 struct kmem_cache_node *n)
1427{ 1610{
1428 struct page *page; 1611 struct page *page;
1429 1612
@@ -1438,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
1438 1621
1439 spin_lock(&n->list_lock); 1622 spin_lock(&n->list_lock);
1440 list_for_each_entry(page, &n->partial, lru) 1623 list_for_each_entry(page, &n->partial, lru)
1441 if (lock_and_freeze_slab(n, page)) 1624 if (acquire_slab(s, n, page))
1442 goto out; 1625 goto out;
1443 page = NULL; 1626 page = NULL;
1444out: 1627out:
@@ -1489,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1489 1672
1490 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1673 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1491 n->nr_partial > s->min_partial) { 1674 n->nr_partial > s->min_partial) {
1492 page = get_partial_node(n); 1675 page = get_partial_node(s, n);
1493 if (page) { 1676 if (page) {
1494 put_mems_allowed(); 1677 put_mems_allowed();
1495 return page; 1678 return page;
@@ -1509,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1509 struct page *page; 1692 struct page *page;
1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1693 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1511 1694
1512 page = get_partial_node(get_node(s, searchnode)); 1695 page = get_partial_node(s, get_node(s, searchnode));
1513 if (page || node != NUMA_NO_NODE) 1696 if (page || node != NUMA_NO_NODE)
1514 return page; 1697 return page;
1515 1698
1516 return get_any_partial(s, flags); 1699 return get_any_partial(s, flags);
1517} 1700}
1518 1701
1519/*
1520 * Move a page back to the lists.
1521 *
1522 * Must be called with the slab lock held.
1523 *
1524 * On exit the slab lock will have been dropped.
1525 */
1526static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1527 __releases(bitlock)
1528{
1529 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1530
1531 __ClearPageSlubFrozen(page);
1532 if (page->inuse) {
1533
1534 if (page->freelist) {
1535 add_partial(n, page, tail);
1536 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1537 } else {
1538 stat(s, DEACTIVATE_FULL);
1539 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1540 add_full(n, page);
1541 }
1542 slab_unlock(page);
1543 } else {
1544 stat(s, DEACTIVATE_EMPTY);
1545 if (n->nr_partial < s->min_partial) {
1546 /*
1547 * Adding an empty slab to the partial slabs in order
1548 * to avoid page allocator overhead. This slab needs
1549 * to come after the other slabs with objects in
1550 * so that the others get filled first. That way the
1551 * size of the partial list stays small.
1552 *
1553 * kmem_cache_shrink can reclaim any empty slabs from
1554 * the partial list.
1555 */
1556 add_partial(n, page, 1);
1557 slab_unlock(page);
1558 } else {
1559 slab_unlock(page);
1560 stat(s, FREE_SLAB);
1561 discard_slab(s, page);
1562 }
1563 }
1564}
1565
1566#ifdef CONFIG_PREEMPT 1702#ifdef CONFIG_PREEMPT
1567/* 1703/*
1568 * Calculate the next globally unique transaction for disambiguation 1704
@@ -1632,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1632/* 1768/*
1633 * Remove the cpu slab 1769 * Remove the cpu slab
1634 */ 1770 */
1771
1772/*
1773 * Remove the cpu slab
1774 */
1635static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1775static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1636 __releases(bitlock)
1637{ 1776{
1777 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1638 struct page *page = c->page; 1778 struct page *page = c->page;
1639 int tail = 1; 1779 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1640 1780 int lock = 0;
1641 if (page->freelist) 1781 enum slab_modes l = M_NONE, m = M_NONE;
1782 void *freelist;
1783 void *nextfree;
1784 int tail = 0;
1785 struct page new;
1786 struct page old;
1787
1788 if (page->freelist) {
1642 stat(s, DEACTIVATE_REMOTE_FREES); 1789 stat(s, DEACTIVATE_REMOTE_FREES);
1790 tail = 1;
1791 }
1792
1793 c->tid = next_tid(c->tid);
1794 c->page = NULL;
1795 freelist = c->freelist;
1796 c->freelist = NULL;
1797
1643 /* 1798 /*
1644 * Merge cpu freelist into slab freelist. Typically we get here 1799 * Stage one: Free all available per cpu objects back
1645 * because both freelists are empty. So this is unlikely 1800 * to the page freelist while it is still frozen. Leave the
1646 * to occur. 1801 * last one.
1802 *
1803 * There is no need to take the list->lock because the page
1804 * is still frozen.
1647 */ 1805 */
1648 while (unlikely(c->freelist)) { 1806 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1649 void **object; 1807 void *prior;
1808 unsigned long counters;
1809
1810 do {
1811 prior = page->freelist;
1812 counters = page->counters;
1813 set_freepointer(s, freelist, prior);
1814 new.counters = counters;
1815 new.inuse--;
1816 VM_BUG_ON(!new.frozen);
1817
1818 } while (!__cmpxchg_double_slab(s, page,
1819 prior, counters,
1820 freelist, new.counters,
1821 "drain percpu freelist"));
1822
1823 freelist = nextfree;
1824 }
1650 1825
1651 tail = 0; /* Hot objects. Put the slab first */ 1826 /*
1827 * Stage two: Ensure that the page is unfrozen while the
1828 * list presence reflects the actual number of objects
1829 * during unfreeze.
1830 *
1831 * We set up the list membership and then perform a cmpxchg
1832 * with the count. If there is a mismatch then the page
1833 * is not unfrozen but the page is on the wrong list.
1834 *
1835 * Then we restart the process, which may have to remove
1836 * the page again from the list that we just put it on,
1837 * because the number of objects in the slab may have
1838 * changed.
1839 */
1840redo:
1652 1841
1653 /* Retrieve object from cpu_freelist */ 1842 old.freelist = page->freelist;
1654 object = c->freelist; 1843 old.counters = page->counters;
1655 c->freelist = get_freepointer(s, c->freelist); 1844 VM_BUG_ON(!old.frozen);
1845
1846 /* Determine target state of the slab */
1847 new.counters = old.counters;
1848 if (freelist) {
1849 new.inuse--;
1850 set_freepointer(s, freelist, old.freelist);
1851 new.freelist = freelist;
1852 } else
1853 new.freelist = old.freelist;
1854
1855 new.frozen = 0;
1656 1856
1657 /* And put onto the regular freelist */ 1857 if (!new.inuse && n->nr_partial < s->min_partial)
1658 set_freepointer(s, object, page->freelist); 1858 m = M_FREE;
1659 page->freelist = object; 1859 else if (new.freelist) {
1660 page->inuse--; 1860 m = M_PARTIAL;
1861 if (!lock) {
1862 lock = 1;
1863 /*
1864 * Taking the spinlock removes the possibility
1865 * that acquire_slab() will see a slab page that
1866 * is frozen
1867 */
1868 spin_lock(&n->list_lock);
1869 }
1870 } else {
1871 m = M_FULL;
1872 if (kmem_cache_debug(s) && !lock) {
1873 lock = 1;
1874 /*
1875 * This also ensures that the scanning of full
1876 * slabs from diagnostic functions will not see
1877 * any frozen slabs.
1878 */
1879 spin_lock(&n->list_lock);
1880 }
1881 }
1882
1883 if (l != m) {
1884
1885 if (l == M_PARTIAL)
1886
1887 remove_partial(n, page);
1888
1889 else if (l == M_FULL)
1890
1891 remove_full(s, page);
1892
1893 if (m == M_PARTIAL) {
1894
1895 add_partial(n, page, tail);
1896 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1897
1898 } else if (m == M_FULL) {
1899
1900 stat(s, DEACTIVATE_FULL);
1901 add_full(s, n, page);
1902
1903 }
1904 }
1905
1906 l = m;
1907 if (!__cmpxchg_double_slab(s, page,
1908 old.freelist, old.counters,
1909 new.freelist, new.counters,
1910 "unfreezing slab"))
1911 goto redo;
1912
1913 if (lock)
1914 spin_unlock(&n->list_lock);
1915
1916 if (m == M_FREE) {
1917 stat(s, DEACTIVATE_EMPTY);
1918 discard_slab(s, page);
1919 stat(s, FREE_SLAB);
1661 } 1920 }
1662 c->page = NULL;
1663 c->tid = next_tid(c->tid);
1664 unfreeze_slab(s, page, tail);
1665} 1921}
1666 1922
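The new deactivate_slab() boils down to deciding which list the unfrozen page belongs on (M_FREE, M_PARTIAL or M_FULL) and then publishing that state with a cmpxchg, retrying if the slab changed underneath. Below is a sketch of only the placement decision, with the retry loop and list_lock handling left out; the empty-slab policy here follows the comment from the removed unfreeze_slab() (keep empty slabs only while the partial list is short), which is an assumption of the sketch rather than a restatement of the hunk.

#include <stdbool.h>
#include <stdio.h>

enum slab_mode { M_NONE, M_PARTIAL, M_FULL, M_FREE };

static enum slab_mode unfreeze_target(unsigned int inuse, bool has_free_objects,
                                      unsigned long nr_partial,
                                      unsigned long min_partial)
{
        if (!inuse)             /* empty: keep a few around, discard the rest */
                return nr_partial < min_partial ? M_PARTIAL : M_FREE;
        if (has_free_objects)   /* other cpus can still allocate from it */
                return M_PARTIAL;
        return M_FULL;          /* fully allocated: full list (debug tracking) */
}

int main(void)
{
        printf("empty, short partial list: %d\n", unfreeze_target(0, true, 2, 5));
        printf("empty, long partial list : %d\n", unfreeze_target(0, true, 9, 5));
        printf("half used                : %d\n", unfreeze_target(4, true, 9, 5));
        printf("fully used               : %d\n", unfreeze_target(8, false, 9, 5));
        return 0;
}

Keeping a few empty slabs on the partial list avoids a round trip to the page allocator; the cmpxchg in the real code only unfreezes the page once its list membership already matches the chosen target.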
1667static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1923static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1668{ 1924{
1669 stat(s, CPUSLAB_FLUSH); 1925 stat(s, CPUSLAB_FLUSH);
1670 slab_lock(c->page);
1671 deactivate_slab(s, c); 1926 deactivate_slab(s, c);
1672} 1927}
1673 1928
@@ -1796,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1796 void **object; 2051 void **object;
1797 struct page *page; 2052 struct page *page;
1798 unsigned long flags; 2053 unsigned long flags;
2054 struct page new;
2055 unsigned long counters;
1799 2056
1800 local_irq_save(flags); 2057 local_irq_save(flags);
1801#ifdef CONFIG_PREEMPT 2058#ifdef CONFIG_PREEMPT
@@ -1814,72 +2071,97 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1814 if (!page) 2071 if (!page)
1815 goto new_slab; 2072 goto new_slab;
1816 2073
1817 slab_lock(page); 2074 if (unlikely(!node_match(c, node))) {
1818 if (unlikely(!node_match(c, node))) 2075 stat(s, ALLOC_NODE_MISMATCH);
1819 goto another_slab; 2076 deactivate_slab(s, c);
2077 goto new_slab;
2078 }
2079
2080 stat(s, ALLOC_SLOWPATH);
2081
2082 do {
2083 object = page->freelist;
2084 counters = page->counters;
2085 new.counters = counters;
2086 VM_BUG_ON(!new.frozen);
2087
2088 /*
2089 * If there is no object left then we use this loop to
2090 * deactivate the slab which is simple since no objects
2091 * are left in the slab and therefore we do not need to
2092 * put the page back onto the partial list.
2093 *
2094 * If there are objects left then we retrieve them
2095 * and use them to refill the per cpu queue.
2096 */
2097
2098 new.inuse = page->objects;
2099 new.frozen = object != NULL;
2100
2101 } while (!__cmpxchg_double_slab(s, page,
2102 object, counters,
2103 NULL, new.counters,
2104 "__slab_alloc"));
2105
2106 if (unlikely(!object)) {
2107 c->page = NULL;
2108 stat(s, DEACTIVATE_BYPASS);
2109 goto new_slab;
2110 }
1820 2111
1821 stat(s, ALLOC_REFILL); 2112 stat(s, ALLOC_REFILL);
1822 2113
1823load_freelist: 2114load_freelist:
1824 object = page->freelist; 2115 VM_BUG_ON(!page->frozen);
1825 if (unlikely(!object))
1826 goto another_slab;
1827 if (kmem_cache_debug(s))
1828 goto debug;
1829
1830 c->freelist = get_freepointer(s, object); 2116 c->freelist = get_freepointer(s, object);
1831 page->inuse = page->objects;
1832 page->freelist = NULL;
1833
1834 slab_unlock(page);
1835 c->tid = next_tid(c->tid); 2117 c->tid = next_tid(c->tid);
1836 local_irq_restore(flags); 2118 local_irq_restore(flags);
1837 stat(s, ALLOC_SLOWPATH);
1838 return object; 2119 return object;
1839 2120
1840another_slab:
1841 deactivate_slab(s, c);
1842
1843new_slab: 2121new_slab:
1844 page = get_partial(s, gfpflags, node); 2122 page = get_partial(s, gfpflags, node);
1845 if (page) { 2123 if (page) {
1846 stat(s, ALLOC_FROM_PARTIAL); 2124 stat(s, ALLOC_FROM_PARTIAL);
1847 c->node = page_to_nid(page); 2125 object = c->freelist;
1848 c->page = page; 2126
2127 if (kmem_cache_debug(s))
2128 goto debug;
1849 goto load_freelist; 2129 goto load_freelist;
1850 } 2130 }
1851 2131
1852 gfpflags &= gfp_allowed_mask;
1853 if (gfpflags & __GFP_WAIT)
1854 local_irq_enable();
1855
1856 page = new_slab(s, gfpflags, node); 2132 page = new_slab(s, gfpflags, node);
1857 2133
1858 if (gfpflags & __GFP_WAIT)
1859 local_irq_disable();
1860
1861 if (page) { 2134 if (page) {
1862 c = __this_cpu_ptr(s->cpu_slab); 2135 c = __this_cpu_ptr(s->cpu_slab);
1863 stat(s, ALLOC_SLAB);
1864 if (c->page) 2136 if (c->page)
1865 flush_slab(s, c); 2137 flush_slab(s, c);
1866 2138
1867 slab_lock(page); 2139 /*
1868 __SetPageSlubFrozen(page); 2140 * No other reference to the page yet so we can
2141 * muck around with it freely without cmpxchg
2142 */
2143 object = page->freelist;
2144 page->freelist = NULL;
2145 page->inuse = page->objects;
2146
2147 stat(s, ALLOC_SLAB);
1869 c->node = page_to_nid(page); 2148 c->node = page_to_nid(page);
1870 c->page = page; 2149 c->page = page;
2150
2151 if (kmem_cache_debug(s))
2152 goto debug;
1871 goto load_freelist; 2153 goto load_freelist;
1872 } 2154 }
1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2155 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1874 slab_out_of_memory(s, gfpflags, node); 2156 slab_out_of_memory(s, gfpflags, node);
1875 local_irq_restore(flags); 2157 local_irq_restore(flags);
1876 return NULL; 2158 return NULL;
2159
1877debug: 2160debug:
1878 if (!alloc_debug_processing(s, page, object, addr)) 2161 if (!object || !alloc_debug_processing(s, page, object, addr))
1879 goto another_slab; 2162 goto new_slab;
1880 2163
1881 page->inuse++; 2164 c->freelist = get_freepointer(s, object);
1882 page->freelist = get_freepointer(s, object);
1883 deactivate_slab(s, c); 2165 deactivate_slab(s, c);
1884 c->page = NULL; 2166 c->page = NULL;
1885 c->node = NUMA_NO_NODE; 2167 c->node = NUMA_NO_NODE;
@@ -2031,40 +2313,75 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2031{ 2313{
2032 void *prior; 2314 void *prior;
2033 void **object = (void *)x; 2315 void **object = (void *)x;
2034 unsigned long flags; 2316 int was_frozen;
2317 int inuse;
2318 struct page new;
2319 unsigned long counters;
2320 struct kmem_cache_node *n = NULL;
2321 unsigned long uninitialized_var(flags);
2035 2322
2036 local_irq_save(flags);
2037 slab_lock(page);
2038 stat(s, FREE_SLOWPATH); 2323 stat(s, FREE_SLOWPATH);
2039 2324
2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2325 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2041 goto out_unlock; 2326 return;
2042 2327
2043 prior = page->freelist; 2328 do {
2044 set_freepointer(s, object, prior); 2329 prior = page->freelist;
2045 page->freelist = object; 2330 counters = page->counters;
2046 page->inuse--; 2331 set_freepointer(s, object, prior);
2332 new.counters = counters;
2333 was_frozen = new.frozen;
2334 new.inuse--;
2335 if ((!new.inuse || !prior) && !was_frozen && !n) {
2336 n = get_node(s, page_to_nid(page));
2337 /*
2338 * Speculatively acquire the list_lock.
2339 * If the cmpxchg does not succeed then we may
2340 * drop the list_lock without any processing.
2341 *
2342 * Otherwise the list_lock will synchronize with
2343 * other processors updating the list of slabs.
2344 */
2345 spin_lock_irqsave(&n->list_lock, flags);
2346 }
2347 inuse = new.inuse;
2047 2348
2048 if (unlikely(PageSlubFrozen(page))) { 2349 } while (!cmpxchg_double_slab(s, page,
2049 stat(s, FREE_FROZEN); 2350 prior, counters,
2050 goto out_unlock; 2351 object, new.counters,
2051 } 2352 "__slab_free"));
2052 2353
2053 if (unlikely(!page->inuse)) 2354 if (likely(!n)) {
2054 goto slab_empty; 2355 /*
2356 * The list lock was not taken therefore no list
2357 * activity can be necessary.
2358 */
2359 if (was_frozen)
2360 stat(s, FREE_FROZEN);
2361 return;
2362 }
2055 2363
2056 /* 2364 /*
2057 * Objects left in the slab. If it was not on the partial list before 2365 * was_frozen may have been set after we acquired the list_lock in
2058 * then add it. 2366 * an earlier loop. So we need to check it here again.
2059 */ 2367 */
2060 if (unlikely(!prior)) { 2368 if (was_frozen)
2061 add_partial(get_node(s, page_to_nid(page)), page, 1); 2369 stat(s, FREE_FROZEN);
2062 stat(s, FREE_ADD_PARTIAL); 2370 else {
2063 } 2371 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2372 goto slab_empty;
2064 2373
2065out_unlock: 2374 /*
2066 slab_unlock(page); 2375 * Objects left in the slab. If it was not on the partial list before
2067 local_irq_restore(flags); 2376 * then add it.
2377 */
2378 if (unlikely(!prior)) {
2379 remove_full(s, page);
2380 add_partial(n, page, 0);
2381 stat(s, FREE_ADD_PARTIAL);
2382 }
2383 }
2384 spin_unlock_irqrestore(&n->list_lock, flags);
2068 return; 2385 return;
2069 2386
2070slab_empty: 2387slab_empty:
@@ -2072,11 +2389,11 @@ slab_empty:
2072 /* 2389 /*
2073 * Slab still on the partial list. 2390 * Slab still on the partial list.
2074 */ 2391 */
2075 remove_partial(s, page); 2392 remove_partial(n, page);
2076 stat(s, FREE_REMOVE_PARTIAL); 2393 stat(s, FREE_REMOVE_PARTIAL);
2077 } 2394 }
2078 slab_unlock(page); 2395
2079 local_irq_restore(flags); 2396 spin_unlock_irqrestore(&n->list_lock, flags);
2080 stat(s, FREE_SLAB); 2397 stat(s, FREE_SLAB);
2081 discard_slab(s, page); 2398 discard_slab(s, page);
2082} 2399}
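The heart of the reworked __slab_free() is the retry loop: link the object in front of the current freelist, then publish the new head (together with the updated counters) with a cmpxchg, and redo the linking if another cpu won the race. A user-space sketch of that loop shape with a single atomic head pointer; the counters word, frozen handling and the speculative list_lock that the real code needs are left out.

#include <stdatomic.h>
#include <stdio.h>

struct obj {
        struct obj *next;
};

static _Atomic(struct obj *) page_freelist;

static void slab_free_sketch(struct obj *object)
{
        struct obj *prior = atomic_load(&page_freelist);

        do {
                object->next = prior;   /* set_freepointer(s, object, prior) */
                /* on failure the current head is reloaded into 'prior' */
        } while (!atomic_compare_exchange_weak(&page_freelist, &prior, object));
}

int main(void)
{
        static struct obj a, b;
        int n = 0;

        slab_free_sketch(&a);
        slab_free_sketch(&b);

        for (struct obj *p = atomic_load(&page_freelist); p; p = p->next)
                n++;
        printf("objects on the freelist: %d\n", n);     /* 2 */
        return 0;
}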
@@ -2350,7 +2667,6 @@ static void early_kmem_cache_node_alloc(int node)
2350{ 2667{
2351 struct page *page; 2668 struct page *page;
2352 struct kmem_cache_node *n; 2669 struct kmem_cache_node *n;
2353 unsigned long flags;
2354 2670
2355 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2671 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2356 2672
@@ -2368,6 +2684,7 @@ static void early_kmem_cache_node_alloc(int node)
2368 BUG_ON(!n); 2684 BUG_ON(!n);
2369 page->freelist = get_freepointer(kmem_cache_node, n); 2685 page->freelist = get_freepointer(kmem_cache_node, n);
2370 page->inuse++; 2686 page->inuse++;
2687 page->frozen = 0;
2371 kmem_cache_node->node[node] = n; 2688 kmem_cache_node->node[node] = n;
2372#ifdef CONFIG_SLUB_DEBUG 2689#ifdef CONFIG_SLUB_DEBUG
2373 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2690 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2376,14 +2693,7 @@ static void early_kmem_cache_node_alloc(int node)
2376 init_kmem_cache_node(n, kmem_cache_node); 2693 init_kmem_cache_node(n, kmem_cache_node);
2377 inc_slabs_node(kmem_cache_node, node, page->objects); 2694 inc_slabs_node(kmem_cache_node, node, page->objects);
2378 2695
2379 /*
2380 * lockdep requires consistent irq usage for each lock
2381 * so even though there cannot be a race this early in
2382 * the boot sequence, we still disable irqs.
2383 */
2384 local_irq_save(flags);
2385 add_partial(n, page, 0); 2696 add_partial(n, page, 0);
2386 local_irq_restore(flags);
2387} 2697}
2388 2698
2389static void free_kmem_cache_nodes(struct kmem_cache *s) 2699static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2589,6 +2899,12 @@ static int kmem_cache_open(struct kmem_cache *s,
2589 } 2899 }
2590 } 2900 }
2591 2901
2902#ifdef CONFIG_CMPXCHG_DOUBLE
2903 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2904 /* Enable fast mode */
2905 s->flags |= __CMPXCHG_DOUBLE;
2906#endif
2907
2592 /* 2908 /*
2593 * The larger the object size is, the more pages we want on the partial 2909 * The larger the object size is, the more pages we want on the partial
2594 * list to avoid pounding the page allocator excessively. 2910 * list to avoid pounding the page allocator excessively.
@@ -2661,7 +2977,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2661 spin_lock_irqsave(&n->list_lock, flags); 2977 spin_lock_irqsave(&n->list_lock, flags);
2662 list_for_each_entry_safe(page, h, &n->partial, lru) { 2978 list_for_each_entry_safe(page, h, &n->partial, lru) {
2663 if (!page->inuse) { 2979 if (!page->inuse) {
2664 __remove_partial(n, page); 2980 remove_partial(n, page);
2665 discard_slab(s, page); 2981 discard_slab(s, page);
2666 } else { 2982 } else {
2667 list_slab_objects(s, page, 2983 list_slab_objects(s, page,
@@ -2928,6 +3244,42 @@ size_t ksize(const void *object)
2928} 3244}
2929EXPORT_SYMBOL(ksize); 3245EXPORT_SYMBOL(ksize);
2930 3246
3247#ifdef CONFIG_SLUB_DEBUG
3248bool verify_mem_not_deleted(const void *x)
3249{
3250 struct page *page;
3251 void *object = (void *)x;
3252 unsigned long flags;
3253 bool rv;
3254
3255 if (unlikely(ZERO_OR_NULL_PTR(x)))
3256 return false;
3257
3258 local_irq_save(flags);
3259
3260 page = virt_to_head_page(x);
3261 if (unlikely(!PageSlab(page))) {
3262 /* maybe it was from stack? */
3263 rv = true;
3264 goto out_unlock;
3265 }
3266
3267 slab_lock(page);
3268 if (on_freelist(page->slab, page, object)) {
3269 object_err(page->slab, page, object, "Object is on free-list");
3270 rv = false;
3271 } else {
3272 rv = true;
3273 }
3274 slab_unlock(page);
3275
3276out_unlock:
3277 local_irq_restore(flags);
3278 return rv;
3279}
3280EXPORT_SYMBOL(verify_mem_not_deleted);
3281#endif
3282
2931void kfree(const void *x) 3283void kfree(const void *x)
2932{ 3284{
2933 struct page *page; 3285 struct page *page;
@@ -2993,14 +3345,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
2993 * list_lock. page->inuse here is the upper limit. 3345 * list_lock. page->inuse here is the upper limit.
2994 */ 3346 */
2995 list_for_each_entry_safe(page, t, &n->partial, lru) { 3347 list_for_each_entry_safe(page, t, &n->partial, lru) {
2996 if (!page->inuse && slab_trylock(page)) { 3348 if (!page->inuse) {
2997 /* 3349 remove_partial(n, page);
2998 * Must hold slab lock here because slab_free
2999 * may have freed the last object and be
3000 * waiting to release the slab.
3001 */
3002 __remove_partial(n, page);
3003 slab_unlock(page);
3004 discard_slab(s, page); 3350 discard_slab(s, page);
3005 } else { 3351 } else {
3006 list_move(&page->lru, 3352 list_move(&page->lru,
@@ -3588,12 +3934,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3588static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3934static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3589 unsigned long *map) 3935 unsigned long *map)
3590{ 3936{
3591 if (slab_trylock(page)) { 3937 slab_lock(page);
3592 validate_slab(s, page, map); 3938 validate_slab(s, page, map);
3593 slab_unlock(page); 3939 slab_unlock(page);
3594 } else
3595 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3596 s->name, page);
3597} 3940}
3598 3941
3599static int validate_slab_node(struct kmem_cache *s, 3942static int validate_slab_node(struct kmem_cache *s,
@@ -4058,7 +4401,7 @@ static int any_slab_objects(struct kmem_cache *s)
4058#endif 4401#endif
4059 4402
4060#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4403#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
4061#define to_slab(n) container_of(n, struct kmem_cache, kobj); 4404#define to_slab(n) container_of(n, struct kmem_cache, kobj)
4062 4405
4063struct slab_attribute { 4406struct slab_attribute {
4064 struct attribute attr; 4407 struct attribute attr;
@@ -4241,8 +4584,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
4241 const char *buf, size_t length) 4584 const char *buf, size_t length)
4242{ 4585{
4243 s->flags &= ~SLAB_DEBUG_FREE; 4586 s->flags &= ~SLAB_DEBUG_FREE;
4244 if (buf[0] == '1') 4587 if (buf[0] == '1') {
4588 s->flags &= ~__CMPXCHG_DOUBLE;
4245 s->flags |= SLAB_DEBUG_FREE; 4589 s->flags |= SLAB_DEBUG_FREE;
4590 }
4246 return length; 4591 return length;
4247} 4592}
4248SLAB_ATTR(sanity_checks); 4593SLAB_ATTR(sanity_checks);
@@ -4256,8 +4601,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4256 size_t length) 4601 size_t length)
4257{ 4602{
4258 s->flags &= ~SLAB_TRACE; 4603 s->flags &= ~SLAB_TRACE;
4259 if (buf[0] == '1') 4604 if (buf[0] == '1') {
4605 s->flags &= ~__CMPXCHG_DOUBLE;
4260 s->flags |= SLAB_TRACE; 4606 s->flags |= SLAB_TRACE;
4607 }
4261 return length; 4608 return length;
4262} 4609}
4263SLAB_ATTR(trace); 4610SLAB_ATTR(trace);
@@ -4274,8 +4621,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4274 return -EBUSY; 4621 return -EBUSY;
4275 4622
4276 s->flags &= ~SLAB_RED_ZONE; 4623 s->flags &= ~SLAB_RED_ZONE;
4277 if (buf[0] == '1') 4624 if (buf[0] == '1') {
4625 s->flags &= ~__CMPXCHG_DOUBLE;
4278 s->flags |= SLAB_RED_ZONE; 4626 s->flags |= SLAB_RED_ZONE;
4627 }
4279 calculate_sizes(s, -1); 4628 calculate_sizes(s, -1);
4280 return length; 4629 return length;
4281} 4630}
@@ -4293,8 +4642,10 @@ static ssize_t poison_store(struct kmem_cache *s,
4293 return -EBUSY; 4642 return -EBUSY;
4294 4643
4295 s->flags &= ~SLAB_POISON; 4644 s->flags &= ~SLAB_POISON;
4296 if (buf[0] == '1') 4645 if (buf[0] == '1') {
4646 s->flags &= ~__CMPXCHG_DOUBLE;
4297 s->flags |= SLAB_POISON; 4647 s->flags |= SLAB_POISON;
4648 }
4298 calculate_sizes(s, -1); 4649 calculate_sizes(s, -1);
4299 return length; 4650 return length;
4300} 4651}
@@ -4312,8 +4663,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
4312 return -EBUSY; 4663 return -EBUSY;
4313 4664
4314 s->flags &= ~SLAB_STORE_USER; 4665 s->flags &= ~SLAB_STORE_USER;
4315 if (buf[0] == '1') 4666 if (buf[0] == '1') {
4667 s->flags &= ~__CMPXCHG_DOUBLE;
4316 s->flags |= SLAB_STORE_USER; 4668 s->flags |= SLAB_STORE_USER;
4669 }
4317 calculate_sizes(s, -1); 4670 calculate_sizes(s, -1);
4318 return length; 4671 return length;
4319} 4672}
@@ -4478,6 +4831,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4478STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4831STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4479STAT_ATTR(ALLOC_SLAB, alloc_slab); 4832STAT_ATTR(ALLOC_SLAB, alloc_slab);
4480STAT_ATTR(ALLOC_REFILL, alloc_refill); 4833STAT_ATTR(ALLOC_REFILL, alloc_refill);
4834STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4481STAT_ATTR(FREE_SLAB, free_slab); 4835STAT_ATTR(FREE_SLAB, free_slab);
4482STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4836STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4483STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4837STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4485,7 +4839,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4485STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4839STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4486STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4840STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4487STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4841STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4842STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4488STAT_ATTR(ORDER_FALLBACK, order_fallback); 4843STAT_ATTR(ORDER_FALLBACK, order_fallback);
4844STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4845STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4489#endif 4846#endif
4490 4847
4491static struct attribute *slab_attrs[] = { 4848static struct attribute *slab_attrs[] = {
@@ -4535,6 +4892,7 @@ static struct attribute *slab_attrs[] = {
4535 &alloc_from_partial_attr.attr, 4892 &alloc_from_partial_attr.attr,
4536 &alloc_slab_attr.attr, 4893 &alloc_slab_attr.attr,
4537 &alloc_refill_attr.attr, 4894 &alloc_refill_attr.attr,
4895 &alloc_node_mismatch_attr.attr,
4538 &free_slab_attr.attr, 4896 &free_slab_attr.attr,
4539 &cpuslab_flush_attr.attr, 4897 &cpuslab_flush_attr.attr,
4540 &deactivate_full_attr.attr, 4898 &deactivate_full_attr.attr,
@@ -4542,7 +4900,10 @@ static struct attribute *slab_attrs[] = {
4542 &deactivate_to_head_attr.attr, 4900 &deactivate_to_head_attr.attr,
4543 &deactivate_to_tail_attr.attr, 4901 &deactivate_to_tail_attr.attr,
4544 &deactivate_remote_frees_attr.attr, 4902 &deactivate_remote_frees_attr.attr,
4903 &deactivate_bypass_attr.attr,
4545 &order_fallback_attr.attr, 4904 &order_fallback_attr.attr,
4905 &cmpxchg_double_fail_attr.attr,
4906 &cmpxchg_double_cpu_fail_attr.attr,
4546#endif 4907#endif
4547#ifdef CONFIG_FAILSLAB 4908#ifdef CONFIG_FAILSLAB
4548 &failslab_attr.attr, 4909 &failslab_attr.attr,
diff --git a/mm/sparse.c b/mm/sparse.c
index aa64b12831a2..858e1dff9b2a 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; 40static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
41#endif 41#endif
42 42
43int page_to_nid(struct page *page) 43int page_to_nid(const struct page *page)
44{ 44{
45 return section_to_node_table[page_to_section(page)]; 45 return section_to_node_table[page_to_section(page)];
46} 46}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index ff8dc1a18cb4..1b8c33907242 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1681,19 +1681,14 @@ out:
1681} 1681}
1682 1682
1683#ifdef CONFIG_PROC_FS 1683#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait) 1684static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{ 1685{
1691 struct proc_swaps *s = file->private_data; 1686 struct seq_file *seq = file->private_data;
1692 1687
1693 poll_wait(file, &proc_poll_wait, wait); 1688 poll_wait(file, &proc_poll_wait, wait);
1694 1689
1695 if (s->event != atomic_read(&proc_poll_event)) { 1690 if (seq->poll_event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event); 1691 seq->poll_event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI; 1692 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 } 1693 }
1699 1694
@@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = {
1783 1778
1784static int swaps_open(struct inode *inode, struct file *file) 1779static int swaps_open(struct inode *inode, struct file *file)
1785{ 1780{
1786 struct proc_swaps *s; 1781 struct seq_file *seq;
1787 int ret; 1782 int ret;
1788 1783
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op); 1784 ret = seq_open(file, &swaps_op);
1796 if (ret) { 1785 if (ret)
1797 kfree(s);
1798 return ret; 1786 return ret;
1799 }
1800 1787
1801 s->seq.private = s; 1788 seq = file->private_data;
1802 s->event = atomic_read(&proc_poll_event); 1789 seq->poll_event = atomic_read(&proc_poll_event);
1803 return ret; 1790 return 0;
1804} 1791}
1805 1792
1806static const struct file_operations proc_swaps_operations = { 1793static const struct file_operations proc_swaps_operations = {
diff --git a/mm/thrash.c b/mm/thrash.c
index fabf2d0f5169..e53f7d02c17c 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -6,7 +6,7 @@
6 * Released under the GPL, see the file COPYING for details. 6 * Released under the GPL, see the file COPYING for details.
7 * 7 *
8 * Simple token based thrashing protection, using the algorithm 8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cs.wm.edu/~sjiang/token.pdf 9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 * 10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> 11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token: 12 * Improved algorithm to pass token:
@@ -30,8 +30,6 @@
30static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg; 32struct mem_cgroup *swap_token_memcg;
33static unsigned int global_faults;
34static unsigned int last_aging;
35 33
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR 34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) 35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
@@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm)
55{ 53{
56 int current_interval; 54 int current_interval;
57 unsigned int old_prio = mm->token_priority; 55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58 58
59 global_faults++; 59 global_faults++;
60 60
@@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm)
67 if (!swap_token_mm) 67 if (!swap_token_mm)
68 goto replace_token; 68 goto replace_token;
69 69
70 /*
71 * Usually, we don't need priority aging because long interval faults
72 * make the priority decrease quickly. But there is one exception: if the
73 * token owner task is sleeping, it never makes long interval faults.
74 * Thus, we need a priority aging mechanism instead. The requirements
75 * for priority aging are:
76 * 1) The aging interval is reasonably long. Too short an aging
77 * interval causes the swap token to be lost quickly and decreases performance.
78 * 2) The swap token owner task has to get priority aging even while
79 * it is asleep.
80 */
70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { 81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2; 82 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults; 83 last_aging = global_faults;
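A tiny stand-alone model of the aging rule the comment above describes: every fault advances a global counter, and once more than an aging interval has passed the token owner's priority is halved, whether or not the owner faulted itself. The interval value below is made up for the sketch.

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 64         /* hypothetical value for the sketch */

static unsigned int token_priority = 16;
static unsigned int global_faults, last_aging;

static void fault_seen(void)
{
        global_faults++;
        if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
                token_priority /= 2;    /* age the (possibly sleeping) owner */
                last_aging = global_faults;
        }
}

int main(void)
{
        for (int i = 0; i < 200; i++)
                fault_seen();
        printf("priority after 200 faults: %u\n", token_priority);  /* 16 -> 2 */
        return 0;
}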
diff --git a/mm/truncate.c b/mm/truncate.c
index e13f22efaad7..232eb2736a79 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page)
199 * The first pass will remove most pages, so the search cost of the second pass 199 * The first pass will remove most pages, so the search cost of the second pass
200 * is low. 200 * is low.
201 * 201 *
202 * When looking at page->index outside the page lock we need to be careful to
203 * copy it into a local to avoid races (it could change at any time).
204 *
205 * We pass down the cache-hot hint to the page freeing code. Even if the 202 * We pass down the cache-hot hint to the page freeing code. Even if the
206 * mapping is large, it is probably the case that the final pages are the most 203 * mapping is large, it is probably the case that the final pages are the most
207 * recently touched, and freeing happens in ascending file offset order. 204 * recently touched, and freeing happens in ascending file offset order.
@@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
210 loff_t lstart, loff_t lend) 207 loff_t lstart, loff_t lend)
211{ 208{
212 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 209 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
213 pgoff_t end;
214 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 210 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
215 struct pagevec pvec; 211 struct pagevec pvec;
216 pgoff_t next; 212 pgoff_t index;
213 pgoff_t end;
217 int i; 214 int i;
218 215
219 cleancache_flush_inode(mapping); 216 cleancache_flush_inode(mapping);
@@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping,
224 end = (lend >> PAGE_CACHE_SHIFT); 221 end = (lend >> PAGE_CACHE_SHIFT);
225 222
226 pagevec_init(&pvec, 0); 223 pagevec_init(&pvec, 0);
227 next = start; 224 index = start;
228 while (next <= end && 225 while (index <= end && pagevec_lookup(&pvec, mapping, index,
229 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 226 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
230 mem_cgroup_uncharge_start(); 227 mem_cgroup_uncharge_start();
231 for (i = 0; i < pagevec_count(&pvec); i++) { 228 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 229 struct page *page = pvec.pages[i];
233 pgoff_t page_index = page->index;
234 230
235 if (page_index > end) { 231 /* We rely upon deletion not changing page->index */
236 next = page_index; 232 index = page->index;
233 if (index > end)
237 break; 234 break;
238 }
239 235
240 if (page_index > next)
241 next = page_index;
242 next++;
243 if (!trylock_page(page)) 236 if (!trylock_page(page))
244 continue; 237 continue;
238 WARN_ON(page->index != index);
245 if (PageWriteback(page)) { 239 if (PageWriteback(page)) {
246 unlock_page(page); 240 unlock_page(page);
247 continue; 241 continue;
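The loops above now ask pagevec_lookup() for at most min(end - index, PAGEVEC_SIZE - 1) + 1 pages, so a lookup can never hand back pages beyond the end of the range being truncated. The arithmetic in isolation, with PAGEVEC_SIZE taken as 14 here and each batch simply assumed to come back full:

#include <stdio.h>

#define PAGEVEC_SIZE 14UL

int main(void)
{
        unsigned long start = 3, end = 40;      /* inclusive range of page indices */

        for (unsigned long index = start; index <= end; ) {
                unsigned long want = (end - index < PAGEVEC_SIZE - 1 ?
                                      end - index : PAGEVEC_SIZE - 1) + 1;

                printf("lookup from %lu: request %lu pages\n", index, want);
                index += want;  /* the real loops resync from page->index instead */
        }
        return 0;
}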
@@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
252 pagevec_release(&pvec); 246 pagevec_release(&pvec);
253 mem_cgroup_uncharge_end(); 247 mem_cgroup_uncharge_end();
254 cond_resched(); 248 cond_resched();
249 index++;
255 } 250 }
256 251
257 if (partial) { 252 if (partial) {
@@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping,
264 } 259 }
265 } 260 }
266 261
267 next = start; 262 index = start;
268 for ( ; ; ) { 263 for ( ; ; ) {
269 cond_resched(); 264 cond_resched();
270 if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 265 if (!pagevec_lookup(&pvec, mapping, index,
271 if (next == start) 266 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
267 if (index == start)
272 break; 268 break;
273 next = start; 269 index = start;
274 continue; 270 continue;
275 } 271 }
276 if (pvec.pages[0]->index > end) { 272 if (index == start && pvec.pages[0]->index > end) {
277 pagevec_release(&pvec); 273 pagevec_release(&pvec);
278 break; 274 break;
279 } 275 }
@@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping,
281 for (i = 0; i < pagevec_count(&pvec); i++) { 277 for (i = 0; i < pagevec_count(&pvec); i++) {
282 struct page *page = pvec.pages[i]; 278 struct page *page = pvec.pages[i];
283 279
284 if (page->index > end) 280 /* We rely upon deletion not changing page->index */
281 index = page->index;
282 if (index > end)
285 break; 283 break;
284
286 lock_page(page); 285 lock_page(page);
286 WARN_ON(page->index != index);
287 wait_on_page_writeback(page); 287 wait_on_page_writeback(page);
288 truncate_inode_page(mapping, page); 288 truncate_inode_page(mapping, page);
289 if (page->index > next)
290 next = page->index;
291 next++;
292 unlock_page(page); 289 unlock_page(page);
293 } 290 }
294 pagevec_release(&pvec); 291 pagevec_release(&pvec);
295 mem_cgroup_uncharge_end(); 292 mem_cgroup_uncharge_end();
293 index++;
296 } 294 }
297 cleancache_flush_inode(mapping); 295 cleancache_flush_inode(mapping);
298} 296}
@@ -333,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
333 pgoff_t start, pgoff_t end) 331 pgoff_t start, pgoff_t end)
334{ 332{
335 struct pagevec pvec; 333 struct pagevec pvec;
336 pgoff_t next = start; 334 pgoff_t index = start;
337 unsigned long ret; 335 unsigned long ret;
338 unsigned long count = 0; 336 unsigned long count = 0;
339 int i; 337 int i;
340 338
341 pagevec_init(&pvec, 0); 339 pagevec_init(&pvec, 0);
342 while (next <= end && 340 while (index <= end && pagevec_lookup(&pvec, mapping, index,
343 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 341 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
344 mem_cgroup_uncharge_start(); 342 mem_cgroup_uncharge_start();
345 for (i = 0; i < pagevec_count(&pvec); i++) { 343 for (i = 0; i < pagevec_count(&pvec); i++) {
346 struct page *page = pvec.pages[i]; 344 struct page *page = pvec.pages[i];
347 pgoff_t index;
348 int lock_failed;
349
350 lock_failed = !trylock_page(page);
351 345
352 /* 346 /* We rely upon deletion not changing page->index */
353 * We really shouldn't be looking at the ->index of an
354 * unlocked page. But we're not allowed to lock these
355 * pages. So we rely upon nobody altering the ->index
356 * of this (pinned-by-us) page.
357 */
358 index = page->index; 347 index = page->index;
359 if (index > next) 348 if (index > end)
360 next = index; 349 break;
361 next++;
362 if (lock_failed)
363 continue;
364 350
351 if (!trylock_page(page))
352 continue;
353 WARN_ON(page->index != index);
365 ret = invalidate_inode_page(page); 354 ret = invalidate_inode_page(page);
366 unlock_page(page); 355 unlock_page(page);
367 /* 356 /*
@@ -371,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
371 if (!ret) 360 if (!ret)
372 deactivate_page(page); 361 deactivate_page(page);
373 count += ret; 362 count += ret;
374 if (next > end)
375 break;
376 } 363 }
377 pagevec_release(&pvec); 364 pagevec_release(&pvec);
378 mem_cgroup_uncharge_end(); 365 mem_cgroup_uncharge_end();
379 cond_resched(); 366 cond_resched();
367 index++;
380 } 368 }
381 return count; 369 return count;
382} 370}
@@ -442,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 pgoff_t start, pgoff_t end) 430 pgoff_t start, pgoff_t end)
443{ 431{
444 struct pagevec pvec; 432 struct pagevec pvec;
445 pgoff_t next; 433 pgoff_t index;
446 int i; 434 int i;
447 int ret = 0; 435 int ret = 0;
448 int ret2 = 0; 436 int ret2 = 0;
449 int did_range_unmap = 0; 437 int did_range_unmap = 0;
450 int wrapped = 0;
451 438
452 cleancache_flush_inode(mapping); 439 cleancache_flush_inode(mapping);
453 pagevec_init(&pvec, 0); 440 pagevec_init(&pvec, 0);
454 next = start; 441 index = start;
455 while (next <= end && !wrapped && 442 while (index <= end && pagevec_lookup(&pvec, mapping, index,
456 pagevec_lookup(&pvec, mapping, next, 443 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
457 min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
458 mem_cgroup_uncharge_start(); 444 mem_cgroup_uncharge_start();
459 for (i = 0; i < pagevec_count(&pvec); i++) { 445 for (i = 0; i < pagevec_count(&pvec); i++) {
460 struct page *page = pvec.pages[i]; 446 struct page *page = pvec.pages[i];
461 pgoff_t page_index; 447
448 /* We rely upon deletion not changing page->index */
449 index = page->index;
450 if (index > end)
451 break;
462 452
463 lock_page(page); 453 lock_page(page);
454 WARN_ON(page->index != index);
464 if (page->mapping != mapping) { 455 if (page->mapping != mapping) {
465 unlock_page(page); 456 unlock_page(page);
466 continue; 457 continue;
467 } 458 }
468 page_index = page->index;
469 next = page_index + 1;
470 if (next == 0)
471 wrapped = 1;
472 if (page_index > end) {
473 unlock_page(page);
474 break;
475 }
476 wait_on_page_writeback(page); 459 wait_on_page_writeback(page);
477 if (page_mapped(page)) { 460 if (page_mapped(page)) {
478 if (!did_range_unmap) { 461 if (!did_range_unmap) {
@@ -480,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
480 * Zap the rest of the file in one hit. 463 * Zap the rest of the file in one hit.
481 */ 464 */
482 unmap_mapping_range(mapping, 465 unmap_mapping_range(mapping,
483 (loff_t)page_index<<PAGE_CACHE_SHIFT, 466 (loff_t)index << PAGE_CACHE_SHIFT,
484 (loff_t)(end - page_index + 1) 467 (loff_t)(1 + end - index)
485 << PAGE_CACHE_SHIFT, 468 << PAGE_CACHE_SHIFT,
486 0); 469 0);
487 did_range_unmap = 1; 470 did_range_unmap = 1;
488 } else { 471 } else {
@@ -490,8 +473,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
490 * Just zap this page 473 * Just zap this page
491 */ 474 */
492 unmap_mapping_range(mapping, 475 unmap_mapping_range(mapping,
493 (loff_t)page_index<<PAGE_CACHE_SHIFT, 476 (loff_t)index << PAGE_CACHE_SHIFT,
494 PAGE_CACHE_SIZE, 0); 477 PAGE_CACHE_SIZE, 0);
495 } 478 }
496 } 479 }
497 BUG_ON(page_mapped(page)); 480 BUG_ON(page_mapped(page));
@@ -507,6 +490,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
507 pagevec_release(&pvec); 490 pagevec_release(&pvec);
508 mem_cgroup_uncharge_end(); 491 mem_cgroup_uncharge_end();
509 cond_resched(); 492 cond_resched();
493 index++;
510 } 494 }
511 cleancache_flush_inode(mapping); 495 cleancache_flush_inode(mapping);
512 return ret; 496 return ret;
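The hunk above drops the old next/wrapped bookkeeping and instead relies on the invariant noted in its comment, that deletion never changes page->index: each page's own index is re-read, the walk stops as soon as a page lies past end, and the next batch starts one past the last page seen. A minimal userspace sketch of that iteration pattern, where lookup() and a plain array stand in for pagevec_lookup() and the page cache (all names here are illustrative):

	/*
	 * Userspace sketch of the index-based walk: look up a batch of pages
	 * starting at 'index', re-read each page's own offset, stop once a
	 * page lies beyond 'end', and restart the next batch just past the
	 * last page seen.  The "mapping" is faked as a sorted offset array.
	 */
	#include <stdio.h>
	#include <stddef.h>

	typedef unsigned long pgoff_t;

	#define BATCH 4

	/* pretend these offsets are the pages present in the mapping */
	static const pgoff_t mapping[] = { 3, 4, 7, 8, 9, 15, 21, 22 };
	static const size_t mapping_pages = sizeof(mapping) / sizeof(mapping[0]);

	/* simplified pagevec_lookup(): copy up to 'nr' offsets >= start */
	static size_t lookup(pgoff_t start, pgoff_t *pvec, size_t nr)
	{
		size_t i, found = 0;

		for (i = 0; i < mapping_pages && found < nr; i++)
			if (mapping[i] >= start)
				pvec[found++] = mapping[i];
		return found;
	}

	int main(void)
	{
		pgoff_t end = 20;
		pgoff_t pvec[BATCH];
		pgoff_t index = 0;
		size_t n, i;

		while (index <= end && (n = lookup(index, pvec, BATCH))) {
			for (i = 0; i < n; i++) {
				index = pvec[i];	/* rely on page->index */
				if (index > end)
					break;
				printf("invalidate page %lu\n", index);
			}
			index++;	/* next batch starts past the last page seen */
		}
		return 0;
	}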
@@ -531,8 +515,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
531/** 515/**
532 * truncate_pagecache - unmap and remove pagecache that has been truncated 516 * truncate_pagecache - unmap and remove pagecache that has been truncated
533 * @inode: inode 517 * @inode: inode
534 * @old: old file offset 518 * @oldsize: old file size
535 * @new: new file offset 519 * @newsize: new file size
536 * 520 *
537 * inode's new i_size must already be written before truncate_pagecache 521 * inode's new i_size must already be written before truncate_pagecache
538 * is called. 522 * is called.
@@ -544,9 +528,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2);
544 * situations such as writepage being called for a page that has already 528 * situations such as writepage being called for a page that has already
545 * had its underlying blocks deallocated. 529 * had its underlying blocks deallocated.
546 */ 530 */
547void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) 531void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize)
548{ 532{
549 struct address_space *mapping = inode->i_mapping; 533 struct address_space *mapping = inode->i_mapping;
534 loff_t holebegin = round_up(newsize, PAGE_SIZE);
550 535
551 /* 536 /*
552 * unmap_mapping_range is called twice, first simply for 537 * unmap_mapping_range is called twice, first simply for
@@ -557,9 +542,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
557 * truncate_inode_pages finishes, hence the second 542 * truncate_inode_pages finishes, hence the second
558 * unmap_mapping_range call must be made for correctness. 543 * unmap_mapping_range call must be made for correctness.
559 */ 544 */
560 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 545 unmap_mapping_range(mapping, holebegin, 0, 1);
561 truncate_inode_pages(mapping, new); 546 truncate_inode_pages(mapping, newsize);
562 unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); 547 unmap_mapping_range(mapping, holebegin, 0, 1);
563} 548}
564EXPORT_SYMBOL(truncate_pagecache); 549EXPORT_SYMBOL(truncate_pagecache);
565 550
@@ -589,29 +574,31 @@ EXPORT_SYMBOL(truncate_setsize);
589/** 574/**
590 * vmtruncate - unmap mappings "freed" by truncate() syscall 575 * vmtruncate - unmap mappings "freed" by truncate() syscall
591 * @inode: inode of the file used 576 * @inode: inode of the file used
592 * @offset: file offset to start truncating 577 * @newsize: file offset to start truncating
593 * 578 *
594 * This function is deprecated and truncate_setsize or truncate_pagecache 579 * This function is deprecated and truncate_setsize or truncate_pagecache
595 * should be used instead, together with filesystem specific block truncation. 580 * should be used instead, together with filesystem specific block truncation.
596 */ 581 */
597int vmtruncate(struct inode *inode, loff_t offset) 582int vmtruncate(struct inode *inode, loff_t newsize)
598{ 583{
599 int error; 584 int error;
600 585
601 error = inode_newsize_ok(inode, offset); 586 error = inode_newsize_ok(inode, newsize);
602 if (error) 587 if (error)
603 return error; 588 return error;
604 589
605 truncate_setsize(inode, offset); 590 truncate_setsize(inode, newsize);
606 if (inode->i_op->truncate) 591 if (inode->i_op->truncate)
607 inode->i_op->truncate(inode); 592 inode->i_op->truncate(inode);
608 return 0; 593 return 0;
609} 594}
610EXPORT_SYMBOL(vmtruncate); 595EXPORT_SYMBOL(vmtruncate);
611 596
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) 597int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
613{ 598{
614 struct address_space *mapping = inode->i_mapping; 599 struct address_space *mapping = inode->i_mapping;
600 loff_t holebegin = round_up(lstart, PAGE_SIZE);
601 loff_t holelen = 1 + lend - holebegin;
615 602
616 /* 603 /*
617 * If the underlying filesystem is not going to provide 604 * If the underlying filesystem is not going to provide
@@ -622,12 +609,11 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
622 return -ENOSYS; 609 return -ENOSYS;
623 610
624 mutex_lock(&inode->i_mutex); 611 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem); 612 inode_dio_wait(inode);
626 unmap_mapping_range(mapping, offset, (end - offset), 1); 613 unmap_mapping_range(mapping, holebegin, holelen, 1);
627 inode->i_op->truncate_range(inode, offset, end); 614 inode->i_op->truncate_range(inode, lstart, lend);
628 /* unmap again to remove racily COWed private pages */ 615 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1); 616 unmap_mapping_range(mapping, holebegin, holelen, 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex); 617 mutex_unlock(&inode->i_mutex);
632 618
633 return 0; 619 return 0;
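Both truncate_pagecache() and vmtruncate_range() now compute the unmap start by rounding up to a page boundary, and the range variant derives the hole length from the inclusive end offset. A small userspace sketch of that arithmetic, with round_up() open-coded and a 4096-byte page size assumed for illustration:

	/*
	 * Sketch of the hole arithmetic above: the first byte to unmap is the
	 * new size (or hole start) rounded up to a page boundary, and for
	 * vmtruncate_range() the hole length follows from the inclusive end.
	 */
	#include <stdio.h>

	typedef long long loff_t;

	#define PAGE_SIZE 4096LL
	#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

	int main(void)
	{
		loff_t newsize = 10000;			/* truncate_pagecache() case */
		loff_t lstart = 5000, lend = 20479;	/* vmtruncate_range() case */
		loff_t holebegin, holelen;

		holebegin = round_up(newsize, PAGE_SIZE);
		printf("truncate: unmap from byte %lld (page %lld)\n",
		       holebegin, holebegin / PAGE_SIZE);

		holebegin = round_up(lstart, PAGE_SIZE);
		holelen = 1 + lend - holebegin;
		printf("hole punch: unmap %lld bytes starting at byte %lld\n",
		       holelen, holebegin);
		return 0;
	}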
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1d34d75366a7..464621d18eb2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -26,7 +26,7 @@
26#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
27#include <linux/pfn.h> 27#include <linux/pfn.h>
28#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
29#include <asm/atomic.h> 29#include <linux/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
@@ -452,13 +452,6 @@ overflow:
452 return ERR_PTR(-EBUSY); 452 return ERR_PTR(-EBUSY);
453} 453}
454 454
455static void rcu_free_va(struct rcu_head *head)
456{
457 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
458
459 kfree(va);
460}
461
462static void __free_vmap_area(struct vmap_area *va) 455static void __free_vmap_area(struct vmap_area *va)
463{ 456{
464 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 457 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
@@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va)
491 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) 484 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
492 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); 485 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
493 486
494 call_rcu(&va->rcu_head, rcu_free_va); 487 kfree_rcu(va, rcu_head);
495} 488}
496 489
497/* 490/*
@@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
837 return vb; 830 return vb;
838} 831}
839 832
840static void rcu_free_vb(struct rcu_head *head)
841{
842 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
843
844 kfree(vb);
845}
846
847static void free_vmap_block(struct vmap_block *vb) 833static void free_vmap_block(struct vmap_block *vb)
848{ 834{
849 struct vmap_block *tmp; 835 struct vmap_block *tmp;
@@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb)
856 BUG_ON(tmp != vb); 842 BUG_ON(tmp != vb);
857 843
858 free_vmap_area_noflush(vb->va); 844 free_vmap_area_noflush(vb->va);
859 call_rcu(&vb->rcu_head, rcu_free_vb); 845 kfree_rcu(vb, rcu_head);
860} 846}
861 847
862static void purge_fragmented_blocks(int cpu) 848static void purge_fragmented_blocks(int cpu)
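The two deleted callbacks existed only to turn a pointer to the embedded rcu_head back into its enclosing object and kfree() it; kfree_rcu() can take over because the member's offset is known at compile time. A userspace illustration of that container_of()/offsetof() arithmetic, using a cut-down stand-in rather than the kernel's vmap_area:

	/*
	 * Userspace illustration of the pointer arithmetic behind kfree_rcu():
	 * given a pointer to an embedded member, the enclosing structure can
	 * be recovered from the member's compile-time offset, which is what
	 * the deleted rcu_free_va()/rcu_free_vb() callbacks did by hand.
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <stddef.h>

	struct rcu_head_stub { void *next; void (*func)(void *); };

	struct vmap_area_stub {
		unsigned long va_start;
		unsigned long va_end;
		struct rcu_head_stub rcu_head;
	};

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	/* what a hand-written RCU callback has to do */
	static void rcu_free_stub(struct rcu_head_stub *head)
	{
		struct vmap_area_stub *va =
			container_of(head, struct vmap_area_stub, rcu_head);

		printf("freeing area [%lx, %lx)\n", va->va_start, va->va_end);
		free(va);
	}

	int main(void)
	{
		struct vmap_area_stub *va = malloc(sizeof(*va));

		va->va_start = 0x1000;
		va->va_end = 0x3000;
		/* kfree_rcu(va, rcu_head) encodes this same offset for RCU */
		rcu_free_stub(&va->rcu_head);
		return 0;
	}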
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d036e59d302b..7ef69124fa3e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
95 /* Can pages be swapped as part of reclaim? */ 95 /* Can pages be swapped as part of reclaim? */
96 int may_swap; 96 int may_swap;
97 97
98 int swappiness;
99
100 int order; 98 int order;
101 99
102 /* 100 /*
@@ -107,6 +105,7 @@ struct scan_control {
107 105
108 /* Which cgroup do we reclaim from */ 106 /* Which cgroup do we reclaim from */
109 struct mem_cgroup *mem_cgroup; 107 struct mem_cgroup *mem_cgroup;
108 struct memcg_scanrecord *memcg_record;
110 109
111 /* 110 /*
112 * Nodemask of nodes allowed by the caller. If NULL, all nodes 111 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -173,7 +172,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
173 struct scan_control *sc, enum lru_list lru) 172 struct scan_control *sc, enum lru_list lru)
174{ 173{
175 if (!scanning_global_lru(sc)) 174 if (!scanning_global_lru(sc))
176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); 175 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
176 zone_to_nid(zone), zone_idx(zone), BIT(lru));
177 177
178 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
179} 179}
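zone_nr_lru_pages() now passes BIT(lru) because the memcg helper takes a bitmask of LRU lists, letting a single call sum several lists. A tiny userspace sketch of that mask interface, with made-up enum values and page counts:

	/*
	 * Sketch of the bitmask interface: a mask of BIT(lru) values selects
	 * which LRU lists to sum.  Enum values and counts are illustrative.
	 */
	#include <stdio.h>

	#define BIT(nr) (1UL << (nr))

	enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
			LRU_INACTIVE_FILE, LRU_ACTIVE_FILE, NR_LRU_LISTS };

	static const unsigned long nr_pages[NR_LRU_LISTS] = { 100, 50, 400, 250 };

	static unsigned long nr_lru_pages_masked(unsigned long lru_mask)
	{
		unsigned long total = 0;
		int lru;

		for (lru = 0; lru < NR_LRU_LISTS; lru++)
			if (lru_mask & BIT(lru))
				total += nr_pages[lru];
		return total;
	}

	int main(void)
	{
		/* one list, as zone_nr_lru_pages() does above */
		printf("%lu\n", nr_lru_pages_masked(BIT(LRU_ACTIVE_FILE)));
		/* both file lists in one call */
		printf("%lu\n", nr_lru_pages_masked(BIT(LRU_INACTIVE_FILE) |
						    BIT(LRU_ACTIVE_FILE)));
		return 0;
	}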
@@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink,
250 unsigned long long delta; 250 unsigned long long delta;
251 unsigned long total_scan; 251 unsigned long total_scan;
252 unsigned long max_pass; 252 unsigned long max_pass;
253 int shrink_ret = 0;
254 long nr;
255 long new_nr;
256 long batch_size = shrinker->batch ? shrinker->batch
257 : SHRINK_BATCH;
253 258
259 /*
260 * copy the current shrinker scan count into a local variable
261 * and zero it so that other concurrent shrinker invocations
262 * don't also do this scanning work.
263 */
264 do {
265 nr = shrinker->nr;
266 } while (cmpxchg(&shrinker->nr, nr, 0) != nr);
267
268 total_scan = nr;
254 max_pass = do_shrinker_shrink(shrinker, shrink, 0); 269 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
255 delta = (4 * nr_pages_scanned) / shrinker->seeks; 270 delta = (4 * nr_pages_scanned) / shrinker->seeks;
256 delta *= max_pass; 271 delta *= max_pass;
257 do_div(delta, lru_pages + 1); 272 do_div(delta, lru_pages + 1);
258 shrinker->nr += delta; 273 total_scan += delta;
259 if (shrinker->nr < 0) { 274 if (total_scan < 0) {
260 printk(KERN_ERR "shrink_slab: %pF negative objects to " 275 printk(KERN_ERR "shrink_slab: %pF negative objects to "
261 "delete nr=%ld\n", 276 "delete nr=%ld\n",
262 shrinker->shrink, shrinker->nr); 277 shrinker->shrink, total_scan);
263 shrinker->nr = max_pass; 278 total_scan = max_pass;
264 } 279 }
265 280
266 /* 281 /*
282 * We need to avoid excessive windup on filesystem shrinkers
283 * due to large numbers of GFP_NOFS allocations causing the
284 * shrinkers to return -1 all the time. This results in a large
285 * nr being built up so when a shrink that can do some work
286 * comes along it empties the entire cache due to nr >>>
287 * max_pass. This is bad for sustaining a working set in
288 * memory.
289 *
290 * Hence only allow the shrinker to scan the entire cache when
291 * a large delta change is calculated directly.
292 */
293 if (delta < max_pass / 4)
294 total_scan = min(total_scan, max_pass / 2);
295
296 /*
267 * Avoid risking looping forever due to too large nr value: 297 * Avoid risking looping forever due to too large nr value:
268 * never try to free more than twice the estimate number of 298 * never try to free more than twice the estimate number of
269 * freeable entries. 299 * freeable entries.
270 */ 300 */
271 if (shrinker->nr > max_pass * 2) 301 if (total_scan > max_pass * 2)
272 shrinker->nr = max_pass * 2; 302 total_scan = max_pass * 2;
273 303
274 total_scan = shrinker->nr; 304 trace_mm_shrink_slab_start(shrinker, shrink, nr,
275 shrinker->nr = 0; 305 nr_pages_scanned, lru_pages,
306 max_pass, delta, total_scan);
276 307
277 while (total_scan >= SHRINK_BATCH) { 308 while (total_scan >= batch_size) {
278 long this_scan = SHRINK_BATCH;
279 int shrink_ret;
280 int nr_before; 309 int nr_before;
281 310
282 nr_before = do_shrinker_shrink(shrinker, shrink, 0); 311 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
283 shrink_ret = do_shrinker_shrink(shrinker, shrink, 312 shrink_ret = do_shrinker_shrink(shrinker, shrink,
284 this_scan); 313 batch_size);
285 if (shrink_ret == -1) 314 if (shrink_ret == -1)
286 break; 315 break;
287 if (shrink_ret < nr_before) 316 if (shrink_ret < nr_before)
288 ret += nr_before - shrink_ret; 317 ret += nr_before - shrink_ret;
289 count_vm_events(SLABS_SCANNED, this_scan); 318 count_vm_events(SLABS_SCANNED, batch_size);
290 total_scan -= this_scan; 319 total_scan -= batch_size;
291 320
292 cond_resched(); 321 cond_resched();
293 } 322 }
294 323
295 shrinker->nr += total_scan; 324 /*
325 * move the unused scan count back into the shrinker in a
326 * manner that handles concurrent updates. If we exhausted the
327 * scan, there is no need to do an update.
328 */
329 do {
330 nr = shrinker->nr;
331 new_nr = total_scan + nr;
332 if (total_scan <= 0)
333 break;
334 } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr);
335
336 trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
296 } 337 }
297 up_read(&shrinker_rwsem); 338 up_read(&shrinker_rwsem);
298out: 339out:
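The rewritten shrink_slab() pulls the deferred scan count out of shrinker->nr with a cmpxchg loop so concurrent callers never double-scan the same backlog, works through it in batch_size chunks, and hands any unused remainder back the same way; the windup clamp above additionally keeps a stale backlog from emptying an entire cache in one pass. A userspace sketch of the grab-and-return hand-off, with GCC's __sync builtin standing in for the kernel's cmpxchg and all names illustrative:

	/*
	 * Userspace sketch of the lockless hand-off: grab the deferred scan
	 * count and zero it in one compare-and-swap, then return any unused
	 * remainder the same way so nothing is lost or scanned twice.
	 */
	#include <stdio.h>

	static long shrinker_nr;	/* deferred work, shared between callers */

	static long grab_deferred(void)
	{
		long nr;

		do {
			nr = shrinker_nr;
		} while (__sync_val_compare_and_swap(&shrinker_nr, nr, 0) != nr);
		return nr;
	}

	static void return_unused(long unused)
	{
		long nr, new_nr;

		if (unused <= 0)
			return;
		do {
			nr = shrinker_nr;
			new_nr = nr + unused;
		} while (__sync_val_compare_and_swap(&shrinker_nr, nr, new_nr) != nr);
	}

	int main(void)
	{
		long total_scan, batch = 128;

		shrinker_nr = 1000;
		total_scan = grab_deferred() + 64;	/* backlog plus this call's delta */

		while (total_scan >= batch) {
			/* ... do_shrinker_shrink(batch) would run here ... */
			total_scan -= batch;
		}
		return_unused(total_scan);
		printf("deferred for next call: %ld\n", shrinker_nr);
		return 0;
	}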
@@ -1308,6 +1349,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1308 int file = is_file_lru(lru); 1349 int file = is_file_lru(lru);
1309 int numpages = hpage_nr_pages(page); 1350 int numpages = hpage_nr_pages(page);
1310 reclaim_stat->recent_rotated[file] += numpages; 1351 reclaim_stat->recent_rotated[file] += numpages;
1352 if (!scanning_global_lru(sc))
1353 sc->memcg_record->nr_rotated[file] += numpages;
1311 } 1354 }
1312 if (!pagevec_add(&pvec, page)) { 1355 if (!pagevec_add(&pvec, page)) {
1313 spin_unlock_irq(&zone->lru_lock); 1356 spin_unlock_irq(&zone->lru_lock);
@@ -1351,6 +1394,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1351 1394
1352 reclaim_stat->recent_scanned[0] += *nr_anon; 1395 reclaim_stat->recent_scanned[0] += *nr_anon;
1353 reclaim_stat->recent_scanned[1] += *nr_file; 1396 reclaim_stat->recent_scanned[1] += *nr_file;
1397 if (!scanning_global_lru(sc)) {
1398 sc->memcg_record->nr_scanned[0] += *nr_anon;
1399 sc->memcg_record->nr_scanned[1] += *nr_file;
1400 }
1354} 1401}
1355 1402
1356/* 1403/*
@@ -1464,6 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1511 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1465 } 1512 }
1466 1513
1514 if (!scanning_global_lru(sc))
1515 sc->memcg_record->nr_freed[file] += nr_reclaimed;
1516
1467 local_irq_disable(); 1517 local_irq_disable();
1468 if (current_is_kswapd()) 1518 if (current_is_kswapd())
1469 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1519 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1563,6 +1613,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1563 } 1613 }
1564 1614
1565 reclaim_stat->recent_scanned[file] += nr_taken; 1615 reclaim_stat->recent_scanned[file] += nr_taken;
1616 if (!scanning_global_lru(sc))
1617 sc->memcg_record->nr_scanned[file] += nr_taken;
1566 1618
1567 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1619 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1568 if (file) 1620 if (file)
@@ -1614,6 +1666,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1614 * get_scan_ratio. 1666 * get_scan_ratio.
1615 */ 1667 */
1616 reclaim_stat->recent_rotated[file] += nr_rotated; 1668 reclaim_stat->recent_rotated[file] += nr_rotated;
1669 if (!scanning_global_lru(sc))
1670 sc->memcg_record->nr_rotated[file] += nr_rotated;
1617 1671
1618 move_active_pages_to_lru(zone, &l_active, 1672 move_active_pages_to_lru(zone, &l_active,
1619 LRU_ACTIVE + file * LRU_FILE); 1673 LRU_ACTIVE + file * LRU_FILE);
@@ -1729,6 +1783,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1729 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1783 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1730} 1784}
1731 1785
1786static int vmscan_swappiness(struct scan_control *sc)
1787{
1788 if (scanning_global_lru(sc))
1789 return vm_swappiness;
1790 return mem_cgroup_swappiness(sc->mem_cgroup);
1791}
1792
1732/* 1793/*
1733 * Determine how aggressively the anon and file LRU lists should be 1794 * Determine how aggressively the anon and file LRU lists should be
1734 * scanned. The relative value of each set of LRU lists is determined 1795 * scanned. The relative value of each set of LRU lists is determined
@@ -1748,6 +1809,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1748 enum lru_list l; 1809 enum lru_list l;
1749 int noswap = 0; 1810 int noswap = 0;
1750 int force_scan = 0; 1811 int force_scan = 0;
1812 unsigned long nr_force_scan[2];
1751 1813
1752 1814
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1815 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
@@ -1770,6 +1832,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1770 fraction[0] = 0; 1832 fraction[0] = 0;
1771 fraction[1] = 1; 1833 fraction[1] = 1;
1772 denominator = 1; 1834 denominator = 1;
1835 nr_force_scan[0] = 0;
1836 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1773 goto out; 1837 goto out;
1774 } 1838 }
1775 1839
@@ -1781,6 +1845,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1781 fraction[0] = 1; 1845 fraction[0] = 1;
1782 fraction[1] = 0; 1846 fraction[1] = 0;
1783 denominator = 1; 1847 denominator = 1;
1848 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1849 nr_force_scan[1] = 0;
1784 goto out; 1850 goto out;
1785 } 1851 }
1786 } 1852 }
@@ -1789,8 +1855,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1789 * With swappiness at 100, anonymous and file have the same priority. 1855 * With swappiness at 100, anonymous and file have the same priority.
1790 * This scanning priority is essentially the inverse of IO cost. 1856 * This scanning priority is essentially the inverse of IO cost.
1791 */ 1857 */
1792 anon_prio = sc->swappiness; 1858 anon_prio = vmscan_swappiness(sc);
1793 file_prio = 200 - sc->swappiness; 1859 file_prio = 200 - vmscan_swappiness(sc);
1794 1860
1795 /* 1861 /*
1796 * OK, so we have swap space and a fair amount of page cache 1862 * OK, so we have swap space and a fair amount of page cache
@@ -1829,6 +1895,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1829 fraction[0] = ap; 1895 fraction[0] = ap;
1830 fraction[1] = fp; 1896 fraction[1] = fp;
1831 denominator = ap + fp + 1; 1897 denominator = ap + fp + 1;
1898 if (force_scan) {
1899 unsigned long scan = SWAP_CLUSTER_MAX;
1900 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1901 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1902 }
1832out: 1903out:
1833 for_each_evictable_lru(l) { 1904 for_each_evictable_lru(l) {
1834 int file = is_file_lru(l); 1905 int file = is_file_lru(l);
@@ -1849,12 +1920,8 @@ out:
1849 * memcg, priority drop can cause big latency. So, it's better 1920 * memcg, priority drop can cause big latency. So, it's better
1850 * to scan small amount. See may_noscan above. 1921 * to scan small amount. See may_noscan above.
1851 */ 1922 */
1852 if (!scan && force_scan) { 1923 if (!scan && force_scan)
1853 if (file) 1924 scan = nr_force_scan[file];
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan; 1925 nr[l] = scan;
1859 } 1926 }
1860} 1927}
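With the per-list force-scan values computed up front, the SWAP_CLUSTER_MAX quantum is now split between anon and file in the same ap:fp proportion as the regular scan targets rather than being granted in full to each list. A small userspace sketch of that split, with made-up ap/fp values and plain division standing in for div64_u64():

	/*
	 * Sketch of the nr_force_scan[] computation: split the fixed
	 * SWAP_CLUSTER_MAX quantum between anon and file in the ap:fp ratio.
	 * The ap/fp values below are illustrative only.
	 */
	#include <stdio.h>

	#define SWAP_CLUSTER_MAX 32UL

	int main(void)
	{
		unsigned long ap = 30, fp = 170;	/* e.g. swappiness 30, before pressure scaling */
		unsigned long denominator = ap + fp + 1;
		unsigned long nr_force_scan[2];

		nr_force_scan[0] = SWAP_CLUSTER_MAX * ap / denominator;	/* anon */
		nr_force_scan[1] = SWAP_CLUSTER_MAX * fp / denominator;	/* file */

		printf("anon: %lu pages, file: %lu pages\n",
		       nr_force_scan[0], nr_force_scan[1]);
		return 0;
	}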
@@ -2179,7 +2246,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2179 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2246 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2180 .may_unmap = 1, 2247 .may_unmap = 1,
2181 .may_swap = 1, 2248 .may_swap = 1,
2182 .swappiness = vm_swappiness,
2183 .order = order, 2249 .order = order,
2184 .mem_cgroup = NULL, 2250 .mem_cgroup = NULL,
2185 .nodemask = nodemask, 2251 .nodemask = nodemask,
@@ -2202,10 +2268,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2202#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2268#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2203 2269
2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2270unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2205 gfp_t gfp_mask, bool noswap, 2271 gfp_t gfp_mask, bool noswap,
2206 unsigned int swappiness, 2272 struct zone *zone,
2207 struct zone *zone, 2273 struct memcg_scanrecord *rec,
2208 unsigned long *nr_scanned) 2274 unsigned long *scanned)
2209{ 2275{
2210 struct scan_control sc = { 2276 struct scan_control sc = {
2211 .nr_scanned = 0, 2277 .nr_scanned = 0,
@@ -2213,10 +2279,11 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2213 .may_writepage = !laptop_mode, 2279 .may_writepage = !laptop_mode,
2214 .may_unmap = 1, 2280 .may_unmap = 1,
2215 .may_swap = !noswap, 2281 .may_swap = !noswap,
2216 .swappiness = swappiness,
2217 .order = 0, 2282 .order = 0,
2218 .mem_cgroup = mem, 2283 .mem_cgroup = mem,
2284 .memcg_record = rec,
2219 }; 2285 };
2286 unsigned long start, end;
2220 2287
2221 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2288 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2222 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2289 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2225,6 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2225 sc.may_writepage, 2292 sc.may_writepage,
2226 sc.gfp_mask); 2293 sc.gfp_mask);
2227 2294
2295 start = sched_clock();
2228 /* 2296 /*
2229 * NOTE: Although we can get the priority field, using it 2297 * NOTE: Although we can get the priority field, using it
2230 * here is not a good idea, since it limits the pages we can scan. 2298 * here is not a good idea, since it limits the pages we can scan.
@@ -2233,29 +2301,34 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2233 * the priority and make it zero. 2301 * the priority and make it zero.
2234 */ 2302 */
2235 shrink_zone(0, zone, &sc); 2303 shrink_zone(0, zone, &sc);
2304 end = sched_clock();
2305
2306 if (rec)
2307 rec->elapsed += end - start;
2308 *scanned = sc.nr_scanned;
2236 2309
2237 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2310 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2238 2311
2239 *nr_scanned = sc.nr_scanned;
2240 return sc.nr_reclaimed; 2312 return sc.nr_reclaimed;
2241} 2313}
2242 2314
2243unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2315unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2244 gfp_t gfp_mask, 2316 gfp_t gfp_mask,
2245 bool noswap, 2317 bool noswap,
2246 unsigned int swappiness) 2318 struct memcg_scanrecord *rec)
2247{ 2319{
2248 struct zonelist *zonelist; 2320 struct zonelist *zonelist;
2249 unsigned long nr_reclaimed; 2321 unsigned long nr_reclaimed;
2322 unsigned long start, end;
2250 int nid; 2323 int nid;
2251 struct scan_control sc = { 2324 struct scan_control sc = {
2252 .may_writepage = !laptop_mode, 2325 .may_writepage = !laptop_mode,
2253 .may_unmap = 1, 2326 .may_unmap = 1,
2254 .may_swap = !noswap, 2327 .may_swap = !noswap,
2255 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2328 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2256 .swappiness = swappiness,
2257 .order = 0, 2329 .order = 0,
2258 .mem_cgroup = mem_cont, 2330 .mem_cgroup = mem_cont,
2331 .memcg_record = rec,
2259 .nodemask = NULL, /* we don't care the placement */ 2332 .nodemask = NULL, /* we don't care the placement */
2260 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2333 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2261 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2334 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2264,6 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2264 .gfp_mask = sc.gfp_mask, 2337 .gfp_mask = sc.gfp_mask,
2265 }; 2338 };
2266 2339
2340 start = sched_clock();
2267 /* 2341 /*
2268 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 2342 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2269 * take care of from where we get pages. So the node where we start the 2343 * take care of from where we get pages. So the node where we start the
@@ -2278,6 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2278 sc.gfp_mask); 2352 sc.gfp_mask);
2279 2353
2280 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2354 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2355 end = sched_clock();
2356 if (rec)
2357 rec->elapsed += end - start;
2281 2358
2282 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2359 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2283 2360
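The sched_clock() pairs added around both reclaim paths simply accumulate the time spent reclaiming into the caller's record when one is supplied. A userspace analogue using a monotonic clock, with an illustrative record layout rather than the kernel's memcg_scanrecord:

	/*
	 * Userspace analogue of the elapsed-time accounting: time the work
	 * and accumulate nanoseconds into a caller-supplied record, if any.
	 */
	#include <stdio.h>
	#include <time.h>

	struct scanrecord_stub {
		unsigned long long elapsed;	/* nanoseconds spent reclaiming */
	};

	static unsigned long long now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	static void reclaim_some_pages(void)
	{
		struct timespec delay = { 0, 2 * 1000 * 1000 };	/* 2ms stand-in */

		nanosleep(&delay, NULL);	/* stands in for shrink_zone() etc. */
	}

	static void timed_reclaim(struct scanrecord_stub *rec)
	{
		unsigned long long start = now_ns(), end;

		reclaim_some_pages();
		end = now_ns();
		if (rec)
			rec->elapsed += end - start;
	}

	int main(void)
	{
		struct scanrecord_stub rec = { 0 };

		timed_reclaim(&rec);
		timed_reclaim(&rec);
		printf("elapsed: %llu ns\n", rec.elapsed);
		return 0;
	}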
@@ -2404,7 +2481,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2404 * we want to put equal scanning pressure on each zone. 2481 * we want to put equal scanning pressure on each zone.
2405 */ 2482 */
2406 .nr_to_reclaim = ULONG_MAX, 2483 .nr_to_reclaim = ULONG_MAX,
2407 .swappiness = vm_swappiness,
2408 .order = order, 2484 .order = order,
2409 .mem_cgroup = NULL, 2485 .mem_cgroup = NULL,
2410 }; 2486 };
@@ -2874,7 +2950,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2874 .may_writepage = 1, 2950 .may_writepage = 1,
2875 .nr_to_reclaim = nr_to_reclaim, 2951 .nr_to_reclaim = nr_to_reclaim,
2876 .hibernation_mode = 1, 2952 .hibernation_mode = 1,
2877 .swappiness = vm_swappiness,
2878 .order = 0, 2953 .order = 0,
2879 }; 2954 };
2880 struct shrink_control shrink = { 2955 struct shrink_control shrink = {
@@ -3061,7 +3136,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3061 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3136 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3062 SWAP_CLUSTER_MAX), 3137 SWAP_CLUSTER_MAX),
3063 .gfp_mask = gfp_mask, 3138 .gfp_mask = gfp_mask,
3064 .swappiness = vm_swappiness,
3065 .order = order, 3139 .order = order,
3066 }; 3140 };
3067 struct shrink_control shrink = { 3141 struct shrink_control shrink = {