Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c      82
-rw-r--r--  mm/failslab.c         39
-rw-r--r--  mm/filemap.c         118
-rw-r--r--  mm/highmem.c           4
-rw-r--r--  mm/init-mm.c           2
-rw-r--r--  mm/kmemleak.c          2
-rw-r--r--  mm/memcontrol.c      471
-rw-r--r--  mm/memory-failure.c   92
-rw-r--r--  mm/mempolicy.c        25
-rw-r--r--  mm/mincore.c          11
-rw-r--r--  mm/oom_kill.c          4
-rw-r--r--  mm/page-writeback.c  269
-rw-r--r--  mm/page_alloc.c       60
-rw-r--r--  mm/rmap.c              4
-rw-r--r--  mm/shmem.c          1493
-rw-r--r--  mm/slab.c             99
-rw-r--r--  mm/slob.c              2
-rw-r--r--  mm/slub.c            772
-rw-r--r--  mm/swapfile.c         20
-rw-r--r--  mm/truncate.c          8
-rw-r--r--  mm/vmalloc.c          17
-rw-r--r--  mm/vmscan.c           74
-rw-r--r--  mm/vmstat.c            4
23 files changed, 1879 insertions, 1793 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8290b1e88257..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
 
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+	if (wb1 < wb2) {
+		spin_lock(&wb1->list_lock);
+		spin_lock_nested(&wb2->list_lock, 1);
+	} else {
+		spin_lock(&wb2->list_lock);
+		spin_lock_nested(&wb1->list_lock, 1);
+	}
+}
+
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	struct inode *inode;
 
 	nr_dirty = nr_io = nr_more_io = 0;
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
 	list_for_each_entry(inode, &wb->b_io, i_wb_list)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
-		   "BdiWriteback: %8lu kB\n"
-		   "BdiReclaimable: %8lu kB\n"
-		   "BdiDirtyThresh: %8lu kB\n"
-		   "DirtyThresh: %8lu kB\n"
-		   "BackgroundThresh: %8lu kB\n"
-		   "b_dirty: %8lu\n"
-		   "b_io: %8lu\n"
-		   "b_more_io: %8lu\n"
-		   "bdi_list: %8u\n"
-		   "state: %8lx\n",
+		   "BdiWriteback: %10lu kB\n"
+		   "BdiReclaimable: %10lu kB\n"
+		   "BdiDirtyThresh: %10lu kB\n"
+		   "DirtyThresh: %10lu kB\n"
+		   "BackgroundThresh: %10lu kB\n"
+		   "BdiWritten: %10lu kB\n"
+		   "BdiWriteBandwidth: %10lu kBps\n"
+		   "b_dirty: %10lu\n"
+		   "b_io: %10lu\n"
+		   "b_more_io: %10lu\n"
+		   "bdi_list: %10u\n"
+		   "state: %10lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-		   K(bdi_thresh), K(dirty_thresh),
-		   K(background_thresh), nr_dirty, nr_io, nr_more_io,
+		   K(bdi_thresh),
+		   K(dirty_thresh),
+		   K(background_thresh),
+		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+		   (unsigned long) K(bdi->write_bandwidth),
+		   nr_dirty,
+		   nr_io,
+		   nr_more_io,
 		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
 	return wb_has_dirty_io(&bdi->wb);
 }
 
-static void bdi_flush_io(struct backing_dev_info *bdi)
-{
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_NONE,
-		.older_than_this = NULL,
-		.range_cyclic = 1,
-		.nr_to_write = 1024,
-	};
-
-	writeback_inodes_wb(&bdi->wb, &wbc);
-}
-
 /*
  * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
  * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr)
 		if (IS_ERR(task)) {
 			/*
 			 * If thread creation fails, force writeout of
-			 * the bdi from the thread.
+			 * the bdi from the thread. Hopefully 1024 is
+			 * large enough for efficient IO.
 			 */
-			bdi_flush_io(bdi);
+			writeback_inodes_wb(&bdi->wb, 1024);
 		} else {
 			/*
 			 * The spinlock makes sure we do not lose
@@ -629,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_dirty);
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
+	spin_lock_init(&wb->list_lock);
 	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
 }
 
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW (100 << (20 - PAGE_SHIFT))
+
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int i, err;
@@ -654,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi)
 	}
 
 	bdi->dirty_exceeded = 0;
+
+	bdi->bw_time_stamp = jiffies;
+	bdi->written_stamp = 0;
+
+	bdi->write_bandwidth = INIT_BW;
+	bdi->avg_write_bandwidth = INIT_BW;
+
 	err = prop_local_init_percpu(&bdi->completions);
 
 	if (err) {
@@ -677,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	if (bdi_has_dirty_io(bdi)) {
 		struct bdi_writeback *dst = &default_backing_dev_info.wb;
 
-		spin_lock(&inode_wb_list_lock);
+		bdi_lock_two(&bdi->wb, dst);
 		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
 		list_splice(&bdi->wb.b_io, &dst->b_io);
 		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-		spin_unlock(&inode_wb_list_lock);
+		spin_unlock(&bdi->wb.list_lock);
+		spin_unlock(&dst->list_lock);
 	}
 
 	bdi_unregister(bdi);
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240ddc..0dd7b8fec71c 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct {
 	struct fault_attr attr;
 	u32 ignore_gfp_wait;
 	int cache_filter;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-	struct dentry *ignore_gfp_wait_file;
-	struct dentry *cache_filter_file;
-#endif
 } failslab = {
 	.attr = FAULT_ATTR_INITIALIZER,
 	.ignore_gfp_wait = 1,
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab);
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 static int __init failslab_debugfs_init(void)
 {
-	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 	struct dentry *dir;
-	int err;
-
-	err = init_fault_attr_dentries(&failslab.attr, "failslab");
-	if (err)
-		return err;
-	dir = failslab.attr.dentries.dir;
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
 
-	failslab.ignore_gfp_wait_file =
-		debugfs_create_bool("ignore-gfp-wait", mode, dir,
-				&failslab.ignore_gfp_wait);
+	dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
 
-	failslab.cache_filter_file =
-		debugfs_create_bool("cache-filter", mode, dir,
-				&failslab.cache_filter);
+	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				&failslab.ignore_gfp_wait))
+		goto fail;
+	if (!debugfs_create_bool("cache-filter", mode, dir,
+				&failslab.cache_filter))
+		goto fail;
 
-	if (!failslab.ignore_gfp_wait_file ||
-	    !failslab.cache_filter_file) {
-		err = -ENOMEM;
-		debugfs_remove(failslab.cache_filter_file);
-		debugfs_remove(failslab.ignore_gfp_wait_file);
-		cleanup_fault_attr_dentries(&failslab.attr);
-	}
+	return 0;
+fail:
+	debugfs_remove_recursive(dir);
 
-	return err;
+	return -ENOMEM;
 }
 
 late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index 10a171113273..7771871fa353 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
 
@@ -78,7 +77,7 @@
 *  ->i_mutex (generic_file_buffered_write)
 *    ->mmap_sem (fault_in_pages_readable->do_page_fault)
 *
- *  inode_wb_list_lock
+ *  bdi->wb.list_lock
 *    sb_lock (fs/fs-writeback.c)
 *    ->mapping->tree_lock (__sync_single_inode)
 *
@@ -96,9 +95,9 @@
 *    ->zone.lru_lock (check_pte_range->isolate_lru_page)
 *    ->private_lock (page_remove_rmap->set_page_dirty)
 *    ->tree_lock (page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock (page_remove_rmap->set_page_dirty)
+ *    bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock (page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock (zap_pte_range->set_page_dirty)
+ *    bdi.wb->list_lock (zap_pte_range->set_page_dirty)
 *    ->inode->i_lock (zap_pte_range->set_page_dirty)
 *    ->private_lock (zap_pte_range->__set_page_dirty_buffers)
 *
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	int error;
 
 	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageSwapBacked(page));
 
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	if (likely(!error)) {
 		mapping->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
-		if (PageSwapBacked(page))
-			__inc_zone_page_state(page, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
 	} else {
 		page->mapping = NULL;
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
 	int ret;
 
-	/*
-	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
-	 * before shmem_readpage has a chance to mark them as SwapBacked: they
-	 * need to go on the anon lru below, and mem_cgroup_cache_charge
-	 * (called in add_to_page_cache) needs to know where they're going too.
-	 */
-	if (mapping_cap_swap_backed(mapping))
-		SetPageSwapBacked(page);
-
 	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0) {
-		if (page_is_file_cache(page))
-			lru_cache_add_file(page);
-		else
-			lru_cache_add_anon(page);
-	}
+	if (ret == 0)
+		lru_cache_add_file(page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -714,9 +699,16 @@ repeat:
 		page = radix_tree_deref_slot(pagep);
 		if (unlikely(!page))
 			goto out;
-		if (radix_tree_deref_retry(page))
-			goto repeat;
-
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto repeat;
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so return it without
+			 * attempting to raise page count.
+			 */
+			goto out;
+		}
 		if (!page_cache_get_speculative(page))
 			goto repeat;
 
@@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
 	page = find_get_page(mapping, offset);
-	if (page) {
+	if (page && !radix_tree_exception(page)) {
 		lock_page(page);
 		/* Has the page been truncated? */
 		if (unlikely(page->mapping != mapping)) {
@@ -835,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
-	unsigned int nr_found;
+	unsigned int nr_found, nr_skip;
 
 	rcu_read_lock();
restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, start, nr_pages);
+				(void ***)pages, NULL, start, nr_pages);
 	ret = 0;
+	nr_skip = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
repeat:
@@ -849,13 +842,23 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page)) {
-			WARN_ON(start | i);
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				WARN_ON(start | i);
+				goto restart;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so skip over it -
+			 * we only reach this from invalidate_mapping_pages().
+			 */
+			nr_skip++;
+			continue;
 		}
 
 		if (!page_cache_get_speculative(page))
@@ -875,7 +878,7 @@ repeat:
 	 * If all entries were removed before we could secure them,
 	 * try again, because callers stop trying once 0 is returned.
 	 */
-	if (unlikely(!ret && nr_found))
+	if (unlikely(!ret && nr_found > nr_skip))
 		goto restart;
 	rcu_read_unlock();
 	return ret;
@@ -903,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 	rcu_read_lock();
restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, index, nr_pages);
+				(void ***)pages, NULL, index, nr_pages);
 	ret = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
@@ -912,12 +915,22 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so stop looking for
+			 * contiguous pages.
+			 */
+			break;
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -977,12 +990,21 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+			/*
+			 * This function is never used on a shmem/tmpfs
+			 * mapping, so a swap entry won't be found here.
+			 */
+			BUG();
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..5ef672c07f75 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -326,7 +326,7 @@ static struct page_address_slot {
 	spinlock_t lock; /* Protect this bucket's list */
 } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
 
-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
 {
 	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
 }
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
 *
 * Returns the page's virtual address.
 */
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
 {
 	unsigned long flags;
 	void *ret;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4019979b2637..a56a851908d2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@
 #include <linux/list.h>
 #include <linux/cpumask.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index aacee45616fc..d6880f542f95 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -96,7 +96,7 @@
 
 #include <asm/sections.h>
 #include <asm/processor.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 #include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d25..3508777837c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,7 +35,6 @@
 #include <linux/limits.h>
 #include <linux/mutex.h>
 #include <linux/rbtree.h>
-#include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
@@ -246,10 +245,13 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	atomic_t oom_lock;
+
+	bool oom_lock;
+	atomic_t under_oom;
+
 	atomic_t refcnt;
 
-	unsigned int swappiness;
+	int swappiness;
 	/* OOM-Killer disable */
 	int oom_kill_disable;
 
@@ -636,27 +638,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	preempt_enable();
 }
 
-static unsigned long
-mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+unsigned long
+mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
+			unsigned int lru_mask)
 {
 	struct mem_cgroup_per_zone *mz;
+	enum lru_list l;
+	unsigned long ret = 0;
+
+	mz = mem_cgroup_zoneinfo(mem, nid, zid);
+
+	for_each_lru(l) {
+		if (BIT(l) & lru_mask)
+			ret += MEM_CGROUP_ZSTAT(mz, l);
+	}
+	return ret;
+}
+
+static unsigned long
+mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
+			int nid, unsigned int lru_mask)
+{
 	u64 total = 0;
 	int zid;
 
-	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-		mz = mem_cgroup_zoneinfo(mem, nid, zid);
-		total += MEM_CGROUP_ZSTAT(mz, idx);
-	}
+	for (zid = 0; zid < MAX_NR_ZONES; zid++)
+		total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
+
 	return total;
 }
-static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
-			enum lru_list idx)
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
+			unsigned int lru_mask)
 {
 	int nid;
 	u64 total = 0;
 
-	for_each_online_node(nid)
-		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
+	for_each_node_state(nid, N_HIGH_MEMORY)
+		total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
 	return total;
 }
 
@@ -1043,6 +1062,21 @@ void mem_cgroup_move_lists(struct page *page,
 	mem_cgroup_add_lru_list(page, to);
 }
 
+/*
+ * Checks whether given mem is same or in the root_mem's
+ * hierarchy subtree
+ */
+static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
+		struct mem_cgroup *mem)
+{
+	if (root_mem != mem) {
+		return (root_mem->use_hierarchy &&
+			css_is_ancestor(&mem->css, &root_mem->css));
+	}
+
+	return true;
+}
+
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
@@ -1062,10 +1096,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	 * enabled in "curr" and "curr" is a child of "mem" in *cgroup*
 	 * hierarchy(even if use_hierarchy is disabled in "mem").
 	 */
-	if (mem->use_hierarchy)
-		ret = css_is_ancestor(&curr->css, &mem->css);
-	else
-		ret = (curr == mem);
+	ret = mem_cgroup_same_or_subtree(mem, curr);
 	css_put(&curr->css);
 	return ret;
 }
@@ -1077,8 +1108,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
 	unsigned long gb;
 	unsigned long inactive_ratio;
 
-	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
-	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
+	inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
+	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
 
 	gb = (inactive + active) >> (30 - PAGE_SHIFT);
 	if (gb)
@@ -1117,109 +1148,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
 	unsigned long active;
 	unsigned long inactive;
 
-	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
-	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
+	inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
+	active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
 
 	return (active > inactive);
 }
 
-unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
-						struct zone *zone,
-						enum lru_list lru)
-{
-	int nid = zone_to_nid(zone);
-	int zid = zone_idx(zone);
-	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
-
-	return MEM_CGROUP_ZSTAT(mz, lru);
-}
-
-static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
-
-	return ret;
-}
-
-static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	unsigned long ret;
-
-	ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
-		mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
-	return ret;
-}
-
-#if MAX_NUMNODES > 1
-static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
-{
-	u64 total = 0;
-	int nid;
-
-	for_each_node_state(nid, N_HIGH_MEMORY)
-		total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
-
-	return total;
-}
-
-static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
-{
-	u64 total = 0;
-	int nid;
-
-	for_each_node_state(nid, N_HIGH_MEMORY)
-		total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
-
-	return total;
-}
-
-static unsigned long
-mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
-{
-	return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
-}
-
-static unsigned long
-mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
-{
-	u64 total = 0;
-	int nid;
-
-	for_each_node_state(nid, N_HIGH_MEMORY)
-		total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
-
-	return total;
-}
-
-static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
-							int nid)
-{
-	enum lru_list l;
-	u64 total = 0;
-
-	for_each_lru(l)
-		total += mem_cgroup_get_zonestat_node(memcg, nid, l);
-
-	return total;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
-{
-	u64 total = 0;
-	int nid;
-
-	for_each_node_state(nid, N_HIGH_MEMORY)
-		total += mem_cgroup_node_nr_lru_pages(memcg, nid);
-
-	return total;
-}
-#endif /* CONFIG_NUMA */
-
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
 						      struct zone *zone)
 {
@@ -1329,7 +1263,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
 	return margin >> PAGE_SHIFT;
 }
 
-static unsigned int get_swappiness(struct mem_cgroup *memcg)
+int mem_cgroup_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
 
@@ -1401,10 +1335,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 	to = mc.to;
 	if (!from)
 		goto unlock;
-	if (from == mem || to == mem
-	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
-	    || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
-		ret = true;
+
+	ret = mem_cgroup_same_or_subtree(mem, from)
+		|| mem_cgroup_same_or_subtree(mem, to);
unlock:
 	spin_unlock(&mc.lock);
 	return ret;
@@ -1576,11 +1509,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
 		int nid, bool noswap)
 {
-	if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
+	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
 		return true;
 	if (noswap || !total_swap_pages)
 		return false;
-	if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
+	if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
 		return true;
 	return false;
 
@@ -1730,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (!check_soft && root_mem->memsw_is_minimum)
+	if (!check_soft && !shrink && root_mem->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
@@ -1776,12 +1709,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		/* we use swappiness of local cgroup */
 		if (check_soft) {
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, get_swappiness(victim), zone,
-				&nr_scanned);
+				noswap, zone, &nr_scanned);
 			*total_scanned += nr_scanned;
 		} else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
-				noswap, get_swappiness(victim));
+				noswap);
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -1803,38 +1735,77 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
+ * Has to be called with memcg_oom_lock
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 {
-	int x, lock_count = 0;
-	struct mem_cgroup *iter;
+	struct mem_cgroup *iter, *failed = NULL;
+	bool cond = true;
 
-	for_each_mem_cgroup_tree(iter, mem) {
-		x = atomic_inc_return(&iter->oom_lock);
-		lock_count = max(x, lock_count);
+	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+		if (iter->oom_lock) {
+			/*
+			 * this subtree of our hierarchy is already locked
+			 * so we cannot give a lock.
+			 */
+			failed = iter;
+			cond = false;
+		} else
+			iter->oom_lock = true;
 	}
 
-	if (lock_count == 1)
+	if (!failed)
 		return true;
+
+	/*
+	 * OK, we failed to lock the whole subtree so we have to clean up
+	 * what we set up to the failing subtree
+	 */
+	cond = true;
+	for_each_mem_cgroup_tree_cond(iter, mem, cond) {
+		if (iter == failed) {
+			cond = false;
+			continue;
+		}
+		iter->oom_lock = false;
+	}
 	return false;
 }
 
+/*
+ * Has to be called with memcg_oom_lock
+ */
 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
 	struct mem_cgroup *iter;
 
+	for_each_mem_cgroup_tree(iter, mem)
+		iter->oom_lock = false;
+	return 0;
+}
+
+static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, mem)
+		atomic_inc(&iter->under_oom);
+}
+
+static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *iter;
+
 	/*
 	 * When a new child is created while the hierarchy is under oom,
 	 * mem_cgroup_oom_lock() may not be called. We have to use
 	 * atomic_add_unless() here.
 	 */
 	for_each_mem_cgroup_tree(iter, mem)
-		atomic_add_unless(&iter->oom_lock, -1, 0);
-	return 0;
+		atomic_add_unless(&iter->under_oom, -1, 0);
 }
 
-
-static DEFINE_MUTEX(memcg_oom_mutex);
+static DEFINE_SPINLOCK(memcg_oom_lock);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
 
 struct oom_wait_info {
@@ -1845,25 +1816,20 @@ struct oom_wait_info {
 static int memcg_oom_wake_function(wait_queue_t *wait,
 	unsigned mode, int sync, void *arg)
 {
-	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
+	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
+			  *oom_wait_mem;
 	struct oom_wait_info *oom_wait_info;
 
 	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
+	oom_wait_mem = oom_wait_info->mem;
 
-	if (oom_wait_info->mem == wake_mem)
-		goto wakeup;
-	/* if no hierarchy, no match */
-	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
-		return 0;
 	/*
 	 * Both of oom_wait_info->mem and wake_mem are stable under us.
 	 * Then we can use css_is_ancestor without taking care of RCU.
 	 */
-	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
-	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
+	if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
+	    && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
 		return 0;
-
-wakeup:
 	return autoremove_wake_function(wait, mode, sync, arg);
 }
 
@@ -1875,7 +1841,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-	if (mem && atomic_read(&mem->oom_lock))
+	if (mem && atomic_read(&mem->under_oom))
 		memcg_wakeup_oom(mem);
 }
 
@@ -1893,8 +1859,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 	owait.wait.private = current;
 	INIT_LIST_HEAD(&owait.wait.task_list);
 	need_to_kill = true;
+	mem_cgroup_mark_under_oom(mem);
+
 	/* At first, try to OOM lock hierarchy under mem.*/
-	mutex_lock(&memcg_oom_mutex);
+	spin_lock(&memcg_oom_lock);
 	locked = mem_cgroup_oom_lock(mem);
 	/*
 	 * Even if signal_pending(), we can't quit charge() loop without
@@ -1906,7 +1874,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 		need_to_kill = false;
 	if (locked)
 		mem_cgroup_oom_notify(mem);
-	mutex_unlock(&memcg_oom_mutex);
+	spin_unlock(&memcg_oom_lock);
 
 	if (need_to_kill) {
 		finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1915,10 +1883,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 		schedule();
 		finish_wait(&memcg_oom_waitq, &owait.wait);
 	}
-	mutex_lock(&memcg_oom_mutex);
-	mem_cgroup_oom_unlock(mem);
+	spin_lock(&memcg_oom_lock);
+	if (locked)
+		mem_cgroup_oom_unlock(mem);
 	memcg_wakeup_oom(mem);
-	mutex_unlock(&memcg_oom_mutex);
+	spin_unlock(&memcg_oom_lock);
+
+	mem_cgroup_unmark_under_oom(mem);
 
 	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
 		return false;
@@ -2079,59 +2050,70 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
 }
 
 /*
- * Tries to drain stocked charges in other cpus. This function is asynchronous
- * and just put a work per cpu for draining localy on each cpu. Caller can
- * expects some charges will be back to res_counter later but cannot wait for
- * it.
+ * Drains all per-CPU charge caches for given root_mem resp. subtree
+ * of the hierarchy under it. sync flag says whether we should block
+ * until the work is done.
 */
-static void drain_all_stock_async(struct mem_cgroup *root_mem)
+static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
 {
 	int cpu, curcpu;
-	/*
-	 * If someone calls draining, avoid adding more kworker runs.
-	 */
-	if (!mutex_trylock(&percpu_charge_mutex))
-		return;
+
 	/* Notify other cpus that system-wide "drain" is running */
 	get_online_cpus();
-	/*
-	 * Get a hint for avoiding draining charges on the current cpu,
-	 * which must be exhausted by our charging. It is not required that
-	 * this be a precise check, so we use raw_smp_processor_id() instead of
-	 * getcpu()/putcpu().
-	 */
-	curcpu = raw_smp_processor_id();
+	curcpu = get_cpu();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
 		struct mem_cgroup *mem;
 
-		if (cpu == curcpu)
-			continue;
-
 		mem = stock->cached;
-		if (!mem)
+		if (!mem || !stock->nr_pages)
 			continue;
-		if (mem != root_mem) {
-			if (!root_mem->use_hierarchy)
-				continue;
-			/* check whether "mem" is under tree of "root_mem" */
-			if (!css_is_ancestor(&mem->css, &root_mem->css))
-				continue;
+		if (!mem_cgroup_same_or_subtree(root_mem, mem))
+			continue;
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+			if (cpu == curcpu)
+				drain_local_stock(&stock->work);
+			else
+				schedule_work_on(cpu, &stock->work);
 		}
-		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
-			schedule_work_on(cpu, &stock->work);
 	}
+	put_cpu();
+
+	if (!sync)
+		goto out;
+
+	for_each_online_cpu(cpu) {
+		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+		if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			flush_work(&stock->work);
+	}
+out:
 	put_online_cpus();
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just put a work per cpu for draining localy on each cpu. Caller can
+ * expects some charges will be back to res_counter later but cannot wait for
+ * it.
+ */
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
+{
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
+	 */
+	if (!mutex_trylock(&percpu_charge_mutex))
+		return;
+	drain_all_stock(root_mem, false);
 	mutex_unlock(&percpu_charge_mutex);
-	/* We don't wait for flush_work */
 }
 
 /* This is a synchronous drain interface. */
-static void drain_all_stock_sync(void)
+static void drain_all_stock_sync(struct mem_cgroup *root_mem)
 {
 	/* called when force_empty is called */
 	mutex_lock(&percpu_charge_mutex);
-	schedule_on_each_cpu(drain_local_stock);
+	drain_all_stock(root_mem, true);
 	mutex_unlock(&percpu_charge_mutex);
 }
 
@@ -2784,30 +2766,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		return 0;
 	if (PageCompound(page))
 		return 0;
-	/*
-	 * Corner case handling. This is called from add_to_page_cache()
-	 * in usual. But some FS (shmem) precharges this page before calling it
-	 * and call add_to_page_cache() with GFP_NOWAIT.
-	 *
-	 * For GFP_NOWAIT case, the page may be pre-charged before calling
-	 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
-	 * charge twice. (It works but has to pay a bit larger cost.)
-	 * And when the page is SwapCache, it should take swap information
-	 * into account. This is under lock_page() now.
-	 */
-	if (!(gfp_mask & __GFP_WAIT)) {
-		struct page_cgroup *pc;
-
-		pc = lookup_page_cgroup(page);
-		if (!pc)
-			return 0;
-		lock_page_cgroup(pc);
-		if (PageCgroupUsed(pc)) {
-			unlock_page_cgroup(pc);
-			return 0;
-		}
-		unlock_page_cgroup(pc);
-	}
 
 	if (unlikely(!mm))
 		mm = &init_mm;
@@ -3397,31 +3355,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	cgroup_release_and_wakeup_rmdir(&mem->css);
 }
 
-/*
- * A call to try to shrink memory usage on charge failure at shmem's swapin.
- * Calling hierarchical_reclaim is not enough because we should update
- * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
- * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
- * not from the memcg which this page would be charged to.
- * try_charge_swapin does all of these works properly.
- */
-int mem_cgroup_shmem_charge_fallback(struct page *page,
-			struct mm_struct *mm,
-			gfp_t gfp_mask)
-{
-	struct mem_cgroup *mem;
-	int ret;
-
-	if (mem_cgroup_disabled())
-		return 0;
-
-	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
-	if (!ret)
-		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
-
-	return ret;
-}
-
 #ifdef CONFIG_DEBUG_VM
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {
@@ -3780,7 +3713,7 @@ move_account:
 		goto out;
 	/* This is for making all *used* pages to be on LRU. */
 	lru_add_drain_all();
-	drain_all_stock_sync();
+	drain_all_stock_sync(mem);
 	ret = 0;
 	mem_cgroup_start_move(mem);
 	for_each_node_state(node, N_HIGH_MEMORY) {
@@ -3826,7 +3759,7 @@ try_to_free:
 			goto out;
 		}
 		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
-						false, get_swappiness(mem));
+						false);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -4152,15 +4085,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 	s->stat[MCS_PGMAJFAULT] += val;
 
 	/* per zone stat */
-	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
+	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
 	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
-	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
+	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
 	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
-	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
+	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
 	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
-	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
+	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
 	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
-	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
+	val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
 	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
 }
 
@@ -4182,35 +4115,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
 	struct cgroup *cont = m->private;
 	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
 
-	total_nr = mem_cgroup_nr_lru_pages(mem_cont);
+	total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
 	seq_printf(m, "total=%lu", total_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
+		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
 	seq_putc(m, '\n');
 
-	file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
+	file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
 	seq_printf(m, "file=%lu", file_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
+		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+				LRU_ALL_FILE);
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
 	seq_putc(m, '\n');
 
-	anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
+	anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
 	seq_printf(m, "anon=%lu", anon_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
+		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+				LRU_ALL_ANON);
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
 	seq_putc(m, '\n');
 
-	unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
+	unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
 	seq_printf(m, "unevictable=%lu", unevictable_nr);
 	for_each_node_state(nid, N_HIGH_MEMORY) {
-		node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
-									nid);
+		node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
+				BIT(LRU_UNEVICTABLE));
 		seq_printf(m, " N%d=%lu", nid, node_nr);
 	}
 	seq_putc(m, '\n');
@@ -4288,7 +4223,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 
-	return get_swappiness(memcg);
+	return mem_cgroup_swappiness(memcg);
 }
 
 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
@@ -4578,15 +4513,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 	if (!event)
 		return -ENOMEM;
 
-	mutex_lock(&memcg_oom_mutex);
+	spin_lock(&memcg_oom_lock);
 
 	event->eventfd = eventfd;
 	list_add(&event->list, &memcg->oom_notify);
 
 	/* already in OOM ? */
-	if (atomic_read(&memcg->oom_lock))
+	if (atomic_read(&memcg->under_oom))
 		eventfd_signal(eventfd, 1);
-	mutex_unlock(&memcg_oom_mutex);
+	spin_unlock(&memcg_oom_lock);
 
 	return 0;
 }
@@ -4600,7 +4535,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 
 	BUG_ON(type != _OOM_TYPE);
 
-	mutex_lock(&memcg_oom_mutex);
+	spin_lock(&memcg_oom_lock);
 
 	list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
 		if (ev->eventfd == eventfd) {
@@ -4609,7 +4544,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 		}
 	}
 
-	mutex_unlock(&memcg_oom_mutex);
+	spin_unlock(&memcg_oom_lock);
 }
 
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -4619,7 +4554,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 
 	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
 
-	if (atomic_read(&mem->oom_lock))
+	if (atomic_read(&mem->under_oom))
 		cb->fill(cb, "under_oom", 1);
 	else
 		cb->fill(cb, "under_oom", 0);
@@ -4997,7 +4932,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
-		mem->swappiness = get_swappiness(parent);
+		mem->swappiness = mem_cgroup_swappiness(parent);
 	atomic_set(&mem->refcnt, 1);
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
@@ -5181,15 +5116,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		pgoff = pte_to_pgoff(ptent);
 
 	/* page is moved even if it's not RSS of this task(page-faulted). */
-	if (!mapping_cap_swap_backed(mapping)) { /* normal file */
-		page = find_get_page(mapping, pgoff);
-	} else { /* shmem/tmpfs file. we should take account of swap too. */
-		swp_entry_t ent;
-		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
+	page = find_get_page(mapping, pgoff);
+
+#ifdef CONFIG_SWAP
+	/* shmem/tmpfs may report page out on swap: account for that too. */
+	if (radix_tree_exceptional_entry(page)) {
+		swp_entry_t swap = radix_to_swp_entry(page);
 		if (do_swap_account)
-			entry->val = ent.val;
+			*entry = swap;
+		page = find_get_page(&swapper_space, swap.val);
 	}
-
+#endif
 	return page;
 }
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059c..2b43ba051ac9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -53,6 +53,7 @@
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
+#include <linux/kfifo.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno)
1178 __memory_failure(pfn, trapno, 0); 1179 __memory_failure(pfn, trapno, 0);
1179} 1180}
1180 1181
1182#define MEMORY_FAILURE_FIFO_ORDER 4
1183#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1184
1185struct memory_failure_entry {
1186 unsigned long pfn;
1187 int trapno;
1188 int flags;
1189};
1190
1191struct memory_failure_cpu {
1192 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1193 MEMORY_FAILURE_FIFO_SIZE);
1194 spinlock_t lock;
1195 struct work_struct work;
1196};
1197
1198static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1199
1200/**
1201 * memory_failure_queue - Schedule handling memory failure of a page.
1202 * @pfn: Page Number of the corrupted page
1203 * @trapno: Trap number reported in the signal to user space.
1204 * @flags: Flags for memory failure handling
1205 *
1206 * This function is called by the low level hardware error handler
1207 * when it detects hardware memory corruption of a page. It schedules
1208 * the recovering of error page, including dropping pages, killing
1209 * processes etc.
1210 *
1211 * The function is primarily of use for corruptions that
1212 * happen outside the current execution context (e.g. when
1213 * detected by a background scrubber)
1214 *
1215 * Can run in IRQ context.
1216 */
1217void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1218{
1219 struct memory_failure_cpu *mf_cpu;
1220 unsigned long proc_flags;
1221 struct memory_failure_entry entry = {
1222 .pfn = pfn,
1223 .trapno = trapno,
1224 .flags = flags,
1225 };
1226
1227 mf_cpu = &get_cpu_var(memory_failure_cpu);
1228 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1229 if (kfifo_put(&mf_cpu->fifo, &entry))
1230 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1231 else
 1232 pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
1233 pfn);
1234 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1235 put_cpu_var(memory_failure_cpu);
1236}
1237EXPORT_SYMBOL_GPL(memory_failure_queue);
1238
1239static void memory_failure_work_func(struct work_struct *work)
1240{
1241 struct memory_failure_cpu *mf_cpu;
1242 struct memory_failure_entry entry = { 0, };
1243 unsigned long proc_flags;
1244 int gotten;
1245
1246 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1247 for (;;) {
1248 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1249 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1250 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1251 if (!gotten)
1252 break;
1253 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1254 }
1255}
1256
1257static int __init memory_failure_init(void)
1258{
1259 struct memory_failure_cpu *mf_cpu;
1260 int cpu;
1261
1262 for_each_possible_cpu(cpu) {
1263 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1264 spin_lock_init(&mf_cpu->lock);
1265 INIT_KFIFO(mf_cpu->fifo);
1266 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1267 }
1268
1269 return 0;
1270}
1271core_initcall(memory_failure_init);
1272
1181/** 1273/**
1182 * unpoison_memory - Unpoison a previously poisoned page 1274 * unpoison_memory - Unpoison a previously poisoned page
1183 * @pfn: Page number of the to be unpoisoned page 1275 * @pfn: Page number of the to be unpoisoned page
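memory_failure_queue() above is the IRQ-safe entry point; the recovery itself runs later from the per-CPU work item. A minimal, hypothetical caller could look like the sketch below (the wrapper name and the zero trapno/flags values are assumptions for illustration; only the queueing function from the hunk is assumed):

/* Hypothetical sketch: a hardware error handler, e.g. driven by a
 * background scrubber, defers recovery of a corrupted page. Safe to
 * call from IRQ context; the real work happens in process context. */
static void example_report_corrupted_page(unsigned long pfn)
{
	memory_failure_queue(pfn, 0 /* no trap number */, 0 /* no flags */);
}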
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e7fb9d25c54e..9c51f9f58cac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,7 @@
93 93
94#include <asm/tlbflush.h> 94#include <asm/tlbflush.h>
95#include <asm/uaccess.h> 95#include <asm/uaccess.h>
96#include <linux/random.h>
96 97
97#include "internal.h" 98#include "internal.h"
98 99
@@ -635,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
635 struct vm_area_struct *prev; 636 struct vm_area_struct *prev;
636 struct vm_area_struct *vma; 637 struct vm_area_struct *vma;
637 int err = 0; 638 int err = 0;
638 pgoff_t pgoff;
639 unsigned long vmstart; 639 unsigned long vmstart;
640 unsigned long vmend; 640 unsigned long vmend;
641 641
@@ -648,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
648 vmstart = max(start, vma->vm_start); 648 vmstart = max(start, vma->vm_start);
649 vmend = min(end, vma->vm_end); 649 vmend = min(end, vma->vm_end);
650 650
651 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
652 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 651 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
653 vma->anon_vma, vma->vm_file, pgoff, new_pol); 652 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
653 new_pol);
654 if (prev) { 654 if (prev) {
655 vma = prev; 655 vma = prev;
656 next = vma->vm_next; 656 next = vma->vm_next;
@@ -1411,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1412 1412
1413 if (!err && nmask) { 1413 if (!err && nmask) {
1414 err = copy_from_user(bm, nm, alloc_size); 1414 unsigned long copy_size;
1415 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1416 err = copy_from_user(bm, nm, copy_size);
1415 /* ensure entire bitmap is zeroed */ 1417 /* ensure entire bitmap is zeroed */
1416 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1418 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1417 err |= compat_put_bitmap(nmask, bm, nr_bits); 1419 err |= compat_put_bitmap(nmask, bm, nr_bits);
@@ -1645,6 +1647,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1645 return interleave_nodes(pol); 1647 return interleave_nodes(pol);
1646} 1648}
1647 1649
1650/*
1651 * Return the bit number of a random bit set in the nodemask.
1652 * (returns -1 if nodemask is empty)
1653 */
1654int node_random(const nodemask_t *maskp)
1655{
1656 int w, bit = -1;
1657
1658 w = nodes_weight(*maskp);
1659 if (w)
1660 bit = bitmap_ord_to_pos(maskp->bits,
1661 get_random_int() % w, MAX_NUMNODES);
1662 return bit;
1663}
1664
1648#ifdef CONFIG_HUGETLBFS 1665#ifdef CONFIG_HUGETLBFS
1649/* 1666/*
1650 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) 1667 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
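node_random() above gives interleaving a way to spread allocations when no stable offset is available to hash on. A hedged usage sketch (the wrapper name is hypothetical; node_random() and numa_node_id() are the only kernel symbols assumed):

/* Pick a node from the allowed mask at random, falling back to the
 * local node when the mask is empty (node_random() returns -1). */
static int example_pick_interleave_node(const nodemask_t *allowed)
{
	int nid = node_random(allowed);

	return nid >= 0 ? nid : numa_node_id();
}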
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c76..636a86876ff2 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
69 * file will not get a swp_entry_t in its pte, but rather it is like 69 * file will not get a swp_entry_t in its pte, but rather it is like
70 * any other file mapping (ie. marked !present and faulted in with 70 * any other file mapping (ie. marked !present and faulted in with
71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
72 *
73 * However when tmpfs moves the page from pagecache and into swapcache,
74 * it is still in core, but the find_get_page below won't find it.
75 * No big deal, but make a note of it.
76 */ 72 */
77 page = find_get_page(mapping, pgoff); 73 page = find_get_page(mapping, pgoff);
74#ifdef CONFIG_SWAP
75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val);
79 }
80#endif
78 if (page) { 81 if (page) {
79 present = PageUptodate(page); 82 present = PageUptodate(page);
80 page_cache_release(page); 83 page_cache_release(page);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eafff89b3dd6..626303b52f3c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 303 do_each_thread(g, p) {
304 unsigned int points; 304 unsigned int points;
305 305
306 if (!p->mm) 306 if (p->exit_state)
307 continue; 307 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 308 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 309 continue;
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
319 */ 319 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 320 if (test_tsk_thread_flag(p, TIF_MEMDIE))
321 return ERR_PTR(-1UL); 321 return ERR_PTR(-1UL);
322 if (!p->mm)
323 continue;
322 324
323 if (p->flags & PF_EXITING) { 325 if (p->flags & PF_EXITING) {
324 /* 326 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d8767b381b9c..0e309cd1b5b9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -37,6 +37,16 @@
37#include <trace/events/writeback.h> 37#include <trace/events/writeback.h>
38 38
39/* 39/*
40 * Sleep at most 200ms at a time in balance_dirty_pages().
41 */
42#define MAX_PAUSE max(HZ/5, 1)
43
44/*
45 * Estimate write bandwidth at 200ms intervals.
46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48
49/*
40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
41 * will look to see if it needs to force writeback or throttling. 51 * will look to see if it needs to force writeback or throttling.
42 */ 52 */
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode);
111 121
112/* End of sysctl-exported parameters */ 122/* End of sysctl-exported parameters */
113 123
124unsigned long global_dirty_limit;
114 125
115/* 126/*
116 * Scale the writeback cache size proportional to the relative writeout speeds. 127 * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
219 */ 230 */
220static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 231static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
221{ 232{
233 __inc_bdi_stat(bdi, BDI_WRITTEN);
222 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 234 __prop_inc_percpu_max(&vm_completions, &bdi->completions,
223 bdi->max_prop_frac); 235 bdi->max_prop_frac);
224} 236}
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk)
244static void bdi_writeout_fraction(struct backing_dev_info *bdi, 256static void bdi_writeout_fraction(struct backing_dev_info *bdi,
245 long *numerator, long *denominator) 257 long *numerator, long *denominator)
246{ 258{
247 if (bdi_cap_writeback_dirty(bdi)) { 259 prop_fraction_percpu(&vm_completions, &bdi->completions,
248 prop_fraction_percpu(&vm_completions, &bdi->completions,
249 numerator, denominator); 260 numerator, denominator);
250 } else {
251 *numerator = 0;
252 *denominator = 1;
253 }
254} 261}
255 262
256static inline void task_dirties_fraction(struct task_struct *tsk, 263static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough 281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled. 282 * dirty threshold may never get throttled.
276 */ 283 */
284#define TASK_LIMIT_FRACTION 8
277static unsigned long task_dirty_limit(struct task_struct *tsk, 285static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty) 286 unsigned long bdi_dirty)
279{ 287{
280 long numerator, denominator; 288 long numerator, denominator;
281 unsigned long dirty = bdi_dirty; 289 unsigned long dirty = bdi_dirty;
282 u64 inv = dirty >> 3; 290 u64 inv = dirty / TASK_LIMIT_FRACTION;
283 291
284 task_dirties_fraction(tsk, &numerator, &denominator); 292 task_dirties_fraction(tsk, &numerator, &denominator);
285 inv *= numerator; 293 inv *= numerator;
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
290 return max(dirty, bdi_dirty/2); 298 return max(dirty, bdi_dirty/2);
291} 299}
292 300
301/* Minimum limit for any task */
302static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303{
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305}
306
293/* 307/*
294 * 308 *
295 */ 309 */
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void)
397 return x + 1; /* Ensure that we never return 0 */ 411 return x + 1; /* Ensure that we never return 0 */
398} 412}
399 413
414static unsigned long hard_dirty_limit(unsigned long thresh)
415{
416 return max(thresh, global_dirty_limit);
417}
418
400/* 419/*
401 * global_dirty_limits - background-writeback and dirty-throttling thresholds 420 * global_dirty_limits - background-writeback and dirty-throttling thresholds
402 * 421 *
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
435 } 454 }
436 *pbackground = background; 455 *pbackground = background;
437 *pdirty = dirty; 456 *pdirty = dirty;
457 trace_global_dirty_state(background, dirty);
438} 458}
439 459
440/* 460/**
441 * bdi_dirty_limit - @bdi's share of dirty throttling threshold 461 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
462 * @bdi: the backing_dev_info to query
463 * @dirty: global dirty limit in pages
464 *
465 * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
466 * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
 467 * The "limit" in the name is not taken as a hard limit in
468 * balance_dirty_pages().
442 * 469 *
443 * Allocate high/low dirty limits to fast/slow devices, in order to prevent 470 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
444 * - starving fast devices 471 * - starving fast devices
445 * - piling up dirty pages (that will take long time to sync) on slow devices 472 * - piling up dirty pages (that will take long time to sync) on slow devices
446 * 473 *
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
468 return bdi_dirty; 495 return bdi_dirty;
469} 496}
470 497
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed,
500 unsigned long written)
501{
502 const unsigned long period = roundup_pow_of_two(3 * HZ);
503 unsigned long avg = bdi->avg_write_bandwidth;
504 unsigned long old = bdi->write_bandwidth;
505 u64 bw;
506
507 /*
508 * bw = written * HZ / elapsed
509 *
510 * bw * elapsed + write_bandwidth * (period - elapsed)
511 * write_bandwidth = ---------------------------------------------------
512 * period
513 */
514 bw = written - bdi->written_stamp;
515 bw *= HZ;
516 if (unlikely(elapsed > period)) {
517 do_div(bw, elapsed);
518 avg = bw;
519 goto out;
520 }
521 bw += (u64)bdi->write_bandwidth * (period - elapsed);
522 bw >>= ilog2(period);
523
524 /*
525 * one more level of smoothing, for filtering out sudden spikes
526 */
527 if (avg > old && old >= (unsigned long)bw)
528 avg -= (avg - old) >> 3;
529
530 if (avg < old && old <= (unsigned long)bw)
531 avg += (old - avg) >> 3;
532
533out:
534 bdi->write_bandwidth = bw;
535 bdi->avg_write_bandwidth = avg;
536}
537
538/*
539 * The global dirtyable memory and dirty threshold could be suddenly knocked
540 * down by a large amount (eg. on the startup of KVM in a swapless system).
541 * This may throw the system into deep dirty exceeded state and throttle
542 * heavy/light dirtiers alike. To retain good responsiveness, maintain
543 * global_dirty_limit for tracking slowly down to the knocked down dirty
544 * threshold.
545 */
546static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
547{
548 unsigned long limit = global_dirty_limit;
549
550 /*
551 * Follow up in one step.
552 */
553 if (limit < thresh) {
554 limit = thresh;
555 goto update;
556 }
557
558 /*
559 * Follow down slowly. Use the higher one as the target, because thresh
560 * may drop below dirty. This is exactly the reason to introduce
561 * global_dirty_limit which is guaranteed to lie above the dirty pages.
562 */
563 thresh = max(thresh, dirty);
564 if (limit > thresh) {
565 limit -= (limit - thresh) >> 5;
566 goto update;
567 }
568 return;
569update:
570 global_dirty_limit = limit;
571}
572
573static void global_update_bandwidth(unsigned long thresh,
574 unsigned long dirty,
575 unsigned long now)
576{
577 static DEFINE_SPINLOCK(dirty_lock);
578 static unsigned long update_time;
579
580 /*
 581 * check locklessly first to optimize away locking most of the time
582 */
583 if (time_before(now, update_time + BANDWIDTH_INTERVAL))
584 return;
585
586 spin_lock(&dirty_lock);
587 if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
588 update_dirty_limit(thresh, dirty);
589 update_time = now;
590 }
591 spin_unlock(&dirty_lock);
592}
593
594void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh,
596 unsigned long dirty,
597 unsigned long bdi_thresh,
598 unsigned long bdi_dirty,
599 unsigned long start_time)
600{
601 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp;
603 unsigned long written;
604
605 /*
606 * rate-limit, only update once every 200ms.
607 */
608 if (elapsed < BANDWIDTH_INTERVAL)
609 return;
610
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612
613 /*
614 * Skip quiet periods when disk bandwidth is under-utilized.
615 * (at least 1s idle time between two flusher runs)
616 */
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot;
619
620 if (thresh)
621 global_update_bandwidth(thresh, dirty, now);
622
623 bdi_update_write_bandwidth(bdi, elapsed, written);
624
625snapshot:
626 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now;
628}
629
630static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh,
632 unsigned long dirty,
633 unsigned long bdi_thresh,
634 unsigned long bdi_dirty,
635 unsigned long start_time)
636{
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return;
639 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
641 start_time);
642 spin_unlock(&bdi->wb.list_lock);
643}
644
471/* 645/*
472 * balance_dirty_pages() must be called by processes which are generating dirty 646 * balance_dirty_pages() must be called by processes which are generating dirty
473 * data. It looks at the number of dirty pages in the machine and will force 647 * data. It looks at the number of dirty pages in the machine and will force
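The write-bandwidth estimator added above keeps a per-bdi moving average over a roughly 3-second window: each sample contributes elapsed/period of the new measurement and retains (period - elapsed)/period of the old estimate. A small worked example, written as a standalone sketch with assumed numbers (HZ = 100, so period = roundup_pow_of_two(300) = 512 jiffies; the function merely mirrors the update rule from the hunk, it is not kernel code):

#include <stdio.h>

#define HZ     100UL
#define PERIOD 512UL	/* roundup_pow_of_two(3 * HZ) for HZ = 100 */

/* Mirror of the update rule in the normal path (elapsed <= period):
 *   new_bw = (delta_written * HZ + old_bw * (PERIOD - elapsed)) / PERIOD
 * i.e. the instantaneous rate weighted by elapsed/PERIOD. */
static unsigned long update_bw(unsigned long old_bw,
			       unsigned long delta_written,
			       unsigned long elapsed)
{
	unsigned long long bw = (unsigned long long)delta_written * HZ;

	bw += (unsigned long long)old_bw * (PERIOD - elapsed);
	return (unsigned long)(bw / PERIOD);
}

int main(void)
{
	/* Old estimate 4000 pages/s; 1000 pages written in 20 jiffies,
	 * i.e. an instantaneous rate of 5000 pages/s. */
	unsigned long bw = update_bw(4000, 1000, 20);

	printf("new estimate: %lu pages/s\n", bw);	/* ~4039 pages/s */
	return 0;
}

The heavy smoothing means a single fast or slow 200 ms sample only nudges the estimate, which is what the "filtering out sudden spikes" second stage then refines further.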
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
478static void balance_dirty_pages(struct address_space *mapping, 652static void balance_dirty_pages(struct address_space *mapping,
479 unsigned long write_chunk) 653 unsigned long write_chunk)
480{ 654{
481 long nr_reclaimable, bdi_nr_reclaimable; 655 unsigned long nr_reclaimable, bdi_nr_reclaimable;
482 long nr_writeback, bdi_nr_writeback; 656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty;
483 unsigned long background_thresh; 658 unsigned long background_thresh;
484 unsigned long dirty_thresh; 659 unsigned long dirty_thresh;
485 unsigned long bdi_thresh; 660 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh;
662 unsigned long min_task_bdi_thresh;
486 unsigned long pages_written = 0; 663 unsigned long pages_written = 0;
487 unsigned long pause = 1; 664 unsigned long pause = 1;
488 bool dirty_exceeded = false; 665 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true;
489 struct backing_dev_info *bdi = mapping->backing_dev_info; 667 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies;
490 669
491 for (;;) { 670 for (;;) {
492 struct writeback_control wbc = {
493 .sync_mode = WB_SYNC_NONE,
494 .older_than_this = NULL,
495 .nr_to_write = write_chunk,
496 .range_cyclic = 1,
497 };
498
499 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
500 global_page_state(NR_UNSTABLE_NFS); 672 global_page_state(NR_UNSTABLE_NFS);
501 nr_writeback = global_page_state(NR_WRITEBACK); 673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
502 674
503 global_dirty_limits(&background_thresh, &dirty_thresh); 675 global_dirty_limits(&background_thresh, &dirty_thresh);
504 676
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping,
507 * catch-up. This avoids (excessively) small writeouts 679 * catch-up. This avoids (excessively) small writeouts
508 * when the bdi limits are ramping up. 680 * when the bdi limits are ramping up.
509 */ 681 */
510 if (nr_reclaimable + nr_writeback <= 682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
511 (background_thresh + dirty_thresh) / 2)
512 break; 683 break;
513 684
514 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
515 bdi_thresh = task_dirty_limit(current, bdi_thresh); 686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
516 688
517 /* 689 /*
518 * In order to avoid the stacked BDI deadlock we need 690 * In order to avoid the stacked BDI deadlock we need
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping,
524 * actually dirty; with m+n sitting in the percpu 696 * actually dirty; with m+n sitting in the percpu
525 * deltas. 697 * deltas.
526 */ 698 */
527 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
528 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
529 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 701 bdi_dirty = bdi_nr_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK);
530 } else { 703 } else {
531 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
532 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 705 bdi_dirty = bdi_nr_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK);
533 } 707 }
534 708
535 /* 709 /*
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
538 * bdi or process from holding back light ones; The latter is 712 * bdi or process from holding back light ones; The latter is
539 * the last resort safeguard. 713 * the last resort safeguard.
540 */ 714 */
541 dirty_exceeded = 715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) 716 (nr_dirty > dirty_thresh);
543 || (nr_reclaimable + nr_writeback > dirty_thresh); 717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
718 (nr_dirty <= dirty_thresh);
544 719
545 if (!dirty_exceeded) 720 if (!dirty_exceeded)
546 break; 721 break;
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping,
548 if (!bdi->dirty_exceeded) 723 if (!bdi->dirty_exceeded)
549 bdi->dirty_exceeded = 1; 724 bdi->dirty_exceeded = 1;
550 725
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
727 bdi_thresh, bdi_dirty, start_time);
728
551 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
552 * Unstable writes are a feature of certain networked 730 * Unstable writes are a feature of certain networked
553 * filesystems (i.e. NFS) in which data may have been 731 * filesystems (i.e. NFS) in which data may have been
@@ -557,17 +735,29 @@ static void balance_dirty_pages(struct address_space *mapping,
557 * threshold otherwise wait until the disk writes catch 735 * threshold otherwise wait until the disk writes catch
558 * up. 736 * up.
559 */ 737 */
560 trace_wbc_balance_dirty_start(&wbc, bdi); 738 trace_balance_dirty_start(bdi);
561 if (bdi_nr_reclaimable > bdi_thresh) { 739 if (bdi_nr_reclaimable > task_bdi_thresh) {
562 writeback_inodes_wb(&bdi->wb, &wbc); 740 pages_written += writeback_inodes_wb(&bdi->wb,
563 pages_written += write_chunk - wbc.nr_to_write; 741 write_chunk);
564 trace_wbc_balance_dirty_written(&wbc, bdi); 742 trace_balance_dirty_written(bdi, pages_written);
565 if (pages_written >= write_chunk) 743 if (pages_written >= write_chunk)
566 break; /* We've done our duty */ 744 break; /* We've done our duty */
567 } 745 }
568 trace_wbc_balance_dirty_wait(&wbc, bdi);
569 __set_current_state(TASK_UNINTERRUPTIBLE); 746 __set_current_state(TASK_UNINTERRUPTIBLE);
570 io_schedule_timeout(pause); 747 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
749
750 dirty_thresh = hard_dirty_limit(dirty_thresh);
751 /*
752 * max-pause area. If dirty exceeded but still within this
753 * area, no need to sleep for more than 200ms: (a) 8 pages per
754 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive.
756 */
757 if (nr_dirty < dirty_thresh &&
758 bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
571 761
572 /* 762 /*
573 * Increase the delay for each loop, up to our previous 763 * Increase the delay for each loop, up to our previous
@@ -578,7 +768,8 @@ static void balance_dirty_pages(struct address_space *mapping,
578 pause = HZ / 10; 768 pause = HZ / 10;
579 } 769 }
580 770
581 if (!dirty_exceeded && bdi->dirty_exceeded) 771 /* Clear dirty_exceeded flag only when no task can exceed the limit */
772 if (clear_dirty_exceeded && bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 773 bdi->dirty_exceeded = 0;
583 774
584 if (writeback_in_progress(bdi)) 775 if (writeback_in_progress(bdi))
@@ -626,9 +817,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
626void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 817void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
627 unsigned long nr_pages_dirtied) 818 unsigned long nr_pages_dirtied)
628{ 819{
820 struct backing_dev_info *bdi = mapping->backing_dev_info;
629 unsigned long ratelimit; 821 unsigned long ratelimit;
630 unsigned long *p; 822 unsigned long *p;
631 823
824 if (!bdi_cap_account_dirty(bdi))
825 return;
826
632 ratelimit = ratelimit_pages; 827 ratelimit = ratelimit_pages;
633 if (mapping->backing_dev_info->dirty_exceeded) 828 if (mapping->backing_dev_info->dirty_exceeded)
634 ratelimit = 8; 829 ratelimit = 8;
@@ -892,12 +1087,12 @@ int write_cache_pages(struct address_space *mapping,
892 range_whole = 1; 1087 range_whole = 1;
893 cycled = 1; /* ignore range_cyclic tests */ 1088 cycled = 1; /* ignore range_cyclic tests */
894 } 1089 }
895 if (wbc->sync_mode == WB_SYNC_ALL) 1090 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
896 tag = PAGECACHE_TAG_TOWRITE; 1091 tag = PAGECACHE_TAG_TOWRITE;
897 else 1092 else
898 tag = PAGECACHE_TAG_DIRTY; 1093 tag = PAGECACHE_TAG_DIRTY;
899retry: 1094retry:
900 if (wbc->sync_mode == WB_SYNC_ALL) 1095 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
901 tag_pages_for_writeback(mapping, index, end); 1096 tag_pages_for_writeback(mapping, index, end);
902 done_index = index; 1097 done_index = index;
903 while (!done && (index <= end)) { 1098 while (!done && (index <= end)) {
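The write_cache_pages() change above lets a WB_SYNC_NONE pass opt into the TOWRITE tagging that previously only WB_SYNC_ALL used, so a sync working through a large file is not livelocked by concurrent dirtiers. A hedged sketch of a writeback_control requesting this behaviour (field values are illustrative; the fields themselves are taken from struct writeback_control of this era):

/* Sketch: a non-integrity sync pass that still wants livelock
 * avoidance, so it sets tagged_writepages instead of WB_SYNC_ALL. */
struct writeback_control wbc = {
	.sync_mode		= WB_SYNC_NONE,
	.tagged_writepages	= 1,
	.nr_to_write		= LONG_MAX,
	.range_start		= 0,
	.range_end		= LLONG_MAX,
};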
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 094472377d81..6e8ecb6e021c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1370,21 +1370,12 @@ failed:
1370 1370
1371#ifdef CONFIG_FAIL_PAGE_ALLOC 1371#ifdef CONFIG_FAIL_PAGE_ALLOC
1372 1372
1373static struct fail_page_alloc_attr { 1373static struct {
1374 struct fault_attr attr; 1374 struct fault_attr attr;
1375 1375
1376 u32 ignore_gfp_highmem; 1376 u32 ignore_gfp_highmem;
1377 u32 ignore_gfp_wait; 1377 u32 ignore_gfp_wait;
1378 u32 min_order; 1378 u32 min_order;
1379
1380#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
1381
1382 struct dentry *ignore_gfp_highmem_file;
1383 struct dentry *ignore_gfp_wait_file;
1384 struct dentry *min_order_file;
1385
1386#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
1387
1388} fail_page_alloc = { 1379} fail_page_alloc = {
1389 .attr = FAULT_ATTR_INITIALIZER, 1380 .attr = FAULT_ATTR_INITIALIZER,
1390 .ignore_gfp_wait = 1, 1381 .ignore_gfp_wait = 1,
@@ -1418,36 +1409,27 @@ static int __init fail_page_alloc_debugfs(void)
1418{ 1409{
1419 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1410 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1420 struct dentry *dir; 1411 struct dentry *dir;
1421 int err;
1422
1423 err = init_fault_attr_dentries(&fail_page_alloc.attr,
1424 "fail_page_alloc");
1425 if (err)
1426 return err;
1427 dir = fail_page_alloc.attr.dentries.dir;
1428
1429 fail_page_alloc.ignore_gfp_wait_file =
1430 debugfs_create_bool("ignore-gfp-wait", mode, dir,
1431 &fail_page_alloc.ignore_gfp_wait);
1432
1433 fail_page_alloc.ignore_gfp_highmem_file =
1434 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1435 &fail_page_alloc.ignore_gfp_highmem);
1436 fail_page_alloc.min_order_file =
1437 debugfs_create_u32("min-order", mode, dir,
1438 &fail_page_alloc.min_order);
1439
1440 if (!fail_page_alloc.ignore_gfp_wait_file ||
1441 !fail_page_alloc.ignore_gfp_highmem_file ||
1442 !fail_page_alloc.min_order_file) {
1443 err = -ENOMEM;
1444 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
1445 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
1446 debugfs_remove(fail_page_alloc.min_order_file);
1447 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
1448 }
1449 1412
1450 return err; 1413 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1414 &fail_page_alloc.attr);
1415 if (IS_ERR(dir))
1416 return PTR_ERR(dir);
1417
1418 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1419 &fail_page_alloc.ignore_gfp_wait))
1420 goto fail;
1421 if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
1422 &fail_page_alloc.ignore_gfp_highmem))
1423 goto fail;
1424 if (!debugfs_create_u32("min-order", mode, dir,
1425 &fail_page_alloc.min_order))
1426 goto fail;
1427
1428 return 0;
1429fail:
1430 debugfs_remove_recursive(dir);
1431
1432 return -ENOMEM;
1451} 1433}
1452 1434
1453late_initcall(fail_page_alloc_debugfs); 1435late_initcall(fail_page_alloc_debugfs);
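The conversion above replaces per-file dentry bookkeeping with fault_create_debugfs_attr() plus a single debugfs_remove_recursive() on failure. A hypothetical fault-injection site adopting the same pattern might look like this sketch (the "fail_foo" names are made up; only the helpers already used in the hunk are assumed):

static struct {
	struct fault_attr attr;
	u32 ignore_foo;
} fail_foo = {
	.attr = FAULT_ATTR_INITIALIZER,
};

static int __init fail_foo_debugfs(void)
{
	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_foo", NULL, &fail_foo.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	/* extra knobs hang off the directory created by the helper */
	if (!debugfs_create_bool("ignore-foo", mode, dir,
				 &fail_foo.ignore_foo))
		goto fail;
	return 0;
fail:
	debugfs_remove_recursive(dir);
	return -ENOMEM;
}
late_initcall(fail_foo_debugfs);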
diff --git a/mm/rmap.c b/mm/rmap.c
index 9701574bb67a..8005080fb9e3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -31,11 +31,11 @@
31 * mmlist_lock (in mmput, drain_mmlist and others) 31 * mmlist_lock (in mmput, drain_mmlist and others)
32 * mapping->private_lock (in __set_page_dirty_buffers) 32 * mapping->private_lock (in __set_page_dirty_buffers)
33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty) 33 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
34 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) 34 * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 35 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_wb_list_lock in __sync_single_inode) 38 * within bdi.wb->list_lock in __sync_single_inode)
39 * 39 *
40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) 40 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 41 * ->tasklist_lock
diff --git a/mm/shmem.c b/mm/shmem.c
index 5cc21f8b4cd3..32f6763f16fb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 13 *
@@ -28,7 +29,6 @@
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/module.h> 31#include <linux/module.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h> 32#include <linux/swap.h>
33 33
34static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h>
54#include <linux/splice.h> 56#include <linux/splice.h>
55#include <linux/security.h> 57#include <linux/security.h>
56#include <linux/swapops.h> 58#include <linux/swapops.h>
@@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt;
63#include <linux/magic.h> 65#include <linux/magic.h>
64 66
65#include <asm/uaccess.h> 67#include <asm/uaccess.h>
66#include <asm/div64.h>
67#include <asm/pgtable.h> 68#include <asm/pgtable.h>
68 69
69/*
70 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
71 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
72 *
73 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
74 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
75 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
76 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
77 *
78 * We use / and * instead of shifts in the definitions below, so that the swap
79 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
80 */
81#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
82#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
83
84#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
85#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
86
87#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
88#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
89
90#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
91#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
92 72
93/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
94#define SHMEM_PAGEIN VM_READ
95#define SHMEM_TRUNCATE VM_WRITE
96
97/* Definition to limit shmem_truncate's steps between cond_rescheds */
98#define LATENCY_LIMIT 64
99
100/* Pretend that each entry is of this size in directory's i_size */ 73/* Pretend that each entry is of this size in directory's i_size */
101#define BOGO_DIRENT_SIZE 20 74#define BOGO_DIRENT_SIZE 20
102 75
76/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
77#define SHORT_SYMLINK_LEN 128
78
103struct shmem_xattr { 79struct shmem_xattr {
104 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 80 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
105 char *name; /* xattr name */ 81 char *name; /* xattr name */
@@ -107,7 +83,7 @@ struct shmem_xattr {
107 char value[0]; 83 char value[0];
108}; 84};
109 85
110/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 86/* Flag allocation requirements to shmem_getpage */
111enum sgp_type { 87enum sgp_type {
112 SGP_READ, /* don't exceed i_size, don't allocate page */ 88 SGP_READ, /* don't exceed i_size, don't allocate page */
113 SGP_CACHE, /* don't exceed i_size, may allocate page */ 89 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index,
137 mapping_gfp_mask(inode->i_mapping), fault_type); 113 mapping_gfp_mask(inode->i_mapping), fault_type);
138} 114}
139 115
140static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
141{
142 /*
143 * The above definition of ENTRIES_PER_PAGE, and the use of
144 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
145 * might be reconsidered if it ever diverges from PAGE_SIZE.
146 *
147 * Mobility flags are masked out as swap vectors cannot move
148 */
149 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
150 PAGE_CACHE_SHIFT-PAGE_SHIFT);
151}
152
153static inline void shmem_dir_free(struct page *page)
154{
155 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
156}
157
158static struct page **shmem_dir_map(struct page *page)
159{
160 return (struct page **)kmap_atomic(page, KM_USER0);
161}
162
163static inline void shmem_dir_unmap(struct page **dir)
164{
165 kunmap_atomic(dir, KM_USER0);
166}
167
168static swp_entry_t *shmem_swp_map(struct page *page)
169{
170 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
171}
172
173static inline void shmem_swp_balance_unmap(void)
174{
175 /*
176 * When passing a pointer to an i_direct entry, to code which
177 * also handles indirect entries and so will shmem_swp_unmap,
178 * we must arrange for the preempt count to remain in balance.
179 * What kmap_atomic of a lowmem page does depends on config
180 * and architecture, so pretend to kmap_atomic some lowmem page.
181 */
182 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
183}
184
185static inline void shmem_swp_unmap(swp_entry_t *entry)
186{
187 kunmap_atomic(entry, KM_USER1);
188}
189
190static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 116static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
191{ 117{
192 return sb->s_fs_info; 118 return sb->s_fs_info;
@@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
244static LIST_HEAD(shmem_swaplist); 170static LIST_HEAD(shmem_swaplist);
245static DEFINE_MUTEX(shmem_swaplist_mutex); 171static DEFINE_MUTEX(shmem_swaplist_mutex);
246 172
247static void shmem_free_blocks(struct inode *inode, long pages)
248{
249 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
250 if (sbinfo->max_blocks) {
251 percpu_counter_add(&sbinfo->used_blocks, -pages);
252 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
253 }
254}
255
256static int shmem_reserve_inode(struct super_block *sb) 173static int shmem_reserve_inode(struct super_block *sb)
257{ 174{
258 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
279} 196}
280 197
281/** 198/**
282 * shmem_recalc_inode - recalculate the size of an inode 199 * shmem_recalc_inode - recalculate the block usage of an inode
283 * @inode: inode to recalc 200 * @inode: inode to recalc
284 * 201 *
285 * We have to calculate the free blocks since the mm can drop 202 * We have to calculate the free blocks since the mm can drop
@@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
297 214
298 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 215 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
299 if (freed > 0) { 216 if (freed > 0) {
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218 if (sbinfo->max_blocks)
219 percpu_counter_add(&sbinfo->used_blocks, -freed);
300 info->alloced -= freed; 220 info->alloced -= freed;
221 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
301 shmem_unacct_blocks(info->flags, freed); 222 shmem_unacct_blocks(info->flags, freed);
302 shmem_free_blocks(inode, freed);
303 } 223 }
304} 224}
305 225
306/** 226/*
307 * shmem_swp_entry - find the swap vector position in the info structure 227 * Replace item expected in radix tree by a new item, while holding tree lock.
308 * @info: info structure for the inode
309 * @index: index of the page to find
310 * @page: optional page to add to the structure. Has to be preset to
311 * all zeros
312 *
313 * If there is no space allocated yet it will return NULL when
314 * page is NULL, else it will use the page for the needed block,
315 * setting it to NULL on return to indicate that it has been used.
316 *
317 * The swap vector is organized the following way:
318 *
319 * There are SHMEM_NR_DIRECT entries directly stored in the
320 * shmem_inode_info structure. So small files do not need an addional
321 * allocation.
322 *
323 * For pages with index > SHMEM_NR_DIRECT there is the pointer
324 * i_indirect which points to a page which holds in the first half
325 * doubly indirect blocks, in the second half triple indirect blocks:
326 *
327 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
328 * following layout (for SHMEM_NR_DIRECT == 16):
329 *
330 * i_indirect -> dir --> 16-19
331 * | +-> 20-23
332 * |
333 * +-->dir2 --> 24-27
334 * | +-> 28-31
335 * | +-> 32-35
336 * | +-> 36-39
337 * |
338 * +-->dir3 --> 40-43
339 * +-> 44-47
340 * +-> 48-51
341 * +-> 52-55
342 */ 228 */
343static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) 229static int shmem_radix_tree_replace(struct address_space *mapping,
344{ 230 pgoff_t index, void *expected, void *replacement)
345 unsigned long offset; 231{
346 struct page **dir; 232 void **pslot;
347 struct page *subdir; 233 void *item = NULL;
348 234
349 if (index < SHMEM_NR_DIRECT) { 235 VM_BUG_ON(!expected);
350 shmem_swp_balance_unmap(); 236 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
351 return info->i_direct+index; 237 if (pslot)
352 } 238 item = radix_tree_deref_slot_protected(pslot,
353 if (!info->i_indirect) { 239 &mapping->tree_lock);
354 if (page) { 240 if (item != expected)
355 info->i_indirect = *page; 241 return -ENOENT;
356 *page = NULL; 242 if (replacement)
357 } 243 radix_tree_replace_slot(pslot, replacement);
358 return NULL; /* need another page */ 244 else
359 } 245 radix_tree_delete(&mapping->page_tree, index);
360 246 return 0;
361 index -= SHMEM_NR_DIRECT; 247}
362 offset = index % ENTRIES_PER_PAGE;
363 index /= ENTRIES_PER_PAGE;
364 dir = shmem_dir_map(info->i_indirect);
365
366 if (index >= ENTRIES_PER_PAGE/2) {
367 index -= ENTRIES_PER_PAGE/2;
368 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
369 index %= ENTRIES_PER_PAGE;
370 subdir = *dir;
371 if (!subdir) {
372 if (page) {
373 *dir = *page;
374 *page = NULL;
375 }
376 shmem_dir_unmap(dir);
377 return NULL; /* need another page */
378 }
379 shmem_dir_unmap(dir);
380 dir = shmem_dir_map(subdir);
381 }
382 248
383 dir += index; 249/*
384 subdir = *dir; 250 * Like add_to_page_cache_locked, but error if expected item has gone.
385 if (!subdir) { 251 */
386 if (!page || !(subdir = *page)) { 252static int shmem_add_to_page_cache(struct page *page,
387 shmem_dir_unmap(dir); 253 struct address_space *mapping,
388 return NULL; /* need a page */ 254 pgoff_t index, gfp_t gfp, void *expected)
255{
256 int error = 0;
257
258 VM_BUG_ON(!PageLocked(page));
259 VM_BUG_ON(!PageSwapBacked(page));
260
261 if (!expected)
262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
263 if (!error) {
264 page_cache_get(page);
265 page->mapping = mapping;
266 page->index = index;
267
268 spin_lock_irq(&mapping->tree_lock);
269 if (!expected)
270 error = radix_tree_insert(&mapping->page_tree,
271 index, page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
389 } 284 }
390 *dir = subdir; 285 if (!expected)
391 *page = NULL; 286 radix_tree_preload_end();
392 } 287 }
393 shmem_dir_unmap(dir); 288 if (error)
394 return shmem_swp_map(subdir) + offset; 289 mem_cgroup_uncharge_cache_page(page);
290 return error;
395} 291}
396 292
397static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) 293/*
294 * Like delete_from_page_cache, but substitutes swap for page.
295 */
296static void shmem_delete_from_page_cache(struct page *page, void *radswap)
398{ 297{
399 long incdec = value? 1: -1; 298 struct address_space *mapping = page->mapping;
299 int error;
400 300
401 entry->val = value; 301 spin_lock_irq(&mapping->tree_lock);
402 info->swapped += incdec; 302 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
403 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { 303 page->mapping = NULL;
404 struct page *page = kmap_atomic_to_page(entry); 304 mapping->nrpages--;
405 set_page_private(page, page_private(page) + incdec); 305 __dec_zone_page_state(page, NR_FILE_PAGES);
406 } 306 __dec_zone_page_state(page, NR_SHMEM);
307 spin_unlock_irq(&mapping->tree_lock);
308 page_cache_release(page);
309 BUG_ON(error);
407} 310}
408 311
409/** 312/*
410 * shmem_swp_alloc - get the position of the swap entry for the page. 313 * Like find_get_pages, but collecting swap entries as well as pages.
411 * @info: info structure for the inode
412 * @index: index of the page to find
413 * @sgp: check and recheck i_size? skip allocation?
414 * @gfp: gfp mask to use for any page allocation
415 *
416 * If the entry does not exist, allocate it.
417 */ 314 */
418static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, 315static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
419 unsigned long index, enum sgp_type sgp, gfp_t gfp) 316 pgoff_t start, unsigned int nr_pages,
420{ 317 struct page **pages, pgoff_t *indices)
421 struct inode *inode = &info->vfs_inode; 318{
422 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 319 unsigned int i;
423 struct page *page = NULL; 320 unsigned int ret;
424 swp_entry_t *entry; 321 unsigned int nr_found;
425 322
426 if (sgp != SGP_WRITE && 323 rcu_read_lock();
427 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 324restart:
428 return ERR_PTR(-EINVAL); 325 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
429 326 (void ***)pages, indices, start, nr_pages);
430 while (!(entry = shmem_swp_entry(info, index, &page))) { 327 ret = 0;
431 if (sgp == SGP_READ) 328 for (i = 0; i < nr_found; i++) {
432 return shmem_swp_map(ZERO_PAGE(0)); 329 struct page *page;
433 /* 330repeat:
434 * Test used_blocks against 1 less max_blocks, since we have 1 data 331 page = radix_tree_deref_slot((void **)pages[i]);
435 * page (and perhaps indirect index pages) yet to allocate: 332 if (unlikely(!page))
436 * a waste to allocate index if we cannot allocate data. 333 continue;
437 */ 334 if (radix_tree_exception(page)) {
438 if (sbinfo->max_blocks) { 335 if (radix_tree_deref_retry(page))
439 if (percpu_counter_compare(&sbinfo->used_blocks, 336 goto restart;
440 sbinfo->max_blocks - 1) >= 0) 337 /*
441 return ERR_PTR(-ENOSPC); 338 * Otherwise, we must be storing a swap entry
442 percpu_counter_inc(&sbinfo->used_blocks); 339 * here as an exceptional entry: so return it
443 inode->i_blocks += BLOCKS_PER_PAGE; 340 * without attempting to raise page count.
341 */
342 goto export;
444 } 343 }
344 if (!page_cache_get_speculative(page))
345 goto repeat;
445 346
446 spin_unlock(&info->lock); 347 /* Has the page moved? */
447 page = shmem_dir_alloc(gfp); 348 if (unlikely(page != *((void **)pages[i]))) {
448 spin_lock(&info->lock); 349 page_cache_release(page);
449 350 goto repeat;
450 if (!page) {
451 shmem_free_blocks(inode, 1);
452 return ERR_PTR(-ENOMEM);
453 }
454 if (sgp != SGP_WRITE &&
455 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
456 entry = ERR_PTR(-EINVAL);
457 break;
458 } 351 }
459 if (info->next_index <= index) 352export:
460 info->next_index = index + 1; 353 indices[ret] = indices[i];
461 } 354 pages[ret] = page;
462 if (page) { 355 ret++;
463 /* another task gave its page, or truncated the file */ 356 }
464 shmem_free_blocks(inode, 1); 357 if (unlikely(!ret && nr_found))
465 shmem_dir_free(page); 358 goto restart;
466 } 359 rcu_read_unlock();
467 if (info->next_index <= index && !IS_ERR(entry)) 360 return ret;
468 info->next_index = index + 1;
469 return entry;
470} 361}
471 362
472/** 363/*
473 * shmem_free_swp - free some swap entries in a directory 364 * Remove swap entry from radix tree, free the swap and its page cache.
474 * @dir: pointer to the directory
475 * @edir: pointer after last entry of the directory
476 * @punch_lock: pointer to spinlock when needed for the holepunch case
477 */ 365 */
478static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, 366static int shmem_free_swap(struct address_space *mapping,
479 spinlock_t *punch_lock) 367 pgoff_t index, void *radswap)
480{ 368{
481 spinlock_t *punch_unlock = NULL; 369 int error;
482 swp_entry_t *ptr; 370
483 int freed = 0; 371 spin_lock_irq(&mapping->tree_lock);
484 372 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
485 for (ptr = dir; ptr < edir; ptr++) { 373 spin_unlock_irq(&mapping->tree_lock);
486 if (ptr->val) { 374 if (!error)
487 if (unlikely(punch_lock)) { 375 free_swap_and_cache(radix_to_swp_entry(radswap));
488 punch_unlock = punch_lock; 376 return error;
489 punch_lock = NULL;
490 spin_lock(punch_unlock);
491 if (!ptr->val)
492 continue;
493 }
494 free_swap_and_cache(*ptr);
495 *ptr = (swp_entry_t){0};
496 freed++;
497 }
498 }
499 if (punch_unlock)
500 spin_unlock(punch_unlock);
501 return freed;
502}
503
504static int shmem_map_and_free_swp(struct page *subdir, int offset,
505 int limit, struct page ***dir, spinlock_t *punch_lock)
506{
507 swp_entry_t *ptr;
508 int freed = 0;
509
510 ptr = shmem_swp_map(subdir);
511 for (; offset < limit; offset += LATENCY_LIMIT) {
512 int size = limit - offset;
513 if (size > LATENCY_LIMIT)
514 size = LATENCY_LIMIT;
515 freed += shmem_free_swp(ptr+offset, ptr+offset+size,
516 punch_lock);
517 if (need_resched()) {
518 shmem_swp_unmap(ptr);
519 if (*dir) {
520 shmem_dir_unmap(*dir);
521 *dir = NULL;
522 }
523 cond_resched();
524 ptr = shmem_swp_map(subdir);
525 }
526 }
527 shmem_swp_unmap(ptr);
528 return freed;
529} 377}
530 378
531static void shmem_free_pages(struct list_head *next) 379/*
380 * Pagevec may contain swap entries, so shuffle up pages before releasing.
381 */
382static void shmem_pagevec_release(struct pagevec *pvec)
532{ 383{
533 struct page *page; 384 int i, j;
534 int freed = 0; 385
535 386 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
536 do { 387 struct page *page = pvec->pages[i];
537 page = container_of(next, struct page, lru); 388 if (!radix_tree_exceptional_entry(page))
538 next = next->next; 389 pvec->pages[j++] = page;
539 shmem_dir_free(page); 390 }
540 freed++; 391 pvec->nr = j;
541 if (freed >= LATENCY_LIMIT) { 392 pagevec_release(pvec);
542 cond_resched();
543 freed = 0;
544 }
545 } while (next);
546} 393}
547 394
548void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 395/*
396 * Remove range of pages and swap entries from radix tree, and free them.
397 */
398void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
549{ 399{
400 struct address_space *mapping = inode->i_mapping;
550 struct shmem_inode_info *info = SHMEM_I(inode); 401 struct shmem_inode_info *info = SHMEM_I(inode);
551 unsigned long idx; 402 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
552 unsigned long size; 403 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
553 unsigned long limit; 404 pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
554 unsigned long stage; 405 struct pagevec pvec;
555 unsigned long diroff; 406 pgoff_t indices[PAGEVEC_SIZE];
556 struct page **dir;
557 struct page *topdir;
558 struct page *middir;
559 struct page *subdir;
560 swp_entry_t *ptr;
561 LIST_HEAD(pages_to_free);
562 long nr_pages_to_free = 0;
563 long nr_swaps_freed = 0; 407 long nr_swaps_freed = 0;
564 int offset; 408 pgoff_t index;
565 int freed; 409 int i;
566 int punch_hole;
567 spinlock_t *needs_lock;
568 spinlock_t *punch_lock;
569 unsigned long upper_limit;
570 410
571 truncate_inode_pages_range(inode->i_mapping, start, end); 411 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
572 412
573 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 413 pagevec_init(&pvec, 0);
574 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 414 index = start;
575 if (idx >= info->next_index) 415 while (index <= end) {
576 return; 416 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
418 pvec.pages, indices);
419 if (!pvec.nr)
420 break;
421 mem_cgroup_uncharge_start();
422 for (i = 0; i < pagevec_count(&pvec); i++) {
423 struct page *page = pvec.pages[i];
577 424
578 spin_lock(&info->lock); 425 index = indices[i];
579 info->flags |= SHMEM_TRUNCATE; 426 if (index > end)
580 if (likely(end == (loff_t) -1)) { 427 break;
581 limit = info->next_index; 428
582 upper_limit = SHMEM_MAX_INDEX; 429 if (radix_tree_exceptional_entry(page)) {
583 info->next_index = idx; 430 nr_swaps_freed += !shmem_free_swap(mapping,
584 needs_lock = NULL; 431 index, page);
585 punch_hole = 0; 432 continue;
586 } else { 433 }
587 if (end + 1 >= inode->i_size) { /* we may free a little more */
588 limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
589 PAGE_CACHE_SHIFT;
590 upper_limit = SHMEM_MAX_INDEX;
591 } else {
592 limit = (end + 1) >> PAGE_CACHE_SHIFT;
593 upper_limit = limit;
594 }
595 needs_lock = &info->lock;
596 punch_hole = 1;
597 }
598 434
599 topdir = info->i_indirect; 435 if (!trylock_page(page))
600 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { 436 continue;
601 info->i_indirect = NULL; 437 if (page->mapping == mapping) {
602 nr_pages_to_free++; 438 VM_BUG_ON(PageWriteback(page));
603 list_add(&topdir->lru, &pages_to_free); 439 truncate_inode_page(mapping, page);
440 }
441 unlock_page(page);
442 }
443 shmem_pagevec_release(&pvec);
444 mem_cgroup_uncharge_end();
445 cond_resched();
446 index++;
604 } 447 }
605 spin_unlock(&info->lock);
606 448
607 if (info->swapped && idx < SHMEM_NR_DIRECT) { 449 if (partial) {
608 ptr = info->i_direct; 450 struct page *page = NULL;
609 size = limit; 451 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
610 if (size > SHMEM_NR_DIRECT) 452 if (page) {
611 size = SHMEM_NR_DIRECT; 453 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
612 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); 454 set_page_dirty(page);
455 unlock_page(page);
456 page_cache_release(page);
457 }
613 } 458 }
614 459
615 /* 460 index = start;
616 * If there are no indirect blocks or we are punching a hole 461 for ( ; ; ) {
617 * below indirect blocks, nothing to be done. 462 cond_resched();
618 */ 463 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
619 if (!topdir || limit <= SHMEM_NR_DIRECT) 464 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
620 goto done2; 465 pvec.pages, indices);
466 if (!pvec.nr) {
467 if (index == start)
468 break;
469 index = start;
470 continue;
471 }
472 if (index == start && indices[0] > end) {
473 shmem_pagevec_release(&pvec);
474 break;
475 }
476 mem_cgroup_uncharge_start();
477 for (i = 0; i < pagevec_count(&pvec); i++) {
478 struct page *page = pvec.pages[i];
621 479
622 /* 480 index = indices[i];
623 * The truncation case has already dropped info->lock, and we're safe 481 if (index > end)
624 * because i_size and next_index have already been lowered, preventing 482 break;
625 * access beyond. But in the punch_hole case, we still need to take
626 * the lock when updating the swap directory, because there might be
627 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
628 * shmem_writepage. However, whenever we find we can remove a whole
629 * directory page (not at the misaligned start or end of the range),
630 * we first NULLify its pointer in the level above, and then have no
631 * need to take the lock when updating its contents: needs_lock and
632 * punch_lock (either pointing to info->lock or NULL) manage this.
633 */
634 483
635 upper_limit -= SHMEM_NR_DIRECT; 484 if (radix_tree_exceptional_entry(page)) {
636 limit -= SHMEM_NR_DIRECT; 485 nr_swaps_freed += !shmem_free_swap(mapping,
637 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; 486 index, page);
638 offset = idx % ENTRIES_PER_PAGE; 487 continue;
639 idx -= offset;
640
641 dir = shmem_dir_map(topdir);
642 stage = ENTRIES_PER_PAGEPAGE/2;
643 if (idx < ENTRIES_PER_PAGEPAGE/2) {
644 middir = topdir;
645 diroff = idx/ENTRIES_PER_PAGE;
646 } else {
647 dir += ENTRIES_PER_PAGE/2;
648 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
649 while (stage <= idx)
650 stage += ENTRIES_PER_PAGEPAGE;
651 middir = *dir;
652 if (*dir) {
653 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
654 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
655 if (!diroff && !offset && upper_limit >= stage) {
656 if (needs_lock) {
657 spin_lock(needs_lock);
658 *dir = NULL;
659 spin_unlock(needs_lock);
660 needs_lock = NULL;
661 } else
662 *dir = NULL;
663 nr_pages_to_free++;
664 list_add(&middir->lru, &pages_to_free);
665 } 488 }
666 shmem_dir_unmap(dir);
667 dir = shmem_dir_map(middir);
668 } else {
669 diroff = 0;
670 offset = 0;
671 idx = stage;
672 }
673 }
674 489
675 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { 490 lock_page(page);
676 if (unlikely(idx == stage)) { 491 if (page->mapping == mapping) {
677 shmem_dir_unmap(dir); 492 VM_BUG_ON(PageWriteback(page));
678 dir = shmem_dir_map(topdir) + 493 truncate_inode_page(mapping, page);
679 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
680 while (!*dir) {
681 dir++;
682 idx += ENTRIES_PER_PAGEPAGE;
683 if (idx >= limit)
684 goto done1;
685 }
686 stage = idx + ENTRIES_PER_PAGEPAGE;
687 middir = *dir;
688 if (punch_hole)
689 needs_lock = &info->lock;
690 if (upper_limit >= stage) {
691 if (needs_lock) {
692 spin_lock(needs_lock);
693 *dir = NULL;
694 spin_unlock(needs_lock);
695 needs_lock = NULL;
696 } else
697 *dir = NULL;
698 nr_pages_to_free++;
699 list_add(&middir->lru, &pages_to_free);
700 } 494 }
701 shmem_dir_unmap(dir); 495 unlock_page(page);
702 cond_resched();
703 dir = shmem_dir_map(middir);
704 diroff = 0;
705 }
706 punch_lock = needs_lock;
707 subdir = dir[diroff];
708 if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
709 if (needs_lock) {
710 spin_lock(needs_lock);
711 dir[diroff] = NULL;
712 spin_unlock(needs_lock);
713 punch_lock = NULL;
714 } else
715 dir[diroff] = NULL;
716 nr_pages_to_free++;
717 list_add(&subdir->lru, &pages_to_free);
718 }
719 if (subdir && page_private(subdir) /* has swap entries */) {
720 size = limit - idx;
721 if (size > ENTRIES_PER_PAGE)
722 size = ENTRIES_PER_PAGE;
723 freed = shmem_map_and_free_swp(subdir,
724 offset, size, &dir, punch_lock);
725 if (!dir)
726 dir = shmem_dir_map(middir);
727 nr_swaps_freed += freed;
728 if (offset || punch_lock) {
729 spin_lock(&info->lock);
730 set_page_private(subdir,
731 page_private(subdir) - freed);
732 spin_unlock(&info->lock);
733 } else
734 BUG_ON(page_private(subdir) != freed);
735 } 496 }
736 offset = 0; 497 shmem_pagevec_release(&pvec);
737 } 498 mem_cgroup_uncharge_end();
738done1: 499 index++;
739 shmem_dir_unmap(dir);
740done2:
741 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
742 /*
743 * Call truncate_inode_pages again: racing shmem_unuse_inode
744 * may have swizzled a page in from swap since
745 * truncate_pagecache or generic_delete_inode did it, before we
746 * lowered next_index. Also, though shmem_getpage checks
747 * i_size before adding to cache, no recheck after: so fix the
748 * narrow window there too.
749 */
750 truncate_inode_pages_range(inode->i_mapping, start, end);
751 } 500 }
752 501
753 spin_lock(&info->lock); 502 spin_lock(&info->lock);
754 info->flags &= ~SHMEM_TRUNCATE;
755 info->swapped -= nr_swaps_freed; 503 info->swapped -= nr_swaps_freed;
756 if (nr_pages_to_free)
757 shmem_free_blocks(inode, nr_pages_to_free);
758 shmem_recalc_inode(inode); 504 shmem_recalc_inode(inode);
759 spin_unlock(&info->lock); 505 spin_unlock(&info->lock);
760 506
761 /* 507 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
762 * Empty swap vector directory pages to be freed?
763 */
764 if (!list_empty(&pages_to_free)) {
765 pages_to_free.prev->next = NULL;
766 shmem_free_pages(pages_to_free.next);
767 }
768} 508}
769EXPORT_SYMBOL_GPL(shmem_truncate_range); 509EXPORT_SYMBOL_GPL(shmem_truncate_range);
770 510
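[Editor's sketch] The new-side fragments interleaved above replace the old swap-vector walk with a plain page-cache walk: gather a pagevec of slots, free the swap for exceptional (swap) entries, truncate real pages, then release and advance. Condensed into one readable block below as an illustration only; shmem_lookup_pagevec() is a stand-in name for the lookup helper that fills the pagevec with both pages and exceptional entries (its real name is not visible in this hunk), and the partial-page handling at the edges of the range is omitted.

	struct pagevec pvec;
	pgoff_t indices[PAGEVEC_SIZE];
	pgoff_t index = start;
	long nr_swaps_freed = 0;
	int i;

	pagevec_init(&pvec, 0);
	while (index <= end) {
		/* hypothetical helper: fills pvec.pages[] and indices[] */
		pvec.nr = shmem_lookup_pagevec(mapping, index, end,
					       pvec.pages, indices);
		if (!pvec.nr)
			break;
		mem_cgroup_uncharge_start();
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			index = indices[i];
			if (index > end)
				break;

			if (radix_tree_exceptional_entry(page)) {
				/* the slot held a swap entry, not a page */
				nr_swaps_freed += !shmem_free_swap(mapping,
								index, page);
				continue;
			}

			lock_page(page);
			if (page->mapping == mapping) {
				VM_BUG_ON(PageWriteback(page));
				truncate_inode_page(mapping, page);
			}
			unlock_page(page);
		}
		shmem_pagevec_release(&pvec);
		mem_cgroup_uncharge_end();
		index++;
	}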
@@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
780 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 520 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
781 loff_t oldsize = inode->i_size; 521 loff_t oldsize = inode->i_size;
782 loff_t newsize = attr->ia_size; 522 loff_t newsize = attr->ia_size;
783 struct page *page = NULL;
784 523
785 if (newsize < oldsize) {
786 /*
787 * If truncating down to a partial page, then
788 * if that page is already allocated, hold it
789 * in memory until the truncation is over, so
790 * truncate_partial_page cannot miss it were
791 * it assigned to swap.
792 */
793 if (newsize & (PAGE_CACHE_SIZE-1)) {
794 (void) shmem_getpage(inode,
795 newsize >> PAGE_CACHE_SHIFT,
796 &page, SGP_READ, NULL);
797 if (page)
798 unlock_page(page);
799 }
800 /*
801 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
802 * detect if any pages might have been added to cache
803 * after truncate_inode_pages. But we needn't bother
804 * if it's being fully truncated to zero-length: the
805 * nrpages check is efficient enough in that case.
806 */
807 if (newsize) {
808 struct shmem_inode_info *info = SHMEM_I(inode);
809 spin_lock(&info->lock);
810 info->flags &= ~SHMEM_PAGEIN;
811 spin_unlock(&info->lock);
812 }
813 }
814 if (newsize != oldsize) { 524 if (newsize != oldsize) {
815 i_size_write(inode, newsize); 525 i_size_write(inode, newsize);
816 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 526 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
822 /* unmap again to remove racily COWed private pages */ 532 /* unmap again to remove racily COWed private pages */
823 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 533 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
824 } 534 }
825 if (page)
826 page_cache_release(page);
827 } 535 }
828 536
829 setattr_copy(inode, attr); 537 setattr_copy(inode, attr);
@@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
848 list_del_init(&info->swaplist); 556 list_del_init(&info->swaplist);
849 mutex_unlock(&shmem_swaplist_mutex); 557 mutex_unlock(&shmem_swaplist_mutex);
850 } 558 }
851 } 559 } else
560 kfree(info->symlink);
852 561
853 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 562 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
854 kfree(xattr->name); 563 kfree(xattr->name);
@@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
859 end_writeback(inode); 568 end_writeback(inode);
860} 569}
861 570
862static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 571/*
863{ 572 * If swap found in inode, free it and move page from swapcache to filecache.
864 swp_entry_t *ptr; 573 */
865 574static int shmem_unuse_inode(struct shmem_inode_info *info,
866 for (ptr = dir; ptr < edir; ptr++) { 575 swp_entry_t swap, struct page *page)
867 if (ptr->val == entry.val)
868 return ptr - dir;
869 }
870 return -1;
871}
872
873static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
874{ 576{
875 struct address_space *mapping; 577 struct address_space *mapping = info->vfs_inode.i_mapping;
876 unsigned long idx; 578 void *radswap;
877 unsigned long size; 579 pgoff_t index;
878 unsigned long limit;
879 unsigned long stage;
880 struct page **dir;
881 struct page *subdir;
882 swp_entry_t *ptr;
883 int offset;
884 int error; 580 int error;
885 581
886 idx = 0; 582 radswap = swp_to_radix_entry(swap);
887 ptr = info->i_direct; 583 index = radix_tree_locate_item(&mapping->page_tree, radswap);
888 spin_lock(&info->lock); 584 if (index == -1)
889 if (!info->swapped) { 585 return 0;
890 list_del_init(&info->swaplist);
891 goto lost2;
892 }
893 limit = info->next_index;
894 size = limit;
895 if (size > SHMEM_NR_DIRECT)
896 size = SHMEM_NR_DIRECT;
897 offset = shmem_find_swp(entry, ptr, ptr+size);
898 if (offset >= 0) {
899 shmem_swp_balance_unmap();
900 goto found;
901 }
902 if (!info->i_indirect)
903 goto lost2;
904
905 dir = shmem_dir_map(info->i_indirect);
906 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
907
908 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
909 if (unlikely(idx == stage)) {
910 shmem_dir_unmap(dir-1);
911 if (cond_resched_lock(&info->lock)) {
912 /* check it has not been truncated */
913 if (limit > info->next_index) {
914 limit = info->next_index;
915 if (idx >= limit)
916 goto lost2;
917 }
918 }
919 dir = shmem_dir_map(info->i_indirect) +
920 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
921 while (!*dir) {
922 dir++;
923 idx += ENTRIES_PER_PAGEPAGE;
924 if (idx >= limit)
925 goto lost1;
926 }
927 stage = idx + ENTRIES_PER_PAGEPAGE;
928 subdir = *dir;
929 shmem_dir_unmap(dir);
930 dir = shmem_dir_map(subdir);
931 }
932 subdir = *dir;
933 if (subdir && page_private(subdir)) {
934 ptr = shmem_swp_map(subdir);
935 size = limit - idx;
936 if (size > ENTRIES_PER_PAGE)
937 size = ENTRIES_PER_PAGE;
938 offset = shmem_find_swp(entry, ptr, ptr+size);
939 shmem_swp_unmap(ptr);
940 if (offset >= 0) {
941 shmem_dir_unmap(dir);
942 ptr = shmem_swp_map(subdir);
943 goto found;
944 }
945 }
946 }
947lost1:
948 shmem_dir_unmap(dir-1);
949lost2:
950 spin_unlock(&info->lock);
951 return 0;
952found:
953 idx += offset;
954 ptr += offset;
955 586
956 /* 587 /*
957 * Move _head_ to start search for next from here. 588 * Move _head_ to start search for next from here.
958 * But be careful: shmem_evict_inode checks list_empty without taking 589 * But be careful: shmem_evict_inode checks list_empty without taking
959 * mutex, and there's an instant in list_move_tail when info->swaplist 590 * mutex, and there's an instant in list_move_tail when info->swaplist
960 * would appear empty, if it were the only one on shmem_swaplist. We 591 * would appear empty, if it were the only one on shmem_swaplist.
961 * could avoid doing it if inode NULL; or use this minor optimization.
962 */ 592 */
963 if (shmem_swaplist.next != &info->swaplist) 593 if (shmem_swaplist.next != &info->swaplist)
964 list_move_tail(&shmem_swaplist, &info->swaplist); 594 list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -968,29 +598,34 @@ found:
968 * but also to hold up shmem_evict_inode(): so inode cannot be freed 598 * but also to hold up shmem_evict_inode(): so inode cannot be freed
969 * beneath us (pagelock doesn't help until the page is in pagecache). 599 * beneath us (pagelock doesn't help until the page is in pagecache).
970 */ 600 */
971 mapping = info->vfs_inode.i_mapping; 601 error = shmem_add_to_page_cache(page, mapping, index,
972 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 602 GFP_NOWAIT, radswap);
973 /* which does mem_cgroup_uncharge_cache_page on error */ 603 /* which does mem_cgroup_uncharge_cache_page on error */
974 604
975 if (error != -ENOMEM) { 605 if (error != -ENOMEM) {
606 /*
607 * Truncation and eviction use free_swap_and_cache(), which
608 * only does trylock page: if we raced, best clean up here.
609 */
976 delete_from_swap_cache(page); 610 delete_from_swap_cache(page);
977 set_page_dirty(page); 611 set_page_dirty(page);
978 info->flags |= SHMEM_PAGEIN; 612 if (!error) {
979 shmem_swp_set(info, ptr, 0); 613 spin_lock(&info->lock);
980 swap_free(entry); 614 info->swapped--;
615 spin_unlock(&info->lock);
616 swap_free(swap);
617 }
981 error = 1; /* not an error, but entry was found */ 618 error = 1; /* not an error, but entry was found */
982 } 619 }
983 shmem_swp_unmap(ptr);
984 spin_unlock(&info->lock);
985 return error; 620 return error;
986} 621}
987 622
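[Editor's sketch] shmem_unuse_inode() above leans on the swap entry being stored directly in the page-cache radix tree as an exceptional entry: swp_to_radix_entry() encodes it for insertion, radix_to_swp_entry() decodes it, and radix_tree_locate_item() finds its index again. A minimal sketch of that encoding follows, assuming the radix-tree side of this series reserves low bits of a slot value; RADIX_TREE_EXCEPTIONAL_ENTRY and RADIX_TREE_EXCEPTIONAL_SHIFT are taken on trust from those headers and are not shown in this diff.

	static inline void *swp_to_radix_entry(swp_entry_t entry)
	{
		unsigned long value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;

		/* low bit(s) mark the slot as "not a page pointer" */
		return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
	}

	static inline swp_entry_t radix_to_swp_entry(void *arg)
	{
		swp_entry_t entry;

		entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
		return entry;
	}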
988/* 623/*
989 * shmem_unuse() search for an eventually swapped out shmem page. 624 * Search through swapped inodes to find and replace swap by page.
990 */ 625 */
991int shmem_unuse(swp_entry_t entry, struct page *page) 626int shmem_unuse(swp_entry_t swap, struct page *page)
992{ 627{
993 struct list_head *p, *next; 628 struct list_head *this, *next;
994 struct shmem_inode_info *info; 629 struct shmem_inode_info *info;
995 int found = 0; 630 int found = 0;
996 int error; 631 int error;
@@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
999 * Charge page using GFP_KERNEL while we can wait, before taking 634 * Charge page using GFP_KERNEL while we can wait, before taking
1000 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 635 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1001 * Charged back to the user (not to caller) when swap account is used. 636 * Charged back to the user (not to caller) when swap account is used.
1002 * add_to_page_cache() will be called with GFP_NOWAIT.
1003 */ 637 */
1004 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 638 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1005 if (error) 639 if (error)
1006 goto out; 640 goto out;
1007 /* 641 /* No radix_tree_preload: swap entry keeps a place for page in tree */
1008 * Try to preload while we can wait, to not make a habit of
1009 * draining atomic reserves; but don't latch on to this cpu,
1010 * it's okay if sometimes we get rescheduled after this.
1011 */
1012 error = radix_tree_preload(GFP_KERNEL);
1013 if (error)
1014 goto uncharge;
1015 radix_tree_preload_end();
1016 642
1017 mutex_lock(&shmem_swaplist_mutex); 643 mutex_lock(&shmem_swaplist_mutex);
1018 list_for_each_safe(p, next, &shmem_swaplist) { 644 list_for_each_safe(this, next, &shmem_swaplist) {
1019 info = list_entry(p, struct shmem_inode_info, swaplist); 645 info = list_entry(this, struct shmem_inode_info, swaplist);
1020 found = shmem_unuse_inode(info, entry, page); 646 if (info->swapped)
647 found = shmem_unuse_inode(info, swap, page);
648 else
649 list_del_init(&info->swaplist);
1021 cond_resched(); 650 cond_resched();
1022 if (found) 651 if (found)
1023 break; 652 break;
1024 } 653 }
1025 mutex_unlock(&shmem_swaplist_mutex); 654 mutex_unlock(&shmem_swaplist_mutex);
1026 655
1027uncharge:
1028 if (!found) 656 if (!found)
1029 mem_cgroup_uncharge_cache_page(page); 657 mem_cgroup_uncharge_cache_page(page);
1030 if (found < 0) 658 if (found < 0)
@@ -1041,10 +669,10 @@ out:
1041static int shmem_writepage(struct page *page, struct writeback_control *wbc) 669static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1042{ 670{
1043 struct shmem_inode_info *info; 671 struct shmem_inode_info *info;
1044 swp_entry_t *entry, swap;
1045 struct address_space *mapping; 672 struct address_space *mapping;
1046 unsigned long index;
1047 struct inode *inode; 673 struct inode *inode;
674 swp_entry_t swap;
675 pgoff_t index;
1048 676
1049 BUG_ON(!PageLocked(page)); 677 BUG_ON(!PageLocked(page));
1050 mapping = page->mapping; 678 mapping = page->mapping;
@@ -1073,50 +701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1073 701
1074 /* 702 /*
1075 * Add inode to shmem_unuse()'s list of swapped-out inodes, 703 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1076 * if it's not already there. Do it now because we cannot take 704 * if it's not already there. Do it now before the page is
1077 * mutex while holding spinlock, and must do so before the page 705 * moved to swap cache, when its pagelock no longer protects
1078 * is moved to swap cache, when its pagelock no longer protects
1079 * the inode from eviction. But don't unlock the mutex until 706 * the inode from eviction. But don't unlock the mutex until
1080 * we've taken the spinlock, because shmem_unuse_inode() will 707 * we've incremented swapped, because shmem_unuse_inode() will
1081 * prune a !swapped inode from the swaplist under both locks. 708 * prune a !swapped inode from the swaplist under this mutex.
1082 */ 709 */
1083 mutex_lock(&shmem_swaplist_mutex); 710 mutex_lock(&shmem_swaplist_mutex);
1084 if (list_empty(&info->swaplist)) 711 if (list_empty(&info->swaplist))
1085 list_add_tail(&info->swaplist, &shmem_swaplist); 712 list_add_tail(&info->swaplist, &shmem_swaplist);
1086 713
1087 spin_lock(&info->lock);
1088 mutex_unlock(&shmem_swaplist_mutex);
1089
1090 if (index >= info->next_index) {
1091 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1092 goto unlock;
1093 }
1094 entry = shmem_swp_entry(info, index, NULL);
1095 if (entry->val) {
1096 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1097 free_swap_and_cache(*entry);
1098 shmem_swp_set(info, entry, 0);
1099 }
1100 shmem_recalc_inode(inode);
1101
1102 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 714 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1103 delete_from_page_cache(page);
1104 shmem_swp_set(info, entry, swap.val);
1105 shmem_swp_unmap(entry);
1106 swap_shmem_alloc(swap); 715 swap_shmem_alloc(swap);
716 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
717
718 spin_lock(&info->lock);
719 info->swapped++;
720 shmem_recalc_inode(inode);
1107 spin_unlock(&info->lock); 721 spin_unlock(&info->lock);
722
723 mutex_unlock(&shmem_swaplist_mutex);
1108 BUG_ON(page_mapped(page)); 724 BUG_ON(page_mapped(page));
1109 swap_writepage(page, wbc); 725 swap_writepage(page, wbc);
1110 return 0; 726 return 0;
1111 } 727 }
1112 728
1113 shmem_swp_unmap(entry); 729 mutex_unlock(&shmem_swaplist_mutex);
1114unlock:
1115 spin_unlock(&info->lock);
1116 /*
1117 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1118 * clear SWAP_HAS_CACHE flag.
1119 */
1120 swapcache_free(swap, NULL); 730 swapcache_free(swap, NULL);
1121redirty: 731redirty:
1122 set_page_dirty(page); 732 set_page_dirty(page);
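[Editor's note] Restating the new-side flow of shmem_writepage() above in one block, since the key point is ordering: the inode is published on shmem_swaplist before the page leaves the page cache, and the swaplist mutex is held until info->swapped has been incremented, so shmem_unuse() cannot prune the inode as "!swapped" in the window between the two.

	mutex_lock(&shmem_swaplist_mutex);
	if (list_empty(&info->swaplist))
		list_add_tail(&info->swaplist, &shmem_swaplist);

	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
		swap_shmem_alloc(swap);
		shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));

		spin_lock(&info->lock);
		info->swapped++;
		shmem_recalc_inode(inode);
		spin_unlock(&info->lock);

		mutex_unlock(&shmem_swaplist_mutex);
		BUG_ON(page_mapped(page));
		swap_writepage(page, wbc);
		return 0;
	}

	mutex_unlock(&shmem_swaplist_mutex);
	swapcache_free(swap, NULL);	/* give back the unused swap slot */
	/* fall through to redirty */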
@@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1153} 763}
1154#endif /* CONFIG_TMPFS */ 764#endif /* CONFIG_TMPFS */
1155 765
1156static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 766static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1157 struct shmem_inode_info *info, unsigned long idx) 767 struct shmem_inode_info *info, pgoff_t index)
1158{ 768{
1159 struct mempolicy mpol, *spol; 769 struct mempolicy mpol, *spol;
1160 struct vm_area_struct pvma; 770 struct vm_area_struct pvma;
1161 struct page *page;
1162 771
1163 spol = mpol_cond_copy(&mpol, 772 spol = mpol_cond_copy(&mpol,
1164 mpol_shared_policy_lookup(&info->policy, idx)); 773 mpol_shared_policy_lookup(&info->policy, index));
1165 774
1166 /* Create a pseudo vma that just contains the policy */ 775 /* Create a pseudo vma that just contains the policy */
1167 pvma.vm_start = 0; 776 pvma.vm_start = 0;
1168 pvma.vm_pgoff = idx; 777 pvma.vm_pgoff = index;
1169 pvma.vm_ops = NULL; 778 pvma.vm_ops = NULL;
1170 pvma.vm_policy = spol; 779 pvma.vm_policy = spol;
1171 page = swapin_readahead(entry, gfp, &pvma, 0); 780 return swapin_readahead(swap, gfp, &pvma, 0);
1172 return page;
1173} 781}
1174 782
1175static struct page *shmem_alloc_page(gfp_t gfp, 783static struct page *shmem_alloc_page(gfp_t gfp,
1176 struct shmem_inode_info *info, unsigned long idx) 784 struct shmem_inode_info *info, pgoff_t index)
1177{ 785{
1178 struct vm_area_struct pvma; 786 struct vm_area_struct pvma;
1179 787
1180 /* Create a pseudo vma that just contains the policy */ 788 /* Create a pseudo vma that just contains the policy */
1181 pvma.vm_start = 0; 789 pvma.vm_start = 0;
1182 pvma.vm_pgoff = idx; 790 pvma.vm_pgoff = index;
1183 pvma.vm_ops = NULL; 791 pvma.vm_ops = NULL;
1184 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 792 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1185 793
1186 /* 794 /*
1187 * alloc_page_vma() will drop the shared policy reference 795 * alloc_page_vma() will drop the shared policy reference
@@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1190} 798}
1191#else /* !CONFIG_NUMA */ 799#else /* !CONFIG_NUMA */
1192#ifdef CONFIG_TMPFS 800#ifdef CONFIG_TMPFS
1193static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 801static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1194{ 802{
1195} 803}
1196#endif /* CONFIG_TMPFS */ 804#endif /* CONFIG_TMPFS */
1197 805
1198static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 806static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1199 struct shmem_inode_info *info, unsigned long idx) 807 struct shmem_inode_info *info, pgoff_t index)
1200{ 808{
1201 return swapin_readahead(entry, gfp, NULL, 0); 809 return swapin_readahead(swap, gfp, NULL, 0);
1202} 810}
1203 811
1204static inline struct page *shmem_alloc_page(gfp_t gfp, 812static inline struct page *shmem_alloc_page(gfp_t gfp,
1205 struct shmem_inode_info *info, unsigned long idx) 813 struct shmem_inode_info *info, pgoff_t index)
1206{ 814{
1207 return alloc_page(gfp); 815 return alloc_page(gfp);
1208} 816}
@@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1222 * vm. If we swap it in we mark it dirty since we also free the swap 830 * vm. If we swap it in we mark it dirty since we also free the swap
1223 * entry since a page cannot live in both the swap and page cache 831 * entry since a page cannot live in both the swap and page cache
1224 */ 832 */
1225static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, 833static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1226 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) 834 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1227{ 835{
1228 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
1229 struct shmem_inode_info *info = SHMEM_I(inode); 837 struct shmem_inode_info *info;
1230 struct shmem_sb_info *sbinfo; 838 struct shmem_sb_info *sbinfo;
1231 struct page *page; 839 struct page *page;
1232 struct page *prealloc_page = NULL;
1233 swp_entry_t *entry;
1234 swp_entry_t swap; 840 swp_entry_t swap;
1235 int error; 841 int error;
1236 int ret; 842 int once = 0;
1237 843
1238 if (idx >= SHMEM_MAX_INDEX) 844 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1239 return -EFBIG; 845 return -EFBIG;
1240repeat: 846repeat:
1241 page = find_lock_page(mapping, idx); 847 swap.val = 0;
1242 if (page) { 848 page = find_lock_page(mapping, index);
849 if (radix_tree_exceptional_entry(page)) {
850 swap = radix_to_swp_entry(page);
851 page = NULL;
852 }
853
854 if (sgp != SGP_WRITE &&
855 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
856 error = -EINVAL;
857 goto failed;
858 }
859
860 if (page || (sgp == SGP_READ && !swap.val)) {
1243 /* 861 /*
1244 * Once we can get the page lock, it must be uptodate: 862 * Once we can get the page lock, it must be uptodate:
1245 * if there were an error in reading back from swap, 863 * if there were an error in reading back from swap,
1246 * the page would not be inserted into the filecache. 864 * the page would not be inserted into the filecache.
1247 */ 865 */
1248 BUG_ON(!PageUptodate(page)); 866 BUG_ON(page && !PageUptodate(page));
1249 goto done; 867 *pagep = page;
868 return 0;
1250 } 869 }
1251 870
1252 /* 871 /*
1253 * Try to preload while we can wait, to not make a habit of 872 * Fast cache lookup did not find it:
1254 * draining atomic reserves; but don't latch on to this cpu. 873 * bring it back from swap or allocate.
1255 */ 874 */
1256 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 875 info = SHMEM_I(inode);
1257 if (error) 876 sbinfo = SHMEM_SB(inode->i_sb);
1258 goto out;
1259 radix_tree_preload_end();
1260
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 prealloc_page = shmem_alloc_page(gfp, info, idx);
1263 if (prealloc_page) {
1264 SetPageSwapBacked(prealloc_page);
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1269 }
1270 }
1271 }
1272
1273 spin_lock(&info->lock);
1274 shmem_recalc_inode(inode);
1275 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1276 if (IS_ERR(entry)) {
1277 spin_unlock(&info->lock);
1278 error = PTR_ERR(entry);
1279 goto out;
1280 }
1281 swap = *entry;
1282 877
1283 if (swap.val) { 878 if (swap.val) {
1284 /* Look it up and read it in.. */ 879 /* Look it up and read it in.. */
1285 page = lookup_swap_cache(swap); 880 page = lookup_swap_cache(swap);
1286 if (!page) { 881 if (!page) {
1287 shmem_swp_unmap(entry);
1288 spin_unlock(&info->lock);
1289 /* here we actually do the io */ 882 /* here we actually do the io */
1290 if (fault_type) 883 if (fault_type)
1291 *fault_type |= VM_FAULT_MAJOR; 884 *fault_type |= VM_FAULT_MAJOR;
1292 page = shmem_swapin(swap, gfp, info, idx); 885 page = shmem_swapin(swap, gfp, info, index);
1293 if (!page) { 886 if (!page) {
1294 spin_lock(&info->lock); 887 error = -ENOMEM;
1295 entry = shmem_swp_alloc(info, idx, sgp, gfp); 888 goto failed;
1296 if (IS_ERR(entry))
1297 error = PTR_ERR(entry);
1298 else {
1299 if (entry->val == swap.val)
1300 error = -ENOMEM;
1301 shmem_swp_unmap(entry);
1302 }
1303 spin_unlock(&info->lock);
1304 if (error)
1305 goto out;
1306 goto repeat;
1307 } 889 }
1308 wait_on_page_locked(page);
1309 page_cache_release(page);
1310 goto repeat;
1311 } 890 }
1312 891
1313 /* We have to do this with page locked to prevent races */ 892 /* We have to do this with page locked to prevent races */
1314 if (!trylock_page(page)) { 893 lock_page(page);
1315 shmem_swp_unmap(entry);
1316 spin_unlock(&info->lock);
1317 wait_on_page_locked(page);
1318 page_cache_release(page);
1319 goto repeat;
1320 }
1321 if (PageWriteback(page)) {
1322 shmem_swp_unmap(entry);
1323 spin_unlock(&info->lock);
1324 wait_on_page_writeback(page);
1325 unlock_page(page);
1326 page_cache_release(page);
1327 goto repeat;
1328 }
1329 if (!PageUptodate(page)) { 894 if (!PageUptodate(page)) {
1330 shmem_swp_unmap(entry);
1331 spin_unlock(&info->lock);
1332 unlock_page(page);
1333 page_cache_release(page);
1334 error = -EIO; 895 error = -EIO;
1335 goto out; 896 goto failed;
1336 } 897 }
1337 898 wait_on_page_writeback(page);
1338 error = add_to_page_cache_locked(page, mapping, 899
1339 idx, GFP_NOWAIT); 900 /* Someone may have already done it for us */
1340 if (error) { 901 if (page->mapping) {
1341 shmem_swp_unmap(entry); 902 if (page->mapping == mapping &&
1342 spin_unlock(&info->lock); 903 page->index == index)
1343 if (error == -ENOMEM) { 904 goto done;
1344 /* 905 error = -EEXIST;
1345 * reclaim from proper memory cgroup and 906 goto failed;
1346 * call memcg's OOM if needed.
1347 */
1348 error = mem_cgroup_shmem_charge_fallback(
1349 page, current->mm, gfp);
1350 if (error) {
1351 unlock_page(page);
1352 page_cache_release(page);
1353 goto out;
1354 }
1355 }
1356 unlock_page(page);
1357 page_cache_release(page);
1358 goto repeat;
1359 } 907 }
1360 908
1361 info->flags |= SHMEM_PAGEIN; 909 error = mem_cgroup_cache_charge(page, current->mm,
1362 shmem_swp_set(info, entry, 0); 910 gfp & GFP_RECLAIM_MASK);
1363 shmem_swp_unmap(entry); 911 if (!error)
1364 delete_from_swap_cache(page); 912 error = shmem_add_to_page_cache(page, mapping, index,
913 gfp, swp_to_radix_entry(swap));
914 if (error)
915 goto failed;
916
917 spin_lock(&info->lock);
918 info->swapped--;
919 shmem_recalc_inode(inode);
1365 spin_unlock(&info->lock); 920 spin_unlock(&info->lock);
921
922 delete_from_swap_cache(page);
1366 set_page_dirty(page); 923 set_page_dirty(page);
1367 swap_free(swap); 924 swap_free(swap);
1368 925
1369 } else if (sgp == SGP_READ) { 926 } else {
1370 shmem_swp_unmap(entry); 927 if (shmem_acct_block(info->flags)) {
1371 page = find_get_page(mapping, idx); 928 error = -ENOSPC;
1372 if (page && !trylock_page(page)) { 929 goto failed;
1373 spin_unlock(&info->lock);
1374 wait_on_page_locked(page);
1375 page_cache_release(page);
1376 goto repeat;
1377 } 930 }
1378 spin_unlock(&info->lock);
1379
1380 } else if (prealloc_page) {
1381 shmem_swp_unmap(entry);
1382 sbinfo = SHMEM_SB(inode->i_sb);
1383 if (sbinfo->max_blocks) { 931 if (sbinfo->max_blocks) {
1384 if (percpu_counter_compare(&sbinfo->used_blocks, 932 if (percpu_counter_compare(&sbinfo->used_blocks,
1385 sbinfo->max_blocks) >= 0 || 933 sbinfo->max_blocks) >= 0) {
1386 shmem_acct_block(info->flags)) 934 error = -ENOSPC;
1387 goto nospace; 935 goto unacct;
936 }
1388 percpu_counter_inc(&sbinfo->used_blocks); 937 percpu_counter_inc(&sbinfo->used_blocks);
1389 inode->i_blocks += BLOCKS_PER_PAGE;
1390 } else if (shmem_acct_block(info->flags))
1391 goto nospace;
1392
1393 page = prealloc_page;
1394 prealloc_page = NULL;
1395
1396 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1397 if (IS_ERR(entry))
1398 error = PTR_ERR(entry);
1399 else {
1400 swap = *entry;
1401 shmem_swp_unmap(entry);
1402 } 938 }
1403 ret = error || swap.val; 939
1404 if (ret) 940 page = shmem_alloc_page(gfp, info, index);
1405 mem_cgroup_uncharge_cache_page(page); 941 if (!page) {
1406 else 942 error = -ENOMEM;
1407 ret = add_to_page_cache_lru(page, mapping, 943 goto decused;
1408 idx, GFP_NOWAIT);
1409 /*
1410 * At add_to_page_cache_lru() failure,
1411 * uncharge will be done automatically.
1412 */
1413 if (ret) {
1414 shmem_unacct_blocks(info->flags, 1);
1415 shmem_free_blocks(inode, 1);
1416 spin_unlock(&info->lock);
1417 page_cache_release(page);
1418 if (error)
1419 goto out;
1420 goto repeat;
1421 } 944 }
1422 945
1423 info->flags |= SHMEM_PAGEIN; 946 SetPageSwapBacked(page);
947 __set_page_locked(page);
948 error = mem_cgroup_cache_charge(page, current->mm,
949 gfp & GFP_RECLAIM_MASK);
950 if (!error)
951 error = shmem_add_to_page_cache(page, mapping, index,
952 gfp, NULL);
953 if (error)
954 goto decused;
955 lru_cache_add_anon(page);
956
957 spin_lock(&info->lock);
1424 info->alloced++; 958 info->alloced++;
959 inode->i_blocks += BLOCKS_PER_PAGE;
960 shmem_recalc_inode(inode);
1425 spin_unlock(&info->lock); 961 spin_unlock(&info->lock);
962
1426 clear_highpage(page); 963 clear_highpage(page);
1427 flush_dcache_page(page); 964 flush_dcache_page(page);
1428 SetPageUptodate(page); 965 SetPageUptodate(page);
1429 if (sgp == SGP_DIRTY) 966 if (sgp == SGP_DIRTY)
1430 set_page_dirty(page); 967 set_page_dirty(page);
1431
1432 } else {
1433 spin_unlock(&info->lock);
1434 error = -ENOMEM;
1435 goto out;
1436 } 968 }
1437done: 969done:
1438 *pagep = page; 970 /* Perhaps the file has been truncated since we checked */
1439 error = 0; 971 if (sgp != SGP_WRITE &&
1440out: 972 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1441 if (prealloc_page) { 973 error = -EINVAL;
1442 mem_cgroup_uncharge_cache_page(prealloc_page); 974 goto trunc;
1443 page_cache_release(prealloc_page);
1444 } 975 }
1445 return error; 976 *pagep = page;
977 return 0;
1446 978
1447nospace:
1448 /* 979 /*
1449 * Perhaps the page was brought in from swap between find_lock_page 980 * Error recovery.
1450 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1451 * but must also avoid reporting a spurious ENOSPC while working on a
1452 * full tmpfs.
1453 */ 981 */
1454 page = find_get_page(mapping, idx); 982trunc:
983 ClearPageDirty(page);
984 delete_from_page_cache(page);
985 spin_lock(&info->lock);
986 info->alloced--;
987 inode->i_blocks -= BLOCKS_PER_PAGE;
1455 spin_unlock(&info->lock); 988 spin_unlock(&info->lock);
989decused:
990 if (sbinfo->max_blocks)
991 percpu_counter_add(&sbinfo->used_blocks, -1);
992unacct:
993 shmem_unacct_blocks(info->flags, 1);
994failed:
995 if (swap.val && error != -EINVAL) {
996 struct page *test = find_get_page(mapping, index);
997 if (test && !radix_tree_exceptional_entry(test))
998 page_cache_release(test);
999 /* Have another try if the entry has changed */
1000 if (test != swp_to_radix_entry(swap))
1001 error = -EEXIST;
1002 }
1456 if (page) { 1003 if (page) {
1004 unlock_page(page);
1457 page_cache_release(page); 1005 page_cache_release(page);
1006 }
1007 if (error == -ENOSPC && !once++) {
1008 info = SHMEM_I(inode);
1009 spin_lock(&info->lock);
1010 shmem_recalc_inode(inode);
1011 spin_unlock(&info->lock);
1458 goto repeat; 1012 goto repeat;
1459 } 1013 }
1460 error = -ENOSPC; 1014 if (error == -EEXIST)
1461 goto out; 1015 goto repeat;
1016 return error;
1462} 1017}
1463 1018
1464static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1019static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
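[Editor's note] One subtlety in the error path of shmem_getpage_gfp() above: when a swap entry was in hand and the failure is not -EINVAL, the code peeks at the radix slot again; if the slot no longer holds that same exceptional entry (someone else instantiated the page, or truncation removed the entry), the error is rewritten to -EEXIST so that the whole lookup restarts from "repeat". Condensed from the fragments above:

	failed:
		if (swap.val && error != -EINVAL) {
			struct page *test = find_get_page(mapping, index);

			if (test && !radix_tree_exceptional_entry(test))
				page_cache_release(test);
			/* Have another try if the entry has changed */
			if (test != swp_to_radix_entry(swap))
				error = -EEXIST;
		}
		/* (page unlock/release and the one-shot -ENOSPC retry elided) */
		if (error == -EEXIST)
			goto repeat;
		return error;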
@@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1467 int error; 1022 int error;
1468 int ret = VM_FAULT_LOCKED; 1023 int ret = VM_FAULT_LOCKED;
1469 1024
1470 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1471 return VM_FAULT_SIGBUS;
1472
1473 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1025 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1474 if (error) 1026 if (error)
1475 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1027 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1482} 1034}
1483 1035
1484#ifdef CONFIG_NUMA 1036#ifdef CONFIG_NUMA
1485static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1037static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1486{ 1038{
1487 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1039 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1488 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1040 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1489} 1041}
1490 1042
1491static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1043static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1492 unsigned long addr) 1044 unsigned long addr)
1493{ 1045{
1494 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1046 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1495 unsigned long idx; 1047 pgoff_t index;
1496 1048
1497 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1049 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1498 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 1050 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1499} 1051}
1500#endif 1052#endif
1501 1053
@@ -1593,7 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1593 1145
1594#ifdef CONFIG_TMPFS 1146#ifdef CONFIG_TMPFS
1595static const struct inode_operations shmem_symlink_inode_operations; 1147static const struct inode_operations shmem_symlink_inode_operations;
1596static const struct inode_operations shmem_symlink_inline_operations; 1148static const struct inode_operations shmem_short_symlink_operations;
1597 1149
1598static int 1150static int
1599shmem_write_begin(struct file *file, struct address_space *mapping, 1151shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1626,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1626{ 1178{
1627 struct inode *inode = filp->f_path.dentry->d_inode; 1179 struct inode *inode = filp->f_path.dentry->d_inode;
1628 struct address_space *mapping = inode->i_mapping; 1180 struct address_space *mapping = inode->i_mapping;
1629 unsigned long index, offset; 1181 pgoff_t index;
1182 unsigned long offset;
1630 enum sgp_type sgp = SGP_READ; 1183 enum sgp_type sgp = SGP_READ;
1631 1184
1632 /* 1185 /*
@@ -1642,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1642 1195
1643 for (;;) { 1196 for (;;) {
1644 struct page *page = NULL; 1197 struct page *page = NULL;
1645 unsigned long end_index, nr, ret; 1198 pgoff_t end_index;
1199 unsigned long nr, ret;
1646 loff_t i_size = i_size_read(inode); 1200 loff_t i_size = i_size_read(inode);
1647 1201
1648 end_index = i_size >> PAGE_CACHE_SHIFT; 1202 end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1880,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1880 buf->f_namelen = NAME_MAX; 1434 buf->f_namelen = NAME_MAX;
1881 if (sbinfo->max_blocks) { 1435 if (sbinfo->max_blocks) {
1882 buf->f_blocks = sbinfo->max_blocks; 1436 buf->f_blocks = sbinfo->max_blocks;
1883 buf->f_bavail = buf->f_bfree = 1437 buf->f_bavail =
1884 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1438 buf->f_bfree = sbinfo->max_blocks -
1439 percpu_counter_sum(&sbinfo->used_blocks);
1885 } 1440 }
1886 if (sbinfo->max_inodes) { 1441 if (sbinfo->max_inodes) {
1887 buf->f_files = sbinfo->max_inodes; 1442 buf->f_files = sbinfo->max_inodes;
@@ -2055,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2055 1610
2056 info = SHMEM_I(inode); 1611 info = SHMEM_I(inode);
2057 inode->i_size = len-1; 1612 inode->i_size = len-1;
2058 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1613 if (len <= SHORT_SYMLINK_LEN) {
2059 /* do it inline */ 1614 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2060 memcpy(info->inline_symlink, symname, len); 1615 if (!info->symlink) {
2061 inode->i_op = &shmem_symlink_inline_operations; 1616 iput(inode);
1617 return -ENOMEM;
1618 }
1619 inode->i_op = &shmem_short_symlink_operations;
2062 } else { 1620 } else {
2063 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1621 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2064 if (error) { 1622 if (error) {
@@ -2081,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2081 return 0; 1639 return 0;
2082} 1640}
2083 1641
2084static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1642static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2085{ 1643{
2086 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1644 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2087 return NULL; 1645 return NULL;
2088} 1646}
2089 1647
2090static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1648static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2091{ 1649{
2092 struct page *page = NULL; 1650 struct page *page = NULL;
2093 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1651 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2094 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1652 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2095 if (page) 1653 if (page)
2096 unlock_page(page); 1654 unlock_page(page);
2097 return page; 1655 return page;
@@ -2202,7 +1760,6 @@ out:
2202 return err; 1760 return err;
2203} 1761}
2204 1762
2205
2206static const struct xattr_handler *shmem_xattr_handlers[] = { 1763static const struct xattr_handler *shmem_xattr_handlers[] = {
2207#ifdef CONFIG_TMPFS_POSIX_ACL 1764#ifdef CONFIG_TMPFS_POSIX_ACL
2208 &generic_acl_access_handler, 1765 &generic_acl_access_handler,
@@ -2332,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2332} 1889}
2333#endif /* CONFIG_TMPFS_XATTR */ 1890#endif /* CONFIG_TMPFS_XATTR */
2334 1891
2335static const struct inode_operations shmem_symlink_inline_operations = { 1892static const struct inode_operations shmem_short_symlink_operations = {
2336 .readlink = generic_readlink, 1893 .readlink = generic_readlink,
2337 .follow_link = shmem_follow_link_inline, 1894 .follow_link = shmem_follow_short_symlink,
2338#ifdef CONFIG_TMPFS_XATTR 1895#ifdef CONFIG_TMPFS_XATTR
2339 .setxattr = shmem_setxattr, 1896 .setxattr = shmem_setxattr,
2340 .getxattr = shmem_getxattr, 1897 .getxattr = shmem_getxattr,
@@ -2534,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2534 if (config.max_inodes < inodes) 2091 if (config.max_inodes < inodes)
2535 goto out; 2092 goto out;
2536 /* 2093 /*
2537 * Those tests also disallow limited->unlimited while any are in 2094 * Those tests disallow limited->unlimited while any are in use;
2538 * use, so i_blocks will always be zero when max_blocks is zero;
2539 * but we must separately disallow unlimited->limited, because 2095 * but we must separately disallow unlimited->limited, because
2540 * in that case we have no record of how much is already in use. 2096 * in that case we have no record of how much is already in use.
2541 */ 2097 */
@@ -2627,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2627 goto failed; 2183 goto failed;
2628 sbinfo->free_inodes = sbinfo->max_inodes; 2184 sbinfo->free_inodes = sbinfo->max_inodes;
2629 2185
2630 sb->s_maxbytes = SHMEM_MAX_BYTES; 2186 sb->s_maxbytes = MAX_LFS_FILESIZE;
2631 sb->s_blocksize = PAGE_CACHE_SIZE; 2187 sb->s_blocksize = PAGE_CACHE_SIZE;
2632 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2188 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2633 sb->s_magic = TMPFS_MAGIC; 2189 sb->s_magic = TMPFS_MAGIC;
@@ -2662,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep;
2662 2218
2663static struct inode *shmem_alloc_inode(struct super_block *sb) 2219static struct inode *shmem_alloc_inode(struct super_block *sb)
2664{ 2220{
2665 struct shmem_inode_info *p; 2221 struct shmem_inode_info *info;
2666 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2222 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2667 if (!p) 2223 if (!info)
2668 return NULL; 2224 return NULL;
2669 return &p->vfs_inode; 2225 return &info->vfs_inode;
2670} 2226}
2671 2227
2672static void shmem_i_callback(struct rcu_head *head) 2228static void shmem_destroy_callback(struct rcu_head *head)
2673{ 2229{
2674 struct inode *inode = container_of(head, struct inode, i_rcu); 2230 struct inode *inode = container_of(head, struct inode, i_rcu);
2675 INIT_LIST_HEAD(&inode->i_dentry); 2231 INIT_LIST_HEAD(&inode->i_dentry);
@@ -2678,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head)
2678 2234
2679static void shmem_destroy_inode(struct inode *inode) 2235static void shmem_destroy_inode(struct inode *inode)
2680{ 2236{
2681 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2237 if ((inode->i_mode & S_IFMT) == S_IFREG)
2682 /* only struct inode is valid if it's an inline symlink */
2683 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2238 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2684 } 2239 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2685 call_rcu(&inode->i_rcu, shmem_i_callback);
2686} 2240}
2687 2241
2688static void init_once(void *foo) 2242static void shmem_init_inode(void *foo)
2689{ 2243{
2690 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2244 struct shmem_inode_info *info = foo;
2691 2245 inode_init_once(&info->vfs_inode);
2692 inode_init_once(&p->vfs_inode);
2693} 2246}
2694 2247
2695static int init_inodecache(void) 2248static int shmem_init_inodecache(void)
2696{ 2249{
2697 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2250 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2698 sizeof(struct shmem_inode_info), 2251 sizeof(struct shmem_inode_info),
2699 0, SLAB_PANIC, init_once); 2252 0, SLAB_PANIC, shmem_init_inode);
2700 return 0; 2253 return 0;
2701} 2254}
2702 2255
2703static void destroy_inodecache(void) 2256static void shmem_destroy_inodecache(void)
2704{ 2257{
2705 kmem_cache_destroy(shmem_inode_cachep); 2258 kmem_cache_destroy(shmem_inode_cachep);
2706} 2259}
@@ -2797,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
2797#endif 2350#endif
2798}; 2351};
2799 2352
2800
2801static struct dentry *shmem_mount(struct file_system_type *fs_type, 2353static struct dentry *shmem_mount(struct file_system_type *fs_type,
2802 int flags, const char *dev_name, void *data) 2354 int flags, const char *dev_name, void *data)
2803{ 2355{
2804 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2356 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2805} 2357}
2806 2358
2807static struct file_system_type tmpfs_fs_type = { 2359static struct file_system_type shmem_fs_type = {
2808 .owner = THIS_MODULE, 2360 .owner = THIS_MODULE,
2809 .name = "tmpfs", 2361 .name = "tmpfs",
2810 .mount = shmem_mount, 2362 .mount = shmem_mount,
2811 .kill_sb = kill_litter_super, 2363 .kill_sb = kill_litter_super,
2812}; 2364};
2813 2365
2814int __init init_tmpfs(void) 2366int __init shmem_init(void)
2815{ 2367{
2816 int error; 2368 int error;
2817 2369
@@ -2819,18 +2371,18 @@ int __init init_tmpfs(void)
2819 if (error) 2371 if (error)
2820 goto out4; 2372 goto out4;
2821 2373
2822 error = init_inodecache(); 2374 error = shmem_init_inodecache();
2823 if (error) 2375 if (error)
2824 goto out3; 2376 goto out3;
2825 2377
2826 error = register_filesystem(&tmpfs_fs_type); 2378 error = register_filesystem(&shmem_fs_type);
2827 if (error) { 2379 if (error) {
2828 printk(KERN_ERR "Could not register tmpfs\n"); 2380 printk(KERN_ERR "Could not register tmpfs\n");
2829 goto out2; 2381 goto out2;
2830 } 2382 }
2831 2383
2832 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2384 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2833 tmpfs_fs_type.name, NULL); 2385 shmem_fs_type.name, NULL);
2834 if (IS_ERR(shm_mnt)) { 2386 if (IS_ERR(shm_mnt)) {
2835 error = PTR_ERR(shm_mnt); 2387 error = PTR_ERR(shm_mnt);
2836 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2388 printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2839,9 +2391,9 @@ int __init init_tmpfs(void)
2839 return 0; 2391 return 0;
2840 2392
2841out1: 2393out1:
2842 unregister_filesystem(&tmpfs_fs_type); 2394 unregister_filesystem(&shmem_fs_type);
2843out2: 2395out2:
2844 destroy_inodecache(); 2396 shmem_destroy_inodecache();
2845out3: 2397out3:
2846 bdi_destroy(&shmem_backing_dev_info); 2398 bdi_destroy(&shmem_backing_dev_info);
2847out4: 2399out4:
@@ -2849,45 +2401,6 @@ out4:
2849 return error; 2401 return error;
2850} 2402}
2851 2403
2852#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2853/**
2854 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2855 * @inode: the inode to be searched
2856 * @pgoff: the offset to be searched
2857 * @pagep: the pointer for the found page to be stored
2858 * @ent: the pointer for the found swap entry to be stored
2859 *
2860 * If a page is found, refcount of it is incremented. Callers should handle
2861 * these refcount.
2862 */
2863void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2864 struct page **pagep, swp_entry_t *ent)
2865{
2866 swp_entry_t entry = { .val = 0 }, *ptr;
2867 struct page *page = NULL;
2868 struct shmem_inode_info *info = SHMEM_I(inode);
2869
2870 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2871 goto out;
2872
2873 spin_lock(&info->lock);
2874 ptr = shmem_swp_entry(info, pgoff, NULL);
2875#ifdef CONFIG_SWAP
2876 if (ptr && ptr->val) {
2877 entry.val = ptr->val;
2878 page = find_get_page(&swapper_space, entry.val);
2879 } else
2880#endif
2881 page = find_get_page(inode->i_mapping, pgoff);
2882 if (ptr)
2883 shmem_swp_unmap(ptr);
2884 spin_unlock(&info->lock);
2885out:
2886 *pagep = page;
2887 *ent = entry;
2888}
2889#endif
2890
2891#else /* !CONFIG_SHMEM */ 2404#else /* !CONFIG_SHMEM */
2892 2405
2893/* 2406/*
@@ -2901,23 +2414,23 @@ out:
2901 2414
2902#include <linux/ramfs.h> 2415#include <linux/ramfs.h>
2903 2416
2904static struct file_system_type tmpfs_fs_type = { 2417static struct file_system_type shmem_fs_type = {
2905 .name = "tmpfs", 2418 .name = "tmpfs",
2906 .mount = ramfs_mount, 2419 .mount = ramfs_mount,
2907 .kill_sb = kill_litter_super, 2420 .kill_sb = kill_litter_super,
2908}; 2421};
2909 2422
2910int __init init_tmpfs(void) 2423int __init shmem_init(void)
2911{ 2424{
2912 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2425 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2913 2426
2914 shm_mnt = kern_mount(&tmpfs_fs_type); 2427 shm_mnt = kern_mount(&shmem_fs_type);
2915 BUG_ON(IS_ERR(shm_mnt)); 2428 BUG_ON(IS_ERR(shm_mnt));
2916 2429
2917 return 0; 2430 return 0;
2918} 2431}
2919 2432
2920int shmem_unuse(swp_entry_t entry, struct page *page) 2433int shmem_unuse(swp_entry_t swap, struct page *page)
2921{ 2434{
2922 return 0; 2435 return 0;
2923} 2436}
@@ -2927,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2927 return 0; 2440 return 0;
2928} 2441}
2929 2442
2930void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2443void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2931{ 2444{
2932 truncate_inode_pages_range(inode->i_mapping, start, end); 2445 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2933} 2446}
2934EXPORT_SYMBOL_GPL(shmem_truncate_range); 2447EXPORT_SYMBOL_GPL(shmem_truncate_range);
2935 2448
2936#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2937/**
2938 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2939 * @inode: the inode to be searched
2940 * @pgoff: the offset to be searched
2941 * @pagep: the pointer for the found page to be stored
2942 * @ent: the pointer for the found swap entry to be stored
2943 *
2944 * If a page is found, refcount of it is incremented. Callers should handle
2945 * these refcount.
2946 */
2947void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2948 struct page **pagep, swp_entry_t *ent)
2949{
2950 struct page *page = NULL;
2951
2952 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2953 goto out;
2954 page = find_get_page(inode->i_mapping, pgoff);
2955out:
2956 *pagep = page;
2957 *ent = (swp_entry_t){ .val = 0 };
2958}
2959#endif
2960
2961#define shmem_vm_ops generic_file_vm_ops 2449#define shmem_vm_ops generic_file_vm_ops
2962#define shmem_file_operations ramfs_file_operations 2450#define shmem_file_operations ramfs_file_operations
2963#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2451#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2964#define shmem_acct_size(flags, size) 0 2452#define shmem_acct_size(flags, size) 0
2965#define shmem_unacct_size(flags, size) do {} while (0) 2453#define shmem_unacct_size(flags, size) do {} while (0)
2966#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2967 2454
2968#endif /* CONFIG_SHMEM */ 2455#endif /* CONFIG_SHMEM */
2969 2456
@@ -2987,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2987 if (IS_ERR(shm_mnt)) 2474 if (IS_ERR(shm_mnt))
2988 return (void *)shm_mnt; 2475 return (void *)shm_mnt;
2989 2476
2990 if (size < 0 || size > SHMEM_MAX_BYTES) 2477 if (size < 0 || size > MAX_LFS_FILESIZE)
2991 return ERR_PTR(-EINVAL); 2478 return ERR_PTR(-EINVAL);
2992 2479
2993 if (shmem_acct_size(flags, size)) 2480 if (shmem_acct_size(flags, size))
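[Editor's note] For orientation, the function touched by this last hunk is the one drivers and ipc/shm call to get an unlinked tmpfs file; the hunk only lifts its size ceiling from the old SHMEM_MAX_BYTES to MAX_LFS_FILESIZE. A typical call looks roughly like this (illustrative, not part of the patch; error handling trimmed):

	struct file *filp;

	filp = shmem_file_setup("some-shmem-object", size, 0);
	if (IS_ERR(filp))
		return PTR_ERR(filp);

	/* ... map or read/write through filp->f_mapping ... */

	fput(filp);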
diff --git a/mm/slab.c b/mm/slab.c
index 1e523ed47c61..6d90a091fdca 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -622,6 +622,51 @@ int slab_is_available(void)
622static struct lock_class_key on_slab_l3_key; 622static struct lock_class_key on_slab_l3_key;
623static struct lock_class_key on_slab_alc_key; 623static struct lock_class_key on_slab_alc_key;
624 624
625static struct lock_class_key debugobj_l3_key;
626static struct lock_class_key debugobj_alc_key;
627
628static void slab_set_lock_classes(struct kmem_cache *cachep,
629 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
630 int q)
631{
632 struct array_cache **alc;
633 struct kmem_list3 *l3;
634 int r;
635
636 l3 = cachep->nodelists[q];
637 if (!l3)
638 return;
639
640 lockdep_set_class(&l3->list_lock, l3_key);
641 alc = l3->alien;
642 /*
643 * FIXME: This check for BAD_ALIEN_MAGIC
644 * should go away when common slab code is taught to
645 * work even without alien caches.
646 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 * for alloc_alien_cache,
648 */
649 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 return;
651 for_each_node(r) {
652 if (alc[r])
653 lockdep_set_class(&alc[r]->lock, alc_key);
654 }
655}
656
657static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
658{
659 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
660}
661
662static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
663{
664 int node;
665
666 for_each_online_node(node)
667 slab_set_debugobj_lock_classes_node(cachep, node);
668}
669
625static void init_node_lock_keys(int q) 670static void init_node_lock_keys(int q)
626{ 671{
627 struct cache_sizes *s = malloc_sizes; 672 struct cache_sizes *s = malloc_sizes;
@@ -630,29 +675,14 @@ static void init_node_lock_keys(int q)
630 return; 675 return;
631 676
632 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 677 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
633 struct array_cache **alc;
634 struct kmem_list3 *l3; 678 struct kmem_list3 *l3;
635 int r;
636 679
637 l3 = s->cs_cachep->nodelists[q]; 680 l3 = s->cs_cachep->nodelists[q];
638 if (!l3 || OFF_SLAB(s->cs_cachep)) 681 if (!l3 || OFF_SLAB(s->cs_cachep))
639 continue; 682 continue;
640 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 683
641 alc = l3->alien; 684 slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
642 /* 685 &on_slab_alc_key, q);
643 * FIXME: This check for BAD_ALIEN_MAGIC
644 * should go away when common slab code is taught to
645 * work even without alien caches.
646 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 * for alloc_alien_cache,
648 */
649 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 continue;
651 for_each_node(r) {
652 if (alc[r])
653 lockdep_set_class(&alc[r]->lock,
654 &on_slab_alc_key);
655 }
656 } 686 }
657} 687}
658 688
@@ -671,6 +701,14 @@ static void init_node_lock_keys(int q)
671static inline void init_lock_keys(void) 701static inline void init_lock_keys(void)
672{ 702{
673} 703}
704
705static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
706{
707}
708
709static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
710{
711}
674#endif 712#endif
675 713
676/* 714/*
@@ -1264,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1264 spin_unlock_irq(&l3->list_lock); 1302 spin_unlock_irq(&l3->list_lock);
1265 kfree(shared); 1303 kfree(shared);
1266 free_alien_cache(alien); 1304 free_alien_cache(alien);
1305 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1306 slab_set_debugobj_lock_classes_node(cachep, node);
1267 } 1307 }
1268 init_node_lock_keys(node); 1308 init_node_lock_keys(node);
1269 1309
@@ -1626,6 +1666,9 @@ void __init kmem_cache_init_late(void)
1626{ 1666{
1627 struct kmem_cache *cachep; 1667 struct kmem_cache *cachep;
1628 1668
1669 /* Annotate slab for lockdep -- annotate the malloc caches */
1670 init_lock_keys();
1671
1629 /* 6) resize the head arrays to their final sizes */ 1672 /* 6) resize the head arrays to their final sizes */
1630 mutex_lock(&cache_chain_mutex); 1673 mutex_lock(&cache_chain_mutex);
1631 list_for_each_entry(cachep, &cache_chain, next) 1674 list_for_each_entry(cachep, &cache_chain, next)
@@ -1636,9 +1679,6 @@ void __init kmem_cache_init_late(void)
1636 /* Done! */ 1679 /* Done! */
1637 g_cpucache_up = FULL; 1680 g_cpucache_up = FULL;
1638 1681
1639 /* Annotate slab for lockdep -- annotate the malloc caches */
1640 init_lock_keys();
1641
1642 /* 1682 /*
1643 * Register a cpu startup notifier callback that initializes 1683 * Register a cpu startup notifier callback that initializes
1644 * cpu_cache_get for all new cpus 1684 * cpu_cache_get for all new cpus
@@ -2426,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2426 goto oops; 2466 goto oops;
2427 } 2467 }
2428 2468
2469 if (flags & SLAB_DEBUG_OBJECTS) {
2470 /*
2471 * Would deadlock through slab_destroy()->call_rcu()->
2472 * debug_object_activate()->kmem_cache_alloc().
2473 */
2474 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2475
2476 slab_set_debugobj_lock_classes(cachep);
2477 }
2478
2429 /* cache setup completed, link it into the list */ 2479 /* cache setup completed, link it into the list */
2430 list_add(&cachep->next, &cache_chain); 2480 list_add(&cachep->next, &cache_chain);
2431oops: 2481oops:
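[Editor's note] The hunk above hooks the new lockdep classes in at cache-creation time: a cache created with SLAB_DEBUG_OBJECTS gets its nodelist and alien-cache locks placed in dedicated classes, and combining the flag with SLAB_DESTROY_BY_RCU now warns, since slab_destroy()->call_rcu()->debug_object_activate()->kmem_cache_alloc() would recurse into the allocator. For reference, a cache this path applies to would be set up along these lines (illustrative example, not from the patch):

	struct foo {
		int bar;
	};

	static struct kmem_cache *foo_cachep;

	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_DEBUG_OBJECTS, NULL);
	if (!foo_cachep)
		return -ENOMEM;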
@@ -3403,7 +3453,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3403 cache_alloc_debugcheck_before(cachep, flags); 3453 cache_alloc_debugcheck_before(cachep, flags);
3404 local_irq_save(save_flags); 3454 local_irq_save(save_flags);
3405 3455
3406 if (nodeid == -1) 3456 if (nodeid == NUMA_NO_NODE)
3407 nodeid = slab_node; 3457 nodeid = slab_node;
3408 3458
3409 if (unlikely(!cachep->nodelists[nodeid])) { 3459 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3934,7 +3984,7 @@ fail:
3934 3984
3935struct ccupdate_struct { 3985struct ccupdate_struct {
3936 struct kmem_cache *cachep; 3986 struct kmem_cache *cachep;
3937 struct array_cache *new[NR_CPUS]; 3987 struct array_cache *new[0];
3938}; 3988};
3939 3989
3940static void do_ccupdate_local(void *info) 3990static void do_ccupdate_local(void *info)
@@ -3956,7 +4006,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3956 struct ccupdate_struct *new; 4006 struct ccupdate_struct *new;
3957 int i; 4007 int i;
3958 4008
3959 new = kzalloc(sizeof(*new), gfp); 4009 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
4010 gfp);
3960 if (!new) 4011 if (!new)
3961 return -ENOMEM; 4012 return -ENOMEM;
3962 4013
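[Editor's note] The ccupdate_struct change above replaces a fixed new[NR_CPUS] array with a zero-length array sized at runtime by nr_cpu_ids, so the allocation shrinks on systems with few possible CPUs. The idiom, shown here in standalone C for illustration (the kernel hunk uses the older GNU new[0] form and kzalloc(); this sketch uses a C99 flexible array member and calloc() to stay self-contained):

	#include <stdlib.h>

	struct ccupdate_like {
		void *owner;
		void *new[];		/* flexible array member */
	};

	static struct ccupdate_like *alloc_ccupdate(int nr_ids)
	{
		/* one allocation: header plus nr_ids pointer slots */
		return calloc(1, sizeof(struct ccupdate_like) +
				 nr_ids * sizeof(void *));
	}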
diff --git a/mm/slob.c b/mm/slob.c
index 0ae881831ae2..bf3918187165 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -70,7 +70,7 @@
70 70
71#include <trace/events/kmem.h> 71#include <trace/events/kmem.h>
72 72
73#include <asm/atomic.h> 73#include <linux/atomic.h>
74 74
75/* 75/*
76 * slob_block has a field 'units', which indicates size of block if +ve, 76 * slob_block has a field 'units', which indicates size of block if +ve,
diff --git a/mm/slub.c b/mm/slub.c
index f8f5e8efeb88..9f662d70eb47 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
2 * SLUB: A slab allocator that limits cache line use instead of queuing 2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists. 3 * objects in per cpu and per node lists.
4 * 4 *
5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks or atomic operations
6 * uses a centralized lock to manage a pool of partial slabs. 6 * and only uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter 8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -33,15 +34,27 @@
33 34
34/* 35/*
35 * Lock order: 36 * Lock order:
36 * 1. slab_lock(page) 37 * 1. slub_lock (Global Semaphore)
37 * 2. slab->list_lock 38 * 2. node->list_lock
39 * 3. slab_lock(page) (Only on some arches and for debugging)
38 * 40 *
39 * The slab_lock protects operations on the object of a particular 41 * slub_lock
40 * slab and its metadata in the page struct. If the slab lock 42 *
41 * has been taken then no allocations nor frees can be performed 43 * The role of the slub_lock is to protect the list of all the slabs
42 * on the objects in the slab nor can the slab be added or removed 44 * and to synchronize major metadata changes to slab cache structures.
43 * from the partial or full lists since this would mean modifying 45 *
44 * the page_struct of the slab. 46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning
49 * A. page->freelist -> List of object free in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
52 *
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
45 * 58 *
46 * The list_lock protects the partial and full list on each node and 59 * The list_lock protects the partial and full list on each node and
47 * the partial slab counter. If taken then no new slabs may be added or 60 * the partial slab counter. If taken then no new slabs may be added or
@@ -54,20 +67,6 @@
54 * slabs, operations can continue without any centralized lock. F.e. 67 * slabs, operations can continue without any centralized lock. F.e.
55 * allocating a long series of objects that fill up slabs does not require 68 * allocating a long series of objects that fill up slabs does not require
56 * the list lock. 69 * the list lock.
57 *
58 * The lock order is sometimes inverted when we are trying to get a slab
59 * off a list. We take the list_lock and then look for a page on the list
60 * to use. While we do that objects in the slabs may be freed. We can
61 * only operate on the slab if we have also taken the slab_lock. So we use
62 * a slab_trylock() on the slab. If trylock was successful then no frees
63 * can occur anymore and we can use the slab for allocations etc. If the
64 * slab_trylock() does not succeed then frees are in progress in the slab and
65 * we must stay away from it for a while since we may cause a bouncing
66 * cacheline if we try to acquire the lock. So go onto the next slab.
67 * If all pages are busy then we may allocate a new slab instead of reusing
68 * a partial slab. A new slab has no one operating on it and thus there is
69 * no danger of cacheline contention.
70 *
71 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
72 * make the slab allocator safe to use in the context of an irq. In addition 71 * make the slab allocator safe to use in the context of an irq. In addition
73 * interrupts are disabled to ensure that the processor does not change 72 * interrupts are disabled to ensure that the processor does not change
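
The comment above describes the core of the locking rework: page->freelist and page->counters (which packs inuse, objects and the frozen bit) form one double word that is always updated together, either with a hardware cmpxchg_double or, as a fallback, under the page's bit spinlock. A standalone sketch of that fallback protocol, with hypothetical names (slab_model, cmpxchg_double_model) and a pthread mutex standing in for slab_lock(); this is an illustration of the idea, not the kernel code:

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

struct slab_model {
	void *freelist;			/* first free object in the page */
	unsigned long counters;		/* packed inuse/objects/frozen */
	pthread_mutex_t lock;		/* stands in for slab_lock(page) */
};

/* Succeeds only if *both* words still hold the expected old values. */
static bool cmpxchg_double_model(struct slab_model *s,
				 void *old_free, unsigned long old_cnt,
				 void *new_free, unsigned long new_cnt)
{
	bool ok = false;

	pthread_mutex_lock(&s->lock);
	if (s->freelist == old_free && s->counters == old_cnt) {
		s->freelist = new_free;
		s->counters = new_cnt;
		ok = true;
	}
	pthread_mutex_unlock(&s->lock);
	return ok;
}

int main(void)
{
	static struct slab_model s = { NULL, 0, PTHREAD_MUTEX_INITIALIZER };
	int obj;

	/* Publish an object and bump the counters in one atomic step. */
	if (cmpxchg_double_model(&s, NULL, 0, &obj, 1))
		printf("update applied: counters=%lu\n", s.counters);
	/* A second update based on stale values is rejected and must retry. */
	if (!cmpxchg_double_model(&s, NULL, 0, &obj, 2))
		printf("stale update rejected\n");
	return 0;
}

Because either word changing makes the whole exchange fail, a CPU can never half-apply an update to the freelist without the matching counters change.
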
@@ -132,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
132/* Enable to test recovery from slab corruption on boot */ 131/* Enable to test recovery from slab corruption on boot */
133#undef SLUB_RESILIENCY_TEST 132#undef SLUB_RESILIENCY_TEST
134 133
134/* Enable to log cmpxchg failures */
135#undef SLUB_DEBUG_CMPXCHG
136
135/* 137/*
136 * Mininum number of partial slabs. These will be left on the partial 138 * Mininum number of partial slabs. These will be left on the partial
137 * lists even if they are empty. kmem_cache_shrink may reclaim them. 139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -167,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
167 169
168#define OO_SHIFT 16 170#define OO_SHIFT 16
169#define OO_MASK ((1 << OO_SHIFT) - 1) 171#define OO_MASK ((1 << OO_SHIFT) - 1)
170#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 172#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
171 173
172/* Internal SLUB flags */ 174/* Internal SLUB flags */
173#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
174 177
175static int kmem_size = sizeof(struct kmem_cache); 178static int kmem_size = sizeof(struct kmem_cache);
176 179
@@ -343,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
343 return x.x & OO_MASK; 346 return x.x & OO_MASK;
344} 347}
345 348
349/*
350 * Per slab locking using the pagelock
351 */
352static __always_inline void slab_lock(struct page *page)
353{
354 bit_spin_lock(PG_locked, &page->flags);
355}
356
357static __always_inline void slab_unlock(struct page *page)
358{
359 __bit_spin_unlock(PG_locked, &page->flags);
360}
361
362/* Interrupts must be disabled (for the fallback code to work right) */
363static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
366 const char *n)
367{
368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
374 return 1;
375 } else
376#endif
377 {
378 slab_lock(page);
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
382 slab_unlock(page);
383 return 1;
384 }
385 slab_unlock(page);
386 }
387
388 cpu_relax();
389 stat(s, CMPXCHG_DOUBLE_FAIL);
390
391#ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
393#endif
394
395 return 0;
396}
397
398static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
401 const char *n)
402{
403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
408 return 1;
409 } else
410#endif
411 {
412 unsigned long flags;
413
414 local_irq_save(flags);
415 slab_lock(page);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
419 slab_unlock(page);
420 local_irq_restore(flags);
421 return 1;
422 }
423 slab_unlock(page);
424 local_irq_restore(flags);
425 }
426
427 cpu_relax();
428 stat(s, CMPXCHG_DOUBLE_FAIL);
429
430#ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
432#endif
433
434 return 0;
435}
436
346#ifdef CONFIG_SLUB_DEBUG 437#ifdef CONFIG_SLUB_DEBUG
347/* 438/*
348 * Determine a map of object in use on a page. 439 * Determine a map of object in use on a page.
349 * 440 *
350 * Slab lock or node listlock must be held to guarantee that the page does 441 * Node listlock must be held to guarantee that the page does
351 * not vanish from under us. 442 * not vanish from under us.
352 */ 443 */
353static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
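
The callers added later in this patch (acquire_slab(), deactivate_slab(), __slab_free()) all use these helpers in the same way: snapshot freelist and counters, compute the desired new pair, and redo the whole read-modify-write if the compare-exchange reports that another CPU changed the page meanwhile. A minimal sketch of that retry shape, using one C11 atomic word as a stand-in for the (freelist, counters) pair; the names are invented:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long counters;	/* stand-in for page->counters */

static void freeze_page(void)
{
	unsigned long old, new;

	do {
		old = atomic_load(&counters);	/* snapshot current state */
		new = old | 1UL;		/* e.g. set a "frozen" bit */
	} while (!atomic_compare_exchange_weak(&counters, &old, new));
}

int main(void)
{
	freeze_page();
	printf("counters now %lu\n", atomic_load(&counters));
	return 0;
}
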
@@ -610,7 +701,7 @@ static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
610 return check_bytes8(start, value, bytes); 701 return check_bytes8(start, value, bytes);
611 702
612 value64 = value | value << 8 | value << 16 | value << 24; 703 value64 = value | value << 8 | value << 16 | value << 24;
613 value64 = value64 | value64 << 32; 704 value64 = (value64 & 0xffffffff) | value64 << 32;
614 prefix = 8 - ((unsigned long)start) % 8; 705 prefix = 8 - ((unsigned long)start) % 8;
615 706
616 if (prefix) { 707 if (prefix) {
@@ -838,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
838static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 929static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
839{ 930{
840 int nr = 0; 931 int nr = 0;
841 void *fp = page->freelist; 932 void *fp;
842 void *object = NULL; 933 void *object = NULL;
843 unsigned long max_objects; 934 unsigned long max_objects;
844 935
936 fp = page->freelist;
845 while (fp && nr <= page->objects) { 937 while (fp && nr <= page->objects) {
846 if (fp == search) 938 if (fp == search)
847 return 1; 939 return 1;
@@ -946,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
946 1038
947/* 1039/*
948 * Tracking of fully allocated slabs for debugging purposes. 1040 * Tracking of fully allocated slabs for debugging purposes.
1041 *
1042 * list_lock must be held.
949 */ 1043 */
950static void add_full(struct kmem_cache_node *n, struct page *page) 1044static void add_full(struct kmem_cache *s,
1045 struct kmem_cache_node *n, struct page *page)
951{ 1046{
952 spin_lock(&n->list_lock); 1047 if (!(s->flags & SLAB_STORE_USER))
1048 return;
1049
953 list_add(&page->lru, &n->full); 1050 list_add(&page->lru, &n->full);
954 spin_unlock(&n->list_lock);
955} 1051}
956 1052
1053/*
1054 * list_lock must be held.
1055 */
957static void remove_full(struct kmem_cache *s, struct page *page) 1056static void remove_full(struct kmem_cache *s, struct page *page)
958{ 1057{
959 struct kmem_cache_node *n;
960
961 if (!(s->flags & SLAB_STORE_USER)) 1058 if (!(s->flags & SLAB_STORE_USER))
962 return; 1059 return;
963 1060
964 n = get_node(s, page_to_nid(page));
965
966 spin_lock(&n->list_lock);
967 list_del(&page->lru); 1061 list_del(&page->lru);
968 spin_unlock(&n->list_lock);
969} 1062}
970 1063
971/* Tracking of the number of slabs for debugging purposes */ 1064/* Tracking of the number of slabs for debugging purposes */
@@ -1021,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
1021 if (!check_slab(s, page)) 1114 if (!check_slab(s, page))
1022 goto bad; 1115 goto bad;
1023 1116
1024 if (!on_freelist(s, page, object)) {
1025 object_err(s, page, object, "Object already allocated");
1026 goto bad;
1027 }
1028
1029 if (!check_valid_pointer(s, page, object)) { 1117 if (!check_valid_pointer(s, page, object)) {
1030 object_err(s, page, object, "Freelist Pointer check fails"); 1118 object_err(s, page, object, "Freelist Pointer check fails");
1031 goto bad; 1119 goto bad;
@@ -1058,6 +1146,12 @@ bad:
1058static noinline int free_debug_processing(struct kmem_cache *s, 1146static noinline int free_debug_processing(struct kmem_cache *s,
1059 struct page *page, void *object, unsigned long addr) 1147 struct page *page, void *object, unsigned long addr)
1060{ 1148{
1149 unsigned long flags;
1150 int rc = 0;
1151
1152 local_irq_save(flags);
1153 slab_lock(page);
1154
1061 if (!check_slab(s, page)) 1155 if (!check_slab(s, page))
1062 goto fail; 1156 goto fail;
1063 1157
@@ -1072,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1072 } 1166 }
1073 1167
1074 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1168 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1075 return 0; 1169 goto out;
1076 1170
1077 if (unlikely(s != page->slab)) { 1171 if (unlikely(s != page->slab)) {
1078 if (!PageSlab(page)) { 1172 if (!PageSlab(page)) {
@@ -1089,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1089 goto fail; 1183 goto fail;
1090 } 1184 }
1091 1185
1092 /* Special debug activities for freeing objects */
1093 if (!PageSlubFrozen(page) && !page->freelist)
1094 remove_full(s, page);
1095 if (s->flags & SLAB_STORE_USER) 1186 if (s->flags & SLAB_STORE_USER)
1096 set_track(s, object, TRACK_FREE, addr); 1187 set_track(s, object, TRACK_FREE, addr);
1097 trace(s, page, object, 0); 1188 trace(s, page, object, 0);
1098 init_object(s, object, SLUB_RED_INACTIVE); 1189 init_object(s, object, SLUB_RED_INACTIVE);
1099 return 1; 1190 rc = 1;
1191out:
1192 slab_unlock(page);
1193 local_irq_restore(flags);
1194 return rc;
1100 1195
1101fail: 1196fail:
1102 slab_fix(s, "Object at 0x%p not freed", object); 1197 slab_fix(s, "Object at 0x%p not freed", object);
1103 return 0; 1198 goto out;
1104} 1199}
1105 1200
1106static int __init setup_slub_debug(char *str) 1201static int __init setup_slub_debug(char *str)
@@ -1200,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1200 { return 1; } 1295 { return 1; }
1201static inline int check_object(struct kmem_cache *s, struct page *page, 1296static inline int check_object(struct kmem_cache *s, struct page *page,
1202 void *object, u8 val) { return 1; } 1297 void *object, u8 val) { return 1; }
1203static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1298static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1299 struct page *page) {}
1300static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1204static inline unsigned long kmem_cache_flags(unsigned long objsize, 1301static inline unsigned long kmem_cache_flags(unsigned long objsize,
1205 unsigned long flags, const char *name, 1302 unsigned long flags, const char *name,
1206 void (*ctor)(void *)) 1303 void (*ctor)(void *))
@@ -1252,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1252 struct kmem_cache_order_objects oo = s->oo; 1349 struct kmem_cache_order_objects oo = s->oo;
1253 gfp_t alloc_gfp; 1350 gfp_t alloc_gfp;
1254 1351
1352 flags &= gfp_allowed_mask;
1353
1354 if (flags & __GFP_WAIT)
1355 local_irq_enable();
1356
1255 flags |= s->allocflags; 1357 flags |= s->allocflags;
1256 1358
1257 /* 1359 /*
@@ -1268,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1268 * Try a lower order alloc if possible 1370 * Try a lower order alloc if possible
1269 */ 1371 */
1270 page = alloc_slab_page(flags, node, oo); 1372 page = alloc_slab_page(flags, node, oo);
1271 if (!page)
1272 return NULL;
1273 1373
1274 stat(s, ORDER_FALLBACK); 1374 if (page)
1375 stat(s, ORDER_FALLBACK);
1275 } 1376 }
1276 1377
1378 if (flags & __GFP_WAIT)
1379 local_irq_disable();
1380
1381 if (!page)
1382 return NULL;
1383
1277 if (kmemcheck_enabled 1384 if (kmemcheck_enabled
1278 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1385 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1279 int pages = 1 << oo_order(oo); 1386 int pages = 1 << oo_order(oo);
@@ -1341,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1341 1448
1342 page->freelist = start; 1449 page->freelist = start;
1343 page->inuse = 0; 1450 page->inuse = 0;
1451 page->frozen = 1;
1344out: 1452out:
1345 return page; 1453 return page;
1346} 1454}
@@ -1418,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1418} 1526}
1419 1527
1420/* 1528/*
1421 * Per slab locking using the pagelock 1529 * Management of partially allocated slabs.
1422 */ 1530 *
1423static __always_inline void slab_lock(struct page *page) 1531 * list_lock must be held.
1424{
1425 bit_spin_lock(PG_locked, &page->flags);
1426}
1427
1428static __always_inline void slab_unlock(struct page *page)
1429{
1430 __bit_spin_unlock(PG_locked, &page->flags);
1431}
1432
1433static __always_inline int slab_trylock(struct page *page)
1434{
1435 int rc = 1;
1436
1437 rc = bit_spin_trylock(PG_locked, &page->flags);
1438 return rc;
1439}
1440
1441/*
1442 * Management of partially allocated slabs
1443 */ 1532 */
1444static void add_partial(struct kmem_cache_node *n, 1533static inline void add_partial(struct kmem_cache_node *n,
1445 struct page *page, int tail) 1534 struct page *page, int tail)
1446{ 1535{
1447 spin_lock(&n->list_lock);
1448 n->nr_partial++; 1536 n->nr_partial++;
1449 if (tail) 1537 if (tail)
1450 list_add_tail(&page->lru, &n->partial); 1538 list_add_tail(&page->lru, &n->partial);
1451 else 1539 else
1452 list_add(&page->lru, &n->partial); 1540 list_add(&page->lru, &n->partial);
1453 spin_unlock(&n->list_lock);
1454} 1541}
1455 1542
1456static inline void __remove_partial(struct kmem_cache_node *n, 1543/*
1544 * list_lock must be held.
1545 */
1546static inline void remove_partial(struct kmem_cache_node *n,
1457 struct page *page) 1547 struct page *page)
1458{ 1548{
1459 list_del(&page->lru); 1549 list_del(&page->lru);
1460 n->nr_partial--; 1550 n->nr_partial--;
1461} 1551}
1462 1552
1463static void remove_partial(struct kmem_cache *s, struct page *page)
1464{
1465 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1466
1467 spin_lock(&n->list_lock);
1468 __remove_partial(n, page);
1469 spin_unlock(&n->list_lock);
1470}
1471
1472/* 1553/*
1473 * Lock slab and remove from the partial list. 1554 * Lock slab, remove from the partial list and put the object into the
1555 * per cpu freelist.
1474 * 1556 *
1475 * Must hold list_lock. 1557 * Must hold list_lock.
1476 */ 1558 */
1477static inline int lock_and_freeze_slab(struct kmem_cache_node *n, 1559static inline int acquire_slab(struct kmem_cache *s,
1478 struct page *page) 1560 struct kmem_cache_node *n, struct page *page)
1479{ 1561{
1480 if (slab_trylock(page)) { 1562 void *freelist;
1481 __remove_partial(n, page); 1563 unsigned long counters;
1482 __SetPageSlubFrozen(page); 1564 struct page new;
1565
1566 /*
1567 * Zap the freelist and set the frozen bit.
1568 * The old freelist is the list of objects for the
1569 * per cpu allocation list.
1570 */
1571 do {
1572 freelist = page->freelist;
1573 counters = page->counters;
1574 new.counters = counters;
1575 new.inuse = page->objects;
1576
1577 VM_BUG_ON(new.frozen);
1578 new.frozen = 1;
1579
1580 } while (!__cmpxchg_double_slab(s, page,
1581 freelist, counters,
1582 NULL, new.counters,
1583 "lock and freeze"));
1584
1585 remove_partial(n, page);
1586
1587 if (freelist) {
1588 /* Populate the per cpu freelist */
1589 this_cpu_write(s->cpu_slab->freelist, freelist);
1590 this_cpu_write(s->cpu_slab->page, page);
1591 this_cpu_write(s->cpu_slab->node, page_to_nid(page));
1483 return 1; 1592 return 1;
1593 } else {
1594 /*
1595 * Slab page came from the wrong list. No object to allocate
1596 * from. Put it onto the correct list and continue partial
1597 * scan.
1598 */
1599 printk(KERN_ERR "SLUB: %s : Page without available objects on"
1600 " partial list\n", s->name);
1601 return 0;
1484 } 1602 }
1485 return 0;
1486} 1603}
1487 1604
1488/* 1605/*
1489 * Try to allocate a partial slab from a specific node. 1606 * Try to allocate a partial slab from a specific node.
1490 */ 1607 */
1491static struct page *get_partial_node(struct kmem_cache_node *n) 1608static struct page *get_partial_node(struct kmem_cache *s,
1609 struct kmem_cache_node *n)
1492{ 1610{
1493 struct page *page; 1611 struct page *page;
1494 1612
@@ -1503,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
1503 1621
1504 spin_lock(&n->list_lock); 1622 spin_lock(&n->list_lock);
1505 list_for_each_entry(page, &n->partial, lru) 1623 list_for_each_entry(page, &n->partial, lru)
1506 if (lock_and_freeze_slab(n, page)) 1624 if (acquire_slab(s, n, page))
1507 goto out; 1625 goto out;
1508 page = NULL; 1626 page = NULL;
1509out: 1627out:
@@ -1554,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1554 1672
1555 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1673 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1556 n->nr_partial > s->min_partial) { 1674 n->nr_partial > s->min_partial) {
1557 page = get_partial_node(n); 1675 page = get_partial_node(s, n);
1558 if (page) { 1676 if (page) {
1559 put_mems_allowed(); 1677 put_mems_allowed();
1560 return page; 1678 return page;
@@ -1574,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1574 struct page *page; 1692 struct page *page;
1575 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1693 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1576 1694
1577 page = get_partial_node(get_node(s, searchnode)); 1695 page = get_partial_node(s, get_node(s, searchnode));
1578 if (page || node != NUMA_NO_NODE) 1696 if (page || node != NUMA_NO_NODE)
1579 return page; 1697 return page;
1580 1698
1581 return get_any_partial(s, flags); 1699 return get_any_partial(s, flags);
1582} 1700}
1583 1701
1584/*
1585 * Move a page back to the lists.
1586 *
1587 * Must be called with the slab lock held.
1588 *
1589 * On exit the slab lock will have been dropped.
1590 */
1591static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1592 __releases(bitlock)
1593{
1594 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1595
1596 __ClearPageSlubFrozen(page);
1597 if (page->inuse) {
1598
1599 if (page->freelist) {
1600 add_partial(n, page, tail);
1601 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1602 } else {
1603 stat(s, DEACTIVATE_FULL);
1604 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1605 add_full(n, page);
1606 }
1607 slab_unlock(page);
1608 } else {
1609 stat(s, DEACTIVATE_EMPTY);
1610 if (n->nr_partial < s->min_partial) {
1611 /*
1612 * Adding an empty slab to the partial slabs in order
1613 * to avoid page allocator overhead. This slab needs
1614 * to come after the other slabs with objects in
1615 * so that the others get filled first. That way the
1616 * size of the partial list stays small.
1617 *
1618 * kmem_cache_shrink can reclaim any empty slabs from
1619 * the partial list.
1620 */
1621 add_partial(n, page, 1);
1622 slab_unlock(page);
1623 } else {
1624 slab_unlock(page);
1625 stat(s, FREE_SLAB);
1626 discard_slab(s, page);
1627 }
1628 }
1629}
1630
1631#ifdef CONFIG_PREEMPT 1702#ifdef CONFIG_PREEMPT
1632/* 1703/*
1633 * Calculate the next globally unique transaction for disambiguiation 1704 * Calculate the next globally unique transaction for disambiguiation
@@ -1697,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1697/* 1768/*
1698 * Remove the cpu slab 1769 * Remove the cpu slab
1699 */ 1770 */
1771
1772/*
1773 * Remove the cpu slab
1774 */
1700static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1775static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1701 __releases(bitlock)
1702{ 1776{
1777 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1703 struct page *page = c->page; 1778 struct page *page = c->page;
1704 int tail = 1; 1779 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1705 1780 int lock = 0;
1706 if (page->freelist) 1781 enum slab_modes l = M_NONE, m = M_NONE;
1782 void *freelist;
1783 void *nextfree;
1784 int tail = 0;
1785 struct page new;
1786 struct page old;
1787
1788 if (page->freelist) {
1707 stat(s, DEACTIVATE_REMOTE_FREES); 1789 stat(s, DEACTIVATE_REMOTE_FREES);
1790 tail = 1;
1791 }
1792
1793 c->tid = next_tid(c->tid);
1794 c->page = NULL;
1795 freelist = c->freelist;
1796 c->freelist = NULL;
1797
1708 /* 1798 /*
1709 * Merge cpu freelist into slab freelist. Typically we get here 1799 * Stage one: Free all available per cpu objects back
1710 * because both freelists are empty. So this is unlikely 1800 * to the page freelist while it is still frozen. Leave the
1711 * to occur. 1801 * last one.
1802 *
1803 * There is no need to take the list->lock because the page
1804 * is still frozen.
1805 */
1806 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1807 void *prior;
1808 unsigned long counters;
1809
1810 do {
1811 prior = page->freelist;
1812 counters = page->counters;
1813 set_freepointer(s, freelist, prior);
1814 new.counters = counters;
1815 new.inuse--;
1816 VM_BUG_ON(!new.frozen);
1817
1818 } while (!__cmpxchg_double_slab(s, page,
1819 prior, counters,
1820 freelist, new.counters,
1821 "drain percpu freelist"));
1822
1823 freelist = nextfree;
1824 }
1825
1826 /*
1827 * Stage two: Ensure that the page is unfrozen while the
1828 * list presence reflects the actual number of objects
1829 * during unfreeze.
1830 *
1831 * We setup the list membership and then perform a cmpxchg
1832 * with the count. If there is a mismatch then the page
1833 * is not unfrozen but the page is on the wrong list.
1834 *
1835 * Then we restart the process which may have to remove
1836 * the page from the list that we just put it on again
1837 * because the number of objects in the slab may have
1838 * changed.
1712 */ 1839 */
1713 while (unlikely(c->freelist)) { 1840redo:
1714 void **object;
1715 1841
1716 tail = 0; /* Hot objects. Put the slab first */ 1842 old.freelist = page->freelist;
1843 old.counters = page->counters;
1844 VM_BUG_ON(!old.frozen);
1717 1845
1718 /* Retrieve object from cpu_freelist */ 1846 /* Determine target state of the slab */
1719 object = c->freelist; 1847 new.counters = old.counters;
1720 c->freelist = get_freepointer(s, c->freelist); 1848 if (freelist) {
1849 new.inuse--;
1850 set_freepointer(s, freelist, old.freelist);
1851 new.freelist = freelist;
1852 } else
1853 new.freelist = old.freelist;
1721 1854
1722 /* And put onto the regular freelist */ 1855 new.frozen = 0;
1723 set_freepointer(s, object, page->freelist); 1856
1724 page->freelist = object; 1857 if (!new.inuse && n->nr_partial > s->min_partial)
1725 page->inuse--; 1858 m = M_FREE;
1859 else if (new.freelist) {
1860 m = M_PARTIAL;
1861 if (!lock) {
1862 lock = 1;
1863 /*
1864 * Taking the spinlock removes the possibility
1865 * that acquire_slab() will see a slab page that
1866 * is frozen
1867 */
1868 spin_lock(&n->list_lock);
1869 }
1870 } else {
1871 m = M_FULL;
1872 if (kmem_cache_debug(s) && !lock) {
1873 lock = 1;
1874 /*
1875 * This also ensures that the scanning of full
1876 * slabs from diagnostic functions will not see
1877 * any frozen slabs.
1878 */
1879 spin_lock(&n->list_lock);
1880 }
1881 }
1882
1883 if (l != m) {
1884
1885 if (l == M_PARTIAL)
1886
1887 remove_partial(n, page);
1888
1889 else if (l == M_FULL)
1890
1891 remove_full(s, page);
1892
1893 if (m == M_PARTIAL) {
1894
1895 add_partial(n, page, tail);
1896 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1897
1898 } else if (m == M_FULL) {
1899
1900 stat(s, DEACTIVATE_FULL);
1901 add_full(s, n, page);
1902
1903 }
1904 }
1905
1906 l = m;
1907 if (!__cmpxchg_double_slab(s, page,
1908 old.freelist, old.counters,
1909 new.freelist, new.counters,
1910 "unfreezing slab"))
1911 goto redo;
1912
1913 if (lock)
1914 spin_unlock(&n->list_lock);
1915
1916 if (m == M_FREE) {
1917 stat(s, DEACTIVATE_EMPTY);
1918 discard_slab(s, page);
1919 stat(s, FREE_SLAB);
1726 } 1920 }
1727 c->page = NULL;
1728 c->tid = next_tid(c->tid);
1729 unfreeze_slab(s, page, tail);
1730} 1921}
1731 1922
1732static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1923static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1733{ 1924{
1734 stat(s, CPUSLAB_FLUSH); 1925 stat(s, CPUSLAB_FLUSH);
1735 slab_lock(c->page);
1736 deactivate_slab(s, c); 1926 deactivate_slab(s, c);
1737} 1927}
1738 1928
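
After stage one has drained the per cpu objects, the destination of the page is decided purely from its final state: free it if it ends up empty and the node already keeps enough partial slabs, put it on the partial list if a freelist remains, otherwise move it to the full list. A standalone sketch of that decision with hypothetical inputs; the real code additionally takes n->list_lock where needed and retries the final cmpxchg on races:

#include <stdio.h>

enum slab_mode { M_FREE, M_PARTIAL, M_FULL };

static enum slab_mode unfreeze_target(unsigned int inuse, int has_freelist,
				      unsigned long nr_partial,
				      unsigned long min_partial)
{
	if (!inuse && nr_partial > min_partial)
		return M_FREE;		/* empty, enough partials kept */
	if (has_freelist)
		return M_PARTIAL;	/* free objects left: partial list */
	return M_FULL;			/* no free objects: full list */
}

int main(void)
{
	printf("%d\n", unfreeze_target(0, 1, 10, 5));	/* 0: M_FREE */
	printf("%d\n", unfreeze_target(3, 1, 10, 5));	/* 1: M_PARTIAL */
	printf("%d\n", unfreeze_target(8, 0, 10, 5));	/* 2: M_FULL */
	return 0;
}
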
@@ -1861,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1861 void **object; 2051 void **object;
1862 struct page *page; 2052 struct page *page;
1863 unsigned long flags; 2053 unsigned long flags;
2054 struct page new;
2055 unsigned long counters;
1864 2056
1865 local_irq_save(flags); 2057 local_irq_save(flags);
1866#ifdef CONFIG_PREEMPT 2058#ifdef CONFIG_PREEMPT
@@ -1879,72 +2071,97 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1879 if (!page) 2071 if (!page)
1880 goto new_slab; 2072 goto new_slab;
1881 2073
1882 slab_lock(page); 2074 if (unlikely(!node_match(c, node))) {
1883 if (unlikely(!node_match(c, node))) 2075 stat(s, ALLOC_NODE_MISMATCH);
1884 goto another_slab; 2076 deactivate_slab(s, c);
2077 goto new_slab;
2078 }
2079
2080 stat(s, ALLOC_SLOWPATH);
2081
2082 do {
2083 object = page->freelist;
2084 counters = page->counters;
2085 new.counters = counters;
2086 VM_BUG_ON(!new.frozen);
2087
2088 /*
2089 * If there is no object left then we use this loop to
2090 * deactivate the slab which is simple since no objects
2091 * are left in the slab and therefore we do not need to
2092 * put the page back onto the partial list.
2093 *
2094 * If there are objects left then we retrieve them
2095 * and use them to refill the per cpu queue.
2096 */
2097
2098 new.inuse = page->objects;
2099 new.frozen = object != NULL;
2100
2101 } while (!__cmpxchg_double_slab(s, page,
2102 object, counters,
2103 NULL, new.counters,
2104 "__slab_alloc"));
2105
2106 if (unlikely(!object)) {
2107 c->page = NULL;
2108 stat(s, DEACTIVATE_BYPASS);
2109 goto new_slab;
2110 }
1885 2111
1886 stat(s, ALLOC_REFILL); 2112 stat(s, ALLOC_REFILL);
1887 2113
1888load_freelist: 2114load_freelist:
1889 object = page->freelist; 2115 VM_BUG_ON(!page->frozen);
1890 if (unlikely(!object))
1891 goto another_slab;
1892 if (kmem_cache_debug(s))
1893 goto debug;
1894
1895 c->freelist = get_freepointer(s, object); 2116 c->freelist = get_freepointer(s, object);
1896 page->inuse = page->objects;
1897 page->freelist = NULL;
1898
1899 slab_unlock(page);
1900 c->tid = next_tid(c->tid); 2117 c->tid = next_tid(c->tid);
1901 local_irq_restore(flags); 2118 local_irq_restore(flags);
1902 stat(s, ALLOC_SLOWPATH);
1903 return object; 2119 return object;
1904 2120
1905another_slab:
1906 deactivate_slab(s, c);
1907
1908new_slab: 2121new_slab:
1909 page = get_partial(s, gfpflags, node); 2122 page = get_partial(s, gfpflags, node);
1910 if (page) { 2123 if (page) {
1911 stat(s, ALLOC_FROM_PARTIAL); 2124 stat(s, ALLOC_FROM_PARTIAL);
1912 c->node = page_to_nid(page); 2125 object = c->freelist;
1913 c->page = page; 2126
2127 if (kmem_cache_debug(s))
2128 goto debug;
1914 goto load_freelist; 2129 goto load_freelist;
1915 } 2130 }
1916 2131
1917 gfpflags &= gfp_allowed_mask;
1918 if (gfpflags & __GFP_WAIT)
1919 local_irq_enable();
1920
1921 page = new_slab(s, gfpflags, node); 2132 page = new_slab(s, gfpflags, node);
1922 2133
1923 if (gfpflags & __GFP_WAIT)
1924 local_irq_disable();
1925
1926 if (page) { 2134 if (page) {
1927 c = __this_cpu_ptr(s->cpu_slab); 2135 c = __this_cpu_ptr(s->cpu_slab);
1928 stat(s, ALLOC_SLAB);
1929 if (c->page) 2136 if (c->page)
1930 flush_slab(s, c); 2137 flush_slab(s, c);
1931 2138
1932 slab_lock(page); 2139 /*
1933 __SetPageSlubFrozen(page); 2140 * No other reference to the page yet so we can
2141 * muck around with it freely without cmpxchg
2142 */
2143 object = page->freelist;
2144 page->freelist = NULL;
2145 page->inuse = page->objects;
2146
2147 stat(s, ALLOC_SLAB);
1934 c->node = page_to_nid(page); 2148 c->node = page_to_nid(page);
1935 c->page = page; 2149 c->page = page;
2150
2151 if (kmem_cache_debug(s))
2152 goto debug;
1936 goto load_freelist; 2153 goto load_freelist;
1937 } 2154 }
1938 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2155 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1939 slab_out_of_memory(s, gfpflags, node); 2156 slab_out_of_memory(s, gfpflags, node);
1940 local_irq_restore(flags); 2157 local_irq_restore(flags);
1941 return NULL; 2158 return NULL;
2159
1942debug: 2160debug:
1943 if (!alloc_debug_processing(s, page, object, addr)) 2161 if (!object || !alloc_debug_processing(s, page, object, addr))
1944 goto another_slab; 2162 goto new_slab;
1945 2163
1946 page->inuse++; 2164 c->freelist = get_freepointer(s, object);
1947 page->freelist = get_freepointer(s, object);
1948 deactivate_slab(s, c); 2165 deactivate_slab(s, c);
1949 c->page = NULL; 2166 c->page = NULL;
1950 c->node = NUMA_NO_NODE; 2167 c->node = NUMA_NO_NODE;
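
The rewritten slow path no longer peels objects off one at a time under slab_lock: it detaches the whole page freelist in one atomic step, leaving the page frozen with an empty freelist, and then serves allocations from the captured chain without further atomics. A toy sketch of that "take everything at once" idea using atomic_exchange; the kernel pairs the swap with the counters word via __cmpxchg_double_slab(), which this sketch does not model:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct object {
	struct object *next;
	int payload;
};

static _Atomic(struct object *) page_freelist;	/* like page->freelist */

/* Detach every free object at once; NULL means the page is exhausted. */
static struct object *take_freelist(void)
{
	return atomic_exchange(&page_freelist, NULL);
}

int main(void)
{
	struct object a = { NULL, 1 }, b = { &a, 2 };
	struct object *local;

	atomic_store(&page_freelist, &b);

	/* One atomic operation replaces per-object locking... */
	for (local = take_freelist(); local; local = local->next)
		printf("allocated object %d from the captured list\n",
		       local->payload);

	/* ...and a later caller simply sees an exhausted page. */
	if (!take_freelist())
		printf("freelist empty, get a partial or new slab\n");
	return 0;
}
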
@@ -2096,52 +2313,89 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2096{ 2313{
2097 void *prior; 2314 void *prior;
2098 void **object = (void *)x; 2315 void **object = (void *)x;
2099 unsigned long flags; 2316 int was_frozen;
2317 int inuse;
2318 struct page new;
2319 unsigned long counters;
2320 struct kmem_cache_node *n = NULL;
2321 unsigned long uninitialized_var(flags);
2100 2322
2101 local_irq_save(flags);
2102 slab_lock(page);
2103 stat(s, FREE_SLOWPATH); 2323 stat(s, FREE_SLOWPATH);
2104 2324
2105 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2325 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2106 goto out_unlock; 2326 return;
2107 2327
2108 prior = page->freelist; 2328 do {
2109 set_freepointer(s, object, prior); 2329 prior = page->freelist;
2110 page->freelist = object; 2330 counters = page->counters;
2111 page->inuse--; 2331 set_freepointer(s, object, prior);
2332 new.counters = counters;
2333 was_frozen = new.frozen;
2334 new.inuse--;
2335 if ((!new.inuse || !prior) && !was_frozen && !n) {
2336 n = get_node(s, page_to_nid(page));
2337 /*
2338 * Speculatively acquire the list_lock.
2339 * If the cmpxchg does not succeed then we may
2340 * drop the list_lock without any processing.
2341 *
2342 * Otherwise the list_lock will synchronize with
2343 * other processors updating the list of slabs.
2344 */
2345 spin_lock_irqsave(&n->list_lock, flags);
2346 }
2347 inuse = new.inuse;
2112 2348
2113 if (unlikely(PageSlubFrozen(page))) { 2349 } while (!cmpxchg_double_slab(s, page,
2114 stat(s, FREE_FROZEN); 2350 prior, counters,
2115 goto out_unlock; 2351 object, new.counters,
2116 } 2352 "__slab_free"));
2117 2353
2118 if (unlikely(!page->inuse)) 2354 if (likely(!n)) {
2119 goto slab_empty; 2355 /*
2356 * The list lock was not taken therefore no list
2357 * activity can be necessary.
2358 */
2359 if (was_frozen)
2360 stat(s, FREE_FROZEN);
2361 return;
2362 }
2120 2363
2121 /* 2364 /*
2122 * Objects left in the slab. If it was not on the partial list before 2365 * was_frozen may have been set after we acquired the list_lock in
2123 * then add it. 2366 * an earlier loop. So we need to check it here again.
2124 */ 2367 */
2125 if (unlikely(!prior)) { 2368 if (was_frozen)
2126 add_partial(get_node(s, page_to_nid(page)), page, 1); 2369 stat(s, FREE_FROZEN);
2127 stat(s, FREE_ADD_PARTIAL); 2370 else {
2128 } 2371 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2372 goto slab_empty;
2129 2373
2130out_unlock: 2374 /*
2131 slab_unlock(page); 2375 * Objects left in the slab. If it was not on the partial list before
2132 local_irq_restore(flags); 2376 * then add it.
2377 */
2378 if (unlikely(!prior)) {
2379 remove_full(s, page);
2380 add_partial(n, page, 0);
2381 stat(s, FREE_ADD_PARTIAL);
2382 }
2383 }
2384 spin_unlock_irqrestore(&n->list_lock, flags);
2133 return; 2385 return;
2134 2386
2135slab_empty: 2387slab_empty:
2136 if (prior) { 2388 if (prior) {
2137 /* 2389 /*
2138 * Slab still on the partial list. 2390 * Slab on the partial list.
2139 */ 2391 */
2140 remove_partial(s, page); 2392 remove_partial(n, page);
2141 stat(s, FREE_REMOVE_PARTIAL); 2393 stat(s, FREE_REMOVE_PARTIAL);
2142 } 2394 } else
2143 slab_unlock(page); 2395 /* Slab must be on the full list */
2144 local_irq_restore(flags); 2396 remove_full(s, page);
2397
2398 spin_unlock_irqrestore(&n->list_lock, flags);
2145 stat(s, FREE_SLAB); 2399 stat(s, FREE_SLAB);
2146 discard_slab(s, page); 2400 discard_slab(s, page);
2147} 2401}
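
The speculative list_lock above follows a simple rule: the lock is only worth taking when the free might change the page's list membership, that is when the slab could become empty or when a full slab gains its first free object, and never while the slab is frozen (its owning CPU handles list moves). If the cmpxchg then shows no list work was needed, the lock is dropped untouched. A hedged sketch of just that predicate, with hypothetical parameter names:

#include <stdbool.h>
#include <stdio.h>

static bool free_needs_list_lock(unsigned int inuse_after_free,
				 bool had_free_objects, bool frozen)
{
	if (frozen)
		return false;		/* owner CPU manages list moves */
	if (!inuse_after_free)
		return true;		/* slab may be freed or moved */
	if (!had_free_objects)
		return true;		/* full -> partial transition */
	return false;			/* stays on the partial list */
}

int main(void)
{
	printf("%d\n", free_needs_list_lock(4, true, true));	/* 0 */
	printf("%d\n", free_needs_list_lock(0, true, false));	/* 1 */
	printf("%d\n", free_needs_list_lock(7, false, false));	/* 1 */
	printf("%d\n", free_needs_list_lock(7, true, false));	/* 0 */
	return 0;
}
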
@@ -2415,7 +2669,6 @@ static void early_kmem_cache_node_alloc(int node)
2415{ 2669{
2416 struct page *page; 2670 struct page *page;
2417 struct kmem_cache_node *n; 2671 struct kmem_cache_node *n;
2418 unsigned long flags;
2419 2672
2420 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2673 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2421 2674
@@ -2433,6 +2686,7 @@ static void early_kmem_cache_node_alloc(int node)
2433 BUG_ON(!n); 2686 BUG_ON(!n);
2434 page->freelist = get_freepointer(kmem_cache_node, n); 2687 page->freelist = get_freepointer(kmem_cache_node, n);
2435 page->inuse++; 2688 page->inuse++;
2689 page->frozen = 0;
2436 kmem_cache_node->node[node] = n; 2690 kmem_cache_node->node[node] = n;
2437#ifdef CONFIG_SLUB_DEBUG 2691#ifdef CONFIG_SLUB_DEBUG
2438 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2692 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2441,14 +2695,7 @@ static void early_kmem_cache_node_alloc(int node)
2441 init_kmem_cache_node(n, kmem_cache_node); 2695 init_kmem_cache_node(n, kmem_cache_node);
2442 inc_slabs_node(kmem_cache_node, node, page->objects); 2696 inc_slabs_node(kmem_cache_node, node, page->objects);
2443 2697
2444 /*
2445 * lockdep requires consistent irq usage for each lock
2446 * so even though there cannot be a race this early in
2447 * the boot sequence, we still disable irqs.
2448 */
2449 local_irq_save(flags);
2450 add_partial(n, page, 0); 2698 add_partial(n, page, 0);
2451 local_irq_restore(flags);
2452} 2699}
2453 2700
2454static void free_kmem_cache_nodes(struct kmem_cache *s) 2701static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2654,6 +2901,12 @@ static int kmem_cache_open(struct kmem_cache *s,
2654 } 2901 }
2655 } 2902 }
2656 2903
2904#ifdef CONFIG_CMPXCHG_DOUBLE
2905 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2906 /* Enable fast mode */
2907 s->flags |= __CMPXCHG_DOUBLE;
2908#endif
2909
2657 /* 2910 /*
2658 * The larger the object size is, the more pages we want on the partial 2911 * The larger the object size is, the more pages we want on the partial
2659 * list to avoid pounding the page allocator excessively. 2912 * list to avoid pounding the page allocator excessively.
@@ -2726,7 +2979,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2726 spin_lock_irqsave(&n->list_lock, flags); 2979 spin_lock_irqsave(&n->list_lock, flags);
2727 list_for_each_entry_safe(page, h, &n->partial, lru) { 2980 list_for_each_entry_safe(page, h, &n->partial, lru) {
2728 if (!page->inuse) { 2981 if (!page->inuse) {
2729 __remove_partial(n, page); 2982 remove_partial(n, page);
2730 discard_slab(s, page); 2983 discard_slab(s, page);
2731 } else { 2984 } else {
2732 list_slab_objects(s, page, 2985 list_slab_objects(s, page,
@@ -3094,14 +3347,8 @@ int kmem_cache_shrink(struct kmem_cache *s)
3094 * list_lock. page->inuse here is the upper limit. 3347 * list_lock. page->inuse here is the upper limit.
3095 */ 3348 */
3096 list_for_each_entry_safe(page, t, &n->partial, lru) { 3349 list_for_each_entry_safe(page, t, &n->partial, lru) {
3097 if (!page->inuse && slab_trylock(page)) { 3350 if (!page->inuse) {
3098 /* 3351 remove_partial(n, page);
3099 * Must hold slab lock here because slab_free
3100 * may have freed the last object and be
3101 * waiting to release the slab.
3102 */
3103 __remove_partial(n, page);
3104 slab_unlock(page);
3105 discard_slab(s, page); 3352 discard_slab(s, page);
3106 } else { 3353 } else {
3107 list_move(&page->lru, 3354 list_move(&page->lru,
@@ -3689,12 +3936,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3689static void validate_slab_slab(struct kmem_cache *s, struct page *page, 3936static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3690 unsigned long *map) 3937 unsigned long *map)
3691{ 3938{
3692 if (slab_trylock(page)) { 3939 slab_lock(page);
3693 validate_slab(s, page, map); 3940 validate_slab(s, page, map);
3694 slab_unlock(page); 3941 slab_unlock(page);
3695 } else
3696 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3697 s->name, page);
3698} 3942}
3699 3943
3700static int validate_slab_node(struct kmem_cache *s, 3944static int validate_slab_node(struct kmem_cache *s,
@@ -4342,8 +4586,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
4342 const char *buf, size_t length) 4586 const char *buf, size_t length)
4343{ 4587{
4344 s->flags &= ~SLAB_DEBUG_FREE; 4588 s->flags &= ~SLAB_DEBUG_FREE;
4345 if (buf[0] == '1') 4589 if (buf[0] == '1') {
4590 s->flags &= ~__CMPXCHG_DOUBLE;
4346 s->flags |= SLAB_DEBUG_FREE; 4591 s->flags |= SLAB_DEBUG_FREE;
4592 }
4347 return length; 4593 return length;
4348} 4594}
4349SLAB_ATTR(sanity_checks); 4595SLAB_ATTR(sanity_checks);
@@ -4357,8 +4603,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4357 size_t length) 4603 size_t length)
4358{ 4604{
4359 s->flags &= ~SLAB_TRACE; 4605 s->flags &= ~SLAB_TRACE;
4360 if (buf[0] == '1') 4606 if (buf[0] == '1') {
4607 s->flags &= ~__CMPXCHG_DOUBLE;
4361 s->flags |= SLAB_TRACE; 4608 s->flags |= SLAB_TRACE;
4609 }
4362 return length; 4610 return length;
4363} 4611}
4364SLAB_ATTR(trace); 4612SLAB_ATTR(trace);
@@ -4375,8 +4623,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4375 return -EBUSY; 4623 return -EBUSY;
4376 4624
4377 s->flags &= ~SLAB_RED_ZONE; 4625 s->flags &= ~SLAB_RED_ZONE;
4378 if (buf[0] == '1') 4626 if (buf[0] == '1') {
4627 s->flags &= ~__CMPXCHG_DOUBLE;
4379 s->flags |= SLAB_RED_ZONE; 4628 s->flags |= SLAB_RED_ZONE;
4629 }
4380 calculate_sizes(s, -1); 4630 calculate_sizes(s, -1);
4381 return length; 4631 return length;
4382} 4632}
@@ -4394,8 +4644,10 @@ static ssize_t poison_store(struct kmem_cache *s,
4394 return -EBUSY; 4644 return -EBUSY;
4395 4645
4396 s->flags &= ~SLAB_POISON; 4646 s->flags &= ~SLAB_POISON;
4397 if (buf[0] == '1') 4647 if (buf[0] == '1') {
4648 s->flags &= ~__CMPXCHG_DOUBLE;
4398 s->flags |= SLAB_POISON; 4649 s->flags |= SLAB_POISON;
4650 }
4399 calculate_sizes(s, -1); 4651 calculate_sizes(s, -1);
4400 return length; 4652 return length;
4401} 4653}
@@ -4413,8 +4665,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
4413 return -EBUSY; 4665 return -EBUSY;
4414 4666
4415 s->flags &= ~SLAB_STORE_USER; 4667 s->flags &= ~SLAB_STORE_USER;
4416 if (buf[0] == '1') 4668 if (buf[0] == '1') {
4669 s->flags &= ~__CMPXCHG_DOUBLE;
4417 s->flags |= SLAB_STORE_USER; 4670 s->flags |= SLAB_STORE_USER;
4671 }
4418 calculate_sizes(s, -1); 4672 calculate_sizes(s, -1);
4419 return length; 4673 return length;
4420} 4674}
@@ -4579,6 +4833,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4579STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 4833STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4580STAT_ATTR(ALLOC_SLAB, alloc_slab); 4834STAT_ATTR(ALLOC_SLAB, alloc_slab);
4581STAT_ATTR(ALLOC_REFILL, alloc_refill); 4835STAT_ATTR(ALLOC_REFILL, alloc_refill);
4836STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4582STAT_ATTR(FREE_SLAB, free_slab); 4837STAT_ATTR(FREE_SLAB, free_slab);
4583STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 4838STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4584STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 4839STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4586,7 +4841,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4586STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 4841STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4587STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 4842STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4588STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 4843STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
4844STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4589STAT_ATTR(ORDER_FALLBACK, order_fallback); 4845STAT_ATTR(ORDER_FALLBACK, order_fallback);
4846STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4847STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4590#endif 4848#endif
4591 4849
4592static struct attribute *slab_attrs[] = { 4850static struct attribute *slab_attrs[] = {
@@ -4636,6 +4894,7 @@ static struct attribute *slab_attrs[] = {
4636 &alloc_from_partial_attr.attr, 4894 &alloc_from_partial_attr.attr,
4637 &alloc_slab_attr.attr, 4895 &alloc_slab_attr.attr,
4638 &alloc_refill_attr.attr, 4896 &alloc_refill_attr.attr,
4897 &alloc_node_mismatch_attr.attr,
4639 &free_slab_attr.attr, 4898 &free_slab_attr.attr,
4640 &cpuslab_flush_attr.attr, 4899 &cpuslab_flush_attr.attr,
4641 &deactivate_full_attr.attr, 4900 &deactivate_full_attr.attr,
@@ -4643,7 +4902,10 @@ static struct attribute *slab_attrs[] = {
4643 &deactivate_to_head_attr.attr, 4902 &deactivate_to_head_attr.attr,
4644 &deactivate_to_tail_attr.attr, 4903 &deactivate_to_tail_attr.attr,
4645 &deactivate_remote_frees_attr.attr, 4904 &deactivate_remote_frees_attr.attr,
4905 &deactivate_bypass_attr.attr,
4646 &order_fallback_attr.attr, 4906 &order_fallback_attr.attr,
4907 &cmpxchg_double_fail_attr.attr,
4908 &cmpxchg_double_cpu_fail_attr.attr,
4647#endif 4909#endif
4648#ifdef CONFIG_FAILSLAB 4910#ifdef CONFIG_FAILSLAB
4649 &failslab_attr.attr, 4911 &failslab_attr.attr,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1b8c33907242..17bc224bce68 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1924,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1924 1924
1925 /* 1925 /*
1926 * Find out how many pages are allowed for a single swap 1926 * Find out how many pages are allowed for a single swap
1927 * device. There are two limiting factors: 1) the number of 1927 * device. There are three limiting factors: 1) the number
1928 * bits for the swap offset in the swp_entry_t type and 1928 * of bits for the swap offset in the swp_entry_t type, and
1929 * 2) the number of bits in the a swap pte as defined by 1929 * 2) the number of bits in the swap pte as defined by the
1930 * the different architectures. In order to find the 1930 * the different architectures, and 3) the number of free bits
1931 * largest possible bit mask a swap entry with swap type 0 1931 * in an exceptional radix_tree entry. In order to find the
1932 * largest possible bit mask, a swap entry with swap type 0
1932 * and swap offset ~0UL is created, encoded to a swap pte, 1933 * and swap offset ~0UL is created, encoded to a swap pte,
1933 * decoded to a swp_entry_t again and finally the swap 1934 * decoded to a swp_entry_t again, and finally the swap
1934 * offset is extracted. This will mask all the bits from 1935 * offset is extracted. This will mask all the bits from
1935 * the initial ~0UL mask that can't be encoded in either 1936 * the initial ~0UL mask that can't be encoded in either
1936 * the swp_entry_t or the architecture definition of a 1937 * the swp_entry_t or the architecture definition of a
1937 * swap pte. 1938 * swap pte. Then the same is done for a radix_tree entry.
1938 */ 1939 */
1939 maxpages = swp_offset(pte_to_swp_entry( 1940 maxpages = swp_offset(pte_to_swp_entry(
1940 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 1941 swp_entry_to_pte(swp_entry(0, ~0UL))));
1942 maxpages = swp_offset(radix_to_swp_entry(
1943 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1944
1941 if (maxpages > swap_header->info.last_page) { 1945 if (maxpages > swap_header->info.last_page) {
1942 maxpages = swap_header->info.last_page + 1; 1946 maxpages = swap_header->info.last_page + 1;
1943 /* p->max is an unsigned int: don't overflow it */ 1947 /* p->max is an unsigned int: don't overflow it */
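
The probing trick in the comment deserves a tiny illustration: start from an all-ones offset, round-trip it through every encoding that has to hold it, and whatever survives is the largest offset all of them can represent. The two encodings below are invented (12- and 10-bit masks); only the shape of the computation mirrors what read_swap_header() does with the swap pte and radix_tree encodings:

#include <stdio.h>

static unsigned long encode_a(unsigned long off) { return off & 0xfffUL; }
static unsigned long decode_a(unsigned long raw) { return raw; }
static unsigned long encode_b(unsigned long off) { return off & 0x3ffUL; }
static unsigned long decode_b(unsigned long raw) { return raw; }

int main(void)
{
	unsigned long max = ~0UL;	/* maximal candidate offset */

	max = decode_a(encode_a(max));	/* keep bits encoding A preserves */
	max = decode_b(encode_b(max));	/* then bits encoding B preserves */
	printf("largest usable offset + 1: %lu pages\n", max + 1);
	return 0;
}

With the masks above this prints 1024, i.e. the narrowest encoding bounds the result, just as the new third factor (the radix_tree entry) can now bound maxpages.
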
diff --git a/mm/truncate.c b/mm/truncate.c
index 232eb2736a79..b40ac6d4e86e 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
336 unsigned long count = 0; 336 unsigned long count = 0;
337 int i; 337 int i;
338 338
339 /*
340 * Note: this function may get called on a shmem/tmpfs mapping:
341 * pagevec_lookup() might then return 0 prematurely (because it
342 * got a gangful of swap entries); but it's hardly worth worrying
343 * about - it can rarely have anything to free from such a mapping
344 * (most pages are dirty), and already skips over any difficulties.
345 */
346
339 pagevec_init(&pvec, 0); 347 pagevec_init(&pvec, 0);
340 while (index <= end && pagevec_lookup(&pvec, mapping, index, 348 while (index <= end && pagevec_lookup(&pvec, mapping, index,
341 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 349 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ab8494cde007..5016f19e1661 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -26,7 +26,7 @@
26#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
27#include <linux/pfn.h> 27#include <linux/pfn.h>
28#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
29#include <asm/atomic.h> 29#include <linux/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
@@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
728#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 728#define VMAP_BBMAP_BITS \
729 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 729 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
730 VMALLOC_PAGES / NR_CPUS / 16)) 730 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
731 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
731 732
732#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 733#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
733 734
@@ -2139,6 +2140,14 @@ struct vm_struct *alloc_vm_area(size_t size)
2139 return NULL; 2140 return NULL;
2140 } 2141 }
2141 2142
2143 /*
2144 * If the allocated address space is passed to a hypercall
2145 * before being used then we cannot rely on a page fault to
2146 * trigger an update of the page tables. So sync all the page
2147 * tables here.
2148 */
2149 vmalloc_sync_all();
2150
2142 return area; 2151 return area;
2143} 2152}
2144EXPORT_SYMBOL_GPL(alloc_vm_area); 2153EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e32698fab66..9fdfce7ba403 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
95 /* Can pages be swapped as part of reclaim? */ 95 /* Can pages be swapped as part of reclaim? */
96 int may_swap; 96 int may_swap;
97 97
98 int swappiness;
99
100 int order; 98 int order;
101 99
102 /* 100 /*
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
173 struct scan_control *sc, enum lru_list lru) 171 struct scan_control *sc, enum lru_list lru)
174{ 172{
175 if (!scanning_global_lru(sc)) 173 if (!scanning_global_lru(sc))
176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); 174 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
175 zone_to_nid(zone), zone_idx(zone), BIT(lru));
177 176
178 return zone_page_state(zone, NR_LRU_BASE + lru); 177 return zone_page_state(zone, NR_LRU_BASE + lru);
179} 178}
@@ -1770,6 +1769,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1770 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); 1769 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1771} 1770}
1772 1771
1772static int vmscan_swappiness(struct scan_control *sc)
1773{
1774 if (scanning_global_lru(sc))
1775 return vm_swappiness;
1776 return mem_cgroup_swappiness(sc->mem_cgroup);
1777}
1778
1773/* 1779/*
1774 * Determine how aggressively the anon and file LRU lists should be 1780 * Determine how aggressively the anon and file LRU lists should be
1775 * scanned. The relative value of each set of LRU lists is determined 1781 * scanned. The relative value of each set of LRU lists is determined
@@ -1788,22 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1788 u64 fraction[2], denominator; 1794 u64 fraction[2], denominator;
1789 enum lru_list l; 1795 enum lru_list l;
1790 int noswap = 0; 1796 int noswap = 0;
1791 int force_scan = 0; 1797 bool force_scan = false;
1792 1798 unsigned long nr_force_scan[2];
1793
1794 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1795 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1796 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1797 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1798 1799
1799 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1800 /* kswapd does zone balancing and needs to scan this zone */
1800 /* kswapd does zone balancing and need to scan this zone */ 1801 if (scanning_global_lru(sc) && current_is_kswapd())
1801 if (scanning_global_lru(sc) && current_is_kswapd()) 1802 force_scan = true;
1802 force_scan = 1; 1803 /* memcg may have small limit and need to avoid priority drop */
1803 /* memcg may have small limit and need to avoid priority drop */ 1804 if (!scanning_global_lru(sc))
1804 if (!scanning_global_lru(sc)) 1805 force_scan = true;
1805 force_scan = 1;
1806 }
1807 1806
1808 /* If we have no swap space, do not bother scanning anon pages. */ 1807 /* If we have no swap space, do not bother scanning anon pages. */
1809 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1808 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1811,9 +1810,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1811 fraction[0] = 0; 1810 fraction[0] = 0;
1812 fraction[1] = 1; 1811 fraction[1] = 1;
1813 denominator = 1; 1812 denominator = 1;
1813 nr_force_scan[0] = 0;
1814 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1814 goto out; 1815 goto out;
1815 } 1816 }
1816 1817
1818 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1819 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1820 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1821 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1822
1817 if (scanning_global_lru(sc)) { 1823 if (scanning_global_lru(sc)) {
1818 free = zone_page_state(zone, NR_FREE_PAGES); 1824 free = zone_page_state(zone, NR_FREE_PAGES);
1819 /* If we have very few page cache pages, 1825 /* If we have very few page cache pages,
@@ -1822,6 +1828,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1822 fraction[0] = 1; 1828 fraction[0] = 1;
1823 fraction[1] = 0; 1829 fraction[1] = 0;
1824 denominator = 1; 1830 denominator = 1;
1831 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1832 nr_force_scan[1] = 0;
1825 goto out; 1833 goto out;
1826 } 1834 }
1827 } 1835 }
@@ -1830,8 +1838,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1830 * With swappiness at 100, anonymous and file have the same priority. 1838 * With swappiness at 100, anonymous and file have the same priority.
1831 * This scanning priority is essentially the inverse of IO cost. 1839 * This scanning priority is essentially the inverse of IO cost.
1832 */ 1840 */
1833 anon_prio = sc->swappiness; 1841 anon_prio = vmscan_swappiness(sc);
1834 file_prio = 200 - sc->swappiness; 1842 file_prio = 200 - vmscan_swappiness(sc);
1835 1843
1836 /* 1844 /*
1837 * OK, so we have swap space and a fair amount of page cache 1845 * OK, so we have swap space and a fair amount of page cache
@@ -1870,6 +1878,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1870 fraction[0] = ap; 1878 fraction[0] = ap;
1871 fraction[1] = fp; 1879 fraction[1] = fp;
1872 denominator = ap + fp + 1; 1880 denominator = ap + fp + 1;
1881 if (force_scan) {
1882 unsigned long scan = SWAP_CLUSTER_MAX;
1883 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1884 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1885 }
1873out: 1886out:
1874 for_each_evictable_lru(l) { 1887 for_each_evictable_lru(l) {
1875 int file = is_file_lru(l); 1888 int file = is_file_lru(l);
@@ -1890,12 +1903,8 @@ out:
1890 * memcg, priority drop can cause big latency. So, it's better 1903 * memcg, priority drop can cause big latency. So, it's better
1891 * to scan small amount. See may_noscan above. 1904 * to scan small amount. See may_noscan above.
1892 */ 1905 */
1893 if (!scan && force_scan) { 1906 if (!scan && force_scan)
1894 if (file) 1907 scan = nr_force_scan[file];
1895 scan = SWAP_CLUSTER_MAX;
1896 else if (!noswap)
1897 scan = SWAP_CLUSTER_MAX;
1898 }
1899 nr[l] = scan; 1908 nr[l] = scan;
1900 } 1909 }
1901} 1910}
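
The new nr_force_scan[] values simply split the fixed minimum batch (SWAP_CLUSTER_MAX pages) between anon and file in the same ratio as the computed fractions, so a forced scan keeps the usual anon/file balance instead of scanning a full batch of each. A small arithmetic sketch with made-up ap/fp values:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
	unsigned long ap = 300, fp = 100;	/* hypothetical scan pressures */
	unsigned long denominator = ap + fp + 1;
	unsigned long force_anon = SWAP_CLUSTER_MAX * ap / denominator;
	unsigned long force_file = SWAP_CLUSTER_MAX * fp / denominator;

	/* prints 23 and 7: roughly the 3:1 ratio of ap to fp */
	printf("forced scan: anon %lu, file %lu\n", force_anon, force_file);
	return 0;
}
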
@@ -2220,7 +2229,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2220 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2229 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2221 .may_unmap = 1, 2230 .may_unmap = 1,
2222 .may_swap = 1, 2231 .may_swap = 1,
2223 .swappiness = vm_swappiness,
2224 .order = order, 2232 .order = order,
2225 .mem_cgroup = NULL, 2233 .mem_cgroup = NULL,
2226 .nodemask = nodemask, 2234 .nodemask = nodemask,
@@ -2244,7 +2252,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2244 2252
2245unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2253unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2246 gfp_t gfp_mask, bool noswap, 2254 gfp_t gfp_mask, bool noswap,
2247 unsigned int swappiness,
2248 struct zone *zone, 2255 struct zone *zone,
2249 unsigned long *nr_scanned) 2256 unsigned long *nr_scanned)
2250{ 2257{
@@ -2254,7 +2261,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2254 .may_writepage = !laptop_mode, 2261 .may_writepage = !laptop_mode,
2255 .may_unmap = 1, 2262 .may_unmap = 1,
2256 .may_swap = !noswap, 2263 .may_swap = !noswap,
2257 .swappiness = swappiness,
2258 .order = 0, 2264 .order = 0,
2259 .mem_cgroup = mem, 2265 .mem_cgroup = mem,
2260 }; 2266 };
@@ -2283,8 +2289,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2283 2289
2284unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2290unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2285 gfp_t gfp_mask, 2291 gfp_t gfp_mask,
2286 bool noswap, 2292 bool noswap)
2287 unsigned int swappiness)
2288{ 2293{
2289 struct zonelist *zonelist; 2294 struct zonelist *zonelist;
2290 unsigned long nr_reclaimed; 2295 unsigned long nr_reclaimed;
@@ -2294,7 +2299,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2294 .may_unmap = 1, 2299 .may_unmap = 1,
2295 .may_swap = !noswap, 2300 .may_swap = !noswap,
2296 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2301 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2297 .swappiness = swappiness,
2298 .order = 0, 2302 .order = 0,
2299 .mem_cgroup = mem_cont, 2303 .mem_cgroup = mem_cont,
2300 .nodemask = NULL, /* we don't care the placement */ 2304 .nodemask = NULL, /* we don't care the placement */
@@ -2445,7 +2449,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2445 * we want to put equal scanning pressure on each zone. 2449 * we want to put equal scanning pressure on each zone.
2446 */ 2450 */
2447 .nr_to_reclaim = ULONG_MAX, 2451 .nr_to_reclaim = ULONG_MAX,
2448 .swappiness = vm_swappiness,
2449 .order = order, 2452 .order = order,
2450 .mem_cgroup = NULL, 2453 .mem_cgroup = NULL,
2451 }; 2454 };
@@ -2494,6 +2497,9 @@ loop_again:
2494 high_wmark_pages(zone), 0, 0)) { 2497 high_wmark_pages(zone), 0, 0)) {
2495 end_zone = i; 2498 end_zone = i;
2496 break; 2499 break;
2500 } else {
2501 /* If balanced, clear the congested flag */
2502 zone_clear_flag(zone, ZONE_CONGESTED);
2497 } 2503 }
2498 } 2504 }
2499 if (i < 0) 2505 if (i < 0)
@@ -2915,7 +2921,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2915 .may_writepage = 1, 2921 .may_writepage = 1,
2916 .nr_to_reclaim = nr_to_reclaim, 2922 .nr_to_reclaim = nr_to_reclaim,
2917 .hibernation_mode = 1, 2923 .hibernation_mode = 1,
2918 .swappiness = vm_swappiness,
2919 .order = 0, 2924 .order = 0,
2920 }; 2925 };
2921 struct shrink_control shrink = { 2926 struct shrink_control shrink = {
@@ -3102,7 +3107,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3102 .nr_to_reclaim = max_t(unsigned long, nr_pages, 3107 .nr_to_reclaim = max_t(unsigned long, nr_pages,
3103 SWAP_CLUSTER_MAX), 3108 SWAP_CLUSTER_MAX),
3104 .gfp_mask = gfp_mask, 3109 .gfp_mask = gfp_mask,
3105 .swappiness = vm_swappiness,
3106 .order = order, 3110 .order = order,
3107 }; 3111 };
3108 struct shrink_control shrink = { 3112 struct shrink_control shrink = {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b2..d52b13d28e8f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) 662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
663#ifdef CONFIG_ZONE_DMA 663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma", 664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else 665#else
@@ -788,7 +788,7 @@ const char * const vmstat_text[] = {
788 788
789#endif /* CONFIG_VM_EVENTS_COUNTERS */ 789#endif /* CONFIG_VM_EVENTS_COUNTERS */
790}; 790};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ 791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
792 792
793 793
794#ifdef CONFIG_PROC_FS 794#ifdef CONFIG_PROC_FS