author     Jiri Kosina <jkosina@suse.cz>   2011-09-15 09:08:05 -0400
committer  Jiri Kosina <jkosina@suse.cz>   2011-09-15 09:08:18 -0400
commit     e060c38434b2caa78efe7cedaff4191040b65a15
tree       407361230bf6733f63d8e788e4b5e6566ee04818 /mm
parent     10e4ac572eeffe5317019bd7330b6058a400dfc2
parent     cc39c6a9bbdebfcf1a7dee64d83bf302bc38d941
Merge branch 'master' into for-next
Fast-forward merge with Linus to be able to merge patches
based on more recent version of the tree.
Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c     |   82
-rw-r--r--  mm/failslab.c        |   39
-rw-r--r--  mm/filemap.c         |  118
-rw-r--r--  mm/highmem.c         |    4
-rw-r--r--  mm/init-mm.c         |    2
-rw-r--r--  mm/kmemleak.c        |    2
-rw-r--r--  mm/memcontrol.c      |  471
-rw-r--r--  mm/memory-failure.c  |   92
-rw-r--r--  mm/mempolicy.c       |   25
-rw-r--r--  mm/mincore.c         |   11
-rw-r--r--  mm/oom_kill.c        |    4
-rw-r--r--  mm/page-writeback.c  |  269
-rw-r--r--  mm/page_alloc.c      |   60
-rw-r--r--  mm/rmap.c            |    4
-rw-r--r--  mm/shmem.c           | 1493
-rw-r--r--  mm/slab.c            |   99
-rw-r--r--  mm/slob.c            |    2
-rw-r--r--  mm/slub.c            |  772
-rw-r--r--  mm/swapfile.c        |   20
-rw-r--r--  mm/truncate.c        |    8
-rw-r--r--  mm/vmalloc.c         |   17
-rw-r--r--  mm/vmscan.c          |   74
-rw-r--r--  mm/vmstat.c          |    4
23 files changed, 1879 insertions, 1793 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8290b1e88257..d6edf8d14f9c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer; | |||
45 | static int bdi_sync_supers(void *); | 45 | static int bdi_sync_supers(void *); |
46 | static void sync_supers_timer_fn(unsigned long); | 46 | static void sync_supers_timer_fn(unsigned long); |
47 | 47 | ||
48 | void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
49 | { | ||
50 | if (wb1 < wb2) { | ||
51 | spin_lock(&wb1->list_lock); | ||
52 | spin_lock_nested(&wb2->list_lock, 1); | ||
53 | } else { | ||
54 | spin_lock(&wb2->list_lock); | ||
55 | spin_lock_nested(&wb1->list_lock, 1); | ||
56 | } | ||
57 | } | ||
58 | |||
48 | #ifdef CONFIG_DEBUG_FS | 59 | #ifdef CONFIG_DEBUG_FS |
49 | #include <linux/debugfs.h> | 60 | #include <linux/debugfs.h> |
50 | #include <linux/seq_file.h> | 61 | #include <linux/seq_file.h> |
@@ -67,34 +78,42 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
67 | struct inode *inode; | 78 | struct inode *inode; |
68 | 79 | ||
69 | nr_dirty = nr_io = nr_more_io = 0; | 80 | nr_dirty = nr_io = nr_more_io = 0; |
70 | spin_lock(&inode_wb_list_lock); | 81 | spin_lock(&wb->list_lock); |
71 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) | 82 | list_for_each_entry(inode, &wb->b_dirty, i_wb_list) |
72 | nr_dirty++; | 83 | nr_dirty++; |
73 | list_for_each_entry(inode, &wb->b_io, i_wb_list) | 84 | list_for_each_entry(inode, &wb->b_io, i_wb_list) |
74 | nr_io++; | 85 | nr_io++; |
75 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) | 86 | list_for_each_entry(inode, &wb->b_more_io, i_wb_list) |
76 | nr_more_io++; | 87 | nr_more_io++; |
77 | spin_unlock(&inode_wb_list_lock); | 88 | spin_unlock(&wb->list_lock); |
78 | 89 | ||
79 | global_dirty_limits(&background_thresh, &dirty_thresh); | 90 | global_dirty_limits(&background_thresh, &dirty_thresh); |
80 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 91 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
81 | 92 | ||
82 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | 93 | #define K(x) ((x) << (PAGE_SHIFT - 10)) |
83 | seq_printf(m, | 94 | seq_printf(m, |
84 | "BdiWriteback: %8lu kB\n" | 95 | "BdiWriteback: %10lu kB\n" |
85 | "BdiReclaimable: %8lu kB\n" | 96 | "BdiReclaimable: %10lu kB\n" |
86 | "BdiDirtyThresh: %8lu kB\n" | 97 | "BdiDirtyThresh: %10lu kB\n" |
87 | "DirtyThresh: %8lu kB\n" | 98 | "DirtyThresh: %10lu kB\n" |
88 | "BackgroundThresh: %8lu kB\n" | 99 | "BackgroundThresh: %10lu kB\n" |
89 | "b_dirty: %8lu\n" | 100 | "BdiWritten: %10lu kB\n" |
90 | "b_io: %8lu\n" | 101 | "BdiWriteBandwidth: %10lu kBps\n" |
91 | "b_more_io: %8lu\n" | 102 | "b_dirty: %10lu\n" |
92 | "bdi_list: %8u\n" | 103 | "b_io: %10lu\n" |
93 | "state: %8lx\n", | 104 | "b_more_io: %10lu\n" |
105 | "bdi_list: %10u\n" | ||
106 | "state: %10lx\n", | ||
94 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 107 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
95 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 108 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
96 | K(bdi_thresh), K(dirty_thresh), | 109 | K(bdi_thresh), |
97 | K(background_thresh), nr_dirty, nr_io, nr_more_io, | 110 | K(dirty_thresh), |
111 | K(background_thresh), | ||
112 | (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), | ||
113 | (unsigned long) K(bdi->write_bandwidth), | ||
114 | nr_dirty, | ||
115 | nr_io, | ||
116 | nr_more_io, | ||
98 | !list_empty(&bdi->bdi_list), bdi->state); | 117 | !list_empty(&bdi->bdi_list), bdi->state); |
99 | #undef K | 118 | #undef K |
100 | 119 | ||
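The reworked bdi_debug_stats_show() above widens the columns and adds two new lines, BdiWritten and BdiWriteBandwidth, both printed through the K() macro that converts a page count to kilobytes by shifting by (PAGE_SHIFT - 10). The INIT_BW constant introduced further down in this file expresses the initial 100 MB/s bandwidth estimate in pages per second with the same kind of shift. A small standalone check of that arithmetic; PAGE_SHIFT is fixed at 12 here purely for illustration:

/*
 * Editor's sketch, not kernel code: the unit conversions behind the stats
 * output above, with 4 KiB pages assumed.
 */
#include <stdio.h>

#define PAGE_SHIFT      12                              /* assume 4 KiB pages */
#define K(x)            ((x) << (PAGE_SHIFT - 10))      /* pages -> kB */
#define INIT_BW         (100 << (20 - PAGE_SHIFT))      /* 100 MB/s -> pages/s */

int main(void)
{
        unsigned long nr_pages = 300;

        /* 300 pages of 4 KiB each are 1200 kB */
        printf("%lu pages = %lu kB\n", nr_pages, K(nr_pages));

        /* 100 << 8 = 25600 pages/s, i.e. 102400 kB/s = 100 MB/s */
        printf("INIT_BW = %d pages/s = %d kB/s\n", INIT_BW, K(INIT_BW));
        return 0;
}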
@@ -249,18 +268,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) | |||
249 | return wb_has_dirty_io(&bdi->wb); | 268 | return wb_has_dirty_io(&bdi->wb); |
250 | } | 269 | } |
251 | 270 | ||
252 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
253 | { | ||
254 | struct writeback_control wbc = { | ||
255 | .sync_mode = WB_SYNC_NONE, | ||
256 | .older_than_this = NULL, | ||
257 | .range_cyclic = 1, | ||
258 | .nr_to_write = 1024, | ||
259 | }; | ||
260 | |||
261 | writeback_inodes_wb(&bdi->wb, &wbc); | ||
262 | } | ||
263 | |||
264 | /* | 271 | /* |
265 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() | 272 | * kupdated() used to do this. We cannot do it from the bdi_forker_thread() |
266 | * or we risk deadlocking on ->s_umount. The longer term solution would be | 273 | * or we risk deadlocking on ->s_umount. The longer term solution would be |
@@ -446,9 +453,10 @@ static int bdi_forker_thread(void *ptr) | |||
446 | if (IS_ERR(task)) { | 453 | if (IS_ERR(task)) { |
447 | /* | 454 | /* |
448 | * If thread creation fails, force writeout of | 455 | * If thread creation fails, force writeout of |
449 | * the bdi from the thread. | 456 | * the bdi from the thread. Hopefully 1024 is |
457 | * large enough for efficient IO. | ||
450 | */ | 458 | */ |
451 | bdi_flush_io(bdi); | 459 | writeback_inodes_wb(&bdi->wb, 1024); |
452 | } else { | 460 | } else { |
453 | /* | 461 | /* |
454 | * The spinlock makes sure we do not lose | 462 | * The spinlock makes sure we do not lose |
@@ -629,9 +637,15 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | |||
629 | INIT_LIST_HEAD(&wb->b_dirty); | 637 | INIT_LIST_HEAD(&wb->b_dirty); |
630 | INIT_LIST_HEAD(&wb->b_io); | 638 | INIT_LIST_HEAD(&wb->b_io); |
631 | INIT_LIST_HEAD(&wb->b_more_io); | 639 | INIT_LIST_HEAD(&wb->b_more_io); |
640 | spin_lock_init(&wb->list_lock); | ||
632 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); | 641 | setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); |
633 | } | 642 | } |
634 | 643 | ||
644 | /* | ||
645 | * Initial write bandwidth: 100 MB/s | ||
646 | */ | ||
647 | #define INIT_BW (100 << (20 - PAGE_SHIFT)) | ||
648 | |||
635 | int bdi_init(struct backing_dev_info *bdi) | 649 | int bdi_init(struct backing_dev_info *bdi) |
636 | { | 650 | { |
637 | int i, err; | 651 | int i, err; |
@@ -654,6 +668,13 @@ int bdi_init(struct backing_dev_info *bdi) | |||
654 | } | 668 | } |
655 | 669 | ||
656 | bdi->dirty_exceeded = 0; | 670 | bdi->dirty_exceeded = 0; |
671 | |||
672 | bdi->bw_time_stamp = jiffies; | ||
673 | bdi->written_stamp = 0; | ||
674 | |||
675 | bdi->write_bandwidth = INIT_BW; | ||
676 | bdi->avg_write_bandwidth = INIT_BW; | ||
677 | |||
657 | err = prop_local_init_percpu(&bdi->completions); | 678 | err = prop_local_init_percpu(&bdi->completions); |
658 | 679 | ||
659 | if (err) { | 680 | if (err) { |
@@ -677,11 +698,12 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
677 | if (bdi_has_dirty_io(bdi)) { | 698 | if (bdi_has_dirty_io(bdi)) { |
678 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | 699 | struct bdi_writeback *dst = &default_backing_dev_info.wb; |
679 | 700 | ||
680 | spin_lock(&inode_wb_list_lock); | 701 | bdi_lock_two(&bdi->wb, dst); |
681 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | 702 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); |
682 | list_splice(&bdi->wb.b_io, &dst->b_io); | 703 | list_splice(&bdi->wb.b_io, &dst->b_io); |
683 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | 704 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); |
684 | spin_unlock(&inode_wb_list_lock); | 705 | spin_unlock(&bdi->wb.list_lock); |
706 | spin_unlock(&dst->list_lock); | ||
685 | } | 707 | } |
686 | 708 | ||
687 | bdi_unregister(bdi); | 709 | bdi_unregister(bdi); |
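The new bdi_lock_two() helper takes the two per-writeback list_locks in address order, lower address first and the second with spin_lock_nested(), and bdi_destroy() now uses it so the dirty/io/more_io lists can be spliced over to the default bdi while both locks are held, replacing the old global inode_wb_list_lock without opening an ABBA deadlock window. A minimal standalone sketch of the same ordering discipline, with pthread mutexes standing in for the spinlocks; the types and names below are illustrative, not kernel APIs:

/*
 * Editor's sketch, not kernel code: any context that needs both locks takes
 * the lower-addressed one first, so two contexts can never wait on each
 * other in opposite order.
 */
#include <pthread.h>
#include <stdio.h>

struct writeback_like {
        pthread_mutex_t list_lock;
        int b_dirty;                    /* stand-in for the b_dirty list */
};

static void lock_two(struct writeback_like *a, struct writeback_like *b)
{
        if (a < b) {                    /* lower address locks first */
                pthread_mutex_lock(&a->list_lock);
                pthread_mutex_lock(&b->list_lock);
        } else {
                pthread_mutex_lock(&b->list_lock);
                pthread_mutex_lock(&a->list_lock);
        }
}

static void unlock_two(struct writeback_like *a, struct writeback_like *b)
{
        pthread_mutex_unlock(&a->list_lock);
        pthread_mutex_unlock(&b->list_lock);
}

int main(void)
{
        struct writeback_like src = { PTHREAD_MUTEX_INITIALIZER, 3 };
        struct writeback_like dst = { PTHREAD_MUTEX_INITIALIZER, 0 };

        /* splice src's dirty work over to dst under both locks */
        lock_two(&src, &dst);
        dst.b_dirty += src.b_dirty;
        src.b_dirty = 0;
        unlock_two(&src, &dst);

        printf("dst now carries %d dirty items\n", dst.b_dirty);
        return 0;
}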
diff --git a/mm/failslab.c b/mm/failslab.c
index c5f88f240ddc..0dd7b8fec71c 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -5,10 +5,6 @@ static struct { | |||
5 | struct fault_attr attr; | 5 | struct fault_attr attr; |
6 | u32 ignore_gfp_wait; | 6 | u32 ignore_gfp_wait; |
7 | int cache_filter; | 7 | int cache_filter; |
8 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
9 | struct dentry *ignore_gfp_wait_file; | ||
10 | struct dentry *cache_filter_file; | ||
11 | #endif | ||
12 | } failslab = { | 8 | } failslab = { |
13 | .attr = FAULT_ATTR_INITIALIZER, | 9 | .attr = FAULT_ATTR_INITIALIZER, |
14 | .ignore_gfp_wait = 1, | 10 | .ignore_gfp_wait = 1, |
@@ -38,32 +34,25 @@ __setup("failslab=", setup_failslab); | |||
38 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 34 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
39 | static int __init failslab_debugfs_init(void) | 35 | static int __init failslab_debugfs_init(void) |
40 | { | 36 | { |
41 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | ||
42 | struct dentry *dir; | 37 | struct dentry *dir; |
43 | int err; | 38 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
44 | |||
45 | err = init_fault_attr_dentries(&failslab.attr, "failslab"); | ||
46 | if (err) | ||
47 | return err; | ||
48 | dir = failslab.attr.dentries.dir; | ||
49 | 39 | ||
50 | failslab.ignore_gfp_wait_file = | 40 | dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); |
51 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 41 | if (IS_ERR(dir)) |
52 | &failslab.ignore_gfp_wait); | 42 | return PTR_ERR(dir); |
53 | 43 | ||
54 | failslab.cache_filter_file = | 44 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
55 | debugfs_create_bool("cache-filter", mode, dir, | 45 | &failslab.ignore_gfp_wait)) |
56 | &failslab.cache_filter); | 46 | goto fail; |
47 | if (!debugfs_create_bool("cache-filter", mode, dir, | ||
48 | &failslab.cache_filter)) | ||
49 | goto fail; | ||
57 | 50 | ||
58 | if (!failslab.ignore_gfp_wait_file || | 51 | return 0; |
59 | !failslab.cache_filter_file) { | 52 | fail: |
60 | err = -ENOMEM; | 53 | debugfs_remove_recursive(dir); |
61 | debugfs_remove(failslab.cache_filter_file); | ||
62 | debugfs_remove(failslab.ignore_gfp_wait_file); | ||
63 | cleanup_fault_attr_dentries(&failslab.attr); | ||
64 | } | ||
65 | 54 | ||
66 | return err; | 55 | return -ENOMEM; |
67 | } | 56 | } |
68 | 57 | ||
69 | late_initcall(failslab_debugfs_init); | 58 | late_initcall(failslab_debugfs_init); |
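The failslab conversion above drops the per-file dentry tracking: fault_create_debugfs_attr() returns either the attribute directory or an error pointer, and once the directory exists a failing debugfs_create_bool() is cleaned up with a single debugfs_remove_recursive() call. The error-pointer convention behind the IS_ERR()/PTR_ERR() checks can be sketched in plain C; this is a simplified re-implementation for illustration only, with a made-up creation helper standing in for the real one:

/*
 * Editor's sketch, not kernel code: one return value is either a valid
 * pointer or a small negative errno encoded into the top of the address
 * space, so callers can write "if (IS_ERR(p)) return PTR_ERR(p);" without a
 * separate status out-parameter.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error)         { return (void *)error; }
static inline long PTR_ERR(const void *ptr)     { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* hypothetical creation helper in the style used by the new code above */
static void *create_attr_dir(int simulate_failure)
{
        if (simulate_failure)
                return ERR_PTR(-ENOMEM);
        return malloc(64);              /* stands in for a dentry */
}

int main(void)
{
        void *dir = create_attr_dir(0);

        if (IS_ERR(dir)) {
                fprintf(stderr, "create failed: %ld\n", PTR_ERR(dir));
                return 1;
        }
        puts("created attribute directory");
        free(dir);
        return 0;
}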
diff --git a/mm/filemap.c b/mm/filemap.c
index 10a171113273..7771871fa353 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,7 +33,6 @@ | |||
33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 35 | #include <linux/memcontrol.h> |
36 | #include <linux/mm_inline.h> /* for page_is_file_cache() */ | ||
37 | #include <linux/cleancache.h> | 36 | #include <linux/cleancache.h> |
38 | #include "internal.h" | 37 | #include "internal.h" |
39 | 38 | ||
@@ -78,7 +77,7 @@ | |||
78 | * ->i_mutex (generic_file_buffered_write) | 77 | * ->i_mutex (generic_file_buffered_write) |
79 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) | 78 | * ->mmap_sem (fault_in_pages_readable->do_page_fault) |
80 | * | 79 | * |
81 | * inode_wb_list_lock | 80 | * bdi->wb.list_lock |
82 | * sb_lock (fs/fs-writeback.c) | 81 | * sb_lock (fs/fs-writeback.c) |
83 | * ->mapping->tree_lock (__sync_single_inode) | 82 | * ->mapping->tree_lock (__sync_single_inode) |
84 | * | 83 | * |
@@ -96,9 +95,9 @@ | |||
96 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) | 95 | * ->zone.lru_lock (check_pte_range->isolate_lru_page) |
97 | * ->private_lock (page_remove_rmap->set_page_dirty) | 96 | * ->private_lock (page_remove_rmap->set_page_dirty) |
98 | * ->tree_lock (page_remove_rmap->set_page_dirty) | 97 | * ->tree_lock (page_remove_rmap->set_page_dirty) |
99 | * inode_wb_list_lock (page_remove_rmap->set_page_dirty) | 98 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
100 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) | 99 | * ->inode->i_lock (page_remove_rmap->set_page_dirty) |
101 | * inode_wb_list_lock (zap_pte_range->set_page_dirty) | 100 | * bdi.wb->list_lock (zap_pte_range->set_page_dirty) |
102 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
103 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
104 | * | 103 | * |
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
462 | int error; | 461 | int error; |
463 | 462 | ||
464 | VM_BUG_ON(!PageLocked(page)); | 463 | VM_BUG_ON(!PageLocked(page)); |
464 | VM_BUG_ON(PageSwapBacked(page)); | ||
465 | 465 | ||
466 | error = mem_cgroup_cache_charge(page, current->mm, | 466 | error = mem_cgroup_cache_charge(page, current->mm, |
467 | gfp_mask & GFP_RECLAIM_MASK); | 467 | gfp_mask & GFP_RECLAIM_MASK); |
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
479 | if (likely(!error)) { | 479 | if (likely(!error)) { |
480 | mapping->nrpages++; | 480 | mapping->nrpages++; |
481 | __inc_zone_page_state(page, NR_FILE_PAGES); | 481 | __inc_zone_page_state(page, NR_FILE_PAGES); |
482 | if (PageSwapBacked(page)) | ||
483 | __inc_zone_page_state(page, NR_SHMEM); | ||
484 | spin_unlock_irq(&mapping->tree_lock); | 482 | spin_unlock_irq(&mapping->tree_lock); |
485 | } else { | 483 | } else { |
486 | page->mapping = NULL; | 484 | page->mapping = NULL; |
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
502 | { | 500 | { |
503 | int ret; | 501 | int ret; |
504 | 502 | ||
505 | /* | ||
506 | * Splice_read and readahead add shmem/tmpfs pages into the page cache | ||
507 | * before shmem_readpage has a chance to mark them as SwapBacked: they | ||
508 | * need to go on the anon lru below, and mem_cgroup_cache_charge | ||
509 | * (called in add_to_page_cache) needs to know where they're going too. | ||
510 | */ | ||
511 | if (mapping_cap_swap_backed(mapping)) | ||
512 | SetPageSwapBacked(page); | ||
513 | |||
514 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); | 503 | ret = add_to_page_cache(page, mapping, offset, gfp_mask); |
515 | if (ret == 0) { | 504 | if (ret == 0) |
516 | if (page_is_file_cache(page)) | 505 | lru_cache_add_file(page); |
517 | lru_cache_add_file(page); | ||
518 | else | ||
519 | lru_cache_add_anon(page); | ||
520 | } | ||
521 | return ret; | 506 | return ret; |
522 | } | 507 | } |
523 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); | 508 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); |
@@ -714,9 +699,16 @@ repeat: | |||
714 | page = radix_tree_deref_slot(pagep); | 699 | page = radix_tree_deref_slot(pagep); |
715 | if (unlikely(!page)) | 700 | if (unlikely(!page)) |
716 | goto out; | 701 | goto out; |
717 | if (radix_tree_deref_retry(page)) | 702 | if (radix_tree_exception(page)) { |
718 | goto repeat; | 703 | if (radix_tree_deref_retry(page)) |
719 | 704 | goto repeat; | |
705 | /* | ||
706 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
707 | * here as an exceptional entry: so return it without | ||
708 | * attempting to raise page count. | ||
709 | */ | ||
710 | goto out; | ||
711 | } | ||
720 | if (!page_cache_get_speculative(page)) | 712 | if (!page_cache_get_speculative(page)) |
721 | goto repeat; | 713 | goto repeat; |
722 | 714 | ||
@@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) | |||
753 | 745 | ||
754 | repeat: | 746 | repeat: |
755 | page = find_get_page(mapping, offset); | 747 | page = find_get_page(mapping, offset); |
756 | if (page) { | 748 | if (page && !radix_tree_exception(page)) { |
757 | lock_page(page); | 749 | lock_page(page); |
758 | /* Has the page been truncated? */ | 750 | /* Has the page been truncated? */ |
759 | if (unlikely(page->mapping != mapping)) { | 751 | if (unlikely(page->mapping != mapping)) { |
@@ -835,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
835 | { | 827 | { |
836 | unsigned int i; | 828 | unsigned int i; |
837 | unsigned int ret; | 829 | unsigned int ret; |
838 | unsigned int nr_found; | 830 | unsigned int nr_found, nr_skip; |
839 | 831 | ||
840 | rcu_read_lock(); | 832 | rcu_read_lock(); |
841 | restart: | 833 | restart: |
842 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 834 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
843 | (void ***)pages, start, nr_pages); | 835 | (void ***)pages, NULL, start, nr_pages); |
844 | ret = 0; | 836 | ret = 0; |
837 | nr_skip = 0; | ||
845 | for (i = 0; i < nr_found; i++) { | 838 | for (i = 0; i < nr_found; i++) { |
846 | struct page *page; | 839 | struct page *page; |
847 | repeat: | 840 | repeat: |
@@ -849,13 +842,23 @@ repeat: | |||
849 | if (unlikely(!page)) | 842 | if (unlikely(!page)) |
850 | continue; | 843 | continue; |
851 | 844 | ||
852 | /* | 845 | if (radix_tree_exception(page)) { |
853 | * This can only trigger when the entry at index 0 moves out | 846 | if (radix_tree_deref_retry(page)) { |
854 | * of or back to the root: none yet gotten, safe to restart. | 847 | /* |
855 | */ | 848 | * Transient condition which can only trigger |
856 | if (radix_tree_deref_retry(page)) { | 849 | * when entry at index 0 moves out of or back |
857 | WARN_ON(start | i); | 850 | * to root: none yet gotten, safe to restart. |
858 | goto restart; | 851 | */ |
852 | WARN_ON(start | i); | ||
853 | goto restart; | ||
854 | } | ||
855 | /* | ||
856 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
857 | * here as an exceptional entry: so skip over it - | ||
858 | * we only reach this from invalidate_mapping_pages(). | ||
859 | */ | ||
860 | nr_skip++; | ||
861 | continue; | ||
859 | } | 862 | } |
860 | 863 | ||
861 | if (!page_cache_get_speculative(page)) | 864 | if (!page_cache_get_speculative(page)) |
@@ -875,7 +878,7 @@ repeat: | |||
875 | * If all entries were removed before we could secure them, | 878 | * If all entries were removed before we could secure them, |
876 | * try again, because callers stop trying once 0 is returned. | 879 | * try again, because callers stop trying once 0 is returned. |
877 | */ | 880 | */ |
878 | if (unlikely(!ret && nr_found)) | 881 | if (unlikely(!ret && nr_found > nr_skip)) |
879 | goto restart; | 882 | goto restart; |
880 | rcu_read_unlock(); | 883 | rcu_read_unlock(); |
881 | return ret; | 884 | return ret; |
@@ -903,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
903 | rcu_read_lock(); | 906 | rcu_read_lock(); |
904 | restart: | 907 | restart: |
905 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | 908 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
906 | (void ***)pages, index, nr_pages); | 909 | (void ***)pages, NULL, index, nr_pages); |
907 | ret = 0; | 910 | ret = 0; |
908 | for (i = 0; i < nr_found; i++) { | 911 | for (i = 0; i < nr_found; i++) { |
909 | struct page *page; | 912 | struct page *page; |
@@ -912,12 +915,22 @@ repeat: | |||
912 | if (unlikely(!page)) | 915 | if (unlikely(!page)) |
913 | continue; | 916 | continue; |
914 | 917 | ||
915 | /* | 918 | if (radix_tree_exception(page)) { |
916 | * This can only trigger when the entry at index 0 moves out | 919 | if (radix_tree_deref_retry(page)) { |
917 | * of or back to the root: none yet gotten, safe to restart. | 920 | /* |
918 | */ | 921 | * Transient condition which can only trigger |
919 | if (radix_tree_deref_retry(page)) | 922 | * when entry at index 0 moves out of or back |
920 | goto restart; | 923 | * to root: none yet gotten, safe to restart. |
924 | */ | ||
925 | goto restart; | ||
926 | } | ||
927 | /* | ||
928 | * Otherwise, shmem/tmpfs must be storing a swap entry | ||
929 | * here as an exceptional entry: so stop looking for | ||
930 | * contiguous pages. | ||
931 | */ | ||
932 | break; | ||
933 | } | ||
921 | 934 | ||
922 | if (!page_cache_get_speculative(page)) | 935 | if (!page_cache_get_speculative(page)) |
923 | goto repeat; | 936 | goto repeat; |
@@ -977,12 +990,21 @@ repeat: | |||
977 | if (unlikely(!page)) | 990 | if (unlikely(!page)) |
978 | continue; | 991 | continue; |
979 | 992 | ||
980 | /* | 993 | if (radix_tree_exception(page)) { |
981 | * This can only trigger when the entry at index 0 moves out | 994 | if (radix_tree_deref_retry(page)) { |
982 | * of or back to the root: none yet gotten, safe to restart. | 995 | /* |
983 | */ | 996 | * Transient condition which can only trigger |
984 | if (radix_tree_deref_retry(page)) | 997 | * when entry at index 0 moves out of or back |
985 | goto restart; | 998 | * to root: none yet gotten, safe to restart. |
999 | */ | ||
1000 | goto restart; | ||
1001 | } | ||
1002 | /* | ||
1003 | * This function is never used on a shmem/tmpfs | ||
1004 | * mapping, so a swap entry won't be found here. | ||
1005 | */ | ||
1006 | BUG(); | ||
1007 | } | ||
986 | 1008 | ||
987 | if (!page_cache_get_speculative(page)) | 1009 | if (!page_cache_get_speculative(page)) |
988 | goto repeat; | 1010 | goto repeat; |
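The filemap.c hunks above prepare the page-cache lookup paths for shmem/tmpfs keeping swap entries directly in the radix tree: every slot read now goes through radix_tree_exception(), which covers both the transient deref-retry case and these "exceptional" swap entries, and the gang lookups learn to skip, stop at, or BUG on them depending on the caller. The underlying representation is a tagged pointer: real struct page pointers are aligned, so a low bit can mark slots that hold an encoded value instead. A standalone sketch of that encoding; the bit choice and names are illustrative, not the kernel's radix-tree API:

/*
 * Editor's sketch, not kernel code: a slot normally holds a real object
 * pointer; with a low tag bit set it instead encodes a small value (for
 * shmem/tmpfs, a swap entry). Lookup code must test the tag before treating
 * the slot as a pointer.
 */
#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT 0x2UL   /* low bits are free: real objects are aligned */

struct page_like { int index; };

static void *encode_value(unsigned long val)
{
        return (void *)((val << 2) | EXCEPTIONAL_BIT);
}

static int slot_is_exceptional(const void *slot)
{
        return ((uintptr_t)slot & EXCEPTIONAL_BIT) != 0;
}

static unsigned long decode_value(const void *slot)
{
        return (unsigned long)((uintptr_t)slot >> 2);
}

int main(void)
{
        struct page_like page = { .index = 7 };
        void *slots[2] = { &page, encode_value(12345) };  /* a page, then a "swap entry" */

        for (int i = 0; i < 2; i++) {
                if (slot_is_exceptional(slots[i]))
                        printf("slot %d: exceptional entry, value %lu\n",
                               i, decode_value(slots[i]));
                else
                        printf("slot %d: page at index %d\n",
                               i, ((struct page_like *)slots[i])->index);
        }
        return 0;
}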
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..5ef672c07f75 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -326,7 +326,7 @@ static struct page_address_slot { | |||
326 | spinlock_t lock; /* Protect this bucket's list */ | 326 | spinlock_t lock; /* Protect this bucket's list */ |
327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; | 327 | } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; |
328 | 328 | ||
329 | static struct page_address_slot *page_slot(struct page *page) | 329 | static struct page_address_slot *page_slot(const struct page *page) |
330 | { | 330 | { |
331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; | 331 | return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; |
332 | } | 332 | } |
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) | |||
337 | * | 337 | * |
338 | * Returns the page's virtual address. | 338 | * Returns the page's virtual address. |
339 | */ | 339 | */ |
340 | void *page_address(struct page *page) | 340 | void *page_address(const struct page *page) |
341 | { | 341 | { |
342 | unsigned long flags; | 342 | unsigned long flags; |
343 | void *ret; | 343 | void *ret; |
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 4019979b2637..a56a851908d2 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,7 +5,7 @@ | |||
5 | #include <linux/list.h> | 5 | #include <linux/list.h> |
6 | #include <linux/cpumask.h> | 6 | #include <linux/cpumask.h> |
7 | 7 | ||
8 | #include <asm/atomic.h> | 8 | #include <linux/atomic.h> |
9 | #include <asm/pgtable.h> | 9 | #include <asm/pgtable.h> |
10 | #include <asm/mmu.h> | 10 | #include <asm/mmu.h> |
11 | 11 | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index aacee45616fc..d6880f542f95 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -96,7 +96,7 @@ | |||
96 | 96 | ||
97 | #include <asm/sections.h> | 97 | #include <asm/sections.h> |
98 | #include <asm/processor.h> | 98 | #include <asm/processor.h> |
99 | #include <asm/atomic.h> | 99 | #include <linux/atomic.h> |
100 | 100 | ||
101 | #include <linux/kmemcheck.h> | 101 | #include <linux/kmemcheck.h> |
102 | #include <linux/kmemleak.h> | 102 | #include <linux/kmemleak.h> |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e013b8e57d25..3508777837c7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,7 +35,6 @@ | |||
35 | #include <linux/limits.h> | 35 | #include <linux/limits.h> |
36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
37 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
38 | #include <linux/shmem_fs.h> | ||
39 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
40 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
41 | #include <linux/swapops.h> | 40 | #include <linux/swapops.h> |
@@ -246,10 +245,13 @@ struct mem_cgroup { | |||
246 | * Should the accounting and control be hierarchical, per subtree? | 245 | * Should the accounting and control be hierarchical, per subtree? |
247 | */ | 246 | */ |
248 | bool use_hierarchy; | 247 | bool use_hierarchy; |
249 | atomic_t oom_lock; | 248 | |
249 | bool oom_lock; | ||
250 | atomic_t under_oom; | ||
251 | |||
250 | atomic_t refcnt; | 252 | atomic_t refcnt; |
251 | 253 | ||
252 | unsigned int swappiness; | 254 | int swappiness; |
253 | /* OOM-Killer disable */ | 255 | /* OOM-Killer disable */ |
254 | int oom_kill_disable; | 256 | int oom_kill_disable; |
255 | 257 | ||
@@ -636,27 +638,44 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
636 | preempt_enable(); | 638 | preempt_enable(); |
637 | } | 639 | } |
638 | 640 | ||
639 | static unsigned long | 641 | unsigned long |
640 | mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) | 642 | mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, |
643 | unsigned int lru_mask) | ||
641 | { | 644 | { |
642 | struct mem_cgroup_per_zone *mz; | 645 | struct mem_cgroup_per_zone *mz; |
646 | enum lru_list l; | ||
647 | unsigned long ret = 0; | ||
648 | |||
649 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
650 | |||
651 | for_each_lru(l) { | ||
652 | if (BIT(l) & lru_mask) | ||
653 | ret += MEM_CGROUP_ZSTAT(mz, l); | ||
654 | } | ||
655 | return ret; | ||
656 | } | ||
657 | |||
658 | static unsigned long | ||
659 | mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, | ||
660 | int nid, unsigned int lru_mask) | ||
661 | { | ||
643 | u64 total = 0; | 662 | u64 total = 0; |
644 | int zid; | 663 | int zid; |
645 | 664 | ||
646 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 665 | for (zid = 0; zid < MAX_NR_ZONES; zid++) |
647 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | 666 | total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); |
648 | total += MEM_CGROUP_ZSTAT(mz, idx); | 667 | |
649 | } | ||
650 | return total; | 668 | return total; |
651 | } | 669 | } |
652 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 670 | |
653 | enum lru_list idx) | 671 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, |
672 | unsigned int lru_mask) | ||
654 | { | 673 | { |
655 | int nid; | 674 | int nid; |
656 | u64 total = 0; | 675 | u64 total = 0; |
657 | 676 | ||
658 | for_each_online_node(nid) | 677 | for_each_node_state(nid, N_HIGH_MEMORY) |
659 | total += mem_cgroup_get_zonestat_node(mem, nid, idx); | 678 | total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); |
660 | return total; | 679 | return total; |
661 | } | 680 | } |
662 | 681 | ||
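The replacement helpers above fold the old single-list accessors into one interface: callers now pass an lru_mask built from BIT(lru) values and mem_cgroup_zone_nr_lru_pages() sums only the per-zone LRU counters selected by that mask, with the per-node and global variants layered on top. A standalone sketch of the bitmask-selection pattern; the counter values and the LRU_ALL_* masks below are written out for illustration:

/*
 * Editor's sketch, not kernel code: one counter array indexed by LRU list,
 * and callers pass a bitmask selecting which lists to sum.
 */
#include <stdio.h>

enum lru_list {
        LRU_INACTIVE_ANON,
        LRU_ACTIVE_ANON,
        LRU_INACTIVE_FILE,
        LRU_ACTIVE_FILE,
        LRU_UNEVICTABLE,
        NR_LRU_LISTS
};

#define BIT(n)          (1UL << (n))
#define LRU_ALL_ANON    (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL_FILE    (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL         (LRU_ALL_ANON | LRU_ALL_FILE | BIT(LRU_UNEVICTABLE))

static unsigned long nr_lru_pages(const unsigned long zstat[NR_LRU_LISTS],
                                  unsigned long lru_mask)
{
        unsigned long total = 0;

        for (int l = 0; l < NR_LRU_LISTS; l++)
                if (BIT(l) & lru_mask)          /* count only selected lists */
                        total += zstat[l];
        return total;
}

int main(void)
{
        /* inactive/active anon, inactive/active file, unevictable */
        unsigned long zstat[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

        printf("anon=%lu file=%lu all=%lu\n",
               nr_lru_pages(zstat, LRU_ALL_ANON),
               nr_lru_pages(zstat, LRU_ALL_FILE),
               nr_lru_pages(zstat, LRU_ALL));
        return 0;
}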
@@ -1043,6 +1062,21 @@ void mem_cgroup_move_lists(struct page *page, | |||
1043 | mem_cgroup_add_lru_list(page, to); | 1062 | mem_cgroup_add_lru_list(page, to); |
1044 | } | 1063 | } |
1045 | 1064 | ||
1065 | /* | ||
1066 | * Checks whether given mem is same or in the root_mem's | ||
1067 | * hierarchy subtree | ||
1068 | */ | ||
1069 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, | ||
1070 | struct mem_cgroup *mem) | ||
1071 | { | ||
1072 | if (root_mem != mem) { | ||
1073 | return (root_mem->use_hierarchy && | ||
1074 | css_is_ancestor(&mem->css, &root_mem->css)); | ||
1075 | } | ||
1076 | |||
1077 | return true; | ||
1078 | } | ||
1079 | |||
1046 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 1080 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
1047 | { | 1081 | { |
1048 | int ret; | 1082 | int ret; |
@@ -1062,10 +1096,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
1062 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | 1096 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* |
1063 | * hierarchy(even if use_hierarchy is disabled in "mem"). | 1097 | * hierarchy(even if use_hierarchy is disabled in "mem"). |
1064 | */ | 1098 | */ |
1065 | if (mem->use_hierarchy) | 1099 | ret = mem_cgroup_same_or_subtree(mem, curr); |
1066 | ret = css_is_ancestor(&curr->css, &mem->css); | ||
1067 | else | ||
1068 | ret = (curr == mem); | ||
1069 | css_put(&curr->css); | 1100 | css_put(&curr->css); |
1070 | return ret; | 1101 | return ret; |
1071 | } | 1102 | } |
@@ -1077,8 +1108,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ | |||
1077 | unsigned long gb; | 1108 | unsigned long gb; |
1078 | unsigned long inactive_ratio; | 1109 | unsigned long inactive_ratio; |
1079 | 1110 | ||
1080 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); | 1111 | inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); |
1081 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); | 1112 | active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); |
1082 | 1113 | ||
1083 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1114 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1084 | if (gb) | 1115 | if (gb) |
@@ -1117,109 +1148,12 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | |||
1117 | unsigned long active; | 1148 | unsigned long active; |
1118 | unsigned long inactive; | 1149 | unsigned long inactive; |
1119 | 1150 | ||
1120 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | 1151 | inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); |
1121 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | 1152 | active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); |
1122 | 1153 | ||
1123 | return (active > inactive); | 1154 | return (active > inactive); |
1124 | } | 1155 | } |
1125 | 1156 | ||
1126 | unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, | ||
1127 | struct zone *zone, | ||
1128 | enum lru_list lru) | ||
1129 | { | ||
1130 | int nid = zone_to_nid(zone); | ||
1131 | int zid = zone_idx(zone); | ||
1132 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
1133 | |||
1134 | return MEM_CGROUP_ZSTAT(mz, lru); | ||
1135 | } | ||
1136 | |||
1137 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | ||
1138 | int nid) | ||
1139 | { | ||
1140 | unsigned long ret; | ||
1141 | |||
1142 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + | ||
1143 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); | ||
1144 | |||
1145 | return ret; | ||
1146 | } | ||
1147 | |||
1148 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1149 | int nid) | ||
1150 | { | ||
1151 | unsigned long ret; | ||
1152 | |||
1153 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1154 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1155 | return ret; | ||
1156 | } | ||
1157 | |||
1158 | #if MAX_NUMNODES > 1 | ||
1159 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | ||
1160 | { | ||
1161 | u64 total = 0; | ||
1162 | int nid; | ||
1163 | |||
1164 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1165 | total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); | ||
1166 | |||
1167 | return total; | ||
1168 | } | ||
1169 | |||
1170 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) | ||
1171 | { | ||
1172 | u64 total = 0; | ||
1173 | int nid; | ||
1174 | |||
1175 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1176 | total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); | ||
1177 | |||
1178 | return total; | ||
1179 | } | ||
1180 | |||
1181 | static unsigned long | ||
1182 | mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) | ||
1183 | { | ||
1184 | return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); | ||
1185 | } | ||
1186 | |||
1187 | static unsigned long | ||
1188 | mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) | ||
1189 | { | ||
1190 | u64 total = 0; | ||
1191 | int nid; | ||
1192 | |||
1193 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1194 | total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); | ||
1195 | |||
1196 | return total; | ||
1197 | } | ||
1198 | |||
1199 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | ||
1200 | int nid) | ||
1201 | { | ||
1202 | enum lru_list l; | ||
1203 | u64 total = 0; | ||
1204 | |||
1205 | for_each_lru(l) | ||
1206 | total += mem_cgroup_get_zonestat_node(memcg, nid, l); | ||
1207 | |||
1208 | return total; | ||
1209 | } | ||
1210 | |||
1211 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) | ||
1212 | { | ||
1213 | u64 total = 0; | ||
1214 | int nid; | ||
1215 | |||
1216 | for_each_node_state(nid, N_HIGH_MEMORY) | ||
1217 | total += mem_cgroup_node_nr_lru_pages(memcg, nid); | ||
1218 | |||
1219 | return total; | ||
1220 | } | ||
1221 | #endif /* CONFIG_NUMA */ | ||
1222 | |||
1223 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | 1157 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, |
1224 | struct zone *zone) | 1158 | struct zone *zone) |
1225 | { | 1159 | { |
@@ -1329,7 +1263,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) | |||
1329 | return margin >> PAGE_SHIFT; | 1263 | return margin >> PAGE_SHIFT; |
1330 | } | 1264 | } |
1331 | 1265 | ||
1332 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1266 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) |
1333 | { | 1267 | { |
1334 | struct cgroup *cgrp = memcg->css.cgroup; | 1268 | struct cgroup *cgrp = memcg->css.cgroup; |
1335 | 1269 | ||
@@ -1401,10 +1335,9 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) | |||
1401 | to = mc.to; | 1335 | to = mc.to; |
1402 | if (!from) | 1336 | if (!from) |
1403 | goto unlock; | 1337 | goto unlock; |
1404 | if (from == mem || to == mem | 1338 | |
1405 | || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) | 1339 | ret = mem_cgroup_same_or_subtree(mem, from) |
1406 | || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) | 1340 | || mem_cgroup_same_or_subtree(mem, to); |
1407 | ret = true; | ||
1408 | unlock: | 1341 | unlock: |
1409 | spin_unlock(&mc.lock); | 1342 | spin_unlock(&mc.lock); |
1410 | return ret; | 1343 | return ret; |
@@ -1576,11 +1509,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1576 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | 1509 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, |
1577 | int nid, bool noswap) | 1510 | int nid, bool noswap) |
1578 | { | 1511 | { |
1579 | if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) | 1512 | if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) |
1580 | return true; | 1513 | return true; |
1581 | if (noswap || !total_swap_pages) | 1514 | if (noswap || !total_swap_pages) |
1582 | return false; | 1515 | return false; |
1583 | if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) | 1516 | if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) |
1584 | return true; | 1517 | return true; |
1585 | return false; | 1518 | return false; |
1586 | 1519 | ||
@@ -1730,7 +1663,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1730 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1663 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1731 | 1664 | ||
1732 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1665 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1733 | if (!check_soft && root_mem->memsw_is_minimum) | 1666 | if (!check_soft && !shrink && root_mem->memsw_is_minimum) |
1734 | noswap = true; | 1667 | noswap = true; |
1735 | 1668 | ||
1736 | while (1) { | 1669 | while (1) { |
@@ -1776,12 +1709,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1776 | /* we use swappiness of local cgroup */ | 1709 | /* we use swappiness of local cgroup */ |
1777 | if (check_soft) { | 1710 | if (check_soft) { |
1778 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1711 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1779 | noswap, get_swappiness(victim), zone, | 1712 | noswap, zone, &nr_scanned); |
1780 | &nr_scanned); | ||
1781 | *total_scanned += nr_scanned; | 1713 | *total_scanned += nr_scanned; |
1782 | } else | 1714 | } else |
1783 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1715 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, |
1784 | noswap, get_swappiness(victim)); | 1716 | noswap); |
1785 | css_put(&victim->css); | 1717 | css_put(&victim->css); |
1786 | /* | 1718 | /* |
1787 | * At shrinking usage, we can't check we should stop here or | 1719 | * At shrinking usage, we can't check we should stop here or |
@@ -1803,38 +1735,77 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1803 | /* | 1735 | /* |
1804 | * Check OOM-Killer is already running under our hierarchy. | 1736 | * Check OOM-Killer is already running under our hierarchy. |
1805 | * If someone is running, return false. | 1737 | * If someone is running, return false. |
1738 | * Has to be called with memcg_oom_lock | ||
1806 | */ | 1739 | */ |
1807 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | 1740 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) |
1808 | { | 1741 | { |
1809 | int x, lock_count = 0; | 1742 | struct mem_cgroup *iter, *failed = NULL; |
1810 | struct mem_cgroup *iter; | 1743 | bool cond = true; |
1811 | 1744 | ||
1812 | for_each_mem_cgroup_tree(iter, mem) { | 1745 | for_each_mem_cgroup_tree_cond(iter, mem, cond) { |
1813 | x = atomic_inc_return(&iter->oom_lock); | 1746 | if (iter->oom_lock) { |
1814 | lock_count = max(x, lock_count); | 1747 | /* |
1748 | * this subtree of our hierarchy is already locked | ||
1749 | * so we cannot give a lock. | ||
1750 | */ | ||
1751 | failed = iter; | ||
1752 | cond = false; | ||
1753 | } else | ||
1754 | iter->oom_lock = true; | ||
1815 | } | 1755 | } |
1816 | 1756 | ||
1817 | if (lock_count == 1) | 1757 | if (!failed) |
1818 | return true; | 1758 | return true; |
1759 | |||
1760 | /* | ||
1761 | * OK, we failed to lock the whole subtree so we have to clean up | ||
1762 | * what we set up to the failing subtree | ||
1763 | */ | ||
1764 | cond = true; | ||
1765 | for_each_mem_cgroup_tree_cond(iter, mem, cond) { | ||
1766 | if (iter == failed) { | ||
1767 | cond = false; | ||
1768 | continue; | ||
1769 | } | ||
1770 | iter->oom_lock = false; | ||
1771 | } | ||
1819 | return false; | 1772 | return false; |
1820 | } | 1773 | } |
1821 | 1774 | ||
1775 | /* | ||
1776 | * Has to be called with memcg_oom_lock | ||
1777 | */ | ||
1822 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) | 1778 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1823 | { | 1779 | { |
1824 | struct mem_cgroup *iter; | 1780 | struct mem_cgroup *iter; |
1825 | 1781 | ||
1782 | for_each_mem_cgroup_tree(iter, mem) | ||
1783 | iter->oom_lock = false; | ||
1784 | return 0; | ||
1785 | } | ||
1786 | |||
1787 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) | ||
1788 | { | ||
1789 | struct mem_cgroup *iter; | ||
1790 | |||
1791 | for_each_mem_cgroup_tree(iter, mem) | ||
1792 | atomic_inc(&iter->under_oom); | ||
1793 | } | ||
1794 | |||
1795 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) | ||
1796 | { | ||
1797 | struct mem_cgroup *iter; | ||
1798 | |||
1826 | /* | 1799 | /* |
1827 | * When a new child is created while the hierarchy is under oom, | 1800 | * When a new child is created while the hierarchy is under oom, |
1828 | * mem_cgroup_oom_lock() may not be called. We have to use | 1801 | * mem_cgroup_oom_lock() may not be called. We have to use |
1829 | * atomic_add_unless() here. | 1802 | * atomic_add_unless() here. |
1830 | */ | 1803 | */ |
1831 | for_each_mem_cgroup_tree(iter, mem) | 1804 | for_each_mem_cgroup_tree(iter, mem) |
1832 | atomic_add_unless(&iter->oom_lock, -1, 0); | 1805 | atomic_add_unless(&iter->under_oom, -1, 0); |
1833 | return 0; | ||
1834 | } | 1806 | } |
1835 | 1807 | ||
1836 | 1808 | static DEFINE_SPINLOCK(memcg_oom_lock); | |
1837 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1838 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1809 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1839 | 1810 | ||
1840 | struct oom_wait_info { | 1811 | struct oom_wait_info { |
@@ -1845,25 +1816,20 @@ struct oom_wait_info { | |||
1845 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1816 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1846 | unsigned mode, int sync, void *arg) | 1817 | unsigned mode, int sync, void *arg) |
1847 | { | 1818 | { |
1848 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | 1819 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, |
1820 | *oom_wait_mem; | ||
1849 | struct oom_wait_info *oom_wait_info; | 1821 | struct oom_wait_info *oom_wait_info; |
1850 | 1822 | ||
1851 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1823 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1824 | oom_wait_mem = oom_wait_info->mem; | ||
1852 | 1825 | ||
1853 | if (oom_wait_info->mem == wake_mem) | ||
1854 | goto wakeup; | ||
1855 | /* if no hierarchy, no match */ | ||
1856 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | ||
1857 | return 0; | ||
1858 | /* | 1826 | /* |
1859 | * Both of oom_wait_info->mem and wake_mem are stable under us. | 1827 | * Both of oom_wait_info->mem and wake_mem are stable under us. |
1860 | * Then we can use css_is_ancestor without taking care of RCU. | 1828 | * Then we can use css_is_ancestor without taking care of RCU. |
1861 | */ | 1829 | */ |
1862 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | 1830 | if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) |
1863 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | 1831 | && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) |
1864 | return 0; | 1832 | return 0; |
1865 | |||
1866 | wakeup: | ||
1867 | return autoremove_wake_function(wait, mode, sync, arg); | 1833 | return autoremove_wake_function(wait, mode, sync, arg); |
1868 | } | 1834 | } |
1869 | 1835 | ||
@@ -1875,7 +1841,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem) | |||
1875 | 1841 | ||
1876 | static void memcg_oom_recover(struct mem_cgroup *mem) | 1842 | static void memcg_oom_recover(struct mem_cgroup *mem) |
1877 | { | 1843 | { |
1878 | if (mem && atomic_read(&mem->oom_lock)) | 1844 | if (mem && atomic_read(&mem->under_oom)) |
1879 | memcg_wakeup_oom(mem); | 1845 | memcg_wakeup_oom(mem); |
1880 | } | 1846 | } |
1881 | 1847 | ||
@@ -1893,8 +1859,10 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1893 | owait.wait.private = current; | 1859 | owait.wait.private = current; |
1894 | INIT_LIST_HEAD(&owait.wait.task_list); | 1860 | INIT_LIST_HEAD(&owait.wait.task_list); |
1895 | need_to_kill = true; | 1861 | need_to_kill = true; |
1862 | mem_cgroup_mark_under_oom(mem); | ||
1863 | |||
1896 | /* At first, try to OOM lock hierarchy under mem.*/ | 1864 | /* At first, try to OOM lock hierarchy under mem.*/ |
1897 | mutex_lock(&memcg_oom_mutex); | 1865 | spin_lock(&memcg_oom_lock); |
1898 | locked = mem_cgroup_oom_lock(mem); | 1866 | locked = mem_cgroup_oom_lock(mem); |
1899 | /* | 1867 | /* |
1900 | * Even if signal_pending(), we can't quit charge() loop without | 1868 | * Even if signal_pending(), we can't quit charge() loop without |
@@ -1906,7 +1874,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1906 | need_to_kill = false; | 1874 | need_to_kill = false; |
1907 | if (locked) | 1875 | if (locked) |
1908 | mem_cgroup_oom_notify(mem); | 1876 | mem_cgroup_oom_notify(mem); |
1909 | mutex_unlock(&memcg_oom_mutex); | 1877 | spin_unlock(&memcg_oom_lock); |
1910 | 1878 | ||
1911 | if (need_to_kill) { | 1879 | if (need_to_kill) { |
1912 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1880 | finish_wait(&memcg_oom_waitq, &owait.wait); |
@@ -1915,10 +1883,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1915 | schedule(); | 1883 | schedule(); |
1916 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1884 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1917 | } | 1885 | } |
1918 | mutex_lock(&memcg_oom_mutex); | 1886 | spin_lock(&memcg_oom_lock); |
1919 | mem_cgroup_oom_unlock(mem); | 1887 | if (locked) |
1888 | mem_cgroup_oom_unlock(mem); | ||
1920 | memcg_wakeup_oom(mem); | 1889 | memcg_wakeup_oom(mem); |
1921 | mutex_unlock(&memcg_oom_mutex); | 1890 | spin_unlock(&memcg_oom_lock); |
1891 | |||
1892 | mem_cgroup_unmark_under_oom(mem); | ||
1922 | 1893 | ||
1923 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1894 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
1924 | return false; | 1895 | return false; |
@@ -2079,59 +2050,70 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) | |||
2079 | } | 2050 | } |
2080 | 2051 | ||
2081 | /* | 2052 | /* |
2082 | * Tries to drain stocked charges in other cpus. This function is asynchronous | 2053 | * Drains all per-CPU charge caches for given root_mem resp. subtree |
2083 | * and just put a work per cpu for draining localy on each cpu. Caller can | 2054 | * of the hierarchy under it. sync flag says whether we should block |
2084 | * expects some charges will be back to res_counter later but cannot wait for | 2055 | * until the work is done. |
2085 | * it. | ||
2086 | */ | 2056 | */ |
2087 | static void drain_all_stock_async(struct mem_cgroup *root_mem) | 2057 | static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) |
2088 | { | 2058 | { |
2089 | int cpu, curcpu; | 2059 | int cpu, curcpu; |
2090 | /* | 2060 | |
2091 | * If someone calls draining, avoid adding more kworker runs. | ||
2092 | */ | ||
2093 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2094 | return; | ||
2095 | /* Notify other cpus that system-wide "drain" is running */ | 2061 | /* Notify other cpus that system-wide "drain" is running */ |
2096 | get_online_cpus(); | 2062 | get_online_cpus(); |
2097 | /* | 2063 | curcpu = get_cpu(); |
2098 | * Get a hint for avoiding draining charges on the current cpu, | ||
2099 | * which must be exhausted by our charging. It is not required that | ||
2100 | * this be a precise check, so we use raw_smp_processor_id() instead of | ||
2101 | * getcpu()/putcpu(). | ||
2102 | */ | ||
2103 | curcpu = raw_smp_processor_id(); | ||
2104 | for_each_online_cpu(cpu) { | 2064 | for_each_online_cpu(cpu) { |
2105 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2065 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2106 | struct mem_cgroup *mem; | 2066 | struct mem_cgroup *mem; |
2107 | 2067 | ||
2108 | if (cpu == curcpu) | ||
2109 | continue; | ||
2110 | |||
2111 | mem = stock->cached; | 2068 | mem = stock->cached; |
2112 | if (!mem) | 2069 | if (!mem || !stock->nr_pages) |
2113 | continue; | 2070 | continue; |
2114 | if (mem != root_mem) { | 2071 | if (!mem_cgroup_same_or_subtree(root_mem, mem)) |
2115 | if (!root_mem->use_hierarchy) | 2072 | continue; |
2116 | continue; | 2073 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2117 | /* check whether "mem" is under tree of "root_mem" */ | 2074 | if (cpu == curcpu) |
2118 | if (!css_is_ancestor(&mem->css, &root_mem->css)) | 2075 | drain_local_stock(&stock->work); |
2119 | continue; | 2076 | else |
2077 | schedule_work_on(cpu, &stock->work); | ||
2120 | } | 2078 | } |
2121 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2122 | schedule_work_on(cpu, &stock->work); | ||
2123 | } | 2079 | } |
2080 | put_cpu(); | ||
2081 | |||
2082 | if (!sync) | ||
2083 | goto out; | ||
2084 | |||
2085 | for_each_online_cpu(cpu) { | ||
2086 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
2087 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2088 | flush_work(&stock->work); | ||
2089 | } | ||
2090 | out: | ||
2124 | put_online_cpus(); | 2091 | put_online_cpus(); |
2092 | } | ||
2093 | |||
2094 | /* | ||
2095 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
2096 | * and just put a work per cpu for draining localy on each cpu. Caller can | ||
2097 | * expects some charges will be back to res_counter later but cannot wait for | ||
2098 | * it. | ||
2099 | */ | ||
2100 | static void drain_all_stock_async(struct mem_cgroup *root_mem) | ||
2101 | { | ||
2102 | /* | ||
2103 | * If someone calls draining, avoid adding more kworker runs. | ||
2104 | */ | ||
2105 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2106 | return; | ||
2107 | drain_all_stock(root_mem, false); | ||
2125 | mutex_unlock(&percpu_charge_mutex); | 2108 | mutex_unlock(&percpu_charge_mutex); |
2126 | /* We don't wait for flush_work */ | ||
2127 | } | 2109 | } |
2128 | 2110 | ||
2129 | /* This is a synchronous drain interface. */ | 2111 | /* This is a synchronous drain interface. */ |
2130 | static void drain_all_stock_sync(void) | 2112 | static void drain_all_stock_sync(struct mem_cgroup *root_mem) |
2131 | { | 2113 | { |
2132 | /* called when force_empty is called */ | 2114 | /* called when force_empty is called */ |
2133 | mutex_lock(&percpu_charge_mutex); | 2115 | mutex_lock(&percpu_charge_mutex); |
2134 | schedule_on_each_cpu(drain_local_stock); | 2116 | drain_all_stock(root_mem, true); |
2135 | mutex_unlock(&percpu_charge_mutex); | 2117 | mutex_unlock(&percpu_charge_mutex); |
2136 | } | 2118 | } |
2137 | 2119 | ||
@@ -2784,30 +2766,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2784 | return 0; | 2766 | return 0; |
2785 | if (PageCompound(page)) | 2767 | if (PageCompound(page)) |
2786 | return 0; | 2768 | return 0; |
2787 | /* | ||
2788 | * Corner case handling. This is called from add_to_page_cache() | ||
2789 | * in usual. But some FS (shmem) precharges this page before calling it | ||
2790 | * and call add_to_page_cache() with GFP_NOWAIT. | ||
2791 | * | ||
2792 | * For GFP_NOWAIT case, the page may be pre-charged before calling | ||
2793 | * add_to_page_cache(). (See shmem.c) check it here and avoid to call | ||
2794 | * charge twice. (It works but has to pay a bit larger cost.) | ||
2795 | * And when the page is SwapCache, it should take swap information | ||
2796 | * into account. This is under lock_page() now. | ||
2797 | */ | ||
2798 | if (!(gfp_mask & __GFP_WAIT)) { | ||
2799 | struct page_cgroup *pc; | ||
2800 | |||
2801 | pc = lookup_page_cgroup(page); | ||
2802 | if (!pc) | ||
2803 | return 0; | ||
2804 | lock_page_cgroup(pc); | ||
2805 | if (PageCgroupUsed(pc)) { | ||
2806 | unlock_page_cgroup(pc); | ||
2807 | return 0; | ||
2808 | } | ||
2809 | unlock_page_cgroup(pc); | ||
2810 | } | ||
2811 | 2769 | ||
2812 | if (unlikely(!mm)) | 2770 | if (unlikely(!mm)) |
2813 | mm = &init_mm; | 2771 | mm = &init_mm; |
@@ -3397,31 +3355,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
3397 | cgroup_release_and_wakeup_rmdir(&mem->css); | 3355 | cgroup_release_and_wakeup_rmdir(&mem->css); |
3398 | } | 3356 | } |
3399 | 3357 | ||
3400 | /* | ||
3401 | * A call to try to shrink memory usage on charge failure at shmem's swapin. | ||
3402 | * Calling hierarchical_reclaim is not enough because we should update | ||
3403 | * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. | ||
3404 | * Moreover considering hierarchy, we should reclaim from the mem_over_limit, | ||
3405 | * not from the memcg which this page would be charged to. | ||
3406 | * try_charge_swapin does all of these works properly. | ||
3407 | */ | ||
3408 | int mem_cgroup_shmem_charge_fallback(struct page *page, | ||
3409 | struct mm_struct *mm, | ||
3410 | gfp_t gfp_mask) | ||
3411 | { | ||
3412 | struct mem_cgroup *mem; | ||
3413 | int ret; | ||
3414 | |||
3415 | if (mem_cgroup_disabled()) | ||
3416 | return 0; | ||
3417 | |||
3418 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | ||
3419 | if (!ret) | ||
3420 | mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ | ||
3421 | |||
3422 | return ret; | ||
3423 | } | ||
3424 | |||
3425 | #ifdef CONFIG_DEBUG_VM | 3358 | #ifdef CONFIG_DEBUG_VM |
3426 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | 3359 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) |
3427 | { | 3360 | { |
@@ -3780,7 +3713,7 @@ move_account: | |||
3780 | goto out; | 3713 | goto out; |
3781 | /* This is for making all *used* pages to be on LRU. */ | 3714 | /* This is for making all *used* pages to be on LRU. */ |
3782 | lru_add_drain_all(); | 3715 | lru_add_drain_all(); |
3783 | drain_all_stock_sync(); | 3716 | drain_all_stock_sync(mem); |
3784 | ret = 0; | 3717 | ret = 0; |
3785 | mem_cgroup_start_move(mem); | 3718 | mem_cgroup_start_move(mem); |
3786 | for_each_node_state(node, N_HIGH_MEMORY) { | 3719 | for_each_node_state(node, N_HIGH_MEMORY) { |
@@ -3826,7 +3759,7 @@ try_to_free: | |||
3826 | goto out; | 3759 | goto out; |
3827 | } | 3760 | } |
3828 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, | 3761 | progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, |
3829 | false, get_swappiness(mem)); | 3762 | false); |
3830 | if (!progress) { | 3763 | if (!progress) { |
3831 | nr_retries--; | 3764 | nr_retries--; |
3832 | /* maybe some writeback is necessary */ | 3765 | /* maybe some writeback is necessary */ |
@@ -4152,15 +4085,15 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | |||
4152 | s->stat[MCS_PGMAJFAULT] += val; | 4085 | s->stat[MCS_PGMAJFAULT] += val; |
4153 | 4086 | ||
4154 | /* per zone stat */ | 4087 | /* per zone stat */ |
4155 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 4088 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); |
4156 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | 4089 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; |
4157 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); | 4090 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); |
4158 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | 4091 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; |
4159 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); | 4092 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); |
4160 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | 4093 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; |
4161 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); | 4094 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); |
4162 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | 4095 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; |
4163 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); | 4096 | val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); |
4164 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | 4097 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; |
4165 | } | 4098 | } |
4166 | 4099 | ||
@@ -4182,35 +4115,37 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4182 | struct cgroup *cont = m->private; | 4115 | struct cgroup *cont = m->private; |
4183 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4116 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
4184 | 4117 | ||
4185 | total_nr = mem_cgroup_nr_lru_pages(mem_cont); | 4118 | total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); |
4186 | seq_printf(m, "total=%lu", total_nr); | 4119 | seq_printf(m, "total=%lu", total_nr); |
4187 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4120 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4188 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); | 4121 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); |
4189 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4122 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4190 | } | 4123 | } |
4191 | seq_putc(m, '\n'); | 4124 | seq_putc(m, '\n'); |
4192 | 4125 | ||
4193 | file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); | 4126 | file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); |
4194 | seq_printf(m, "file=%lu", file_nr); | 4127 | seq_printf(m, "file=%lu", file_nr); |
4195 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4128 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4196 | node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); | 4129 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4130 | LRU_ALL_FILE); | ||
4197 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4131 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4198 | } | 4132 | } |
4199 | seq_putc(m, '\n'); | 4133 | seq_putc(m, '\n'); |
4200 | 4134 | ||
4201 | anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); | 4135 | anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); |
4202 | seq_printf(m, "anon=%lu", anon_nr); | 4136 | seq_printf(m, "anon=%lu", anon_nr); |
4203 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4137 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4204 | node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); | 4138 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4139 | LRU_ALL_ANON); | ||
4205 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4140 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4206 | } | 4141 | } |
4207 | seq_putc(m, '\n'); | 4142 | seq_putc(m, '\n'); |
4208 | 4143 | ||
4209 | unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); | 4144 | unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); |
4210 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4145 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4211 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4146 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4212 | node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, | 4147 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, |
4213 | nid); | 4148 | BIT(LRU_UNEVICTABLE)); |
4214 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4149 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4215 | } | 4150 | } |
4216 | seq_putc(m, '\n'); | 4151 | seq_putc(m, '\n'); |
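The two hunks above replace the per-list helpers (mem_cgroup_nr_file_lru_pages(), mem_cgroup_nr_anon_lru_pages(), and so on) with a single mem_cgroup_nr_lru_pages() that takes a bitmask of LRU lists, so callers ask for LRU_ALL, LRU_ALL_FILE or BIT(LRU_UNEVICTABLE). A minimal userspace sketch of that pattern, with an invented counter array and a stripped-down enum rather than the kernel's definitions:

#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, LRU_INACTIVE_FILE,
                LRU_ACTIVE_FILE, LRU_UNEVICTABLE, NR_LRU_LISTS };

#define BIT(n)        (1UL << (n))
#define LRU_ALL_FILE  (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON  (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
#define LRU_ALL       (LRU_ALL_FILE | LRU_ALL_ANON | BIT(LRU_UNEVICTABLE))

/* One helper sums whichever lists the caller selects via a mask. */
static unsigned long nr_lru_pages(const unsigned long counts[NR_LRU_LISTS],
                                  unsigned long lru_mask)
{
        unsigned long total = 0;
        int lru;

        for (lru = 0; lru < NR_LRU_LISTS; lru++)
                if (lru_mask & BIT(lru))
                        total += counts[lru];
        return total;
}

int main(void)
{
        unsigned long counts[NR_LRU_LISTS] = { 10, 20, 30, 40, 5 };

        printf("total=%lu file=%lu anon=%lu unevictable=%lu\n",
               nr_lru_pages(counts, LRU_ALL),
               nr_lru_pages(counts, LRU_ALL_FILE),
               nr_lru_pages(counts, LRU_ALL_ANON),
               nr_lru_pages(counts, BIT(LRU_UNEVICTABLE)));
        return 0;
}

One accumulator loop then serves every caller, and adding a new LRU list only touches the enum and the masks.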
@@ -4288,7 +4223,7 @@ static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) | |||
4288 | { | 4223 | { |
4289 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 4224 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4290 | 4225 | ||
4291 | return get_swappiness(memcg); | 4226 | return mem_cgroup_swappiness(memcg); |
4292 | } | 4227 | } |
4293 | 4228 | ||
4294 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | 4229 | static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, |
@@ -4578,15 +4513,15 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4578 | if (!event) | 4513 | if (!event) |
4579 | return -ENOMEM; | 4514 | return -ENOMEM; |
4580 | 4515 | ||
4581 | mutex_lock(&memcg_oom_mutex); | 4516 | spin_lock(&memcg_oom_lock); |
4582 | 4517 | ||
4583 | event->eventfd = eventfd; | 4518 | event->eventfd = eventfd; |
4584 | list_add(&event->list, &memcg->oom_notify); | 4519 | list_add(&event->list, &memcg->oom_notify); |
4585 | 4520 | ||
4586 | /* already in OOM ? */ | 4521 | /* already in OOM ? */ |
4587 | if (atomic_read(&memcg->oom_lock)) | 4522 | if (atomic_read(&memcg->under_oom)) |
4588 | eventfd_signal(eventfd, 1); | 4523 | eventfd_signal(eventfd, 1); |
4589 | mutex_unlock(&memcg_oom_mutex); | 4524 | spin_unlock(&memcg_oom_lock); |
4590 | 4525 | ||
4591 | return 0; | 4526 | return 0; |
4592 | } | 4527 | } |
@@ -4600,7 +4535,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4600 | 4535 | ||
4601 | BUG_ON(type != _OOM_TYPE); | 4536 | BUG_ON(type != _OOM_TYPE); |
4602 | 4537 | ||
4603 | mutex_lock(&memcg_oom_mutex); | 4538 | spin_lock(&memcg_oom_lock); |
4604 | 4539 | ||
4605 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | 4540 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { |
4606 | if (ev->eventfd == eventfd) { | 4541 | if (ev->eventfd == eventfd) { |
@@ -4609,7 +4544,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4609 | } | 4544 | } |
4610 | } | 4545 | } |
4611 | 4546 | ||
4612 | mutex_unlock(&memcg_oom_mutex); | 4547 | spin_unlock(&memcg_oom_lock); |
4613 | } | 4548 | } |
4614 | 4549 | ||
4615 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | 4550 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, |
@@ -4619,7 +4554,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | |||
4619 | 4554 | ||
4620 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | 4555 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); |
4621 | 4556 | ||
4622 | if (atomic_read(&mem->oom_lock)) | 4557 | if (atomic_read(&mem->under_oom)) |
4623 | cb->fill(cb, "under_oom", 1); | 4558 | cb->fill(cb, "under_oom", 1); |
4624 | else | 4559 | else |
4625 | cb->fill(cb, "under_oom", 0); | 4560 | cb->fill(cb, "under_oom", 0); |
@@ -4997,7 +4932,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4997 | INIT_LIST_HEAD(&mem->oom_notify); | 4932 | INIT_LIST_HEAD(&mem->oom_notify); |
4998 | 4933 | ||
4999 | if (parent) | 4934 | if (parent) |
5000 | mem->swappiness = get_swappiness(parent); | 4935 | mem->swappiness = mem_cgroup_swappiness(parent); |
5001 | atomic_set(&mem->refcnt, 1); | 4936 | atomic_set(&mem->refcnt, 1); |
5002 | mem->move_charge_at_immigrate = 0; | 4937 | mem->move_charge_at_immigrate = 0; |
5003 | mutex_init(&mem->thresholds_lock); | 4938 | mutex_init(&mem->thresholds_lock); |
@@ -5181,15 +5116,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5181 | pgoff = pte_to_pgoff(ptent); | 5116 | pgoff = pte_to_pgoff(ptent); |
5182 | 5117 | ||
5183 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 5118 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
5184 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | 5119 | page = find_get_page(mapping, pgoff); |
5185 | page = find_get_page(mapping, pgoff); | 5120 | |
5186 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | 5121 | #ifdef CONFIG_SWAP |
5187 | swp_entry_t ent; | 5122 | /* shmem/tmpfs may report page out on swap: account for that too. */ |
5188 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | 5123 | if (radix_tree_exceptional_entry(page)) { |
5124 | swp_entry_t swap = radix_to_swp_entry(page); | ||
5189 | if (do_swap_account) | 5125 | if (do_swap_account) |
5190 | entry->val = ent.val; | 5126 | *entry = swap; |
5127 | page = find_get_page(&swapper_space, swap.val); | ||
5191 | } | 5128 | } |
5192 | 5129 | #endif | |
5193 | return page; | 5130 | return page; |
5194 | } | 5131 | } |
5195 | 5132 | ||
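mc_handle_file_pte() now expects find_get_page() on a shmem mapping to hand back a radix-tree exceptional entry when the page has gone out to swap, and decodes it with radix_to_swp_entry(). The underlying trick is a tagged value: the low bit says "this slot holds an encoded swap slot, not a page pointer". A rough userspace sketch of such an encoding, with made-up helpers rather than the kernel's radix-tree API:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT 0x1UL   /* low bit tags a non-pointer entry */

/* Encode a swap slot number as a tagged value that can share a slot
 * with ordinary (aligned, low-bit-clear) pointers. */
static void *swp_to_entry(unsigned long swp)
{
        return (void *)(uintptr_t)((swp << 1) | EXCEPTIONAL_BIT);
}

static int entry_is_exceptional(const void *entry)
{
        return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long entry_to_swp(const void *entry)
{
        return (unsigned long)((uintptr_t)entry >> 1);
}

int main(void)
{
        int page = 42;                      /* stands in for a struct page */
        void *slot_a = &page;               /* normal cache entry */
        void *slot_b = swp_to_entry(1234);  /* page is out on swap */

        assert(!entry_is_exceptional(slot_a));
        assert(entry_is_exceptional(slot_b));
        printf("slot_b encodes swap slot %lu\n", entry_to_swp(slot_b));
        return 0;
}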
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059c..2b43ba051ac9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <linux/hugetlb.h> | 53 | #include <linux/hugetlb.h> |
54 | #include <linux/memory_hotplug.h> | 54 | #include <linux/memory_hotplug.h> |
55 | #include <linux/mm_inline.h> | 55 | #include <linux/mm_inline.h> |
56 | #include <linux/kfifo.h> | ||
56 | #include "internal.h" | 57 | #include "internal.h" |
57 | 58 | ||
58 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 59 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno) | |||
1178 | __memory_failure(pfn, trapno, 0); | 1179 | __memory_failure(pfn, trapno, 0); |
1179 | } | 1180 | } |
1180 | 1181 | ||
1182 | #define MEMORY_FAILURE_FIFO_ORDER 4 | ||
1183 | #define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) | ||
1184 | |||
1185 | struct memory_failure_entry { | ||
1186 | unsigned long pfn; | ||
1187 | int trapno; | ||
1188 | int flags; | ||
1189 | }; | ||
1190 | |||
1191 | struct memory_failure_cpu { | ||
1192 | DECLARE_KFIFO(fifo, struct memory_failure_entry, | ||
1193 | MEMORY_FAILURE_FIFO_SIZE); | ||
1194 | spinlock_t lock; | ||
1195 | struct work_struct work; | ||
1196 | }; | ||
1197 | |||
1198 | static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); | ||
1199 | |||
1200 | /** | ||
1201 | * memory_failure_queue - Schedule handling memory failure of a page. | ||
1202 | * @pfn: Page Number of the corrupted page | ||
1203 | * @trapno: Trap number reported in the signal to user space. | ||
1204 | * @flags: Flags for memory failure handling | ||
1205 | * | ||
1206 | * This function is called by the low level hardware error handler | ||
1207 | * when it detects hardware memory corruption of a page. It schedules | ||
1208 | * the recovery of the error page, including dropping pages, killing | ||
1209 | * processes etc. | ||
1210 | * | ||
1211 | * The function is primarily of use for corruptions that | ||
1212 | * happen outside the current execution context (e.g. when | ||
1213 | * detected by a background scrubber) | ||
1214 | * | ||
1215 | * Can run in IRQ context. | ||
1216 | */ | ||
1217 | void memory_failure_queue(unsigned long pfn, int trapno, int flags) | ||
1218 | { | ||
1219 | struct memory_failure_cpu *mf_cpu; | ||
1220 | unsigned long proc_flags; | ||
1221 | struct memory_failure_entry entry = { | ||
1222 | .pfn = pfn, | ||
1223 | .trapno = trapno, | ||
1224 | .flags = flags, | ||
1225 | }; | ||
1226 | |||
1227 | mf_cpu = &get_cpu_var(memory_failure_cpu); | ||
1228 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1229 | if (kfifo_put(&mf_cpu->fifo, &entry)) | ||
1230 | schedule_work_on(smp_processor_id(), &mf_cpu->work); | ||
1231 | else | ||
1232 | pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n", | ||
1233 | pfn); | ||
1234 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1235 | put_cpu_var(memory_failure_cpu); | ||
1236 | } | ||
1237 | EXPORT_SYMBOL_GPL(memory_failure_queue); | ||
1238 | |||
1239 | static void memory_failure_work_func(struct work_struct *work) | ||
1240 | { | ||
1241 | struct memory_failure_cpu *mf_cpu; | ||
1242 | struct memory_failure_entry entry = { 0, }; | ||
1243 | unsigned long proc_flags; | ||
1244 | int gotten; | ||
1245 | |||
1246 | mf_cpu = &__get_cpu_var(memory_failure_cpu); | ||
1247 | for (;;) { | ||
1248 | spin_lock_irqsave(&mf_cpu->lock, proc_flags); | ||
1249 | gotten = kfifo_get(&mf_cpu->fifo, &entry); | ||
1250 | spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); | ||
1251 | if (!gotten) | ||
1252 | break; | ||
1253 | __memory_failure(entry.pfn, entry.trapno, entry.flags); | ||
1254 | } | ||
1255 | } | ||
1256 | |||
1257 | static int __init memory_failure_init(void) | ||
1258 | { | ||
1259 | struct memory_failure_cpu *mf_cpu; | ||
1260 | int cpu; | ||
1261 | |||
1262 | for_each_possible_cpu(cpu) { | ||
1263 | mf_cpu = &per_cpu(memory_failure_cpu, cpu); | ||
1264 | spin_lock_init(&mf_cpu->lock); | ||
1265 | INIT_KFIFO(mf_cpu->fifo); | ||
1266 | INIT_WORK(&mf_cpu->work, memory_failure_work_func); | ||
1267 | } | ||
1268 | |||
1269 | return 0; | ||
1270 | } | ||
1271 | core_initcall(memory_failure_init); | ||
1272 | |||
1181 | /** | 1273 | /** |
1182 | * unpoison_memory - Unpoison a previously poisoned page | 1274 | * unpoison_memory - Unpoison a previously poisoned page |
1183 | * @pfn: Page number of the to be unpoisoned page | 1275 | * @pfn: Page number of the to be unpoisoned page |
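memory_failure_queue() above deliberately does almost nothing in the caller's (possibly IRQ) context: it takes a spinlock, drops a small entry into a per-CPU kfifo and schedules a work item; the work function later drains the fifo and runs the expensive __memory_failure() in process context. A single-threaded userspace sketch of that produce-now, handle-later shape, with an invented ring buffer standing in for the kfifo:

#include <stdio.h>

#define FIFO_SIZE 16    /* power of two, like 1 << MEMORY_FAILURE_FIFO_ORDER */

struct entry { unsigned long pfn; int trapno; int flags; };

struct fifo {
        struct entry buf[FIFO_SIZE];
        unsigned int head, tail;        /* head == tail means empty */
};

/* Producer side: cheap enough to run from an interrupt-like context. */
static int fifo_put(struct fifo *f, struct entry e)
{
        if (f->head - f->tail == FIFO_SIZE)
                return 0;               /* full: caller reports overflow */
        f->buf[f->head++ % FIFO_SIZE] = e;
        return 1;
}

/* Consumer side: the deferred "work function" drains everything queued. */
static int fifo_get(struct fifo *f, struct entry *e)
{
        if (f->head == f->tail)
                return 0;
        *e = f->buf[f->tail++ % FIFO_SIZE];
        return 1;
}

static void handle_memory_failure(struct entry e)
{
        printf("recovering pfn %#lx (trap %d, flags %d)\n",
               e.pfn, e.trapno, e.flags);
}

int main(void)
{
        struct fifo f = { .head = 0, .tail = 0 };
        struct entry e;

        /* "IRQ context": just queue, do not do the heavy work here. */
        fifo_put(&f, (struct entry){ 0x1000, 14, 0 });
        fifo_put(&f, (struct entry){ 0x2000, 14, 0 });

        /* "Work function": loop until the fifo is empty. */
        while (fifo_get(&f, &e))
                handle_memory_failure(e);
        return 0;
}

The real code additionally keeps one fifo, lock and work item per CPU, so producers on different CPUs never contend with each other.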
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e7fb9d25c54e..9c51f9f58cac 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -93,6 +93,7 @@ | |||
93 | 93 | ||
94 | #include <asm/tlbflush.h> | 94 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 95 | #include <asm/uaccess.h> |
96 | #include <linux/random.h> | ||
96 | 97 | ||
97 | #include "internal.h" | 98 | #include "internal.h" |
98 | 99 | ||
@@ -635,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
635 | struct vm_area_struct *prev; | 636 | struct vm_area_struct *prev; |
636 | struct vm_area_struct *vma; | 637 | struct vm_area_struct *vma; |
637 | int err = 0; | 638 | int err = 0; |
638 | pgoff_t pgoff; | ||
639 | unsigned long vmstart; | 639 | unsigned long vmstart; |
640 | unsigned long vmend; | 640 | unsigned long vmend; |
641 | 641 | ||
@@ -648,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, | |||
648 | vmstart = max(start, vma->vm_start); | 648 | vmstart = max(start, vma->vm_start); |
649 | vmend = min(end, vma->vm_end); | 649 | vmend = min(end, vma->vm_end); |
650 | 650 | ||
651 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | ||
652 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, | 651 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
653 | vma->anon_vma, vma->vm_file, pgoff, new_pol); | 652 | vma->anon_vma, vma->vm_file, vma->vm_pgoff, |
653 | new_pol); | ||
654 | if (prev) { | 654 | if (prev) { |
655 | vma = prev; | 655 | vma = prev; |
656 | next = vma->vm_next; | 656 | next = vma->vm_next; |
@@ -1411,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, | |||
1411 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); | 1411 | err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); |
1412 | 1412 | ||
1413 | if (!err && nmask) { | 1413 | if (!err && nmask) { |
1414 | err = copy_from_user(bm, nm, alloc_size); | 1414 | unsigned long copy_size; |
1415 | copy_size = min_t(unsigned long, sizeof(bm), alloc_size); | ||
1416 | err = copy_from_user(bm, nm, copy_size); | ||
1415 | /* ensure entire bitmap is zeroed */ | 1417 | /* ensure entire bitmap is zeroed */ |
1416 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); | 1418 | err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); |
1417 | err |= compat_put_bitmap(nmask, bm, nr_bits); | 1419 | err |= compat_put_bitmap(nmask, bm, nr_bits); |
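The compat_sys_get_mempolicy() hunk clamps the copy length to the size of the on-stack bitmap before copy_from_user(), so a large user-supplied maxnode can no longer overrun bm. The rule being applied is plain C hygiene: never copy a caller-chosen length into a fixed-size buffer. A small userspace sketch, with invented buffer names:

#include <stdio.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Copy at most dst_size bytes, whatever length the caller requested. */
static size_t copy_clamped(unsigned char *dst, size_t dst_size,
                           const unsigned char *src, size_t requested)
{
        size_t copy_size = MIN(dst_size, requested);

        memcpy(dst, src, copy_size);
        return copy_size;
}

int main(void)
{
        unsigned char user_buf[256];
        unsigned char bm[64];

        memset(user_buf, 0xff, sizeof(user_buf));
        /* A request for 256 bytes is silently clamped to sizeof(bm). */
        printf("copied %zu bytes\n",
               copy_clamped(bm, sizeof(bm), user_buf, sizeof(user_buf)));
        return 0;
}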
@@ -1645,6 +1647,21 @@ static inline unsigned interleave_nid(struct mempolicy *pol, | |||
1645 | return interleave_nodes(pol); | 1647 | return interleave_nodes(pol); |
1646 | } | 1648 | } |
1647 | 1649 | ||
1650 | /* | ||
1651 | * Return the bit number of a random bit set in the nodemask. | ||
1652 | * (returns -1 if nodemask is empty) | ||
1653 | */ | ||
1654 | int node_random(const nodemask_t *maskp) | ||
1655 | { | ||
1656 | int w, bit = -1; | ||
1657 | |||
1658 | w = nodes_weight(*maskp); | ||
1659 | if (w) | ||
1660 | bit = bitmap_ord_to_pos(maskp->bits, | ||
1661 | get_random_int() % w, MAX_NUMNODES); | ||
1662 | return bit; | ||
1663 | } | ||
1664 | |||
1648 | #ifdef CONFIG_HUGETLBFS | 1665 | #ifdef CONFIG_HUGETLBFS |
1649 | /* | 1666 | /* |
1650 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) | 1667 | * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) |
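node_random() above picks a uniformly random set bit from a nodemask by first counting the set bits (nodes_weight) and then walking to the chosen ordinal with bitmap_ord_to_pos(). The same two-step shape is easy to reproduce on a plain unsigned long; a sketch that uses rand() where the kernel uses get_random_int():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static int popcount(unsigned long x)
{
        int n = 0;

        while (x) {
                x &= x - 1;     /* clear the lowest set bit */
                n++;
        }
        return n;
}

/* Return the bit number of a random set bit in mask, or -1 if it is empty. */
static int mask_random_bit(unsigned long mask)
{
        int weight = popcount(mask);
        int target, bit;

        if (!weight)
                return -1;
        target = rand() % weight;       /* ordinal of the bit we want */

        for (bit = 0; ; bit++) {
                if (!(mask & (1UL << bit)))
                        continue;
                if (target-- == 0)
                        return bit;
        }
}

int main(void)
{
        unsigned long nodes = (1UL << 0) | (1UL << 2) | (1UL << 5);

        srand((unsigned)time(NULL));
        printf("picked node %d\n", mask_random_bit(nodes));
        printf("empty mask gives %d\n", mask_random_bit(0));
        return 0;
}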
diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c76..636a86876ff2 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
69 | * file will not get a swp_entry_t in its pte, but rather it is like | 69 | * file will not get a swp_entry_t in its pte, but rather it is like |
70 | * any other file mapping (ie. marked !present and faulted in with | 70 | * any other file mapping (ie. marked !present and faulted in with |
71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. | 71 | * tmpfs's .fault). So swapped out tmpfs mappings are tested here. |
72 | * | ||
73 | * However when tmpfs moves the page from pagecache and into swapcache, | ||
74 | * it is still in core, but the find_get_page below won't find it. | ||
75 | * No big deal, but make a note of it. | ||
76 | */ | 72 | */ |
77 | page = find_get_page(mapping, pgoff); | 73 | page = find_get_page(mapping, pgoff); |
74 | #ifdef CONFIG_SWAP | ||
75 | /* shmem/tmpfs may return swap: account for swapcache page too. */ | ||
76 | if (radix_tree_exceptional_entry(page)) { | ||
77 | swp_entry_t swap = radix_to_swp_entry(page); | ||
78 | page = find_get_page(&swapper_space, swap.val); | ||
79 | } | ||
80 | #endif | ||
78 | if (page) { | 81 | if (page) { |
79 | present = PageUptodate(page); | 82 | present = PageUptodate(page); |
80 | page_cache_release(page); | 83 | page_cache_release(page); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eafff89b3dd6..626303b52f3c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
303 | do_each_thread(g, p) { | 303 | do_each_thread(g, p) { |
304 | unsigned int points; | 304 | unsigned int points; |
305 | 305 | ||
306 | if (!p->mm) | 306 | if (p->exit_state) |
307 | continue; | 307 | continue; |
308 | if (oom_unkillable_task(p, mem, nodemask)) | 308 | if (oom_unkillable_task(p, mem, nodemask)) |
309 | continue; | 309 | continue; |
@@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
319 | */ | 319 | */ |
320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) | 320 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) |
321 | return ERR_PTR(-1UL); | 321 | return ERR_PTR(-1UL); |
322 | if (!p->mm) | ||
323 | continue; | ||
322 | 324 | ||
323 | if (p->flags & PF_EXITING) { | 325 | if (p->flags & PF_EXITING) { |
324 | /* | 326 | /* |
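The oom_kill hunks reorder the checks in select_bad_process(): the old "no mm, skip it" test at the top becomes an exit_state check, and the mm test moves below the TIF_MEMDIE test, so a chosen victim that has already dropped its mm still makes the scan back off instead of letting a second victim be killed. A toy sketch of that ordering with invented task fields, just to show why the order matters:

#include <stdio.h>
#include <stddef.h>

struct task {
        const char *name;
        int exit_state;     /* already exited */
        int has_mm;         /* still owns an address space */
        int memdie;         /* already selected as OOM victim */
        int badness;
};

/* Returns the task to kill, or NULL with *abort set when a victim is
 * already dying and the caller should wait instead of killing again. */
static struct task *select_victim(struct task *tasks, int n, int *abort)
{
        struct task *chosen = NULL;
        int i;

        *abort = 0;
        for (i = 0; i < n; i++) {
                struct task *p = &tasks[i];

                if (p->exit_state)
                        continue;
                if (p->memdie) {        /* must be seen even if !has_mm */
                        *abort = 1;
                        return NULL;
                }
                if (!p->has_mm)
                        continue;
                if (!chosen || p->badness > chosen->badness)
                        chosen = p;
        }
        return chosen;
}

int main(void)
{
        struct task tasks[] = {
                { "a",      0, 1, 0, 10 },
                { "victim", 0, 0, 1, 99 },   /* dying: mm already gone */
                { "b",      0, 1, 0, 50 },
        };
        int abort;
        struct task *t = select_victim(tasks, 3, &abort);

        if (abort)
                printf("back off: a victim is already exiting\n");
        else
                printf("kill %s\n", t ? t->name : "(none)");
        return 0;
}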
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8767b381b9c..0e309cd1b5b9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -37,6 +37,16 @@ | |||
37 | #include <trace/events/writeback.h> | 37 | #include <trace/events/writeback.h> |
38 | 38 | ||
39 | /* | 39 | /* |
40 | * Sleep at most 200ms at a time in balance_dirty_pages(). | ||
41 | */ | ||
42 | #define MAX_PAUSE max(HZ/5, 1) | ||
43 | |||
44 | /* | ||
45 | * Estimate write bandwidth at 200ms intervals. | ||
46 | */ | ||
47 | #define BANDWIDTH_INTERVAL max(HZ/5, 1) | ||
48 | |||
49 | /* | ||
40 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 50 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
41 | * will look to see if it needs to force writeback or throttling. | 51 | * will look to see if it needs to force writeback or throttling. |
42 | */ | 52 | */ |
@@ -111,6 +121,7 @@ EXPORT_SYMBOL(laptop_mode); | |||
111 | 121 | ||
112 | /* End of sysctl-exported parameters */ | 122 | /* End of sysctl-exported parameters */ |
113 | 123 | ||
124 | unsigned long global_dirty_limit; | ||
114 | 125 | ||
115 | /* | 126 | /* |
116 | * Scale the writeback cache size proportional to the relative writeout speeds. | 127 | * Scale the writeback cache size proportional to the relative writeout speeds. |
@@ -219,6 +230,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
219 | */ | 230 | */ |
220 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 231 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
221 | { | 232 | { |
233 | __inc_bdi_stat(bdi, BDI_WRITTEN); | ||
222 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 234 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, |
223 | bdi->max_prop_frac); | 235 | bdi->max_prop_frac); |
224 | } | 236 | } |
@@ -244,13 +256,8 @@ void task_dirty_inc(struct task_struct *tsk) | |||
244 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 256 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
245 | long *numerator, long *denominator) | 257 | long *numerator, long *denominator) |
246 | { | 258 | { |
247 | if (bdi_cap_writeback_dirty(bdi)) { | 259 | prop_fraction_percpu(&vm_completions, &bdi->completions, |
248 | prop_fraction_percpu(&vm_completions, &bdi->completions, | ||
249 | numerator, denominator); | 260 | numerator, denominator); |
250 | } else { | ||
251 | *numerator = 0; | ||
252 | *denominator = 1; | ||
253 | } | ||
254 | } | 261 | } |
255 | 262 | ||
256 | static inline void task_dirties_fraction(struct task_struct *tsk, | 263 | static inline void task_dirties_fraction(struct task_struct *tsk, |
@@ -274,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
274 | * effectively curb the growth of dirty pages. Light dirtiers with high enough | 281 | * effectively curb the growth of dirty pages. Light dirtiers with high enough |
275 | * dirty threshold may never get throttled. | 282 | * dirty threshold may never get throttled. |
276 | */ | 283 | */ |
284 | #define TASK_LIMIT_FRACTION 8 | ||
277 | static unsigned long task_dirty_limit(struct task_struct *tsk, | 285 | static unsigned long task_dirty_limit(struct task_struct *tsk, |
278 | unsigned long bdi_dirty) | 286 | unsigned long bdi_dirty) |
279 | { | 287 | { |
280 | long numerator, denominator; | 288 | long numerator, denominator; |
281 | unsigned long dirty = bdi_dirty; | 289 | unsigned long dirty = bdi_dirty; |
282 | u64 inv = dirty >> 3; | 290 | u64 inv = dirty / TASK_LIMIT_FRACTION; |
283 | 291 | ||
284 | task_dirties_fraction(tsk, &numerator, &denominator); | 292 | task_dirties_fraction(tsk, &numerator, &denominator); |
285 | inv *= numerator; | 293 | inv *= numerator; |
@@ -290,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk, | |||
290 | return max(dirty, bdi_dirty/2); | 298 | return max(dirty, bdi_dirty/2); |
291 | } | 299 | } |
292 | 300 | ||
301 | /* Minimum limit for any task */ | ||
302 | static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) | ||
303 | { | ||
304 | return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION; | ||
305 | } | ||
306 | |||
293 | /* | 307 | /* |
294 | * | 308 | * |
295 | */ | 309 | */ |
@@ -397,6 +411,11 @@ unsigned long determine_dirtyable_memory(void) | |||
397 | return x + 1; /* Ensure that we never return 0 */ | 411 | return x + 1; /* Ensure that we never return 0 */ |
398 | } | 412 | } |
399 | 413 | ||
414 | static unsigned long hard_dirty_limit(unsigned long thresh) | ||
415 | { | ||
416 | return max(thresh, global_dirty_limit); | ||
417 | } | ||
418 | |||
400 | /* | 419 | /* |
401 | * global_dirty_limits - background-writeback and dirty-throttling thresholds | 420 | * global_dirty_limits - background-writeback and dirty-throttling thresholds |
402 | * | 421 | * |
@@ -435,12 +454,20 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
435 | } | 454 | } |
436 | *pbackground = background; | 455 | *pbackground = background; |
437 | *pdirty = dirty; | 456 | *pdirty = dirty; |
457 | trace_global_dirty_state(background, dirty); | ||
438 | } | 458 | } |
439 | 459 | ||
440 | /* | 460 | /** |
441 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold | 461 | * bdi_dirty_limit - @bdi's share of dirty throttling threshold |
462 | * @bdi: the backing_dev_info to query | ||
463 | * @dirty: global dirty limit in pages | ||
464 | * | ||
465 | * Returns @bdi's dirty limit in pages. The term "dirty" in the context of | ||
466 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
467 | * And the "limit" in the name is not seriously taken as hard limit in | ||
468 | * balance_dirty_pages(). | ||
442 | * | 469 | * |
443 | * Allocate high/low dirty limits to fast/slow devices, in order to prevent | 470 | * It allocates high/low dirty limits to fast/slow devices, in order to prevent |
444 | * - starving fast devices | 471 | * - starving fast devices |
445 | * - piling up dirty pages (that will take long time to sync) on slow devices | 472 | * - piling up dirty pages (that will take long time to sync) on slow devices |
446 | * | 473 | * |
@@ -468,6 +495,153 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
468 | return bdi_dirty; | 495 | return bdi_dirty; |
469 | } | 496 | } |
470 | 497 | ||
498 | static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | ||
499 | unsigned long elapsed, | ||
500 | unsigned long written) | ||
501 | { | ||
502 | const unsigned long period = roundup_pow_of_two(3 * HZ); | ||
503 | unsigned long avg = bdi->avg_write_bandwidth; | ||
504 | unsigned long old = bdi->write_bandwidth; | ||
505 | u64 bw; | ||
506 | |||
507 | /* | ||
508 | * bw = written * HZ / elapsed | ||
509 | * | ||
510 | * bw * elapsed + write_bandwidth * (period - elapsed) | ||
511 | * write_bandwidth = --------------------------------------------------- | ||
512 | * period | ||
513 | */ | ||
514 | bw = written - bdi->written_stamp; | ||
515 | bw *= HZ; | ||
516 | if (unlikely(elapsed > period)) { | ||
517 | do_div(bw, elapsed); | ||
518 | avg = bw; | ||
519 | goto out; | ||
520 | } | ||
521 | bw += (u64)bdi->write_bandwidth * (period - elapsed); | ||
522 | bw >>= ilog2(period); | ||
523 | |||
524 | /* | ||
525 | * one more level of smoothing, for filtering out sudden spikes | ||
526 | */ | ||
527 | if (avg > old && old >= (unsigned long)bw) | ||
528 | avg -= (avg - old) >> 3; | ||
529 | |||
530 | if (avg < old && old <= (unsigned long)bw) | ||
531 | avg += (old - avg) >> 3; | ||
532 | |||
533 | out: | ||
534 | bdi->write_bandwidth = bw; | ||
535 | bdi->avg_write_bandwidth = avg; | ||
536 | } | ||
537 | |||
538 | /* | ||
539 | * The global dirtyable memory and dirty threshold could be suddenly knocked | ||
540 | * down by a large amount (eg. on the startup of KVM in a swapless system). | ||
541 | * This may throw the system into deep dirty exceeded state and throttle | ||
542 | * heavy/light dirtiers alike. To retain good responsiveness, maintain | ||
543 | * global_dirty_limit for tracking slowly down to the knocked down dirty | ||
544 | * threshold. | ||
545 | */ | ||
546 | static void update_dirty_limit(unsigned long thresh, unsigned long dirty) | ||
547 | { | ||
548 | unsigned long limit = global_dirty_limit; | ||
549 | |||
550 | /* | ||
551 | * Follow up in one step. | ||
552 | */ | ||
553 | if (limit < thresh) { | ||
554 | limit = thresh; | ||
555 | goto update; | ||
556 | } | ||
557 | |||
558 | /* | ||
559 | * Follow down slowly. Use the higher one as the target, because thresh | ||
560 | * may drop below dirty. This is exactly the reason to introduce | ||
561 | * global_dirty_limit which is guaranteed to lie above the dirty pages. | ||
562 | */ | ||
563 | thresh = max(thresh, dirty); | ||
564 | if (limit > thresh) { | ||
565 | limit -= (limit - thresh) >> 5; | ||
566 | goto update; | ||
567 | } | ||
568 | return; | ||
569 | update: | ||
570 | global_dirty_limit = limit; | ||
571 | } | ||
572 | |||
573 | static void global_update_bandwidth(unsigned long thresh, | ||
574 | unsigned long dirty, | ||
575 | unsigned long now) | ||
576 | { | ||
577 | static DEFINE_SPINLOCK(dirty_lock); | ||
578 | static unsigned long update_time; | ||
579 | |||
580 | /* | ||
581 | * check locklessly first to optimize away locking for the most time | ||
582 | */ | ||
583 | if (time_before(now, update_time + BANDWIDTH_INTERVAL)) | ||
584 | return; | ||
585 | |||
586 | spin_lock(&dirty_lock); | ||
587 | if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) { | ||
588 | update_dirty_limit(thresh, dirty); | ||
589 | update_time = now; | ||
590 | } | ||
591 | spin_unlock(&dirty_lock); | ||
592 | } | ||
593 | |||
594 | void __bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
595 | unsigned long thresh, | ||
596 | unsigned long dirty, | ||
597 | unsigned long bdi_thresh, | ||
598 | unsigned long bdi_dirty, | ||
599 | unsigned long start_time) | ||
600 | { | ||
601 | unsigned long now = jiffies; | ||
602 | unsigned long elapsed = now - bdi->bw_time_stamp; | ||
603 | unsigned long written; | ||
604 | |||
605 | /* | ||
606 | * rate-limit, only update once every 200ms. | ||
607 | */ | ||
608 | if (elapsed < BANDWIDTH_INTERVAL) | ||
609 | return; | ||
610 | |||
611 | written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); | ||
612 | |||
613 | /* | ||
614 | * Skip quiet periods when disk bandwidth is under-utilized. | ||
615 | * (at least 1s idle time between two flusher runs) | ||
616 | */ | ||
617 | if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) | ||
618 | goto snapshot; | ||
619 | |||
620 | if (thresh) | ||
621 | global_update_bandwidth(thresh, dirty, now); | ||
622 | |||
623 | bdi_update_write_bandwidth(bdi, elapsed, written); | ||
624 | |||
625 | snapshot: | ||
626 | bdi->written_stamp = written; | ||
627 | bdi->bw_time_stamp = now; | ||
628 | } | ||
629 | |||
630 | static void bdi_update_bandwidth(struct backing_dev_info *bdi, | ||
631 | unsigned long thresh, | ||
632 | unsigned long dirty, | ||
633 | unsigned long bdi_thresh, | ||
634 | unsigned long bdi_dirty, | ||
635 | unsigned long start_time) | ||
636 | { | ||
637 | if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) | ||
638 | return; | ||
639 | spin_lock(&bdi->wb.list_lock); | ||
640 | __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, | ||
641 | start_time); | ||
642 | spin_unlock(&bdi->wb.list_lock); | ||
643 | } | ||
644 | |||
471 | /* | 645 | /* |
472 | * balance_dirty_pages() must be called by processes which are generating dirty | 646 | * balance_dirty_pages() must be called by processes which are generating dirty |
473 | * data. It looks at the number of dirty pages in the machine and will force | 647 | * data. It looks at the number of dirty pages in the machine and will force |
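bdi_update_write_bandwidth() above blends the bandwidth observed over the last interval (written * HZ / elapsed) with the previous estimate, weighting by elapsed against a roughly three second period, and then applies a second >>3 smoothing pass to avg_write_bandwidth to damp spikes. The arithmetic can be checked in isolation; a userspace sketch of the same formula, where HZ and the rounded-up period are assumed values:

#include <stdio.h>

#define HZ      1000UL
#define PERIOD  4096UL          /* roundup_pow_of_two(3 * HZ) for HZ = 1000 */

struct bw_state {
        unsigned long write_bandwidth;      /* pages per second, estimate */
        unsigned long avg_write_bandwidth;  /* extra-smoothed estimate */
};

static void update_write_bandwidth(struct bw_state *s,
                                   unsigned long elapsed,  /* jiffies */
                                   unsigned long written)  /* pages since last stamp */
{
        unsigned long avg = s->avg_write_bandwidth;
        unsigned long old = s->write_bandwidth;
        unsigned long long bw;

        /* bw = written * HZ / elapsed, blended with the old estimate:
         * new = (bw * elapsed + old * (PERIOD - elapsed)) / PERIOD */
        bw = (unsigned long long)written * HZ;
        if (elapsed > PERIOD) {
                bw /= elapsed;          /* interval too long: trust it fully */
                avg = (unsigned long)bw;
        } else {
                bw += (unsigned long long)old * (PERIOD - elapsed);
                bw /= PERIOD;
                /* second-level smoothing filters out sudden spikes */
                if (avg > old && old >= (unsigned long)bw)
                        avg -= (avg - old) >> 3;
                if (avg < old && old <= (unsigned long)bw)
                        avg += (old - avg) >> 3;
        }
        s->write_bandwidth = (unsigned long)bw;
        s->avg_write_bandwidth = avg;
}

int main(void)
{
        struct bw_state s = { 100 << 10, 100 << 10 };   /* arbitrary start, pages/s */
        int i;

        /* feed five 200ms intervals of 20000 written pages each */
        for (i = 0; i < 5; i++) {
                update_write_bandwidth(&s, HZ / 5, 20000);
                printf("bw=%lu avg=%lu\n",
                       s.write_bandwidth, s.avg_write_bandwidth);
        }
        return 0;
}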
@@ -478,27 +652,25 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) | |||
478 | static void balance_dirty_pages(struct address_space *mapping, | 652 | static void balance_dirty_pages(struct address_space *mapping, |
479 | unsigned long write_chunk) | 653 | unsigned long write_chunk) |
480 | { | 654 | { |
481 | long nr_reclaimable, bdi_nr_reclaimable; | 655 | unsigned long nr_reclaimable, bdi_nr_reclaimable; |
482 | long nr_writeback, bdi_nr_writeback; | 656 | unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ |
657 | unsigned long bdi_dirty; | ||
483 | unsigned long background_thresh; | 658 | unsigned long background_thresh; |
484 | unsigned long dirty_thresh; | 659 | unsigned long dirty_thresh; |
485 | unsigned long bdi_thresh; | 660 | unsigned long bdi_thresh; |
661 | unsigned long task_bdi_thresh; | ||
662 | unsigned long min_task_bdi_thresh; | ||
486 | unsigned long pages_written = 0; | 663 | unsigned long pages_written = 0; |
487 | unsigned long pause = 1; | 664 | unsigned long pause = 1; |
488 | bool dirty_exceeded = false; | 665 | bool dirty_exceeded = false; |
666 | bool clear_dirty_exceeded = true; | ||
489 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 667 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
668 | unsigned long start_time = jiffies; | ||
490 | 669 | ||
491 | for (;;) { | 670 | for (;;) { |
492 | struct writeback_control wbc = { | ||
493 | .sync_mode = WB_SYNC_NONE, | ||
494 | .older_than_this = NULL, | ||
495 | .nr_to_write = write_chunk, | ||
496 | .range_cyclic = 1, | ||
497 | }; | ||
498 | |||
499 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + | 671 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
500 | global_page_state(NR_UNSTABLE_NFS); | 672 | global_page_state(NR_UNSTABLE_NFS); |
501 | nr_writeback = global_page_state(NR_WRITEBACK); | 673 | nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); |
502 | 674 | ||
503 | global_dirty_limits(&background_thresh, &dirty_thresh); | 675 | global_dirty_limits(&background_thresh, &dirty_thresh); |
504 | 676 | ||
@@ -507,12 +679,12 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
507 | * catch-up. This avoids (excessively) small writeouts | 679 | * catch-up. This avoids (excessively) small writeouts |
508 | * when the bdi limits are ramping up. | 680 | * when the bdi limits are ramping up. |
509 | */ | 681 | */ |
510 | if (nr_reclaimable + nr_writeback <= | 682 | if (nr_dirty <= (background_thresh + dirty_thresh) / 2) |
511 | (background_thresh + dirty_thresh) / 2) | ||
512 | break; | 683 | break; |
513 | 684 | ||
514 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); | 685 | bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); |
515 | bdi_thresh = task_dirty_limit(current, bdi_thresh); | 686 | min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh); |
687 | task_bdi_thresh = task_dirty_limit(current, bdi_thresh); | ||
516 | 688 | ||
517 | /* | 689 | /* |
518 | * In order to avoid the stacked BDI deadlock we need | 690 | * In order to avoid the stacked BDI deadlock we need |
@@ -524,12 +696,14 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
524 | * actually dirty; with m+n sitting in the percpu | 696 | * actually dirty; with m+n sitting in the percpu |
525 | * deltas. | 697 | * deltas. |
526 | */ | 698 | */ |
527 | if (bdi_thresh < 2*bdi_stat_error(bdi)) { | 699 | if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { |
528 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); | 700 | bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); |
529 | bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); | 701 | bdi_dirty = bdi_nr_reclaimable + |
702 | bdi_stat_sum(bdi, BDI_WRITEBACK); | ||
530 | } else { | 703 | } else { |
531 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); | 704 | bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); |
532 | bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); | 705 | bdi_dirty = bdi_nr_reclaimable + |
706 | bdi_stat(bdi, BDI_WRITEBACK); | ||
533 | } | 707 | } |
534 | 708 | ||
535 | /* | 709 | /* |
@@ -538,9 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
538 | * bdi or process from holding back light ones; The latter is | 712 | * bdi or process from holding back light ones; The latter is |
539 | * the last resort safeguard. | 713 | * the last resort safeguard. |
540 | */ | 714 | */ |
541 | dirty_exceeded = | 715 | dirty_exceeded = (bdi_dirty > task_bdi_thresh) || |
542 | (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) | 716 | (nr_dirty > dirty_thresh); |
543 | || (nr_reclaimable + nr_writeback > dirty_thresh); | 717 | clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && |
718 | (nr_dirty <= dirty_thresh); | ||
544 | 719 | ||
545 | if (!dirty_exceeded) | 720 | if (!dirty_exceeded) |
546 | break; | 721 | break; |
@@ -548,6 +723,9 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
548 | if (!bdi->dirty_exceeded) | 723 | if (!bdi->dirty_exceeded) |
549 | bdi->dirty_exceeded = 1; | 724 | bdi->dirty_exceeded = 1; |
550 | 725 | ||
726 | bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, | ||
727 | bdi_thresh, bdi_dirty, start_time); | ||
728 | |||
551 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. | 729 | /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. |
552 | * Unstable writes are a feature of certain networked | 730 | * Unstable writes are a feature of certain networked |
553 | * filesystems (i.e. NFS) in which data may have been | 731 | * filesystems (i.e. NFS) in which data may have been |
@@ -557,17 +735,29 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
557 | * threshold otherwise wait until the disk writes catch | 735 | * threshold otherwise wait until the disk writes catch |
558 | * up. | 736 | * up. |
559 | */ | 737 | */ |
560 | trace_wbc_balance_dirty_start(&wbc, bdi); | 738 | trace_balance_dirty_start(bdi); |
561 | if (bdi_nr_reclaimable > bdi_thresh) { | 739 | if (bdi_nr_reclaimable > task_bdi_thresh) { |
562 | writeback_inodes_wb(&bdi->wb, &wbc); | 740 | pages_written += writeback_inodes_wb(&bdi->wb, |
563 | pages_written += write_chunk - wbc.nr_to_write; | 741 | write_chunk); |
564 | trace_wbc_balance_dirty_written(&wbc, bdi); | 742 | trace_balance_dirty_written(bdi, pages_written); |
565 | if (pages_written >= write_chunk) | 743 | if (pages_written >= write_chunk) |
566 | break; /* We've done our duty */ | 744 | break; /* We've done our duty */ |
567 | } | 745 | } |
568 | trace_wbc_balance_dirty_wait(&wbc, bdi); | ||
569 | __set_current_state(TASK_UNINTERRUPTIBLE); | 746 | __set_current_state(TASK_UNINTERRUPTIBLE); |
570 | io_schedule_timeout(pause); | 747 | io_schedule_timeout(pause); |
748 | trace_balance_dirty_wait(bdi); | ||
749 | |||
750 | dirty_thresh = hard_dirty_limit(dirty_thresh); | ||
751 | /* | ||
752 | * max-pause area. If dirty exceeded but still within this | ||
753 | * area, no need to sleep for more than 200ms: (a) 8 pages per | ||
754 | * 200ms is typically more than enough to curb heavy dirtiers; | ||
755 | * (b) the pause time limit makes the dirtiers more responsive. | ||
756 | */ | ||
757 | if (nr_dirty < dirty_thresh && | ||
758 | bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && | ||
759 | time_after(jiffies, start_time + MAX_PAUSE)) | ||
760 | break; | ||
571 | 761 | ||
572 | /* | 762 | /* |
573 | * Increase the delay for each loop, up to our previous | 763 | * Increase the delay for each loop, up to our previous |
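The loop above now bounds how long a dirtier can be parked in balance_dirty_pages(): the per-iteration pause still doubles up to HZ/10, but once the task has spent about MAX_PAUSE in the loop and the dirty counts are only modestly over the limits, it breaks out instead of sleeping again. A sketch of that capped exponential backoff, with the break condition reduced to the elapsed-time part:

#include <stdio.h>

#define HZ        1000UL
#define MAX_PAUSE (HZ / 5)      /* never throttle a task for much more than ~200ms */

int main(void)
{
        unsigned long start = 0;        /* pretend jiffies at entry */
        unsigned long now = start;
        unsigned long pause = 1;        /* first sleep: 1 jiffy */
        unsigned long total = 0;
        int over_limit = 1;             /* pretend we stay mildly over the limit */

        while (over_limit) {
                now += pause;           /* stands in for io_schedule_timeout(pause) */
                total += pause;

                /* max-pause: already waited ~200ms in total, stop sleeping
                 * (the kernel also checks that the counts are only mildly over) */
                if (now - start > MAX_PAUSE) {
                        printf("break after %lu jiffies of total pause\n", total);
                        break;
                }

                /* exponential backoff, capped at HZ/10 per iteration */
                pause <<= 1;
                if (pause > HZ / 10)
                        pause = HZ / 10;
                printf("sleep again, next pause = %lu jiffies\n", pause);
        }
        return 0;
}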
@@ -578,7 +768,8 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
578 | pause = HZ / 10; | 768 | pause = HZ / 10; |
579 | } | 769 | } |
580 | 770 | ||
581 | if (!dirty_exceeded && bdi->dirty_exceeded) | 771 | /* Clear dirty_exceeded flag only when no task can exceed the limit */ |
772 | if (clear_dirty_exceeded && bdi->dirty_exceeded) | ||
582 | bdi->dirty_exceeded = 0; | 773 | bdi->dirty_exceeded = 0; |
583 | 774 | ||
584 | if (writeback_in_progress(bdi)) | 775 | if (writeback_in_progress(bdi)) |
@@ -626,9 +817,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | |||
626 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 817 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
627 | unsigned long nr_pages_dirtied) | 818 | unsigned long nr_pages_dirtied) |
628 | { | 819 | { |
820 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
629 | unsigned long ratelimit; | 821 | unsigned long ratelimit; |
630 | unsigned long *p; | 822 | unsigned long *p; |
631 | 823 | ||
824 | if (!bdi_cap_account_dirty(bdi)) | ||
825 | return; | ||
826 | |||
632 | ratelimit = ratelimit_pages; | 827 | ratelimit = ratelimit_pages; |
633 | if (mapping->backing_dev_info->dirty_exceeded) | 828 | if (mapping->backing_dev_info->dirty_exceeded) |
634 | ratelimit = 8; | 829 | ratelimit = 8; |
@@ -892,12 +1087,12 @@ int write_cache_pages(struct address_space *mapping, | |||
892 | range_whole = 1; | 1087 | range_whole = 1; |
893 | cycled = 1; /* ignore range_cyclic tests */ | 1088 | cycled = 1; /* ignore range_cyclic tests */ |
894 | } | 1089 | } |
895 | if (wbc->sync_mode == WB_SYNC_ALL) | 1090 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
896 | tag = PAGECACHE_TAG_TOWRITE; | 1091 | tag = PAGECACHE_TAG_TOWRITE; |
897 | else | 1092 | else |
898 | tag = PAGECACHE_TAG_DIRTY; | 1093 | tag = PAGECACHE_TAG_DIRTY; |
899 | retry: | 1094 | retry: |
900 | if (wbc->sync_mode == WB_SYNC_ALL) | 1095 | if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) |
901 | tag_pages_for_writeback(mapping, index, end); | 1096 | tag_pages_for_writeback(mapping, index, end); |
902 | done_index = index; | 1097 | done_index = index; |
903 | while (!done && (index <= end)) { | 1098 | while (!done && (index <= end)) { |
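write_cache_pages() now takes the tag-then-write path for wbc->tagged_writepages as well as WB_SYNC_ALL: pages dirty at the start are tagged first, and the walk then writes only tagged pages, so pages dirtied while the walk is running cannot keep it looping forever. A sketch of that two-pass idea on a plain array, with invented flags and helpers:

#include <stdio.h>

#define NR_PAGES 8

struct fake_page { int dirty; int towrite; };

/* Pass 1: remember which pages were dirty when the sync started. */
static void tag_pages_for_writeback(struct fake_page *pages, int n)
{
        int i;

        for (i = 0; i < n; i++)
                pages[i].towrite = pages[i].dirty;
}

/* Pass 2: only pages tagged in pass 1 are written; anything dirtied
 * after tagging is left for the next writeback pass (no livelock). */
static int write_tagged_pages(struct fake_page *pages, int n)
{
        int i, written = 0;

        for (i = 0; i < n; i++) {
                if (!pages[i].towrite)
                        continue;
                pages[i].towrite = 0;
                pages[i].dirty = 0;
                written++;

                /* simulate a writer redirtying a later page mid-walk */
                if (i == 2)
                        pages[n - 1].dirty = 1;
        }
        return written;
}

int main(void)
{
        struct fake_page pages[NR_PAGES] = {
                { 1, 0 }, { 0, 0 }, { 1, 0 }, { 1, 0 },
                { 0, 0 }, { 1, 0 }, { 0, 0 }, { 0, 0 },
        };

        tag_pages_for_writeback(pages, NR_PAGES);
        printf("wrote %d pages tagged at sync start\n",
               write_tagged_pages(pages, NR_PAGES));
        return 0;
}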
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 094472377d81..6e8ecb6e021c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1370,21 +1370,12 @@ failed: | |||
1370 | 1370 | ||
1371 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1371 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1372 | 1372 | ||
1373 | static struct fail_page_alloc_attr { | 1373 | static struct { |
1374 | struct fault_attr attr; | 1374 | struct fault_attr attr; |
1375 | 1375 | ||
1376 | u32 ignore_gfp_highmem; | 1376 | u32 ignore_gfp_highmem; |
1377 | u32 ignore_gfp_wait; | 1377 | u32 ignore_gfp_wait; |
1378 | u32 min_order; | 1378 | u32 min_order; |
1379 | |||
1380 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | ||
1381 | |||
1382 | struct dentry *ignore_gfp_highmem_file; | ||
1383 | struct dentry *ignore_gfp_wait_file; | ||
1384 | struct dentry *min_order_file; | ||
1385 | |||
1386 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | ||
1387 | |||
1388 | } fail_page_alloc = { | 1379 | } fail_page_alloc = { |
1389 | .attr = FAULT_ATTR_INITIALIZER, | 1380 | .attr = FAULT_ATTR_INITIALIZER, |
1390 | .ignore_gfp_wait = 1, | 1381 | .ignore_gfp_wait = 1, |
@@ -1418,36 +1409,27 @@ static int __init fail_page_alloc_debugfs(void) | |||
1418 | { | 1409 | { |
1419 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 1410 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
1420 | struct dentry *dir; | 1411 | struct dentry *dir; |
1421 | int err; | ||
1422 | |||
1423 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | ||
1424 | "fail_page_alloc"); | ||
1425 | if (err) | ||
1426 | return err; | ||
1427 | dir = fail_page_alloc.attr.dentries.dir; | ||
1428 | |||
1429 | fail_page_alloc.ignore_gfp_wait_file = | ||
1430 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
1431 | &fail_page_alloc.ignore_gfp_wait); | ||
1432 | |||
1433 | fail_page_alloc.ignore_gfp_highmem_file = | ||
1434 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
1435 | &fail_page_alloc.ignore_gfp_highmem); | ||
1436 | fail_page_alloc.min_order_file = | ||
1437 | debugfs_create_u32("min-order", mode, dir, | ||
1438 | &fail_page_alloc.min_order); | ||
1439 | |||
1440 | if (!fail_page_alloc.ignore_gfp_wait_file || | ||
1441 | !fail_page_alloc.ignore_gfp_highmem_file || | ||
1442 | !fail_page_alloc.min_order_file) { | ||
1443 | err = -ENOMEM; | ||
1444 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | ||
1445 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | ||
1446 | debugfs_remove(fail_page_alloc.min_order_file); | ||
1447 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | ||
1448 | } | ||
1449 | 1412 | ||
1450 | return err; | 1413 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
1414 | &fail_page_alloc.attr); | ||
1415 | if (IS_ERR(dir)) | ||
1416 | return PTR_ERR(dir); | ||
1417 | |||
1418 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
1419 | &fail_page_alloc.ignore_gfp_wait)) | ||
1420 | goto fail; | ||
1421 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
1422 | &fail_page_alloc.ignore_gfp_highmem)) | ||
1423 | goto fail; | ||
1424 | if (!debugfs_create_u32("min-order", mode, dir, | ||
1425 | &fail_page_alloc.min_order)) | ||
1426 | goto fail; | ||
1427 | |||
1428 | return 0; | ||
1429 | fail: | ||
1430 | debugfs_remove_recursive(dir); | ||
1431 | |||
1432 | return -ENOMEM; | ||
1451 | } | 1433 | } |
1452 | 1434 | ||
1453 | late_initcall(fail_page_alloc_debugfs); | 1435 | late_initcall(fail_page_alloc_debugfs); |
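The rewritten fail_page_alloc_debugfs() drops the per-file dentry fields: it creates the directory with fault_create_debugfs_attr(), jumps to a single fail: label on the first debugfs_create_*() failure, and tears everything down with one debugfs_remove_recursive(). The general C shape, create several resources and unwind them all from one label, looks like this sketch in which the "resources" are just heap buffers:

#include <stdio.h>
#include <stdlib.h>

struct attrs {
        char *ignore_gfp_wait;
        char *ignore_gfp_highmem;
        char *min_order;
};

/* Create three "attributes"; on any failure release whatever was
 * already created from a single error label, like the debugfs code. */
static int create_attrs(struct attrs *a)
{
        *a = (struct attrs){ NULL, NULL, NULL };

        a->ignore_gfp_wait = malloc(32);
        if (!a->ignore_gfp_wait)
                goto fail;
        a->ignore_gfp_highmem = malloc(32);
        if (!a->ignore_gfp_highmem)
                goto fail;
        a->min_order = malloc(32);
        if (!a->min_order)
                goto fail;
        return 0;

fail:
        /* free(NULL) is a no-op, so one unwind path covers every case */
        free(a->min_order);
        free(a->ignore_gfp_highmem);
        free(a->ignore_gfp_wait);
        *a = (struct attrs){ NULL, NULL, NULL };
        return -1;
}

int main(void)
{
        struct attrs a;

        if (create_attrs(&a) == 0)
                printf("all attributes created\n");
        else
                printf("creation failed, everything released\n");

        free(a.min_order);
        free(a.ignore_gfp_highmem);
        free(a.ignore_gfp_wait);
        return 0;
}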
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -31,11 +31,11 @@ | |||
31 | * mmlist_lock (in mmput, drain_mmlist and others) | 31 | * mmlist_lock (in mmput, drain_mmlist and others) |
32 | * mapping->private_lock (in __set_page_dirty_buffers) | 32 | * mapping->private_lock (in __set_page_dirty_buffers) |
33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) | 33 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
34 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 35 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_wb_list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
diff --git a/mm/shmem.c b/mm/shmem.c index 5cc21f8b4cd3..32f6763f16fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -6,7 +6,8 @@ | |||
6 | * 2000-2001 Christoph Rohland | 6 | * 2000-2001 Christoph Rohland |
7 | * 2000-2001 SAP AG | 7 | * 2000-2001 SAP AG |
8 | * 2002 Red Hat Inc. | 8 | * 2002 Red Hat Inc. |
9 | * Copyright (C) 2002-2005 Hugh Dickins. | 9 | * Copyright (C) 2002-2011 Hugh Dickins. |
10 | * Copyright (C) 2011 Google Inc. | ||
10 | * Copyright (C) 2002-2005 VERITAS Software Corporation. | 11 | * Copyright (C) 2002-2005 VERITAS Software Corporation. |
11 | * Copyright (C) 2004 Andi Kleen, SuSE Labs | 12 | * Copyright (C) 2004 Andi Kleen, SuSE Labs |
12 | * | 13 | * |
@@ -28,7 +29,6 @@ | |||
28 | #include <linux/file.h> | 29 | #include <linux/file.h> |
29 | #include <linux/mm.h> | 30 | #include <linux/mm.h> |
30 | #include <linux/module.h> | 31 | #include <linux/module.h> |
31 | #include <linux/percpu_counter.h> | ||
32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
33 | 33 | ||
34 | static struct vfsmount *shm_mnt; | 34 | static struct vfsmount *shm_mnt; |
@@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt; | |||
51 | #include <linux/shmem_fs.h> | 51 | #include <linux/shmem_fs.h> |
52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | ||
55 | #include <linux/percpu_counter.h> | ||
54 | #include <linux/splice.h> | 56 | #include <linux/splice.h> |
55 | #include <linux/security.h> | 57 | #include <linux/security.h> |
56 | #include <linux/swapops.h> | 58 | #include <linux/swapops.h> |
@@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt; | |||
63 | #include <linux/magic.h> | 65 | #include <linux/magic.h> |
64 | 66 | ||
65 | #include <asm/uaccess.h> | 67 | #include <asm/uaccess.h> |
66 | #include <asm/div64.h> | ||
67 | #include <asm/pgtable.h> | 68 | #include <asm/pgtable.h> |
68 | 69 | ||
69 | /* | ||
70 | * The maximum size of a shmem/tmpfs file is limited by the maximum size of | ||
71 | * its triple-indirect swap vector - see illustration at shmem_swp_entry(). | ||
72 | * | ||
73 | * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, | ||
74 | * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum | ||
75 | * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, | ||
76 | * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. | ||
77 | * | ||
78 | * We use / and * instead of shifts in the definitions below, so that the swap | ||
79 | * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. | ||
80 | */ | ||
81 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) | ||
82 | #define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) | ||
83 | |||
84 | #define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) | ||
85 | #define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) | ||
86 | |||
87 | #define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) | ||
88 | #define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) | ||
89 | |||
90 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | 70 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) |
91 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) | 71 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) |
92 | 72 | ||
93 | /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ | ||
94 | #define SHMEM_PAGEIN VM_READ | ||
95 | #define SHMEM_TRUNCATE VM_WRITE | ||
96 | |||
97 | /* Definition to limit shmem_truncate's steps between cond_rescheds */ | ||
98 | #define LATENCY_LIMIT 64 | ||
99 | |||
100 | /* Pretend that each entry is of this size in directory's i_size */ | 73 | /* Pretend that each entry is of this size in directory's i_size */ |
101 | #define BOGO_DIRENT_SIZE 20 | 74 | #define BOGO_DIRENT_SIZE 20 |
102 | 75 | ||
76 | /* Symlink up to this size is kmalloc'ed instead of using a swappable page */ | ||
77 | #define SHORT_SYMLINK_LEN 128 | ||
78 | |||
103 | struct shmem_xattr { | 79 | struct shmem_xattr { |
104 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ | 80 | struct list_head list; /* anchored by shmem_inode_info->xattr_list */ |
105 | char *name; /* xattr name */ | 81 | char *name; /* xattr name */ |
@@ -107,7 +83,7 @@ struct shmem_xattr { | |||
107 | char value[0]; | 83 | char value[0]; |
108 | }; | 84 | }; |
109 | 85 | ||
110 | /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ | 86 | /* Flag allocation requirements to shmem_getpage */ |
111 | enum sgp_type { | 87 | enum sgp_type { |
112 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 88 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
113 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
@@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index, | |||
137 | mapping_gfp_mask(inode->i_mapping), fault_type); | 113 | mapping_gfp_mask(inode->i_mapping), fault_type); |
138 | } | 114 | } |
139 | 115 | ||
140 | static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | ||
141 | { | ||
142 | /* | ||
143 | * The above definition of ENTRIES_PER_PAGE, and the use of | ||
144 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | ||
145 | * might be reconsidered if it ever diverges from PAGE_SIZE. | ||
146 | * | ||
147 | * Mobility flags are masked out as swap vectors cannot move | ||
148 | */ | ||
149 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, | ||
150 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
151 | } | ||
152 | |||
153 | static inline void shmem_dir_free(struct page *page) | ||
154 | { | ||
155 | __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); | ||
156 | } | ||
157 | |||
158 | static struct page **shmem_dir_map(struct page *page) | ||
159 | { | ||
160 | return (struct page **)kmap_atomic(page, KM_USER0); | ||
161 | } | ||
162 | |||
163 | static inline void shmem_dir_unmap(struct page **dir) | ||
164 | { | ||
165 | kunmap_atomic(dir, KM_USER0); | ||
166 | } | ||
167 | |||
168 | static swp_entry_t *shmem_swp_map(struct page *page) | ||
169 | { | ||
170 | return (swp_entry_t *)kmap_atomic(page, KM_USER1); | ||
171 | } | ||
172 | |||
173 | static inline void shmem_swp_balance_unmap(void) | ||
174 | { | ||
175 | /* | ||
176 | * When passing a pointer to an i_direct entry, to code which | ||
177 | * also handles indirect entries and so will shmem_swp_unmap, | ||
178 | * we must arrange for the preempt count to remain in balance. | ||
179 | * What kmap_atomic of a lowmem page does depends on config | ||
180 | * and architecture, so pretend to kmap_atomic some lowmem page. | ||
181 | */ | ||
182 | (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); | ||
183 | } | ||
184 | |||
185 | static inline void shmem_swp_unmap(swp_entry_t *entry) | ||
186 | { | ||
187 | kunmap_atomic(entry, KM_USER1); | ||
188 | } | ||
189 | |||
190 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) | 116 | static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) |
191 | { | 117 | { |
192 | return sb->s_fs_info; | 118 | return sb->s_fs_info; |
@@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | |||
244 | static LIST_HEAD(shmem_swaplist); | 170 | static LIST_HEAD(shmem_swaplist); |
245 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 171 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
246 | 172 | ||
247 | static void shmem_free_blocks(struct inode *inode, long pages) | ||
248 | { | ||
249 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
250 | if (sbinfo->max_blocks) { | ||
251 | percpu_counter_add(&sbinfo->used_blocks, -pages); | ||
252 | inode->i_blocks -= pages*BLOCKS_PER_PAGE; | ||
253 | } | ||
254 | } | ||
255 | |||
256 | static int shmem_reserve_inode(struct super_block *sb) | 173 | static int shmem_reserve_inode(struct super_block *sb) |
257 | { | 174 | { |
258 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 175 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); |
@@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb) | |||
279 | } | 196 | } |
280 | 197 | ||
281 | /** | 198 | /** |
282 | * shmem_recalc_inode - recalculate the size of an inode | 199 | * shmem_recalc_inode - recalculate the block usage of an inode |
283 | * @inode: inode to recalc | 200 | * @inode: inode to recalc |
284 | * | 201 | * |
285 | * We have to calculate the free blocks since the mm can drop | 202 | * We have to calculate the free blocks since the mm can drop |
@@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode) | |||
297 | 214 | ||
298 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; | 215 | freed = info->alloced - info->swapped - inode->i_mapping->nrpages; |
299 | if (freed > 0) { | 216 | if (freed > 0) { |
217 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
218 | if (sbinfo->max_blocks) | ||
219 | percpu_counter_add(&sbinfo->used_blocks, -freed); | ||
300 | info->alloced -= freed; | 220 | info->alloced -= freed; |
221 | inode->i_blocks -= freed * BLOCKS_PER_PAGE; | ||
301 | shmem_unacct_blocks(info->flags, freed); | 222 | shmem_unacct_blocks(info->flags, freed); |
302 | shmem_free_blocks(inode, freed); | ||
303 | } | 223 | } |
304 | } | 224 | } |
305 | 225 | ||
306 | /** | 226 | /* |
307 | * shmem_swp_entry - find the swap vector position in the info structure | 227 | * Replace item expected in radix tree by a new item, while holding tree lock. |
308 | * @info: info structure for the inode | ||
309 | * @index: index of the page to find | ||
310 | * @page: optional page to add to the structure. Has to be preset to | ||
311 | * all zeros | ||
312 | * | ||
313 | * If there is no space allocated yet it will return NULL when | ||
314 | * page is NULL, else it will use the page for the needed block, | ||
315 | * setting it to NULL on return to indicate that it has been used. | ||
316 | * | ||
317 | * The swap vector is organized the following way: | ||
318 | * | ||
319 | * There are SHMEM_NR_DIRECT entries directly stored in the | ||
320 | * shmem_inode_info structure. So small files do not need an addional | ||
321 | * allocation. | ||
322 | * | ||
323 | * For pages with index > SHMEM_NR_DIRECT there is the pointer | ||
324 | * i_indirect which points to a page which holds in the first half | ||
325 | * doubly indirect blocks, in the second half triple indirect blocks: | ||
326 | * | ||
327 | * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the | ||
328 | * following layout (for SHMEM_NR_DIRECT == 16): | ||
329 | * | ||
330 | * i_indirect -> dir --> 16-19 | ||
331 | * | +-> 20-23 | ||
332 | * | | ||
333 | * +-->dir2 --> 24-27 | ||
334 | * | +-> 28-31 | ||
335 | * | +-> 32-35 | ||
336 | * | +-> 36-39 | ||
337 | * | | ||
338 | * +-->dir3 --> 40-43 | ||
339 | * +-> 44-47 | ||
340 | * +-> 48-51 | ||
341 | * +-> 52-55 | ||
342 | */ | 228 | */ |
343 | static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) | 229 | static int shmem_radix_tree_replace(struct address_space *mapping, |
344 | { | 230 | pgoff_t index, void *expected, void *replacement) |
345 | unsigned long offset; | 231 | { |
346 | struct page **dir; | 232 | void **pslot; |
347 | struct page *subdir; | 233 | void *item = NULL; |
348 | 234 | ||
349 | if (index < SHMEM_NR_DIRECT) { | 235 | VM_BUG_ON(!expected); |
350 | shmem_swp_balance_unmap(); | 236 | pslot = radix_tree_lookup_slot(&mapping->page_tree, index); |
351 | return info->i_direct+index; | 237 | if (pslot) |
352 | } | 238 | item = radix_tree_deref_slot_protected(pslot, |
353 | if (!info->i_indirect) { | 239 | &mapping->tree_lock); |
354 | if (page) { | 240 | if (item != expected) |
355 | info->i_indirect = *page; | 241 | return -ENOENT; |
356 | *page = NULL; | 242 | if (replacement) |
357 | } | 243 | radix_tree_replace_slot(pslot, replacement); |
358 | return NULL; /* need another page */ | 244 | else |
359 | } | 245 | radix_tree_delete(&mapping->page_tree, index); |
360 | 246 | return 0; | |
361 | index -= SHMEM_NR_DIRECT; | 247 | } |
362 | offset = index % ENTRIES_PER_PAGE; | ||
363 | index /= ENTRIES_PER_PAGE; | ||
364 | dir = shmem_dir_map(info->i_indirect); | ||
365 | |||
366 | if (index >= ENTRIES_PER_PAGE/2) { | ||
367 | index -= ENTRIES_PER_PAGE/2; | ||
368 | dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; | ||
369 | index %= ENTRIES_PER_PAGE; | ||
370 | subdir = *dir; | ||
371 | if (!subdir) { | ||
372 | if (page) { | ||
373 | *dir = *page; | ||
374 | *page = NULL; | ||
375 | } | ||
376 | shmem_dir_unmap(dir); | ||
377 | return NULL; /* need another page */ | ||
378 | } | ||
379 | shmem_dir_unmap(dir); | ||
380 | dir = shmem_dir_map(subdir); | ||
381 | } | ||
382 | 248 | ||
383 | dir += index; | 249 | /* |
384 | subdir = *dir; | 250 | * Like add_to_page_cache_locked, but error if expected item has gone. |
385 | if (!subdir) { | 251 | */ |
386 | if (!page || !(subdir = *page)) { | 252 | static int shmem_add_to_page_cache(struct page *page, |
387 | shmem_dir_unmap(dir); | 253 | struct address_space *mapping, |
388 | return NULL; /* need a page */ | 254 | pgoff_t index, gfp_t gfp, void *expected) |
255 | { | ||
256 | int error = 0; | ||
257 | |||
258 | VM_BUG_ON(!PageLocked(page)); | ||
259 | VM_BUG_ON(!PageSwapBacked(page)); | ||
260 | |||
261 | if (!expected) | ||
262 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | ||
263 | if (!error) { | ||
264 | page_cache_get(page); | ||
265 | page->mapping = mapping; | ||
266 | page->index = index; | ||
267 | |||
268 | spin_lock_irq(&mapping->tree_lock); | ||
269 | if (!expected) | ||
270 | error = radix_tree_insert(&mapping->page_tree, | ||
271 | index, page); | ||
272 | else | ||
273 | error = shmem_radix_tree_replace(mapping, index, | ||
274 | expected, page); | ||
275 | if (!error) { | ||
276 | mapping->nrpages++; | ||
277 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
278 | __inc_zone_page_state(page, NR_SHMEM); | ||
279 | spin_unlock_irq(&mapping->tree_lock); | ||
280 | } else { | ||
281 | page->mapping = NULL; | ||
282 | spin_unlock_irq(&mapping->tree_lock); | ||
283 | page_cache_release(page); | ||
389 | } | 284 | } |
390 | *dir = subdir; | 285 | if (!expected) |
391 | *page = NULL; | 286 | radix_tree_preload_end(); |
392 | } | 287 | } |
393 | shmem_dir_unmap(dir); | 288 | if (error) |
394 | return shmem_swp_map(subdir) + offset; | 289 | mem_cgroup_uncharge_cache_page(page); |
290 | return error; | ||
395 | } | 291 | } |
396 | 292 | ||
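An aside on the expected-item idiom that shmem_radix_tree_replace() introduces above: the caller states both what it believes the slot currently holds and what it wants there, and the swap only happens if that belief still holds under the tree lock, otherwise the caller gets -ENOENT. Below is a minimal user-space analogue, not kernel code: a mutex-protected array stands in for mapping->page_tree, and the names store, store_lock and slot_replace_expected are invented for the sketch.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define NSLOTS 64

static void *store[NSLOTS];                     /* stands in for the radix tree */
static pthread_mutex_t store_lock = PTHREAD_MUTEX_INITIALIZER; /* "tree_lock" */

/* Replace slot contents only if they still match what the caller expects. */
static int slot_replace_expected(unsigned long index, void *expected, void *replacement)
{
	int err = 0;

	pthread_mutex_lock(&store_lock);
	if (store[index] != expected)
		err = -ENOENT;               /* someone changed the slot under us */
	else
		store[index] = replacement;  /* NULL deletes, like radix_tree_delete() */
	pthread_mutex_unlock(&store_lock);
	return err;
}

int main(void)
{
	int marker = 42;

	store[3] = &marker;
	printf("replace ok: %d\n", slot_replace_expected(3, &marker, NULL));
	printf("stale expectation: %d\n", slot_replace_expected(3, &marker, NULL));
	return 0;
}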
397 | static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) | 293 | /* |
294 | * Like delete_from_page_cache, but substitutes swap for page. | ||
295 | */ | ||
296 | static void shmem_delete_from_page_cache(struct page *page, void *radswap) | ||
398 | { | 297 | { |
399 | long incdec = value? 1: -1; | 298 | struct address_space *mapping = page->mapping; |
299 | int error; | ||
400 | 300 | ||
401 | entry->val = value; | 301 | spin_lock_irq(&mapping->tree_lock); |
402 | info->swapped += incdec; | 302 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); |
403 | if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { | 303 | page->mapping = NULL; |
404 | struct page *page = kmap_atomic_to_page(entry); | 304 | mapping->nrpages--; |
405 | set_page_private(page, page_private(page) + incdec); | 305 | __dec_zone_page_state(page, NR_FILE_PAGES); |
406 | } | 306 | __dec_zone_page_state(page, NR_SHMEM); |
307 | spin_unlock_irq(&mapping->tree_lock); | ||
308 | page_cache_release(page); | ||
309 | BUG_ON(error); | ||
407 | } | 310 | } |
408 | 311 | ||
409 | /** | 312 | /* |
410 | * shmem_swp_alloc - get the position of the swap entry for the page. | 313 | * Like find_get_pages, but collecting swap entries as well as pages. |
411 | * @info: info structure for the inode | ||
412 | * @index: index of the page to find | ||
413 | * @sgp: check and recheck i_size? skip allocation? | ||
414 | * @gfp: gfp mask to use for any page allocation | ||
415 | * | ||
416 | * If the entry does not exist, allocate it. | ||
417 | */ | 314 | */ |
418 | static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, | 315 | static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, |
419 | unsigned long index, enum sgp_type sgp, gfp_t gfp) | 316 | pgoff_t start, unsigned int nr_pages, |
420 | { | 317 | struct page **pages, pgoff_t *indices) |
421 | struct inode *inode = &info->vfs_inode; | 318 | { |
422 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | 319 | unsigned int i; |
423 | struct page *page = NULL; | 320 | unsigned int ret; |
424 | swp_entry_t *entry; | 321 | unsigned int nr_found; |
425 | 322 | ||
426 | if (sgp != SGP_WRITE && | 323 | rcu_read_lock(); |
427 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | 324 | restart: |
428 | return ERR_PTR(-EINVAL); | 325 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, |
429 | 326 | (void ***)pages, indices, start, nr_pages); | |
430 | while (!(entry = shmem_swp_entry(info, index, &page))) { | 327 | ret = 0; |
431 | if (sgp == SGP_READ) | 328 | for (i = 0; i < nr_found; i++) { |
432 | return shmem_swp_map(ZERO_PAGE(0)); | 329 | struct page *page; |
433 | /* | 330 | repeat: |
434 | * Test used_blocks against 1 less max_blocks, since we have 1 data | 331 | page = radix_tree_deref_slot((void **)pages[i]); |
435 | * page (and perhaps indirect index pages) yet to allocate: | 332 | if (unlikely(!page)) |
436 | * a waste to allocate index if we cannot allocate data. | 333 | continue; |
437 | */ | 334 | if (radix_tree_exception(page)) { |
438 | if (sbinfo->max_blocks) { | 335 | if (radix_tree_deref_retry(page)) |
439 | if (percpu_counter_compare(&sbinfo->used_blocks, | 336 | goto restart; |
440 | sbinfo->max_blocks - 1) >= 0) | 337 | /* |
441 | return ERR_PTR(-ENOSPC); | 338 | * Otherwise, we must be storing a swap entry |
442 | percpu_counter_inc(&sbinfo->used_blocks); | 339 | * here as an exceptional entry: so return it |
443 | inode->i_blocks += BLOCKS_PER_PAGE; | 340 | * without attempting to raise page count. |
341 | */ | ||
342 | goto export; | ||
444 | } | 343 | } |
344 | if (!page_cache_get_speculative(page)) | ||
345 | goto repeat; | ||
445 | 346 | ||
446 | spin_unlock(&info->lock); | 347 | /* Has the page moved? */ |
447 | page = shmem_dir_alloc(gfp); | 348 | if (unlikely(page != *((void **)pages[i]))) { |
448 | spin_lock(&info->lock); | 349 | page_cache_release(page); |
449 | 350 | goto repeat; | |
450 | if (!page) { | ||
451 | shmem_free_blocks(inode, 1); | ||
452 | return ERR_PTR(-ENOMEM); | ||
453 | } | ||
454 | if (sgp != SGP_WRITE && | ||
455 | ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | ||
456 | entry = ERR_PTR(-EINVAL); | ||
457 | break; | ||
458 | } | 351 | } |
459 | if (info->next_index <= index) | 352 | export: |
460 | info->next_index = index + 1; | 353 | indices[ret] = indices[i]; |
461 | } | 354 | pages[ret] = page; |
462 | if (page) { | 355 | ret++; |
463 | /* another task gave its page, or truncated the file */ | 356 | } |
464 | shmem_free_blocks(inode, 1); | 357 | if (unlikely(!ret && nr_found)) |
465 | shmem_dir_free(page); | 358 | goto restart; |
466 | } | 359 | rcu_read_unlock(); |
467 | if (info->next_index <= index && !IS_ERR(entry)) | 360 | return ret; |
468 | info->next_index = index + 1; | ||
469 | return entry; | ||
470 | } | 361 | } |
471 | 362 | ||
472 | /** | 363 | /* |
473 | * shmem_free_swp - free some swap entries in a directory | 364 | * Remove swap entry from radix tree, free the swap and its page cache. |
474 | * @dir: pointer to the directory | ||
475 | * @edir: pointer after last entry of the directory | ||
476 | * @punch_lock: pointer to spinlock when needed for the holepunch case | ||
477 | */ | 365 | */ |
478 | static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, | 366 | static int shmem_free_swap(struct address_space *mapping, |
479 | spinlock_t *punch_lock) | 367 | pgoff_t index, void *radswap) |
480 | { | 368 | { |
481 | spinlock_t *punch_unlock = NULL; | 369 | int error; |
482 | swp_entry_t *ptr; | 370 | |
483 | int freed = 0; | 371 | spin_lock_irq(&mapping->tree_lock); |
484 | 372 | error = shmem_radix_tree_replace(mapping, index, radswap, NULL); | |
485 | for (ptr = dir; ptr < edir; ptr++) { | 373 | spin_unlock_irq(&mapping->tree_lock); |
486 | if (ptr->val) { | 374 | if (!error) |
487 | if (unlikely(punch_lock)) { | 375 | free_swap_and_cache(radix_to_swp_entry(radswap)); |
488 | punch_unlock = punch_lock; | 376 | return error; |
489 | punch_lock = NULL; | ||
490 | spin_lock(punch_unlock); | ||
491 | if (!ptr->val) | ||
492 | continue; | ||
493 | } | ||
494 | free_swap_and_cache(*ptr); | ||
495 | *ptr = (swp_entry_t){0}; | ||
496 | freed++; | ||
497 | } | ||
498 | } | ||
499 | if (punch_unlock) | ||
500 | spin_unlock(punch_unlock); | ||
501 | return freed; | ||
502 | } | ||
503 | |||
504 | static int shmem_map_and_free_swp(struct page *subdir, int offset, | ||
505 | int limit, struct page ***dir, spinlock_t *punch_lock) | ||
506 | { | ||
507 | swp_entry_t *ptr; | ||
508 | int freed = 0; | ||
509 | |||
510 | ptr = shmem_swp_map(subdir); | ||
511 | for (; offset < limit; offset += LATENCY_LIMIT) { | ||
512 | int size = limit - offset; | ||
513 | if (size > LATENCY_LIMIT) | ||
514 | size = LATENCY_LIMIT; | ||
515 | freed += shmem_free_swp(ptr+offset, ptr+offset+size, | ||
516 | punch_lock); | ||
517 | if (need_resched()) { | ||
518 | shmem_swp_unmap(ptr); | ||
519 | if (*dir) { | ||
520 | shmem_dir_unmap(*dir); | ||
521 | *dir = NULL; | ||
522 | } | ||
523 | cond_resched(); | ||
524 | ptr = shmem_swp_map(subdir); | ||
525 | } | ||
526 | } | ||
527 | shmem_swp_unmap(ptr); | ||
528 | return freed; | ||
529 | } | 377 | } |
530 | 378 | ||
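shmem_free_swap() above works because swap entries now live directly in page-cache slots as radix-tree exceptional entries, with radix_to_swp_entry() and swp_to_radix_entry() converting between the two forms. The sketch below shows the general tagged-pointer idea in user space; the bit values and helper names are purely illustrative, the real encoding is whatever swp_to_radix_entry() and radix_tree_exceptional_entry() use.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative encoding only: a low bit that a real, aligned pointer can
 * never have set marks the word as a swap value rather than a struct page *.
 */
#define EXCEPTIONAL_BIT  0x2UL
#define VALUE_SHIFT      2

static void *swp_to_slot_entry(unsigned long swp_val)
{
	return (void *)((swp_val << VALUE_SHIFT) | EXCEPTIONAL_BIT);
}

static int slot_entry_is_swap(void *entry)
{
	return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long slot_entry_to_swp(void *entry)
{
	return (uintptr_t)entry >> VALUE_SHIFT;
}

int main(void)
{
	int page;                       /* stands in for a struct page */
	void *slot = swp_to_slot_entry(12345);

	assert(slot_entry_is_swap(slot));
	assert(!slot_entry_is_swap(&page));
	printf("decoded swap value: %lu\n", slot_entry_to_swp(slot));
	return 0;
}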
531 | static void shmem_free_pages(struct list_head *next) | 379 | /* |
380 | * Pagevec may contain swap entries, so shuffle up pages before releasing. | ||
381 | */ | ||
382 | static void shmem_pagevec_release(struct pagevec *pvec) | ||
532 | { | 383 | { |
533 | struct page *page; | 384 | int i, j; |
534 | int freed = 0; | 385 | |
535 | 386 | for (i = 0, j = 0; i < pagevec_count(pvec); i++) { | |
536 | do { | 387 | struct page *page = pvec->pages[i]; |
537 | page = container_of(next, struct page, lru); | 388 | if (!radix_tree_exceptional_entry(page)) |
538 | next = next->next; | 389 | pvec->pages[j++] = page; |
539 | shmem_dir_free(page); | 390 | } |
540 | freed++; | 391 | pvec->nr = j; |
541 | if (freed >= LATENCY_LIMIT) { | 392 | pagevec_release(pvec); |
542 | cond_resched(); | ||
543 | freed = 0; | ||
544 | } | ||
545 | } while (next); | ||
546 | } | 393 | } |
547 | 394 | ||
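shmem_pagevec_release() above is a straight in-place compaction: keep the real page pointers, drop the exceptional swap entries, then hand the shortened pagevec to pagevec_release(). A user-space sketch of the same compaction, with invented names and the same illustrative tag bit as the previous sketch:

#include <stdint.h>
#include <stdio.h>

/* Keep only untagged (real-page) entries, shifting them to the front. */
static unsigned compact_slots(void **slots, unsigned nr)
{
	unsigned i, j;

	for (i = 0, j = 0; i < nr; i++) {
		if (((uintptr_t)slots[i] & 0x2) == 0)   /* not a swap entry */
			slots[j++] = slots[i];
	}
	return j;       /* new count, as shmem_pagevec_release() sets pvec->nr */
}

int main(void)
{
	int a, b;
	void *slots[4] = { &a, (void *)0x6, &b, (void *)0xa };
	unsigned nr = compact_slots(slots, 4);

	printf("%u real pages kept\n", nr);     /* prints 2 */
	return 0;
}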
548 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 395 | /* |
396 | * Remove range of pages and swap entries from radix tree, and free them. | ||
397 | */ | ||
398 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
549 | { | 399 | { |
400 | struct address_space *mapping = inode->i_mapping; | ||
550 | struct shmem_inode_info *info = SHMEM_I(inode); | 401 | struct shmem_inode_info *info = SHMEM_I(inode); |
551 | unsigned long idx; | 402 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
552 | unsigned long size; | 403 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); |
553 | unsigned long limit; | 404 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); |
554 | unsigned long stage; | 405 | struct pagevec pvec; |
555 | unsigned long diroff; | 406 | pgoff_t indices[PAGEVEC_SIZE]; |
556 | struct page **dir; | ||
557 | struct page *topdir; | ||
558 | struct page *middir; | ||
559 | struct page *subdir; | ||
560 | swp_entry_t *ptr; | ||
561 | LIST_HEAD(pages_to_free); | ||
562 | long nr_pages_to_free = 0; | ||
563 | long nr_swaps_freed = 0; | 407 | long nr_swaps_freed = 0; |
564 | int offset; | 408 | pgoff_t index; |
565 | int freed; | 409 | int i; |
566 | int punch_hole; | ||
567 | spinlock_t *needs_lock; | ||
568 | spinlock_t *punch_lock; | ||
569 | unsigned long upper_limit; | ||
570 | 410 | ||
571 | truncate_inode_pages_range(inode->i_mapping, start, end); | 411 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); |
572 | 412 | ||
573 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 413 | pagevec_init(&pvec, 0); |
574 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 414 | index = start; |
575 | if (idx >= info->next_index) | 415 | while (index <= end) { |
576 | return; | 416 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
417 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | ||
418 | pvec.pages, indices); | ||
419 | if (!pvec.nr) | ||
420 | break; | ||
421 | mem_cgroup_uncharge_start(); | ||
422 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
423 | struct page *page = pvec.pages[i]; | ||
577 | 424 | ||
578 | spin_lock(&info->lock); | 425 | index = indices[i]; |
579 | info->flags |= SHMEM_TRUNCATE; | 426 | if (index > end) |
580 | if (likely(end == (loff_t) -1)) { | 427 | break; |
581 | limit = info->next_index; | 428 | |
582 | upper_limit = SHMEM_MAX_INDEX; | 429 | if (radix_tree_exceptional_entry(page)) { |
583 | info->next_index = idx; | 430 | nr_swaps_freed += !shmem_free_swap(mapping, |
584 | needs_lock = NULL; | 431 | index, page); |
585 | punch_hole = 0; | 432 | continue; |
586 | } else { | 433 | } |
587 | if (end + 1 >= inode->i_size) { /* we may free a little more */ | ||
588 | limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> | ||
589 | PAGE_CACHE_SHIFT; | ||
590 | upper_limit = SHMEM_MAX_INDEX; | ||
591 | } else { | ||
592 | limit = (end + 1) >> PAGE_CACHE_SHIFT; | ||
593 | upper_limit = limit; | ||
594 | } | ||
595 | needs_lock = &info->lock; | ||
596 | punch_hole = 1; | ||
597 | } | ||
598 | 434 | ||
599 | topdir = info->i_indirect; | 435 | if (!trylock_page(page)) |
600 | if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { | 436 | continue; |
601 | info->i_indirect = NULL; | 437 | if (page->mapping == mapping) { |
602 | nr_pages_to_free++; | 438 | VM_BUG_ON(PageWriteback(page)); |
603 | list_add(&topdir->lru, &pages_to_free); | 439 | truncate_inode_page(mapping, page); |
440 | } | ||
441 | unlock_page(page); | ||
442 | } | ||
443 | shmem_pagevec_release(&pvec); | ||
444 | mem_cgroup_uncharge_end(); | ||
445 | cond_resched(); | ||
446 | index++; | ||
604 | } | 447 | } |
605 | spin_unlock(&info->lock); | ||
606 | 448 | ||
607 | if (info->swapped && idx < SHMEM_NR_DIRECT) { | 449 | if (partial) { |
608 | ptr = info->i_direct; | 450 | struct page *page = NULL; |
609 | size = limit; | 451 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
610 | if (size > SHMEM_NR_DIRECT) | 452 | if (page) { |
611 | size = SHMEM_NR_DIRECT; | 453 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
612 | nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); | 454 | set_page_dirty(page); |
455 | unlock_page(page); | ||
456 | page_cache_release(page); | ||
457 | } | ||
613 | } | 458 | } |
614 | 459 | ||
615 | /* | 460 | index = start; |
616 | * If there are no indirect blocks or we are punching a hole | 461 | for ( ; ; ) { |
617 | * below indirect blocks, nothing to be done. | 462 | cond_resched(); |
618 | */ | 463 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
619 | if (!topdir || limit <= SHMEM_NR_DIRECT) | 464 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, |
620 | goto done2; | 465 | pvec.pages, indices); |
466 | if (!pvec.nr) { | ||
467 | if (index == start) | ||
468 | break; | ||
469 | index = start; | ||
470 | continue; | ||
471 | } | ||
472 | if (index == start && indices[0] > end) { | ||
473 | shmem_pagevec_release(&pvec); | ||
474 | break; | ||
475 | } | ||
476 | mem_cgroup_uncharge_start(); | ||
477 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
478 | struct page *page = pvec.pages[i]; | ||
621 | 479 | ||
622 | /* | 480 | index = indices[i]; |
623 | * The truncation case has already dropped info->lock, and we're safe | 481 | if (index > end) |
624 | * because i_size and next_index have already been lowered, preventing | 482 | break; |
625 | * access beyond. But in the punch_hole case, we still need to take | ||
626 | * the lock when updating the swap directory, because there might be | ||
627 | * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or | ||
628 | * shmem_writepage. However, whenever we find we can remove a whole | ||
629 | * directory page (not at the misaligned start or end of the range), | ||
630 | * we first NULLify its pointer in the level above, and then have no | ||
631 | * need to take the lock when updating its contents: needs_lock and | ||
632 | * punch_lock (either pointing to info->lock or NULL) manage this. | ||
633 | */ | ||
634 | 483 | ||
635 | upper_limit -= SHMEM_NR_DIRECT; | 484 | if (radix_tree_exceptional_entry(page)) { |
636 | limit -= SHMEM_NR_DIRECT; | 485 | nr_swaps_freed += !shmem_free_swap(mapping, |
637 | idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; | 486 | index, page); |
638 | offset = idx % ENTRIES_PER_PAGE; | 487 | continue; |
639 | idx -= offset; | ||
640 | |||
641 | dir = shmem_dir_map(topdir); | ||
642 | stage = ENTRIES_PER_PAGEPAGE/2; | ||
643 | if (idx < ENTRIES_PER_PAGEPAGE/2) { | ||
644 | middir = topdir; | ||
645 | diroff = idx/ENTRIES_PER_PAGE; | ||
646 | } else { | ||
647 | dir += ENTRIES_PER_PAGE/2; | ||
648 | dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; | ||
649 | while (stage <= idx) | ||
650 | stage += ENTRIES_PER_PAGEPAGE; | ||
651 | middir = *dir; | ||
652 | if (*dir) { | ||
653 | diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % | ||
654 | ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; | ||
655 | if (!diroff && !offset && upper_limit >= stage) { | ||
656 | if (needs_lock) { | ||
657 | spin_lock(needs_lock); | ||
658 | *dir = NULL; | ||
659 | spin_unlock(needs_lock); | ||
660 | needs_lock = NULL; | ||
661 | } else | ||
662 | *dir = NULL; | ||
663 | nr_pages_to_free++; | ||
664 | list_add(&middir->lru, &pages_to_free); | ||
665 | } | 488 | } |
666 | shmem_dir_unmap(dir); | ||
667 | dir = shmem_dir_map(middir); | ||
668 | } else { | ||
669 | diroff = 0; | ||
670 | offset = 0; | ||
671 | idx = stage; | ||
672 | } | ||
673 | } | ||
674 | 489 | ||
675 | for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { | 490 | lock_page(page); |
676 | if (unlikely(idx == stage)) { | 491 | if (page->mapping == mapping) { |
677 | shmem_dir_unmap(dir); | 492 | VM_BUG_ON(PageWriteback(page)); |
678 | dir = shmem_dir_map(topdir) + | 493 | truncate_inode_page(mapping, page); |
679 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
680 | while (!*dir) { | ||
681 | dir++; | ||
682 | idx += ENTRIES_PER_PAGEPAGE; | ||
683 | if (idx >= limit) | ||
684 | goto done1; | ||
685 | } | ||
686 | stage = idx + ENTRIES_PER_PAGEPAGE; | ||
687 | middir = *dir; | ||
688 | if (punch_hole) | ||
689 | needs_lock = &info->lock; | ||
690 | if (upper_limit >= stage) { | ||
691 | if (needs_lock) { | ||
692 | spin_lock(needs_lock); | ||
693 | *dir = NULL; | ||
694 | spin_unlock(needs_lock); | ||
695 | needs_lock = NULL; | ||
696 | } else | ||
697 | *dir = NULL; | ||
698 | nr_pages_to_free++; | ||
699 | list_add(&middir->lru, &pages_to_free); | ||
700 | } | 494 | } |
701 | shmem_dir_unmap(dir); | 495 | unlock_page(page); |
702 | cond_resched(); | ||
703 | dir = shmem_dir_map(middir); | ||
704 | diroff = 0; | ||
705 | } | ||
706 | punch_lock = needs_lock; | ||
707 | subdir = dir[diroff]; | ||
708 | if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { | ||
709 | if (needs_lock) { | ||
710 | spin_lock(needs_lock); | ||
711 | dir[diroff] = NULL; | ||
712 | spin_unlock(needs_lock); | ||
713 | punch_lock = NULL; | ||
714 | } else | ||
715 | dir[diroff] = NULL; | ||
716 | nr_pages_to_free++; | ||
717 | list_add(&subdir->lru, &pages_to_free); | ||
718 | } | ||
719 | if (subdir && page_private(subdir) /* has swap entries */) { | ||
720 | size = limit - idx; | ||
721 | if (size > ENTRIES_PER_PAGE) | ||
722 | size = ENTRIES_PER_PAGE; | ||
723 | freed = shmem_map_and_free_swp(subdir, | ||
724 | offset, size, &dir, punch_lock); | ||
725 | if (!dir) | ||
726 | dir = shmem_dir_map(middir); | ||
727 | nr_swaps_freed += freed; | ||
728 | if (offset || punch_lock) { | ||
729 | spin_lock(&info->lock); | ||
730 | set_page_private(subdir, | ||
731 | page_private(subdir) - freed); | ||
732 | spin_unlock(&info->lock); | ||
733 | } else | ||
734 | BUG_ON(page_private(subdir) != freed); | ||
735 | } | 496 | } |
736 | offset = 0; | 497 | shmem_pagevec_release(&pvec); |
737 | } | 498 | mem_cgroup_uncharge_end(); |
738 | done1: | 499 | index++; |
739 | shmem_dir_unmap(dir); | ||
740 | done2: | ||
741 | if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { | ||
742 | /* | ||
743 | * Call truncate_inode_pages again: racing shmem_unuse_inode | ||
744 | * may have swizzled a page in from swap since | ||
745 | * truncate_pagecache or generic_delete_inode did it, before we | ||
746 | * lowered next_index. Also, though shmem_getpage checks | ||
747 | * i_size before adding to cache, no recheck after: so fix the | ||
748 | * narrow window there too. | ||
749 | */ | ||
750 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
751 | } | 500 | } |
752 | 501 | ||
753 | spin_lock(&info->lock); | 502 | spin_lock(&info->lock); |
754 | info->flags &= ~SHMEM_TRUNCATE; | ||
755 | info->swapped -= nr_swaps_freed; | 503 | info->swapped -= nr_swaps_freed; |
756 | if (nr_pages_to_free) | ||
757 | shmem_free_blocks(inode, nr_pages_to_free); | ||
758 | shmem_recalc_inode(inode); | 504 | shmem_recalc_inode(inode); |
759 | spin_unlock(&info->lock); | 505 | spin_unlock(&info->lock); |
760 | 506 | ||
761 | /* | 507 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
762 | * Empty swap vector directory pages to be freed? | ||
763 | */ | ||
764 | if (!list_empty(&pages_to_free)) { | ||
765 | pages_to_free.prev->next = NULL; | ||
766 | shmem_free_pages(pages_to_free.next); | ||
767 | } | ||
768 | } | 508 | } |
769 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 509 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
770 | 510 | ||
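The index arithmetic at the top of the new shmem_truncate_range() deserves a worked example: start is the first page removed whole, partial is the byte offset within the preceding page that must be zeroed, and end is the last page covered by lend. The numbers below assume 4 KiB pages (PAGE_CACHE_SHIFT of 12), which is only the common case:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12                        /* assume 4 KiB pages */
#define PAGE_CACHE_SIZE  (1ULL << PAGE_CACHE_SHIFT)

int main(void)
{
	unsigned long long lstart = 6000;          /* e.g. new i_size after a truncate */
	unsigned long long lend = ~0ULL;           /* truncate to end of file */

	unsigned long long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long long partial = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long long end = lend >> PAGE_CACHE_SHIFT;

	/* Pages start..end are dropped whole; the page before start keeps its
	 * leading bytes and is zeroed from offset 'partial' onwards. */
	printf("start=%llu partial=%llu end=%llu\n", start, partial, end);
	return 0;
}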
@@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
780 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | 520 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
781 | loff_t oldsize = inode->i_size; | 521 | loff_t oldsize = inode->i_size; |
782 | loff_t newsize = attr->ia_size; | 522 | loff_t newsize = attr->ia_size; |
783 | struct page *page = NULL; | ||
784 | 523 | ||
785 | if (newsize < oldsize) { | ||
786 | /* | ||
787 | * If truncating down to a partial page, then | ||
788 | * if that page is already allocated, hold it | ||
789 | * in memory until the truncation is over, so | ||
790 | * truncate_partial_page cannot miss it were | ||
791 | * it assigned to swap. | ||
792 | */ | ||
793 | if (newsize & (PAGE_CACHE_SIZE-1)) { | ||
794 | (void) shmem_getpage(inode, | ||
795 | newsize >> PAGE_CACHE_SHIFT, | ||
796 | &page, SGP_READ, NULL); | ||
797 | if (page) | ||
798 | unlock_page(page); | ||
799 | } | ||
800 | /* | ||
801 | * Reset SHMEM_PAGEIN flag so that shmem_truncate can | ||
802 | * detect if any pages might have been added to cache | ||
803 | * after truncate_inode_pages. But we needn't bother | ||
804 | * if it's being fully truncated to zero-length: the | ||
805 | * nrpages check is efficient enough in that case. | ||
806 | */ | ||
807 | if (newsize) { | ||
808 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
809 | spin_lock(&info->lock); | ||
810 | info->flags &= ~SHMEM_PAGEIN; | ||
811 | spin_unlock(&info->lock); | ||
812 | } | ||
813 | } | ||
814 | if (newsize != oldsize) { | 524 | if (newsize != oldsize) { |
815 | i_size_write(inode, newsize); | 525 | i_size_write(inode, newsize); |
816 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 526 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
@@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
822 | /* unmap again to remove racily COWed private pages */ | 532 | /* unmap again to remove racily COWed private pages */ |
823 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | 533 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); |
824 | } | 534 | } |
825 | if (page) | ||
826 | page_cache_release(page); | ||
827 | } | 535 | } |
828 | 536 | ||
829 | setattr_copy(inode, attr); | 537 | setattr_copy(inode, attr); |
@@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode) | |||
848 | list_del_init(&info->swaplist); | 556 | list_del_init(&info->swaplist); |
849 | mutex_unlock(&shmem_swaplist_mutex); | 557 | mutex_unlock(&shmem_swaplist_mutex); |
850 | } | 558 | } |
851 | } | 559 | } else |
560 | kfree(info->symlink); | ||
852 | 561 | ||
853 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { | 562 | list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { |
854 | kfree(xattr->name); | 563 | kfree(xattr->name); |
@@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode) | |||
859 | end_writeback(inode); | 568 | end_writeback(inode); |
860 | } | 569 | } |
861 | 570 | ||
862 | static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) | 571 | /* |
863 | { | 572 | * If swap found in inode, free it and move page from swapcache to filecache. |
864 | swp_entry_t *ptr; | 573 | */ |
865 | 574 | static int shmem_unuse_inode(struct shmem_inode_info *info, | |
866 | for (ptr = dir; ptr < edir; ptr++) { | 575 | swp_entry_t swap, struct page *page) |
867 | if (ptr->val == entry.val) | ||
868 | return ptr - dir; | ||
869 | } | ||
870 | return -1; | ||
871 | } | ||
872 | |||
873 | static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) | ||
874 | { | 576 | { |
875 | struct address_space *mapping; | 577 | struct address_space *mapping = info->vfs_inode.i_mapping; |
876 | unsigned long idx; | 578 | void *radswap; |
877 | unsigned long size; | 579 | pgoff_t index; |
878 | unsigned long limit; | ||
879 | unsigned long stage; | ||
880 | struct page **dir; | ||
881 | struct page *subdir; | ||
882 | swp_entry_t *ptr; | ||
883 | int offset; | ||
884 | int error; | 580 | int error; |
885 | 581 | ||
886 | idx = 0; | 582 | radswap = swp_to_radix_entry(swap); |
887 | ptr = info->i_direct; | 583 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
888 | spin_lock(&info->lock); | 584 | if (index == -1) |
889 | if (!info->swapped) { | 585 | return 0; |
890 | list_del_init(&info->swaplist); | ||
891 | goto lost2; | ||
892 | } | ||
893 | limit = info->next_index; | ||
894 | size = limit; | ||
895 | if (size > SHMEM_NR_DIRECT) | ||
896 | size = SHMEM_NR_DIRECT; | ||
897 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
898 | if (offset >= 0) { | ||
899 | shmem_swp_balance_unmap(); | ||
900 | goto found; | ||
901 | } | ||
902 | if (!info->i_indirect) | ||
903 | goto lost2; | ||
904 | |||
905 | dir = shmem_dir_map(info->i_indirect); | ||
906 | stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; | ||
907 | |||
908 | for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { | ||
909 | if (unlikely(idx == stage)) { | ||
910 | shmem_dir_unmap(dir-1); | ||
911 | if (cond_resched_lock(&info->lock)) { | ||
912 | /* check it has not been truncated */ | ||
913 | if (limit > info->next_index) { | ||
914 | limit = info->next_index; | ||
915 | if (idx >= limit) | ||
916 | goto lost2; | ||
917 | } | ||
918 | } | ||
919 | dir = shmem_dir_map(info->i_indirect) + | ||
920 | ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; | ||
921 | while (!*dir) { | ||
922 | dir++; | ||
923 | idx += ENTRIES_PER_PAGEPAGE; | ||
924 | if (idx >= limit) | ||
925 | goto lost1; | ||
926 | } | ||
927 | stage = idx + ENTRIES_PER_PAGEPAGE; | ||
928 | subdir = *dir; | ||
929 | shmem_dir_unmap(dir); | ||
930 | dir = shmem_dir_map(subdir); | ||
931 | } | ||
932 | subdir = *dir; | ||
933 | if (subdir && page_private(subdir)) { | ||
934 | ptr = shmem_swp_map(subdir); | ||
935 | size = limit - idx; | ||
936 | if (size > ENTRIES_PER_PAGE) | ||
937 | size = ENTRIES_PER_PAGE; | ||
938 | offset = shmem_find_swp(entry, ptr, ptr+size); | ||
939 | shmem_swp_unmap(ptr); | ||
940 | if (offset >= 0) { | ||
941 | shmem_dir_unmap(dir); | ||
942 | ptr = shmem_swp_map(subdir); | ||
943 | goto found; | ||
944 | } | ||
945 | } | ||
946 | } | ||
947 | lost1: | ||
948 | shmem_dir_unmap(dir-1); | ||
949 | lost2: | ||
950 | spin_unlock(&info->lock); | ||
951 | return 0; | ||
952 | found: | ||
953 | idx += offset; | ||
954 | ptr += offset; | ||
955 | 586 | ||
956 | /* | 587 | /* |
957 | * Move _head_ to start search for next from here. | 588 | * Move _head_ to start search for next from here. |
958 | * But be careful: shmem_evict_inode checks list_empty without taking | 589 | * But be careful: shmem_evict_inode checks list_empty without taking |
959 | * mutex, and there's an instant in list_move_tail when info->swaplist | 590 | * mutex, and there's an instant in list_move_tail when info->swaplist |
960 | * would appear empty, if it were the only one on shmem_swaplist. We | 591 | * would appear empty, if it were the only one on shmem_swaplist. |
961 | * could avoid doing it if inode NULL; or use this minor optimization. | ||
962 | */ | 592 | */ |
963 | if (shmem_swaplist.next != &info->swaplist) | 593 | if (shmem_swaplist.next != &info->swaplist) |
964 | list_move_tail(&shmem_swaplist, &info->swaplist); | 594 | list_move_tail(&shmem_swaplist, &info->swaplist); |
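Note that the list_move_tail() above moves the list head itself, not the inode: after the call, the inode where the swap entry was found becomes the first entry a later walk of shmem_swaplist will visit. A self-contained user-space re-implementation of just enough of the list helpers to show the rotation (the real helpers live in include/linux/list.h; the struct names are invented):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }

static void list_del_entry(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

static void list_add_before(struct list_head *e, struct list_head *pos)
{
	e->prev = pos->prev;
	e->next = pos;
	pos->prev->next = e;
	pos->prev = e;
}

/* Like the kernel's list_move_tail(list, head): detach 'list' and
 * re-insert it just before 'head'. */
static void list_move_tail(struct list_head *list, struct list_head *head)
{
	list_del_entry(list);
	list_add_before(list, head);
}

struct inode_stub { struct list_head swaplist; char name; };

int main(void)
{
	struct list_head swaplist;               /* the global list head */
	struct inode_stub a = { .name = 'a' }, b = { .name = 'b' }, c = { .name = 'c' };
	struct list_head *p;

	list_init(&swaplist);
	list_add_before(&a.swaplist, &swaplist);
	list_add_before(&b.swaplist, &swaplist);
	list_add_before(&c.swaplist, &swaplist);  /* walk order: a b c */

	/* Found the entry in 'b': rotate so the next walk starts at 'b'. */
	list_move_tail(&swaplist, &b.swaplist);

	/* swaplist is the first member, so this cast is a poor man's container_of(). */
	for (p = swaplist.next; p != &swaplist; p = p->next)
		printf("%c ", ((struct inode_stub *)p)->name);
	printf("\n");                             /* prints: b c a */
	return 0;
}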
@@ -968,29 +598,34 @@ found: | |||
968 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 598 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
969 | * beneath us (pagelock doesn't help until the page is in pagecache). | 599 | * beneath us (pagelock doesn't help until the page is in pagecache). |
970 | */ | 600 | */ |
971 | mapping = info->vfs_inode.i_mapping; | 601 | error = shmem_add_to_page_cache(page, mapping, index, |
972 | error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); | 602 | GFP_NOWAIT, radswap); |
973 | /* which does mem_cgroup_uncharge_cache_page on error */ | 603 | /* which does mem_cgroup_uncharge_cache_page on error */ |
974 | 604 | ||
975 | if (error != -ENOMEM) { | 605 | if (error != -ENOMEM) { |
606 | /* | ||
607 | * Truncation and eviction use free_swap_and_cache(), which | ||
608 | * only does trylock page: if we raced, best clean up here. | ||
609 | */ | ||
976 | delete_from_swap_cache(page); | 610 | delete_from_swap_cache(page); |
977 | set_page_dirty(page); | 611 | set_page_dirty(page); |
978 | info->flags |= SHMEM_PAGEIN; | 612 | if (!error) { |
979 | shmem_swp_set(info, ptr, 0); | 613 | spin_lock(&info->lock); |
980 | swap_free(entry); | 614 | info->swapped--; |
615 | spin_unlock(&info->lock); | ||
616 | swap_free(swap); | ||
617 | } | ||
981 | error = 1; /* not an error, but entry was found */ | 618 | error = 1; /* not an error, but entry was found */ |
982 | } | 619 | } |
983 | shmem_swp_unmap(ptr); | ||
984 | spin_unlock(&info->lock); | ||
985 | return error; | 620 | return error; |
986 | } | 621 | } |
987 | 622 | ||
988 | /* | 623 | /* |
989 | * shmem_unuse() search for an eventually swapped out shmem page. | 624 | * Search through swapped inodes to find and replace swap by page. |
990 | */ | 625 | */ |
991 | int shmem_unuse(swp_entry_t entry, struct page *page) | 626 | int shmem_unuse(swp_entry_t swap, struct page *page) |
992 | { | 627 | { |
993 | struct list_head *p, *next; | 628 | struct list_head *this, *next; |
994 | struct shmem_inode_info *info; | 629 | struct shmem_inode_info *info; |
995 | int found = 0; | 630 | int found = 0; |
996 | int error; | 631 | int error; |
@@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
999 | * Charge page using GFP_KERNEL while we can wait, before taking | 634 | * Charge page using GFP_KERNEL while we can wait, before taking |
1000 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 635 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
1001 | * Charged back to the user (not to caller) when swap account is used. | 636 | * Charged back to the user (not to caller) when swap account is used. |
1002 | * add_to_page_cache() will be called with GFP_NOWAIT. | ||
1003 | */ | 637 | */ |
1004 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 638 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
1005 | if (error) | 639 | if (error) |
1006 | goto out; | 640 | goto out; |
1007 | /* | 641 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
1008 | * Try to preload while we can wait, to not make a habit of | ||
1009 | * draining atomic reserves; but don't latch on to this cpu, | ||
1010 | * it's okay if sometimes we get rescheduled after this. | ||
1011 | */ | ||
1012 | error = radix_tree_preload(GFP_KERNEL); | ||
1013 | if (error) | ||
1014 | goto uncharge; | ||
1015 | radix_tree_preload_end(); | ||
1016 | 642 | ||
1017 | mutex_lock(&shmem_swaplist_mutex); | 643 | mutex_lock(&shmem_swaplist_mutex); |
1018 | list_for_each_safe(p, next, &shmem_swaplist) { | 644 | list_for_each_safe(this, next, &shmem_swaplist) { |
1019 | info = list_entry(p, struct shmem_inode_info, swaplist); | 645 | info = list_entry(this, struct shmem_inode_info, swaplist); |
1020 | found = shmem_unuse_inode(info, entry, page); | 646 | if (info->swapped) |
647 | found = shmem_unuse_inode(info, swap, page); | ||
648 | else | ||
649 | list_del_init(&info->swaplist); | ||
1021 | cond_resched(); | 650 | cond_resched(); |
1022 | if (found) | 651 | if (found) |
1023 | break; | 652 | break; |
1024 | } | 653 | } |
1025 | mutex_unlock(&shmem_swaplist_mutex); | 654 | mutex_unlock(&shmem_swaplist_mutex); |
1026 | 655 | ||
1027 | uncharge: | ||
1028 | if (!found) | 656 | if (!found) |
1029 | mem_cgroup_uncharge_cache_page(page); | 657 | mem_cgroup_uncharge_cache_page(page); |
1030 | if (found < 0) | 658 | if (found < 0) |
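The "No radix_tree_preload: swap entry keeps a place for page in tree" comment in this hunk is the key simplification: because the swap entry already occupies the slot, bringing the page back is a replacement rather than an insertion, so it cannot require new tree nodes and is safe under GFP_NOWAIT. The toy map below makes the distinction concrete; all names are invented for the sketch.

#include <stdio.h>
#include <stdlib.h>

struct node { unsigned long index; void *value; struct node *next; };
static struct node *head;

/* Inserting a brand-new index needs memory for a node (may fail if we
 * cannot sleep to reclaim). */
static int map_insert(unsigned long index, void *value)
{
	struct node *n = malloc(sizeof(*n));

	if (!n)
		return -1;
	n->index = index;
	n->value = value;
	n->next = head;
	head = n;
	return 0;
}

/* Replacing the value at an existing index allocates nothing at all:
 * the slot is already there, just as the swap entry holds the slot
 * that the page will take over. */
static int map_replace(unsigned long index, void *value)
{
	struct node *n;

	for (n = head; n; n = n->next) {
		if (n->index == index) {
			n->value = value;
			return 0;
		}
	}
	return -1;
}

int main(void)
{
	int swap_entry, page;

	map_insert(5, &swap_entry);        /* done earlier, while we could sleep */
	printf("replace existing slot: %d\n", map_replace(5, &page));
	printf("replace missing slot:  %d\n", map_replace(9, &page));
	return 0;
}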
@@ -1041,10 +669,10 @@ out: | |||
1041 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) | 669 | static int shmem_writepage(struct page *page, struct writeback_control *wbc) |
1042 | { | 670 | { |
1043 | struct shmem_inode_info *info; | 671 | struct shmem_inode_info *info; |
1044 | swp_entry_t *entry, swap; | ||
1045 | struct address_space *mapping; | 672 | struct address_space *mapping; |
1046 | unsigned long index; | ||
1047 | struct inode *inode; | 673 | struct inode *inode; |
674 | swp_entry_t swap; | ||
675 | pgoff_t index; | ||
1048 | 676 | ||
1049 | BUG_ON(!PageLocked(page)); | 677 | BUG_ON(!PageLocked(page)); |
1050 | mapping = page->mapping; | 678 | mapping = page->mapping; |
@@ -1073,50 +701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1073 | 701 | ||
1074 | /* | 702 | /* |
1075 | * Add inode to shmem_unuse()'s list of swapped-out inodes, | 703 | * Add inode to shmem_unuse()'s list of swapped-out inodes, |
1076 | * if it's not already there. Do it now because we cannot take | 704 | * if it's not already there. Do it now before the page is |
1077 | * mutex while holding spinlock, and must do so before the page | 705 | * moved to swap cache, when its pagelock no longer protects |
1078 | * is moved to swap cache, when its pagelock no longer protects | ||
1079 | * the inode from eviction. But don't unlock the mutex until | 706 | * the inode from eviction. But don't unlock the mutex until |
1080 | * we've taken the spinlock, because shmem_unuse_inode() will | 707 | * we've incremented swapped, because shmem_unuse_inode() will |
1081 | * prune a !swapped inode from the swaplist under both locks. | 708 | * prune a !swapped inode from the swaplist under this mutex. |
1082 | */ | 709 | */ |
1083 | mutex_lock(&shmem_swaplist_mutex); | 710 | mutex_lock(&shmem_swaplist_mutex); |
1084 | if (list_empty(&info->swaplist)) | 711 | if (list_empty(&info->swaplist)) |
1085 | list_add_tail(&info->swaplist, &shmem_swaplist); | 712 | list_add_tail(&info->swaplist, &shmem_swaplist); |
1086 | 713 | ||
1087 | spin_lock(&info->lock); | ||
1088 | mutex_unlock(&shmem_swaplist_mutex); | ||
1089 | |||
1090 | if (index >= info->next_index) { | ||
1091 | BUG_ON(!(info->flags & SHMEM_TRUNCATE)); | ||
1092 | goto unlock; | ||
1093 | } | ||
1094 | entry = shmem_swp_entry(info, index, NULL); | ||
1095 | if (entry->val) { | ||
1096 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | ||
1097 | free_swap_and_cache(*entry); | ||
1098 | shmem_swp_set(info, entry, 0); | ||
1099 | } | ||
1100 | shmem_recalc_inode(inode); | ||
1101 | |||
1102 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 714 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1103 | delete_from_page_cache(page); | ||
1104 | shmem_swp_set(info, entry, swap.val); | ||
1105 | shmem_swp_unmap(entry); | ||
1106 | swap_shmem_alloc(swap); | 715 | swap_shmem_alloc(swap); |
716 | shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); | ||
717 | |||
718 | spin_lock(&info->lock); | ||
719 | info->swapped++; | ||
720 | shmem_recalc_inode(inode); | ||
1107 | spin_unlock(&info->lock); | 721 | spin_unlock(&info->lock); |
722 | |||
723 | mutex_unlock(&shmem_swaplist_mutex); | ||
1108 | BUG_ON(page_mapped(page)); | 724 | BUG_ON(page_mapped(page)); |
1109 | swap_writepage(page, wbc); | 725 | swap_writepage(page, wbc); |
1110 | return 0; | 726 | return 0; |
1111 | } | 727 | } |
1112 | 728 | ||
1113 | shmem_swp_unmap(entry); | 729 | mutex_unlock(&shmem_swaplist_mutex); |
1114 | unlock: | ||
1115 | spin_unlock(&info->lock); | ||
1116 | /* | ||
1117 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
1118 | * clear SWAP_HAS_CACHE flag. | ||
1119 | */ | ||
1120 | swapcache_free(swap, NULL); | 730 | swapcache_free(swap, NULL); |
1121 | redirty: | 731 | redirty: |
1122 | set_page_dirty(page); | 732 | set_page_dirty(page); |
@@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1153 | } | 763 | } |
1154 | #endif /* CONFIG_TMPFS */ | 764 | #endif /* CONFIG_TMPFS */ |
1155 | 765 | ||
1156 | static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 766 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1157 | struct shmem_inode_info *info, unsigned long idx) | 767 | struct shmem_inode_info *info, pgoff_t index) |
1158 | { | 768 | { |
1159 | struct mempolicy mpol, *spol; | 769 | struct mempolicy mpol, *spol; |
1160 | struct vm_area_struct pvma; | 770 | struct vm_area_struct pvma; |
1161 | struct page *page; | ||
1162 | 771 | ||
1163 | spol = mpol_cond_copy(&mpol, | 772 | spol = mpol_cond_copy(&mpol, |
1164 | mpol_shared_policy_lookup(&info->policy, idx)); | 773 | mpol_shared_policy_lookup(&info->policy, index)); |
1165 | 774 | ||
1166 | /* Create a pseudo vma that just contains the policy */ | 775 | /* Create a pseudo vma that just contains the policy */ |
1167 | pvma.vm_start = 0; | 776 | pvma.vm_start = 0; |
1168 | pvma.vm_pgoff = idx; | 777 | pvma.vm_pgoff = index; |
1169 | pvma.vm_ops = NULL; | 778 | pvma.vm_ops = NULL; |
1170 | pvma.vm_policy = spol; | 779 | pvma.vm_policy = spol; |
1171 | page = swapin_readahead(entry, gfp, &pvma, 0); | 780 | return swapin_readahead(swap, gfp, &pvma, 0); |
1172 | return page; | ||
1173 | } | 781 | } |
1174 | 782 | ||
1175 | static struct page *shmem_alloc_page(gfp_t gfp, | 783 | static struct page *shmem_alloc_page(gfp_t gfp, |
1176 | struct shmem_inode_info *info, unsigned long idx) | 784 | struct shmem_inode_info *info, pgoff_t index) |
1177 | { | 785 | { |
1178 | struct vm_area_struct pvma; | 786 | struct vm_area_struct pvma; |
1179 | 787 | ||
1180 | /* Create a pseudo vma that just contains the policy */ | 788 | /* Create a pseudo vma that just contains the policy */ |
1181 | pvma.vm_start = 0; | 789 | pvma.vm_start = 0; |
1182 | pvma.vm_pgoff = idx; | 790 | pvma.vm_pgoff = index; |
1183 | pvma.vm_ops = NULL; | 791 | pvma.vm_ops = NULL; |
1184 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); | 792 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
1185 | 793 | ||
1186 | /* | 794 | /* |
1187 | * alloc_page_vma() will drop the shared policy reference | 795 | * alloc_page_vma() will drop the shared policy reference |
@@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1190 | } | 798 | } |
1191 | #else /* !CONFIG_NUMA */ | 799 | #else /* !CONFIG_NUMA */ |
1192 | #ifdef CONFIG_TMPFS | 800 | #ifdef CONFIG_TMPFS |
1193 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) | 801 | static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) |
1194 | { | 802 | { |
1195 | } | 803 | } |
1196 | #endif /* CONFIG_TMPFS */ | 804 | #endif /* CONFIG_TMPFS */ |
1197 | 805 | ||
1198 | static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, | 806 | static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
1199 | struct shmem_inode_info *info, unsigned long idx) | 807 | struct shmem_inode_info *info, pgoff_t index) |
1200 | { | 808 | { |
1201 | return swapin_readahead(entry, gfp, NULL, 0); | 809 | return swapin_readahead(swap, gfp, NULL, 0); |
1202 | } | 810 | } |
1203 | 811 | ||
1204 | static inline struct page *shmem_alloc_page(gfp_t gfp, | 812 | static inline struct page *shmem_alloc_page(gfp_t gfp, |
1205 | struct shmem_inode_info *info, unsigned long idx) | 813 | struct shmem_inode_info *info, pgoff_t index) |
1206 | { | 814 | { |
1207 | return alloc_page(gfp); | 815 | return alloc_page(gfp); |
1208 | } | 816 | } |
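The !CONFIG_NUMA stubs above keep the same signatures as the NUMA-aware versions, so shmem_getpage_gfp() never needs an #ifdef of its own. A minimal sketch of that pattern; CONFIG_NUMA here is just a macro to toggle, and the function name is invented:

#include <stdio.h>
#include <stdlib.h>

/* Toggle to mimic building with or without CONFIG_NUMA. */
/* #define CONFIG_NUMA 1 */

#ifdef CONFIG_NUMA
/* "Real" path: pretend to consult a per-index policy before allocating. */
static void *alloc_page_for_index(unsigned long index)
{
	printf("policy lookup for index %lu\n", index);
	return malloc(4096);
}
#else
/* Stub with the identical signature, so callers stay free of #ifdefs. */
static void *alloc_page_for_index(unsigned long index)
{
	(void)index;
	return malloc(4096);
}
#endif

int main(void)
{
	void *page = alloc_page_for_index(7);

	printf("allocated a page at %p\n", page);
	free(page);
	return 0;
}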
@@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
1222 | * vm. If we swap it in we mark it dirty since we also free the swap | 830 | * vm. If we swap it in we mark it dirty since we also free the swap |
1223 | * entry since a page cannot live in both the swap and page cache | 831 | * entry since a page cannot live in both the swap and page cache |
1224 | */ | 832 | */ |
1225 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, | 833 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
1226 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) | 834 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) |
1227 | { | 835 | { |
1228 | struct address_space *mapping = inode->i_mapping; | 836 | struct address_space *mapping = inode->i_mapping; |
1229 | struct shmem_inode_info *info = SHMEM_I(inode); | 837 | struct shmem_inode_info *info; |
1230 | struct shmem_sb_info *sbinfo; | 838 | struct shmem_sb_info *sbinfo; |
1231 | struct page *page; | 839 | struct page *page; |
1232 | struct page *prealloc_page = NULL; | ||
1233 | swp_entry_t *entry; | ||
1234 | swp_entry_t swap; | 840 | swp_entry_t swap; |
1235 | int error; | 841 | int error; |
1236 | int ret; | 842 | int once = 0; |
1237 | 843 | ||
1238 | if (idx >= SHMEM_MAX_INDEX) | 844 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
1239 | return -EFBIG; | 845 | return -EFBIG; |
1240 | repeat: | 846 | repeat: |
1241 | page = find_lock_page(mapping, idx); | 847 | swap.val = 0; |
1242 | if (page) { | 848 | page = find_lock_page(mapping, index); |
849 | if (radix_tree_exceptional_entry(page)) { | ||
850 | swap = radix_to_swp_entry(page); | ||
851 | page = NULL; | ||
852 | } | ||
853 | |||
854 | if (sgp != SGP_WRITE && | ||
855 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | ||
856 | error = -EINVAL; | ||
857 | goto failed; | ||
858 | } | ||
859 | |||
860 | if (page || (sgp == SGP_READ && !swap.val)) { | ||
1243 | /* | 861 | /* |
1244 | * Once we can get the page lock, it must be uptodate: | 862 | * Once we can get the page lock, it must be uptodate: |
1245 | * if there were an error in reading back from swap, | 863 | * if there were an error in reading back from swap, |
1246 | * the page would not be inserted into the filecache. | 864 | * the page would not be inserted into the filecache. |
1247 | */ | 865 | */ |
1248 | BUG_ON(!PageUptodate(page)); | 866 | BUG_ON(page && !PageUptodate(page)); |
1249 | goto done; | 867 | *pagep = page; |
868 | return 0; | ||
1250 | } | 869 | } |
1251 | 870 | ||
1252 | /* | 871 | /* |
1253 | * Try to preload while we can wait, to not make a habit of | 872 | * Fast cache lookup did not find it: |
1254 | * draining atomic reserves; but don't latch on to this cpu. | 873 | * bring it back from swap or allocate. |
1255 | */ | 874 | */ |
1256 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 875 | info = SHMEM_I(inode); |
1257 | if (error) | 876 | sbinfo = SHMEM_SB(inode->i_sb); |
1258 | goto out; | ||
1259 | radix_tree_preload_end(); | ||
1260 | |||
1261 | if (sgp != SGP_READ && !prealloc_page) { | ||
1262 | prealloc_page = shmem_alloc_page(gfp, info, idx); | ||
1263 | if (prealloc_page) { | ||
1264 | SetPageSwapBacked(prealloc_page); | ||
1265 | if (mem_cgroup_cache_charge(prealloc_page, | ||
1266 | current->mm, GFP_KERNEL)) { | ||
1267 | page_cache_release(prealloc_page); | ||
1268 | prealloc_page = NULL; | ||
1269 | } | ||
1270 | } | ||
1271 | } | ||
1272 | |||
1273 | spin_lock(&info->lock); | ||
1274 | shmem_recalc_inode(inode); | ||
1275 | entry = shmem_swp_alloc(info, idx, sgp, gfp); | ||
1276 | if (IS_ERR(entry)) { | ||
1277 | spin_unlock(&info->lock); | ||
1278 | error = PTR_ERR(entry); | ||
1279 | goto out; | ||
1280 | } | ||
1281 | swap = *entry; | ||
1282 | 877 | ||
1283 | if (swap.val) { | 878 | if (swap.val) { |
1284 | /* Look it up and read it in.. */ | 879 | /* Look it up and read it in.. */ |
1285 | page = lookup_swap_cache(swap); | 880 | page = lookup_swap_cache(swap); |
1286 | if (!page) { | 881 | if (!page) { |
1287 | shmem_swp_unmap(entry); | ||
1288 | spin_unlock(&info->lock); | ||
1289 | /* here we actually do the io */ | 882 | /* here we actually do the io */ |
1290 | if (fault_type) | 883 | if (fault_type) |
1291 | *fault_type |= VM_FAULT_MAJOR; | 884 | *fault_type |= VM_FAULT_MAJOR; |
1292 | page = shmem_swapin(swap, gfp, info, idx); | 885 | page = shmem_swapin(swap, gfp, info, index); |
1293 | if (!page) { | 886 | if (!page) { |
1294 | spin_lock(&info->lock); | 887 | error = -ENOMEM; |
1295 | entry = shmem_swp_alloc(info, idx, sgp, gfp); | 888 | goto failed; |
1296 | if (IS_ERR(entry)) | ||
1297 | error = PTR_ERR(entry); | ||
1298 | else { | ||
1299 | if (entry->val == swap.val) | ||
1300 | error = -ENOMEM; | ||
1301 | shmem_swp_unmap(entry); | ||
1302 | } | ||
1303 | spin_unlock(&info->lock); | ||
1304 | if (error) | ||
1305 | goto out; | ||
1306 | goto repeat; | ||
1307 | } | 889 | } |
1308 | wait_on_page_locked(page); | ||
1309 | page_cache_release(page); | ||
1310 | goto repeat; | ||
1311 | } | 890 | } |
1312 | 891 | ||
1313 | /* We have to do this with page locked to prevent races */ | 892 | /* We have to do this with page locked to prevent races */ |
1314 | if (!trylock_page(page)) { | 893 | lock_page(page); |
1315 | shmem_swp_unmap(entry); | ||
1316 | spin_unlock(&info->lock); | ||
1317 | wait_on_page_locked(page); | ||
1318 | page_cache_release(page); | ||
1319 | goto repeat; | ||
1320 | } | ||
1321 | if (PageWriteback(page)) { | ||
1322 | shmem_swp_unmap(entry); | ||
1323 | spin_unlock(&info->lock); | ||
1324 | wait_on_page_writeback(page); | ||
1325 | unlock_page(page); | ||
1326 | page_cache_release(page); | ||
1327 | goto repeat; | ||
1328 | } | ||
1329 | if (!PageUptodate(page)) { | 894 | if (!PageUptodate(page)) { |
1330 | shmem_swp_unmap(entry); | ||
1331 | spin_unlock(&info->lock); | ||
1332 | unlock_page(page); | ||
1333 | page_cache_release(page); | ||
1334 | error = -EIO; | 895 | error = -EIO; |
1335 | goto out; | 896 | goto failed; |
1336 | } | 897 | } |
1337 | 898 | wait_on_page_writeback(page); | |
1338 | error = add_to_page_cache_locked(page, mapping, | 899 | |
1339 | idx, GFP_NOWAIT); | 900 | /* Someone may have already done it for us */ |
1340 | if (error) { | 901 | if (page->mapping) { |
1341 | shmem_swp_unmap(entry); | 902 | if (page->mapping == mapping && |
1342 | spin_unlock(&info->lock); | 903 | page->index == index) |
1343 | if (error == -ENOMEM) { | 904 | goto done; |
1344 | /* | 905 | error = -EEXIST; |
1345 | * reclaim from proper memory cgroup and | 906 | goto failed; |
1346 | * call memcg's OOM if needed. | ||
1347 | */ | ||
1348 | error = mem_cgroup_shmem_charge_fallback( | ||
1349 | page, current->mm, gfp); | ||
1350 | if (error) { | ||
1351 | unlock_page(page); | ||
1352 | page_cache_release(page); | ||
1353 | goto out; | ||
1354 | } | ||
1355 | } | ||
1356 | unlock_page(page); | ||
1357 | page_cache_release(page); | ||
1358 | goto repeat; | ||
1359 | } | 907 | } |
1360 | 908 | ||
1361 | info->flags |= SHMEM_PAGEIN; | 909 | error = mem_cgroup_cache_charge(page, current->mm, |
1362 | shmem_swp_set(info, entry, 0); | 910 | gfp & GFP_RECLAIM_MASK); |
1363 | shmem_swp_unmap(entry); | 911 | if (!error) |
1364 | delete_from_swap_cache(page); | 912 | error = shmem_add_to_page_cache(page, mapping, index, |
913 | gfp, swp_to_radix_entry(swap)); | ||
914 | if (error) | ||
915 | goto failed; | ||
916 | |||
917 | spin_lock(&info->lock); | ||
918 | info->swapped--; | ||
919 | shmem_recalc_inode(inode); | ||
1365 | spin_unlock(&info->lock); | 920 | spin_unlock(&info->lock); |
921 | |||
922 | delete_from_swap_cache(page); | ||
1366 | set_page_dirty(page); | 923 | set_page_dirty(page); |
1367 | swap_free(swap); | 924 | swap_free(swap); |
1368 | 925 | ||
1369 | } else if (sgp == SGP_READ) { | 926 | } else { |
1370 | shmem_swp_unmap(entry); | 927 | if (shmem_acct_block(info->flags)) { |
1371 | page = find_get_page(mapping, idx); | 928 | error = -ENOSPC; |
1372 | if (page && !trylock_page(page)) { | 929 | goto failed; |
1373 | spin_unlock(&info->lock); | ||
1374 | wait_on_page_locked(page); | ||
1375 | page_cache_release(page); | ||
1376 | goto repeat; | ||
1377 | } | 930 | } |
1378 | spin_unlock(&info->lock); | ||
1379 | |||
1380 | } else if (prealloc_page) { | ||
1381 | shmem_swp_unmap(entry); | ||
1382 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1383 | if (sbinfo->max_blocks) { | 931 | if (sbinfo->max_blocks) { |
1384 | if (percpu_counter_compare(&sbinfo->used_blocks, | 932 | if (percpu_counter_compare(&sbinfo->used_blocks, |
1385 | sbinfo->max_blocks) >= 0 || | 933 | sbinfo->max_blocks) >= 0) { |
1386 | shmem_acct_block(info->flags)) | 934 | error = -ENOSPC; |
1387 | goto nospace; | 935 | goto unacct; |
936 | } | ||
1388 | percpu_counter_inc(&sbinfo->used_blocks); | 937 | percpu_counter_inc(&sbinfo->used_blocks); |
1389 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
1390 | } else if (shmem_acct_block(info->flags)) | ||
1391 | goto nospace; | ||
1392 | |||
1393 | page = prealloc_page; | ||
1394 | prealloc_page = NULL; | ||
1395 | |||
1396 | entry = shmem_swp_alloc(info, idx, sgp, gfp); | ||
1397 | if (IS_ERR(entry)) | ||
1398 | error = PTR_ERR(entry); | ||
1399 | else { | ||
1400 | swap = *entry; | ||
1401 | shmem_swp_unmap(entry); | ||
1402 | } | 938 | } |
1403 | ret = error || swap.val; | 939 | |
1404 | if (ret) | 940 | page = shmem_alloc_page(gfp, info, index); |
1405 | mem_cgroup_uncharge_cache_page(page); | 941 | if (!page) { |
1406 | else | 942 | error = -ENOMEM; |
1407 | ret = add_to_page_cache_lru(page, mapping, | 943 | goto decused; |
1408 | idx, GFP_NOWAIT); | ||
1409 | /* | ||
1410 | * At add_to_page_cache_lru() failure, | ||
1411 | * uncharge will be done automatically. | ||
1412 | */ | ||
1413 | if (ret) { | ||
1414 | shmem_unacct_blocks(info->flags, 1); | ||
1415 | shmem_free_blocks(inode, 1); | ||
1416 | spin_unlock(&info->lock); | ||
1417 | page_cache_release(page); | ||
1418 | if (error) | ||
1419 | goto out; | ||
1420 | goto repeat; | ||
1421 | } | 944 | } |
1422 | 945 | ||
1423 | info->flags |= SHMEM_PAGEIN; | 946 | SetPageSwapBacked(page); |
947 | __set_page_locked(page); | ||
948 | error = mem_cgroup_cache_charge(page, current->mm, | ||
949 | gfp & GFP_RECLAIM_MASK); | ||
950 | if (!error) | ||
951 | error = shmem_add_to_page_cache(page, mapping, index, | ||
952 | gfp, NULL); | ||
953 | if (error) | ||
954 | goto decused; | ||
955 | lru_cache_add_anon(page); | ||
956 | |||
957 | spin_lock(&info->lock); | ||
1424 | info->alloced++; | 958 | info->alloced++; |
959 | inode->i_blocks += BLOCKS_PER_PAGE; | ||
960 | shmem_recalc_inode(inode); | ||
1425 | spin_unlock(&info->lock); | 961 | spin_unlock(&info->lock); |
962 | |||
1426 | clear_highpage(page); | 963 | clear_highpage(page); |
1427 | flush_dcache_page(page); | 964 | flush_dcache_page(page); |
1428 | SetPageUptodate(page); | 965 | SetPageUptodate(page); |
1429 | if (sgp == SGP_DIRTY) | 966 | if (sgp == SGP_DIRTY) |
1430 | set_page_dirty(page); | 967 | set_page_dirty(page); |
1431 | |||
1432 | } else { | ||
1433 | spin_unlock(&info->lock); | ||
1434 | error = -ENOMEM; | ||
1435 | goto out; | ||
1436 | } | 968 | } |
1437 | done: | 969 | done: |
1438 | *pagep = page; | 970 | /* Perhaps the file has been truncated since we checked */ |
1439 | error = 0; | 971 | if (sgp != SGP_WRITE && |
1440 | out: | 972 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
1441 | if (prealloc_page) { | 973 | error = -EINVAL; |
1442 | mem_cgroup_uncharge_cache_page(prealloc_page); | 974 | goto trunc; |
1443 | page_cache_release(prealloc_page); | ||
1444 | } | 975 | } |
1445 | return error; | 976 | *pagep = page; |
977 | return 0; | ||
1446 | 978 | ||
1447 | nospace: | ||
1448 | /* | 979 | /* |
1449 | * Perhaps the page was brought in from swap between find_lock_page | 980 | * Error recovery. |
1450 | * and taking info->lock? We allow for that at add_to_page_cache_lru, | ||
1451 | * but must also avoid reporting a spurious ENOSPC while working on a | ||
1452 | * full tmpfs. | ||
1453 | */ | 981 | */ |
1454 | page = find_get_page(mapping, idx); | 982 | trunc: |
983 | ClearPageDirty(page); | ||
984 | delete_from_page_cache(page); | ||
985 | spin_lock(&info->lock); | ||
986 | info->alloced--; | ||
987 | inode->i_blocks -= BLOCKS_PER_PAGE; | ||
1455 | spin_unlock(&info->lock); | 988 | spin_unlock(&info->lock); |
989 | decused: | ||
990 | if (sbinfo->max_blocks) | ||
991 | percpu_counter_add(&sbinfo->used_blocks, -1); | ||
992 | unacct: | ||
993 | shmem_unacct_blocks(info->flags, 1); | ||
994 | failed: | ||
995 | if (swap.val && error != -EINVAL) { | ||
996 | struct page *test = find_get_page(mapping, index); | ||
997 | if (test && !radix_tree_exceptional_entry(test)) | ||
998 | page_cache_release(test); | ||
999 | /* Have another try if the entry has changed */ | ||
1000 | if (test != swp_to_radix_entry(swap)) | ||
1001 | error = -EEXIST; | ||
1002 | } | ||
1456 | if (page) { | 1003 | if (page) { |
1004 | unlock_page(page); | ||
1457 | page_cache_release(page); | 1005 | page_cache_release(page); |
1006 | } | ||
1007 | if (error == -ENOSPC && !once++) { | ||
1008 | info = SHMEM_I(inode); | ||
1009 | spin_lock(&info->lock); | ||
1010 | shmem_recalc_inode(inode); | ||
1011 | spin_unlock(&info->lock); | ||
1458 | goto repeat; | 1012 | goto repeat; |
1459 | } | 1013 | } |
1460 | error = -ENOSPC; | 1014 | if (error == -EEXIST) |
1461 | goto out; | 1015 | goto repeat; |
1016 | return error; | ||
1462 | } | 1017 | } |
1463 | 1018 | ||
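shmem_getpage_gfp() now unwinds failures through the trunc/decused/unacct/failed labels, releasing in reverse order exactly what had been acquired. The sketch below shows the same goto-ladder shape in user space, with invented names and a flag to force the failure path:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int used_blocks;                        /* stands in for sbinfo->used_blocks */

/*
 * Every acquired resource has its own unwind label, taken in reverse order
 * on failure, mirroring the decused/unacct/failed ladder above.
 */
static int get_page_like(void **pagep, int fail_insert)
{
	void *page;
	int error;

	used_blocks++;                         /* charge the block first */

	page = malloc(4096);                   /* like shmem_alloc_page() */
	if (!page) {
		error = -ENOMEM;
		goto decused;
	}

	if (fail_insert) {                     /* pretend the cache insert failed */
		error = -EEXIST;
		goto freepage;
	}

	*pagep = page;                         /* success: page handed to the caller */
	return 0;

freepage:
	free(page);
decused:
	used_blocks--;                         /* undo the accounting */
	return error;
}

int main(void)
{
	void *page = NULL;

	printf("forced failure: %d (used_blocks back to %d)\n",
	       get_page_like(&page, 1), used_blocks);
	printf("success: %d (used_blocks now %d)\n",
	       get_page_like(&page, 0), used_blocks);
	free(page);
	return 0;
}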
1464 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 1019 | static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
@@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1467 | int error; | 1022 | int error; |
1468 | int ret = VM_FAULT_LOCKED; | 1023 | int ret = VM_FAULT_LOCKED; |
1469 | 1024 | ||
1470 | if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
1471 | return VM_FAULT_SIGBUS; | ||
1472 | |||
1473 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); | 1025 | error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); |
1474 | if (error) | 1026 | if (error) |
1475 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1027 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
@@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1482 | } | 1034 | } |
1483 | 1035 | ||
1484 | #ifdef CONFIG_NUMA | 1036 | #ifdef CONFIG_NUMA |
1485 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1037 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) |
1486 | { | 1038 | { |
1487 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1039 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1488 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1040 | return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); |
1489 | } | 1041 | } |
1490 | 1042 | ||
1491 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, | 1043 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
1492 | unsigned long addr) | 1044 | unsigned long addr) |
1493 | { | 1045 | { |
1494 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1046 | struct inode *inode = vma->vm_file->f_path.dentry->d_inode; |
1495 | unsigned long idx; | 1047 | pgoff_t index; |
1496 | 1048 | ||
1497 | idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 1049 | index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
1498 | return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); | 1050 | return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); |
1499 | } | 1051 | } |
1500 | #endif | 1052 | #endif |
1501 | 1053 | ||
@@ -1593,7 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1593 | 1145 | ||
1594 | #ifdef CONFIG_TMPFS | 1146 | #ifdef CONFIG_TMPFS |
1595 | static const struct inode_operations shmem_symlink_inode_operations; | 1147 | static const struct inode_operations shmem_symlink_inode_operations; |
1596 | static const struct inode_operations shmem_symlink_inline_operations; | 1148 | static const struct inode_operations shmem_short_symlink_operations; |
1597 | 1149 | ||
1598 | static int | 1150 | static int |
1599 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1151 | shmem_write_begin(struct file *file, struct address_space *mapping, |
@@ -1626,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1626 | { | 1178 | { |
1627 | struct inode *inode = filp->f_path.dentry->d_inode; | 1179 | struct inode *inode = filp->f_path.dentry->d_inode; |
1628 | struct address_space *mapping = inode->i_mapping; | 1180 | struct address_space *mapping = inode->i_mapping; |
1629 | unsigned long index, offset; | 1181 | pgoff_t index; |
1182 | unsigned long offset; | ||
1630 | enum sgp_type sgp = SGP_READ; | 1183 | enum sgp_type sgp = SGP_READ; |
1631 | 1184 | ||
1632 | /* | 1185 | /* |
@@ -1642,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
1642 | 1195 | ||
1643 | for (;;) { | 1196 | for (;;) { |
1644 | struct page *page = NULL; | 1197 | struct page *page = NULL; |
1645 | unsigned long end_index, nr, ret; | 1198 | pgoff_t end_index; |
1199 | unsigned long nr, ret; | ||
1646 | loff_t i_size = i_size_read(inode); | 1200 | loff_t i_size = i_size_read(inode); |
1647 | 1201 | ||
1648 | end_index = i_size >> PAGE_CACHE_SHIFT; | 1202 | end_index = i_size >> PAGE_CACHE_SHIFT; |
@@ -1880,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
1880 | buf->f_namelen = NAME_MAX; | 1434 | buf->f_namelen = NAME_MAX; |
1881 | if (sbinfo->max_blocks) { | 1435 | if (sbinfo->max_blocks) { |
1882 | buf->f_blocks = sbinfo->max_blocks; | 1436 | buf->f_blocks = sbinfo->max_blocks; |
1883 | buf->f_bavail = buf->f_bfree = | 1437 | buf->f_bavail = |
1884 | sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); | 1438 | buf->f_bfree = sbinfo->max_blocks - |
1439 | percpu_counter_sum(&sbinfo->used_blocks); | ||
1885 | } | 1440 | } |
1886 | if (sbinfo->max_inodes) { | 1441 | if (sbinfo->max_inodes) { |
1887 | buf->f_files = sbinfo->max_inodes; | 1442 | buf->f_files = sbinfo->max_inodes; |
@@ -2055,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2055 | 1610 | ||
2056 | info = SHMEM_I(inode); | 1611 | info = SHMEM_I(inode); |
2057 | inode->i_size = len-1; | 1612 | inode->i_size = len-1; |
2058 | if (len <= SHMEM_SYMLINK_INLINE_LEN) { | 1613 | if (len <= SHORT_SYMLINK_LEN) { |
2059 | /* do it inline */ | 1614 | info->symlink = kmemdup(symname, len, GFP_KERNEL); |
2060 | memcpy(info->inline_symlink, symname, len); | 1615 | if (!info->symlink) { |
2061 | inode->i_op = &shmem_symlink_inline_operations; | 1616 | iput(inode); |
1617 | return -ENOMEM; | ||
1618 | } | ||
1619 | inode->i_op = &shmem_short_symlink_operations; | ||
2062 | } else { | 1620 | } else { |
2063 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); | 1621 | error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); |
2064 | if (error) { | 1622 | if (error) { |
@@ -2081,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
2081 | return 0; | 1639 | return 0; |
2082 | } | 1640 | } |
2083 | 1641 | ||
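Short symlink targets are now kept in a kmemdup()'d buffer hung off the inode, while longer ones still go through the page cache. A rough userspace sketch of the allocate-and-copy half of that decision; the 128-byte cut-off is an assumption standing in for SHORT_SYMLINK_LEN.

#include <stdlib.h>
#include <string.h>

#define SHORT_LINK_MAX 128            /* assumed cut-off */

/* Return a private copy of a short target, or NULL if it is too long or OOM. */
static char *dup_short_target(const char *target)
{
	size_t len = strlen(target) + 1;  /* the copy keeps its trailing NUL */
	char *copy;

	if (len > SHORT_LINK_MAX)
		return NULL;              /* caller falls back to out-of-line storage */
	copy = malloc(len);               /* kmemdup() is an allocate+memcpy in one call */
	if (copy)
		memcpy(copy, target, len);
	return copy;
}

int main(void)
{
	char *c = dup_short_target("some/short/target");
	free(c);
	return 0;
}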
2084 | static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) | 1642 | static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) |
2085 | { | 1643 | { |
2086 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); | 1644 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); |
2087 | return NULL; | 1645 | return NULL; |
2088 | } | 1646 | } |
2089 | 1647 | ||
2090 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) | 1648 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) |
2091 | { | 1649 | { |
2092 | struct page *page = NULL; | 1650 | struct page *page = NULL; |
2093 | int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); | 1651 | int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); |
2094 | nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); | 1652 | nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); |
2095 | if (page) | 1653 | if (page) |
2096 | unlock_page(page); | 1654 | unlock_page(page); |
2097 | return page; | 1655 | return page; |
@@ -2202,7 +1760,6 @@ out: | |||
2202 | return err; | 1760 | return err; |
2203 | } | 1761 | } |
2204 | 1762 | ||
2205 | |||
2206 | static const struct xattr_handler *shmem_xattr_handlers[] = { | 1763 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2207 | #ifdef CONFIG_TMPFS_POSIX_ACL | 1764 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2208 | &generic_acl_access_handler, | 1765 | &generic_acl_access_handler, |
@@ -2332,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | |||
2332 | } | 1889 | } |
2333 | #endif /* CONFIG_TMPFS_XATTR */ | 1890 | #endif /* CONFIG_TMPFS_XATTR */ |
2334 | 1891 | ||
2335 | static const struct inode_operations shmem_symlink_inline_operations = { | 1892 | static const struct inode_operations shmem_short_symlink_operations = { |
2336 | .readlink = generic_readlink, | 1893 | .readlink = generic_readlink, |
2337 | .follow_link = shmem_follow_link_inline, | 1894 | .follow_link = shmem_follow_short_symlink, |
2338 | #ifdef CONFIG_TMPFS_XATTR | 1895 | #ifdef CONFIG_TMPFS_XATTR |
2339 | .setxattr = shmem_setxattr, | 1896 | .setxattr = shmem_setxattr, |
2340 | .getxattr = shmem_getxattr, | 1897 | .getxattr = shmem_getxattr, |
@@ -2534,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2534 | if (config.max_inodes < inodes) | 2091 | if (config.max_inodes < inodes) |
2535 | goto out; | 2092 | goto out; |
2536 | /* | 2093 | /* |
2537 | * Those tests also disallow limited->unlimited while any are in | 2094 | * Those tests disallow limited->unlimited while any are in use; |
2538 | * use, so i_blocks will always be zero when max_blocks is zero; | ||
2539 | * but we must separately disallow unlimited->limited, because | 2095 | * but we must separately disallow unlimited->limited, because |
2540 | * in that case we have no record of how much is already in use. | 2096 | * in that case we have no record of how much is already in use. |
2541 | */ | 2097 | */ |
@@ -2627,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2627 | goto failed; | 2183 | goto failed; |
2628 | sbinfo->free_inodes = sbinfo->max_inodes; | 2184 | sbinfo->free_inodes = sbinfo->max_inodes; |
2629 | 2185 | ||
2630 | sb->s_maxbytes = SHMEM_MAX_BYTES; | 2186 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
2631 | sb->s_blocksize = PAGE_CACHE_SIZE; | 2187 | sb->s_blocksize = PAGE_CACHE_SIZE; |
2632 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; | 2188 | sb->s_blocksize_bits = PAGE_CACHE_SHIFT; |
2633 | sb->s_magic = TMPFS_MAGIC; | 2189 | sb->s_magic = TMPFS_MAGIC; |
@@ -2662,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep; | |||
2662 | 2218 | ||
2663 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2219 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
2664 | { | 2220 | { |
2665 | struct shmem_inode_info *p; | 2221 | struct shmem_inode_info *info; |
2666 | p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); | 2222 | info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); |
2667 | if (!p) | 2223 | if (!info) |
2668 | return NULL; | 2224 | return NULL; |
2669 | return &p->vfs_inode; | 2225 | return &info->vfs_inode; |
2670 | } | 2226 | } |
2671 | 2227 | ||
2672 | static void shmem_i_callback(struct rcu_head *head) | 2228 | static void shmem_destroy_callback(struct rcu_head *head) |
2673 | { | 2229 | { |
2674 | struct inode *inode = container_of(head, struct inode, i_rcu); | 2230 | struct inode *inode = container_of(head, struct inode, i_rcu); |
2675 | INIT_LIST_HEAD(&inode->i_dentry); | 2231 | INIT_LIST_HEAD(&inode->i_dentry); |
@@ -2678,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head) | |||
2678 | 2234 | ||
2679 | static void shmem_destroy_inode(struct inode *inode) | 2235 | static void shmem_destroy_inode(struct inode *inode) |
2680 | { | 2236 | { |
2681 | if ((inode->i_mode & S_IFMT) == S_IFREG) { | 2237 | if ((inode->i_mode & S_IFMT) == S_IFREG) |
2682 | /* only struct inode is valid if it's an inline symlink */ | ||
2683 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2238 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
2684 | } | 2239 | call_rcu(&inode->i_rcu, shmem_destroy_callback); |
2685 | call_rcu(&inode->i_rcu, shmem_i_callback); | ||
2686 | } | 2240 | } |
2687 | 2241 | ||
2688 | static void init_once(void *foo) | 2242 | static void shmem_init_inode(void *foo) |
2689 | { | 2243 | { |
2690 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2244 | struct shmem_inode_info *info = foo; |
2691 | 2245 | inode_init_once(&info->vfs_inode); | |
2692 | inode_init_once(&p->vfs_inode); | ||
2693 | } | 2246 | } |
2694 | 2247 | ||
2695 | static int init_inodecache(void) | 2248 | static int shmem_init_inodecache(void) |
2696 | { | 2249 | { |
2697 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", | 2250 | shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", |
2698 | sizeof(struct shmem_inode_info), | 2251 | sizeof(struct shmem_inode_info), |
2699 | 0, SLAB_PANIC, init_once); | 2252 | 0, SLAB_PANIC, shmem_init_inode); |
2700 | return 0; | 2253 | return 0; |
2701 | } | 2254 | } |
2702 | 2255 | ||
2703 | static void destroy_inodecache(void) | 2256 | static void shmem_destroy_inodecache(void) |
2704 | { | 2257 | { |
2705 | kmem_cache_destroy(shmem_inode_cachep); | 2258 | kmem_cache_destroy(shmem_inode_cachep); |
2706 | } | 2259 | } |
@@ -2797,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
2797 | #endif | 2350 | #endif |
2798 | }; | 2351 | }; |
2799 | 2352 | ||
2800 | |||
2801 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 2353 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
2802 | int flags, const char *dev_name, void *data) | 2354 | int flags, const char *dev_name, void *data) |
2803 | { | 2355 | { |
2804 | return mount_nodev(fs_type, flags, data, shmem_fill_super); | 2356 | return mount_nodev(fs_type, flags, data, shmem_fill_super); |
2805 | } | 2357 | } |
2806 | 2358 | ||
2807 | static struct file_system_type tmpfs_fs_type = { | 2359 | static struct file_system_type shmem_fs_type = { |
2808 | .owner = THIS_MODULE, | 2360 | .owner = THIS_MODULE, |
2809 | .name = "tmpfs", | 2361 | .name = "tmpfs", |
2810 | .mount = shmem_mount, | 2362 | .mount = shmem_mount, |
2811 | .kill_sb = kill_litter_super, | 2363 | .kill_sb = kill_litter_super, |
2812 | }; | 2364 | }; |
2813 | 2365 | ||
2814 | int __init init_tmpfs(void) | 2366 | int __init shmem_init(void) |
2815 | { | 2367 | { |
2816 | int error; | 2368 | int error; |
2817 | 2369 | ||
@@ -2819,18 +2371,18 @@ int __init init_tmpfs(void) | |||
2819 | if (error) | 2371 | if (error) |
2820 | goto out4; | 2372 | goto out4; |
2821 | 2373 | ||
2822 | error = init_inodecache(); | 2374 | error = shmem_init_inodecache(); |
2823 | if (error) | 2375 | if (error) |
2824 | goto out3; | 2376 | goto out3; |
2825 | 2377 | ||
2826 | error = register_filesystem(&tmpfs_fs_type); | 2378 | error = register_filesystem(&shmem_fs_type); |
2827 | if (error) { | 2379 | if (error) { |
2828 | printk(KERN_ERR "Could not register tmpfs\n"); | 2380 | printk(KERN_ERR "Could not register tmpfs\n"); |
2829 | goto out2; | 2381 | goto out2; |
2830 | } | 2382 | } |
2831 | 2383 | ||
2832 | shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, | 2384 | shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, |
2833 | tmpfs_fs_type.name, NULL); | 2385 | shmem_fs_type.name, NULL); |
2834 | if (IS_ERR(shm_mnt)) { | 2386 | if (IS_ERR(shm_mnt)) { |
2835 | error = PTR_ERR(shm_mnt); | 2387 | error = PTR_ERR(shm_mnt); |
2836 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); | 2388 | printk(KERN_ERR "Could not kern_mount tmpfs\n"); |
@@ -2839,9 +2391,9 @@ int __init init_tmpfs(void) | |||
2839 | return 0; | 2391 | return 0; |
2840 | 2392 | ||
2841 | out1: | 2393 | out1: |
2842 | unregister_filesystem(&tmpfs_fs_type); | 2394 | unregister_filesystem(&shmem_fs_type); |
2843 | out2: | 2395 | out2: |
2844 | destroy_inodecache(); | 2396 | shmem_destroy_inodecache(); |
2845 | out3: | 2397 | out3: |
2846 | bdi_destroy(&shmem_backing_dev_info); | 2398 | bdi_destroy(&shmem_backing_dev_info); |
2847 | out4: | 2399 | out4: |
@@ -2849,45 +2401,6 @@ out4: | |||
2849 | return error; | 2401 | return error; |
2850 | } | 2402 | } |
2851 | 2403 | ||
2852 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2853 | /** | ||
2854 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2855 | * @inode: the inode to be searched | ||
2856 | * @pgoff: the offset to be searched | ||
2857 | * @pagep: the pointer for the found page to be stored | ||
2858 | * @ent: the pointer for the found swap entry to be stored | ||
2859 | * | ||
2860 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2861 | * these refcount. | ||
2862 | */ | ||
2863 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2864 | struct page **pagep, swp_entry_t *ent) | ||
2865 | { | ||
2866 | swp_entry_t entry = { .val = 0 }, *ptr; | ||
2867 | struct page *page = NULL; | ||
2868 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
2869 | |||
2870 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2871 | goto out; | ||
2872 | |||
2873 | spin_lock(&info->lock); | ||
2874 | ptr = shmem_swp_entry(info, pgoff, NULL); | ||
2875 | #ifdef CONFIG_SWAP | ||
2876 | if (ptr && ptr->val) { | ||
2877 | entry.val = ptr->val; | ||
2878 | page = find_get_page(&swapper_space, entry.val); | ||
2879 | } else | ||
2880 | #endif | ||
2881 | page = find_get_page(inode->i_mapping, pgoff); | ||
2882 | if (ptr) | ||
2883 | shmem_swp_unmap(ptr); | ||
2884 | spin_unlock(&info->lock); | ||
2885 | out: | ||
2886 | *pagep = page; | ||
2887 | *ent = entry; | ||
2888 | } | ||
2889 | #endif | ||
2890 | |||
2891 | #else /* !CONFIG_SHMEM */ | 2404 | #else /* !CONFIG_SHMEM */ |
2892 | 2405 | ||
2893 | /* | 2406 | /* |
@@ -2901,23 +2414,23 @@ out: | |||
2901 | 2414 | ||
2902 | #include <linux/ramfs.h> | 2415 | #include <linux/ramfs.h> |
2903 | 2416 | ||
2904 | static struct file_system_type tmpfs_fs_type = { | 2417 | static struct file_system_type shmem_fs_type = { |
2905 | .name = "tmpfs", | 2418 | .name = "tmpfs", |
2906 | .mount = ramfs_mount, | 2419 | .mount = ramfs_mount, |
2907 | .kill_sb = kill_litter_super, | 2420 | .kill_sb = kill_litter_super, |
2908 | }; | 2421 | }; |
2909 | 2422 | ||
2910 | int __init init_tmpfs(void) | 2423 | int __init shmem_init(void) |
2911 | { | 2424 | { |
2912 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 2425 | BUG_ON(register_filesystem(&shmem_fs_type) != 0); |
2913 | 2426 | ||
2914 | shm_mnt = kern_mount(&tmpfs_fs_type); | 2427 | shm_mnt = kern_mount(&shmem_fs_type); |
2915 | BUG_ON(IS_ERR(shm_mnt)); | 2428 | BUG_ON(IS_ERR(shm_mnt)); |
2916 | 2429 | ||
2917 | return 0; | 2430 | return 0; |
2918 | } | 2431 | } |
2919 | 2432 | ||
2920 | int shmem_unuse(swp_entry_t entry, struct page *page) | 2433 | int shmem_unuse(swp_entry_t swap, struct page *page) |
2921 | { | 2434 | { |
2922 | return 0; | 2435 | return 0; |
2923 | } | 2436 | } |
@@ -2927,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2927 | return 0; | 2440 | return 0; |
2928 | } | 2441 | } |
2929 | 2442 | ||
2930 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 2443 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
2931 | { | 2444 | { |
2932 | truncate_inode_pages_range(inode->i_mapping, start, end); | 2445 | truncate_inode_pages_range(inode->i_mapping, lstart, lend); |
2933 | } | 2446 | } |
2934 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 2447 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
2935 | 2448 | ||
2936 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2937 | /** | ||
2938 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2939 | * @inode: the inode to be searched | ||
2940 | * @pgoff: the offset to be searched | ||
2941 | * @pagep: the pointer for the found page to be stored | ||
2942 | * @ent: the pointer for the found swap entry to be stored | ||
2943 | * | ||
2944 | * If a page is found, refcount of it is incremented. Callers should handle | ||
2945 | * these refcount. | ||
2946 | */ | ||
2947 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2948 | struct page **pagep, swp_entry_t *ent) | ||
2949 | { | ||
2950 | struct page *page = NULL; | ||
2951 | |||
2952 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2953 | goto out; | ||
2954 | page = find_get_page(inode->i_mapping, pgoff); | ||
2955 | out: | ||
2956 | *pagep = page; | ||
2957 | *ent = (swp_entry_t){ .val = 0 }; | ||
2958 | } | ||
2959 | #endif | ||
2960 | |||
2961 | #define shmem_vm_ops generic_file_vm_ops | 2449 | #define shmem_vm_ops generic_file_vm_ops |
2962 | #define shmem_file_operations ramfs_file_operations | 2450 | #define shmem_file_operations ramfs_file_operations |
2963 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) | 2451 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) |
2964 | #define shmem_acct_size(flags, size) 0 | 2452 | #define shmem_acct_size(flags, size) 0 |
2965 | #define shmem_unacct_size(flags, size) do {} while (0) | 2453 | #define shmem_unacct_size(flags, size) do {} while (0) |
2966 | #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE | ||
2967 | 2454 | ||
2968 | #endif /* CONFIG_SHMEM */ | 2455 | #endif /* CONFIG_SHMEM */ |
2969 | 2456 | ||
@@ -2987,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2987 | if (IS_ERR(shm_mnt)) | 2474 | if (IS_ERR(shm_mnt)) |
2988 | return (void *)shm_mnt; | 2475 | return (void *)shm_mnt; |
2989 | 2476 | ||
2990 | if (size < 0 || size > SHMEM_MAX_BYTES) | 2477 | if (size < 0 || size > MAX_LFS_FILESIZE) |
2991 | return ERR_PTR(-EINVAL); | 2478 | return ERR_PTR(-EINVAL); |
2992 | 2479 | ||
2993 | if (shmem_acct_size(flags, size)) | 2480 | if (shmem_acct_size(flags, size)) |
@@ -622,6 +622,51 @@ int slab_is_available(void) | |||
622 | static struct lock_class_key on_slab_l3_key; | 622 | static struct lock_class_key on_slab_l3_key; |
623 | static struct lock_class_key on_slab_alc_key; | 623 | static struct lock_class_key on_slab_alc_key; |
624 | 624 | ||
625 | static struct lock_class_key debugobj_l3_key; | ||
626 | static struct lock_class_key debugobj_alc_key; | ||
627 | |||
628 | static void slab_set_lock_classes(struct kmem_cache *cachep, | ||
629 | struct lock_class_key *l3_key, struct lock_class_key *alc_key, | ||
630 | int q) | ||
631 | { | ||
632 | struct array_cache **alc; | ||
633 | struct kmem_list3 *l3; | ||
634 | int r; | ||
635 | |||
636 | l3 = cachep->nodelists[q]; | ||
637 | if (!l3) | ||
638 | return; | ||
639 | |||
640 | lockdep_set_class(&l3->list_lock, l3_key); | ||
641 | alc = l3->alien; | ||
642 | /* | ||
643 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
644 | * should go away when common slab code is taught to | ||
645 | * work even without alien caches. | ||
646 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
647 | * for alloc_alien_cache, | ||
648 | */ | ||
649 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
650 | return; | ||
651 | for_each_node(r) { | ||
652 | if (alc[r]) | ||
653 | lockdep_set_class(&alc[r]->lock, alc_key); | ||
654 | } | ||
655 | } | ||
656 | |||
657 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
658 | { | ||
659 | slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); | ||
660 | } | ||
661 | |||
662 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
663 | { | ||
664 | int node; | ||
665 | |||
666 | for_each_online_node(node) | ||
667 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
668 | } | ||
669 | |||
625 | static void init_node_lock_keys(int q) | 670 | static void init_node_lock_keys(int q) |
626 | { | 671 | { |
627 | struct cache_sizes *s = malloc_sizes; | 672 | struct cache_sizes *s = malloc_sizes; |
@@ -630,29 +675,14 @@ static void init_node_lock_keys(int q) | |||
630 | return; | 675 | return; |
631 | 676 | ||
632 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { | 677 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
633 | struct array_cache **alc; | ||
634 | struct kmem_list3 *l3; | 678 | struct kmem_list3 *l3; |
635 | int r; | ||
636 | 679 | ||
637 | l3 = s->cs_cachep->nodelists[q]; | 680 | l3 = s->cs_cachep->nodelists[q]; |
638 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 681 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
639 | continue; | 682 | continue; |
640 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 683 | |
641 | alc = l3->alien; | 684 | slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, |
642 | /* | 685 | &on_slab_alc_key, q); |
643 | * FIXME: This check for BAD_ALIEN_MAGIC | ||
644 | * should go away when common slab code is taught to | ||
645 | * work even without alien caches. | ||
646 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | ||
647 | * for alloc_alien_cache, | ||
648 | */ | ||
649 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | ||
650 | continue; | ||
651 | for_each_node(r) { | ||
652 | if (alc[r]) | ||
653 | lockdep_set_class(&alc[r]->lock, | ||
654 | &on_slab_alc_key); | ||
655 | } | ||
656 | } | 686 | } |
657 | } | 687 | } |
658 | 688 | ||
@@ -671,6 +701,14 @@ static void init_node_lock_keys(int q) | |||
671 | static inline void init_lock_keys(void) | 701 | static inline void init_lock_keys(void) |
672 | { | 702 | { |
673 | } | 703 | } |
704 | |||
705 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | ||
706 | { | ||
707 | } | ||
708 | |||
709 | static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | ||
710 | { | ||
711 | } | ||
674 | #endif | 712 | #endif |
675 | 713 | ||
676 | /* | 714 | /* |
@@ -1264,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1264 | spin_unlock_irq(&l3->list_lock); | 1302 | spin_unlock_irq(&l3->list_lock); |
1265 | kfree(shared); | 1303 | kfree(shared); |
1266 | free_alien_cache(alien); | 1304 | free_alien_cache(alien); |
1305 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | ||
1306 | slab_set_debugobj_lock_classes_node(cachep, node); | ||
1267 | } | 1307 | } |
1268 | init_node_lock_keys(node); | 1308 | init_node_lock_keys(node); |
1269 | 1309 | ||
@@ -1626,6 +1666,9 @@ void __init kmem_cache_init_late(void) | |||
1626 | { | 1666 | { |
1627 | struct kmem_cache *cachep; | 1667 | struct kmem_cache *cachep; |
1628 | 1668 | ||
1669 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1670 | init_lock_keys(); | ||
1671 | |||
1629 | /* 6) resize the head arrays to their final sizes */ | 1672 | /* 6) resize the head arrays to their final sizes */ |
1630 | mutex_lock(&cache_chain_mutex); | 1673 | mutex_lock(&cache_chain_mutex); |
1631 | list_for_each_entry(cachep, &cache_chain, next) | 1674 | list_for_each_entry(cachep, &cache_chain, next) |
@@ -1636,9 +1679,6 @@ void __init kmem_cache_init_late(void) | |||
1636 | /* Done! */ | 1679 | /* Done! */ |
1637 | g_cpucache_up = FULL; | 1680 | g_cpucache_up = FULL; |
1638 | 1681 | ||
1639 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
1640 | init_lock_keys(); | ||
1641 | |||
1642 | /* | 1682 | /* |
1643 | * Register a cpu startup notifier callback that initializes | 1683 | * Register a cpu startup notifier callback that initializes |
1644 | * cpu_cache_get for all new cpus | 1684 | * cpu_cache_get for all new cpus |
@@ -2426,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2426 | goto oops; | 2466 | goto oops; |
2427 | } | 2467 | } |
2428 | 2468 | ||
2469 | if (flags & SLAB_DEBUG_OBJECTS) { | ||
2470 | /* | ||
2471 | * Would deadlock through slab_destroy()->call_rcu()-> | ||
2472 | * debug_object_activate()->kmem_cache_alloc(). | ||
2473 | */ | ||
2474 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | ||
2475 | |||
2476 | slab_set_debugobj_lock_classes(cachep); | ||
2477 | } | ||
2478 | |||
2429 | /* cache setup completed, link it into the list */ | 2479 | /* cache setup completed, link it into the list */ |
2430 | list_add(&cachep->next, &cache_chain); | 2480 | list_add(&cachep->next, &cache_chain); |
2431 | oops: | 2481 | oops: |
@@ -3403,7 +3453,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3403 | cache_alloc_debugcheck_before(cachep, flags); | 3453 | cache_alloc_debugcheck_before(cachep, flags); |
3404 | local_irq_save(save_flags); | 3454 | local_irq_save(save_flags); |
3405 | 3455 | ||
3406 | if (nodeid == -1) | 3456 | if (nodeid == NUMA_NO_NODE) |
3407 | nodeid = slab_node; | 3457 | nodeid = slab_node; |
3408 | 3458 | ||
3409 | if (unlikely(!cachep->nodelists[nodeid])) { | 3459 | if (unlikely(!cachep->nodelists[nodeid])) { |
@@ -3934,7 +3984,7 @@ fail: | |||
3934 | 3984 | ||
3935 | struct ccupdate_struct { | 3985 | struct ccupdate_struct { |
3936 | struct kmem_cache *cachep; | 3986 | struct kmem_cache *cachep; |
3937 | struct array_cache *new[NR_CPUS]; | 3987 | struct array_cache *new[0]; |
3938 | }; | 3988 | }; |
3939 | 3989 | ||
3940 | static void do_ccupdate_local(void *info) | 3990 | static void do_ccupdate_local(void *info) |
@@ -3956,7 +4006,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3956 | struct ccupdate_struct *new; | 4006 | struct ccupdate_struct *new; |
3957 | int i; | 4007 | int i; |
3958 | 4008 | ||
3959 | new = kzalloc(sizeof(*new), gfp); | 4009 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), |
4010 | gfp); | ||
3960 | if (!new) | 4011 | if (!new) |
3961 | return -ENOMEM; | 4012 | return -ENOMEM; |
3962 | 4013 | ||
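Shrinking ccupdate_struct's array to a zero-length member means the allocation itself must supply room for one pointer per possible CPU, which is what the kzalloc() sizing above now does. A standalone sketch of the same pattern, using the C99 flexible array member that corresponds to the kernel's [0] idiom; the struct and function names are invented for illustration.

#include <stdlib.h>

struct percpu_ptrs {
	void *owner;
	void *slot[];                  /* flexible array member: length fixed at allocation time */
};

static struct percpu_ptrs *alloc_percpu_ptrs(size_t nr_cpus)
{
	/* One allocation sized for the header plus nr_cpus trailing pointers. */
	return calloc(1, sizeof(struct percpu_ptrs) + nr_cpus * sizeof(void *));
}

int main(void)
{
	struct percpu_ptrs *p = alloc_percpu_ptrs(8);   /* 8 stands in for nr_cpu_ids */
	if (!p)
		return 1;
	p->slot[7] = p;                                 /* last slot is addressable */
	free(p);
	return 0;
}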
@@ -70,7 +70,7 @@ | |||
70 | 70 | ||
71 | #include <trace/events/kmem.h> | 71 | #include <trace/events/kmem.h> |
72 | 72 | ||
73 | #include <asm/atomic.h> | 73 | #include <linux/atomic.h> |
74 | 74 | ||
75 | /* | 75 | /* |
76 | * slob_block has a field 'units', which indicates size of block if +ve, | 76 | * slob_block has a field 'units', which indicates size of block if +ve, |
@@ -2,10 +2,11 @@ | |||
2 | * SLUB: A slab allocator that limits cache line use instead of queuing | 2 | * SLUB: A slab allocator that limits cache line use instead of queuing |
3 | * objects in per cpu and per node lists. | 3 | * objects in per cpu and per node lists. |
4 | * | 4 | * |
5 | * The allocator synchronizes using per slab locks and only | 5 | * The allocator synchronizes using per slab locks or atomic operations
6 | * uses a centralized lock to manage a pool of partial slabs. | 6 | * and only uses a centralized lock to manage a pool of partial slabs. |
7 | * | 7 | * |
8 | * (C) 2007 SGI, Christoph Lameter | 8 | * (C) 2007 SGI, Christoph Lameter |
9 | * (C) 2011 Linux Foundation, Christoph Lameter | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -33,15 +34,27 @@ | |||
33 | 34 | ||
34 | /* | 35 | /* |
35 | * Lock order: | 36 | * Lock order: |
36 | * 1. slab_lock(page) | 37 | * 1. slub_lock (Global Semaphore) |
37 | * 2. slab->list_lock | 38 | * 2. node->list_lock |
39 | * 3. slab_lock(page) (Only on some arches and for debugging) | ||
38 | * | 40 | * |
39 | * The slab_lock protects operations on the object of a particular | 41 | * slub_lock |
40 | * slab and its metadata in the page struct. If the slab lock | 42 | * |
41 | * has been taken then no allocations nor frees can be performed | 43 | * The role of the slub_lock is to protect the list of all the slabs |
42 | * on the objects in the slab nor can the slab be added or removed | 44 | * and to synchronize major metadata changes to slab cache structures. |
43 | * from the partial or full lists since this would mean modifying | 45 | * |
44 | * the page_struct of the slab. | 46 | * The slab_lock is only used for debugging and on arches that do not |
47 | * have the ability to do a cmpxchg_double. It only protects the second | ||
48 | * double word in the page struct. Meaning | ||
49 | * A. page->freelist -> List of object free in a page | ||
50 | * B. page->counters -> Counters of objects | ||
51 | * C. page->frozen -> frozen state | ||
52 | * | ||
53 | * If a slab is frozen then it is exempt from list management. It is not | ||
54 | * on any list. The processor that froze the slab is the one who can | ||
55 | * perform list operations on the page. Other processors may put objects | ||
56 | * onto the freelist but the processor that froze the slab is the only | ||
57 | * one that can retrieve the objects from the page's freelist. | ||
45 | * | 58 | * |
46 | * The list_lock protects the partial and full list on each node and | 59 | * The list_lock protects the partial and full list on each node and |
47 | * the partial slab counter. If taken then no new slabs may be added or | 60 | * the partial slab counter. If taken then no new slabs may be added or |
@@ -54,20 +67,6 @@ | |||
54 | * slabs, operations can continue without any centralized lock. F.e. | 67 | * slabs, operations can continue without any centralized lock. F.e. |
55 | * allocating a long series of objects that fill up slabs does not require | 68 | * allocating a long series of objects that fill up slabs does not require |
56 | * the list lock. | 69 | * the list lock. |
57 | * | ||
58 | * The lock order is sometimes inverted when we are trying to get a slab | ||
59 | * off a list. We take the list_lock and then look for a page on the list | ||
60 | * to use. While we do that objects in the slabs may be freed. We can | ||
61 | * only operate on the slab if we have also taken the slab_lock. So we use | ||
62 | * a slab_trylock() on the slab. If trylock was successful then no frees | ||
63 | * can occur anymore and we can use the slab for allocations etc. If the | ||
64 | * slab_trylock() does not succeed then frees are in progress in the slab and | ||
65 | * we must stay away from it for a while since we may cause a bouncing | ||
66 | * cacheline if we try to acquire the lock. So go onto the next slab. | ||
67 | * If all pages are busy then we may allocate a new slab instead of reusing | ||
68 | * a partial slab. A new slab has no one operating on it and thus there is | ||
69 | * no danger of cacheline contention. | ||
70 | * | ||
71 | * Interrupts are disabled during allocation and deallocation in order to | 70 | * Interrupts are disabled during allocation and deallocation in order to |
72 | * make the slab allocator safe to use in the context of an irq. In addition | 71 | * make the slab allocator safe to use in the context of an irq. In addition |
73 | * interrupts are disabled to ensure that the processor does not change | 72 | * interrupts are disabled to ensure that the processor does not change |
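The frozen-slab rule in the lock-ordering comment above, where any CPU may return objects to the page freelist but only the freezing CPU takes them back, is essentially a single-consumer lock-free list. A userspace sketch of that ownership split using C11 atomics; this is an analogy only, not SLUB's data layout.

#include <stdatomic.h>
#include <stddef.h>

struct object { struct object *next; };

/* The "page freelist": any thread may push, only the owner may detach. */
static _Atomic(struct object *) freelist = NULL;

/* Remote free: push one object with a CAS loop (allowed from any thread). */
static void remote_free(struct object *obj)
{
	struct object *old = atomic_load(&freelist);
	do {
		obj->next = old;
	} while (!atomic_compare_exchange_weak(&freelist, &old, obj));
}

/* Owner only: grab the whole list in one shot and walk it privately. */
static struct object *owner_detach_all(void)
{
	return atomic_exchange(&freelist, NULL);
}

int main(void)
{
	struct object a, b;
	remote_free(&a);
	remote_free(&b);
	struct object *mine = owner_detach_all();   /* list is b -> a */
	return (mine == &b && mine->next == &a) ? 0 : 1;
}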
@@ -132,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
132 | /* Enable to test recovery from slab corruption on boot */ | 131 | /* Enable to test recovery from slab corruption on boot */ |
133 | #undef SLUB_RESILIENCY_TEST | 132 | #undef SLUB_RESILIENCY_TEST |
134 | 133 | ||
134 | /* Enable to log cmpxchg failures */ | ||
135 | #undef SLUB_DEBUG_CMPXCHG | ||
136 | |||
135 | /* | 137 | /* |
136 | * Mininum number of partial slabs. These will be left on the partial | 138 | * Mininum number of partial slabs. These will be left on the partial |
137 | * lists even if they are empty. kmem_cache_shrink may reclaim them. | 139 | * lists even if they are empty. kmem_cache_shrink may reclaim them. |
@@ -167,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
167 | 169 | ||
168 | #define OO_SHIFT 16 | 170 | #define OO_SHIFT 16 |
169 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 171 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
170 | #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ | 172 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
171 | 173 | ||
172 | /* Internal SLUB flags */ | 174 | /* Internal SLUB flags */ |
173 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 175 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ |
176 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | ||
174 | 177 | ||
175 | static int kmem_size = sizeof(struct kmem_cache); | 178 | static int kmem_size = sizeof(struct kmem_cache); |
176 | 179 | ||
@@ -343,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
343 | return x.x & OO_MASK; | 346 | return x.x & OO_MASK; |
344 | } | 347 | } |
345 | 348 | ||
349 | /* | ||
350 | * Per slab locking using the pagelock | ||
351 | */ | ||
352 | static __always_inline void slab_lock(struct page *page) | ||
353 | { | ||
354 | bit_spin_lock(PG_locked, &page->flags); | ||
355 | } | ||
356 | |||
357 | static __always_inline void slab_unlock(struct page *page) | ||
358 | { | ||
359 | __bit_spin_unlock(PG_locked, &page->flags); | ||
360 | } | ||
361 | |||
362 | /* Interrupts must be disabled (for the fallback code to work right) */ | ||
363 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
364 | void *freelist_old, unsigned long counters_old, | ||
365 | void *freelist_new, unsigned long counters_new, | ||
366 | const char *n) | ||
367 | { | ||
368 | VM_BUG_ON(!irqs_disabled()); | ||
369 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
370 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
371 | if (cmpxchg_double(&page->freelist, | ||
372 | freelist_old, counters_old, | ||
373 | freelist_new, counters_new)) | ||
374 | return 1; | ||
375 | } else | ||
376 | #endif | ||
377 | { | ||
378 | slab_lock(page); | ||
379 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
380 | page->freelist = freelist_new; | ||
381 | page->counters = counters_new; | ||
382 | slab_unlock(page); | ||
383 | return 1; | ||
384 | } | ||
385 | slab_unlock(page); | ||
386 | } | ||
387 | |||
388 | cpu_relax(); | ||
389 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
390 | |||
391 | #ifdef SLUB_DEBUG_CMPXCHG | ||
392 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
393 | #endif | ||
394 | |||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
399 | void *freelist_old, unsigned long counters_old, | ||
400 | void *freelist_new, unsigned long counters_new, | ||
401 | const char *n) | ||
402 | { | ||
403 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
404 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
405 | if (cmpxchg_double(&page->freelist, | ||
406 | freelist_old, counters_old, | ||
407 | freelist_new, counters_new)) | ||
408 | return 1; | ||
409 | } else | ||
410 | #endif | ||
411 | { | ||
412 | unsigned long flags; | ||
413 | |||
414 | local_irq_save(flags); | ||
415 | slab_lock(page); | ||
416 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
417 | page->freelist = freelist_new; | ||
418 | page->counters = counters_new; | ||
419 | slab_unlock(page); | ||
420 | local_irq_restore(flags); | ||
421 | return 1; | ||
422 | } | ||
423 | slab_unlock(page); | ||
424 | local_irq_restore(flags); | ||
425 | } | ||
426 | |||
427 | cpu_relax(); | ||
428 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
429 | |||
430 | #ifdef SLUB_DEBUG_CMPXCHG | ||
431 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
432 | #endif | ||
433 | |||
434 | return 0; | ||
435 | } | ||
436 | |||
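cmpxchg_double_slab() above updates freelist and counters as one unit, falling back to the page bit lock where a double-word cmpxchg is unavailable. Portable C has no double-word compare-exchange, so this sketch packs two 32-bit fields into one 64-bit word to show the same compare-both-then-swap shape; it is an analogy, not SLUB's layout.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t slot;   /* high 32 bits: "freelist index", low 32 bits: "counters" */

static uint64_t pack(uint32_t freelist, uint32_t counters)
{
	return ((uint64_t)freelist << 32) | counters;
}

/* Replace (fl_old, ct_old) with (fl_new, ct_new) only if both fields still match. */
static int cmpxchg_pair(uint32_t fl_old, uint32_t ct_old,
			uint32_t fl_new, uint32_t ct_new)
{
	uint64_t expected = pack(fl_old, ct_old);
	return atomic_compare_exchange_strong(&slot, &expected,
					      pack(fl_new, ct_new));
}

int main(void)
{
	atomic_store(&slot, pack(7, 100));
	if (!cmpxchg_pair(7, 100, 0, 99))         /* succeeds: both fields matched */
		return 1;
	if (cmpxchg_pair(7, 100, 3, 98))          /* fails: the state already moved on */
		return 1;
	printf("final state: %#llx\n", (unsigned long long)atomic_load(&slot));
	return 0;
}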
346 | #ifdef CONFIG_SLUB_DEBUG | 437 | #ifdef CONFIG_SLUB_DEBUG |
347 | /* | 438 | /* |
348 | * Determine a map of object in use on a page. | 439 | * Determine a map of object in use on a page. |
349 | * | 440 | * |
350 | * Slab lock or node listlock must be held to guarantee that the page does | 441 | * Node listlock must be held to guarantee that the page does |
351 | * not vanish from under us. | 442 | * not vanish from under us. |
352 | */ | 443 | */ |
353 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | 444 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) |
@@ -610,7 +701,7 @@ static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) | |||
610 | return check_bytes8(start, value, bytes); | 701 | return check_bytes8(start, value, bytes); |
611 | 702 | ||
612 | value64 = value | value << 8 | value << 16 | value << 24; | 703 | value64 = value | value << 8 | value << 16 | value << 24; |
613 | value64 = value64 | value64 << 32; | 704 | value64 = (value64 & 0xffffffff) | value64 << 32; |
614 | prefix = 8 - ((unsigned long)start) % 8; | 705 | prefix = 8 - ((unsigned long)start) % 8; |
615 | 706 | ||
616 | if (prefix) { | 707 | if (prefix) { |
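The masking added above matters because value << 24 is evaluated as a signed int; if the byte's top bit is set, assigning that result to a 64-bit variable sign-extends and pollutes the upper half before the << 32 merge. A quick standalone demonstration of the difference:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t value = 0x80;            /* any byte with the top bit set shows the problem */
	uint64_t v;

	/* value << 24 happens in (signed) int, so the assignment sign-extends. */
	v = value | value << 8 | value << 16 | value << 24;
	printf("before merge: %#018" PRIx64 "\n", v);        /* 0xffffffff80808080 */

	uint64_t buggy = v | v << 32;
	uint64_t fixed = (v & 0xffffffff) | v << 32;
	printf("buggy: %#018" PRIx64 "\nfixed: %#018" PRIx64 "\n", buggy, fixed);
	/* fixed == 0x8080808080808080, the intended repeated-byte pattern */
	return 0;
}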
@@ -838,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
838 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 929 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
839 | { | 930 | { |
840 | int nr = 0; | 931 | int nr = 0; |
841 | void *fp = page->freelist; | 932 | void *fp; |
842 | void *object = NULL; | 933 | void *object = NULL; |
843 | unsigned long max_objects; | 934 | unsigned long max_objects; |
844 | 935 | ||
936 | fp = page->freelist; | ||
845 | while (fp && nr <= page->objects) { | 937 | while (fp && nr <= page->objects) { |
846 | if (fp == search) | 938 | if (fp == search) |
847 | return 1; | 939 | return 1; |
@@ -946,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
946 | 1038 | ||
947 | /* | 1039 | /* |
948 | * Tracking of fully allocated slabs for debugging purposes. | 1040 | * Tracking of fully allocated slabs for debugging purposes. |
1041 | * | ||
1042 | * list_lock must be held. | ||
949 | */ | 1043 | */ |
950 | static void add_full(struct kmem_cache_node *n, struct page *page) | 1044 | static void add_full(struct kmem_cache *s, |
1045 | struct kmem_cache_node *n, struct page *page) | ||
951 | { | 1046 | { |
952 | spin_lock(&n->list_lock); | 1047 | if (!(s->flags & SLAB_STORE_USER)) |
1048 | return; | ||
1049 | |||
953 | list_add(&page->lru, &n->full); | 1050 | list_add(&page->lru, &n->full); |
954 | spin_unlock(&n->list_lock); | ||
955 | } | 1051 | } |
956 | 1052 | ||
1053 | /* | ||
1054 | * list_lock must be held. | ||
1055 | */ | ||
957 | static void remove_full(struct kmem_cache *s, struct page *page) | 1056 | static void remove_full(struct kmem_cache *s, struct page *page) |
958 | { | 1057 | { |
959 | struct kmem_cache_node *n; | ||
960 | |||
961 | if (!(s->flags & SLAB_STORE_USER)) | 1058 | if (!(s->flags & SLAB_STORE_USER)) |
962 | return; | 1059 | return; |
963 | 1060 | ||
964 | n = get_node(s, page_to_nid(page)); | ||
965 | |||
966 | spin_lock(&n->list_lock); | ||
967 | list_del(&page->lru); | 1061 | list_del(&page->lru); |
968 | spin_unlock(&n->list_lock); | ||
969 | } | 1062 | } |
970 | 1063 | ||
971 | /* Tracking of the number of slabs for debugging purposes */ | 1064 | /* Tracking of the number of slabs for debugging purposes */ |
@@ -1021,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa | |||
1021 | if (!check_slab(s, page)) | 1114 | if (!check_slab(s, page)) |
1022 | goto bad; | 1115 | goto bad; |
1023 | 1116 | ||
1024 | if (!on_freelist(s, page, object)) { | ||
1025 | object_err(s, page, object, "Object already allocated"); | ||
1026 | goto bad; | ||
1027 | } | ||
1028 | |||
1029 | if (!check_valid_pointer(s, page, object)) { | 1117 | if (!check_valid_pointer(s, page, object)) { |
1030 | object_err(s, page, object, "Freelist Pointer check fails"); | 1118 | object_err(s, page, object, "Freelist Pointer check fails"); |
1031 | goto bad; | 1119 | goto bad; |
@@ -1058,6 +1146,12 @@ bad: | |||
1058 | static noinline int free_debug_processing(struct kmem_cache *s, | 1146 | static noinline int free_debug_processing(struct kmem_cache *s, |
1059 | struct page *page, void *object, unsigned long addr) | 1147 | struct page *page, void *object, unsigned long addr) |
1060 | { | 1148 | { |
1149 | unsigned long flags; | ||
1150 | int rc = 0; | ||
1151 | |||
1152 | local_irq_save(flags); | ||
1153 | slab_lock(page); | ||
1154 | |||
1061 | if (!check_slab(s, page)) | 1155 | if (!check_slab(s, page)) |
1062 | goto fail; | 1156 | goto fail; |
1063 | 1157 | ||
@@ -1072,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1072 | } | 1166 | } |
1073 | 1167 | ||
1074 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1168 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1075 | return 0; | 1169 | goto out; |
1076 | 1170 | ||
1077 | if (unlikely(s != page->slab)) { | 1171 | if (unlikely(s != page->slab)) { |
1078 | if (!PageSlab(page)) { | 1172 | if (!PageSlab(page)) { |
@@ -1089,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1089 | goto fail; | 1183 | goto fail; |
1090 | } | 1184 | } |
1091 | 1185 | ||
1092 | /* Special debug activities for freeing objects */ | ||
1093 | if (!PageSlubFrozen(page) && !page->freelist) | ||
1094 | remove_full(s, page); | ||
1095 | if (s->flags & SLAB_STORE_USER) | 1186 | if (s->flags & SLAB_STORE_USER) |
1096 | set_track(s, object, TRACK_FREE, addr); | 1187 | set_track(s, object, TRACK_FREE, addr); |
1097 | trace(s, page, object, 0); | 1188 | trace(s, page, object, 0); |
1098 | init_object(s, object, SLUB_RED_INACTIVE); | 1189 | init_object(s, object, SLUB_RED_INACTIVE); |
1099 | return 1; | 1190 | rc = 1; |
1191 | out: | ||
1192 | slab_unlock(page); | ||
1193 | local_irq_restore(flags); | ||
1194 | return rc; | ||
1100 | 1195 | ||
1101 | fail: | 1196 | fail: |
1102 | slab_fix(s, "Object at 0x%p not freed", object); | 1197 | slab_fix(s, "Object at 0x%p not freed", object); |
1103 | return 0; | 1198 | goto out; |
1104 | } | 1199 | } |
1105 | 1200 | ||
1106 | static int __init setup_slub_debug(char *str) | 1201 | static int __init setup_slub_debug(char *str) |
@@ -1200,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
1200 | { return 1; } | 1295 | { return 1; } |
1201 | static inline int check_object(struct kmem_cache *s, struct page *page, | 1296 | static inline int check_object(struct kmem_cache *s, struct page *page, |
1202 | void *object, u8 val) { return 1; } | 1297 | void *object, u8 val) { return 1; } |
1203 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1298 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1299 | struct page *page) {} | ||
1300 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | ||
1204 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1301 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1205 | unsigned long flags, const char *name, | 1302 | unsigned long flags, const char *name, |
1206 | void (*ctor)(void *)) | 1303 | void (*ctor)(void *)) |
@@ -1252,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1252 | struct kmem_cache_order_objects oo = s->oo; | 1349 | struct kmem_cache_order_objects oo = s->oo; |
1253 | gfp_t alloc_gfp; | 1350 | gfp_t alloc_gfp; |
1254 | 1351 | ||
1352 | flags &= gfp_allowed_mask; | ||
1353 | |||
1354 | if (flags & __GFP_WAIT) | ||
1355 | local_irq_enable(); | ||
1356 | |||
1255 | flags |= s->allocflags; | 1357 | flags |= s->allocflags; |
1256 | 1358 | ||
1257 | /* | 1359 | /* |
@@ -1268,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1268 | * Try a lower order alloc if possible | 1370 | * Try a lower order alloc if possible |
1269 | */ | 1371 | */ |
1270 | page = alloc_slab_page(flags, node, oo); | 1372 | page = alloc_slab_page(flags, node, oo); |
1271 | if (!page) | ||
1272 | return NULL; | ||
1273 | 1373 | ||
1274 | stat(s, ORDER_FALLBACK); | 1374 | if (page) |
1375 | stat(s, ORDER_FALLBACK); | ||
1275 | } | 1376 | } |
1276 | 1377 | ||
1378 | if (flags & __GFP_WAIT) | ||
1379 | local_irq_disable(); | ||
1380 | |||
1381 | if (!page) | ||
1382 | return NULL; | ||
1383 | |||
1277 | if (kmemcheck_enabled | 1384 | if (kmemcheck_enabled |
1278 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1385 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1279 | int pages = 1 << oo_order(oo); | 1386 | int pages = 1 << oo_order(oo); |
@@ -1341,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1341 | 1448 | ||
1342 | page->freelist = start; | 1449 | page->freelist = start; |
1343 | page->inuse = 0; | 1450 | page->inuse = 0; |
1451 | page->frozen = 1; | ||
1344 | out: | 1452 | out: |
1345 | return page; | 1453 | return page; |
1346 | } | 1454 | } |
@@ -1418,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
1418 | } | 1526 | } |
1419 | 1527 | ||
1420 | /* | 1528 | /* |
1421 | * Per slab locking using the pagelock | 1529 | * Management of partially allocated slabs. |
1422 | */ | 1530 | * |
1423 | static __always_inline void slab_lock(struct page *page) | 1531 | * list_lock must be held. |
1424 | { | ||
1425 | bit_spin_lock(PG_locked, &page->flags); | ||
1426 | } | ||
1427 | |||
1428 | static __always_inline void slab_unlock(struct page *page) | ||
1429 | { | ||
1430 | __bit_spin_unlock(PG_locked, &page->flags); | ||
1431 | } | ||
1432 | |||
1433 | static __always_inline int slab_trylock(struct page *page) | ||
1434 | { | ||
1435 | int rc = 1; | ||
1436 | |||
1437 | rc = bit_spin_trylock(PG_locked, &page->flags); | ||
1438 | return rc; | ||
1439 | } | ||
1440 | |||
1441 | /* | ||
1442 | * Management of partially allocated slabs | ||
1443 | */ | 1532 | */ |
1444 | static void add_partial(struct kmem_cache_node *n, | 1533 | static inline void add_partial(struct kmem_cache_node *n, |
1445 | struct page *page, int tail) | 1534 | struct page *page, int tail) |
1446 | { | 1535 | { |
1447 | spin_lock(&n->list_lock); | ||
1448 | n->nr_partial++; | 1536 | n->nr_partial++; |
1449 | if (tail) | 1537 | if (tail) |
1450 | list_add_tail(&page->lru, &n->partial); | 1538 | list_add_tail(&page->lru, &n->partial); |
1451 | else | 1539 | else |
1452 | list_add(&page->lru, &n->partial); | 1540 | list_add(&page->lru, &n->partial); |
1453 | spin_unlock(&n->list_lock); | ||
1454 | } | 1541 | } |
1455 | 1542 | ||
1456 | static inline void __remove_partial(struct kmem_cache_node *n, | 1543 | /* |
1544 | * list_lock must be held. | ||
1545 | */ | ||
1546 | static inline void remove_partial(struct kmem_cache_node *n, | ||
1457 | struct page *page) | 1547 | struct page *page) |
1458 | { | 1548 | { |
1459 | list_del(&page->lru); | 1549 | list_del(&page->lru); |
1460 | n->nr_partial--; | 1550 | n->nr_partial--; |
1461 | } | 1551 | } |
1462 | 1552 | ||
1463 | static void remove_partial(struct kmem_cache *s, struct page *page) | ||
1464 | { | ||
1465 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1466 | |||
1467 | spin_lock(&n->list_lock); | ||
1468 | __remove_partial(n, page); | ||
1469 | spin_unlock(&n->list_lock); | ||
1470 | } | ||
1471 | |||
1472 | /* | 1553 | /* |
1473 | * Lock slab and remove from the partial list. | 1554 | * Lock slab, remove from the partial list and put the object into the |
1555 | * per cpu freelist. | ||
1474 | * | 1556 | * |
1475 | * Must hold list_lock. | 1557 | * Must hold list_lock. |
1476 | */ | 1558 | */ |
1477 | static inline int lock_and_freeze_slab(struct kmem_cache_node *n, | 1559 | static inline int acquire_slab(struct kmem_cache *s, |
1478 | struct page *page) | 1560 | struct kmem_cache_node *n, struct page *page) |
1479 | { | 1561 | { |
1480 | if (slab_trylock(page)) { | 1562 | void *freelist; |
1481 | __remove_partial(n, page); | 1563 | unsigned long counters; |
1482 | __SetPageSlubFrozen(page); | 1564 | struct page new; |
1565 | |||
1566 | /* | ||
1567 | * Zap the freelist and set the frozen bit. | ||
1568 | * The old freelist is the list of objects for the | ||
1569 | * per cpu allocation list. | ||
1570 | */ | ||
1571 | do { | ||
1572 | freelist = page->freelist; | ||
1573 | counters = page->counters; | ||
1574 | new.counters = counters; | ||
1575 | new.inuse = page->objects; | ||
1576 | |||
1577 | VM_BUG_ON(new.frozen); | ||
1578 | new.frozen = 1; | ||
1579 | |||
1580 | } while (!__cmpxchg_double_slab(s, page, | ||
1581 | freelist, counters, | ||
1582 | NULL, new.counters, | ||
1583 | "lock and freeze")); | ||
1584 | |||
1585 | remove_partial(n, page); | ||
1586 | |||
1587 | if (freelist) { | ||
1588 | /* Populate the per cpu freelist */ | ||
1589 | this_cpu_write(s->cpu_slab->freelist, freelist); | ||
1590 | this_cpu_write(s->cpu_slab->page, page); | ||
1591 | this_cpu_write(s->cpu_slab->node, page_to_nid(page)); | ||
1483 | return 1; | 1592 | return 1; |
1593 | } else { | ||
1594 | /* | ||
1595 | * Slab page came from the wrong list. No object to allocate | ||
1596 | * from. Put it onto the correct list and continue partial | ||
1597 | * scan. | ||
1598 | */ | ||
1599 | printk(KERN_ERR "SLUB: %s : Page without available objects on" | ||
1600 | " partial list\n", s->name); | ||
1601 | return 0; | ||
1484 | } | 1602 | } |
1485 | return 0; | ||
1486 | } | 1603 | } |
1487 | 1604 | ||
1488 | /* | 1605 | /* |
1489 | * Try to allocate a partial slab from a specific node. | 1606 | * Try to allocate a partial slab from a specific node. |
1490 | */ | 1607 | */ |
1491 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1608 | static struct page *get_partial_node(struct kmem_cache *s, |
1609 | struct kmem_cache_node *n) | ||
1492 | { | 1610 | { |
1493 | struct page *page; | 1611 | struct page *page; |
1494 | 1612 | ||
@@ -1503,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
1503 | 1621 | ||
1504 | spin_lock(&n->list_lock); | 1622 | spin_lock(&n->list_lock); |
1505 | list_for_each_entry(page, &n->partial, lru) | 1623 | list_for_each_entry(page, &n->partial, lru) |
1506 | if (lock_and_freeze_slab(n, page)) | 1624 | if (acquire_slab(s, n, page)) |
1507 | goto out; | 1625 | goto out; |
1508 | page = NULL; | 1626 | page = NULL; |
1509 | out: | 1627 | out: |
@@ -1554,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1554 | 1672 | ||
1555 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1673 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1556 | n->nr_partial > s->min_partial) { | 1674 | n->nr_partial > s->min_partial) { |
1557 | page = get_partial_node(n); | 1675 | page = get_partial_node(s, n); |
1558 | if (page) { | 1676 | if (page) { |
1559 | put_mems_allowed(); | 1677 | put_mems_allowed(); |
1560 | return page; | 1678 | return page; |
@@ -1574,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1574 | struct page *page; | 1692 | struct page *page; |
1575 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; | 1693 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; |
1576 | 1694 | ||
1577 | page = get_partial_node(get_node(s, searchnode)); | 1695 | page = get_partial_node(s, get_node(s, searchnode)); |
1578 | if (page || node != NUMA_NO_NODE) | 1696 | if (page || node != NUMA_NO_NODE) |
1579 | return page; | 1697 | return page; |
1580 | 1698 | ||
1581 | return get_any_partial(s, flags); | 1699 | return get_any_partial(s, flags); |
1582 | } | 1700 | } |
1583 | 1701 | ||
1584 | /* | ||
1585 | * Move a page back to the lists. | ||
1586 | * | ||
1587 | * Must be called with the slab lock held. | ||
1588 | * | ||
1589 | * On exit the slab lock will have been dropped. | ||
1590 | */ | ||
1591 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | ||
1592 | __releases(bitlock) | ||
1593 | { | ||
1594 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1595 | |||
1596 | __ClearPageSlubFrozen(page); | ||
1597 | if (page->inuse) { | ||
1598 | |||
1599 | if (page->freelist) { | ||
1600 | add_partial(n, page, tail); | ||
1601 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1602 | } else { | ||
1603 | stat(s, DEACTIVATE_FULL); | ||
1604 | if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) | ||
1605 | add_full(n, page); | ||
1606 | } | ||
1607 | slab_unlock(page); | ||
1608 | } else { | ||
1609 | stat(s, DEACTIVATE_EMPTY); | ||
1610 | if (n->nr_partial < s->min_partial) { | ||
1611 | /* | ||
1612 | * Adding an empty slab to the partial slabs in order | ||
1613 | * to avoid page allocator overhead. This slab needs | ||
1614 | * to come after the other slabs with objects in | ||
1615 | * so that the others get filled first. That way the | ||
1616 | * size of the partial list stays small. | ||
1617 | * | ||
1618 | * kmem_cache_shrink can reclaim any empty slabs from | ||
1619 | * the partial list. | ||
1620 | */ | ||
1621 | add_partial(n, page, 1); | ||
1622 | slab_unlock(page); | ||
1623 | } else { | ||
1624 | slab_unlock(page); | ||
1625 | stat(s, FREE_SLAB); | ||
1626 | discard_slab(s, page); | ||
1627 | } | ||
1628 | } | ||
1629 | } | ||
1630 | |||
1631 | #ifdef CONFIG_PREEMPT | 1702 | #ifdef CONFIG_PREEMPT |
1632 | /* | 1703 | /* |
1633 | * Calculate the next globally unique transaction for disambiguiation | 1704 | * Calculate the next globally unique transaction for disambiguiation |
@@ -1697,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1697 | /* | 1768 | /* |
1698 | * Remove the cpu slab | 1769 | * Remove the cpu slab |
1699 | */ | 1770 | */ |
1771 | |||
1772 | /* | ||
1773 | * Remove the cpu slab | ||
1774 | */ | ||
1700 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1775 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1701 | __releases(bitlock) | ||
1702 | { | 1776 | { |
1777 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | ||
1703 | struct page *page = c->page; | 1778 | struct page *page = c->page; |
1704 | int tail = 1; | 1779 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1705 | 1780 | int lock = 0; | |
1706 | if (page->freelist) | 1781 | enum slab_modes l = M_NONE, m = M_NONE; |
1782 | void *freelist; | ||
1783 | void *nextfree; | ||
1784 | int tail = 0; | ||
1785 | struct page new; | ||
1786 | struct page old; | ||
1787 | |||
1788 | if (page->freelist) { | ||
1707 | stat(s, DEACTIVATE_REMOTE_FREES); | 1789 | stat(s, DEACTIVATE_REMOTE_FREES); |
1790 | tail = 1; | ||
1791 | } | ||
1792 | |||
1793 | c->tid = next_tid(c->tid); | ||
1794 | c->page = NULL; | ||
1795 | freelist = c->freelist; | ||
1796 | c->freelist = NULL; | ||
1797 | |||
1708 | /* | 1798 | /* |
1709 | * Merge cpu freelist into slab freelist. Typically we get here | 1799 | * Stage one: Free all available per cpu objects back |
1710 | * because both freelists are empty. So this is unlikely | 1800 | * to the page freelist while it is still frozen. Leave the |
1711 | * to occur. | 1801 | * last one. |
1802 | * | ||
1803 | * There is no need to take the list->lock because the page | ||
1804 | * is still frozen. | ||
1805 | */ | ||
1806 | while (freelist && (nextfree = get_freepointer(s, freelist))) { | ||
1807 | void *prior; | ||
1808 | unsigned long counters; | ||
1809 | |||
1810 | do { | ||
1811 | prior = page->freelist; | ||
1812 | counters = page->counters; | ||
1813 | set_freepointer(s, freelist, prior); | ||
1814 | new.counters = counters; | ||
1815 | new.inuse--; | ||
1816 | VM_BUG_ON(!new.frozen); | ||
1817 | |||
1818 | } while (!__cmpxchg_double_slab(s, page, | ||
1819 | prior, counters, | ||
1820 | freelist, new.counters, | ||
1821 | "drain percpu freelist")); | ||
1822 | |||
1823 | freelist = nextfree; | ||
1824 | } | ||
1825 | |||
1826 | /* | ||
1827 | * Stage two: Ensure that the page is unfrozen while the | ||
1828 | * list presence reflects the actual number of objects | ||
1829 | * during unfreeze. | ||
1830 | * | ||
1831 | * We setup the list membership and then perform a cmpxchg | ||
1832 | * with the count. If there is a mismatch then the page | ||
1833 | * is not unfrozen but the page is on the wrong list. | ||
1834 | * | ||
1835 | * Then we restart the process, which may have to remove | ||
1836 | * the page again from the list that we just put it on, | ||
1837 | * because the number of objects in the slab may have | ||
1838 | * changed. | ||
1712 | */ | 1839 | */ |
1713 | while (unlikely(c->freelist)) { | 1840 | redo: |
1714 | void **object; | ||
1715 | 1841 | ||
1716 | tail = 0; /* Hot objects. Put the slab first */ | 1842 | old.freelist = page->freelist; |
1843 | old.counters = page->counters; | ||
1844 | VM_BUG_ON(!old.frozen); | ||
1717 | 1845 | ||
1718 | /* Retrieve object from cpu_freelist */ | 1846 | /* Determine target state of the slab */ |
1719 | object = c->freelist; | 1847 | new.counters = old.counters; |
1720 | c->freelist = get_freepointer(s, c->freelist); | 1848 | if (freelist) { |
1849 | new.inuse--; | ||
1850 | set_freepointer(s, freelist, old.freelist); | ||
1851 | new.freelist = freelist; | ||
1852 | } else | ||
1853 | new.freelist = old.freelist; | ||
1721 | 1854 | ||
1722 | /* And put onto the regular freelist */ | 1855 | new.frozen = 0; |
1723 | set_freepointer(s, object, page->freelist); | 1856 | |
1724 | page->freelist = object; | 1857 | if (!new.inuse && n->nr_partial > s->min_partial) |
1725 | page->inuse--; | 1858 | m = M_FREE; |
1859 | else if (new.freelist) { | ||
1860 | m = M_PARTIAL; | ||
1861 | if (!lock) { | ||
1862 | lock = 1; | ||
1863 | /* | ||
1864 | * Taking the spinlock removes the possibility | ||
1865 | * that acquire_slab() will see a slab page that | ||
1866 | * is frozen | ||
1867 | */ | ||
1868 | spin_lock(&n->list_lock); | ||
1869 | } | ||
1870 | } else { | ||
1871 | m = M_FULL; | ||
1872 | if (kmem_cache_debug(s) && !lock) { | ||
1873 | lock = 1; | ||
1874 | /* | ||
1875 | * This also ensures that the scanning of full | ||
1876 | * slabs from diagnostic functions will not see | ||
1877 | * any frozen slabs. | ||
1878 | */ | ||
1879 | spin_lock(&n->list_lock); | ||
1880 | } | ||
1881 | } | ||
1882 | |||
1883 | if (l != m) { | ||
1884 | |||
1885 | if (l == M_PARTIAL) | ||
1886 | |||
1887 | remove_partial(n, page); | ||
1888 | |||
1889 | else if (l == M_FULL) | ||
1890 | |||
1891 | remove_full(s, page); | ||
1892 | |||
1893 | if (m == M_PARTIAL) { | ||
1894 | |||
1895 | add_partial(n, page, tail); | ||
1896 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1897 | |||
1898 | } else if (m == M_FULL) { | ||
1899 | |||
1900 | stat(s, DEACTIVATE_FULL); | ||
1901 | add_full(s, n, page); | ||
1902 | |||
1903 | } | ||
1904 | } | ||
1905 | |||
1906 | l = m; | ||
1907 | if (!__cmpxchg_double_slab(s, page, | ||
1908 | old.freelist, old.counters, | ||
1909 | new.freelist, new.counters, | ||
1910 | "unfreezing slab")) | ||
1911 | goto redo; | ||
1912 | |||
1913 | if (lock) | ||
1914 | spin_unlock(&n->list_lock); | ||
1915 | |||
1916 | if (m == M_FREE) { | ||
1917 | stat(s, DEACTIVATE_EMPTY); | ||
1918 | discard_slab(s, page); | ||
1919 | stat(s, FREE_SLAB); | ||
1726 | } | 1920 | } |
1727 | c->page = NULL; | ||
1728 | c->tid = next_tid(c->tid); | ||
1729 | unfreeze_slab(s, page, tail); | ||
1730 | } | 1921 | } |
1731 | 1922 | ||
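The unfreeze protocol in deactivate_slab() above follows one pattern throughout: snapshot the freelist and counters, compute the target state in a private struct page on the stack, then publish both words with a single __cmpxchg_double_slab(), retrying whenever another CPU raced. The following user-space sketch shows only that optimistic retry loop, collapsed onto one packed counters word; the FROZEN_BIT layout and the function name are invented for the sketch, and the real code pairs the counters with the freelist pointer via cmpxchg_double.

#include <stdatomic.h>
#include <stdint.h>

/*
 * Packed per-slab state, loosely analogous to page->counters: the low
 * 16 bits hold "inuse", bit 31 is the "frozen" flag.  Layout invented
 * for this sketch only.
 */
#define FROZEN_BIT (UINT32_C(1) << 31)

static _Atomic uint32_t counters;

/*
 * Unfreeze the slab and account for one object handed back: compute the
 * target word privately, then publish it with one compare-and-swap,
 * retrying if a concurrent free changed the word (assumes inuse > 0).
 */
void unfreeze_one(void)
{
	uint32_t old, new;

	do {
		old = atomic_load_explicit(&counters, memory_order_relaxed);
		new = (old - 1) & ~FROZEN_BIT;	/* inuse--, frozen = 0 */
	} while (!atomic_compare_exchange_weak_explicit(&counters, &old, new,
							memory_order_acq_rel,
							memory_order_relaxed));
}

The design point the real code adds on top of this loop is the l/m state machine: list membership is adjusted before the publish, and if the compare-and-swap fails the whole placement decision is redone from the fresh counters.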
1732 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1923 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1733 | { | 1924 | { |
1734 | stat(s, CPUSLAB_FLUSH); | 1925 | stat(s, CPUSLAB_FLUSH); |
1735 | slab_lock(c->page); | ||
1736 | deactivate_slab(s, c); | 1926 | deactivate_slab(s, c); |
1737 | } | 1927 | } |
1738 | 1928 | ||
@@ -1861,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1861 | void **object; | 2051 | void **object; |
1862 | struct page *page; | 2052 | struct page *page; |
1863 | unsigned long flags; | 2053 | unsigned long flags; |
2054 | struct page new; | ||
2055 | unsigned long counters; | ||
1864 | 2056 | ||
1865 | local_irq_save(flags); | 2057 | local_irq_save(flags); |
1866 | #ifdef CONFIG_PREEMPT | 2058 | #ifdef CONFIG_PREEMPT |
@@ -1879,72 +2071,97 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1879 | if (!page) | 2071 | if (!page) |
1880 | goto new_slab; | 2072 | goto new_slab; |
1881 | 2073 | ||
1882 | slab_lock(page); | 2074 | if (unlikely(!node_match(c, node))) { |
1883 | if (unlikely(!node_match(c, node))) | 2075 | stat(s, ALLOC_NODE_MISMATCH); |
1884 | goto another_slab; | 2076 | deactivate_slab(s, c); |
2077 | goto new_slab; | ||
2078 | } | ||
2079 | |||
2080 | stat(s, ALLOC_SLOWPATH); | ||
2081 | |||
2082 | do { | ||
2083 | object = page->freelist; | ||
2084 | counters = page->counters; | ||
2085 | new.counters = counters; | ||
2086 | VM_BUG_ON(!new.frozen); | ||
2087 | |||
2088 | /* | ||
2089 | * If there is no object left then we use this loop to | ||
2090 | * deactivate the slab, which is simple since no objects | ||
2091 | * are left in the slab and therefore we do not need to | ||
2092 | * put the page back onto the partial list. | ||
2093 | * | ||
2094 | * If there are objects left then we retrieve them | ||
2095 | * and use them to refill the per cpu queue. | ||
2096 | */ | ||
2097 | |||
2098 | new.inuse = page->objects; | ||
2099 | new.frozen = object != NULL; | ||
2100 | |||
2101 | } while (!__cmpxchg_double_slab(s, page, | ||
2102 | object, counters, | ||
2103 | NULL, new.counters, | ||
2104 | "__slab_alloc")); | ||
2105 | |||
2106 | if (unlikely(!object)) { | ||
2107 | c->page = NULL; | ||
2108 | stat(s, DEACTIVATE_BYPASS); | ||
2109 | goto new_slab; | ||
2110 | } | ||
1885 | 2111 | ||
1886 | stat(s, ALLOC_REFILL); | 2112 | stat(s, ALLOC_REFILL); |
1887 | 2113 | ||
1888 | load_freelist: | 2114 | load_freelist: |
1889 | object = page->freelist; | 2115 | VM_BUG_ON(!page->frozen); |
1890 | if (unlikely(!object)) | ||
1891 | goto another_slab; | ||
1892 | if (kmem_cache_debug(s)) | ||
1893 | goto debug; | ||
1894 | |||
1895 | c->freelist = get_freepointer(s, object); | 2116 | c->freelist = get_freepointer(s, object); |
1896 | page->inuse = page->objects; | ||
1897 | page->freelist = NULL; | ||
1898 | |||
1899 | slab_unlock(page); | ||
1900 | c->tid = next_tid(c->tid); | 2117 | c->tid = next_tid(c->tid); |
1901 | local_irq_restore(flags); | 2118 | local_irq_restore(flags); |
1902 | stat(s, ALLOC_SLOWPATH); | ||
1903 | return object; | 2119 | return object; |
1904 | 2120 | ||
1905 | another_slab: | ||
1906 | deactivate_slab(s, c); | ||
1907 | |||
1908 | new_slab: | 2121 | new_slab: |
1909 | page = get_partial(s, gfpflags, node); | 2122 | page = get_partial(s, gfpflags, node); |
1910 | if (page) { | 2123 | if (page) { |
1911 | stat(s, ALLOC_FROM_PARTIAL); | 2124 | stat(s, ALLOC_FROM_PARTIAL); |
1912 | c->node = page_to_nid(page); | 2125 | object = c->freelist; |
1913 | c->page = page; | 2126 | |
2127 | if (kmem_cache_debug(s)) | ||
2128 | goto debug; | ||
1914 | goto load_freelist; | 2129 | goto load_freelist; |
1915 | } | 2130 | } |
1916 | 2131 | ||
1917 | gfpflags &= gfp_allowed_mask; | ||
1918 | if (gfpflags & __GFP_WAIT) | ||
1919 | local_irq_enable(); | ||
1920 | |||
1921 | page = new_slab(s, gfpflags, node); | 2132 | page = new_slab(s, gfpflags, node); |
1922 | 2133 | ||
1923 | if (gfpflags & __GFP_WAIT) | ||
1924 | local_irq_disable(); | ||
1925 | |||
1926 | if (page) { | 2134 | if (page) { |
1927 | c = __this_cpu_ptr(s->cpu_slab); | 2135 | c = __this_cpu_ptr(s->cpu_slab); |
1928 | stat(s, ALLOC_SLAB); | ||
1929 | if (c->page) | 2136 | if (c->page) |
1930 | flush_slab(s, c); | 2137 | flush_slab(s, c); |
1931 | 2138 | ||
1932 | slab_lock(page); | 2139 | /* |
1933 | __SetPageSlubFrozen(page); | 2140 | * No other reference to the page yet so we can |
2141 | * muck around with it freely without cmpxchg | ||
2142 | */ | ||
2143 | object = page->freelist; | ||
2144 | page->freelist = NULL; | ||
2145 | page->inuse = page->objects; | ||
2146 | |||
2147 | stat(s, ALLOC_SLAB); | ||
1934 | c->node = page_to_nid(page); | 2148 | c->node = page_to_nid(page); |
1935 | c->page = page; | 2149 | c->page = page; |
2150 | |||
2151 | if (kmem_cache_debug(s)) | ||
2152 | goto debug; | ||
1936 | goto load_freelist; | 2153 | goto load_freelist; |
1937 | } | 2154 | } |
1938 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2155 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) |
1939 | slab_out_of_memory(s, gfpflags, node); | 2156 | slab_out_of_memory(s, gfpflags, node); |
1940 | local_irq_restore(flags); | 2157 | local_irq_restore(flags); |
1941 | return NULL; | 2158 | return NULL; |
2159 | |||
1942 | debug: | 2160 | debug: |
1943 | if (!alloc_debug_processing(s, page, object, addr)) | 2161 | if (!object || !alloc_debug_processing(s, page, object, addr)) |
1944 | goto another_slab; | 2162 | goto new_slab; |
1945 | 2163 | ||
1946 | page->inuse++; | 2164 | c->freelist = get_freepointer(s, object); |
1947 | page->freelist = get_freepointer(s, object); | ||
1948 | deactivate_slab(s, c); | 2165 | deactivate_slab(s, c); |
1949 | c->page = NULL; | 2166 | c->page = NULL; |
1950 | c->node = NUMA_NO_NODE; | 2167 | c->node = NUMA_NO_NODE; |
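The refill path above detaches the entire page freelist in one atomic step (page->freelist becomes NULL, inuse jumps to page->objects, frozen is set) and then serves objects from the detached chain with no further synchronization. A rough user-space analogue of that bulk detach is sketched below; shared_head, cpu_list and the next-pointer-at-offset-0 layout are all assumptions for the sketch, not the SLUB API, and a plain exchange stands in for the cmpxchg that also updates the counters.

#include <stdatomic.h>
#include <stddef.h>

/* Each free object stores the pointer to the next free object at
 * offset 0 (hypothetical layout; SLUB stores it at s->offset). */
static inline void *get_next(void *object)
{
	return *(void **)object;
}

/*
 * Grab the whole shared freelist in one atomic step, keep the first
 * object for the caller and stash the rest as a private per-cpu list
 * that can then be consumed without further synchronization.
 */
void *refill_and_alloc(_Atomic(void *) *shared_head, void **cpu_list)
{
	void *object = atomic_exchange_explicit(shared_head, NULL,
						memory_order_acq_rel);

	if (!object)
		return NULL;		/* exhausted: caller finds a new slab */

	*cpu_list = get_next(object);	/* remaining objects, now CPU-private */
	return object;
}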
@@ -2096,52 +2313,89 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2096 | { | 2313 | { |
2097 | void *prior; | 2314 | void *prior; |
2098 | void **object = (void *)x; | 2315 | void **object = (void *)x; |
2099 | unsigned long flags; | 2316 | int was_frozen; |
2317 | int inuse; | ||
2318 | struct page new; | ||
2319 | unsigned long counters; | ||
2320 | struct kmem_cache_node *n = NULL; | ||
2321 | unsigned long uninitialized_var(flags); | ||
2100 | 2322 | ||
2101 | local_irq_save(flags); | ||
2102 | slab_lock(page); | ||
2103 | stat(s, FREE_SLOWPATH); | 2323 | stat(s, FREE_SLOWPATH); |
2104 | 2324 | ||
2105 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2325 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) |
2106 | goto out_unlock; | 2326 | return; |
2107 | 2327 | ||
2108 | prior = page->freelist; | 2328 | do { |
2109 | set_freepointer(s, object, prior); | 2329 | prior = page->freelist; |
2110 | page->freelist = object; | 2330 | counters = page->counters; |
2111 | page->inuse--; | 2331 | set_freepointer(s, object, prior); |
2332 | new.counters = counters; | ||
2333 | was_frozen = new.frozen; | ||
2334 | new.inuse--; | ||
2335 | if ((!new.inuse || !prior) && !was_frozen && !n) { | ||
2336 | n = get_node(s, page_to_nid(page)); | ||
2337 | /* | ||
2338 | * Speculatively acquire the list_lock. | ||
2339 | * If the cmpxchg does not succeed then we may | ||
2340 | * drop the list_lock without any processing. | ||
2341 | * | ||
2342 | * Otherwise the list_lock will synchronize with | ||
2343 | * other processors updating the list of slabs. | ||
2344 | */ | ||
2345 | spin_lock_irqsave(&n->list_lock, flags); | ||
2346 | } | ||
2347 | inuse = new.inuse; | ||
2112 | 2348 | ||
2113 | if (unlikely(PageSlubFrozen(page))) { | 2349 | } while (!cmpxchg_double_slab(s, page, |
2114 | stat(s, FREE_FROZEN); | 2350 | prior, counters, |
2115 | goto out_unlock; | 2351 | object, new.counters, |
2116 | } | 2352 | "__slab_free")); |
2117 | 2353 | ||
2118 | if (unlikely(!page->inuse)) | 2354 | if (likely(!n)) { |
2119 | goto slab_empty; | 2355 | /* |
2356 | * The list lock was not taken therefore no list | ||
2357 | * activity can be necessary. | ||
2358 | */ | ||
2359 | if (was_frozen) | ||
2360 | stat(s, FREE_FROZEN); | ||
2361 | return; | ||
2362 | } | ||
2120 | 2363 | ||
2121 | /* | 2364 | /* |
2122 | * Objects left in the slab. If it was not on the partial list before | 2365 | * was_frozen may have been set after we acquired the list_lock in |
2123 | * then add it. | 2366 | * an earlier loop. So we need to check it here again. |
2124 | */ | 2367 | */ |
2125 | if (unlikely(!prior)) { | 2368 | if (was_frozen) |
2126 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 2369 | stat(s, FREE_FROZEN); |
2127 | stat(s, FREE_ADD_PARTIAL); | 2370 | else { |
2128 | } | 2371 | if (unlikely(!inuse && n->nr_partial > s->min_partial)) |
2372 | goto slab_empty; | ||
2129 | 2373 | ||
2130 | out_unlock: | 2374 | /* |
2131 | slab_unlock(page); | 2375 | * Objects left in the slab. If it was not on the partial list before |
2132 | local_irq_restore(flags); | 2376 | * then add it. |
2377 | */ | ||
2378 | if (unlikely(!prior)) { | ||
2379 | remove_full(s, page); | ||
2380 | add_partial(n, page, 0); | ||
2381 | stat(s, FREE_ADD_PARTIAL); | ||
2382 | } | ||
2383 | } | ||
2384 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2133 | return; | 2385 | return; |
2134 | 2386 | ||
2135 | slab_empty: | 2387 | slab_empty: |
2136 | if (prior) { | 2388 | if (prior) { |
2137 | /* | 2389 | /* |
2138 | * Slab still on the partial list. | 2390 | * Slab on the partial list. |
2139 | */ | 2391 | */ |
2140 | remove_partial(s, page); | 2392 | remove_partial(n, page); |
2141 | stat(s, FREE_REMOVE_PARTIAL); | 2393 | stat(s, FREE_REMOVE_PARTIAL); |
2142 | } | 2394 | } else |
2143 | slab_unlock(page); | 2395 | /* Slab must be on the full list */ |
2144 | local_irq_restore(flags); | 2396 | remove_full(s, page); |
2397 | |||
2398 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2145 | stat(s, FREE_SLAB); | 2399 | stat(s, FREE_SLAB); |
2146 | discard_slab(s, page); | 2400 | discard_slab(s, page); |
2147 | } | 2401 | } |
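The free slowpath above only takes the per-node list_lock when the free might change the slab's list membership (last object freed, or first free into a full slab), and it takes the lock speculatively before the cmpxchg so that a lost race just means dropping it again unused. A simplified sketch of that idea with ordinary C11 atomics and a pthread mutex follows; the single atomic inuse counter is a stand-in for the freelist/counters pair and all names are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Free one object from a slab whose state is reduced here to a single
 * atomic "inuse" counter.  The list lock is taken speculatively, before
 * the atomic update, only when the free might leave the slab empty; if
 * no list manipulation turns out to be needed it is simply dropped.
 */
void free_one(_Atomic int *inuse, bool may_need_list_work)
{
	bool locked = false;
	int old, new;

	do {
		old = atomic_load_explicit(inuse, memory_order_relaxed);
		new = old - 1;
		if (new == 0 && may_need_list_work && !locked) {
			pthread_mutex_lock(&list_lock);	/* speculative */
			locked = true;
		}
	} while (!atomic_compare_exchange_weak_explicit(inuse, &old, new,
							memory_order_acq_rel,
							memory_order_relaxed));

	if (locked) {
		/* partial/full list manipulation would happen here */
		pthread_mutex_unlock(&list_lock);
	}
}

Compile with -pthread. The payoff, as in __slab_free(), is that the common case (object freed back to a slab that stays on the same list) never touches the lock at all.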
@@ -2415,7 +2669,6 @@ static void early_kmem_cache_node_alloc(int node) | |||
2415 | { | 2669 | { |
2416 | struct page *page; | 2670 | struct page *page; |
2417 | struct kmem_cache_node *n; | 2671 | struct kmem_cache_node *n; |
2418 | unsigned long flags; | ||
2419 | 2672 | ||
2420 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); | 2673 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); |
2421 | 2674 | ||
@@ -2433,6 +2686,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2433 | BUG_ON(!n); | 2686 | BUG_ON(!n); |
2434 | page->freelist = get_freepointer(kmem_cache_node, n); | 2687 | page->freelist = get_freepointer(kmem_cache_node, n); |
2435 | page->inuse++; | 2688 | page->inuse++; |
2689 | page->frozen = 0; | ||
2436 | kmem_cache_node->node[node] = n; | 2690 | kmem_cache_node->node[node] = n; |
2437 | #ifdef CONFIG_SLUB_DEBUG | 2691 | #ifdef CONFIG_SLUB_DEBUG |
2438 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2692 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
@@ -2441,14 +2695,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2441 | init_kmem_cache_node(n, kmem_cache_node); | 2695 | init_kmem_cache_node(n, kmem_cache_node); |
2442 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2696 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2443 | 2697 | ||
2444 | /* | ||
2445 | * lockdep requires consistent irq usage for each lock | ||
2446 | * so even though there cannot be a race this early in | ||
2447 | * the boot sequence, we still disable irqs. | ||
2448 | */ | ||
2449 | local_irq_save(flags); | ||
2450 | add_partial(n, page, 0); | 2698 | add_partial(n, page, 0); |
2451 | local_irq_restore(flags); | ||
2452 | } | 2699 | } |
2453 | 2700 | ||
2454 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2701 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
@@ -2654,6 +2901,12 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
2654 | } | 2901 | } |
2655 | } | 2902 | } |
2656 | 2903 | ||
2904 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
2905 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) | ||
2906 | /* Enable fast mode */ | ||
2907 | s->flags |= __CMPXCHG_DOUBLE; | ||
2908 | #endif | ||
2909 | |||
2657 | /* | 2910 | /* |
2658 | * The larger the object size is, the more pages we want on the partial | 2911 | * The larger the object size is, the more pages we want on the partial |
2659 | * list to avoid pounding the page allocator excessively. | 2912 | * list to avoid pounding the page allocator excessively. |
@@ -2726,7 +2979,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | |||
2726 | spin_lock_irqsave(&n->list_lock, flags); | 2979 | spin_lock_irqsave(&n->list_lock, flags); |
2727 | list_for_each_entry_safe(page, h, &n->partial, lru) { | 2980 | list_for_each_entry_safe(page, h, &n->partial, lru) { |
2728 | if (!page->inuse) { | 2981 | if (!page->inuse) { |
2729 | __remove_partial(n, page); | 2982 | remove_partial(n, page); |
2730 | discard_slab(s, page); | 2983 | discard_slab(s, page); |
2731 | } else { | 2984 | } else { |
2732 | list_slab_objects(s, page, | 2985 | list_slab_objects(s, page, |
@@ -3094,14 +3347,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3094 | * list_lock. page->inuse here is the upper limit. | 3347 | * list_lock. page->inuse here is the upper limit. |
3095 | */ | 3348 | */ |
3096 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3349 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
3097 | if (!page->inuse && slab_trylock(page)) { | 3350 | if (!page->inuse) { |
3098 | /* | 3351 | remove_partial(n, page); |
3099 | * Must hold slab lock here because slab_free | ||
3100 | * may have freed the last object and be | ||
3101 | * waiting to release the slab. | ||
3102 | */ | ||
3103 | __remove_partial(n, page); | ||
3104 | slab_unlock(page); | ||
3105 | discard_slab(s, page); | 3352 | discard_slab(s, page); |
3106 | } else { | 3353 | } else { |
3107 | list_move(&page->lru, | 3354 | list_move(&page->lru, |
@@ -3689,12 +3936,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3689 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, | 3936 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, |
3690 | unsigned long *map) | 3937 | unsigned long *map) |
3691 | { | 3938 | { |
3692 | if (slab_trylock(page)) { | 3939 | slab_lock(page); |
3693 | validate_slab(s, page, map); | 3940 | validate_slab(s, page, map); |
3694 | slab_unlock(page); | 3941 | slab_unlock(page); |
3695 | } else | ||
3696 | printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", | ||
3697 | s->name, page); | ||
3698 | } | 3942 | } |
3699 | 3943 | ||
3700 | static int validate_slab_node(struct kmem_cache *s, | 3944 | static int validate_slab_node(struct kmem_cache *s, |
@@ -4342,8 +4586,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, | |||
4342 | const char *buf, size_t length) | 4586 | const char *buf, size_t length) |
4343 | { | 4587 | { |
4344 | s->flags &= ~SLAB_DEBUG_FREE; | 4588 | s->flags &= ~SLAB_DEBUG_FREE; |
4345 | if (buf[0] == '1') | 4589 | if (buf[0] == '1') { |
4590 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4346 | s->flags |= SLAB_DEBUG_FREE; | 4591 | s->flags |= SLAB_DEBUG_FREE; |
4592 | } | ||
4347 | return length; | 4593 | return length; |
4348 | } | 4594 | } |
4349 | SLAB_ATTR(sanity_checks); | 4595 | SLAB_ATTR(sanity_checks); |
@@ -4357,8 +4603,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4357 | size_t length) | 4603 | size_t length) |
4358 | { | 4604 | { |
4359 | s->flags &= ~SLAB_TRACE; | 4605 | s->flags &= ~SLAB_TRACE; |
4360 | if (buf[0] == '1') | 4606 | if (buf[0] == '1') { |
4607 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4361 | s->flags |= SLAB_TRACE; | 4608 | s->flags |= SLAB_TRACE; |
4609 | } | ||
4362 | return length; | 4610 | return length; |
4363 | } | 4611 | } |
4364 | SLAB_ATTR(trace); | 4612 | SLAB_ATTR(trace); |
@@ -4375,8 +4623,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, | |||
4375 | return -EBUSY; | 4623 | return -EBUSY; |
4376 | 4624 | ||
4377 | s->flags &= ~SLAB_RED_ZONE; | 4625 | s->flags &= ~SLAB_RED_ZONE; |
4378 | if (buf[0] == '1') | 4626 | if (buf[0] == '1') { |
4627 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4379 | s->flags |= SLAB_RED_ZONE; | 4628 | s->flags |= SLAB_RED_ZONE; |
4629 | } | ||
4380 | calculate_sizes(s, -1); | 4630 | calculate_sizes(s, -1); |
4381 | return length; | 4631 | return length; |
4382 | } | 4632 | } |
@@ -4394,8 +4644,10 @@ static ssize_t poison_store(struct kmem_cache *s, | |||
4394 | return -EBUSY; | 4644 | return -EBUSY; |
4395 | 4645 | ||
4396 | s->flags &= ~SLAB_POISON; | 4646 | s->flags &= ~SLAB_POISON; |
4397 | if (buf[0] == '1') | 4647 | if (buf[0] == '1') { |
4648 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4398 | s->flags |= SLAB_POISON; | 4649 | s->flags |= SLAB_POISON; |
4650 | } | ||
4399 | calculate_sizes(s, -1); | 4651 | calculate_sizes(s, -1); |
4400 | return length; | 4652 | return length; |
4401 | } | 4653 | } |
@@ -4413,8 +4665,10 @@ static ssize_t store_user_store(struct kmem_cache *s, | |||
4413 | return -EBUSY; | 4665 | return -EBUSY; |
4414 | 4666 | ||
4415 | s->flags &= ~SLAB_STORE_USER; | 4667 | s->flags &= ~SLAB_STORE_USER; |
4416 | if (buf[0] == '1') | 4668 | if (buf[0] == '1') { |
4669 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4417 | s->flags |= SLAB_STORE_USER; | 4670 | s->flags |= SLAB_STORE_USER; |
4671 | } | ||
4418 | calculate_sizes(s, -1); | 4672 | calculate_sizes(s, -1); |
4419 | return length; | 4673 | return length; |
4420 | } | 4674 | } |
@@ -4579,6 +4833,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); | |||
4579 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); | 4833 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); |
4580 | STAT_ATTR(ALLOC_SLAB, alloc_slab); | 4834 | STAT_ATTR(ALLOC_SLAB, alloc_slab); |
4581 | STAT_ATTR(ALLOC_REFILL, alloc_refill); | 4835 | STAT_ATTR(ALLOC_REFILL, alloc_refill); |
4836 | STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); | ||
4582 | STAT_ATTR(FREE_SLAB, free_slab); | 4837 | STAT_ATTR(FREE_SLAB, free_slab); |
4583 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); | 4838 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); |
4584 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); | 4839 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); |
@@ -4586,7 +4841,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); | |||
4586 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); | 4841 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); |
4587 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); | 4842 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); |
4588 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); | 4843 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); |
4844 | STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); | ||
4589 | STAT_ATTR(ORDER_FALLBACK, order_fallback); | 4845 | STAT_ATTR(ORDER_FALLBACK, order_fallback); |
4846 | STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); | ||
4847 | STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); | ||
4590 | #endif | 4848 | #endif |
4591 | 4849 | ||
4592 | static struct attribute *slab_attrs[] = { | 4850 | static struct attribute *slab_attrs[] = { |
@@ -4636,6 +4894,7 @@ static struct attribute *slab_attrs[] = { | |||
4636 | &alloc_from_partial_attr.attr, | 4894 | &alloc_from_partial_attr.attr, |
4637 | &alloc_slab_attr.attr, | 4895 | &alloc_slab_attr.attr, |
4638 | &alloc_refill_attr.attr, | 4896 | &alloc_refill_attr.attr, |
4897 | &alloc_node_mismatch_attr.attr, | ||
4639 | &free_slab_attr.attr, | 4898 | &free_slab_attr.attr, |
4640 | &cpuslab_flush_attr.attr, | 4899 | &cpuslab_flush_attr.attr, |
4641 | &deactivate_full_attr.attr, | 4900 | &deactivate_full_attr.attr, |
@@ -4643,7 +4902,10 @@ static struct attribute *slab_attrs[] = { | |||
4643 | &deactivate_to_head_attr.attr, | 4902 | &deactivate_to_head_attr.attr, |
4644 | &deactivate_to_tail_attr.attr, | 4903 | &deactivate_to_tail_attr.attr, |
4645 | &deactivate_remote_frees_attr.attr, | 4904 | &deactivate_remote_frees_attr.attr, |
4905 | &deactivate_bypass_attr.attr, | ||
4646 | &order_fallback_attr.attr, | 4906 | &order_fallback_attr.attr, |
4907 | &cmpxchg_double_fail_attr.attr, | ||
4908 | &cmpxchg_double_cpu_fail_attr.attr, | ||
4647 | #endif | 4909 | #endif |
4648 | #ifdef CONFIG_FAILSLAB | 4910 | #ifdef CONFIG_FAILSLAB |
4649 | &failslab_attr.attr, | 4911 | &failslab_attr.attr, |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1b8c33907242..17bc224bce68 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1924,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1924 | 1924 | ||
1925 | /* | 1925 | /* |
1926 | * Find out how many pages are allowed for a single swap | 1926 | * Find out how many pages are allowed for a single swap |
1927 | * device. There are two limiting factors: 1) the number of | 1927 | * device. There are three limiting factors: 1) the number |
1928 | * bits for the swap offset in the swp_entry_t type and | 1928 | * of bits for the swap offset in the swp_entry_t type, and |
1929 | * 2) the number of bits in the a swap pte as defined by | 1929 | * 2) the number of bits in the swap pte as defined by
1930 | * the different architectures. In order to find the | 1930 | * the different architectures, and 3) the number of free bits |
1931 | * largest possible bit mask a swap entry with swap type 0 | 1931 | * in an exceptional radix_tree entry. In order to find the |
1932 | * largest possible bit mask, a swap entry with swap type 0 | ||
1932 | * and swap offset ~0UL is created, encoded to a swap pte, | 1933 | * and swap offset ~0UL is created, encoded to a swap pte, |
1933 | * decoded to a swp_entry_t again and finally the swap | 1934 | * decoded to a swp_entry_t again, and finally the swap |
1934 | * offset is extracted. This will mask all the bits from | 1935 | * offset is extracted. This will mask all the bits from |
1935 | * the initial ~0UL mask that can't be encoded in either | 1936 | * the initial ~0UL mask that can't be encoded in either |
1936 | * the swp_entry_t or the architecture definition of a | 1937 | * the swp_entry_t or the architecture definition of a |
1937 | * swap pte. | 1938 | * swap pte. Then the same is done for a radix_tree entry. |
1938 | */ | 1939 | */ |
1939 | maxpages = swp_offset(pte_to_swp_entry( | 1940 | maxpages = swp_offset(pte_to_swp_entry( |
1940 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; | 1941 | swp_entry_to_pte(swp_entry(0, ~0UL)))); |
1942 | maxpages = swp_offset(radix_to_swp_entry( | ||
1943 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1944 | |||
1941 | if (maxpages > swap_header->info.last_page) { | 1945 | if (maxpages > swap_header->info.last_page) { |
1942 | maxpages = swap_header->info.last_page + 1; | 1946 | maxpages = swap_header->info.last_page + 1; |
1943 | /* p->max is an unsigned int: don't overflow it */ | 1947 | /* p->max is an unsigned int: don't overflow it */ |
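The maxpages calculation above finds the representable offset limit by round-tripping an all-ones offset through each encoding (swap pte, then radix_tree entry) and seeing which bits survive. The same trick in a standalone toy, where the encode step keeps only 32 offset bits; the 32-bit limit and the helper names are invented for illustration, the real limits come from the architecture's swap pte and the exceptional radix_tree entry format.

#include <stdio.h>

/* Toy stand-ins for swp_entry_to_pte()/pte_to_swp_entry(): this "pte"
 * keeps only 32 offset bits, so the ~0 round trip exposes that limit. */
static unsigned long long encode(unsigned long long offset)
{
	return offset & 0xffffffffULL;
}

static unsigned long long decode(unsigned long long pte)
{
	return pte;
}

int main(void)
{
	unsigned long long maxpages = decode(encode(~0ULL)) + 1;

	printf("largest encodable swap offset + 1: %#llx pages\n", maxpages);
	return 0;
}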
diff --git a/mm/truncate.c b/mm/truncate.c index 232eb2736a79..b40ac6d4e86e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
336 | unsigned long count = 0; | 336 | unsigned long count = 0; |
337 | int i; | 337 | int i; |
338 | 338 | ||
339 | /* | ||
340 | * Note: this function may get called on a shmem/tmpfs mapping: | ||
341 | * pagevec_lookup() might then return 0 prematurely (because it | ||
342 | * got a gangful of swap entries); but it's hardly worth worrying | ||
343 | * about - it can rarely have anything to free from such a mapping | ||
344 | * (most pages are dirty), and already skips over any difficulties. | ||
345 | */ | ||
346 | |||
339 | pagevec_init(&pvec, 0); | 347 | pagevec_init(&pvec, 0); |
340 | while (index <= end && pagevec_lookup(&pvec, mapping, index, | 348 | while (index <= end && pagevec_lookup(&pvec, mapping, index, |
341 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 349 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ab8494cde007..5016f19e1661 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -26,7 +26,7 @@ | |||
26 | #include <linux/rcupdate.h> | 26 | #include <linux/rcupdate.h> |
27 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
28 | #include <linux/kmemleak.h> | 28 | #include <linux/kmemleak.h> |
29 | #include <asm/atomic.h> | 29 | #include <linux/atomic.h> |
30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
32 | #include <asm/shmparam.h> | 32 | #include <asm/shmparam.h> |
@@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) | |||
725 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | 725 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) |
726 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | 726 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ |
727 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | 727 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ |
728 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | 728 | #define VMAP_BBMAP_BITS \ |
729 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | 729 | VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ |
730 | VMALLOC_PAGES / NR_CPUS / 16)) | 730 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ |
731 | VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) | ||
731 | 732 | ||
732 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | 733 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) |
733 | 734 | ||
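The roundup_pow_of_two() change above keeps the VMAP_BBMAP_BITS divisor from introducing odd factors when NR_CPUS is not a power of two, so the clamped result stays in the power-of-two family the surrounding block-size arithmetic works with. A tiny standalone illustration of the arithmetic effect, with every constant invented (the real values depend on the size of the vmalloc area and VMAP_MAX_ALLOC):

#include <stdio.h>

/* Local reimplementation for the sketch; not the kernel helper. */
static unsigned long roundup_p2(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long vmalloc_pages = 1UL << 20;	/* pretend VMALLOC_PAGES */
	unsigned long nr_cpus = 6;			/* not a power of two    */

	printf("naive:   %lu bits\n", vmalloc_pages / nr_cpus / 16);
	printf("rounded: %lu bits\n", vmalloc_pages / roundup_p2(nr_cpus) / 16);
	return 0;
}

With six CPUs the naive division yields 10922, while rounding the CPU count up to eight yields 8192, a clean power of two.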
@@ -2139,6 +2140,14 @@ struct vm_struct *alloc_vm_area(size_t size) | |||
2139 | return NULL; | 2140 | return NULL; |
2140 | } | 2141 | } |
2141 | 2142 | ||
2143 | /* | ||
2144 | * If the allocated address space is passed to a hypercall | ||
2145 | * before being used then we cannot rely on a page fault to | ||
2146 | * trigger an update of the page tables. So sync all the page | ||
2147 | * tables here. | ||
2148 | */ | ||
2149 | vmalloc_sync_all(); | ||
2150 | |||
2142 | return area; | 2151 | return area; |
2143 | } | 2152 | } |
2144 | EXPORT_SYMBOL_GPL(alloc_vm_area); | 2153 | EXPORT_SYMBOL_GPL(alloc_vm_area); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 8e32698fab66..9fdfce7ba403 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -95,8 +95,6 @@ struct scan_control { | |||
95 | /* Can pages be swapped as part of reclaim? */ | 95 | /* Can pages be swapped as part of reclaim? */ |
96 | int may_swap; | 96 | int may_swap; |
97 | 97 | ||
98 | int swappiness; | ||
99 | |||
100 | int order; | 98 | int order; |
101 | 99 | ||
102 | /* | 100 | /* |
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, | |||
173 | struct scan_control *sc, enum lru_list lru) | 171 | struct scan_control *sc, enum lru_list lru) |
174 | { | 172 | { |
175 | if (!scanning_global_lru(sc)) | 173 | if (!scanning_global_lru(sc)) |
176 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); | 174 | return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, |
175 | zone_to_nid(zone), zone_idx(zone), BIT(lru)); | ||
177 | 176 | ||
178 | return zone_page_state(zone, NR_LRU_BASE + lru); | 177 | return zone_page_state(zone, NR_LRU_BASE + lru); |
179 | } | 178 | } |
@@ -1770,6 +1769,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | |||
1770 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1769 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
1771 | } | 1770 | } |
1772 | 1771 | ||
1772 | static int vmscan_swappiness(struct scan_control *sc) | ||
1773 | { | ||
1774 | if (scanning_global_lru(sc)) | ||
1775 | return vm_swappiness; | ||
1776 | return mem_cgroup_swappiness(sc->mem_cgroup); | ||
1777 | } | ||
1778 | |||
1773 | /* | 1779 | /* |
1774 | * Determine how aggressively the anon and file LRU lists should be | 1780 | * Determine how aggressively the anon and file LRU lists should be |
1775 | * scanned. The relative value of each set of LRU lists is determined | 1781 | * scanned. The relative value of each set of LRU lists is determined |
@@ -1788,22 +1794,15 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1788 | u64 fraction[2], denominator; | 1794 | u64 fraction[2], denominator; |
1789 | enum lru_list l; | 1795 | enum lru_list l; |
1790 | int noswap = 0; | 1796 | int noswap = 0; |
1791 | int force_scan = 0; | 1797 | bool force_scan = false; |
1792 | 1798 | unsigned long nr_force_scan[2]; | |
1793 | |||
1794 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1795 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1796 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1797 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1798 | 1799 | ||
1799 | if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { | 1800 | /* kswapd does zone balancing and needs to scan this zone */ |
1800 | /* kswapd does zone balancing and need to scan this zone */ | 1801 | if (scanning_global_lru(sc) && current_is_kswapd()) |
1801 | if (scanning_global_lru(sc) && current_is_kswapd()) | 1802 | force_scan = true; |
1802 | force_scan = 1; | 1803 | /* memcg may have small limit and need to avoid priority drop */ |
1803 | /* memcg may have small limit and need to avoid priority drop */ | 1804 | if (!scanning_global_lru(sc)) |
1804 | if (!scanning_global_lru(sc)) | 1805 | force_scan = true; |
1805 | force_scan = 1; | ||
1806 | } | ||
1807 | 1806 | ||
1808 | /* If we have no swap space, do not bother scanning anon pages. */ | 1807 | /* If we have no swap space, do not bother scanning anon pages. */ |
1809 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | 1808 | if (!sc->may_swap || (nr_swap_pages <= 0)) { |
@@ -1811,9 +1810,16 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1811 | fraction[0] = 0; | 1810 | fraction[0] = 0; |
1812 | fraction[1] = 1; | 1811 | fraction[1] = 1; |
1813 | denominator = 1; | 1812 | denominator = 1; |
1813 | nr_force_scan[0] = 0; | ||
1814 | nr_force_scan[1] = SWAP_CLUSTER_MAX; | ||
1814 | goto out; | 1815 | goto out; |
1815 | } | 1816 | } |
1816 | 1817 | ||
1818 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
1819 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | ||
1820 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
1821 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); | ||
1822 | |||
1817 | if (scanning_global_lru(sc)) { | 1823 | if (scanning_global_lru(sc)) { |
1818 | free = zone_page_state(zone, NR_FREE_PAGES); | 1824 | free = zone_page_state(zone, NR_FREE_PAGES); |
1819 | /* If we have very few page cache pages, | 1825 | /* If we have very few page cache pages, |
@@ -1822,6 +1828,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1822 | fraction[0] = 1; | 1828 | fraction[0] = 1; |
1823 | fraction[1] = 0; | 1829 | fraction[1] = 0; |
1824 | denominator = 1; | 1830 | denominator = 1; |
1831 | nr_force_scan[0] = SWAP_CLUSTER_MAX; | ||
1832 | nr_force_scan[1] = 0; | ||
1825 | goto out; | 1833 | goto out; |
1826 | } | 1834 | } |
1827 | } | 1835 | } |
@@ -1830,8 +1838,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1830 | * With swappiness at 100, anonymous and file have the same priority. | 1838 | * With swappiness at 100, anonymous and file have the same priority. |
1831 | * This scanning priority is essentially the inverse of IO cost. | 1839 | * This scanning priority is essentially the inverse of IO cost. |
1832 | */ | 1840 | */ |
1833 | anon_prio = sc->swappiness; | 1841 | anon_prio = vmscan_swappiness(sc); |
1834 | file_prio = 200 - sc->swappiness; | 1842 | file_prio = 200 - vmscan_swappiness(sc); |
1835 | 1843 | ||
1836 | /* | 1844 | /* |
1837 | * OK, so we have swap space and a fair amount of page cache | 1845 | * OK, so we have swap space and a fair amount of page cache |
@@ -1870,6 +1878,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, | |||
1870 | fraction[0] = ap; | 1878 | fraction[0] = ap; |
1871 | fraction[1] = fp; | 1879 | fraction[1] = fp; |
1872 | denominator = ap + fp + 1; | 1880 | denominator = ap + fp + 1; |
1881 | if (force_scan) { | ||
1882 | unsigned long scan = SWAP_CLUSTER_MAX; | ||
1883 | nr_force_scan[0] = div64_u64(scan * ap, denominator); | ||
1884 | nr_force_scan[1] = div64_u64(scan * fp, denominator); | ||
1885 | } | ||
1873 | out: | 1886 | out: |
1874 | for_each_evictable_lru(l) { | 1887 | for_each_evictable_lru(l) { |
1875 | int file = is_file_lru(l); | 1888 | int file = is_file_lru(l); |
@@ -1890,12 +1903,8 @@ out: | |||
1890 | * memcg, priority drop can cause big latency. So, it's better | 1903 | * memcg, priority drop can cause big latency. So, it's better |
1891 | * to scan small amount. See may_noscan above. | 1904 | * to scan small amount. See may_noscan above. |
1892 | */ | 1905 | */ |
1893 | if (!scan && force_scan) { | 1906 | if (!scan && force_scan) |
1894 | if (file) | 1907 | scan = nr_force_scan[file]; |
1895 | scan = SWAP_CLUSTER_MAX; | ||
1896 | else if (!noswap) | ||
1897 | scan = SWAP_CLUSTER_MAX; | ||
1898 | } | ||
1899 | nr[l] = scan; | 1908 | nr[l] = scan; |
1900 | } | 1909 | } |
1901 | } | 1910 | } |
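get_scan_count() above now precomputes nr_force_scan[] so that, when force_scan applies and the regular target rounds down to zero, the forced SWAP_CLUSTER_MAX batch is split between anon and file in the same ap/fp ratio as the normal scan targets instead of a flat SWAP_CLUSTER_MAX per list. A small standalone sketch of that split; the ap/fp weights are made up, and plain 64-bit division stands in for div64_u64().

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32ULL

int main(void)
{
	unsigned long long ap = 60, fp = 140;		/* anon/file pressure  */
	unsigned long long denominator = ap + fp + 1;	/* as in get_scan_count */
	unsigned long long nr_force_scan[2];

	nr_force_scan[0] = SWAP_CLUSTER_MAX * ap / denominator;	/* anon share */
	nr_force_scan[1] = SWAP_CLUSTER_MAX * fp / denominator;	/* file share */

	printf("forced scan: anon=%llu file=%llu\n",
	       nr_force_scan[0], nr_force_scan[1]);
	return 0;
}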
@@ -2220,7 +2229,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2220 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2229 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2221 | .may_unmap = 1, | 2230 | .may_unmap = 1, |
2222 | .may_swap = 1, | 2231 | .may_swap = 1, |
2223 | .swappiness = vm_swappiness, | ||
2224 | .order = order, | 2232 | .order = order, |
2225 | .mem_cgroup = NULL, | 2233 | .mem_cgroup = NULL, |
2226 | .nodemask = nodemask, | 2234 | .nodemask = nodemask, |
@@ -2244,7 +2252,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2244 | 2252 | ||
2245 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 2253 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
2246 | gfp_t gfp_mask, bool noswap, | 2254 | gfp_t gfp_mask, bool noswap, |
2247 | unsigned int swappiness, | ||
2248 | struct zone *zone, | 2255 | struct zone *zone, |
2249 | unsigned long *nr_scanned) | 2256 | unsigned long *nr_scanned) |
2250 | { | 2257 | { |
@@ -2254,7 +2261,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2254 | .may_writepage = !laptop_mode, | 2261 | .may_writepage = !laptop_mode, |
2255 | .may_unmap = 1, | 2262 | .may_unmap = 1, |
2256 | .may_swap = !noswap, | 2263 | .may_swap = !noswap, |
2257 | .swappiness = swappiness, | ||
2258 | .order = 0, | 2264 | .order = 0, |
2259 | .mem_cgroup = mem, | 2265 | .mem_cgroup = mem, |
2260 | }; | 2266 | }; |
@@ -2283,8 +2289,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
2283 | 2289 | ||
2284 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 2290 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
2285 | gfp_t gfp_mask, | 2291 | gfp_t gfp_mask, |
2286 | bool noswap, | 2292 | bool noswap) |
2287 | unsigned int swappiness) | ||
2288 | { | 2293 | { |
2289 | struct zonelist *zonelist; | 2294 | struct zonelist *zonelist; |
2290 | unsigned long nr_reclaimed; | 2295 | unsigned long nr_reclaimed; |
@@ -2294,7 +2299,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
2294 | .may_unmap = 1, | 2299 | .may_unmap = 1, |
2295 | .may_swap = !noswap, | 2300 | .may_swap = !noswap, |
2296 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2301 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2297 | .swappiness = swappiness, | ||
2298 | .order = 0, | 2302 | .order = 0, |
2299 | .mem_cgroup = mem_cont, | 2303 | .mem_cgroup = mem_cont, |
2300 | .nodemask = NULL, /* we don't care the placement */ | 2304 | .nodemask = NULL, /* we don't care the placement */ |
@@ -2445,7 +2449,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
2445 | * we want to put equal scanning pressure on each zone. | 2449 | * we want to put equal scanning pressure on each zone. |
2446 | */ | 2450 | */ |
2447 | .nr_to_reclaim = ULONG_MAX, | 2451 | .nr_to_reclaim = ULONG_MAX, |
2448 | .swappiness = vm_swappiness, | ||
2449 | .order = order, | 2452 | .order = order, |
2450 | .mem_cgroup = NULL, | 2453 | .mem_cgroup = NULL, |
2451 | }; | 2454 | }; |
@@ -2494,6 +2497,9 @@ loop_again: | |||
2494 | high_wmark_pages(zone), 0, 0)) { | 2497 | high_wmark_pages(zone), 0, 0)) { |
2495 | end_zone = i; | 2498 | end_zone = i; |
2496 | break; | 2499 | break; |
2500 | } else { | ||
2501 | /* If balanced, clear the congested flag */ | ||
2502 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2497 | } | 2503 | } |
2498 | } | 2504 | } |
2499 | if (i < 0) | 2505 | if (i < 0) |
@@ -2915,7 +2921,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
2915 | .may_writepage = 1, | 2921 | .may_writepage = 1, |
2916 | .nr_to_reclaim = nr_to_reclaim, | 2922 | .nr_to_reclaim = nr_to_reclaim, |
2917 | .hibernation_mode = 1, | 2923 | .hibernation_mode = 1, |
2918 | .swappiness = vm_swappiness, | ||
2919 | .order = 0, | 2924 | .order = 0, |
2920 | }; | 2925 | }; |
2921 | struct shrink_control shrink = { | 2926 | struct shrink_control shrink = { |
@@ -3102,7 +3107,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3102 | .nr_to_reclaim = max_t(unsigned long, nr_pages, | 3107 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
3103 | SWAP_CLUSTER_MAX), | 3108 | SWAP_CLUSTER_MAX), |
3104 | .gfp_mask = gfp_mask, | 3109 | .gfp_mask = gfp_mask, |
3105 | .swappiness = vm_swappiness, | ||
3106 | .order = order, | 3110 | .order = order, |
3107 | }; | 3111 | }; |
3108 | struct shrink_control shrink = { | 3112 | struct shrink_control shrink = { |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b2..d52b13d28e8f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
659 | } | 659 | } |
660 | #endif | 660 | #endif |
661 | 661 | ||
662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) | 662 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
663 | #ifdef CONFIG_ZONE_DMA | 663 | #ifdef CONFIG_ZONE_DMA |
664 | #define TEXT_FOR_DMA(xx) xx "_dma", | 664 | #define TEXT_FOR_DMA(xx) xx "_dma", |
665 | #else | 665 | #else |
@@ -788,7 +788,7 @@ const char * const vmstat_text[] = { | |||
788 | 788 | ||
789 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 789 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
790 | }; | 790 | }; |
791 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ | 791 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
792 | 792 | ||
793 | 793 | ||
794 | #ifdef CONFIG_PROC_FS | 794 | #ifdef CONFIG_PROC_FS |