Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c     |  16
-rw-r--r--  mm/bootmem.c         |   8
-rw-r--r--  mm/filemap.c         |  84
-rw-r--r--  mm/memcontrol.c      | 630
-rw-r--r--  mm/memory-failure.c  |   8
-rw-r--r--  mm/memory.c          |  75
-rw-r--r--  mm/migrate.c         |   2
-rw-r--r--  mm/mlock.c           |   4
-rw-r--r--  mm/nobootmem.c       |   8
-rw-r--r--  mm/nommu.c           |   6
-rw-r--r--  mm/oom_kill.c        |  13
-rw-r--r--  mm/page-writeback.c  |  10
-rw-r--r--  mm/page_alloc.c      |  10
-rw-r--r--  mm/page_cgroup.c     | 131
-rw-r--r--  mm/page_io.c         |   2
-rw-r--r--  mm/readahead.c       |  18
-rw-r--r--  mm/rmap.c            |   5
-rw-r--r--  mm/shmem.c           |   1
-rw-r--r--  mm/slub.c            |   6
-rw-r--r--  mm/swap_state.c      |   5
-rw-r--r--  mm/swapfile.c        |  46
-rw-r--r--  mm/vmscan.c          |   2
22 files changed, 573 insertions, 517 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 027100d30227..0d9a036ada66 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
 
 static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 
-void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
-{
-}
-EXPORT_SYMBOL(default_unplug_io_fn);
-
 struct backing_dev_info default_backing_dev_info = {
 	.name		= "default",
 	.ra_pages	= VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
 	.state		= 0,
 	.capabilities	= BDI_CAP_MAP_COPY,
-	.unplug_io_fn	= default_unplug_io_fn,
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
@@ -73,14 +67,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	struct inode *inode;
 
 	nr_wb = nr_dirty = nr_io = nr_more_io = 0;
-	spin_lock(&inode_lock);
+	spin_lock(&inode_wb_list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
 	list_for_each_entry(inode, &wb->b_io, i_wb_list)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
-	spin_unlock(&inode_lock);
+	spin_unlock(&inode_wb_list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (sb->s_bdi == bdi)
-			sb->s_bdi = NULL;
+			sb->s_bdi = &default_backing_dev_info;
 	}
 	spin_unlock(&sb_lock);
 }
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	if (bdi_has_dirty_io(bdi)) {
 		struct bdi_writeback *dst = &default_backing_dev_info.wb;
 
-		spin_lock(&inode_lock);
+		spin_lock(&inode_wb_list_lock);
 		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
 		list_splice(&bdi->wb.b_io, &dst->b_io);
 		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-		spin_unlock(&inode_lock);
+		spin_unlock(&inode_wb_list_lock);
 	}
 
 	bdi_unregister(bdi);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 07aeb89e396e..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -34,14 +34,6 @@ unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-#ifdef CONFIG_CRASH_DUMP
-/*
- * If we have booted due to a crash, max_pfn will be a very low value. We need
- * to know the amount of memory that the previous kernel used.
- */
-unsigned long saved_max_pfn;
-#endif
-
 bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
 
 static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
diff --git a/mm/filemap.c b/mm/filemap.c
index f807afda86f2..c641edf553a9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -80,8 +80,8 @@
  *  ->i_mutex
  *    ->i_alloc_sem		(various)
  *
- *  ->inode_lock
- *    ->sb_lock			(fs/fs-writeback.c)
+ *  inode_wb_list_lock
+ *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
  *  ->i_mmap_lock
@@ -98,8 +98,10 @@
  *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(page_remove_rmap->set_page_dirty)
- *    ->inode_lock		(zap_pte_range->set_page_dirty)
+ *    inode_wb_list_lock	(page_remove_rmap->set_page_dirty)
+ *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
+ *    inode_wb_list_lock	(zap_pte_range->set_page_dirty)
+ *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
  * (code doesn't rely on that order, so you could switch it around)
@@ -164,45 +166,15 @@ void delete_from_page_cache(struct page *page)
 }
 EXPORT_SYMBOL(delete_from_page_cache);
 
-static int sync_page(void *word)
+static int sleep_on_page(void *word)
 {
-	struct address_space *mapping;
-	struct page *page;
-
-	page = container_of((unsigned long *)word, struct page, flags);
-
-	/*
-	 * page_mapping() is being called without PG_locked held.
-	 * Some knowledge of the state and use of the page is used to
-	 * reduce the requirements down to a memory barrier.
-	 * The danger here is of a stale page_mapping() return value
-	 * indicating a struct address_space different from the one it's
-	 * associated with when it is associated with one.
-	 * After smp_mb(), it's either the correct page_mapping() for
-	 * the page, or an old page_mapping() and the page's own
-	 * page_mapping() has gone NULL.
-	 * The ->sync_page() address_space operation must tolerate
-	 * page_mapping() going NULL. By an amazing coincidence,
-	 * this comes about because none of the users of the page
-	 * in the ->sync_page() methods make essential use of the
-	 * page_mapping(), merely passing the page down to the backing
-	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page_private(page) is
-	 * of interest. When page_mapping() does go NULL, the entire
-	 * call stack gracefully ignores the page and returns.
-	 * -- wli
-	 */
-	smp_mb();
-	mapping = page_mapping(page);
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		mapping->a_ops->sync_page(page);
 	io_schedule();
 	return 0;
 }
 
-static int sync_page_killable(void *word)
+static int sleep_on_page_killable(void *word)
 {
-	sync_page(word);
+	sleep_on_page(word);
 	return fatal_signal_pending(current) ? -EINTR : 0;
 }
 
@@ -558,12 +530,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
 
-static int __sleep_on_page_lock(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
@@ -591,7 +557,7 @@ void wait_on_page_bit(struct page *page, int bit_nr)
 	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
 
 	if (test_bit(bit_nr, &page->flags))
-		__wait_on_bit(page_waitqueue(page), &wait, sync_page,
+		__wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_on_page_bit);
@@ -655,17 +621,12 @@ EXPORT_SYMBOL(end_page_writeback);
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
- *
- * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
- * random driver's requestfn sets TASK_RUNNING, we could busywait. However
- * chances are that on the second loop, the block layer's plug list is empty,
- * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
  */
 void __lock_page(struct page *page)
 {
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
-	__wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
+	__wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
 							TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(__lock_page);
@@ -675,24 +636,10 @@ int __lock_page_killable(struct page *page)
 	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 
 	return __wait_on_bit_lock(page_waitqueue(page), &wait,
-					sync_page_killable, TASK_KILLABLE);
+					sleep_on_page_killable, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
-/**
- * __lock_page_nosync - get a lock on the page, without calling sync_page()
- * @page: the page to lock
- *
- * Variant of lock_page that does not require the caller to hold a reference
- * on the page's mapping.
- */
-void __lock_page_nosync(struct page *page)
-{
-	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
-	__wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
-							TASK_UNINTERRUPTIBLE);
-}
-
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			 unsigned int flags)
 {
@@ -1407,12 +1354,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	unsigned long seg = 0;
 	size_t count;
 	loff_t *ppos = &iocb->ki_pos;
+	struct blk_plug plug;
 
 	count = 0;
 	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
 	if (retval)
 		return retval;
 
+	blk_start_plug(&plug);
+
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (filp->f_flags & O_DIRECT) {
 		loff_t size;
@@ -1485,6 +1435,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		break;
 	}
 out:
+	blk_finish_plug(&plug);
 	return retval;
 }
 EXPORT_SYMBOL(generic_file_aio_read);
@@ -2596,11 +2547,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	mutex_lock(&inode->i_mutex);
+	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
 
@@ -2611,6 +2564,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
+	blk_finish_plug(&plug);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);
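The generic_file_aio_read() and generic_file_aio_write() hunks above replace the removed sync_page()/unplug path with explicit on-stack block plugging. A minimal sketch of that pattern, for illustration only (example_batched_io is a hypothetical caller, not part of this diff):

#include <linux/blkdev.h>

/* Hypothetical helper: batch request submission around a per-task plug. */
static void example_batched_io(void)
{
	struct blk_plug plug;

	blk_start_plug(&plug);	/* requests queue up on the task's plug list */
	/* ... issue reads/writes here, e.g. via submit_bio() ... */
	blk_finish_plug(&plug);	/* flush the batched requests to the device */
}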
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1ee6ad9c971..1f0b460fe58c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0;
 #define do_swap_account		(0)
 #endif
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. This counter
- * is used for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- *
- * These values will be used as !((event) & ((1 <<(thresh)) - 1))
- */
-#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
-#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
 
 /*
  * Statistics for memory cgroup.
@@ -93,19 +84,36 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
-	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
-	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
-	/* incremented at every pagein/pageout */
-	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
 	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
-
 	MEM_CGROUP_STAT_NSTATS,
 };
 
+enum mem_cgroup_events_index {
+	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
+	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
+	MEM_CGROUP_EVENTS_COUNT,	/* # of pages paged in/out */
+	MEM_CGROUP_EVENTS_NSTATS,
+};
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. With THP,
+ * it will be incremated by the number of pages. This counter is used for
+ * for trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg event.
+ */
+enum mem_cgroup_events_target {
+	MEM_CGROUP_TARGET_THRESH,
+	MEM_CGROUP_TARGET_SOFTLIMIT,
+	MEM_CGROUP_NTARGETS,
+};
+#define THRESHOLDS_EVENTS_TARGET (128)
+#define SOFTLIMIT_EVENTS_TARGET (1024)
+
 struct mem_cgroup_stat_cpu {
-	s64 count[MEM_CGROUP_STAT_NSTATS];
+	long count[MEM_CGROUP_STAT_NSTATS];
+	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+	unsigned long targets[MEM_CGROUP_NTARGETS];
 };
 
 /*
@@ -218,12 +226,6 @@ struct mem_cgroup {
 	 * per zone LRU lists.
 	 */
 	struct mem_cgroup_lru_info info;
-
-	/*
-	  protect against reclaim related member.
-	*/
-	spinlock_t reclaim_param_lock;
-
 	/*
 	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
@@ -327,13 +329,6 @@ enum charge_type {
 	NR_CHARGE_TYPE,
 };
 
-/* only for here (for easy reading.) */
-#define PCGF_CACHE	(1UL << PCG_CACHE)
-#define PCGF_USED	(1UL << PCG_USED)
-#define PCGF_LOCK	(1UL << PCG_LOCK)
-/* Not used, but added here for completeness */
-#define PCGF_ACCT	(1UL << PCG_ACCT)
-
 /* for encoding cft->private value on file */
 #define _MEM			(0)
 #define _MEMSWAP		(1)
@@ -371,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
 }
 
 static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
+page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
 {
-	struct mem_cgroup *mem = pc->mem_cgroup;
-	int nid = page_cgroup_nid(pc);
-	int zid = page_cgroup_zid(pc);
-
-	if (!mem)
-		return NULL;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
 
 	return mem_cgroup_zoneinfo(mem, nid, zid);
 }
@@ -504,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
 	}
 }
 
-static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
-{
-	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
-}
-
 static struct mem_cgroup_per_zone *
 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
@@ -565,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  * common workload, threashold and synchonization as vmstat[] should be
  * implemented.
  */
-static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+static long mem_cgroup_read_stat(struct mem_cgroup *mem,
 				 enum mem_cgroup_stat_index idx)
 {
+	long val = 0;
 	int cpu;
-	s64 val = 0;
 
 	get_online_cpus();
 	for_each_online_cpu(cpu)
@@ -583,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
 	return val;
 }
 
-static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+static long mem_cgroup_local_usage(struct mem_cgroup *mem)
 {
-	s64 ret;
+	long ret;
 
 	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
 	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -599,6 +585,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
 	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
 }
 
+static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
+					    enum mem_cgroup_events_index idx)
+{
+	unsigned long val = 0;
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		val += per_cpu(mem->stat->events[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	spin_lock(&mem->pcp_counter_lock);
+	val += mem->nocpu_base.events[idx];
+	spin_unlock(&mem->pcp_counter_lock);
+#endif
+	return val;
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 bool file, int nr_pages)
 {
@@ -611,13 +613,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
-		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
+		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 	else {
-		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+		__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 		nr_pages = -nr_pages; /* for event */
 	}
 
-	__this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
+	__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
 
 	preempt_enable();
 }
@@ -637,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 	return total;
 }
 
-static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
+static bool __memcg_event_check(struct mem_cgroup *mem, int target)
 {
-	s64 val;
+	unsigned long val, next;
 
-	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
+	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+	next = this_cpu_read(mem->stat->targets[target]);
+	/* from time_after() in jiffies.h */
+	return ((long)next - (long)val < 0);
+}
 
-	return !(val & ((1 << event_mask_shift) - 1));
+static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
+{
+	unsigned long val, next;
+
+	val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
+
+	switch (target) {
+	case MEM_CGROUP_TARGET_THRESH:
+		next = val + THRESHOLDS_EVENTS_TARGET;
+		break;
+	case MEM_CGROUP_TARGET_SOFTLIMIT:
+		next = val + SOFTLIMIT_EVENTS_TARGET;
+		break;
+	default:
+		return;
+	}
+
+	this_cpu_write(mem->stat->targets[target], next);
 }
 
 /*
@@ -653,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
 static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 {
 	/* threshold event is triggered in finer grain than soft limit */
-	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
+	if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
 		mem_cgroup_threshold(mem);
-		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
+		__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
+		if (unlikely(__memcg_event_check(mem,
+			MEM_CGROUP_TARGET_SOFTLIMIT))){
 			mem_cgroup_update_tree(mem, page);
+			__mem_cgroup_target_update(mem,
+				MEM_CGROUP_TARGET_SOFTLIMIT);
+		}
 	}
 }
 
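The __memcg_event_check()/__mem_cgroup_target_update() pair introduced above replaces the old power-of-two event mask with an absolute per-cpu target, compared with the same wraparound-safe signed subtraction used by time_after() in jiffies.h. A small illustrative sketch (the helper name is hypothetical, not part of this diff):

/* True once "val" has advanced past "next", even if the unsigned
 * counter wrapped around in between; same idea as time_after(). */
static inline bool counter_passed(unsigned long val, unsigned long next)
{
	return (long)next - (long)val < 0;
}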
@@ -815,7 +843,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
 	 * removed from global LRU.
 	 */
-	mz = page_cgroup_zoneinfo(pc);
+	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
 	/* huge page split is done under lru_lock. so, we have no races. */
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
 	if (mem_cgroup_is_root(pc->mem_cgroup))
@@ -851,7 +879,7 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page)
 	smp_rmb();
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
-	mz = page_cgroup_zoneinfo(pc);
+	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
 	list_move_tail(&pc->lru, &mz->lists[lru]);
 }
 
@@ -871,7 +899,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 	smp_rmb();
 	if (mem_cgroup_is_root(pc->mem_cgroup))
 		return;
-	mz = page_cgroup_zoneinfo(pc);
+	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
 	list_move(&pc->lru, &mz->lists[lru]);
 }
 
@@ -888,7 +916,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 		return;
 	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
 	smp_rmb();
-	mz = page_cgroup_zoneinfo(pc);
+	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
 	/* huge page split is done under lru_lock. so, we have no races. */
 	MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
 	SetPageCgroupAcctLRU(pc);
@@ -898,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 }
 
 /*
- * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
- * lru because the page may.be reused after it's fully uncharged (because of
- * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
- * it again. This function is only used to charge SwapCache. It's done under
- * lock_page and expected that zone->lru_lock is never held.
+ * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
+ * while it's linked to lru because the page may be reused after it's fully
+ * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
+ * It's done under lock_page and expected that zone->lru_lock isnever held.
  */
-static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
+static void mem_cgroup_lru_del_before_commit(struct page *page)
 {
 	unsigned long flags;
 	struct zone *zone = page_zone(page);
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 
+	/*
+	 * Doing this check without taking ->lru_lock seems wrong but this
+	 * is safe. Because if page_cgroup's USED bit is unset, the page
+	 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
+	 * set, the commit after this will fail, anyway.
+	 * This all charge/uncharge is done under some mutual execustion.
+	 * So, we don't need to taking care of changes in USED bit.
+	 */
+	if (likely(!PageLRU(page)))
+		return;
+
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	/*
 	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -920,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
 
-static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
+static void mem_cgroup_lru_add_after_commit(struct page *page)
 {
 	unsigned long flags;
 	struct zone *zone = page_zone(page);
 	struct page_cgroup *pc = lookup_page_cgroup(page);
 
+	/* taking care of that the page is added to LRU while we commit it */
+	if (likely(!PageLRU(page)))
+		return;
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	/* link when the page is linked to LRU but page_cgroup isn't */
 	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -1058,10 +1099,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 		return NULL;
 	/* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
 	smp_rmb();
-	mz = page_cgroup_zoneinfo(pc);
-	if (!mz)
-		return NULL;
-
+	mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
 	return &mz->reclaim_stat;
 }
 
@@ -1093,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 		if (scan >= nr_to_scan)
 			break;
 
-		page = pc->page;
 		if (unlikely(!PageCgroupUsed(pc)))
 			continue;
+
+		page = lookup_cgroup_page(pc);
+
 		if (unlikely(!PageLRU(page)))
 			continue;
 
@@ -1127,49 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 #define mem_cgroup_from_res_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
-static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
-{
-	if (do_swap_account) {
-		if (res_counter_check_under_limit(&mem->res) &&
-			res_counter_check_under_limit(&mem->memsw))
-			return true;
-	} else
-		if (res_counter_check_under_limit(&mem->res))
-			return true;
-	return false;
-}
-
 /**
- * mem_cgroup_check_margin - check if the memory cgroup allows charging
- * @mem: memory cgroup to check
- * @bytes: the number of bytes the caller intends to charge
+ * mem_cgroup_margin - calculate chargeable space of a memory cgroup
+ * @mem: the memory cgroup
  *
- * Returns a boolean value on whether @mem can be charged @bytes or
- * whether this would exceed the limit.
+ * Returns the maximum amount of memory @mem can be charged with, in
+ * pages.
  */
-static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
+static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
 {
-	if (!res_counter_check_margin(&mem->res, bytes))
-		return false;
-	if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
-		return false;
-	return true;
+	unsigned long long margin;
+
+	margin = res_counter_margin(&mem->res);
+	if (do_swap_account)
+		margin = min(margin, res_counter_margin(&mem->memsw));
+	return margin >> PAGE_SHIFT;
 }
 
 static unsigned int get_swappiness(struct mem_cgroup *memcg)
 {
 	struct cgroup *cgrp = memcg->css.cgroup;
-	unsigned int swappiness;
 
 	/* root ? */
 	if (cgrp->parent == NULL)
 		return vm_swappiness;
 
-	spin_lock(&memcg->reclaim_param_lock);
-	swappiness = memcg->swappiness;
-	spin_unlock(&memcg->reclaim_param_lock);
-
-	return swappiness;
+	return memcg->swappiness;
 }
 
 static void mem_cgroup_start_move(struct mem_cgroup *mem)
@@ -1385,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 
 		rcu_read_unlock();
 		/* Updates scanning parameter */
-		spin_lock(&root_mem->reclaim_param_lock);
 		if (!css) {
 			/* this means start scan from ID:1 */
 			root_mem->last_scanned_child = 0;
 		} else
 			root_mem->last_scanned_child = found;
-		spin_unlock(&root_mem->reclaim_param_lock);
 	}
 
 	return ret;
@@ -1420,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
 	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
 	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
-	unsigned long excess = mem_cgroup_get_excess(root_mem);
+	unsigned long excess;
+
+	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
 	if (root_mem->memsw_is_minimum)
@@ -1477,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			return ret;
 		total += ret;
 		if (check_soft) {
-			if (res_counter_check_under_soft_limit(&root_mem->res))
+			if (!res_counter_soft_limit_excess(&root_mem->res))
 				return total;
-		} else if (mem_cgroup_check_under_limit(root_mem))
+		} else if (mem_cgroup_margin(root_mem))
 			return 1 + total;
 	}
 	return total;
@@ -1687,17 +1710,17 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat);
 * size of first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: maybe necessary to use big numbers in big irons.
 */
-#define CHARGE_SIZE	(32 * PAGE_SIZE)
+#define CHARGE_BATCH	32U
 struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
-	int charge;
+	unsigned int nr_pages;
 	struct work_struct work;
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 static atomic_t memcg_drain_count;
 
 /*
- * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
+ * Try to consume stocked charge on this cpu. If success, one page is consumed
 * from local stock and true is returned. If the stock is 0 or charges from a
 * cgroup which is not current target, returns false. This stock will be
 * refilled.
@@ -1708,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem)
 	bool ret = true;
 
 	stock = &get_cpu_var(memcg_stock);
-	if (mem == stock->cached && stock->charge)
-		stock->charge -= PAGE_SIZE;
+	if (mem == stock->cached && stock->nr_pages)
+		stock->nr_pages--;
 	else /* need to call res_counter_charge */
 		ret = false;
 	put_cpu_var(memcg_stock);
@@ -1723,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 {
 	struct mem_cgroup *old = stock->cached;
 
-	if (stock->charge) {
-		res_counter_uncharge(&old->res, stock->charge);
+	if (stock->nr_pages) {
+		unsigned long bytes = stock->nr_pages * PAGE_SIZE;
+
+		res_counter_uncharge(&old->res, bytes);
 		if (do_swap_account)
-			res_counter_uncharge(&old->memsw, stock->charge);
+			res_counter_uncharge(&old->memsw, bytes);
+		stock->nr_pages = 0;
 	}
 	stock->cached = NULL;
-	stock->charge = 0;
 }
 
 /*
@@ -1746,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy)
 * Cache charges(val) which is from res_counter, to local per_cpu area.
 * This will be consumed by consume_stock() function, later.
 */
-static void refill_stock(struct mem_cgroup *mem, int val)
+static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
 {
 	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
 
@@ -1754,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val)
 		drain_stock(stock);
 		stock->cached = mem;
 	}
-	stock->charge += val;
+	stock->nr_pages += nr_pages;
 	put_cpu_var(memcg_stock);
 }
 
@@ -1806,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
 
 	spin_lock(&mem->pcp_counter_lock);
 	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
-		s64 x = per_cpu(mem->stat->count[i], cpu);
+		long x = per_cpu(mem->stat->count[i], cpu);
 
 		per_cpu(mem->stat->count[i], cpu) = 0;
 		mem->nocpu_base.count[i] += x;
 	}
+	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+		unsigned long x = per_cpu(mem->stat->events[i], cpu);
+
+		per_cpu(mem->stat->events[i], cpu) = 0;
+		mem->nocpu_base.events[i] += x;
+	}
 	/* need to clear ON_MOVE value, works as a kind of lock. */
 	per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
 	spin_unlock(&mem->pcp_counter_lock);
@@ -1860,9 +1891,10 @@ enum {
 	CHARGE_OOM_DIE,	/* the current is killed because of OOM */
 };
 
-static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
-				int csize, bool oom_check)
+static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+				unsigned int nr_pages, bool oom_check)
 {
+	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
 	struct res_counter *fail_res;
 	unsigned long flags = 0;
@@ -1883,14 +1915,13 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 	} else
 		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
 	/*
-	 * csize can be either a huge page (HPAGE_SIZE), a batch of
-	 * regular pages (CHARGE_SIZE), or a single regular page
-	 * (PAGE_SIZE).
+	 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
+	 * of regular pages (CHARGE_BATCH), or a single regular page (1).
 	 *
 	 * Never reclaim on behalf of optional batching, retry with a
 	 * single page instead.
 	 */
-	if (csize == CHARGE_SIZE)
+	if (nr_pages == CHARGE_BATCH)
 		return CHARGE_RETRY;
 
 	if (!(gfp_mask & __GFP_WAIT))
@@ -1898,7 +1929,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 
 	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
 					      gfp_mask, flags);
-	if (mem_cgroup_check_margin(mem_over_limit, csize))
+	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
 		return CHARGE_RETRY;
 	/*
 	 * Even though the limit is exceeded at this point, reclaim
@@ -1909,7 +1940,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 	 * unlikely to succeed so close to the limit, and we fall back
 	 * to regular pages anyway in case of failure.
 	 */
-	if (csize == PAGE_SIZE && ret)
+	if (nr_pages == 1 && ret)
 		return CHARGE_RETRY;
 
 	/*
@@ -1935,13 +1966,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
 */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 				   gfp_t gfp_mask,
-				   struct mem_cgroup **memcg, bool oom,
-				   int page_size)
+				   unsigned int nr_pages,
+				   struct mem_cgroup **memcg,
+				   bool oom)
 {
+	unsigned int batch = max(CHARGE_BATCH, nr_pages);
 	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem = NULL;
 	int ret;
-	int csize = max(CHARGE_SIZE, (unsigned long) page_size);
 
 	/*
 	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1966,7 +1998,7 @@ again:
 		VM_BUG_ON(css_is_removed(&mem->css));
 		if (mem_cgroup_is_root(mem))
 			goto done;
-		if (page_size == PAGE_SIZE && consume_stock(mem))
+		if (nr_pages == 1 && consume_stock(mem))
 			goto done;
 		css_get(&mem->css);
 	} else {
@@ -1989,7 +2021,7 @@ again:
 			rcu_read_unlock();
 			goto done;
 		}
-		if (page_size == PAGE_SIZE && consume_stock(mem)) {
+		if (nr_pages == 1 && consume_stock(mem)) {
 			/*
 			 * It seems dagerous to access memcg without css_get().
 			 * But considering how consume_stok works, it's not
@@ -2024,13 +2056,12 @@ again:
 			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 		}
 
-		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
-
+		ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
 		case CHARGE_RETRY: /* not in OOM situation but retry */
-			csize = page_size;
+			batch = nr_pages;
 			css_put(&mem->css);
 			mem = NULL;
 			goto again;
@@ -2051,8 +2082,8 @@ again:
 		}
 	} while (ret != CHARGE_OK);
 
-	if (csize > page_size)
-		refill_stock(mem, csize - page_size);
+	if (batch > nr_pages)
+		refill_stock(mem, batch - nr_pages);
 	css_put(&mem->css);
 done:
 	*memcg = mem;
@@ -2071,21 +2102,17 @@ bypass:
 * gotten by try_charge().
 */
 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
-							unsigned long count)
+				       unsigned int nr_pages)
 {
 	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
+		unsigned long bytes = nr_pages * PAGE_SIZE;
+
+		res_counter_uncharge(&mem->res, bytes);
 		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
+			res_counter_uncharge(&mem->memsw, bytes);
 	}
 }
 
-static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
-				     int page_size)
-{
-	__mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
-}
-
 /*
 * A helper function to get mem_cgroup from ID. must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -2134,20 +2161,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 }
 
 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
+				       struct page *page,
+				       unsigned int nr_pages,
 				       struct page_cgroup *pc,
-				       enum charge_type ctype,
-				       int page_size)
+				       enum charge_type ctype)
 {
-	int nr_pages = page_size >> PAGE_SHIFT;
-
-	/* try_charge() can return NULL to *memcg, taking care of it. */
-	if (!mem)
-		return;
-
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		mem_cgroup_cancel_charge(mem, page_size);
+		__mem_cgroup_cancel_charge(mem, nr_pages);
 		return;
 	}
 	/*
@@ -2184,7 +2206,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
 	 */
-	memcg_check_events(mem, pc->page);
+	memcg_check_events(mem, page);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2221,7 +2243,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
 		 * We hold lru_lock, then, reduce counter directly.
 		 */
 		lru = page_lru(head);
-		mz = page_cgroup_zoneinfo(head_pc);
+		mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
 		MEM_CGROUP_ZSTAT(mz, lru) -= 1;
 	}
 	tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
@@ -2230,7 +2252,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
 #endif
 
 /**
- * __mem_cgroup_move_account - move account of the page
+ * mem_cgroup_move_account - move account of the page
+ * @page: the page
+ * @nr_pages: number of regular pages (>1 for huge pages)
 * @pc:	page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to:	mem_cgroup which the page is moved to. @from != @to.
@@ -2238,25 +2262,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | |||
2238 | * | 2262 | * |
2239 | * The caller must confirm following. | 2263 | * The caller must confirm following. |
2240 | * - page is not on LRU (isolate_page() is useful.) | 2264 | * - page is not on LRU (isolate_page() is useful.) |
2241 | * - the pc is locked, used, and ->mem_cgroup points to @from. | 2265 | * - compound_lock is held when nr_pages > 1 |
2242 | * | 2266 | * |
2243 | * This function doesn't do "charge" nor css_get to new cgroup. It should be | 2267 | * This function doesn't do "charge" nor css_get to new cgroup. It should be |
2244 | * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is | 2268 | * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is
2245 | * true, this function does "uncharge" from old cgroup, but it doesn't if | 2269 | * true, this function does "uncharge" from old cgroup, but it doesn't if |
2246 | * @uncharge is false, so a caller should do "uncharge". | 2270 | * @uncharge is false, so a caller should do "uncharge". |
2247 | */ | 2271 | */ |
2248 | 2272 | static int mem_cgroup_move_account(struct page *page, | |
2249 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 2273 | unsigned int nr_pages, |
2250 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, | 2274 | struct page_cgroup *pc, |
2251 | int charge_size) | 2275 | struct mem_cgroup *from, |
2276 | struct mem_cgroup *to, | ||
2277 | bool uncharge) | ||
2252 | { | 2278 | { |
2253 | int nr_pages = charge_size >> PAGE_SHIFT; | 2279 | unsigned long flags; |
2280 | int ret; | ||
2254 | 2281 | ||
2255 | VM_BUG_ON(from == to); | 2282 | VM_BUG_ON(from == to); |
2256 | VM_BUG_ON(PageLRU(pc->page)); | 2283 | VM_BUG_ON(PageLRU(page)); |
2257 | VM_BUG_ON(!page_is_cgroup_locked(pc)); | 2284 | /* |
2258 | VM_BUG_ON(!PageCgroupUsed(pc)); | 2285 | * The page is isolated from LRU. So, collapse function |
2259 | VM_BUG_ON(pc->mem_cgroup != from); | 2286 | * will not handle this page. But page splitting can happen. |
2287 | * Do this check under compound_page_lock(). The caller should | ||
2288 | * hold it. | ||
2289 | */ | ||
2290 | ret = -EBUSY; | ||
2291 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
2292 | goto out; | ||
2293 | |||
2294 | lock_page_cgroup(pc); | ||
2295 | |||
2296 | ret = -EINVAL; | ||
2297 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | ||
2298 | goto unlock; | ||
2299 | |||
2300 | move_lock_page_cgroup(pc, &flags); | ||
2260 | 2301 | ||
2261 | if (PageCgroupFileMapped(pc)) { | 2302 | if (PageCgroupFileMapped(pc)) { |
2262 | /* Update mapped_file data for mem_cgroup */ | 2303 | /* Update mapped_file data for mem_cgroup */ |
@@ -2268,7 +2309,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2268 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); | 2309 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); |
2269 | if (uncharge) | 2310 | if (uncharge) |
2270 | /* This is not "cancel", but cancel_charge does all we need. */ | 2311 | /* This is not "cancel", but cancel_charge does all we need. */ |
2271 | mem_cgroup_cancel_charge(from, charge_size); | 2312 | __mem_cgroup_cancel_charge(from, nr_pages); |
2272 | 2313 | ||
2273 | /* caller should have done css_get */ | 2314 | /* caller should have done css_get */ |
2274 | pc->mem_cgroup = to; | 2315 | pc->mem_cgroup = to; |
@@ -2280,40 +2321,16 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2280 | * guaranteed that "to" is never removed. So, we don't check rmdir | 2321 | * guaranteed that "to" is never removed. So, we don't check rmdir
2281 | * status here. | 2322 | * status here. |
2282 | */ | 2323 | */ |
2283 | } | 2324 | move_unlock_page_cgroup(pc, &flags); |
2284 | 2325 | ret = 0; | |
2285 | /* | 2326 | unlock: |
2286 | * check whether the @pc is valid for moving account and call | ||
2287 | * __mem_cgroup_move_account() | ||
2288 | */ | ||
2289 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
2290 | struct mem_cgroup *from, struct mem_cgroup *to, | ||
2291 | bool uncharge, int charge_size) | ||
2292 | { | ||
2293 | int ret = -EINVAL; | ||
2294 | unsigned long flags; | ||
2295 | /* | ||
2296 | * The page is isolated from LRU. So, collapse function | ||
2297 | * will not handle this page. But page splitting can happen. | ||
2298 | * Do this check under compound_page_lock(). The caller should | ||
2299 | * hold it. | ||
2300 | */ | ||
2301 | if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) | ||
2302 | return -EBUSY; | ||
2303 | |||
2304 | lock_page_cgroup(pc); | ||
2305 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
2306 | move_lock_page_cgroup(pc, &flags); | ||
2307 | __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); | ||
2308 | move_unlock_page_cgroup(pc, &flags); | ||
2309 | ret = 0; | ||
2310 | } | ||
2311 | unlock_page_cgroup(pc); | 2327 | unlock_page_cgroup(pc); |
2312 | /* | 2328 | /* |
2313 | * check events | 2329 | * check events |
2314 | */ | 2330 | */ |
2315 | memcg_check_events(to, pc->page); | 2331 | memcg_check_events(to, page); |
2316 | memcg_check_events(from, pc->page); | 2332 | memcg_check_events(from, page); |
2333 | out: | ||
2317 | return ret; | 2334 | return ret; |
2318 | } | 2335 | } |
2319 | 2336 | ||
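
For orientation, the contract the reworked mem_cgroup_move_account() places on its callers can be condensed into one sketch. The helper below is hypothetical (it essentially restates what mem_cgroup_move_parent() in the next hunk does), and the return-code comments come straight from the hunk above; it is not part of the patch.

static int try_move(struct page *page, struct page_cgroup *pc,
                    struct mem_cgroup *from, struct mem_cgroup *to)
{
        unsigned int nr_pages = hpage_nr_pages(page);   /* 1, or HPAGE_PMD_NR for a THP */
        unsigned long flags = 0;
        int ret;

        /* The page must already be isolated from the LRU by the caller. */
        if (nr_pages > 1)                               /* keep the THP from being split */
                flags = compound_lock_irqsave(page);

        ret = mem_cgroup_move_account(page, nr_pages, pc, from, to, false);
        /*
         * 0       - moved; statistics and pc->mem_cgroup were updated
         * -EBUSY  - the huge page was split before the compound lock was taken
         * -EINVAL - pc is no longer used, or no longer owned by @from
         */

        if (nr_pages > 1)
                compound_unlock_irqrestore(page, flags);
        return ret;
}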
@@ -2321,16 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
2321 | * move charges to its parent. | 2338 | * move charges to its parent. |
2322 | */ | 2339 | */ |
2323 | 2340 | ||
2324 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | 2341 | static int mem_cgroup_move_parent(struct page *page, |
2342 | struct page_cgroup *pc, | ||
2325 | struct mem_cgroup *child, | 2343 | struct mem_cgroup *child, |
2326 | gfp_t gfp_mask) | 2344 | gfp_t gfp_mask) |
2327 | { | 2345 | { |
2328 | struct page *page = pc->page; | ||
2329 | struct cgroup *cg = child->css.cgroup; | 2346 | struct cgroup *cg = child->css.cgroup; |
2330 | struct cgroup *pcg = cg->parent; | 2347 | struct cgroup *pcg = cg->parent; |
2331 | struct mem_cgroup *parent; | 2348 | struct mem_cgroup *parent; |
2332 | int page_size = PAGE_SIZE; | 2349 | unsigned int nr_pages; |
2333 | unsigned long flags; | 2350 | unsigned long uninitialized_var(flags); |
2334 | int ret; | 2351 | int ret; |
2335 | 2352 | ||
2336 | /* Is ROOT ? */ | 2353 | /* Is ROOT ? */ |
@@ -2343,23 +2360,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
2343 | if (isolate_lru_page(page)) | 2360 | if (isolate_lru_page(page)) |
2344 | goto put; | 2361 | goto put; |
2345 | 2362 | ||
2346 | if (PageTransHuge(page)) | 2363 | nr_pages = hpage_nr_pages(page); |
2347 | page_size = HPAGE_SIZE; | ||
2348 | 2364 | ||
2349 | parent = mem_cgroup_from_cont(pcg); | 2365 | parent = mem_cgroup_from_cont(pcg); |
2350 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, | 2366 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); |
2351 | &parent, false, page_size); | ||
2352 | if (ret || !parent) | 2367 | if (ret || !parent) |
2353 | goto put_back; | 2368 | goto put_back; |
2354 | 2369 | ||
2355 | if (page_size > PAGE_SIZE) | 2370 | if (nr_pages > 1) |
2356 | flags = compound_lock_irqsave(page); | 2371 | flags = compound_lock_irqsave(page); |
2357 | 2372 | ||
2358 | ret = mem_cgroup_move_account(pc, child, parent, true, page_size); | 2373 | ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); |
2359 | if (ret) | 2374 | if (ret) |
2360 | mem_cgroup_cancel_charge(parent, page_size); | 2375 | __mem_cgroup_cancel_charge(parent, nr_pages); |
2361 | 2376 | ||
2362 | if (page_size > PAGE_SIZE) | 2377 | if (nr_pages > 1) |
2363 | compound_unlock_irqrestore(page, flags); | 2378 | compound_unlock_irqrestore(page, flags); |
2364 | put_back: | 2379 | put_back: |
2365 | putback_lru_page(page); | 2380 | putback_lru_page(page); |
@@ -2379,13 +2394,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2379 | gfp_t gfp_mask, enum charge_type ctype) | 2394 | gfp_t gfp_mask, enum charge_type ctype) |
2380 | { | 2395 | { |
2381 | struct mem_cgroup *mem = NULL; | 2396 | struct mem_cgroup *mem = NULL; |
2382 | int page_size = PAGE_SIZE; | 2397 | unsigned int nr_pages = 1; |
2383 | struct page_cgroup *pc; | 2398 | struct page_cgroup *pc; |
2384 | bool oom = true; | 2399 | bool oom = true; |
2385 | int ret; | 2400 | int ret; |
2386 | 2401 | ||
2387 | if (PageTransHuge(page)) { | 2402 | if (PageTransHuge(page)) { |
2388 | page_size <<= compound_order(page); | 2403 | nr_pages <<= compound_order(page); |
2389 | VM_BUG_ON(!PageTransHuge(page)); | 2404 | VM_BUG_ON(!PageTransHuge(page)); |
2390 | /* | 2405 | /* |
2391 | * Never OOM-kill a process for a huge page. The | 2406 | * Never OOM-kill a process for a huge page. The |
@@ -2395,16 +2410,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2395 | } | 2410 | } |
2396 | 2411 | ||
2397 | pc = lookup_page_cgroup(page); | 2412 | pc = lookup_page_cgroup(page); |
2398 | /* can happen at boot */ | 2413 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ |
2399 | if (unlikely(!pc)) | ||
2400 | return 0; | ||
2401 | prefetchw(pc); | ||
2402 | 2414 | ||
2403 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); | 2415 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); |
2404 | if (ret || !mem) | 2416 | if (ret || !mem) |
2405 | return ret; | 2417 | return ret; |
2406 | 2418 | ||
2407 | __mem_cgroup_commit_charge(mem, pc, ctype, page_size); | 2419 | __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); |
2408 | return 0; | 2420 | return 0; |
2409 | } | 2421 | } |
2410 | 2422 | ||
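
For context, the exported charge entry points remain thin wrappers around mem_cgroup_charge_common(); below is a rough sketch of the anonymous-page one, reconstructed from memory of the surrounding file rather than taken from this patch (the real function also carries VM_BUG_ON sanity checks that are omitted here).

int mem_cgroup_newpage_charge(struct page *page,
                              struct mm_struct *mm, gfp_t gfp_mask)
{
        if (mem_cgroup_disabled())
                return 0;
        /* page_mapped()/PageAnon() sanity checks of the real code omitted */
        return mem_cgroup_charge_common(page, mm, gfp_mask,
                                        MEM_CGROUP_CHARGE_TYPE_MAPPED);
}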
@@ -2432,9 +2444,26 @@ static void | |||
2432 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2444 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2433 | enum charge_type ctype); | 2445 | enum charge_type ctype); |
2434 | 2446 | ||
2447 | static void | ||
2448 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | ||
2449 | enum charge_type ctype) | ||
2450 | { | ||
2451 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
2452 | /* | ||
2453 | * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page | ||
2454 | * is already on the LRU. It means the page may be on some other page_cgroup's | ||
2455 | * LRU. Take care of it. | ||
2456 | */ | ||
2457 | mem_cgroup_lru_del_before_commit(page); | ||
2458 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | ||
2459 | mem_cgroup_lru_add_after_commit(page); | ||
2460 | return; | ||
2461 | } | ||
2462 | |||
2435 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2463 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2436 | gfp_t gfp_mask) | 2464 | gfp_t gfp_mask) |
2437 | { | 2465 | { |
2466 | struct mem_cgroup *mem = NULL; | ||
2438 | int ret; | 2467 | int ret; |
2439 | 2468 | ||
2440 | if (mem_cgroup_disabled()) | 2469 | if (mem_cgroup_disabled()) |
@@ -2469,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2469 | if (unlikely(!mm)) | 2498 | if (unlikely(!mm)) |
2470 | mm = &init_mm; | 2499 | mm = &init_mm; |
2471 | 2500 | ||
2472 | if (page_is_file_cache(page)) | 2501 | if (page_is_file_cache(page)) { |
2473 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2502 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); |
2474 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 2503 | if (ret || !mem) |
2504 | return ret; | ||
2475 | 2505 | ||
2506 | /* | ||
2507 | * FUSE reuses pages without going through the final | ||
2508 | * put that would remove them from the LRU list, make | ||
2509 | * sure that they get relinked properly. | ||
2510 | */ | ||
2511 | __mem_cgroup_commit_charge_lrucare(page, mem, | ||
2512 | MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
2513 | return ret; | ||
2514 | } | ||
2476 | /* shmem */ | 2515 | /* shmem */ |
2477 | if (PageSwapCache(page)) { | 2516 | if (PageSwapCache(page)) { |
2478 | struct mem_cgroup *mem = NULL; | ||
2479 | |||
2480 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 2517 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
2481 | if (!ret) | 2518 | if (!ret) |
2482 | __mem_cgroup_commit_charge_swapin(page, mem, | 2519 | __mem_cgroup_commit_charge_swapin(page, mem, |
@@ -2501,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2501 | struct mem_cgroup *mem; | 2538 | struct mem_cgroup *mem; |
2502 | int ret; | 2539 | int ret; |
2503 | 2540 | ||
2541 | *ptr = NULL; | ||
2542 | |||
2504 | if (mem_cgroup_disabled()) | 2543 | if (mem_cgroup_disabled()) |
2505 | return 0; | 2544 | return 0; |
2506 | 2545 | ||
@@ -2518,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2518 | if (!mem) | 2557 | if (!mem) |
2519 | goto charge_cur_mm; | 2558 | goto charge_cur_mm; |
2520 | *ptr = mem; | 2559 | *ptr = mem; |
2521 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); | 2560 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); |
2522 | css_put(&mem->css); | 2561 | css_put(&mem->css); |
2523 | return ret; | 2562 | return ret; |
2524 | charge_cur_mm: | 2563 | charge_cur_mm: |
2525 | if (unlikely(!mm)) | 2564 | if (unlikely(!mm)) |
2526 | mm = &init_mm; | 2565 | mm = &init_mm; |
2527 | return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); | 2566 | return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); |
2528 | } | 2567 | } |
2529 | 2568 | ||
2530 | static void | 2569 | static void |
2531 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2570 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2532 | enum charge_type ctype) | 2571 | enum charge_type ctype) |
2533 | { | 2572 | { |
2534 | struct page_cgroup *pc; | ||
2535 | |||
2536 | if (mem_cgroup_disabled()) | 2573 | if (mem_cgroup_disabled()) |
2537 | return; | 2574 | return; |
2538 | if (!ptr) | 2575 | if (!ptr) |
2539 | return; | 2576 | return; |
2540 | cgroup_exclude_rmdir(&ptr->css); | 2577 | cgroup_exclude_rmdir(&ptr->css); |
2541 | pc = lookup_page_cgroup(page); | 2578 | |
2542 | mem_cgroup_lru_del_before_commit_swapcache(page); | 2579 | __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); |
2543 | __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); | ||
2544 | mem_cgroup_lru_add_after_commit_swapcache(page); | ||
2545 | /* | 2580 | /* |
2546 | * Now swap is on-memory. This means this page may be | 2581 | * Now swap is on-memory. This means this page may be |
2547 | * counted both as mem and swap....double count. | 2582 | * counted both as mem and swap....double count. |
@@ -2589,15 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
2589 | return; | 2624 | return; |
2590 | if (!mem) | 2625 | if (!mem) |
2591 | return; | 2626 | return; |
2592 | mem_cgroup_cancel_charge(mem, PAGE_SIZE); | 2627 | __mem_cgroup_cancel_charge(mem, 1); |
2593 | } | 2628 | } |
2594 | 2629 | ||
2595 | static void | 2630 | static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, |
2596 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | 2631 | unsigned int nr_pages, |
2597 | int page_size) | 2632 | const enum charge_type ctype) |
2598 | { | 2633 | { |
2599 | struct memcg_batch_info *batch = NULL; | 2634 | struct memcg_batch_info *batch = NULL; |
2600 | bool uncharge_memsw = true; | 2635 | bool uncharge_memsw = true; |
2636 | |||
2601 | /* If swapout, usage of swap doesn't decrease */ | 2637 | /* If swapout, usage of swap doesn't decrease */ |
2602 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2638 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2603 | uncharge_memsw = false; | 2639 | uncharge_memsw = false; |
@@ -2621,7 +2657,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | |||
2621 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | 2657 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) |
2622 | goto direct_uncharge; | 2658 | goto direct_uncharge; |
2623 | 2659 | ||
2624 | if (page_size != PAGE_SIZE) | 2660 | if (nr_pages > 1) |
2625 | goto direct_uncharge; | 2661 | goto direct_uncharge; |
2626 | 2662 | ||
2627 | /* | 2663 | /* |
@@ -2632,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | |||
2632 | if (batch->memcg != mem) | 2668 | if (batch->memcg != mem) |
2633 | goto direct_uncharge; | 2669 | goto direct_uncharge; |
2634 | /* remember freed charge and uncharge it later */ | 2670 | /* remember freed charge and uncharge it later */ |
2635 | batch->bytes += PAGE_SIZE; | 2671 | batch->nr_pages++; |
2636 | if (uncharge_memsw) | 2672 | if (uncharge_memsw) |
2637 | batch->memsw_bytes += PAGE_SIZE; | 2673 | batch->memsw_nr_pages++; |
2638 | return; | 2674 | return; |
2639 | direct_uncharge: | 2675 | direct_uncharge: |
2640 | res_counter_uncharge(&mem->res, page_size); | 2676 | res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); |
2641 | if (uncharge_memsw) | 2677 | if (uncharge_memsw) |
2642 | res_counter_uncharge(&mem->memsw, page_size); | 2678 | res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); |
2643 | if (unlikely(batch->memcg != mem)) | 2679 | if (unlikely(batch->memcg != mem)) |
2644 | memcg_oom_recover(mem); | 2680 | memcg_oom_recover(mem); |
2645 | return; | 2681 | return; |
@@ -2651,10 +2687,9 @@ direct_uncharge: | |||
2651 | static struct mem_cgroup * | 2687 | static struct mem_cgroup * |
2652 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2688 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
2653 | { | 2689 | { |
2654 | int count; | ||
2655 | struct page_cgroup *pc; | ||
2656 | struct mem_cgroup *mem = NULL; | 2690 | struct mem_cgroup *mem = NULL; |
2657 | int page_size = PAGE_SIZE; | 2691 | unsigned int nr_pages = 1; |
2692 | struct page_cgroup *pc; | ||
2658 | 2693 | ||
2659 | if (mem_cgroup_disabled()) | 2694 | if (mem_cgroup_disabled()) |
2660 | return NULL; | 2695 | return NULL; |
@@ -2663,11 +2698,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2663 | return NULL; | 2698 | return NULL; |
2664 | 2699 | ||
2665 | if (PageTransHuge(page)) { | 2700 | if (PageTransHuge(page)) { |
2666 | page_size <<= compound_order(page); | 2701 | nr_pages <<= compound_order(page); |
2667 | VM_BUG_ON(!PageTransHuge(page)); | 2702 | VM_BUG_ON(!PageTransHuge(page)); |
2668 | } | 2703 | } |
2669 | |||
2670 | count = page_size >> PAGE_SHIFT; | ||
2671 | /* | 2704 | /* |
2672 | * Check if our page_cgroup is valid | 2705 | * Check if our page_cgroup is valid |
2673 | */ | 2706 | */ |
@@ -2700,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2700 | break; | 2733 | break; |
2701 | } | 2734 | } |
2702 | 2735 | ||
2703 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); | 2736 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); |
2704 | 2737 | ||
2705 | ClearPageCgroupUsed(pc); | 2738 | ClearPageCgroupUsed(pc); |
2706 | /* | 2739 | /* |
@@ -2721,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2721 | mem_cgroup_get(mem); | 2754 | mem_cgroup_get(mem); |
2722 | } | 2755 | } |
2723 | if (!mem_cgroup_is_root(mem)) | 2756 | if (!mem_cgroup_is_root(mem)) |
2724 | __do_uncharge(mem, ctype, page_size); | 2757 | mem_cgroup_do_uncharge(mem, nr_pages, ctype); |
2725 | 2758 | ||
2726 | return mem; | 2759 | return mem; |
2727 | 2760 | ||
@@ -2761,8 +2794,8 @@ void mem_cgroup_uncharge_start(void) | |||
2761 | /* We can do nest. */ | 2794 | /* We can do nest. */ |
2762 | if (current->memcg_batch.do_batch == 1) { | 2795 | if (current->memcg_batch.do_batch == 1) { |
2763 | current->memcg_batch.memcg = NULL; | 2796 | current->memcg_batch.memcg = NULL; |
2764 | current->memcg_batch.bytes = 0; | 2797 | current->memcg_batch.nr_pages = 0; |
2765 | current->memcg_batch.memsw_bytes = 0; | 2798 | current->memcg_batch.memsw_nr_pages = 0; |
2766 | } | 2799 | } |
2767 | } | 2800 | } |
2768 | 2801 | ||
@@ -2783,10 +2816,12 @@ void mem_cgroup_uncharge_end(void) | |||
2783 | * This "batch->memcg" is valid without any css_get/put etc... | 2816 | * This "batch->memcg" is valid without any css_get/put etc... |
2784 | * because we hide charges behind us. | 2817 | * because we hide charges behind us.
2785 | */ | 2818 | */ |
2786 | if (batch->bytes) | 2819 | if (batch->nr_pages) |
2787 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2820 | res_counter_uncharge(&batch->memcg->res, |
2788 | if (batch->memsw_bytes) | 2821 | batch->nr_pages * PAGE_SIZE); |
2789 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2822 | if (batch->memsw_nr_pages) |
2823 | res_counter_uncharge(&batch->memcg->memsw, | ||
2824 | batch->memsw_nr_pages * PAGE_SIZE); | ||
2790 | memcg_oom_recover(batch->memcg); | 2825 | memcg_oom_recover(batch->memcg); |
2791 | /* forget this pointer (for sanity check) */ | 2826 | /* forget this pointer (for sanity check) */ |
2792 | batch->memcg = NULL; | 2827 | batch->memcg = NULL; |
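
The batch that now counts pages instead of bytes is driven from page-freeing loops such as truncation; the shape of a user, with a made-up helper name, is roughly:

static void uncharge_page_batch(struct page **pages, int nr)
{
        int i;

        mem_cgroup_uncharge_start();            /* open the per-task batch */
        for (i = 0; i < nr; i++)
                mem_cgroup_uncharge_cache_page(pages[i]);
        mem_cgroup_uncharge_end();              /* one res_counter update per batch */
}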
@@ -2911,11 +2946,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2911 | int mem_cgroup_prepare_migration(struct page *page, | 2946 | int mem_cgroup_prepare_migration(struct page *page, |
2912 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) | 2947 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) |
2913 | { | 2948 | { |
2914 | struct page_cgroup *pc; | ||
2915 | struct mem_cgroup *mem = NULL; | 2949 | struct mem_cgroup *mem = NULL; |
2950 | struct page_cgroup *pc; | ||
2916 | enum charge_type ctype; | 2951 | enum charge_type ctype; |
2917 | int ret = 0; | 2952 | int ret = 0; |
2918 | 2953 | ||
2954 | *ptr = NULL; | ||
2955 | |||
2919 | VM_BUG_ON(PageTransHuge(page)); | 2956 | VM_BUG_ON(PageTransHuge(page)); |
2920 | if (mem_cgroup_disabled()) | 2957 | if (mem_cgroup_disabled()) |
2921 | return 0; | 2958 | return 0; |
@@ -2966,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2966 | return 0; | 3003 | return 0; |
2967 | 3004 | ||
2968 | *ptr = mem; | 3005 | *ptr = mem; |
2969 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, ptr, false, PAGE_SIZE); | 3006 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); |
2970 | css_put(&mem->css);/* drop extra refcnt */ | 3007 | css_put(&mem->css);/* drop extra refcnt */ |
2971 | if (ret || *ptr == NULL) { | 3008 | if (ret || *ptr == NULL) { |
2972 | if (PageAnon(page)) { | 3009 | if (PageAnon(page)) { |
@@ -2993,7 +3030,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2993 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3030 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
2994 | else | 3031 | else |
2995 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3032 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
2996 | __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); | 3033 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); |
2997 | return ret; | 3034 | return ret; |
2998 | } | 3035 | } |
2999 | 3036 | ||
@@ -3058,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, | |||
3058 | struct mm_struct *mm, | 3095 | struct mm_struct *mm, |
3059 | gfp_t gfp_mask) | 3096 | gfp_t gfp_mask) |
3060 | { | 3097 | { |
3061 | struct mem_cgroup *mem = NULL; | 3098 | struct mem_cgroup *mem; |
3062 | int ret; | 3099 | int ret; |
3063 | 3100 | ||
3064 | if (mem_cgroup_disabled()) | 3101 | if (mem_cgroup_disabled()) |
@@ -3071,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, | |||
3071 | return ret; | 3108 | return ret; |
3072 | } | 3109 | } |
3073 | 3110 | ||
3111 | #ifdef CONFIG_DEBUG_VM | ||
3112 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | ||
3113 | { | ||
3114 | struct page_cgroup *pc; | ||
3115 | |||
3116 | pc = lookup_page_cgroup(page); | ||
3117 | if (likely(pc) && PageCgroupUsed(pc)) | ||
3118 | return pc; | ||
3119 | return NULL; | ||
3120 | } | ||
3121 | |||
3122 | bool mem_cgroup_bad_page_check(struct page *page) | ||
3123 | { | ||
3124 | if (mem_cgroup_disabled()) | ||
3125 | return false; | ||
3126 | |||
3127 | return lookup_page_cgroup_used(page) != NULL; | ||
3128 | } | ||
3129 | |||
3130 | void mem_cgroup_print_bad_page(struct page *page) | ||
3131 | { | ||
3132 | struct page_cgroup *pc; | ||
3133 | |||
3134 | pc = lookup_page_cgroup_used(page); | ||
3135 | if (pc) { | ||
3136 | int ret = -1; | ||
3137 | char *path; | ||
3138 | |||
3139 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", | ||
3140 | pc, pc->flags, pc->mem_cgroup); | ||
3141 | |||
3142 | path = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3143 | if (path) { | ||
3144 | rcu_read_lock(); | ||
3145 | ret = cgroup_path(pc->mem_cgroup->css.cgroup, | ||
3146 | path, PATH_MAX); | ||
3147 | rcu_read_unlock(); | ||
3148 | } | ||
3149 | |||
3150 | printk(KERN_CONT "(%s)\n", | ||
3151 | (ret < 0) ? "cannot get the path" : path); | ||
3152 | kfree(path); | ||
3153 | } | ||
3154 | } | ||
3155 | #endif | ||
3156 | |||
3074 | static DEFINE_MUTEX(set_limit_mutex); | 3157 | static DEFINE_MUTEX(set_limit_mutex); |
3075 | 3158 | ||
3076 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 3159 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
@@ -3314,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3314 | loop += 256; | 3397 | loop += 256; |
3315 | busy = NULL; | 3398 | busy = NULL; |
3316 | while (loop--) { | 3399 | while (loop--) { |
3400 | struct page *page; | ||
3401 | |||
3317 | ret = 0; | 3402 | ret = 0; |
3318 | spin_lock_irqsave(&zone->lru_lock, flags); | 3403 | spin_lock_irqsave(&zone->lru_lock, flags); |
3319 | if (list_empty(list)) { | 3404 | if (list_empty(list)) { |
@@ -3329,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3329 | } | 3414 | } |
3330 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3415 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3331 | 3416 | ||
3332 | ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); | 3417 | page = lookup_cgroup_page(pc); |
3418 | |||
3419 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); | ||
3333 | if (ret == -ENOMEM) | 3420 | if (ret == -ENOMEM) |
3334 | break; | 3421 | break; |
3335 | 3422 | ||
@@ -3477,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3477 | } | 3564 | } |
3478 | 3565 | ||
3479 | 3566 | ||
3480 | static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | 3567 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, |
3481 | enum mem_cgroup_stat_index idx) | 3568 | enum mem_cgroup_stat_index idx) |
3482 | { | 3569 | { |
3483 | struct mem_cgroup *iter; | 3570 | struct mem_cgroup *iter; |
3484 | s64 val = 0; | 3571 | long val = 0; |
3485 | 3572 | ||
3486 | /* each per cpu's value can be minus.Then, use s64 */ | 3573 | /* Per-cpu values can be negative, use a signed accumulator */ |
3487 | for_each_mem_cgroup_tree(iter, mem) | 3574 | for_each_mem_cgroup_tree(iter, mem) |
3488 | val += mem_cgroup_read_stat(iter, idx); | 3575 | val += mem_cgroup_read_stat(iter, idx); |
3489 | 3576 | ||
@@ -3503,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | |||
3503 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | 3590 | return res_counter_read_u64(&mem->memsw, RES_USAGE); |
3504 | } | 3591 | } |
3505 | 3592 | ||
3506 | val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); | 3593 | val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); |
3507 | val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); | 3594 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); |
3508 | 3595 | ||
3509 | if (swap) | 3596 | if (swap) |
3510 | val += mem_cgroup_get_recursive_idx_stat(mem, | 3597 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
3511 | MEM_CGROUP_STAT_SWAPOUT); | ||
3512 | 3598 | ||
3513 | return val << PAGE_SHIFT; | 3599 | return val << PAGE_SHIFT; |
3514 | } | 3600 | } |
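
The switch from s64 to long matters only because the per-cpu counters hold deltas that can individually go negative even though the hierarchy total cannot; a user-space toy (not kernel code, PAGE_SHIFT hard-coded to 12 and the numbers invented) illustrates the signed accumulation and the final conversion to bytes:

#include <stdio.h>

int main(void)
{
        long percpu[] = { 4096, -1500, -2500, 100 };    /* invented per-cpu page deltas */
        long val = 0;
        unsigned int i;

        for (i = 0; i < sizeof(percpu) / sizeof(percpu[0]); i++)
                val += percpu[i];                       /* must be a signed sum */

        printf("%ld pages = %ld bytes\n", val, val << 12);
        return 0;
}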
@@ -3728,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | |||
3728 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3814 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
3729 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); | 3815 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
3730 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3816 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
3731 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3817 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); |
3732 | s->stat[MCS_PGPGIN] += val; | 3818 | s->stat[MCS_PGPGIN] += val; |
3733 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3819 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); |
3734 | s->stat[MCS_PGPGOUT] += val; | 3820 | s->stat[MCS_PGPGOUT] += val; |
3735 | if (do_swap_account) { | 3821 | if (do_swap_account) { |
3736 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 3822 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
@@ -3854,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
3854 | return -EINVAL; | 3940 | return -EINVAL; |
3855 | } | 3941 | } |
3856 | 3942 | ||
3857 | spin_lock(&memcg->reclaim_param_lock); | ||
3858 | memcg->swappiness = val; | 3943 | memcg->swappiness = val; |
3859 | spin_unlock(&memcg->reclaim_param_lock); | ||
3860 | 3944 | ||
3861 | cgroup_unlock(); | 3945 | cgroup_unlock(); |
3862 | 3946 | ||
@@ -4512,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4512 | res_counter_init(&mem->memsw, NULL); | 4596 | res_counter_init(&mem->memsw, NULL); |
4513 | } | 4597 | } |
4514 | mem->last_scanned_child = 0; | 4598 | mem->last_scanned_child = 0; |
4515 | spin_lock_init(&mem->reclaim_param_lock); | ||
4516 | INIT_LIST_HEAD(&mem->oom_notify); | 4599 | INIT_LIST_HEAD(&mem->oom_notify); |
4517 | 4600 | ||
4518 | if (parent) | 4601 | if (parent) |
@@ -4600,8 +4683,7 @@ one_by_one: | |||
4600 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 4683 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
4601 | cond_resched(); | 4684 | cond_resched(); |
4602 | } | 4685 | } |
4603 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 4686 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); |
4604 | PAGE_SIZE); | ||
4605 | if (ret || !mem) | 4687 | if (ret || !mem) |
4606 | /* mem_cgroup_clear_mc() will do uncharge later */ | 4688 | /* mem_cgroup_clear_mc() will do uncharge later */ |
4607 | return -ENOMEM; | 4689 | return -ENOMEM; |
@@ -4947,8 +5029,8 @@ retry: | |||
4947 | if (isolate_lru_page(page)) | 5029 | if (isolate_lru_page(page)) |
4948 | goto put; | 5030 | goto put; |
4949 | pc = lookup_page_cgroup(page); | 5031 | pc = lookup_page_cgroup(page); |
4950 | if (!mem_cgroup_move_account(pc, | 5032 | if (!mem_cgroup_move_account(page, 1, pc, |
4951 | mc.from, mc.to, false, PAGE_SIZE)) { | 5033 | mc.from, mc.to, false)) { |
4952 | mc.precharge--; | 5034 | mc.precharge--; |
4953 | /* we uncharge from mc.from later. */ | 5035 | /* we uncharge from mc.from later. */ |
4954 | mc.moved_charge++; | 5036 | mc.moved_charge++; |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e0af336530c6..37feb9fec228 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -945,7 +945,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
945 | collect_procs(ppage, &tokill); | 945 | collect_procs(ppage, &tokill); |
946 | 946 | ||
947 | if (hpage != ppage) | 947 | if (hpage != ppage) |
948 | lock_page_nosync(ppage); | 948 | lock_page(ppage); |
949 | 949 | ||
950 | ret = try_to_unmap(ppage, ttu); | 950 | ret = try_to_unmap(ppage, ttu); |
951 | if (ret != SWAP_SUCCESS) | 951 | if (ret != SWAP_SUCCESS) |
@@ -1038,7 +1038,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1038 | * Check "just unpoisoned", "filter hit", and | 1038 | * Check "just unpoisoned", "filter hit", and |
1039 | * "race with other subpage." | 1039 | * "race with other subpage." |
1040 | */ | 1040 | */ |
1041 | lock_page_nosync(hpage); | 1041 | lock_page(hpage); |
1042 | if (!PageHWPoison(hpage) | 1042 | if (!PageHWPoison(hpage) |
1043 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) | 1043 | || (hwpoison_filter(p) && TestClearPageHWPoison(p)) |
1044 | || (p != hpage && TestSetPageHWPoison(hpage))) { | 1044 | || (p != hpage && TestSetPageHWPoison(hpage))) { |
@@ -1088,7 +1088,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1088 | * It's very difficult to mess with pages currently under IO | 1088 | * It's very difficult to mess with pages currently under IO |
1089 | * and in many cases impossible, so we just avoid it here. | 1089 | * and in many cases impossible, so we just avoid it here. |
1090 | */ | 1090 | */ |
1091 | lock_page_nosync(hpage); | 1091 | lock_page(hpage); |
1092 | 1092 | ||
1093 | /* | 1093 | /* |
1094 | * unpoison always clear PG_hwpoison inside page lock | 1094 | * unpoison always clear PG_hwpoison inside page lock |
@@ -1231,7 +1231,7 @@ int unpoison_memory(unsigned long pfn) | |||
1231 | return 0; | 1231 | return 0; |
1232 | } | 1232 | } |
1233 | 1233 | ||
1234 | lock_page_nosync(page); | 1234 | lock_page(page); |
1235 | /* | 1235 | /* |
1236 | * This test is racy because PG_hwpoison is set outside of page lock. | 1236 | * This test is racy because PG_hwpoison is set outside of page lock. |
1237 | * That's acceptable because that won't trigger kernel panic. Instead, | 1237 | * That's acceptable because that won't trigger kernel panic. Instead, |
diff --git a/mm/memory.c b/mm/memory.c index 615be5127ce1..51a5c23704af 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -1486,9 +1486,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1486 | struct vm_area_struct *vma; | 1486 | struct vm_area_struct *vma; |
1487 | 1487 | ||
1488 | vma = find_extend_vma(mm, start); | 1488 | vma = find_extend_vma(mm, start); |
1489 | if (!vma && in_gate_area(tsk, start)) { | 1489 | if (!vma && in_gate_area(mm, start)) { |
1490 | unsigned long pg = start & PAGE_MASK; | 1490 | unsigned long pg = start & PAGE_MASK; |
1491 | struct vm_area_struct *gate_vma = get_gate_vma(tsk); | 1491 | struct vm_area_struct *gate_vma = get_gate_vma(mm); |
1492 | pgd_t *pgd; | 1492 | pgd_t *pgd; |
1493 | pud_t *pud; | 1493 | pud_t *pud; |
1494 | pmd_t *pmd; | 1494 | pmd_t *pmd; |
@@ -1591,10 +1591,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1591 | return i ? i : -EFAULT; | 1591 | return i ? i : -EFAULT; |
1592 | BUG(); | 1592 | BUG(); |
1593 | } | 1593 | } |
1594 | if (ret & VM_FAULT_MAJOR) | 1594 | |
1595 | tsk->maj_flt++; | 1595 | if (tsk) { |
1596 | else | 1596 | if (ret & VM_FAULT_MAJOR) |
1597 | tsk->min_flt++; | 1597 | tsk->maj_flt++; |
1598 | else | ||
1599 | tsk->min_flt++; | ||
1600 | } | ||
1598 | 1601 | ||
1599 | if (ret & VM_FAULT_RETRY) { | 1602 | if (ret & VM_FAULT_RETRY) { |
1600 | if (nonblocking) | 1603 | if (nonblocking) |
@@ -1641,7 +1644,8 @@ EXPORT_SYMBOL(__get_user_pages); | |||
1641 | 1644 | ||
1642 | /** | 1645 | /** |
1643 | * get_user_pages() - pin user pages in memory | 1646 | * get_user_pages() - pin user pages in memory |
1644 | * @tsk: task_struct of target task | 1647 | * @tsk: the task_struct to use for page fault accounting, or |
1648 | * NULL if faults are not to be recorded. | ||
1645 | * @mm: mm_struct of target mm | 1649 | * @mm: mm_struct of target mm |
1646 | * @start: starting user address | 1650 | * @start: starting user address |
1647 | * @nr_pages: number of pages from start to pin | 1651 | * @nr_pages: number of pages from start to pin |
@@ -2767,7 +2771,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2767 | swp_entry_t entry; | 2771 | swp_entry_t entry; |
2768 | pte_t pte; | 2772 | pte_t pte; |
2769 | int locked; | 2773 | int locked; |
2770 | struct mem_cgroup *ptr = NULL; | 2774 | struct mem_cgroup *ptr; |
2771 | int exclusive = 0; | 2775 | int exclusive = 0; |
2772 | int ret = 0; | 2776 | int ret = 0; |
2773 | 2777 | ||
@@ -3499,7 +3503,7 @@ static int __init gate_vma_init(void) | |||
3499 | __initcall(gate_vma_init); | 3503 | __initcall(gate_vma_init); |
3500 | #endif | 3504 | #endif |
3501 | 3505 | ||
3502 | struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | 3506 | struct vm_area_struct *get_gate_vma(struct mm_struct *mm) |
3503 | { | 3507 | { |
3504 | #ifdef AT_SYSINFO_EHDR | 3508 | #ifdef AT_SYSINFO_EHDR |
3505 | return &gate_vma; | 3509 | return &gate_vma; |
@@ -3508,7 +3512,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) | |||
3508 | #endif | 3512 | #endif |
3509 | } | 3513 | } |
3510 | 3514 | ||
3511 | int in_gate_area_no_task(unsigned long addr) | 3515 | int in_gate_area_no_mm(unsigned long addr) |
3512 | { | 3516 | { |
3513 | #ifdef AT_SYSINFO_EHDR | 3517 | #ifdef AT_SYSINFO_EHDR |
3514 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) | 3518 | if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) |
@@ -3649,20 +3653,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
3649 | #endif | 3653 | #endif |
3650 | 3654 | ||
3651 | /* | 3655 | /* |
3652 | * Access another process' address space. | 3656 | * Access another process' address space as given in mm. If non-NULL, use the |
3653 | * Source/target buffer must be kernel space, | 3657 | * given task for page fault accounting. |
3654 | * Do not walk the page table directly, use get_user_pages | ||
3655 | */ | 3658 | */ |
3656 | int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) | 3659 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
3660 | unsigned long addr, void *buf, int len, int write) | ||
3657 | { | 3661 | { |
3658 | struct mm_struct *mm; | ||
3659 | struct vm_area_struct *vma; | 3662 | struct vm_area_struct *vma; |
3660 | void *old_buf = buf; | 3663 | void *old_buf = buf; |
3661 | 3664 | ||
3662 | mm = get_task_mm(tsk); | ||
3663 | if (!mm) | ||
3664 | return 0; | ||
3665 | |||
3666 | down_read(&mm->mmap_sem); | 3665 | down_read(&mm->mmap_sem); |
3667 | /* ignore errors, just check how much was successfully transferred */ | 3666 | /* ignore errors, just check how much was successfully transferred */ |
3668 | while (len) { | 3667 | while (len) { |
@@ -3711,11 +3710,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
3711 | addr += bytes; | 3710 | addr += bytes; |
3712 | } | 3711 | } |
3713 | up_read(&mm->mmap_sem); | 3712 | up_read(&mm->mmap_sem); |
3714 | mmput(mm); | ||
3715 | 3713 | ||
3716 | return buf - old_buf; | 3714 | return buf - old_buf; |
3717 | } | 3715 | } |
3718 | 3716 | ||
3717 | /** | ||
3718 | * access_remote_vm() - access another process' address space | ||
3719 | * @mm: the mm_struct of the target address space | ||
3720 | * @addr: start address to access | ||
3721 | * @buf: source or destination buffer | ||
3722 | * @len: number of bytes to transfer | ||
3723 | * @write: whether the access is a write | ||
3724 | * | ||
3725 | * The caller must hold a reference on @mm. | ||
3726 | */ | ||
3727 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, | ||
3728 | void *buf, int len, int write) | ||
3729 | { | ||
3730 | return __access_remote_vm(NULL, mm, addr, buf, len, write); | ||
3731 | } | ||
3732 | |||
3733 | /* | ||
3734 | * Access another process' address space. | ||
3735 | * Source/target buffer must be kernel space, | ||
3736 | * Do not walk the page table directly, use get_user_pages | ||
3737 | */ | ||
3738 | int access_process_vm(struct task_struct *tsk, unsigned long addr, | ||
3739 | void *buf, int len, int write) | ||
3740 | { | ||
3741 | struct mm_struct *mm; | ||
3742 | int ret; | ||
3743 | |||
3744 | mm = get_task_mm(tsk); | ||
3745 | if (!mm) | ||
3746 | return 0; | ||
3747 | |||
3748 | ret = __access_remote_vm(tsk, mm, addr, buf, len, write); | ||
3749 | mmput(mm); | ||
3750 | |||
3751 | return ret; | ||
3752 | } | ||
3753 | |||
3719 | /* | 3754 | /* |
3720 | * Print the name of a VMA. | 3755 | * Print the name of a VMA. |
3721 | */ | 3756 | */ |
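
The split lets callers that already hold a long-lived mm reference (the /proc/<pid>/mem style of user, which pins the mm at open time) skip the get_task_mm()/mmput() round trip that access_process_vm() still performs; a minimal sketch with a made-up helper name:

static int peek_remote(struct mm_struct *mm, unsigned long addr,
                       void *buf, int len)
{
        /* The caller must already hold a reference on @mm. */
        return access_remote_vm(mm, addr, buf, len, 0 /* read, not write */);
}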
diff --git a/mm/migrate.c b/mm/migrate.c index 89e5c3fe8bbc..b0406d739ea7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -633,7 +633,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
633 | struct page *newpage = get_new_page(page, private, &result); | 633 | struct page *newpage = get_new_page(page, private, &result); |
634 | int remap_swapcache = 1; | 634 | int remap_swapcache = 1; |
635 | int charge = 0; | 635 | int charge = 0; |
636 | struct mem_cgroup *mem = NULL; | 636 | struct mem_cgroup *mem; |
637 | struct anon_vma *anon_vma = NULL; | 637 | struct anon_vma *anon_vma = NULL; |
638 | 638 | ||
639 | if (!newpage) | 639 | if (!newpage) |
diff --git a/mm/mlock.c b/mm/mlock.c index c3924c7f00be..2689a08c79af 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -237,7 +237,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
237 | 237 | ||
238 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || | 238 | if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || |
239 | is_vm_hugetlb_page(vma) || | 239 | is_vm_hugetlb_page(vma) || |
240 | vma == get_gate_vma(current))) { | 240 | vma == get_gate_vma(current->mm))) { |
241 | 241 | ||
242 | __mlock_vma_pages_range(vma, start, end, NULL); | 242 | __mlock_vma_pages_range(vma, start, end, NULL); |
243 | 243 | ||
@@ -332,7 +332,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
332 | int lock = newflags & VM_LOCKED; | 332 | int lock = newflags & VM_LOCKED; |
333 | 333 | ||
334 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || | 334 | if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || |
335 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) | 335 | is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) |
336 | goto out; /* don't set VM_LOCKED, don't count */ | 336 | goto out; /* don't set VM_LOCKED, don't count */ |
337 | 337 | ||
338 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); | 338 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e2bdb07079ce..e99f6cd1da1f 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -32,14 +32,6 @@ unsigned long max_low_pfn; | |||
32 | unsigned long min_low_pfn; | 32 | unsigned long min_low_pfn; |
33 | unsigned long max_pfn; | 33 | unsigned long max_pfn; |
34 | 34 | ||
35 | #ifdef CONFIG_CRASH_DUMP | ||
36 | /* | ||
37 | * If we have booted due to a crash, max_pfn will be a very low value. We need | ||
38 | * to know the amount of memory that the previous kernel used. | ||
39 | */ | ||
40 | unsigned long saved_max_pfn; | ||
41 | #endif | ||
42 | |||
43 | static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | 35 | static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, |
44 | u64 goal, u64 limit) | 36 | u64 goal, u64 limit) |
45 | { | 37 | { |
diff --git a/mm/nommu.c b/mm/nommu.c index f59e1424d3db..cb86e7d5e7f5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1842,10 +1842,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
1842 | } | 1842 | } |
1843 | EXPORT_SYMBOL(remap_vmalloc_range); | 1843 | EXPORT_SYMBOL(remap_vmalloc_range); |
1844 | 1844 | ||
1845 | void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | ||
1846 | { | ||
1847 | } | ||
1848 | |||
1849 | unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, | 1845 | unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, |
1850 | unsigned long len, unsigned long pgoff, unsigned long flags) | 1846 | unsigned long len, unsigned long pgoff, unsigned long flags) |
1851 | { | 1847 | { |
@@ -1963,7 +1959,7 @@ error: | |||
1963 | return -ENOMEM; | 1959 | return -ENOMEM; |
1964 | } | 1960 | } |
1965 | 1961 | ||
1966 | int in_gate_area_no_task(unsigned long addr) | 1962 | int in_gate_area_no_mm(unsigned long addr) |
1967 | { | 1963 | { |
1968 | return 0; | 1964 | return 0; |
1969 | } | 1965 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3100bc57036b..6a819d1b2c7d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -406,7 +406,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
406 | task_unlock(current); | 406 | task_unlock(current); |
407 | dump_stack(); | 407 | dump_stack(); |
408 | mem_cgroup_print_oom_info(mem, p); | 408 | mem_cgroup_print_oom_info(mem, p); |
409 | __show_mem(SHOW_MEM_FILTER_NODES); | 409 | show_mem(SHOW_MEM_FILTER_NODES); |
410 | if (sysctl_oom_dump_tasks) | 410 | if (sysctl_oom_dump_tasks) |
411 | dump_tasks(mem, nodemask); | 411 | dump_tasks(mem, nodemask); |
412 | } | 412 | } |
@@ -549,6 +549,17 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
549 | unsigned int points = 0; | 549 | unsigned int points = 0; |
550 | struct task_struct *p; | 550 | struct task_struct *p; |
551 | 551 | ||
552 | /* | ||
553 | * If current has a pending SIGKILL, then automatically select it. The | ||
554 | * goal is to allow it to allocate so that it may quickly exit and free | ||
555 | * its memory. | ||
556 | */ | ||
557 | if (fatal_signal_pending(current)) { | ||
558 | set_thread_flag(TIF_MEMDIE); | ||
559 | boost_dying_task_prio(current, NULL); | ||
560 | return; | ||
561 | } | ||
562 | |||
552 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); | 563 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); |
553 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; | 564 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; |
554 | read_lock(&tasklist_lock); | 565 | read_lock(&tasklist_lock); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 632b46479c94..31f698862420 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1040,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc, | |||
1040 | int generic_writepages(struct address_space *mapping, | 1040 | int generic_writepages(struct address_space *mapping, |
1041 | struct writeback_control *wbc) | 1041 | struct writeback_control *wbc) |
1042 | { | 1042 | { |
1043 | struct blk_plug plug; | ||
1044 | int ret; | ||
1045 | |||
1043 | /* deal with chardevs and other special file */ | 1046 | /* deal with chardevs and other special file */ |
1044 | if (!mapping->a_ops->writepage) | 1047 | if (!mapping->a_ops->writepage) |
1045 | return 0; | 1048 | return 0; |
1046 | 1049 | ||
1047 | return write_cache_pages(mapping, wbc, __writepage, mapping); | 1050 | blk_start_plug(&plug); |
1051 | ret = write_cache_pages(mapping, wbc, __writepage, mapping); | ||
1052 | blk_finish_plug(&plug); | ||
1053 | return ret; | ||
1048 | } | 1054 | } |
1049 | 1055 | ||
1050 | EXPORT_SYMBOL(generic_writepages); | 1056 | EXPORT_SYMBOL(generic_writepages); |
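
This hunk and the read_pages() change in mm/readahead.c further down follow the same on-stack plugging shape; isolated from either call site, the pattern is roughly the sketch below (illustrative only, wrapping an arbitrary batch of bio submissions):

static void plugged_submit(struct bio *bios[], int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);                  /* requests queue up per-task */
        for (i = 0; i < nr; i++)
                submit_bio(READ, bios[i]);      /* not yet sent to the driver */
        blk_finish_plug(&plug);                 /* the whole batch is flushed here */
}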
@@ -1251,7 +1257,7 @@ int set_page_dirty_lock(struct page *page) | |||
1251 | { | 1257 | { |
1252 | int ret; | 1258 | int ret; |
1253 | 1259 | ||
1254 | lock_page_nosync(page); | 1260 | lock_page(page); |
1255 | ret = set_page_dirty(page); | 1261 | ret = set_page_dirty(page); |
1256 | unlock_page(page); | 1262 | unlock_page(page); |
1257 | return ret; | 1263 | return ret; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3a58221f4c22..d6e7ba7373be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #include <linux/compaction.h> | 53 | #include <linux/compaction.h> |
54 | #include <trace/events/kmem.h> | 54 | #include <trace/events/kmem.h> |
55 | #include <linux/ftrace_event.h> | 55 | #include <linux/ftrace_event.h> |
56 | #include <linux/memcontrol.h> | ||
56 | 57 | ||
57 | #include <asm/tlbflush.h> | 58 | #include <asm/tlbflush.h> |
58 | #include <asm/div64.h> | 59 | #include <asm/div64.h> |
@@ -565,7 +566,8 @@ static inline int free_pages_check(struct page *page) | |||
565 | if (unlikely(page_mapcount(page) | | 566 | if (unlikely(page_mapcount(page) | |
566 | (page->mapping != NULL) | | 567 | (page->mapping != NULL) | |
567 | (atomic_read(&page->_count) != 0) | | 568 | (atomic_read(&page->_count) != 0) | |
568 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 569 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | |
570 | (mem_cgroup_bad_page_check(page)))) { | ||
569 | bad_page(page); | 571 | bad_page(page); |
570 | return 1; | 572 | return 1; |
571 | } | 573 | } |
@@ -754,7 +756,8 @@ static inline int check_new_page(struct page *page) | |||
754 | if (unlikely(page_mapcount(page) | | 756 | if (unlikely(page_mapcount(page) | |
755 | (page->mapping != NULL) | | 757 | (page->mapping != NULL) | |
756 | (atomic_read(&page->_count) != 0) | | 758 | (atomic_read(&page->_count) != 0) | |
757 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 759 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | |
760 | (mem_cgroup_bad_page_check(page)))) { | ||
758 | bad_page(page); | 761 | bad_page(page); |
759 | return 1; | 762 | return 1; |
760 | } | 763 | } |
@@ -2192,7 +2195,7 @@ nopage: | |||
2192 | current->comm, order, gfp_mask); | 2195 | current->comm, order, gfp_mask); |
2193 | dump_stack(); | 2196 | dump_stack(); |
2194 | if (!should_suppress_show_mem()) | 2197 | if (!should_suppress_show_mem()) |
2195 | __show_mem(filter); | 2198 | show_mem(filter); |
2196 | } | 2199 | } |
2197 | return page; | 2200 | return page; |
2198 | got_pg: | 2201 | got_pg: |
@@ -5684,4 +5687,5 @@ void dump_page(struct page *page) | |||
5684 | page, atomic_read(&page->_count), page_mapcount(page), | 5687 | page, atomic_read(&page->_count), page_mapcount(page), |
5685 | page->mapping, page->index); | 5688 | page->mapping, page->index); |
5686 | dump_page_flags(page->flags); | 5689 | dump_page_flags(page->flags); |
5690 | mem_cgroup_print_bad_page(page); | ||
5687 | } | 5691 | } |
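
The two hooks used in the checks above have to compile away when the debug code is not built in; presumably the header carries no-op stubs along these lines (a sketch inferred from context, the real include/linux/memcontrol.h may differ):

#if defined(CONFIG_DEBUG_VM) && defined(CONFIG_CGROUP_MEM_RES_CTLR)
bool mem_cgroup_bad_page_check(struct page *page);
void mem_cgroup_print_bad_page(struct page *page);
#else
static inline bool mem_cgroup_bad_page_check(struct page *page)
{
        return false;
}

static inline void mem_cgroup_print_bad_page(struct page *page)
{
}
#endif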
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 59a3cd4c799d..a12cc3fa9859 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -11,12 +11,11 @@ | |||
11 | #include <linux/swapops.h> | 11 | #include <linux/swapops.h> |
12 | #include <linux/kmemleak.h> | 12 | #include <linux/kmemleak.h> |
13 | 13 | ||
14 | static void __meminit | 14 | static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id) |
15 | __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) | ||
16 | { | 15 | { |
17 | pc->flags = 0; | 16 | pc->flags = 0; |
17 | set_page_cgroup_array_id(pc, id); | ||
18 | pc->mem_cgroup = NULL; | 18 | pc->mem_cgroup = NULL; |
19 | pc->page = pfn_to_page(pfn); | ||
20 | INIT_LIST_HEAD(&pc->lru); | 19 | INIT_LIST_HEAD(&pc->lru); |
21 | } | 20 | } |
22 | static unsigned long total_usage; | 21 | static unsigned long total_usage; |
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
43 | return base + offset; | 42 | return base + offset; |
44 | } | 43 | } |
45 | 44 | ||
45 | struct page *lookup_cgroup_page(struct page_cgroup *pc) | ||
46 | { | ||
47 | unsigned long pfn; | ||
48 | struct page *page; | ||
49 | pg_data_t *pgdat; | ||
50 | |||
51 | pgdat = NODE_DATA(page_cgroup_array_id(pc)); | ||
52 | pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn; | ||
53 | page = pfn_to_page(pfn); | ||
54 | VM_BUG_ON(pc != lookup_page_cgroup(page)); | ||
55 | return page; | ||
56 | } | ||
57 | |||
46 | static int __init alloc_node_page_cgroup(int nid) | 58 | static int __init alloc_node_page_cgroup(int nid) |
47 | { | 59 | { |
48 | struct page_cgroup *base, *pc; | 60 | struct page_cgroup *base, *pc; |
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid) | |||
63 | return -ENOMEM; | 75 | return -ENOMEM; |
64 | for (index = 0; index < nr_pages; index++) { | 76 | for (index = 0; index < nr_pages; index++) { |
65 | pc = base + index; | 77 | pc = base + index; |
66 | __init_page_cgroup(pc, start_pfn + index); | 78 | init_page_cgroup(pc, nid); |
67 | } | 79 | } |
68 | NODE_DATA(nid)->node_page_cgroup = base; | 80 | NODE_DATA(nid)->node_page_cgroup = base; |
69 | total_usage += table_size; | 81 | total_usage += table_size; |
@@ -105,46 +117,75 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
105 | return section->page_cgroup + pfn; | 117 | return section->page_cgroup + pfn; |
106 | } | 118 | } |
107 | 119 | ||
108 | /* __alloc_bootmem...() is protected by !slab_available() */ | 120 | struct page *lookup_cgroup_page(struct page_cgroup *pc) |
121 | { | ||
122 | struct mem_section *section; | ||
123 | struct page *page; | ||
124 | unsigned long nr; | ||
125 | |||
126 | nr = page_cgroup_array_id(pc); | ||
127 | section = __nr_to_section(nr); | ||
128 | page = pfn_to_page(pc - section->page_cgroup); | ||
129 | VM_BUG_ON(pc != lookup_page_cgroup(page)); | ||
130 | return page; | ||
131 | } | ||
132 | |||
133 | static void *__init_refok alloc_page_cgroup(size_t size, int nid) | ||
134 | { | ||
135 | void *addr = NULL; | ||
136 | |||
137 | addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN); | ||
138 | if (addr) | ||
139 | return addr; | ||
140 | |||
141 | if (node_state(nid, N_HIGH_MEMORY)) | ||
142 | addr = vmalloc_node(size, nid); | ||
143 | else | ||
144 | addr = vmalloc(size); | ||
145 | |||
146 | return addr; | ||
147 | } | ||
148 | |||
149 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
150 | static void free_page_cgroup(void *addr) | ||
151 | { | ||
152 | if (is_vmalloc_addr(addr)) { | ||
153 | vfree(addr); | ||
154 | } else { | ||
155 | struct page *page = virt_to_page(addr); | ||
156 | size_t table_size = | ||
157 | sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
158 | |||
159 | BUG_ON(PageReserved(page)); | ||
160 | free_pages_exact(addr, table_size); | ||
161 | } | ||
162 | } | ||
163 | #endif | ||
164 | |||
109 | static int __init_refok init_section_page_cgroup(unsigned long pfn) | 165 | static int __init_refok init_section_page_cgroup(unsigned long pfn) |
110 | { | 166 | { |
111 | struct mem_section *section = __pfn_to_section(pfn); | ||
112 | struct page_cgroup *base, *pc; | 167 | struct page_cgroup *base, *pc; |
168 | struct mem_section *section; | ||
113 | unsigned long table_size; | 169 | unsigned long table_size; |
170 | unsigned long nr; | ||
114 | int nid, index; | 171 | int nid, index; |
115 | 172 | ||
116 | if (!section->page_cgroup) { | 173 | nr = pfn_to_section_nr(pfn); |
117 | nid = page_to_nid(pfn_to_page(pfn)); | 174 | section = __nr_to_section(nr); |
118 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 175 | |
119 | VM_BUG_ON(!slab_is_available()); | 176 | if (section->page_cgroup) |
120 | if (node_state(nid, N_HIGH_MEMORY)) { | 177 | return 0; |
121 | base = kmalloc_node(table_size, | 178 | |
122 | GFP_KERNEL | __GFP_NOWARN, nid); | 179 | nid = page_to_nid(pfn_to_page(pfn)); |
123 | if (!base) | 180 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
124 | base = vmalloc_node(table_size, nid); | 181 | base = alloc_page_cgroup(table_size, nid); |
125 | } else { | 182 | |
126 | base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); | 183 | /* |
127 | if (!base) | 184 | * The value stored in section->page_cgroup is (base - pfn) |
128 | base = vmalloc(table_size); | 185 | * and it does not point to the memory block allocated above, |
129 | } | 186 | * causing kmemleak false positives. |
130 | /* | 187 | */ |
131 | * The value stored in section->page_cgroup is (base - pfn) | 188 | kmemleak_not_leak(base); |
132 | * and it does not point to the memory block allocated above, | ||
133 | * causing kmemleak false positives. | ||
134 | */ | ||
135 | kmemleak_not_leak(base); | ||
136 | } else { | ||
137 | /* | ||
138 | * We don't have to allocate page_cgroup again, but | ||
139 | * address of memmap may be changed. So, we have to initialize | ||
140 | * again. | ||
141 | */ | ||
142 | base = section->page_cgroup + pfn; | ||
143 | table_size = 0; | ||
144 | /* check address of memmap is changed or not. */ | ||
145 | if (base->page == pfn_to_page(pfn)) | ||
146 | return 0; | ||
147 | } | ||
148 | 189 | ||
149 | if (!base) { | 190 | if (!base) { |
150 | printk(KERN_ERR "page cgroup allocation failure\n"); | 191 | printk(KERN_ERR "page cgroup allocation failure\n"); |
@@ -153,7 +194,7 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
153 | 194 | ||
154 | for (index = 0; index < PAGES_PER_SECTION; index++) { | 195 | for (index = 0; index < PAGES_PER_SECTION; index++) { |
155 | pc = base + index; | 196 | pc = base + index; |
156 | __init_page_cgroup(pc, pfn + index); | 197 | init_page_cgroup(pc, nr); |
157 | } | 198 | } |
158 | 199 | ||
159 | section->page_cgroup = base - pfn; | 200 | section->page_cgroup = base - pfn; |
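
With pc->page gone, the page is recomputed from the array id stored in the page_cgroup; a debug-style round trip (an illustrative helper, not part of the patch) states the invariant that the VM_BUG_ON in lookup_cgroup_page() enforces:

static void check_page_cgroup_round_trip(struct page *page)
{
        struct page_cgroup *pc = lookup_page_cgroup(page);

        if (pc)                                 /* may be NULL very early during boot */
                VM_BUG_ON(lookup_cgroup_page(pc) != page);
}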
@@ -170,16 +211,8 @@ void __free_page_cgroup(unsigned long pfn) | |||
170 | if (!ms || !ms->page_cgroup) | 211 | if (!ms || !ms->page_cgroup) |
171 | return; | 212 | return; |
172 | base = ms->page_cgroup + pfn; | 213 | base = ms->page_cgroup + pfn; |
173 | if (is_vmalloc_addr(base)) { | 214 | free_page_cgroup(base); |
174 | vfree(base); | 215 | ms->page_cgroup = NULL; |
175 | ms->page_cgroup = NULL; | ||
176 | } else { | ||
177 | struct page *page = virt_to_page(base); | ||
178 | if (!PageReserved(page)) { /* Is bootmem ? */ | ||
179 | kfree(base); | ||
180 | ms->page_cgroup = NULL; | ||
181 | } | ||
182 | } | ||
183 | } | 216 | } |
184 | 217 | ||
185 | int __meminit online_page_cgroup(unsigned long start_pfn, | 218 | int __meminit online_page_cgroup(unsigned long start_pfn, |
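The rewritten init_section_page_cgroup() and __free_page_cgroup() above delegate to alloc_page_cgroup() and free_page_cgroup(); the BUG_ON(PageReserved())/free_pages_exact() fragment at the top of this file's diff is the tail of those helpers, whose full bodies fall outside the context shown. A rough sketch of such a pair, assuming it simply consolidates the kmalloc/vmalloc fallback the old inline code used (helper names come from the hunk, the bodies are reconstructed, and node-local page allocation is omitted for brevity):

/*
 * Sketch only: per-section page_cgroup table allocation, mirroring the
 * allocate-then-fall-back-to-vmalloc behaviour of the old inline code in
 * init_section_page_cgroup().  The real helper bodies are not part of the
 * context shown above.
 */
static void *alloc_page_cgroup(size_t size, int nid)
{
	void *addr;

	addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_NOWARN);
	if (addr)
		return addr;

	if (node_state(nid, N_HIGH_MEMORY))
		return vmalloc_node(size, nid);
	return vmalloc(size);
}

static void free_page_cgroup(void *addr)
{
	if (is_vmalloc_addr(addr)) {
		vfree(addr);
	} else {
		struct page *page = virt_to_page(addr);
		size_t size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;

		/* matches the BUG_ON(PageReserved()) visible at the top of this hunk */
		BUG_ON(PageReserved(page));
		free_pages_exact(addr, size);
	}
}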
diff --git a/mm/page_io.c b/mm/page_io.c index 2dee975bf469..dc76b4d0611e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
106 | goto out; | 106 | goto out; |
107 | } | 107 | } |
108 | if (wbc->sync_mode == WB_SYNC_ALL) | 108 | if (wbc->sync_mode == WB_SYNC_ALL) |
109 | rw |= REQ_SYNC | REQ_UNPLUG; | 109 | rw |= REQ_SYNC; |
110 | count_vm_event(PSWPOUT); | 110 | count_vm_event(PSWPOUT); |
111 | set_page_writeback(page); | 111 | set_page_writeback(page); |
112 | unlock_page(page); | 112 | unlock_page(page); |
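With REQ_UNPLUG removed, swap_writepage() only needs REQ_SYNC to mark synchronous writeback; when the queue actually gets kicked is now decided by whatever on-stack plug the caller holds. A hypothetical helper (not part of the kernel) showing the flag selection the hunk leaves behind:

#include <linux/fs.h>
#include <linux/writeback.h>

/* Hypothetical helper: pick the request flags swap_writepage() submits with.
 * Synchronous writeback is tagged REQ_SYNC; there is no per-request unplug
 * flag any more, so batching is controlled by the caller's blk_plug. */
static int swap_write_flags(struct writeback_control *wbc)
{
	int rw = WRITE;

	if (wbc->sync_mode == WB_SYNC_ALL)
		rw |= REQ_SYNC;
	return rw;
}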
diff --git a/mm/readahead.c b/mm/readahead.c index 77506a291a2d..2c0cc489e288 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages); | |||
109 | static int read_pages(struct address_space *mapping, struct file *filp, | 109 | static int read_pages(struct address_space *mapping, struct file *filp, |
110 | struct list_head *pages, unsigned nr_pages) | 110 | struct list_head *pages, unsigned nr_pages) |
111 | { | 111 | { |
112 | struct blk_plug plug; | ||
112 | unsigned page_idx; | 113 | unsigned page_idx; |
113 | int ret; | 114 | int ret; |
114 | 115 | ||
116 | blk_start_plug(&plug); | ||
117 | |||
115 | if (mapping->a_ops->readpages) { | 118 | if (mapping->a_ops->readpages) { |
116 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); | 119 | ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); |
117 | /* Clean up the remaining pages */ | 120 | /* Clean up the remaining pages */ |
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
129 | page_cache_release(page); | 132 | page_cache_release(page); |
130 | } | 133 | } |
131 | ret = 0; | 134 | ret = 0; |
135 | |||
132 | out: | 136 | out: |
137 | blk_finish_plug(&plug); | ||
138 | |||
133 | return ret; | 139 | return ret; |
134 | } | 140 | } |
135 | 141 | ||
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping, | |||
554 | 560 | ||
555 | /* do read-ahead */ | 561 | /* do read-ahead */ |
556 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 562 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
557 | |||
558 | #ifdef CONFIG_BLOCK | ||
559 | /* | ||
560 | * Normally the current page is !uptodate and lock_page() will be | ||
561 | * immediately called to implicitly unplug the device. However this | ||
562 | * is not always true for RAID conifgurations, where data arrives | ||
563 | * not strictly in their submission order. In this case we need to | ||
564 | * explicitly kick off the IO. | ||
565 | */ | ||
566 | if (PageUptodate(page)) | ||
567 | blk_run_backing_dev(mapping->backing_dev_info, NULL); | ||
568 | #endif | ||
569 | } | 563 | } |
570 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 564 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
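The read_pages() hunks show the pattern that replaces the deleted unplug machinery throughout this series: bracket a batch of submissions with an on-stack struct blk_plug, and blk_finish_plug() (or a reschedule) dispatches everything queued in between. A self-contained sketch of that pattern, modelled on the no-readpages() branch of read_pages() rather than copied from it:

#include <linux/blkdev.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/pagemap.h>

/*
 * Sketch of the plugging pattern: queue a batch of readpage() calls behind
 * an on-stack plug so the block layer can merge them, then unplug once.
 */
static int submit_batched_reads(struct file *filp,
				struct address_space *mapping,
				struct list_head *pages)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	while (!list_empty(pages)) {
		struct page *page = list_entry(pages->prev, struct page, lru);

		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,
					   page->index, GFP_KERNEL))
			mapping->a_ops->readpage(filp, page);
		page_cache_release(page);
	}
	blk_finish_plug(&plug);	/* dispatch everything queued while plugged */
	return 0;
}

Because the plug lives on the submitter's stack, the old per-bdi unplug callbacks are no longer needed; that is why unplug_io_fn, blk_run_backing_dev() and the explicit PageUptodate() kick above disappear elsewhere in this patch.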
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -31,11 +31,12 @@ | |||
31 | * swap_lock (in swap_duplicate, swap_info_get) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * inode_lock (in set_page_dirty's __mark_inode_dirty) | 34 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
35 | * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty) | ||
35 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 36 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
36 | * mapping->tree_lock (widely used, in set_page_dirty, | 37 | * mapping->tree_lock (widely used, in set_page_dirty, |
37 | * in arch-dependent flush_dcache_mmap_lock, | 38 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within inode_lock in __sync_single_inode) | 39 | * within inode_wb_list_lock in __sync_single_inode) |
39 | * | 40 | * |
40 | * (code doesn't rely on that order so it could be switched around) | 41 | * (code doesn't rely on that order so it could be switched around) |
41 | * ->tasklist_lock | 42 | * ->tasklist_lock |
diff --git a/mm/shmem.c b/mm/shmem.c index 91ce9a1024d7..58da7c150ba6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -224,7 +224,6 @@ static const struct vm_operations_struct shmem_vm_ops; | |||
224 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 224 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { |
225 | .ra_pages = 0, /* No readahead */ | 225 | .ra_pages = 0, /* No readahead */ |
226 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 226 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
227 | .unplug_io_fn = default_unplug_io_fn, | ||
228 | }; | 227 | }; |
229 | 228 | ||
230 | static LIST_HEAD(shmem_swaplist); | 229 | static LIST_HEAD(shmem_swaplist); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -849,11 +849,11 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
849 | local_irq_save(flags); | 849 | local_irq_save(flags); |
850 | kmemcheck_slab_free(s, x, s->objsize); | 850 | kmemcheck_slab_free(s, x, s->objsize); |
851 | debug_check_no_locks_freed(x, s->objsize); | 851 | debug_check_no_locks_freed(x, s->objsize); |
852 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | ||
853 | debug_check_no_obj_freed(x, s->objsize); | ||
854 | local_irq_restore(flags); | 852 | local_irq_restore(flags); |
855 | } | 853 | } |
856 | #endif | 854 | #endif |
855 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | ||
856 | debug_check_no_obj_freed(x, s->objsize); | ||
857 | } | 857 | } |
858 | 858 | ||
859 | /* | 859 | /* |
@@ -1604,7 +1604,7 @@ static inline void note_cmpxchg_failure(const char *n, | |||
1604 | 1604 | ||
1605 | void init_kmem_cache_cpus(struct kmem_cache *s) | 1605 | void init_kmem_cache_cpus(struct kmem_cache *s) |
1606 | { | 1606 | { |
1607 | #if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT) | 1607 | #ifdef CONFIG_CMPXCHG_LOCAL |
1608 | int cpu; | 1608 | int cpu; |
1609 | 1609 | ||
1610 | for_each_possible_cpu(cpu) | 1610 | for_each_possible_cpu(cpu) |
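Read together, the slab_free_hook() hunk moves debug_check_no_obj_freed() out of the irq-disabled block so it runs in every configuration again, and the init_kmem_cache_cpus() hunk drops the CONFIG_PREEMPT half of its guard. Reconstructed from the hunk plus assumed surrounding context (the enclosing #if condition and the kmemleak call are not visible here and are assumptions), slab_free_hook() ends up roughly as:

/*
 * Reconstruction from the hunk above; lines marked "assumed" are context
 * outside the diff, not shown in this patch.
 */
static inline void slab_free_hook(struct kmem_cache *s, void *x)
{
	kmemleak_free_recursive(x, s->flags);		/* assumed context */

#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)	/* assumed guard */
	{
		unsigned long flags;

		local_irq_save(flags);
		kmemcheck_slab_free(s, x, s->objsize);
		debug_check_no_locks_freed(x, s->objsize);
		local_irq_restore(flags);
	}
#endif
	/* moved below the #endif, so it now runs regardless of the guard above */
	if (!(s->flags & SLAB_DEBUG_OBJECTS))
		debug_check_no_obj_freed(x, s->objsize);
}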
diff --git a/mm/swap_state.c b/mm/swap_state.c index 5c8cfabbc9bc..46680461785b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -24,12 +24,10 @@ | |||
24 | 24 | ||
25 | /* | 25 | /* |
26 | * swapper_space is a fiction, retained to simplify the path through | 26 | * swapper_space is a fiction, retained to simplify the path through |
27 | * vmscan's shrink_page_list, to make sync_page look nicer, and to allow | 27 | * vmscan's shrink_page_list. |
28 | * future use of radix_tree tags in the swap cache. | ||
29 | */ | 28 | */ |
30 | static const struct address_space_operations swap_aops = { | 29 | static const struct address_space_operations swap_aops = { |
31 | .writepage = swap_writepage, | 30 | .writepage = swap_writepage, |
32 | .sync_page = block_sync_page, | ||
33 | .set_page_dirty = __set_page_dirty_nobuffers, | 31 | .set_page_dirty = __set_page_dirty_nobuffers, |
34 | .migratepage = migrate_page, | 32 | .migratepage = migrate_page, |
35 | }; | 33 | }; |
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = { | |||
37 | static struct backing_dev_info swap_backing_dev_info = { | 35 | static struct backing_dev_info swap_backing_dev_info = { |
38 | .name = "swap", | 36 | .name = "swap", |
39 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 37 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
40 | .unplug_io_fn = swap_unplug_io_fn, | ||
41 | }; | 38 | }; |
42 | 39 | ||
43 | struct address_space swapper_space = { | 40 | struct address_space swapper_space = { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index aafcf3611b31..8c6b3ce38f09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -95,39 +95,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | |||
95 | } | 95 | } |
96 | 96 | ||
97 | /* | 97 | /* |
98 | * We need this because the bdev->unplug_fn can sleep and we cannot | ||
99 | * hold swap_lock while calling the unplug_fn. And swap_lock | ||
100 | * cannot be turned into a mutex. | ||
101 | */ | ||
102 | static DECLARE_RWSEM(swap_unplug_sem); | ||
103 | |||
104 | void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | ||
105 | { | ||
106 | swp_entry_t entry; | ||
107 | |||
108 | down_read(&swap_unplug_sem); | ||
109 | entry.val = page_private(page); | ||
110 | if (PageSwapCache(page)) { | ||
111 | struct block_device *bdev = swap_info[swp_type(entry)]->bdev; | ||
112 | struct backing_dev_info *bdi; | ||
113 | |||
114 | /* | ||
115 | * If the page is removed from swapcache from under us (with a | ||
116 | * racy try_to_unuse/swapoff) we need an additional reference | ||
117 | * count to avoid reading garbage from page_private(page) above. | ||
118 | * If the WARN_ON triggers during a swapoff it maybe the race | ||
119 | * condition and it's harmless. However if it triggers without | ||
120 | * swapoff it signals a problem. | ||
121 | */ | ||
122 | WARN_ON(page_count(page) <= 1); | ||
123 | |||
124 | bdi = bdev->bd_inode->i_mapping->backing_dev_info; | ||
125 | blk_run_backing_dev(bdi, page); | ||
126 | } | ||
127 | up_read(&swap_unplug_sem); | ||
128 | } | ||
129 | |||
130 | /* | ||
131 | * swapon tell device that all the old swap contents can be discarded, | 98 | * swapon tell device that all the old swap contents can be discarded, |
132 | * to allow the swap device to optimize its wear-levelling. | 99 | * to allow the swap device to optimize its wear-levelling. |
133 | */ | 100 | */ |
@@ -880,7 +847,7 @@ unsigned int count_swap_pages(int type, int free) | |||
880 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | 847 | static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, |
881 | unsigned long addr, swp_entry_t entry, struct page *page) | 848 | unsigned long addr, swp_entry_t entry, struct page *page) |
882 | { | 849 | { |
883 | struct mem_cgroup *ptr = NULL; | 850 | struct mem_cgroup *ptr; |
884 | spinlock_t *ptl; | 851 | spinlock_t *ptl; |
885 | pte_t *pte; | 852 | pte_t *pte; |
886 | int ret = 1; | 853 | int ret = 1; |
@@ -1662,10 +1629,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1662 | goto out_dput; | 1629 | goto out_dput; |
1663 | } | 1630 | } |
1664 | 1631 | ||
1665 | /* wait for any unplug function to finish */ | ||
1666 | down_write(&swap_unplug_sem); | ||
1667 | up_write(&swap_unplug_sem); | ||
1668 | |||
1669 | destroy_swap_extents(p); | 1632 | destroy_swap_extents(p); |
1670 | if (p->flags & SWP_CONTINUED) | 1633 | if (p->flags & SWP_CONTINUED) |
1671 | free_swap_count_continuations(p); | 1634 | free_swap_count_continuations(p); |
@@ -2088,7 +2051,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2088 | 2051 | ||
2089 | p->swap_file = swap_file; | 2052 | p->swap_file = swap_file; |
2090 | mapping = swap_file->f_mapping; | 2053 | mapping = swap_file->f_mapping; |
2091 | inode = mapping->host; | ||
2092 | 2054 | ||
2093 | for (i = 0; i < nr_swapfiles; i++) { | 2055 | for (i = 0; i < nr_swapfiles; i++) { |
2094 | struct swap_info_struct *q = swap_info[i]; | 2056 | struct swap_info_struct *q = swap_info[i]; |
@@ -2101,6 +2063,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2101 | } | 2063 | } |
2102 | } | 2064 | } |
2103 | 2065 | ||
2066 | inode = mapping->host; | ||
2067 | /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ | ||
2104 | error = claim_swapfile(p, inode); | 2068 | error = claim_swapfile(p, inode); |
2105 | if (unlikely(error)) | 2069 | if (unlikely(error)) |
2106 | goto bad_swap; | 2070 | goto bad_swap; |
@@ -2187,8 +2151,10 @@ bad_swap: | |||
2187 | spin_unlock(&swap_lock); | 2151 | spin_unlock(&swap_lock); |
2188 | vfree(swap_map); | 2152 | vfree(swap_map); |
2189 | if (swap_file) { | 2153 | if (swap_file) { |
2190 | if (inode && S_ISREG(inode->i_mode)) | 2154 | if (inode && S_ISREG(inode->i_mode)) { |
2191 | mutex_unlock(&inode->i_mutex); | 2155 | mutex_unlock(&inode->i_mutex); |
2156 | inode = NULL; | ||
2157 | } | ||
2192 | filp_close(swap_file, NULL); | 2158 | filp_close(swap_file, NULL); |
2193 | } | 2159 | } |
2194 | out: | 2160 | out: |
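The swapon() hunks cooperate: inode is no longer read from mapping->host before the "already in use" scan, claim_swapfile() takes i_mutex for regular files (per the added comment), and the bad_swap path clears inode once i_mutex has been dropped, so the later shared exit (outside the shown context, so this reading is an inference) cannot unlock it a second time. A small stand-alone illustration of that idiom; do_setup_work() is hypothetical and merely stands in for the claim/activate steps:

#include <linux/fs.h>
#include <linux/mutex.h>

/* Hypothetical stand-in for the real setup work (claim, read header, ...). */
static int do_setup_work(struct inode *inode)
{
	return inode ? 0 : -EINVAL;
}

static int setup_example(struct file *filp)
{
	struct inode *inode = NULL;	/* stays NULL until we really have one */
	int error;

	/* ... early checks that may fail before the inode is looked at ... */

	inode = filp->f_mapping->host;
	if (S_ISREG(inode->i_mode))
		mutex_lock(&inode->i_mutex);

	error = do_setup_work(inode);
	if (error)
		goto bad;

	return 0;

bad:
	if (inode && S_ISREG(inode->i_mode)) {
		mutex_unlock(&inode->i_mutex);
		inode = NULL;	/* shared cleanup later must not unlock again */
	}
	/* ... shared cleanup that also tests 'inode' would follow here ... */
	return error;
}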
diff --git a/mm/vmscan.c b/mm/vmscan.c index 060e4c191403..f73b8657c2d0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -358,7 +358,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi, | |||
358 | static void handle_write_error(struct address_space *mapping, | 358 | static void handle_write_error(struct address_space *mapping, |
359 | struct page *page, int error) | 359 | struct page *page, int error) |
360 | { | 360 | { |
361 | lock_page_nosync(page); | 361 | lock_page(page); |
362 | if (page_mapping(page) == mapping) | 362 | if (page_mapping(page) == mapping) |
363 | mapping_set_error(mapping, error); | 363 | mapping_set_error(mapping, error); |
364 | unlock_page(page); | 364 | unlock_page(page); |