Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c   630
1 file changed, 356 insertions(+), 274 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1ee6ad9c971..1f0b460fe58c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,15 +73,6 @@ static int really_do_swap_account __initdata = 0; | |||
73 | #define do_swap_account (0) | 73 | #define do_swap_account (0) |
74 | #endif | 74 | #endif |
75 | 75 | ||
76 | /* | ||
77 | * Per memcg event counter is incremented at every pagein/pageout. This counter | ||
78 | * is used for trigger some periodic events. This is straightforward and better | ||
79 | * than using jiffies etc. to handle periodic memcg event. | ||
80 | * | ||
81 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
82 | */ | ||
83 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
84 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
85 | 76 | ||
86 | /* | 77 | /* |
87 | * Statistics for memory cgroup. | 78 | * Statistics for memory cgroup. |
@@ -93,19 +84,36 @@ enum mem_cgroup_stat_index { | |||
93 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 84 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
94 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 85 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
95 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 86 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
96 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | ||
97 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | ||
98 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 87 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
99 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | 88 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ |
100 | /* incremented at every pagein/pageout */ | ||
101 | MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, | ||
102 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | 89 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ |
103 | |||
104 | MEM_CGROUP_STAT_NSTATS, | 90 | MEM_CGROUP_STAT_NSTATS, |
105 | }; | 91 | }; |
106 | 92 | ||
93 | enum mem_cgroup_events_index { | ||
94 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | ||
95 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | ||
96 | MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ | ||
97 | MEM_CGROUP_EVENTS_NSTATS, | ||
98 | }; | ||
99 | /* | ||
100 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | ||
101 | * it will be incremented by the number of pages. This counter is used to | ||
102 | * trigger some periodic events. This is straightforward and better | ||
103 | * than using jiffies etc. to handle periodic memcg event. | ||
104 | */ | ||
105 | enum mem_cgroup_events_target { | ||
106 | MEM_CGROUP_TARGET_THRESH, | ||
107 | MEM_CGROUP_TARGET_SOFTLIMIT, | ||
108 | MEM_CGROUP_NTARGETS, | ||
109 | }; | ||
110 | #define THRESHOLDS_EVENTS_TARGET (128) | ||
111 | #define SOFTLIMIT_EVENTS_TARGET (1024) | ||
112 | |||
107 | struct mem_cgroup_stat_cpu { | 113 | struct mem_cgroup_stat_cpu { |
108 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 114 | long count[MEM_CGROUP_STAT_NSTATS]; |
115 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | ||
116 | unsigned long targets[MEM_CGROUP_NTARGETS]; | ||
109 | }; | 117 | }; |
110 | 118 | ||
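
The hunk above replaces the old power-of-two event check (!(event & ((1 << thresh) - 1))) with per-target counters: each target remembers the event count at which it should fire next, and the comparison is wraparound-safe in the style of time_after(). Below is a minimal userspace model of that scheme, not kernel code; the names mirror the patch, but the loop and the printf are purely illustrative.

#include <stdio.h>
#include <stdbool.h>

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET  1024

enum target { TARGET_THRESH, TARGET_SOFTLIMIT, NTARGETS };

static unsigned long events;               /* total pagein/pageout events */
static unsigned long targets[NTARGETS];    /* next event count to act on  */

/* wraparound-safe "reached the next target" test, as in time_after() */
static bool event_check(enum target t)
{
        return (long)targets[t] - (long)events < 0;
}

static void target_update(enum target t, unsigned long step)
{
        targets[t] = events + step;
}

int main(void)
{
        unsigned long thresh_fires = 0, softlimit_fires = 0;

        for (events = 1; events <= 4096; events++) {
                if (event_check(TARGET_THRESH)) {
                        thresh_fires++;            /* mem_cgroup_threshold() */
                        target_update(TARGET_THRESH, THRESHOLDS_EVENTS_TARGET);
                        if (event_check(TARGET_SOFTLIMIT)) {
                                softlimit_fires++; /* mem_cgroup_update_tree() */
                                target_update(TARGET_SOFTLIMIT,
                                              SOFTLIMIT_EVENTS_TARGET);
                        }
                }
        }
        printf("threshold fired %lu times, softlimit %lu times over 4096 events\n",
               thresh_fires, softlimit_fires);
        return 0;
}

Keeping the next firing point per target is what lets the threshold and soft-limit checks advance independently (roughly every 128 and every 1024 events) off a single shared event counter.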
111 | /* | 119 | /* |
@@ -218,12 +226,6 @@ struct mem_cgroup { | |||
218 | * per zone LRU lists. | 226 | * per zone LRU lists. |
219 | */ | 227 | */ |
220 | struct mem_cgroup_lru_info info; | 228 | struct mem_cgroup_lru_info info; |
221 | |||
222 | /* | ||
223 | protect against reclaim related member. | ||
224 | */ | ||
225 | spinlock_t reclaim_param_lock; | ||
226 | |||
227 | /* | 229 | /* |
228 | * While reclaiming in a hierarchy, we cache the last child we | 230 | * While reclaiming in a hierarchy, we cache the last child we |
229 | * reclaimed from. | 231 | * reclaimed from. |
@@ -327,13 +329,6 @@ enum charge_type { | |||
327 | NR_CHARGE_TYPE, | 329 | NR_CHARGE_TYPE, |
328 | }; | 330 | }; |
329 | 331 | ||
330 | /* only for here (for easy reading.) */ | ||
331 | #define PCGF_CACHE (1UL << PCG_CACHE) | ||
332 | #define PCGF_USED (1UL << PCG_USED) | ||
333 | #define PCGF_LOCK (1UL << PCG_LOCK) | ||
334 | /* Not used, but added here for completeness */ | ||
335 | #define PCGF_ACCT (1UL << PCG_ACCT) | ||
336 | |||
337 | /* for encoding cft->private value on file */ | 332 | /* for encoding cft->private value on file */ |
338 | #define _MEM (0) | 333 | #define _MEM (0) |
339 | #define _MEMSWAP (1) | 334 | #define _MEMSWAP (1) |
@@ -371,14 +366,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | |||
371 | } | 366 | } |
372 | 367 | ||
373 | static struct mem_cgroup_per_zone * | 368 | static struct mem_cgroup_per_zone * |
374 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 369 | page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) |
375 | { | 370 | { |
376 | struct mem_cgroup *mem = pc->mem_cgroup; | 371 | int nid = page_to_nid(page); |
377 | int nid = page_cgroup_nid(pc); | 372 | int zid = page_zonenum(page); |
378 | int zid = page_cgroup_zid(pc); | ||
379 | |||
380 | if (!mem) | ||
381 | return NULL; | ||
382 | 373 | ||
383 | return mem_cgroup_zoneinfo(mem, nid, zid); | 374 | return mem_cgroup_zoneinfo(mem, nid, zid); |
384 | } | 375 | } |
@@ -504,11 +495,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | |||
504 | } | 495 | } |
505 | } | 496 | } |
506 | 497 | ||
507 | static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) | ||
508 | { | ||
509 | return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; | ||
510 | } | ||
511 | |||
512 | static struct mem_cgroup_per_zone * | 498 | static struct mem_cgroup_per_zone * |
513 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 499 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) |
514 | { | 500 | { |
@@ -565,11 +551,11 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
565 | * common workload, threashold and synchonization as vmstat[] should be | 551 | * common workload, threashold and synchonization as vmstat[] should be |
566 | * implemented. | 552 | * implemented. |
567 | */ | 553 | */ |
568 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | 554 | static long mem_cgroup_read_stat(struct mem_cgroup *mem, |
569 | enum mem_cgroup_stat_index idx) | 555 | enum mem_cgroup_stat_index idx) |
570 | { | 556 | { |
557 | long val = 0; | ||
571 | int cpu; | 558 | int cpu; |
572 | s64 val = 0; | ||
573 | 559 | ||
574 | get_online_cpus(); | 560 | get_online_cpus(); |
575 | for_each_online_cpu(cpu) | 561 | for_each_online_cpu(cpu) |
@@ -583,9 +569,9 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | |||
583 | return val; | 569 | return val; |
584 | } | 570 | } |
585 | 571 | ||
586 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | 572 | static long mem_cgroup_local_usage(struct mem_cgroup *mem) |
587 | { | 573 | { |
588 | s64 ret; | 574 | long ret; |
589 | 575 | ||
590 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | 576 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
591 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | 577 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
@@ -599,6 +585,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | |||
599 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 585 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
600 | } | 586 | } |
601 | 587 | ||
588 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, | ||
589 | enum mem_cgroup_events_index idx) | ||
590 | { | ||
591 | unsigned long val = 0; | ||
592 | int cpu; | ||
593 | |||
594 | for_each_online_cpu(cpu) | ||
595 | val += per_cpu(mem->stat->events[idx], cpu); | ||
596 | #ifdef CONFIG_HOTPLUG_CPU | ||
597 | spin_lock(&mem->pcp_counter_lock); | ||
598 | val += mem->nocpu_base.events[idx]; | ||
599 | spin_unlock(&mem->pcp_counter_lock); | ||
600 | #endif | ||
601 | return val; | ||
602 | } | ||
603 | |||
602 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 604 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
603 | bool file, int nr_pages) | 605 | bool file, int nr_pages) |
604 | { | 606 | { |
@@ -611,13 +613,13 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
611 | 613 | ||
612 | /* pagein of a big page is an event. So, ignore page size */ | 614 | /* pagein of a big page is an event. So, ignore page size */ |
613 | if (nr_pages > 0) | 615 | if (nr_pages > 0) |
614 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); | 616 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
615 | else { | 617 | else { |
616 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); | 618 | __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); |
617 | nr_pages = -nr_pages; /* for event */ | 619 | nr_pages = -nr_pages; /* for event */ |
618 | } | 620 | } |
619 | 621 | ||
620 | __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); | 622 | __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); |
621 | 623 | ||
622 | preempt_enable(); | 624 | preempt_enable(); |
623 | } | 625 | } |
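
mem_cgroup_charge_statistics() now splits its bookkeeping between the stat and event arrays: a THP charge still counts as a single PGPGIN/PGPGOUT event, while the running MEM_CGROUP_EVENTS_COUNT advances by the number of base pages, with the sign of nr_pages encoding charge versus uncharge. A simplified single-threaded model of that accounting (the kernel uses per-cpu __this_cpu_* updates under disabled preemption):

#include <stdio.h>

enum { EV_PGPGIN, EV_PGPGOUT, EV_COUNT, EV_NSTATS };

static unsigned long ev[EV_NSTATS];

/* nr_pages > 0: charge (pagein), nr_pages < 0: uncharge (pageout) */
static void charge_statistics(int nr_pages)
{
        if (nr_pages > 0)
                ev[EV_PGPGIN]++;            /* one event per charge, even for THP */
        else {
                ev[EV_PGPGOUT]++;
                nr_pages = -nr_pages;       /* the event counter only moves forward */
        }
        ev[EV_COUNT] += nr_pages;           /* advances by base pages, drives targets */
}

int main(void)
{
        charge_statistics(1);      /* regular page in   */
        charge_statistics(512);    /* 2MB huge page in  */
        charge_statistics(-1);     /* regular page out  */
        printf("pgpgin=%lu pgpgout=%lu events=%lu\n",
               ev[EV_PGPGIN], ev[EV_PGPGOUT], ev[EV_COUNT]);
        return 0;
}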
@@ -637,13 +639,34 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
637 | return total; | 639 | return total; |
638 | } | 640 | } |
639 | 641 | ||
640 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | 642 | static bool __memcg_event_check(struct mem_cgroup *mem, int target) |
641 | { | 643 | { |
642 | s64 val; | 644 | unsigned long val, next; |
643 | 645 | ||
644 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | 646 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
647 | next = this_cpu_read(mem->stat->targets[target]); | ||
648 | /* from time_after() in jiffies.h */ | ||
649 | return ((long)next - (long)val < 0); | ||
650 | } | ||
645 | 651 | ||
646 | return !(val & ((1 << event_mask_shift) - 1)); | 652 | static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) |
653 | { | ||
654 | unsigned long val, next; | ||
655 | |||
656 | val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); | ||
657 | |||
658 | switch (target) { | ||
659 | case MEM_CGROUP_TARGET_THRESH: | ||
660 | next = val + THRESHOLDS_EVENTS_TARGET; | ||
661 | break; | ||
662 | case MEM_CGROUP_TARGET_SOFTLIMIT: | ||
663 | next = val + SOFTLIMIT_EVENTS_TARGET; | ||
664 | break; | ||
665 | default: | ||
666 | return; | ||
667 | } | ||
668 | |||
669 | this_cpu_write(mem->stat->targets[target], next); | ||
647 | } | 670 | } |
648 | 671 | ||
649 | /* | 672 | /* |
@@ -653,10 +676,15 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | |||
653 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | 676 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) |
654 | { | 677 | { |
655 | /* threshold event is triggered in finer grain than soft limit */ | 678 | /* threshold event is triggered in finer grain than soft limit */ |
656 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | 679 | if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { |
657 | mem_cgroup_threshold(mem); | 680 | mem_cgroup_threshold(mem); |
658 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | 681 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); |
682 | if (unlikely(__memcg_event_check(mem, | ||
683 | MEM_CGROUP_TARGET_SOFTLIMIT))){ | ||
659 | mem_cgroup_update_tree(mem, page); | 684 | mem_cgroup_update_tree(mem, page); |
685 | __mem_cgroup_target_update(mem, | ||
686 | MEM_CGROUP_TARGET_SOFTLIMIT); | ||
687 | } | ||
660 | } | 688 | } |
661 | } | 689 | } |
662 | 690 | ||
@@ -815,7 +843,7 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | |||
815 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 843 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
816 | * removed from global LRU. | 844 | * removed from global LRU. |
817 | */ | 845 | */ |
818 | mz = page_cgroup_zoneinfo(pc); | 846 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
819 | /* huge page split is done under lru_lock. so, we have no races. */ | 847 | /* huge page split is done under lru_lock. so, we have no races. */ |
820 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | 848 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); |
821 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 849 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
@@ -851,7 +879,7 @@ void mem_cgroup_rotate_reclaimable_page(struct page *page) | |||
851 | smp_rmb(); | 879 | smp_rmb(); |
852 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 880 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
853 | return; | 881 | return; |
854 | mz = page_cgroup_zoneinfo(pc); | 882 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
855 | list_move_tail(&pc->lru, &mz->lists[lru]); | 883 | list_move_tail(&pc->lru, &mz->lists[lru]); |
856 | } | 884 | } |
857 | 885 | ||
@@ -871,7 +899,7 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
871 | smp_rmb(); | 899 | smp_rmb(); |
872 | if (mem_cgroup_is_root(pc->mem_cgroup)) | 900 | if (mem_cgroup_is_root(pc->mem_cgroup)) |
873 | return; | 901 | return; |
874 | mz = page_cgroup_zoneinfo(pc); | 902 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
875 | list_move(&pc->lru, &mz->lists[lru]); | 903 | list_move(&pc->lru, &mz->lists[lru]); |
876 | } | 904 | } |
877 | 905 | ||
@@ -888,7 +916,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
888 | return; | 916 | return; |
889 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 917 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
890 | smp_rmb(); | 918 | smp_rmb(); |
891 | mz = page_cgroup_zoneinfo(pc); | 919 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
892 | /* huge page split is done under lru_lock. so, we have no races. */ | 920 | /* huge page split is done under lru_lock. so, we have no races. */ |
893 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 921 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); |
894 | SetPageCgroupAcctLRU(pc); | 922 | SetPageCgroupAcctLRU(pc); |
@@ -898,18 +926,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
898 | } | 926 | } |
899 | 927 | ||
900 | /* | 928 | /* |
901 | * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to | 929 | * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed |
902 | * lru because the page may.be reused after it's fully uncharged (because of | 930 | * while it's linked to lru because the page may be reused after it's fully |
903 | * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge | 931 | * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. |
904 | * it again. This function is only used to charge SwapCache. It's done under | 932 | * It's done under lock_page and expected that zone->lru_lock is never held. |
905 | * lock_page and expected that zone->lru_lock is never held. | ||
906 | */ | 933 | */ |
907 | static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) | 934 | static void mem_cgroup_lru_del_before_commit(struct page *page) |
908 | { | 935 | { |
909 | unsigned long flags; | 936 | unsigned long flags; |
910 | struct zone *zone = page_zone(page); | 937 | struct zone *zone = page_zone(page); |
911 | struct page_cgroup *pc = lookup_page_cgroup(page); | 938 | struct page_cgroup *pc = lookup_page_cgroup(page); |
912 | 939 | ||
940 | /* | ||
941 | * Doing this check without taking ->lru_lock seems wrong but this | ||
942 | * is safe, because if page_cgroup's USED bit is unset, the page | ||
943 | * will not be added to any memcg's LRU. If page_cgroup's USED bit is | ||
944 | * set, the commit after this will fail, anyway. | ||
945 | * This all charge/uncharge is done under some mutual exclusion. | ||
946 | * So, we don't need to take care of changes in USED bit. | ||
947 | */ | ||
948 | if (likely(!PageLRU(page))) | ||
949 | return; | ||
950 | |||
913 | spin_lock_irqsave(&zone->lru_lock, flags); | 951 | spin_lock_irqsave(&zone->lru_lock, flags); |
914 | /* | 952 | /* |
915 | * Forget old LRU when this page_cgroup is *not* used. This Used bit | 953 | * Forget old LRU when this page_cgroup is *not* used. This Used bit |
@@ -920,12 +958,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) | |||
920 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 958 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
921 | } | 959 | } |
922 | 960 | ||
923 | static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | 961 | static void mem_cgroup_lru_add_after_commit(struct page *page) |
924 | { | 962 | { |
925 | unsigned long flags; | 963 | unsigned long flags; |
926 | struct zone *zone = page_zone(page); | 964 | struct zone *zone = page_zone(page); |
927 | struct page_cgroup *pc = lookup_page_cgroup(page); | 965 | struct page_cgroup *pc = lookup_page_cgroup(page); |
928 | 966 | ||
967 | /* take care of the case where the page is added to LRU while we commit it */ | ||
968 | if (likely(!PageLRU(page))) | ||
969 | return; | ||
929 | spin_lock_irqsave(&zone->lru_lock, flags); | 970 | spin_lock_irqsave(&zone->lru_lock, flags); |
930 | /* link when the page is linked to LRU but page_cgroup isn't */ | 971 | /* link when the page is linked to LRU but page_cgroup isn't */ |
931 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) | 972 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
@@ -1058,10 +1099,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1058 | return NULL; | 1099 | return NULL; |
1059 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1100 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
1060 | smp_rmb(); | 1101 | smp_rmb(); |
1061 | mz = page_cgroup_zoneinfo(pc); | 1102 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
1062 | if (!mz) | ||
1063 | return NULL; | ||
1064 | |||
1065 | return &mz->reclaim_stat; | 1103 | return &mz->reclaim_stat; |
1066 | } | 1104 | } |
1067 | 1105 | ||
@@ -1093,9 +1131,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1093 | if (scan >= nr_to_scan) | 1131 | if (scan >= nr_to_scan) |
1094 | break; | 1132 | break; |
1095 | 1133 | ||
1096 | page = pc->page; | ||
1097 | if (unlikely(!PageCgroupUsed(pc))) | 1134 | if (unlikely(!PageCgroupUsed(pc))) |
1098 | continue; | 1135 | continue; |
1136 | |||
1137 | page = lookup_cgroup_page(pc); | ||
1138 | |||
1099 | if (unlikely(!PageLRU(page))) | 1139 | if (unlikely(!PageLRU(page))) |
1100 | continue; | 1140 | continue; |
1101 | 1141 | ||
@@ -1127,49 +1167,32 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
1127 | #define mem_cgroup_from_res_counter(counter, member) \ | 1167 | #define mem_cgroup_from_res_counter(counter, member) \ |
1128 | container_of(counter, struct mem_cgroup, member) | 1168 | container_of(counter, struct mem_cgroup, member) |
1129 | 1169 | ||
1130 | static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) | ||
1131 | { | ||
1132 | if (do_swap_account) { | ||
1133 | if (res_counter_check_under_limit(&mem->res) && | ||
1134 | res_counter_check_under_limit(&mem->memsw)) | ||
1135 | return true; | ||
1136 | } else | ||
1137 | if (res_counter_check_under_limit(&mem->res)) | ||
1138 | return true; | ||
1139 | return false; | ||
1140 | } | ||
1141 | |||
1142 | /** | 1170 | /** |
1143 | * mem_cgroup_check_margin - check if the memory cgroup allows charging | 1171 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1144 | * @mem: memory cgroup to check | 1172 | * @mem: the memory cgroup |
1145 | * @bytes: the number of bytes the caller intends to charge | ||
1146 | * | 1173 | * |
1147 | * Returns a boolean value on whether @mem can be charged @bytes or | 1174 | * Returns the maximum amount of memory @mem can be charged with, in |
1148 | * whether this would exceed the limit. | 1175 | * pages. |
1149 | */ | 1176 | */ |
1150 | static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) | 1177 | static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) |
1151 | { | 1178 | { |
1152 | if (!res_counter_check_margin(&mem->res, bytes)) | 1179 | unsigned long long margin; |
1153 | return false; | 1180 | |
1154 | if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) | 1181 | margin = res_counter_margin(&mem->res); |
1155 | return false; | 1182 | if (do_swap_account) |
1156 | return true; | 1183 | margin = min(margin, res_counter_margin(&mem->memsw)); |
1184 | return margin >> PAGE_SHIFT; | ||
1157 | } | 1185 | } |
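
mem_cgroup_check_under_limit() and mem_cgroup_check_margin() collapse into mem_cgroup_margin(), which reports headroom in pages rather than a yes/no answer, taking the smaller of the memory and mem+swap margins. A worked example of the arithmetic, with plain variables standing in for the res_counter and res_counter_margin():

#include <stdio.h>

#define PAGE_SHIFT 12

/* modelled on res_counter_margin(): bytes left until the limit */
static unsigned long long counter_margin(unsigned long long limit,
                                         unsigned long long usage)
{
        return usage < limit ? limit - usage : 0;
}

int main(void)
{
        unsigned long long mem_limit   = 100ULL << 20, mem_usage   = 97ULL << 20;
        unsigned long long memsw_limit = 110ULL << 20, memsw_usage = 109ULL << 20;
        int do_swap_account = 1;

        unsigned long long margin = counter_margin(mem_limit, mem_usage);

        if (do_swap_account) {
                unsigned long long m = counter_margin(memsw_limit, memsw_usage);
                if (m < margin)
                        margin = m;         /* mem+swap is the tighter limit here */
        }
        /* 1MB of mem+swap headroom caps the chargeable amount at 256 pages */
        printf("chargeable: %llu pages\n", margin >> PAGE_SHIFT);
        return 0;
}

Returning a page count lets callers such as mem_cgroup_do_charge() compare the margin directly against nr_pages instead of re-encoding the request as bytes.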
1158 | 1186 | ||
1159 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1187 | static unsigned int get_swappiness(struct mem_cgroup *memcg) |
1160 | { | 1188 | { |
1161 | struct cgroup *cgrp = memcg->css.cgroup; | 1189 | struct cgroup *cgrp = memcg->css.cgroup; |
1162 | unsigned int swappiness; | ||
1163 | 1190 | ||
1164 | /* root ? */ | 1191 | /* root ? */ |
1165 | if (cgrp->parent == NULL) | 1192 | if (cgrp->parent == NULL) |
1166 | return vm_swappiness; | 1193 | return vm_swappiness; |
1167 | 1194 | ||
1168 | spin_lock(&memcg->reclaim_param_lock); | 1195 | return memcg->swappiness; |
1169 | swappiness = memcg->swappiness; | ||
1170 | spin_unlock(&memcg->reclaim_param_lock); | ||
1171 | |||
1172 | return swappiness; | ||
1173 | } | 1196 | } |
1174 | 1197 | ||
1175 | static void mem_cgroup_start_move(struct mem_cgroup *mem) | 1198 | static void mem_cgroup_start_move(struct mem_cgroup *mem) |
@@ -1385,13 +1408,11 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1385 | 1408 | ||
1386 | rcu_read_unlock(); | 1409 | rcu_read_unlock(); |
1387 | /* Updates scanning parameter */ | 1410 | /* Updates scanning parameter */ |
1388 | spin_lock(&root_mem->reclaim_param_lock); | ||
1389 | if (!css) { | 1411 | if (!css) { |
1390 | /* this means start scan from ID:1 */ | 1412 | /* this means start scan from ID:1 */ |
1391 | root_mem->last_scanned_child = 0; | 1413 | root_mem->last_scanned_child = 0; |
1392 | } else | 1414 | } else |
1393 | root_mem->last_scanned_child = found; | 1415 | root_mem->last_scanned_child = found; |
1394 | spin_unlock(&root_mem->reclaim_param_lock); | ||
1395 | } | 1416 | } |
1396 | 1417 | ||
1397 | return ret; | 1418 | return ret; |
@@ -1420,7 +1441,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1420 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | 1441 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; |
1421 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | 1442 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; |
1422 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | 1443 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; |
1423 | unsigned long excess = mem_cgroup_get_excess(root_mem); | 1444 | unsigned long excess; |
1445 | |||
1446 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | ||
1424 | 1447 | ||
1425 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1448 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1426 | if (root_mem->memsw_is_minimum) | 1449 | if (root_mem->memsw_is_minimum) |
@@ -1477,9 +1500,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1477 | return ret; | 1500 | return ret; |
1478 | total += ret; | 1501 | total += ret; |
1479 | if (check_soft) { | 1502 | if (check_soft) { |
1480 | if (res_counter_check_under_soft_limit(&root_mem->res)) | 1503 | if (!res_counter_soft_limit_excess(&root_mem->res)) |
1481 | return total; | 1504 | return total; |
1482 | } else if (mem_cgroup_check_under_limit(root_mem)) | 1505 | } else if (mem_cgroup_margin(root_mem)) |
1483 | return 1 + total; | 1506 | return 1 + total; |
1484 | } | 1507 | } |
1485 | return total; | 1508 | return total; |
@@ -1687,17 +1710,17 @@ EXPORT_SYMBOL(mem_cgroup_update_page_stat); | |||
1687 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1710 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
1688 | * TODO: maybe necessary to use big numbers in big irons. | 1711 | * TODO: maybe necessary to use big numbers in big irons. |
1689 | */ | 1712 | */ |
1690 | #define CHARGE_SIZE (32 * PAGE_SIZE) | 1713 | #define CHARGE_BATCH 32U |
1691 | struct memcg_stock_pcp { | 1714 | struct memcg_stock_pcp { |
1692 | struct mem_cgroup *cached; /* this never be root cgroup */ | 1715 | struct mem_cgroup *cached; /* this never be root cgroup */ |
1693 | int charge; | 1716 | unsigned int nr_pages; |
1694 | struct work_struct work; | 1717 | struct work_struct work; |
1695 | }; | 1718 | }; |
1696 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 1719 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1697 | static atomic_t memcg_drain_count; | 1720 | static atomic_t memcg_drain_count; |
1698 | 1721 | ||
1699 | /* | 1722 | /* |
1700 | * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed | 1723 | * Try to consume stocked charge on this cpu. If success, one page is consumed |
1701 | * from local stock and true is returned. If the stock is 0 or charges from a | 1724 | * from local stock and true is returned. If the stock is 0 or charges from a |
1702 | * cgroup which is not current target, returns false. This stock will be | 1725 | * cgroup which is not current target, returns false. This stock will be |
1703 | * refilled. | 1726 | * refilled. |
@@ -1708,8 +1731,8 @@ static bool consume_stock(struct mem_cgroup *mem) | |||
1708 | bool ret = true; | 1731 | bool ret = true; |
1709 | 1732 | ||
1710 | stock = &get_cpu_var(memcg_stock); | 1733 | stock = &get_cpu_var(memcg_stock); |
1711 | if (mem == stock->cached && stock->charge) | 1734 | if (mem == stock->cached && stock->nr_pages) |
1712 | stock->charge -= PAGE_SIZE; | 1735 | stock->nr_pages--; |
1713 | else /* need to call res_counter_charge */ | 1736 | else /* need to call res_counter_charge */ |
1714 | ret = false; | 1737 | ret = false; |
1715 | put_cpu_var(memcg_stock); | 1738 | put_cpu_var(memcg_stock); |
@@ -1723,13 +1746,15 @@ static void drain_stock(struct memcg_stock_pcp *stock) | |||
1723 | { | 1746 | { |
1724 | struct mem_cgroup *old = stock->cached; | 1747 | struct mem_cgroup *old = stock->cached; |
1725 | 1748 | ||
1726 | if (stock->charge) { | 1749 | if (stock->nr_pages) { |
1727 | res_counter_uncharge(&old->res, stock->charge); | 1750 | unsigned long bytes = stock->nr_pages * PAGE_SIZE; |
1751 | |||
1752 | res_counter_uncharge(&old->res, bytes); | ||
1728 | if (do_swap_account) | 1753 | if (do_swap_account) |
1729 | res_counter_uncharge(&old->memsw, stock->charge); | 1754 | res_counter_uncharge(&old->memsw, bytes); |
1755 | stock->nr_pages = 0; | ||
1730 | } | 1756 | } |
1731 | stock->cached = NULL; | 1757 | stock->cached = NULL; |
1732 | stock->charge = 0; | ||
1733 | } | 1758 | } |
1734 | 1759 | ||
1735 | /* | 1760 | /* |
@@ -1746,7 +1771,7 @@ static void drain_local_stock(struct work_struct *dummy) | |||
1746 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 1771 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
1747 | * This will be consumed by consume_stock() function, later. | 1772 | * This will be consumed by consume_stock() function, later. |
1748 | */ | 1773 | */ |
1749 | static void refill_stock(struct mem_cgroup *mem, int val) | 1774 | static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) |
1750 | { | 1775 | { |
1751 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | 1776 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); |
1752 | 1777 | ||
@@ -1754,7 +1779,7 @@ static void refill_stock(struct mem_cgroup *mem, int val) | |||
1754 | drain_stock(stock); | 1779 | drain_stock(stock); |
1755 | stock->cached = mem; | 1780 | stock->cached = mem; |
1756 | } | 1781 | } |
1757 | stock->charge += val; | 1782 | stock->nr_pages += nr_pages; |
1758 | put_cpu_var(memcg_stock); | 1783 | put_cpu_var(memcg_stock); |
1759 | } | 1784 | } |
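
The per-cpu stock now counts pages instead of bytes: consume_stock() hands out one page at a time, refill_stock() parks whatever a batched charge did not use, and drain_stock() converts the remainder back to bytes when returning it to the res_counter. A single-CPU userspace model of that flow (the struct and the memcg identity are stand-ins; the kernel uses get_cpu_var()/put_cpu_var() and a real res_counter):

#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096

struct stock { const char *cached; unsigned int nr_pages; };

static struct stock stock;                  /* one per CPU in the kernel */
static unsigned long long res_usage;        /* stands in for the res_counter */

static bool consume_stock(const char *memcg)
{
        if (memcg == stock.cached && stock.nr_pages) {
                stock.nr_pages--;           /* one page handed out locally */
                return true;
        }
        return false;                       /* caller must charge the res_counter */
}

static void refill_stock(const char *memcg, unsigned int nr_pages)
{
        if (stock.cached != memcg) {
                /* drain_stock(): give the old memcg's pages back as bytes */
                res_usage -= (unsigned long long)stock.nr_pages * PAGE_SIZE;
                stock.nr_pages = 0;
                stock.cached = memcg;
        }
        stock.nr_pages += nr_pages;
}

int main(void)
{
        const char *memcg_a = "A";

        res_usage = 32ULL * PAGE_SIZE;      /* a CHARGE_BATCH worth was charged */
        refill_stock(memcg_a, 31);          /* caller needed 1 page, 31 are parked */
        printf("stock: %u pages\n", stock.nr_pages);
        printf("consume: %d, stock now %u pages\n",
               consume_stock(memcg_a), stock.nr_pages);
        return 0;
}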
1760 | 1785 | ||
@@ -1806,11 +1831,17 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) | |||
1806 | 1831 | ||
1807 | spin_lock(&mem->pcp_counter_lock); | 1832 | spin_lock(&mem->pcp_counter_lock); |
1808 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | 1833 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { |
1809 | s64 x = per_cpu(mem->stat->count[i], cpu); | 1834 | long x = per_cpu(mem->stat->count[i], cpu); |
1810 | 1835 | ||
1811 | per_cpu(mem->stat->count[i], cpu) = 0; | 1836 | per_cpu(mem->stat->count[i], cpu) = 0; |
1812 | mem->nocpu_base.count[i] += x; | 1837 | mem->nocpu_base.count[i] += x; |
1813 | } | 1838 | } |
1839 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | ||
1840 | unsigned long x = per_cpu(mem->stat->events[i], cpu); | ||
1841 | |||
1842 | per_cpu(mem->stat->events[i], cpu) = 0; | ||
1843 | mem->nocpu_base.events[i] += x; | ||
1844 | } | ||
1814 | /* need to clear ON_MOVE value, works as a kind of lock. */ | 1845 | /* need to clear ON_MOVE value, works as a kind of lock. */ |
1815 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | 1846 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; |
1816 | spin_unlock(&mem->pcp_counter_lock); | 1847 | spin_unlock(&mem->pcp_counter_lock); |
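
The hotplug drain now folds the per-cpu event array into nocpu_base alongside the stat counters, and mem_cgroup_read_events() adds that base back when summing the online CPUs, so nothing is lost when a CPU goes away. A small model of the fold-and-read pattern, with a fixed CPU count for illustration:

#include <stdio.h>

#define NCPUS 4
enum { EV_PGPGIN, EV_PGPGOUT, EV_COUNT, EV_NSTATS };

static unsigned long percpu_events[NCPUS][EV_NSTATS];
static unsigned long nocpu_base[EV_NSTATS];        /* events from offlined CPUs */
static int cpu_online[NCPUS] = { 1, 1, 1, 1 };

/* like mem_cgroup_drain_pcp_counter(): fold a dying CPU's events into the base */
static void drain_pcp_counter(int cpu)
{
        for (int i = 0; i < EV_NSTATS; i++) {
                nocpu_base[i] += percpu_events[cpu][i];
                percpu_events[cpu][i] = 0;
        }
        cpu_online[cpu] = 0;
}

/* like mem_cgroup_read_events(): sum the online CPUs, then add the folded base */
static unsigned long read_events(int idx)
{
        unsigned long val = nocpu_base[idx];

        for (int cpu = 0; cpu < NCPUS; cpu++)
                if (cpu_online[cpu])
                        val += percpu_events[cpu][idx];
        return val;
}

int main(void)
{
        percpu_events[0][EV_PGPGIN] = 10;
        percpu_events[3][EV_PGPGIN] = 5;
        drain_pcp_counter(3);                              /* CPU 3 goes offline */
        printf("pgpgin = %lu\n", read_events(EV_PGPGIN));  /* still 15 */
        return 0;
}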
@@ -1860,9 +1891,10 @@ enum { | |||
1860 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ | 1891 | CHARGE_OOM_DIE, /* the current is killed because of OOM */ |
1861 | }; | 1892 | }; |
1862 | 1893 | ||
1863 | static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | 1894 | static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, |
1864 | int csize, bool oom_check) | 1895 | unsigned int nr_pages, bool oom_check) |
1865 | { | 1896 | { |
1897 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
1866 | struct mem_cgroup *mem_over_limit; | 1898 | struct mem_cgroup *mem_over_limit; |
1867 | struct res_counter *fail_res; | 1899 | struct res_counter *fail_res; |
1868 | unsigned long flags = 0; | 1900 | unsigned long flags = 0; |
@@ -1883,14 +1915,13 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1883 | } else | 1915 | } else |
1884 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 1916 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
1885 | /* | 1917 | /* |
1886 | * csize can be either a huge page (HPAGE_SIZE), a batch of | 1918 | * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch |
1887 | * regular pages (CHARGE_SIZE), or a single regular page | 1919 | * of regular pages (CHARGE_BATCH), or a single regular page (1). |
1888 | * (PAGE_SIZE). | ||
1889 | * | 1920 | * |
1890 | * Never reclaim on behalf of optional batching, retry with a | 1921 | * Never reclaim on behalf of optional batching, retry with a |
1891 | * single page instead. | 1922 | * single page instead. |
1892 | */ | 1923 | */ |
1893 | if (csize == CHARGE_SIZE) | 1924 | if (nr_pages == CHARGE_BATCH) |
1894 | return CHARGE_RETRY; | 1925 | return CHARGE_RETRY; |
1895 | 1926 | ||
1896 | if (!(gfp_mask & __GFP_WAIT)) | 1927 | if (!(gfp_mask & __GFP_WAIT)) |
@@ -1898,7 +1929,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1898 | 1929 | ||
1899 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | 1930 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1900 | gfp_mask, flags); | 1931 | gfp_mask, flags); |
1901 | if (mem_cgroup_check_margin(mem_over_limit, csize)) | 1932 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
1902 | return CHARGE_RETRY; | 1933 | return CHARGE_RETRY; |
1903 | /* | 1934 | /* |
1904 | * Even though the limit is exceeded at this point, reclaim | 1935 | * Even though the limit is exceeded at this point, reclaim |
@@ -1909,7 +1940,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1909 | * unlikely to succeed so close to the limit, and we fall back | 1940 | * unlikely to succeed so close to the limit, and we fall back |
1910 | * to regular pages anyway in case of failure. | 1941 | * to regular pages anyway in case of failure. |
1911 | */ | 1942 | */ |
1912 | if (csize == PAGE_SIZE && ret) | 1943 | if (nr_pages == 1 && ret) |
1913 | return CHARGE_RETRY; | 1944 | return CHARGE_RETRY; |
1914 | 1945 | ||
1915 | /* | 1946 | /* |
@@ -1935,13 +1966,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, | |||
1935 | */ | 1966 | */ |
1936 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1967 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1937 | gfp_t gfp_mask, | 1968 | gfp_t gfp_mask, |
1938 | struct mem_cgroup **memcg, bool oom, | 1969 | unsigned int nr_pages, |
1939 | int page_size) | 1970 | struct mem_cgroup **memcg, |
1971 | bool oom) | ||
1940 | { | 1972 | { |
1973 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | ||
1941 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1974 | int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1942 | struct mem_cgroup *mem = NULL; | 1975 | struct mem_cgroup *mem = NULL; |
1943 | int ret; | 1976 | int ret; |
1944 | int csize = max(CHARGE_SIZE, (unsigned long) page_size); | ||
1945 | 1977 | ||
1946 | /* | 1978 | /* |
1947 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage | 1979 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage |
@@ -1966,7 +1998,7 @@ again: | |||
1966 | VM_BUG_ON(css_is_removed(&mem->css)); | 1998 | VM_BUG_ON(css_is_removed(&mem->css)); |
1967 | if (mem_cgroup_is_root(mem)) | 1999 | if (mem_cgroup_is_root(mem)) |
1968 | goto done; | 2000 | goto done; |
1969 | if (page_size == PAGE_SIZE && consume_stock(mem)) | 2001 | if (nr_pages == 1 && consume_stock(mem)) |
1970 | goto done; | 2002 | goto done; |
1971 | css_get(&mem->css); | 2003 | css_get(&mem->css); |
1972 | } else { | 2004 | } else { |
@@ -1989,7 +2021,7 @@ again: | |||
1989 | rcu_read_unlock(); | 2021 | rcu_read_unlock(); |
1990 | goto done; | 2022 | goto done; |
1991 | } | 2023 | } |
1992 | if (page_size == PAGE_SIZE && consume_stock(mem)) { | 2024 | if (nr_pages == 1 && consume_stock(mem)) { |
1993 | /* | 2025 | /* |
1994 | * It seems dagerous to access memcg without css_get(). | 2026 | * It seems dagerous to access memcg without css_get(). |
1995 | * But considering how consume_stok works, it's not | 2027 | * But considering how consume_stok works, it's not |
@@ -2024,13 +2056,12 @@ again: | |||
2024 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2056 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2025 | } | 2057 | } |
2026 | 2058 | ||
2027 | ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); | 2059 | ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); |
2028 | |||
2029 | switch (ret) { | 2060 | switch (ret) { |
2030 | case CHARGE_OK: | 2061 | case CHARGE_OK: |
2031 | break; | 2062 | break; |
2032 | case CHARGE_RETRY: /* not in OOM situation but retry */ | 2063 | case CHARGE_RETRY: /* not in OOM situation but retry */ |
2033 | csize = page_size; | 2064 | batch = nr_pages; |
2034 | css_put(&mem->css); | 2065 | css_put(&mem->css); |
2035 | mem = NULL; | 2066 | mem = NULL; |
2036 | goto again; | 2067 | goto again; |
@@ -2051,8 +2082,8 @@ again: | |||
2051 | } | 2082 | } |
2052 | } while (ret != CHARGE_OK); | 2083 | } while (ret != CHARGE_OK); |
2053 | 2084 | ||
2054 | if (csize > page_size) | 2085 | if (batch > nr_pages) |
2055 | refill_stock(mem, csize - page_size); | 2086 | refill_stock(mem, batch - nr_pages); |
2056 | css_put(&mem->css); | 2087 | css_put(&mem->css); |
2057 | done: | 2088 | done: |
2058 | *memcg = mem; | 2089 | *memcg = mem; |
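
__mem_cgroup_try_charge() now takes nr_pages and sizes the request as max(CHARGE_BATCH, nr_pages); once the charge succeeds, the surplus beyond what the caller asked for is handed to refill_stock(), so subsequent single-page charges on this CPU can bypass the res_counter. The arithmetic, reduced to a toy program:

#include <stdio.h>

#define CHARGE_BATCH 32U

static unsigned int max_u(unsigned int a, unsigned int b)
{
        return a > b ? a : b;
}

int main(void)
{
        unsigned int requests[] = { 1, 512 };   /* a regular page, a 2MB THP */

        for (int i = 0; i < 2; i++) {
                unsigned int nr_pages = requests[i];
                unsigned int batch = max_u(CHARGE_BATCH, nr_pages);

                /* on success the surplus goes to the per-cpu stock */
                printf("request %3u page(s): charge %3u, refill stock with %u\n",
                       nr_pages, batch, batch - nr_pages);
        }
        return 0;
}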
@@ -2071,21 +2102,17 @@ bypass: | |||
2071 | * gotten by try_charge(). | 2102 | * gotten by try_charge(). |
2072 | */ | 2103 | */ |
2073 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | 2104 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
2074 | unsigned long count) | 2105 | unsigned int nr_pages) |
2075 | { | 2106 | { |
2076 | if (!mem_cgroup_is_root(mem)) { | 2107 | if (!mem_cgroup_is_root(mem)) { |
2077 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | 2108 | unsigned long bytes = nr_pages * PAGE_SIZE; |
2109 | |||
2110 | res_counter_uncharge(&mem->res, bytes); | ||
2078 | if (do_swap_account) | 2111 | if (do_swap_account) |
2079 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); | 2112 | res_counter_uncharge(&mem->memsw, bytes); |
2080 | } | 2113 | } |
2081 | } | 2114 | } |
2082 | 2115 | ||
2083 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, | ||
2084 | int page_size) | ||
2085 | { | ||
2086 | __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); | ||
2087 | } | ||
2088 | |||
2089 | /* | 2116 | /* |
2090 | * A helper function to get mem_cgroup from ID. must be called under | 2117 | * A helper function to get mem_cgroup from ID. must be called under |
2091 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 2118 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
@@ -2134,20 +2161,15 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2134 | } | 2161 | } |
2135 | 2162 | ||
2136 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | 2163 | static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, |
2164 | struct page *page, | ||
2165 | unsigned int nr_pages, | ||
2137 | struct page_cgroup *pc, | 2166 | struct page_cgroup *pc, |
2138 | enum charge_type ctype, | 2167 | enum charge_type ctype) |
2139 | int page_size) | ||
2140 | { | 2168 | { |
2141 | int nr_pages = page_size >> PAGE_SHIFT; | ||
2142 | |||
2143 | /* try_charge() can return NULL to *memcg, taking care of it. */ | ||
2144 | if (!mem) | ||
2145 | return; | ||
2146 | |||
2147 | lock_page_cgroup(pc); | 2169 | lock_page_cgroup(pc); |
2148 | if (unlikely(PageCgroupUsed(pc))) { | 2170 | if (unlikely(PageCgroupUsed(pc))) { |
2149 | unlock_page_cgroup(pc); | 2171 | unlock_page_cgroup(pc); |
2150 | mem_cgroup_cancel_charge(mem, page_size); | 2172 | __mem_cgroup_cancel_charge(mem, nr_pages); |
2151 | return; | 2173 | return; |
2152 | } | 2174 | } |
2153 | /* | 2175 | /* |
@@ -2184,7 +2206,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
2184 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2206 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
2185 | * if they exceeds softlimit. | 2207 | * if they exceeds softlimit. |
2186 | */ | 2208 | */ |
2187 | memcg_check_events(mem, pc->page); | 2209 | memcg_check_events(mem, page); |
2188 | } | 2210 | } |
2189 | 2211 | ||
2190 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2212 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -2221,7 +2243,7 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | |||
2221 | * We hold lru_lock, then, reduce counter directly. | 2243 | * We hold lru_lock, then, reduce counter directly. |
2222 | */ | 2244 | */ |
2223 | lru = page_lru(head); | 2245 | lru = page_lru(head); |
2224 | mz = page_cgroup_zoneinfo(head_pc); | 2246 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); |
2225 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 2247 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
2226 | } | 2248 | } |
2227 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | 2249 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
@@ -2230,7 +2252,9 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | |||
2230 | #endif | 2252 | #endif |
2231 | 2253 | ||
2232 | /** | 2254 | /** |
2233 | * __mem_cgroup_move_account - move account of the page | 2255 | * mem_cgroup_move_account - move account of the page |
2256 | * @page: the page | ||
2257 | * @nr_pages: number of regular pages (>1 for huge pages) | ||
2234 | * @pc: page_cgroup of the page. | 2258 | * @pc: page_cgroup of the page. |
2235 | * @from: mem_cgroup which the page is moved from. | 2259 | * @from: mem_cgroup which the page is moved from. |
2236 | * @to: mem_cgroup which the page is moved to. @from != @to. | 2260 | * @to: mem_cgroup which the page is moved to. @from != @to. |
@@ -2238,25 +2262,42 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | |||
2238 | * | 2262 | * |
2239 | * The caller must confirm following. | 2263 | * The caller must confirm following. |
2240 | * - page is not on LRU (isolate_page() is useful.) | 2264 | * - page is not on LRU (isolate_page() is useful.) |
2241 | * - the pc is locked, used, and ->mem_cgroup points to @from. | 2265 | * - compound_lock is held when nr_pages > 1 |
2242 | * | 2266 | * |
2243 | * This function doesn't do "charge" nor css_get to new cgroup. It should be | 2267 | * This function doesn't do "charge" nor css_get to new cgroup. It should be |
2244 | * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is | 2268 | * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is |
2245 | * true, this function does "uncharge" from old cgroup, but it doesn't if | 2269 | * true, this function does "uncharge" from old cgroup, but it doesn't if |
2246 | * @uncharge is false, so a caller should do "uncharge". | 2270 | * @uncharge is false, so a caller should do "uncharge". |
2247 | */ | 2271 | */ |
2248 | 2272 | static int mem_cgroup_move_account(struct page *page, | |
2249 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 2273 | unsigned int nr_pages, |
2250 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, | 2274 | struct page_cgroup *pc, |
2251 | int charge_size) | 2275 | struct mem_cgroup *from, |
2276 | struct mem_cgroup *to, | ||
2277 | bool uncharge) | ||
2252 | { | 2278 | { |
2253 | int nr_pages = charge_size >> PAGE_SHIFT; | 2279 | unsigned long flags; |
2280 | int ret; | ||
2254 | 2281 | ||
2255 | VM_BUG_ON(from == to); | 2282 | VM_BUG_ON(from == to); |
2256 | VM_BUG_ON(PageLRU(pc->page)); | 2283 | VM_BUG_ON(PageLRU(page)); |
2257 | VM_BUG_ON(!page_is_cgroup_locked(pc)); | 2284 | /* |
2258 | VM_BUG_ON(!PageCgroupUsed(pc)); | 2285 | * The page is isolated from LRU. So, collapse function |
2259 | VM_BUG_ON(pc->mem_cgroup != from); | 2286 | * will not handle this page. But page splitting can happen. |
2287 | * Do this check under compound_page_lock(). The caller should | ||
2288 | * hold it. | ||
2289 | */ | ||
2290 | ret = -EBUSY; | ||
2291 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
2292 | goto out; | ||
2293 | |||
2294 | lock_page_cgroup(pc); | ||
2295 | |||
2296 | ret = -EINVAL; | ||
2297 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | ||
2298 | goto unlock; | ||
2299 | |||
2300 | move_lock_page_cgroup(pc, &flags); | ||
2260 | 2301 | ||
2261 | if (PageCgroupFileMapped(pc)) { | 2302 | if (PageCgroupFileMapped(pc)) { |
2262 | /* Update mapped_file data for mem_cgroup */ | 2303 | /* Update mapped_file data for mem_cgroup */ |
@@ -2268,7 +2309,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2268 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); | 2309 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); |
2269 | if (uncharge) | 2310 | if (uncharge) |
2270 | /* This is not "cancel", but cancel_charge does all we need. */ | 2311 | /* This is not "cancel", but cancel_charge does all we need. */ |
2271 | mem_cgroup_cancel_charge(from, charge_size); | 2312 | __mem_cgroup_cancel_charge(from, nr_pages); |
2272 | 2313 | ||
2273 | /* caller should have done css_get */ | 2314 | /* caller should have done css_get */ |
2274 | pc->mem_cgroup = to; | 2315 | pc->mem_cgroup = to; |
@@ -2280,40 +2321,16 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
2280 | * garanteed that "to" is never removed. So, we don't check rmdir | 2321 | * garanteed that "to" is never removed. So, we don't check rmdir |
2281 | * status here. | 2322 | * status here. |
2282 | */ | 2323 | */ |
2283 | } | 2324 | move_unlock_page_cgroup(pc, &flags); |
2284 | 2325 | ret = 0; | |
2285 | /* | 2326 | unlock: |
2286 | * check whether the @pc is valid for moving account and call | ||
2287 | * __mem_cgroup_move_account() | ||
2288 | */ | ||
2289 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
2290 | struct mem_cgroup *from, struct mem_cgroup *to, | ||
2291 | bool uncharge, int charge_size) | ||
2292 | { | ||
2293 | int ret = -EINVAL; | ||
2294 | unsigned long flags; | ||
2295 | /* | ||
2296 | * The page is isolated from LRU. So, collapse function | ||
2297 | * will not handle this page. But page splitting can happen. | ||
2298 | * Do this check under compound_page_lock(). The caller should | ||
2299 | * hold it. | ||
2300 | */ | ||
2301 | if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) | ||
2302 | return -EBUSY; | ||
2303 | |||
2304 | lock_page_cgroup(pc); | ||
2305 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
2306 | move_lock_page_cgroup(pc, &flags); | ||
2307 | __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); | ||
2308 | move_unlock_page_cgroup(pc, &flags); | ||
2309 | ret = 0; | ||
2310 | } | ||
2311 | unlock_page_cgroup(pc); | 2327 | unlock_page_cgroup(pc); |
2312 | /* | 2328 | /* |
2313 | * check events | 2329 | * check events |
2314 | */ | 2330 | */ |
2315 | memcg_check_events(to, pc->page); | 2331 | memcg_check_events(to, page); |
2316 | memcg_check_events(from, pc->page); | 2332 | memcg_check_events(from, page); |
2333 | out: | ||
2317 | return ret; | 2334 | return ret; |
2318 | } | 2335 | } |
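
The old wrapper/worker split is gone: mem_cgroup_move_account() itself validates the page_cgroup under lock_page_cgroup() and move_lock_page_cgroup(), moves the statistics, optionally uncharges the old owner, and only then switches pc->mem_cgroup before running the event checks on both groups. The ordering matters, so here is a single-threaded model of it; the struct fields and the helper are stand-ins, not the kernel types:

#include <stdio.h>
#include <stdbool.h>

struct memcg { const char *name; long rss; };
struct page_cgroup { struct memcg *owner; bool used; };

/*
 * Single-threaded model of mem_cgroup_move_account(): validate under the
 * page_cgroup lock, move the statistics, optionally uncharge the old
 * owner, then switch ownership.
 */
static int move_account(struct page_cgroup *pc, int nr_pages,
                        struct memcg *from, struct memcg *to, bool uncharge)
{
        if (!pc->used || pc->owner != from)
                return -1;                     /* -EINVAL in the kernel */

        from->rss -= nr_pages;                 /* charge statistics leave @from */
        if (uncharge) {
                /* __mem_cgroup_cancel_charge(from, nr_pages) runs here */
        }
        pc->owner = to;                        /* caller already holds a css ref */
        to->rss += nr_pages;                   /* and arrive at @to */
        return 0;
}

int main(void)
{
        struct memcg child = { "child", 512 }, parent = { "parent", 0 };
        struct page_cgroup pc = { &child, true };

        printf("move: %d\n", move_account(&pc, 512, &child, &parent, true));
        printf("child=%ld parent=%ld owner=%s\n",
               child.rss, parent.rss, pc.owner->name);
        return 0;
}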
2319 | 2336 | ||
@@ -2321,16 +2338,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
2321 | * move charges to its parent. | 2338 | * move charges to its parent. |
2322 | */ | 2339 | */ |
2323 | 2340 | ||
2324 | static int mem_cgroup_move_parent(struct page_cgroup *pc, | 2341 | static int mem_cgroup_move_parent(struct page *page, |
2342 | struct page_cgroup *pc, | ||
2325 | struct mem_cgroup *child, | 2343 | struct mem_cgroup *child, |
2326 | gfp_t gfp_mask) | 2344 | gfp_t gfp_mask) |
2327 | { | 2345 | { |
2328 | struct page *page = pc->page; | ||
2329 | struct cgroup *cg = child->css.cgroup; | 2346 | struct cgroup *cg = child->css.cgroup; |
2330 | struct cgroup *pcg = cg->parent; | 2347 | struct cgroup *pcg = cg->parent; |
2331 | struct mem_cgroup *parent; | 2348 | struct mem_cgroup *parent; |
2332 | int page_size = PAGE_SIZE; | 2349 | unsigned int nr_pages; |
2333 | unsigned long flags; | 2350 | unsigned long uninitialized_var(flags); |
2334 | int ret; | 2351 | int ret; |
2335 | 2352 | ||
2336 | /* Is ROOT ? */ | 2353 | /* Is ROOT ? */ |
@@ -2343,23 +2360,21 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
2343 | if (isolate_lru_page(page)) | 2360 | if (isolate_lru_page(page)) |
2344 | goto put; | 2361 | goto put; |
2345 | 2362 | ||
2346 | if (PageTransHuge(page)) | 2363 | nr_pages = hpage_nr_pages(page); |
2347 | page_size = HPAGE_SIZE; | ||
2348 | 2364 | ||
2349 | parent = mem_cgroup_from_cont(pcg); | 2365 | parent = mem_cgroup_from_cont(pcg); |
2350 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, | 2366 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); |
2351 | &parent, false, page_size); | ||
2352 | if (ret || !parent) | 2367 | if (ret || !parent) |
2353 | goto put_back; | 2368 | goto put_back; |
2354 | 2369 | ||
2355 | if (page_size > PAGE_SIZE) | 2370 | if (nr_pages > 1) |
2356 | flags = compound_lock_irqsave(page); | 2371 | flags = compound_lock_irqsave(page); |
2357 | 2372 | ||
2358 | ret = mem_cgroup_move_account(pc, child, parent, true, page_size); | 2373 | ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); |
2359 | if (ret) | 2374 | if (ret) |
2360 | mem_cgroup_cancel_charge(parent, page_size); | 2375 | __mem_cgroup_cancel_charge(parent, nr_pages); |
2361 | 2376 | ||
2362 | if (page_size > PAGE_SIZE) | 2377 | if (nr_pages > 1) |
2363 | compound_unlock_irqrestore(page, flags); | 2378 | compound_unlock_irqrestore(page, flags); |
2364 | put_back: | 2379 | put_back: |
2365 | putback_lru_page(page); | 2380 | putback_lru_page(page); |
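
mem_cgroup_move_parent() derives the charge size from hpage_nr_pages(page), pre-charges the parent for that many pages, and holds the compound lock across the move only when nr_pages > 1, so a concurrent THP split cannot race with it; if the move fails, the parent's pre-charge is cancelled. A bare scaffold of that sequence, with the kernel calls reduced to comments and print statements:

#include <stdio.h>

/* stand-ins for the kernel helpers, for illustration only */
static unsigned int hpage_nr_pages(int is_huge) { return is_huge ? 512 : 1; }
static void compound_lock(void)   { puts("  compound_lock_irqsave(page)"); }
static void compound_unlock(void) { puts("  compound_unlock_irqrestore(page)"); }

static void move_parent(int is_huge)
{
        unsigned int nr_pages = hpage_nr_pages(is_huge);
        int ret;

        printf("charge parent for %u page(s)\n", nr_pages);

        if (nr_pages > 1)                  /* only THP needs the compound lock */
                compound_lock();

        ret = 0;  /* mem_cgroup_move_account(page, nr_pages, pc, child, parent, true) */
        if (ret)
                puts("  move failed: cancel the parent's pre-charge");

        if (nr_pages > 1)
                compound_unlock();
}

int main(void)
{
        move_parent(0);    /* regular page: no compound lock taken */
        move_parent(1);    /* THP: lock held across the move       */
        return 0;
}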
@@ -2379,13 +2394,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2379 | gfp_t gfp_mask, enum charge_type ctype) | 2394 | gfp_t gfp_mask, enum charge_type ctype) |
2380 | { | 2395 | { |
2381 | struct mem_cgroup *mem = NULL; | 2396 | struct mem_cgroup *mem = NULL; |
2382 | int page_size = PAGE_SIZE; | 2397 | unsigned int nr_pages = 1; |
2383 | struct page_cgroup *pc; | 2398 | struct page_cgroup *pc; |
2384 | bool oom = true; | 2399 | bool oom = true; |
2385 | int ret; | 2400 | int ret; |
2386 | 2401 | ||
2387 | if (PageTransHuge(page)) { | 2402 | if (PageTransHuge(page)) { |
2388 | page_size <<= compound_order(page); | 2403 | nr_pages <<= compound_order(page); |
2389 | VM_BUG_ON(!PageTransHuge(page)); | 2404 | VM_BUG_ON(!PageTransHuge(page)); |
2390 | /* | 2405 | /* |
2391 | * Never OOM-kill a process for a huge page. The | 2406 | * Never OOM-kill a process for a huge page. The |
@@ -2395,16 +2410,13 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2395 | } | 2410 | } |
2396 | 2411 | ||
2397 | pc = lookup_page_cgroup(page); | 2412 | pc = lookup_page_cgroup(page); |
2398 | /* can happen at boot */ | 2413 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ |
2399 | if (unlikely(!pc)) | ||
2400 | return 0; | ||
2401 | prefetchw(pc); | ||
2402 | 2414 | ||
2403 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); | 2415 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); |
2404 | if (ret || !mem) | 2416 | if (ret || !mem) |
2405 | return ret; | 2417 | return ret; |
2406 | 2418 | ||
2407 | __mem_cgroup_commit_charge(mem, pc, ctype, page_size); | 2419 | __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); |
2408 | return 0; | 2420 | return 0; |
2409 | } | 2421 | } |
2410 | 2422 | ||
@@ -2432,9 +2444,26 @@ static void | |||
2432 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2444 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2433 | enum charge_type ctype); | 2445 | enum charge_type ctype); |
2434 | 2446 | ||
2447 | static void | ||
2448 | __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, | ||
2449 | enum charge_type ctype) | ||
2450 | { | ||
2451 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
2452 | /* | ||
2453 | * In some case, SwapCache, FUSE(splice_buf->radixtree), the page | ||
2454 | * is already on LRU. It means the page may on some other page_cgroup's | ||
2455 | * LRU. Take care of it. | ||
2456 | */ | ||
2457 | mem_cgroup_lru_del_before_commit(page); | ||
2458 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); | ||
2459 | mem_cgroup_lru_add_after_commit(page); | ||
2460 | return; | ||
2461 | } | ||
2462 | |||
2435 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 2463 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2436 | gfp_t gfp_mask) | 2464 | gfp_t gfp_mask) |
2437 | { | 2465 | { |
2466 | struct mem_cgroup *mem = NULL; | ||
2438 | int ret; | 2467 | int ret; |
2439 | 2468 | ||
2440 | if (mem_cgroup_disabled()) | 2469 | if (mem_cgroup_disabled()) |
@@ -2469,14 +2498,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2469 | if (unlikely(!mm)) | 2498 | if (unlikely(!mm)) |
2470 | mm = &init_mm; | 2499 | mm = &init_mm; |
2471 | 2500 | ||
2472 | if (page_is_file_cache(page)) | 2501 | if (page_is_file_cache(page)) { |
2473 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2502 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); |
2474 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 2503 | if (ret || !mem) |
2504 | return ret; | ||
2475 | 2505 | ||
2506 | /* | ||
2507 | * FUSE reuses pages without going through the final | ||
2508 | * put that would remove them from the LRU list; make | ||
2509 | * sure that they get relinked properly. | ||
2510 | */ | ||
2511 | __mem_cgroup_commit_charge_lrucare(page, mem, | ||
2512 | MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
2513 | return ret; | ||
2514 | } | ||
2476 | /* shmem */ | 2515 | /* shmem */ |
2477 | if (PageSwapCache(page)) { | 2516 | if (PageSwapCache(page)) { |
2478 | struct mem_cgroup *mem = NULL; | ||
2479 | |||
2480 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); | 2517 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
2481 | if (!ret) | 2518 | if (!ret) |
2482 | __mem_cgroup_commit_charge_swapin(page, mem, | 2519 | __mem_cgroup_commit_charge_swapin(page, mem, |
@@ -2501,6 +2538,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2501 | struct mem_cgroup *mem; | 2538 | struct mem_cgroup *mem; |
2502 | int ret; | 2539 | int ret; |
2503 | 2540 | ||
2541 | *ptr = NULL; | ||
2542 | |||
2504 | if (mem_cgroup_disabled()) | 2543 | if (mem_cgroup_disabled()) |
2505 | return 0; | 2544 | return 0; |
2506 | 2545 | ||
@@ -2518,30 +2557,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2518 | if (!mem) | 2557 | if (!mem) |
2519 | goto charge_cur_mm; | 2558 | goto charge_cur_mm; |
2520 | *ptr = mem; | 2559 | *ptr = mem; |
2521 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); | 2560 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); |
2522 | css_put(&mem->css); | 2561 | css_put(&mem->css); |
2523 | return ret; | 2562 | return ret; |
2524 | charge_cur_mm: | 2563 | charge_cur_mm: |
2525 | if (unlikely(!mm)) | 2564 | if (unlikely(!mm)) |
2526 | mm = &init_mm; | 2565 | mm = &init_mm; |
2527 | return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); | 2566 | return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); |
2528 | } | 2567 | } |
2529 | 2568 | ||
2530 | static void | 2569 | static void |
2531 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2570 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, |
2532 | enum charge_type ctype) | 2571 | enum charge_type ctype) |
2533 | { | 2572 | { |
2534 | struct page_cgroup *pc; | ||
2535 | |||
2536 | if (mem_cgroup_disabled()) | 2573 | if (mem_cgroup_disabled()) |
2537 | return; | 2574 | return; |
2538 | if (!ptr) | 2575 | if (!ptr) |
2539 | return; | 2576 | return; |
2540 | cgroup_exclude_rmdir(&ptr->css); | 2577 | cgroup_exclude_rmdir(&ptr->css); |
2541 | pc = lookup_page_cgroup(page); | 2578 | |
2542 | mem_cgroup_lru_del_before_commit_swapcache(page); | 2579 | __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); |
2543 | __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); | ||
2544 | mem_cgroup_lru_add_after_commit_swapcache(page); | ||
2545 | /* | 2580 | /* |
2546 | * Now swap is on-memory. This means this page may be | 2581 | * Now swap is on-memory. This means this page may be |
2547 | * counted both as mem and swap....double count. | 2582 | * counted both as mem and swap....double count. |
@@ -2589,15 +2624,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
2589 | return; | 2624 | return; |
2590 | if (!mem) | 2625 | if (!mem) |
2591 | return; | 2626 | return; |
2592 | mem_cgroup_cancel_charge(mem, PAGE_SIZE); | 2627 | __mem_cgroup_cancel_charge(mem, 1); |
2593 | } | 2628 | } |
2594 | 2629 | ||
2595 | static void | 2630 | static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, |
2596 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | 2631 | unsigned int nr_pages, |
2597 | int page_size) | 2632 | const enum charge_type ctype) |
2598 | { | 2633 | { |
2599 | struct memcg_batch_info *batch = NULL; | 2634 | struct memcg_batch_info *batch = NULL; |
2600 | bool uncharge_memsw = true; | 2635 | bool uncharge_memsw = true; |
2636 | |||
2601 | /* If swapout, usage of swap doesn't decrease */ | 2637 | /* If swapout, usage of swap doesn't decrease */ |
2602 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2638 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2603 | uncharge_memsw = false; | 2639 | uncharge_memsw = false; |
@@ -2621,7 +2657,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | |||
2621 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | 2657 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) |
2622 | goto direct_uncharge; | 2658 | goto direct_uncharge; |
2623 | 2659 | ||
2624 | if (page_size != PAGE_SIZE) | 2660 | if (nr_pages > 1) |
2625 | goto direct_uncharge; | 2661 | goto direct_uncharge; |
2626 | 2662 | ||
2627 | /* | 2663 | /* |
@@ -2632,14 +2668,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, | |||
2632 | if (batch->memcg != mem) | 2668 | if (batch->memcg != mem) |
2633 | goto direct_uncharge; | 2669 | goto direct_uncharge; |
2634 | /* remember freed charge and uncharge it later */ | 2670 | /* remember freed charge and uncharge it later */ |
2635 | batch->bytes += PAGE_SIZE; | 2671 | batch->nr_pages++; |
2636 | if (uncharge_memsw) | 2672 | if (uncharge_memsw) |
2637 | batch->memsw_bytes += PAGE_SIZE; | 2673 | batch->memsw_nr_pages++; |
2638 | return; | 2674 | return; |
2639 | direct_uncharge: | 2675 | direct_uncharge: |
2640 | res_counter_uncharge(&mem->res, page_size); | 2676 | res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); |
2641 | if (uncharge_memsw) | 2677 | if (uncharge_memsw) |
2642 | res_counter_uncharge(&mem->memsw, page_size); | 2678 | res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); |
2643 | if (unlikely(batch->memcg != mem)) | 2679 | if (unlikely(batch->memcg != mem)) |
2644 | memcg_oom_recover(mem); | 2680 | memcg_oom_recover(mem); |
2645 | return; | 2681 | return; |
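
The uncharge slow path now works in page units: a huge-page uncharge (nr_pages > 1) or a charge belonging to a different memcg bypasses the per-task batch, anything else is only remembered for a later flush. A minimal userspace sketch of that decision, assuming 4 KiB pages; batch_info and direct_uncharge() here are stand-ins, not the kernel API:

	#include <stdbool.h>
	#include <stdio.h>

	#define PAGE_SIZE 4096UL			/* illustrative assumption */

	struct batch_info {				/* toy model of memcg_batch_info */
		bool do_batch;
		const char *memcg;
		unsigned long nr_pages;
		unsigned long memsw_nr_pages;
	};

	/* Stand-in for res_counter_uncharge(): converts to bytes only here. */
	static void direct_uncharge(const char *memcg, unsigned long nr_pages)
	{
		printf("direct: %lu bytes from %s\n", nr_pages * PAGE_SIZE, memcg);
	}

	static void do_uncharge(struct batch_info *batch, const char *memcg,
				unsigned long nr_pages, bool uncharge_memsw)
	{
		/* huge pages and foreign memcgs skip the batch */
		if (!batch->do_batch || nr_pages > 1 || batch->memcg != memcg) {
			direct_uncharge(memcg, nr_pages);
			return;
		}
		batch->nr_pages++;			/* remember, flush later */
		if (uncharge_memsw)
			batch->memsw_nr_pages++;
	}

	int main(void)
	{
		const char *memcg_a = "A";
		struct batch_info batch = { .do_batch = true, .memcg = memcg_a };

		do_uncharge(&batch, memcg_a, 1, true);		/* batched */
		do_uncharge(&batch, memcg_a, 512, false);	/* THP: goes direct */
		printf("batched: %lu pages\n", batch.nr_pages);
		return 0;
	}

Keeping the batch in whole pages means the common single-page case never touches PAGE_SIZE at all until the batch is finally flushed.
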
@@ -2651,10 +2687,9 @@ direct_uncharge: | |||
2651 | static struct mem_cgroup * | 2687 | static struct mem_cgroup * |
2652 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2688 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) |
2653 | { | 2689 | { |
2654 | int count; | ||
2655 | struct page_cgroup *pc; | ||
2656 | struct mem_cgroup *mem = NULL; | 2690 | struct mem_cgroup *mem = NULL; |
2657 | int page_size = PAGE_SIZE; | 2691 | unsigned int nr_pages = 1; |
2692 | struct page_cgroup *pc; | ||
2658 | 2693 | ||
2659 | if (mem_cgroup_disabled()) | 2694 | if (mem_cgroup_disabled()) |
2660 | return NULL; | 2695 | return NULL; |
@@ -2663,11 +2698,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2663 | return NULL; | 2698 | return NULL; |
2664 | 2699 | ||
2665 | if (PageTransHuge(page)) { | 2700 | if (PageTransHuge(page)) { |
2666 | page_size <<= compound_order(page); | 2701 | nr_pages <<= compound_order(page); |
2667 | VM_BUG_ON(!PageTransHuge(page)); | 2702 | VM_BUG_ON(!PageTransHuge(page)); |
2668 | } | 2703 | } |
2669 | |||
2670 | count = page_size >> PAGE_SHIFT; | ||
2671 | /* | 2704 | /* |
2672 | * Check if our page_cgroup is valid | 2705 | * Check if our page_cgroup is valid |
2673 | */ | 2706 | */ |
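
With the byte-sized page_size gone from this path, a transparent huge page is expressed directly as a page count derived from its compound order. A minimal sketch of the conversion, assuming a 4 KiB base page and an order-9 THP purely for illustration:

	#include <stdio.h>

	int main(void)
	{
		unsigned int nr_pages = 1;	/* a normal page */
		unsigned int order = 9;		/* e.g. a 2 MiB THP on x86-64 */

		nr_pages <<= order;		/* 512 base pages */
		printf("uncharge %u pages (%u KiB)\n", nr_pages, nr_pages * 4);
		return 0;
	}
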
@@ -2700,7 +2733,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2700 | break; | 2733 | break; |
2701 | } | 2734 | } |
2702 | 2735 | ||
2703 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); | 2736 | mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); |
2704 | 2737 | ||
2705 | ClearPageCgroupUsed(pc); | 2738 | ClearPageCgroupUsed(pc); |
2706 | /* | 2739 | /* |
@@ -2721,7 +2754,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2721 | mem_cgroup_get(mem); | 2754 | mem_cgroup_get(mem); |
2722 | } | 2755 | } |
2723 | if (!mem_cgroup_is_root(mem)) | 2756 | if (!mem_cgroup_is_root(mem)) |
2724 | __do_uncharge(mem, ctype, page_size); | 2757 | mem_cgroup_do_uncharge(mem, nr_pages, ctype); |
2725 | 2758 | ||
2726 | return mem; | 2759 | return mem; |
2727 | 2760 | ||
@@ -2761,8 +2794,8 @@ void mem_cgroup_uncharge_start(void) | |||
2761 | /* We can do nest. */ | 2794 | /* We can do nest. */ |
2762 | if (current->memcg_batch.do_batch == 1) { | 2795 | if (current->memcg_batch.do_batch == 1) { |
2763 | current->memcg_batch.memcg = NULL; | 2796 | current->memcg_batch.memcg = NULL; |
2764 | current->memcg_batch.bytes = 0; | 2797 | current->memcg_batch.nr_pages = 0; |
2765 | current->memcg_batch.memsw_bytes = 0; | 2798 | current->memcg_batch.memsw_nr_pages = 0; |
2766 | } | 2799 | } |
2767 | } | 2800 | } |
2768 | 2801 | ||
@@ -2783,10 +2816,12 @@ void mem_cgroup_uncharge_end(void) | |||
2783 | * This "batch->memcg" is valid without any css_get/put etc... | 2816 | * This "batch->memcg" is valid without any css_get/put etc... |
2784 | * because we hide charges behind us. | 2817 | * because we hide charges behind us. |
2785 | */ | 2818 | */ |
2786 | if (batch->bytes) | 2819 | if (batch->nr_pages) |
2787 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2820 | res_counter_uncharge(&batch->memcg->res, |
2788 | if (batch->memsw_bytes) | 2821 | batch->nr_pages * PAGE_SIZE); |
2789 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2822 | if (batch->memsw_nr_pages) |
2823 | res_counter_uncharge(&batch->memcg->memsw, | ||
2824 | batch->memsw_nr_pages * PAGE_SIZE); | ||
2790 | memcg_oom_recover(batch->memcg); | 2825 | memcg_oom_recover(batch->memcg); |
2791 | /* forget this pointer (for sanity check) */ | 2826 | /* forget this pointer (for sanity check) */ |
2792 | batch->memcg = NULL; | 2827 | batch->memcg = NULL; |
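
Because the batch stores page counts, the flush above converts to bytes exactly once per resource counter. A tiny sketch of that arithmetic, again assuming 4 KiB pages and purely illustrative values:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL		/* illustrative assumption */

	int main(void)
	{
		unsigned long nr_pages = 37;	/* pages remembered during the batch */
		unsigned long memsw_nr_pages = 12;

		/* one res_counter update each, in bytes */
		printf("res:   uncharge %lu bytes\n", nr_pages * PAGE_SIZE);
		printf("memsw: uncharge %lu bytes\n", memsw_nr_pages * PAGE_SIZE);
		return 0;
	}
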
@@ -2911,11 +2946,13 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2911 | int mem_cgroup_prepare_migration(struct page *page, | 2946 | int mem_cgroup_prepare_migration(struct page *page, |
2912 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) | 2947 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) |
2913 | { | 2948 | { |
2914 | struct page_cgroup *pc; | ||
2915 | struct mem_cgroup *mem = NULL; | 2949 | struct mem_cgroup *mem = NULL; |
2950 | struct page_cgroup *pc; | ||
2916 | enum charge_type ctype; | 2951 | enum charge_type ctype; |
2917 | int ret = 0; | 2952 | int ret = 0; |
2918 | 2953 | ||
2954 | *ptr = NULL; | ||
2955 | |||
2919 | VM_BUG_ON(PageTransHuge(page)); | 2956 | VM_BUG_ON(PageTransHuge(page)); |
2920 | if (mem_cgroup_disabled()) | 2957 | if (mem_cgroup_disabled()) |
2921 | return 0; | 2958 | return 0; |
@@ -2966,7 +3003,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2966 | return 0; | 3003 | return 0; |
2967 | 3004 | ||
2968 | *ptr = mem; | 3005 | *ptr = mem; |
2969 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, ptr, false, PAGE_SIZE); | 3006 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); |
2970 | css_put(&mem->css);/* drop extra refcnt */ | 3007 | css_put(&mem->css);/* drop extra refcnt */ |
2971 | if (ret || *ptr == NULL) { | 3008 | if (ret || *ptr == NULL) { |
2972 | if (PageAnon(page)) { | 3009 | if (PageAnon(page)) { |
@@ -2993,7 +3030,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
2993 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3030 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
2994 | else | 3031 | else |
2995 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3032 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
2996 | __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); | 3033 | __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); |
2997 | return ret; | 3034 | return ret; |
2998 | } | 3035 | } |
2999 | 3036 | ||
@@ -3058,7 +3095,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, | |||
3058 | struct mm_struct *mm, | 3095 | struct mm_struct *mm, |
3059 | gfp_t gfp_mask) | 3096 | gfp_t gfp_mask) |
3060 | { | 3097 | { |
3061 | struct mem_cgroup *mem = NULL; | 3098 | struct mem_cgroup *mem; |
3062 | int ret; | 3099 | int ret; |
3063 | 3100 | ||
3064 | if (mem_cgroup_disabled()) | 3101 | if (mem_cgroup_disabled()) |
@@ -3071,6 +3108,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page, | |||
3071 | return ret; | 3108 | return ret; |
3072 | } | 3109 | } |
3073 | 3110 | ||
3111 | #ifdef CONFIG_DEBUG_VM | ||
3112 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | ||
3113 | { | ||
3114 | struct page_cgroup *pc; | ||
3115 | |||
3116 | pc = lookup_page_cgroup(page); | ||
3117 | if (likely(pc) && PageCgroupUsed(pc)) | ||
3118 | return pc; | ||
3119 | return NULL; | ||
3120 | } | ||
3121 | |||
3122 | bool mem_cgroup_bad_page_check(struct page *page) | ||
3123 | { | ||
3124 | if (mem_cgroup_disabled()) | ||
3125 | return false; | ||
3126 | |||
3127 | return lookup_page_cgroup_used(page) != NULL; | ||
3128 | } | ||
3129 | |||
3130 | void mem_cgroup_print_bad_page(struct page *page) | ||
3131 | { | ||
3132 | struct page_cgroup *pc; | ||
3133 | |||
3134 | pc = lookup_page_cgroup_used(page); | ||
3135 | if (pc) { | ||
3136 | int ret = -1; | ||
3137 | char *path; | ||
3138 | |||
3139 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", | ||
3140 | pc, pc->flags, pc->mem_cgroup); | ||
3141 | |||
3142 | path = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3143 | if (path) { | ||
3144 | rcu_read_lock(); | ||
3145 | ret = cgroup_path(pc->mem_cgroup->css.cgroup, | ||
3146 | path, PATH_MAX); | ||
3147 | rcu_read_unlock(); | ||
3148 | } | ||
3149 | |||
3150 | printk(KERN_CONT "(%s)\n", | ||
3151 | (ret < 0) ? "cannot get the path" : path); | ||
3152 | kfree(path); | ||
3153 | } | ||
3154 | } | ||
3155 | #endif | ||
3156 | |||
3074 | static DEFINE_MUTEX(set_limit_mutex); | 3157 | static DEFINE_MUTEX(set_limit_mutex); |
3075 | 3158 | ||
3076 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 3159 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
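
The new CONFIG_DEBUG_VM helpers added above report a page that still carries a used page_cgroup, for example one being freed while still charged. The userspace model below captures only the lookup-and-check idea; the struct layouts and calling context are assumptions, not the kernel's:

	#include <stdbool.h>
	#include <stdio.h>

	struct page_cgroup {			/* toy model, not the kernel layout */
		bool used;
		const char *cgroup_path;
	};

	struct page {
		struct page_cgroup *pc;
	};

	static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
	{
		if (page->pc && page->pc->used)
			return page->pc;
		return NULL;
	}

	static bool bad_page_check(struct page *page)
	{
		return lookup_page_cgroup_used(page) != NULL;
	}

	int main(void)
	{
		struct page_cgroup pc = { .used = true, .cgroup_path = "/foo" };
		struct page page = { .pc = &pc };

		if (bad_page_check(&page))
			printf("page still charged to %s\n", pc.cgroup_path);
		return 0;
	}
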
@@ -3314,6 +3397,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3314 | loop += 256; | 3397 | loop += 256; |
3315 | busy = NULL; | 3398 | busy = NULL; |
3316 | while (loop--) { | 3399 | while (loop--) { |
3400 | struct page *page; | ||
3401 | |||
3317 | ret = 0; | 3402 | ret = 0; |
3318 | spin_lock_irqsave(&zone->lru_lock, flags); | 3403 | spin_lock_irqsave(&zone->lru_lock, flags); |
3319 | if (list_empty(list)) { | 3404 | if (list_empty(list)) { |
@@ -3329,7 +3414,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
3329 | } | 3414 | } |
3330 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3415 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3331 | 3416 | ||
3332 | ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); | 3417 | page = lookup_cgroup_page(pc); |
3418 | |||
3419 | ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); | ||
3333 | if (ret == -ENOMEM) | 3420 | if (ret == -ENOMEM) |
3334 | break; | 3421 | break; |
3335 | 3422 | ||
@@ -3477,13 +3564,13 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3477 | } | 3564 | } |
3478 | 3565 | ||
3479 | 3566 | ||
3480 | static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | 3567 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, |
3481 | enum mem_cgroup_stat_index idx) | 3568 | enum mem_cgroup_stat_index idx) |
3482 | { | 3569 | { |
3483 | struct mem_cgroup *iter; | 3570 | struct mem_cgroup *iter; |
3484 | s64 val = 0; | 3571 | long val = 0; |
3485 | 3572 | ||
3486 | /* each per cpu's value can be minus.Then, use s64 */ | 3573 | /* Per-cpu values can be negative, use a signed accumulator */ |
3487 | for_each_mem_cgroup_tree(iter, mem) | 3574 | for_each_mem_cgroup_tree(iter, mem) |
3488 | val += mem_cgroup_read_stat(iter, idx); | 3575 | val += mem_cgroup_read_stat(iter, idx); |
3489 | 3576 | ||
@@ -3503,12 +3590,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | |||
3503 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | 3590 | return res_counter_read_u64(&mem->memsw, RES_USAGE); |
3504 | } | 3591 | } |
3505 | 3592 | ||
3506 | val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); | 3593 | val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); |
3507 | val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); | 3594 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); |
3508 | 3595 | ||
3509 | if (swap) | 3596 | if (swap) |
3510 | val += mem_cgroup_get_recursive_idx_stat(mem, | 3597 | val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
3511 | MEM_CGROUP_STAT_SWAPOUT); | ||
3512 | 3598 | ||
3513 | return val << PAGE_SHIFT; | 3599 | return val << PAGE_SHIFT; |
3514 | } | 3600 | } |
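
Per-cpu stat deltas can be negative, so the recursive sum above uses a signed accumulator and only shifts the total into bytes at the very end. A small sketch of that arithmetic with made-up per-cpu values and an assumed PAGE_SHIFT of 12:

	#include <stdio.h>

	#define PAGE_SHIFT 12			/* illustrative: 4 KiB pages */

	int main(void)
	{
		long percpu[] = { 120, -3, 45, -1 };	/* deltas may be negative */
		long pages = 0;
		size_t i;

		for (i = 0; i < sizeof(percpu) / sizeof(percpu[0]); i++)
			pages += percpu[i];

		printf("usage: %ld pages = %ld bytes\n", pages, pages << PAGE_SHIFT);
		return 0;
	}
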
@@ -3728,9 +3814,9 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | |||
3728 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3814 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
3729 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); | 3815 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
3730 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3816 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
3731 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3817 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); |
3732 | s->stat[MCS_PGPGIN] += val; | 3818 | s->stat[MCS_PGPGIN] += val; |
3733 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3819 | val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); |
3734 | s->stat[MCS_PGPGOUT] += val; | 3820 | s->stat[MCS_PGPGOUT] += val; |
3735 | if (do_swap_account) { | 3821 | if (do_swap_account) { |
3736 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); | 3822 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
@@ -3854,9 +3940,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
3854 | return -EINVAL; | 3940 | return -EINVAL; |
3855 | } | 3941 | } |
3856 | 3942 | ||
3857 | spin_lock(&memcg->reclaim_param_lock); | ||
3858 | memcg->swappiness = val; | 3943 | memcg->swappiness = val; |
3859 | spin_unlock(&memcg->reclaim_param_lock); | ||
3860 | 3944 | ||
3861 | cgroup_unlock(); | 3945 | cgroup_unlock(); |
3862 | 3946 | ||
@@ -4512,7 +4596,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4512 | res_counter_init(&mem->memsw, NULL); | 4596 | res_counter_init(&mem->memsw, NULL); |
4513 | } | 4597 | } |
4514 | mem->last_scanned_child = 0; | 4598 | mem->last_scanned_child = 0; |
4515 | spin_lock_init(&mem->reclaim_param_lock); | ||
4516 | INIT_LIST_HEAD(&mem->oom_notify); | 4599 | INIT_LIST_HEAD(&mem->oom_notify); |
4517 | 4600 | ||
4518 | if (parent) | 4601 | if (parent) |
@@ -4600,8 +4683,7 @@ one_by_one: | |||
4600 | batch_count = PRECHARGE_COUNT_AT_ONCE; | 4683 | batch_count = PRECHARGE_COUNT_AT_ONCE; |
4601 | cond_resched(); | 4684 | cond_resched(); |
4602 | } | 4685 | } |
4603 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 4686 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); |
4604 | PAGE_SIZE); | ||
4605 | if (ret || !mem) | 4687 | if (ret || !mem) |
4606 | /* mem_cgroup_clear_mc() will do uncharge later */ | 4688 | /* mem_cgroup_clear_mc() will do uncharge later */ |
4607 | return -ENOMEM; | 4689 | return -ENOMEM; |
@@ -4947,8 +5029,8 @@ retry: | |||
4947 | if (isolate_lru_page(page)) | 5029 | if (isolate_lru_page(page)) |
4948 | goto put; | 5030 | goto put; |
4949 | pc = lookup_page_cgroup(page); | 5031 | pc = lookup_page_cgroup(page); |
4950 | if (!mem_cgroup_move_account(pc, | 5032 | if (!mem_cgroup_move_account(page, 1, pc, |
4951 | mc.from, mc.to, false, PAGE_SIZE)) { | 5033 | mc.from, mc.to, false)) { |
4952 | mc.precharge--; | 5034 | mc.precharge--; |
4953 | /* we uncharge from mc.from later. */ | 5035 | /* we uncharge from mc.from later. */ |
4954 | mc.moved_charge++; | 5036 | mc.moved_charge++; |