path: root/mm/memcontrol.c
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  1102
1 file changed, 500 insertions(+), 602 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d87aa3510c5e..602207be9853 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu {
123 unsigned long targets[MEM_CGROUP_NTARGETS]; 123 unsigned long targets[MEM_CGROUP_NTARGETS];
124}; 124};
125 125
126struct mem_cgroup_reclaim_iter {
127 /* css_id of the last scanned hierarchy member */
128 int position;
129 /* scan generation, increased every round-trip */
130 unsigned int generation;
131};
132
126/* 133/*
127 * per-zone information in memory controller. 134 * per-zone information in memory controller.
128 */ 135 */
129struct mem_cgroup_per_zone { 136struct mem_cgroup_per_zone {
130 /* 137 struct lruvec lruvec;
131 * spin_lock to protect the per cgroup LRU
132 */
133 struct list_head lists[NR_LRU_LISTS];
134 unsigned long count[NR_LRU_LISTS]; 138 unsigned long count[NR_LRU_LISTS];
135 139
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141
136 struct zone_reclaim_stat reclaim_stat; 142 struct zone_reclaim_stat reclaim_stat;
137 struct rb_node tree_node; /* RB tree node */ 143 struct rb_node tree_node; /* RB tree node */
138 unsigned long long usage_in_excess;/* Set to the value by which */ 144 unsigned long long usage_in_excess;/* Set to the value by which */
@@ -233,11 +239,6 @@ struct mem_cgroup {
233 * per zone LRU lists. 239 * per zone LRU lists.
234 */ 240 */
235 struct mem_cgroup_lru_info info; 241 struct mem_cgroup_lru_info info;
236 /*
237 * While reclaiming in a hierarchy, we cache the last child we
238 * reclaimed from.
239 */
240 int last_scanned_child;
241 int last_scanned_node; 242 int last_scanned_node;
242#if MAX_NUMNODES > 1 243#if MAX_NUMNODES > 1
243 nodemask_t scan_nodes; 244 nodemask_t scan_nodes;
@@ -366,8 +367,6 @@ enum charge_type {
366#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 367#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
367#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 368#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
368#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 369#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
369#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
370#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
371 370
372static void mem_cgroup_get(struct mem_cgroup *memcg); 371static void mem_cgroup_get(struct mem_cgroup *memcg);
373static void mem_cgroup_put(struct mem_cgroup *memcg); 372static void mem_cgroup_put(struct mem_cgroup *memcg);
@@ -566,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
566 struct mem_cgroup_per_zone *mz; 565 struct mem_cgroup_per_zone *mz;
567 struct mem_cgroup_tree_per_zone *mctz; 566 struct mem_cgroup_tree_per_zone *mctz;
568 567
569 for_each_node_state(node, N_POSSIBLE) { 568 for_each_node(node) {
570 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 569 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
571 mz = mem_cgroup_zoneinfo(memcg, node, zone); 570 mz = mem_cgroup_zoneinfo(memcg, node, zone);
572 mctz = soft_limit_tree_node_zone(node, zone); 571 mctz = soft_limit_tree_node_zone(node, zone);
@@ -656,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
656 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 655 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
657} 656}
658 657
659void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
660{
661 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
662}
663
664void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
665{
666 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
667}
668
669static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 658static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
670 enum mem_cgroup_events_index idx) 659 enum mem_cgroup_events_index idx)
671{ 660{
@@ -749,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
749 return total; 738 return total;
750} 739}
751 740
752static bool __memcg_event_check(struct mem_cgroup *memcg, int target) 741static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
742 enum mem_cgroup_events_target target)
753{ 743{
754 unsigned long val, next; 744 unsigned long val, next;
755 745
756 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 746 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
757 next = __this_cpu_read(memcg->stat->targets[target]); 747 next = __this_cpu_read(memcg->stat->targets[target]);
758 /* from time_after() in jiffies.h */ 748 /* from time_after() in jiffies.h */
759 return ((long)next - (long)val < 0); 749 if ((long)next - (long)val < 0) {
760} 750 switch (target) {
761 751 case MEM_CGROUP_TARGET_THRESH:
762static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) 752 next = val + THRESHOLDS_EVENTS_TARGET;
763{ 753 break;
764 unsigned long val, next; 754 case MEM_CGROUP_TARGET_SOFTLIMIT:
765 755 next = val + SOFTLIMIT_EVENTS_TARGET;
766 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 756 break;
767 757 case MEM_CGROUP_TARGET_NUMAINFO:
768 switch (target) { 758 next = val + NUMAINFO_EVENTS_TARGET;
769 case MEM_CGROUP_TARGET_THRESH: 759 break;
770 next = val + THRESHOLDS_EVENTS_TARGET; 760 default:
771 break; 761 break;
772 case MEM_CGROUP_TARGET_SOFTLIMIT: 762 }
773 next = val + SOFTLIMIT_EVENTS_TARGET; 763 __this_cpu_write(memcg->stat->targets[target], next);
774 break; 764 return true;
775 case MEM_CGROUP_TARGET_NUMAINFO:
776 next = val + NUMAINFO_EVENTS_TARGET;
777 break;
778 default:
779 return;
780 } 765 }
781 766 return false;
782 __this_cpu_write(memcg->stat->targets[target], next);
783} 767}
784 768
785/* 769/*
@@ -790,25 +774,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
790{ 774{
791 preempt_disable(); 775 preempt_disable();
792 /* threshold event is triggered in finer grain than soft limit */ 776 /* threshold event is triggered in finer grain than soft limit */
793 if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { 777 if (unlikely(mem_cgroup_event_ratelimit(memcg,
778 MEM_CGROUP_TARGET_THRESH))) {
779 bool do_softlimit, do_numainfo;
780
781 do_softlimit = mem_cgroup_event_ratelimit(memcg,
782 MEM_CGROUP_TARGET_SOFTLIMIT);
783#if MAX_NUMNODES > 1
784 do_numainfo = mem_cgroup_event_ratelimit(memcg,
785 MEM_CGROUP_TARGET_NUMAINFO);
786#endif
787 preempt_enable();
788
794 mem_cgroup_threshold(memcg); 789 mem_cgroup_threshold(memcg);
795 __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); 790 if (unlikely(do_softlimit))
796 if (unlikely(__memcg_event_check(memcg,
797 MEM_CGROUP_TARGET_SOFTLIMIT))) {
798 mem_cgroup_update_tree(memcg, page); 791 mem_cgroup_update_tree(memcg, page);
799 __mem_cgroup_target_update(memcg,
800 MEM_CGROUP_TARGET_SOFTLIMIT);
801 }
802#if MAX_NUMNODES > 1 792#if MAX_NUMNODES > 1
803 if (unlikely(__memcg_event_check(memcg, 793 if (unlikely(do_numainfo))
804 MEM_CGROUP_TARGET_NUMAINFO))) {
805 atomic_inc(&memcg->numainfo_events); 794 atomic_inc(&memcg->numainfo_events);
806 __mem_cgroup_target_update(memcg,
807 MEM_CGROUP_TARGET_NUMAINFO);
808 }
809#endif 795#endif
810 } 796 } else
811 preempt_enable(); 797 preempt_enable();
812} 798}
813 799
814struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 800struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -853,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
853 return memcg; 839 return memcg;
854} 840}
855 841
856/* The caller has to guarantee "mem" exists before calling this */ 842/**
857static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) 843 * mem_cgroup_iter - iterate over memory cgroup hierarchy
844 * @root: hierarchy root
845 * @prev: previously returned memcg, NULL on first invocation
846 * @reclaim: cookie for shared reclaim walks, NULL for full walks
847 *
848 * Returns references to children of the hierarchy below @root, or
849 * @root itself, or %NULL after a full round-trip.
850 *
851 * Caller must pass the return value in @prev on subsequent
852 * invocations for reference counting, or use mem_cgroup_iter_break()
853 * to cancel a hierarchy walk before the round-trip is complete.
854 *
855 * Reclaimers can specify a zone and a priority level in @reclaim to
856 * divide up the memcgs in the hierarchy among all concurrent
857 * reclaimers operating on the same zone and priority.
858 */
859struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
860 struct mem_cgroup *prev,
861 struct mem_cgroup_reclaim_cookie *reclaim)
858{ 862{
859 struct cgroup_subsys_state *css; 863 struct mem_cgroup *memcg = NULL;
860 int found; 864 int id = 0;
861 865
862 if (!memcg) /* ROOT cgroup has the smallest ID */ 866 if (mem_cgroup_disabled())
863 return root_mem_cgroup; /*css_put/get against root is ignored*/
864 if (!memcg->use_hierarchy) {
865 if (css_tryget(&memcg->css))
866 return memcg;
867 return NULL; 867 return NULL;
868 }
869 rcu_read_lock();
870 /*
871 * searching a memory cgroup which has the smallest ID under given
872 * ROOT cgroup. (ID >= 1)
873 */
874 css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
875 if (css && css_tryget(css))
876 memcg = container_of(css, struct mem_cgroup, css);
877 else
878 memcg = NULL;
879 rcu_read_unlock();
880 return memcg;
881}
882 868
883static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 869 if (!root)
884 struct mem_cgroup *root, 870 root = root_mem_cgroup;
885 bool cond)
886{
887 int nextid = css_id(&iter->css) + 1;
888 int found;
889 int hierarchy_used;
890 struct cgroup_subsys_state *css;
891 871
892 hierarchy_used = iter->use_hierarchy; 872 if (prev && !reclaim)
873 id = css_id(&prev->css);
893 874
894 css_put(&iter->css); 875 if (prev && prev != root)
895 /* If no ROOT, walk all, ignore hierarchy */ 876 css_put(&prev->css);
896 if (!cond || (root && !hierarchy_used))
897 return NULL;
898 877
899 if (!root) 878 if (!root->use_hierarchy && root != root_mem_cgroup) {
900 root = root_mem_cgroup; 879 if (prev)
880 return NULL;
881 return root;
882 }
901 883
902 do { 884 while (!memcg) {
903 iter = NULL; 885 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
904 rcu_read_lock(); 886 struct cgroup_subsys_state *css;
887
888 if (reclaim) {
889 int nid = zone_to_nid(reclaim->zone);
890 int zid = zone_idx(reclaim->zone);
891 struct mem_cgroup_per_zone *mz;
905 892
906 css = css_get_next(&mem_cgroup_subsys, nextid, 893 mz = mem_cgroup_zoneinfo(root, nid, zid);
907 &root->css, &found); 894 iter = &mz->reclaim_iter[reclaim->priority];
908 if (css && css_tryget(css)) 895 if (prev && reclaim->generation != iter->generation)
909 iter = container_of(css, struct mem_cgroup, css); 896 return NULL;
897 id = iter->position;
898 }
899
900 rcu_read_lock();
901 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
902 if (css) {
903 if (css == &root->css || css_tryget(css))
904 memcg = container_of(css,
905 struct mem_cgroup, css);
906 } else
907 id = 0;
910 rcu_read_unlock(); 908 rcu_read_unlock();
911 /* If css is NULL, no more cgroups will be found */
912 nextid = found + 1;
913 } while (css && !iter);
914 909
915 return iter; 910 if (reclaim) {
911 iter->position = id;
912 if (!css)
913 iter->generation++;
914 else if (!prev && memcg)
915 reclaim->generation = iter->generation;
916 }
917
918 if (prev && !css)
919 return NULL;
920 }
921 return memcg;
916} 922}
917/*
918 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
919 * be careful that "break" loop is not allowed. We have reference count.
920 * Instead of that modify "cond" to be false and "continue" to exit the loop.
921 */
922#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
923 for (iter = mem_cgroup_start_loop(root);\
924 iter != NULL;\
925 iter = mem_cgroup_get_next(iter, root, cond))
926 923
927#define for_each_mem_cgroup_tree(iter, root) \ 924/**
928 for_each_mem_cgroup_tree_cond(iter, root, true) 925 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
926 * @root: hierarchy root
927 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
928 */
929void mem_cgroup_iter_break(struct mem_cgroup *root,
930 struct mem_cgroup *prev)
931{
932 if (!root)
933 root = root_mem_cgroup;
934 if (prev && prev != root)
935 css_put(&prev->css);
936}
929 937
930#define for_each_mem_cgroup_all(iter) \ 938/*
931 for_each_mem_cgroup_tree_cond(iter, NULL, true) 939 * Iteration constructs for visiting all cgroups (under a tree). If
940 * loops are exited prematurely (break), mem_cgroup_iter_break() must
941 * be used for reference counting.
942 */
943#define for_each_mem_cgroup_tree(iter, root) \
944 for (iter = mem_cgroup_iter(root, NULL, NULL); \
945 iter != NULL; \
946 iter = mem_cgroup_iter(root, iter, NULL))
932 947
948#define for_each_mem_cgroup(iter) \
949 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
950 iter != NULL; \
951 iter = mem_cgroup_iter(NULL, iter, NULL))
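[Editor's note: a hedged sketch, not part of this patch, of the calling convention the new iterator and macros expect. The walk below and the fatal-signal early exit are illustrative assumptions; only mem_cgroup_iter(), mem_cgroup_iter_break() and the reclaim cookie fields come from the patch itself.]

/*
 * Sketch: partial hierarchy walk below @root using a shared reclaim
 * cookie.  Any early exit must go through mem_cgroup_iter_break() to
 * drop the css reference returned by mem_cgroup_iter().
 */
static void example_hierarchy_walk(struct mem_cgroup *root,
				   struct zone *zone, int priority)
{
	struct mem_cgroup_reclaim_cookie reclaim = {
		.zone = zone,
		.priority = priority,
	};
	struct mem_cgroup *memcg = NULL;

	while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))) {
		if (fatal_signal_pending(current)) {
			/* abort the round-trip, release the reference */
			mem_cgroup_iter_break(root, memcg);
			break;
		}
		/* ... reclaim from memcg ... */
	}
}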
933 952
934static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 953static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
935{ 954{
@@ -949,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
949 goto out; 968 goto out;
950 969
951 switch (idx) { 970 switch (idx) {
952 case PGMAJFAULT:
953 mem_cgroup_pgmajfault(memcg, 1);
954 break;
955 case PGFAULT: 971 case PGFAULT:
956 mem_cgroup_pgfault(memcg, 1); 972 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
973 break;
974 case PGMAJFAULT:
975 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
957 break; 976 break;
958 default: 977 default:
959 BUG(); 978 BUG();
@@ -963,6 +982,27 @@ out:
963} 982}
964EXPORT_SYMBOL(mem_cgroup_count_vm_event); 983EXPORT_SYMBOL(mem_cgroup_count_vm_event);
965 984
985/**
986 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
987 * @zone: zone of the wanted lruvec
988 * @mem: memcg of the wanted lruvec
989 *
990 * Returns the lru list vector holding pages for the given @zone and
991 * @mem. This can be the global zone lruvec, if the memory controller
992 * is disabled.
993 */
994struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
995 struct mem_cgroup *memcg)
996{
997 struct mem_cgroup_per_zone *mz;
998
999 if (mem_cgroup_disabled())
1000 return &zone->lruvec;
1001
1002 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1003 return &mz->lruvec;
1004}
1005
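[Editor's note: a minimal sketch, not code from this patch, of how a caller is expected to consume mem_cgroup_zone_lruvec(); the helper name is hypothetical.]

/* Resolve the per-memcg (or global) lruvec once, then use its lists. */
static int example_lru_empty(struct zone *zone, struct mem_cgroup *memcg,
			     enum lru_list lru)
{
	/* With the memory controller disabled this is &zone->lruvec. */
	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);

	return list_empty(&lruvec->lists[lru]);
}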
966/* 1006/*
967 * Following LRU functions are allowed to be used without PCG_LOCK. 1007 * Following LRU functions are allowed to be used without PCG_LOCK.
968 * Operations are called by routine of global LRU independently from memcg. 1008 * Operations are called by routine of global LRU independently from memcg.
@@ -977,180 +1017,91 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
977 * When moving account, the page is not on LRU. It's isolated. 1017 * When moving account, the page is not on LRU. It's isolated.
978 */ 1018 */
979 1019
980void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 1020/**
981{ 1021 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec
982 struct page_cgroup *pc; 1022 * @zone: zone of the page
983 struct mem_cgroup_per_zone *mz; 1023 * @page: the page
984 1024 * @lru: current lru
985 if (mem_cgroup_disabled()) 1025 *
986 return; 1026 * This function accounts for @page being added to @lru, and returns
987 pc = lookup_page_cgroup(page); 1027 * the lruvec for the given @zone and the memcg @page is charged to.
988 /* can happen while we handle swapcache. */ 1028 *
989 if (!TestClearPageCgroupAcctLRU(pc)) 1029 * The callsite is then responsible for physically linking the page to
990 return; 1030 * the returned lruvec->lists[@lru].
991 VM_BUG_ON(!pc->mem_cgroup);
992 /*
993 * We don't check PCG_USED bit. It's cleared when the "page" is finally
994 * removed from global LRU.
995 */
996 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
997 /* huge page split is done under lru_lock. so, we have no races. */
998 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
999 if (mem_cgroup_is_root(pc->mem_cgroup))
1000 return;
1001 VM_BUG_ON(list_empty(&pc->lru));
1002 list_del_init(&pc->lru);
1003}
1004
1005void mem_cgroup_del_lru(struct page *page)
1006{
1007 mem_cgroup_del_lru_list(page, page_lru(page));
1008}
1009
1010/*
1011 * Writeback is about to end against a page which has been marked for immediate
1012 * reclaim. If it still appears to be reclaimable, move it to the tail of the
1013 * inactive list.
1014 */ 1031 */
1015void mem_cgroup_rotate_reclaimable_page(struct page *page) 1032struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1033 enum lru_list lru)
1016{ 1034{
1017 struct mem_cgroup_per_zone *mz; 1035 struct mem_cgroup_per_zone *mz;
1036 struct mem_cgroup *memcg;
1018 struct page_cgroup *pc; 1037 struct page_cgroup *pc;
1019 enum lru_list lru = page_lru(page);
1020 1038
1021 if (mem_cgroup_disabled()) 1039 if (mem_cgroup_disabled())
1022 return; 1040 return &zone->lruvec;
1023 1041
1024 pc = lookup_page_cgroup(page); 1042 pc = lookup_page_cgroup(page);
1025 /* unused or root page is not rotated. */ 1043 memcg = pc->mem_cgroup;
1026 if (!PageCgroupUsed(pc)) 1044 mz = page_cgroup_zoneinfo(memcg, page);
1027 return; 1045 /* compound_order() is stabilized through lru_lock */
1028 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1046 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1029 smp_rmb(); 1047 return &mz->lruvec;
1030 if (mem_cgroup_is_root(pc->mem_cgroup))
1031 return;
1032 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1033 list_move_tail(&pc->lru, &mz->lists[lru]);
1034} 1048}
1035 1049
1036void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 1050/**
1051 * mem_cgroup_lru_del_list - account for removing an lru page
1052 * @page: the page
1053 * @lru: target lru
1054 *
1055 * This function accounts for @page being removed from @lru.
1056 *
1057 * The callsite is then responsible for physically unlinking
1058 * @page->lru.
1059 */
1060void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1037{ 1061{
1038 struct mem_cgroup_per_zone *mz; 1062 struct mem_cgroup_per_zone *mz;
1063 struct mem_cgroup *memcg;
1039 struct page_cgroup *pc; 1064 struct page_cgroup *pc;
1040 1065
1041 if (mem_cgroup_disabled()) 1066 if (mem_cgroup_disabled())
1042 return; 1067 return;
1043 1068
1044 pc = lookup_page_cgroup(page); 1069 pc = lookup_page_cgroup(page);
1045 /* unused or root page is not rotated. */ 1070 memcg = pc->mem_cgroup;
1046 if (!PageCgroupUsed(pc)) 1071 VM_BUG_ON(!memcg);
1047 return; 1072 mz = page_cgroup_zoneinfo(memcg, page);
1048 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1049 smp_rmb();
1050 if (mem_cgroup_is_root(pc->mem_cgroup))
1051 return;
1052 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1053 list_move(&pc->lru, &mz->lists[lru]);
1054}
1055
1056void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1057{
1058 struct page_cgroup *pc;
1059 struct mem_cgroup_per_zone *mz;
1060
1061 if (mem_cgroup_disabled())
1062 return;
1063 pc = lookup_page_cgroup(page);
1064 VM_BUG_ON(PageCgroupAcctLRU(pc));
1065 /*
1066 * putback: charge:
1067 * SetPageLRU SetPageCgroupUsed
1068 * smp_mb smp_mb
1069 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1070 *
1071 * Ensure that one of the two sides adds the page to the memcg
1072 * LRU during a race.
1073 */
1074 smp_mb();
1075 if (!PageCgroupUsed(pc))
1076 return;
1077 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1078 smp_rmb();
1079 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1080 /* huge page split is done under lru_lock. so, we have no races. */ 1073 /* huge page split is done under lru_lock. so, we have no races. */
1081 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1074 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page)));
1082 SetPageCgroupAcctLRU(pc); 1075 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
1083 if (mem_cgroup_is_root(pc->mem_cgroup))
1084 return;
1085 list_add(&pc->lru, &mz->lists[lru]);
1086}
1087
1088/*
1089 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1090 * while it's linked to lru because the page may be reused after it's fully
1091 * uncharged. To handle that, unlink page_cgroup from LRU when charge it again.
1092 * It's done under lock_page and expected that zone->lru_lock isnever held.
1093 */
1094static void mem_cgroup_lru_del_before_commit(struct page *page)
1095{
1096 unsigned long flags;
1097 struct zone *zone = page_zone(page);
1098 struct page_cgroup *pc = lookup_page_cgroup(page);
1099
1100 /*
1101 * Doing this check without taking ->lru_lock seems wrong but this
1102 * is safe. Because if page_cgroup's USED bit is unset, the page
1103 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1104 * set, the commit after this will fail, anyway.
1105 * This all charge/uncharge is done under some mutual execustion.
1106 * So, we don't need to taking care of changes in USED bit.
1107 */
1108 if (likely(!PageLRU(page)))
1109 return;
1110
1111 spin_lock_irqsave(&zone->lru_lock, flags);
1112 /*
1113 * Forget old LRU when this page_cgroup is *not* used. This Used bit
1114 * is guarded by lock_page() because the page is SwapCache.
1115 */
1116 if (!PageCgroupUsed(pc))
1117 mem_cgroup_del_lru_list(page, page_lru(page));
1118 spin_unlock_irqrestore(&zone->lru_lock, flags);
1119} 1076}
1120 1077
1121static void mem_cgroup_lru_add_after_commit(struct page *page) 1078void mem_cgroup_lru_del(struct page *page)
1122{ 1079{
1123 unsigned long flags; 1080 mem_cgroup_lru_del_list(page, page_lru(page));
1124 struct zone *zone = page_zone(page);
1125 struct page_cgroup *pc = lookup_page_cgroup(page);
1126 /*
1127 * putback: charge:
1128 * SetPageLRU SetPageCgroupUsed
1129 * smp_mb smp_mb
1130 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1131 *
1132 * Ensure that one of the two sides adds the page to the memcg
1133 * LRU during a race.
1134 */
1135 smp_mb();
1136 /* taking care of that the page is added to LRU while we commit it */
1137 if (likely(!PageLRU(page)))
1138 return;
1139 spin_lock_irqsave(&zone->lru_lock, flags);
1140 /* link when the page is linked to LRU but page_cgroup isn't */
1141 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1142 mem_cgroup_add_lru_list(page, page_lru(page));
1143 spin_unlock_irqrestore(&zone->lru_lock, flags);
1144} 1081}
1145 1082
1146 1083/**
1147void mem_cgroup_move_lists(struct page *page, 1084 * mem_cgroup_lru_move_lists - account for moving a page between lrus
1148 enum lru_list from, enum lru_list to) 1085 * @zone: zone of the page
1086 * @page: the page
1087 * @from: current lru
1088 * @to: target lru
1089 *
1090 * This function accounts for @page being moved between the lrus @from
1091 * and @to, and returns the lruvec for the given @zone and the memcg
1092 * @page is charged to.
1093 *
1094 * The callsite is then responsible for physically relinking
1095 * @page->lru to the returned lruvec->lists[@to].
1096 */
1097struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1098 struct page *page,
1099 enum lru_list from,
1100 enum lru_list to)
1149{ 1101{
1150 if (mem_cgroup_disabled()) 1102 /* XXX: Optimize this, especially for @from == @to */
1151 return; 1103 mem_cgroup_lru_del_list(page, from);
1152 mem_cgroup_del_lru_list(page, from); 1104 return mem_cgroup_lru_add_list(zone, page, to);
1153 mem_cgroup_add_lru_list(page, to);
1154} 1105}
1155 1106
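[Editor's note: the kerneldoc above splits accounting (memcg side) from the physical list manipulation (call site). A hedged sketch of what a call site is then expected to do; helper names are hypothetical and zone->lru_lock is assumed to be held by the caller.]

static void example_lru_add(struct zone *zone, struct page *page,
			    enum lru_list lru)
{
	struct lruvec *lruvec;

	/* account first, then perform the physical link ourselves */
	lruvec = mem_cgroup_lru_add_list(zone, page, lru);
	list_add(&page->lru, &lruvec->lists[lru]);
}

static void example_lru_del(struct page *page, enum lru_list lru)
{
	mem_cgroup_lru_del_list(page, lru);
	list_del(&page->lru);
}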
1156/* 1107/*
@@ -1175,10 +1126,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1175 struct task_struct *p; 1126 struct task_struct *p;
1176 1127
1177 p = find_lock_task_mm(task); 1128 p = find_lock_task_mm(task);
1178 if (!p) 1129 if (p) {
1179 return 0; 1130 curr = try_get_mem_cgroup_from_mm(p->mm);
1180 curr = try_get_mem_cgroup_from_mm(p->mm); 1131 task_unlock(p);
1181 task_unlock(p); 1132 } else {
1133 /*
1134 * All threads may have already detached their mm's, but the oom
1135 * killer still needs to detect if they have already been oom
1136 * killed to prevent needlessly killing additional tasks.
1137 */
1138 task_lock(task);
1139 curr = mem_cgroup_from_task(task);
1140 if (curr)
1141 css_get(&curr->css);
1142 task_unlock(task);
1143 }
1182 if (!curr) 1144 if (!curr)
1183 return 0; 1145 return 0;
1184 /* 1146 /*
@@ -1258,68 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1258 return &mz->reclaim_stat; 1220 return &mz->reclaim_stat;
1259} 1221}
1260 1222
1261unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1262 struct list_head *dst,
1263 unsigned long *scanned, int order,
1264 isolate_mode_t mode,
1265 struct zone *z,
1266 struct mem_cgroup *mem_cont,
1267 int active, int file)
1268{
1269 unsigned long nr_taken = 0;
1270 struct page *page;
1271 unsigned long scan;
1272 LIST_HEAD(pc_list);
1273 struct list_head *src;
1274 struct page_cgroup *pc, *tmp;
1275 int nid = zone_to_nid(z);
1276 int zid = zone_idx(z);
1277 struct mem_cgroup_per_zone *mz;
1278 int lru = LRU_FILE * file + active;
1279 int ret;
1280
1281 BUG_ON(!mem_cont);
1282 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1283 src = &mz->lists[lru];
1284
1285 scan = 0;
1286 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1287 if (scan >= nr_to_scan)
1288 break;
1289
1290 if (unlikely(!PageCgroupUsed(pc)))
1291 continue;
1292
1293 page = lookup_cgroup_page(pc);
1294
1295 if (unlikely(!PageLRU(page)))
1296 continue;
1297
1298 scan++;
1299 ret = __isolate_lru_page(page, mode, file);
1300 switch (ret) {
1301 case 0:
1302 list_move(&page->lru, dst);
1303 mem_cgroup_del_lru(page);
1304 nr_taken += hpage_nr_pages(page);
1305 break;
1306 case -EBUSY:
1307 /* we don't affect global LRU but rotate in our LRU */
1308 mem_cgroup_rotate_lru_list(page, page_lru(page));
1309 break;
1310 default:
1311 break;
1312 }
1313 }
1314
1315 *scanned = scan;
1316
1317 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1318 0, 0, 0, mode);
1319
1320 return nr_taken;
1321}
1322
1323#define mem_cgroup_from_res_counter(counter, member) \ 1223#define mem_cgroup_from_res_counter(counter, member) \
1324 container_of(counter, struct mem_cgroup, member) 1224 container_of(counter, struct mem_cgroup, member)
1325 1225
@@ -1536,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1536 return min(limit, memsw); 1436 return min(limit, memsw);
1537} 1437}
1538 1438
1539/* 1439static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1540 * Visit the first child (need not be the first child as per the ordering 1440 gfp_t gfp_mask,
1541 * of the cgroup list, since we track last_scanned_child) of @mem and use 1441 unsigned long flags)
1542 * that to reclaim free pages from.
1543 */
1544static struct mem_cgroup *
1545mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
1546{ 1442{
1547 struct mem_cgroup *ret = NULL; 1443 unsigned long total = 0;
1548 struct cgroup_subsys_state *css; 1444 bool noswap = false;
1549 int nextid, found; 1445 int loop;
1550
1551 if (!root_memcg->use_hierarchy) {
1552 css_get(&root_memcg->css);
1553 ret = root_memcg;
1554 }
1555 1446
1556 while (!ret) { 1447 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1557 rcu_read_lock(); 1448 noswap = true;
1558 nextid = root_memcg->last_scanned_child + 1; 1449 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1559 css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, 1450 noswap = true;
1560 &found);
1561 if (css && css_tryget(css))
1562 ret = container_of(css, struct mem_cgroup, css);
1563 1451
1564 rcu_read_unlock(); 1452 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1565 /* Updates scanning parameter */ 1453 if (loop)
1566 if (!css) { 1454 drain_all_stock_async(memcg);
1567 /* this means start scan from ID:1 */ 1455 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1568 root_memcg->last_scanned_child = 0; 1456 /*
1569 } else 1457 * Allow limit shrinkers, which are triggered directly
1570 root_memcg->last_scanned_child = found; 1458 * by userspace, to catch signals and stop reclaim
1459 * after minimal progress, regardless of the margin.
1460 */
1461 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1462 break;
1463 if (mem_cgroup_margin(memcg))
1464 break;
1465 /*
1466 * If nothing was reclaimed after two attempts, there
1467 * may be no reclaimable pages in this hierarchy.
1468 */
1469 if (loop && !total)
1470 break;
1571 } 1471 }
1572 1472 return total;
1573 return ret;
1574} 1473}
1575 1474
1576/** 1475/**
@@ -1710,61 +1609,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1710} 1609}
1711#endif 1610#endif
1712 1611
1713/* 1612static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1714 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1613 struct zone *zone,
1715 * we reclaimed from, so that we don't end up penalizing one child extensively 1614 gfp_t gfp_mask,
1716 * based on its position in the children list. 1615 unsigned long *total_scanned)
1717 * 1616{
1718 * root_memcg is the original ancestor that we've been reclaim from. 1617 struct mem_cgroup *victim = NULL;
1719 * 1618 int total = 0;
1720 * We give up and return to the caller when we visit root_memcg twice.
1721 * (other groups can be removed while we're walking....)
1722 *
1723 * If shrink==true, for avoiding to free too much, this returns immedieately.
1724 */
1725static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1726 struct zone *zone,
1727 gfp_t gfp_mask,
1728 unsigned long reclaim_options,
1729 unsigned long *total_scanned)
1730{
1731 struct mem_cgroup *victim;
1732 int ret, total = 0;
1733 int loop = 0; 1619 int loop = 0;
1734 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1735 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1736 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1737 unsigned long excess; 1620 unsigned long excess;
1738 unsigned long nr_scanned; 1621 unsigned long nr_scanned;
1622 struct mem_cgroup_reclaim_cookie reclaim = {
1623 .zone = zone,
1624 .priority = 0,
1625 };
1739 1626
1740 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1627 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1741 1628
1742 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1743 if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1744 noswap = true;
1745
1746 while (1) { 1629 while (1) {
1747 victim = mem_cgroup_select_victim(root_memcg); 1630 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1748 if (victim == root_memcg) { 1631 if (!victim) {
1749 loop++; 1632 loop++;
1750 /*
1751 * We are not draining per cpu cached charges during
1752 * soft limit reclaim because global reclaim doesn't
1753 * care about charges. It tries to free some memory and
1754 * charges will not give any.
1755 */
1756 if (!check_soft && loop >= 1)
1757 drain_all_stock_async(root_memcg);
1758 if (loop >= 2) { 1633 if (loop >= 2) {
1759 /* 1634 /*
1760 * If we have not been able to reclaim 1635 * If we have not been able to reclaim
1761 * anything, it might because there are 1636 * anything, it might because there are
1762 * no reclaimable pages under this hierarchy 1637 * no reclaimable pages under this hierarchy
1763 */ 1638 */
1764 if (!check_soft || !total) { 1639 if (!total)
1765 css_put(&victim->css);
1766 break; 1640 break;
1767 }
1768 /* 1641 /*
1769 * We want to do more targeted reclaim. 1642 * We want to do more targeted reclaim.
1770 * excess >> 2 is not to excessive so as to 1643 * excess >> 2 is not to excessive so as to
@@ -1772,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1772 * coming back to reclaim from this cgroup 1645 * coming back to reclaim from this cgroup
1773 */ 1646 */
1774 if (total >= (excess >> 2) || 1647 if (total >= (excess >> 2) ||
1775 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1648 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1776 css_put(&victim->css);
1777 break; 1649 break;
1778 }
1779 } 1650 }
1780 }
1781 if (!mem_cgroup_reclaimable(victim, noswap)) {
1782 /* this cgroup's local usage == 0 */
1783 css_put(&victim->css);
1784 continue; 1651 continue;
1785 } 1652 }
1786 /* we use swappiness of local cgroup */ 1653 if (!mem_cgroup_reclaimable(victim, false))
1787 if (check_soft) { 1654 continue;
1788 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1655 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1789 noswap, zone, &nr_scanned); 1656 zone, &nr_scanned);
1790 *total_scanned += nr_scanned; 1657 *total_scanned += nr_scanned;
1791 } else 1658 if (!res_counter_soft_limit_excess(&root_memcg->res))
1792 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1659 break;
1793 noswap);
1794 css_put(&victim->css);
1795 /*
1796 * At shrinking usage, we can't check we should stop here or
1797 * reclaim more. It's depends on callers. last_scanned_child
1798 * will work enough for keeping fairness under tree.
1799 */
1800 if (shrink)
1801 return ret;
1802 total += ret;
1803 if (check_soft) {
1804 if (!res_counter_soft_limit_excess(&root_memcg->res))
1805 return total;
1806 } else if (mem_cgroup_margin(root_memcg))
1807 return total;
1808 } 1660 }
1661 mem_cgroup_iter_break(root_memcg, victim);
1809 return total; 1662 return total;
1810} 1663}
1811 1664
@@ -1817,16 +1670,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1817static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) 1670static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1818{ 1671{
1819 struct mem_cgroup *iter, *failed = NULL; 1672 struct mem_cgroup *iter, *failed = NULL;
1820 bool cond = true;
1821 1673
1822 for_each_mem_cgroup_tree_cond(iter, memcg, cond) { 1674 for_each_mem_cgroup_tree(iter, memcg) {
1823 if (iter->oom_lock) { 1675 if (iter->oom_lock) {
1824 /* 1676 /*
1825 * this subtree of our hierarchy is already locked 1677 * this subtree of our hierarchy is already locked
1826 * so we cannot give a lock. 1678 * so we cannot give a lock.
1827 */ 1679 */
1828 failed = iter; 1680 failed = iter;
1829 cond = false; 1681 mem_cgroup_iter_break(memcg, iter);
1682 break;
1830 } else 1683 } else
1831 iter->oom_lock = true; 1684 iter->oom_lock = true;
1832 } 1685 }
@@ -1838,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1838 * OK, we failed to lock the whole subtree so we have to clean up 1691 * OK, we failed to lock the whole subtree so we have to clean up
1839 * what we set up to the failing subtree 1692 * what we set up to the failing subtree
1840 */ 1693 */
1841 cond = true; 1694 for_each_mem_cgroup_tree(iter, memcg) {
1842 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1843 if (iter == failed) { 1695 if (iter == failed) {
1844 cond = false; 1696 mem_cgroup_iter_break(memcg, iter);
1845 continue; 1697 break;
1846 } 1698 }
1847 iter->oom_lock = false; 1699 iter->oom_lock = false;
1848 } 1700 }
@@ -2007,7 +1859,7 @@ void mem_cgroup_update_page_stat(struct page *page,
2007 bool need_unlock = false; 1859 bool need_unlock = false;
2008 unsigned long uninitialized_var(flags); 1860 unsigned long uninitialized_var(flags);
2009 1861
2010 if (unlikely(!pc)) 1862 if (mem_cgroup_disabled())
2011 return; 1863 return;
2012 1864
2013 rcu_read_lock(); 1865 rcu_read_lock();
@@ -2238,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2238 struct mem_cgroup *iter; 2090 struct mem_cgroup *iter;
2239 2091
2240 if ((action == CPU_ONLINE)) { 2092 if ((action == CPU_ONLINE)) {
2241 for_each_mem_cgroup_all(iter) 2093 for_each_mem_cgroup(iter)
2242 synchronize_mem_cgroup_on_move(iter, cpu); 2094 synchronize_mem_cgroup_on_move(iter, cpu);
2243 return NOTIFY_OK; 2095 return NOTIFY_OK;
2244 } 2096 }
@@ -2246,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2246 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2098 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2247 return NOTIFY_OK; 2099 return NOTIFY_OK;
2248 2100
2249 for_each_mem_cgroup_all(iter) 2101 for_each_mem_cgroup(iter)
2250 mem_cgroup_drain_pcp_counter(iter, cpu); 2102 mem_cgroup_drain_pcp_counter(iter, cpu);
2251 2103
2252 stock = &per_cpu(memcg_stock, cpu); 2104 stock = &per_cpu(memcg_stock, cpu);
@@ -2300,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2300 if (!(gfp_mask & __GFP_WAIT)) 2152 if (!(gfp_mask & __GFP_WAIT))
2301 return CHARGE_WOULDBLOCK; 2153 return CHARGE_WOULDBLOCK;
2302 2154
2303 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2155 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2304 gfp_mask, flags, NULL);
2305 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2156 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2306 return CHARGE_RETRY; 2157 return CHARGE_RETRY;
2307 /* 2158 /*
@@ -2334,8 +2185,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2334} 2185}
2335 2186
2336/* 2187/*
2337 * Unlike exported interface, "oom" parameter is added. if oom==true, 2188 * __mem_cgroup_try_charge() does
2338 * oom-killer can be invoked. 2189 * 1. detect memcg to be charged against from passed *mm and *ptr,
2190 * 2. update res_counter
2191 * 3. call memory reclaim if necessary.
2192 *
2193 * In some special case, if the task is fatal, fatal_signal_pending() or
2194 * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
2195 * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
2196 * as possible without any hazards. 2: all pages should have a valid
2197 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2198 * pointer, that is treated as a charge to root_mem_cgroup.
2199 *
2200 * So __mem_cgroup_try_charge() will return
2201 * 0 ... on success, filling *ptr with a valid memcg pointer.
2202 * -ENOMEM ... charge failure because of resource limits.
2203 * -EINTR ... if thread is fatal. *ptr is filled with root_mem_cgroup.
2204 *
2205 * Unlike the exported interface, an "oom" parameter is added. if oom==true,
2206 * the oom-killer can be invoked.
2339 */ 2207 */
2340static int __mem_cgroup_try_charge(struct mm_struct *mm, 2208static int __mem_cgroup_try_charge(struct mm_struct *mm,
2341 gfp_t gfp_mask, 2209 gfp_t gfp_mask,
@@ -2364,7 +2232,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2364 * set, if so charge the init_mm (happens for pagecache usage). 2232 * set, if so charge the init_mm (happens for pagecache usage).
2365 */ 2233 */
2366 if (!*ptr && !mm) 2234 if (!*ptr && !mm)
2367 goto bypass; 2235 *ptr = root_mem_cgroup;
2368again: 2236again:
2369 if (*ptr) { /* css should be a valid one */ 2237 if (*ptr) { /* css should be a valid one */
2370 memcg = *ptr; 2238 memcg = *ptr;
@@ -2390,7 +2258,9 @@ again:
2390 * task-struct. So, mm->owner can be NULL. 2258 * task-struct. So, mm->owner can be NULL.
2391 */ 2259 */
2392 memcg = mem_cgroup_from_task(p); 2260 memcg = mem_cgroup_from_task(p);
2393 if (!memcg || mem_cgroup_is_root(memcg)) { 2261 if (!memcg)
2262 memcg = root_mem_cgroup;
2263 if (mem_cgroup_is_root(memcg)) {
2394 rcu_read_unlock(); 2264 rcu_read_unlock();
2395 goto done; 2265 goto done;
2396 } 2266 }
@@ -2465,8 +2335,8 @@ nomem:
2465 *ptr = NULL; 2335 *ptr = NULL;
2466 return -ENOMEM; 2336 return -ENOMEM;
2467bypass: 2337bypass:
2468 *ptr = NULL; 2338 *ptr = root_mem_cgroup;
2469 return 0; 2339 return -EINTR;
2470} 2340}
2471 2341
2472/* 2342/*
@@ -2522,7 +2392,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2522 memcg = NULL; 2392 memcg = NULL;
2523 } else if (PageSwapCache(page)) { 2393 } else if (PageSwapCache(page)) {
2524 ent.val = page_private(page); 2394 ent.val = page_private(page);
2525 id = lookup_swap_cgroup(ent); 2395 id = lookup_swap_cgroup_id(ent);
2526 rcu_read_lock(); 2396 rcu_read_lock();
2527 memcg = mem_cgroup_lookup(id); 2397 memcg = mem_cgroup_lookup(id);
2528 if (memcg && !css_tryget(&memcg->css)) 2398 if (memcg && !css_tryget(&memcg->css))
@@ -2574,6 +2444,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2574 2444
2575 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2445 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2576 unlock_page_cgroup(pc); 2446 unlock_page_cgroup(pc);
2447 WARN_ON_ONCE(PageLRU(page));
2577 /* 2448 /*
2578 * "charge_statistics" updated event counter. Then, check it. 2449 * "charge_statistics" updated event counter. Then, check it.
2579 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2450 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
@@ -2585,44 +2456,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2585#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2456#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2586 2457
2587#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2458#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2588 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2459 (1 << PCG_MIGRATION))
2589/* 2460/*
2590 * Because tail pages are not marked as "used", set it. We're under 2461 * Because tail pages are not marked as "used", set it. We're under
2591 * zone->lru_lock, 'splitting on pmd' and compund_lock. 2462 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2463 * charge/uncharge will be never happen and move_account() is done under
2464 * compound_lock(), so we don't have to take care of races.
2592 */ 2465 */
2593void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2466void mem_cgroup_split_huge_fixup(struct page *head)
2594{ 2467{
2595 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2468 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2596 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2469 struct page_cgroup *pc;
2597 unsigned long flags; 2470 int i;
2598 2471
2599 if (mem_cgroup_disabled()) 2472 if (mem_cgroup_disabled())
2600 return; 2473 return;
2601 /* 2474 for (i = 1; i < HPAGE_PMD_NR; i++) {
2602 * We have no races with charge/uncharge but will have races with 2475 pc = head_pc + i;
2603 * page state accounting. 2476 pc->mem_cgroup = head_pc->mem_cgroup;
2604 */ 2477 smp_wmb();/* see __commit_charge() */
2605 move_lock_page_cgroup(head_pc, &flags); 2478 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2606
2607 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2608 smp_wmb(); /* see __commit_charge() */
2609 if (PageCgroupAcctLRU(head_pc)) {
2610 enum lru_list lru;
2611 struct mem_cgroup_per_zone *mz;
2612
2613 /*
2614 * LRU flags cannot be copied because we need to add tail
2615 *.page to LRU by generic call and our hook will be called.
2616 * We hold lru_lock, then, reduce counter directly.
2617 */
2618 lru = page_lru(head);
2619 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2620 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2621 } 2479 }
2622 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2623 move_unlock_page_cgroup(head_pc, &flags);
2624} 2480}
2625#endif 2481#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2626 2482
2627/** 2483/**
2628 * mem_cgroup_move_account - move account of the page 2484 * mem_cgroup_move_account - move account of the page
@@ -2737,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page,
2737 2593
2738 parent = mem_cgroup_from_cont(pcg); 2594 parent = mem_cgroup_from_cont(pcg);
2739 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2595 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2740 if (ret || !parent) 2596 if (ret)
2741 goto put_back; 2597 goto put_back;
2742 2598
2743 if (nr_pages > 1) 2599 if (nr_pages > 1)
@@ -2783,12 +2639,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2783 } 2639 }
2784 2640
2785 pc = lookup_page_cgroup(page); 2641 pc = lookup_page_cgroup(page);
2786 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2787
2788 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2642 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2789 if (ret || !memcg) 2643 if (ret == -ENOMEM)
2790 return ret; 2644 return ret;
2791
2792 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); 2645 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2793 return 0; 2646 return 0;
2794} 2647}
@@ -2798,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page,
2798{ 2651{
2799 if (mem_cgroup_disabled()) 2652 if (mem_cgroup_disabled())
2800 return 0; 2653 return 0;
2801 /* 2654 VM_BUG_ON(page_mapped(page));
2802 * If already mapped, we don't have to account. 2655 VM_BUG_ON(page->mapping && !PageAnon(page));
2803 * If page cache, page->mapping has address_space. 2656 VM_BUG_ON(!mm);
2804 * But page->mapping may have out-of-use anon_vma pointer,
2805 * detecit it by PageAnon() check. newly-mapped-anon's page->mapping
2806 * is NULL.
2807 */
2808 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2809 return 0;
2810 if (unlikely(!mm))
2811 mm = &init_mm;
2812 return mem_cgroup_charge_common(page, mm, gfp_mask, 2657 return mem_cgroup_charge_common(page, mm, gfp_mask,
2813 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2658 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2814} 2659}
2815 2660
2816static void 2661static void
@@ -2822,14 +2667,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2822 enum charge_type ctype) 2667 enum charge_type ctype)
2823{ 2668{
2824 struct page_cgroup *pc = lookup_page_cgroup(page); 2669 struct page_cgroup *pc = lookup_page_cgroup(page);
2670 struct zone *zone = page_zone(page);
2671 unsigned long flags;
2672 bool removed = false;
2673
2825 /* 2674 /*
2826 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page 2675 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2827 * is already on LRU. It means the page may on some other page_cgroup's 2676 * is already on LRU. It means the page may on some other page_cgroup's
2828 * LRU. Take care of it. 2677 * LRU. Take care of it.
2829 */ 2678 */
2830 mem_cgroup_lru_del_before_commit(page); 2679 spin_lock_irqsave(&zone->lru_lock, flags);
2680 if (PageLRU(page)) {
2681 del_page_from_lru_list(zone, page, page_lru(page));
2682 ClearPageLRU(page);
2683 removed = true;
2684 }
2831 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); 2685 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2832 mem_cgroup_lru_add_after_commit(page); 2686 if (removed) {
2687 add_page_to_lru_list(zone, page, page_lru(page));
2688 SetPageLRU(page);
2689 }
2690 spin_unlock_irqrestore(&zone->lru_lock, flags);
2833 return; 2691 return;
2834} 2692}
2835 2693
@@ -2837,6 +2695,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2837 gfp_t gfp_mask) 2695 gfp_t gfp_mask)
2838{ 2696{
2839 struct mem_cgroup *memcg = NULL; 2697 struct mem_cgroup *memcg = NULL;
2698 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2840 int ret; 2699 int ret;
2841 2700
2842 if (mem_cgroup_disabled()) 2701 if (mem_cgroup_disabled())
@@ -2846,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2846 2705
2847 if (unlikely(!mm)) 2706 if (unlikely(!mm))
2848 mm = &init_mm; 2707 mm = &init_mm;
2708 if (!page_is_file_cache(page))
2709 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2849 2710
2850 if (page_is_file_cache(page)) { 2711 if (!PageSwapCache(page))
2851 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); 2712 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2852 if (ret || !memcg) 2713 else { /* page is swapcache/shmem */
2853 return ret;
2854
2855 /*
2856 * FUSE reuses pages without going through the final
2857 * put that would remove them from the LRU list, make
2858 * sure that they get relinked properly.
2859 */
2860 __mem_cgroup_commit_charge_lrucare(page, memcg,
2861 MEM_CGROUP_CHARGE_TYPE_CACHE);
2862 return ret;
2863 }
2864 /* shmem */
2865 if (PageSwapCache(page)) {
2866 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); 2714 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2867 if (!ret) 2715 if (!ret)
2868 __mem_cgroup_commit_charge_swapin(page, memcg, 2716 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2869 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2717 }
2870 } else
2871 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2872 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2873
2874 return ret; 2718 return ret;
2875} 2719}
2876 2720
@@ -2882,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2882 */ 2726 */
2883int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2727int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2884 struct page *page, 2728 struct page *page,
2885 gfp_t mask, struct mem_cgroup **ptr) 2729 gfp_t mask, struct mem_cgroup **memcgp)
2886{ 2730{
2887 struct mem_cgroup *memcg; 2731 struct mem_cgroup *memcg;
2888 int ret; 2732 int ret;
2889 2733
2890 *ptr = NULL; 2734 *memcgp = NULL;
2891 2735
2892 if (mem_cgroup_disabled()) 2736 if (mem_cgroup_disabled())
2893 return 0; 2737 return 0;
@@ -2905,27 +2749,32 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2905 memcg = try_get_mem_cgroup_from_page(page); 2749 memcg = try_get_mem_cgroup_from_page(page);
2906 if (!memcg) 2750 if (!memcg)
2907 goto charge_cur_mm; 2751 goto charge_cur_mm;
2908 *ptr = memcg; 2752 *memcgp = memcg;
2909 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2753 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
2910 css_put(&memcg->css); 2754 css_put(&memcg->css);
2755 if (ret == -EINTR)
2756 ret = 0;
2911 return ret; 2757 return ret;
2912charge_cur_mm: 2758charge_cur_mm:
2913 if (unlikely(!mm)) 2759 if (unlikely(!mm))
2914 mm = &init_mm; 2760 mm = &init_mm;
2915 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2761 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2762 if (ret == -EINTR)
2763 ret = 0;
2764 return ret;
2916} 2765}
2917 2766
2918static void 2767static void
2919__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2768__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2920 enum charge_type ctype) 2769 enum charge_type ctype)
2921{ 2770{
2922 if (mem_cgroup_disabled()) 2771 if (mem_cgroup_disabled())
2923 return; 2772 return;
2924 if (!ptr) 2773 if (!memcg)
2925 return; 2774 return;
2926 cgroup_exclude_rmdir(&ptr->css); 2775 cgroup_exclude_rmdir(&memcg->css);
2927 2776
2928 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2777 __mem_cgroup_commit_charge_lrucare(page, memcg, ctype);
2929 /* 2778 /*
2930 * Now swap is on-memory. This means this page may be 2779 * Now swap is on-memory. This means this page may be
2931 * counted both as mem and swap....double count. 2780 * counted both as mem and swap....double count.
@@ -2935,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2935 */ 2784 */
2936 if (do_swap_account && PageSwapCache(page)) { 2785 if (do_swap_account && PageSwapCache(page)) {
2937 swp_entry_t ent = {.val = page_private(page)}; 2786 swp_entry_t ent = {.val = page_private(page)};
2787 struct mem_cgroup *swap_memcg;
2938 unsigned short id; 2788 unsigned short id;
2939 struct mem_cgroup *memcg;
2940 2789
2941 id = swap_cgroup_record(ent, 0); 2790 id = swap_cgroup_record(ent, 0);
2942 rcu_read_lock(); 2791 rcu_read_lock();
2943 memcg = mem_cgroup_lookup(id); 2792 swap_memcg = mem_cgroup_lookup(id);
2944 if (memcg) { 2793 if (swap_memcg) {
2945 /* 2794 /*
2946 * This recorded memcg can be obsolete one. So, avoid 2795 * This recorded memcg can be obsolete one. So, avoid
2947 * calling css_tryget 2796 * calling css_tryget
2948 */ 2797 */
2949 if (!mem_cgroup_is_root(memcg)) 2798 if (!mem_cgroup_is_root(swap_memcg))
2950 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2799 res_counter_uncharge(&swap_memcg->memsw,
2951 mem_cgroup_swap_statistics(memcg, false); 2800 PAGE_SIZE);
2952 mem_cgroup_put(memcg); 2801 mem_cgroup_swap_statistics(swap_memcg, false);
2802 mem_cgroup_put(swap_memcg);
2953 } 2803 }
2954 rcu_read_unlock(); 2804 rcu_read_unlock();
2955 } 2805 }
@@ -2958,13 +2808,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2958 * So, rmdir()->pre_destroy() can be called while we do this charge. 2808 * So, rmdir()->pre_destroy() can be called while we do this charge.
2959 * In that case, we need to call pre_destroy() again. check it here. 2809 * In that case, we need to call pre_destroy() again. check it here.
2960 */ 2810 */
2961 cgroup_release_and_wakeup_rmdir(&ptr->css); 2811 cgroup_release_and_wakeup_rmdir(&memcg->css);
2962} 2812}
2963 2813
2964void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2814void mem_cgroup_commit_charge_swapin(struct page *page,
2815 struct mem_cgroup *memcg)
2965{ 2816{
2966 __mem_cgroup_commit_charge_swapin(page, ptr, 2817 __mem_cgroup_commit_charge_swapin(page, memcg,
2967 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2818 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2968} 2819}
2969 2820
2970void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2821void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
@@ -3054,7 +2905,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3054 * Check if our page_cgroup is valid 2905 * Check if our page_cgroup is valid
3055 */ 2906 */
3056 pc = lookup_page_cgroup(page); 2907 pc = lookup_page_cgroup(page);
3057 if (unlikely(!pc || !PageCgroupUsed(pc))) 2908 if (unlikely(!PageCgroupUsed(pc)))
3058 return NULL; 2909 return NULL;
3059 2910
3060 lock_page_cgroup(pc); 2911 lock_page_cgroup(pc);
@@ -3117,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page)
3117 /* early check. */ 2968 /* early check. */
3118 if (page_mapped(page)) 2969 if (page_mapped(page))
3119 return; 2970 return;
3120 if (page->mapping && !PageAnon(page)) 2971 VM_BUG_ON(page->mapping && !PageAnon(page));
3121 return;
3122 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2972 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3123} 2973}
3124 2974
@@ -3176,6 +3026,23 @@ void mem_cgroup_uncharge_end(void)
3176 batch->memcg = NULL; 3026 batch->memcg = NULL;
3177} 3027}
3178 3028
3029/*
3030 * A function for resetting pc->mem_cgroup for newly allocated pages.
3031 * This function should be called if the newpage will be added to LRU
3032 * before start accounting.
3033 */
3034void mem_cgroup_reset_owner(struct page *newpage)
3035{
3036 struct page_cgroup *pc;
3037
3038 if (mem_cgroup_disabled())
3039 return;
3040
3041 pc = lookup_page_cgroup(newpage);
3042 VM_BUG_ON(PageCgroupUsed(pc));
3043 pc->mem_cgroup = root_mem_cgroup;
3044}
3045
3179#ifdef CONFIG_SWAP 3046#ifdef CONFIG_SWAP
3180/* 3047/*
3181 * called after __delete_from_swap_cache() and drop "page" account. 3048 * called after __delete_from_swap_cache() and drop "page" account.
@@ -3293,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3293 * page belongs to. 3160 * page belongs to.
3294 */ 3161 */
3295int mem_cgroup_prepare_migration(struct page *page, 3162int mem_cgroup_prepare_migration(struct page *page,
3296 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3163 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask)
3297{ 3164{
3298 struct mem_cgroup *memcg = NULL; 3165 struct mem_cgroup *memcg = NULL;
3299 struct page_cgroup *pc; 3166 struct page_cgroup *pc;
3300 enum charge_type ctype; 3167 enum charge_type ctype;
3301 int ret = 0; 3168 int ret = 0;
3302 3169
3303 *ptr = NULL; 3170 *memcgp = NULL;
3304 3171
3305 VM_BUG_ON(PageTransHuge(page)); 3172 VM_BUG_ON(PageTransHuge(page));
3306 if (mem_cgroup_disabled()) 3173 if (mem_cgroup_disabled())
@@ -3351,10 +3218,10 @@ int mem_cgroup_prepare_migration(struct page *page,
3351 if (!memcg) 3218 if (!memcg)
3352 return 0; 3219 return 0;
3353 3220
3354 *ptr = memcg; 3221 *memcgp = memcg;
3355 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3222 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3356 css_put(&memcg->css);/* drop extra refcnt */ 3223 css_put(&memcg->css);/* drop extra refcnt */
3357 if (ret || *ptr == NULL) { 3224 if (ret) {
3358 if (PageAnon(page)) { 3225 if (PageAnon(page)) {
3359 lock_page_cgroup(pc); 3226 lock_page_cgroup(pc);
3360 ClearPageCgroupMigration(pc); 3227 ClearPageCgroupMigration(pc);
@@ -3364,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page,
3364 */ 3231 */
3365 mem_cgroup_uncharge_page(page); 3232 mem_cgroup_uncharge_page(page);
3366 } 3233 }
3234 /* we'll need to revisit this error code (we have -EINTR) */
3367 return -ENOMEM; 3235 return -ENOMEM;
3368 } 3236 }
3369 /* 3237 /*
@@ -3432,12 +3300,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3432 cgroup_release_and_wakeup_rmdir(&memcg->css); 3300 cgroup_release_and_wakeup_rmdir(&memcg->css);
3433} 3301}
3434 3302
3303/*
3304 * At replace page cache, newpage is not under any memcg but it's on
3305 * LRU. So, this function doesn't touch res_counter but handles LRU
3306 * in correct way. Both pages are locked so we cannot race with uncharge.
3307 */
3308void mem_cgroup_replace_page_cache(struct page *oldpage,
3309 struct page *newpage)
3310{
3311 struct mem_cgroup *memcg;
3312 struct page_cgroup *pc;
3313 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3314
3315 if (mem_cgroup_disabled())
3316 return;
3317
3318 pc = lookup_page_cgroup(oldpage);
3319 /* fix accounting on old pages */
3320 lock_page_cgroup(pc);
3321 memcg = pc->mem_cgroup;
3322 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
3323 ClearPageCgroupUsed(pc);
3324 unlock_page_cgroup(pc);
3325
3326 if (PageSwapBacked(oldpage))
3327 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3328
3329 /*
3330 * Even if newpage->mapping was NULL before starting replacement,
3331 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3332 * LRU while we overwrite pc->mem_cgroup.
3333 */
3334 __mem_cgroup_commit_charge_lrucare(newpage, memcg, type);
3335}
3336
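
The new helper's comment defines its contract: both pages locked, no res_counter accounting, only the LRU/ownership side is corrected. A sketch of the kind of call site it implies, presumably the page-cache replacement path in mm/filemap.c; the surrounding radix-tree switch is elided and the wrapper name is made up for illustration:

/* Sketch of the expected caller: the cache slot has already been switched
 * from 'oldpage' to 'newpage'; only memcg ownership still needs to move. */
static void replace_cache_page_sketch(struct page *oldpage, struct page *newpage)
{
	VM_BUG_ON(!PageLocked(oldpage));
	VM_BUG_ON(!PageLocked(newpage));

	mem_cgroup_replace_page_cache(oldpage, newpage);	/* no res_counter traffic */
}
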
3435#ifdef CONFIG_DEBUG_VM 3337#ifdef CONFIG_DEBUG_VM
3436static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3338static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3437{ 3339{
3438 struct page_cgroup *pc; 3340 struct page_cgroup *pc;
3439 3341
3440 pc = lookup_page_cgroup(page); 3342 pc = lookup_page_cgroup(page);
3343 /*
3344 * Can be NULL while feeding pages into the page allocator for
3345 * the first time, i.e. during boot or memory hotplug;
3346 * or when mem_cgroup_disabled().
3347 */
3441 if (likely(pc) && PageCgroupUsed(pc)) 3348 if (likely(pc) && PageCgroupUsed(pc))
3442 return pc; 3349 return pc;
3443 return NULL; 3350 return NULL;
@@ -3457,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page)
3457 3364
3458 pc = lookup_page_cgroup_used(page); 3365 pc = lookup_page_cgroup_used(page);
3459 if (pc) { 3366 if (pc) {
3460 int ret = -1; 3367 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3461 char *path;
3462
3463 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3464 pc, pc->flags, pc->mem_cgroup); 3368 pc, pc->flags, pc->mem_cgroup);
3465
3466 path = kmalloc(PATH_MAX, GFP_KERNEL);
3467 if (path) {
3468 rcu_read_lock();
3469 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3470 path, PATH_MAX);
3471 rcu_read_unlock();
3472 }
3473
3474 printk(KERN_CONT "(%s)\n",
3475 (ret < 0) ? "cannot get the path" : path);
3476 kfree(path);
3477 } 3369 }
3478} 3370}
3479#endif 3371#endif
@@ -3534,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3534 if (!ret) 3426 if (!ret)
3535 break; 3427 break;
3536 3428
3537 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3429 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3538 MEM_CGROUP_RECLAIM_SHRINK, 3430 MEM_CGROUP_RECLAIM_SHRINK);
3539 NULL);
3540 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3431 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3541 /* Usage is reduced ? */ 3432 /* Usage is reduced ? */
3542 if (curusage >= oldusage) 3433 if (curusage >= oldusage)
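
With the soft-reclaim flag gone, the resize path simply calls mem_cgroup_reclaim() with MEM_CGROUP_RECLAIM_SHRINK inside its retry loop. A condensed sketch of that control flow, with the retry bound (MEMCG_RESIZE_RETRIES here) and the early-exit details assumed rather than copied from the function:

/* Condensed sketch of the limit-shrinking loop; MEMCG_RESIZE_RETRIES is illustrative. */
static int shrink_limit_sketch(struct mem_cgroup *memcg, unsigned long long new_limit)
{
	unsigned long long before, after;
	int retries = MEMCG_RESIZE_RETRIES;

	while (retries--) {
		if (!res_counter_set_limit(&memcg->res, new_limit))
			return 0;			/* usage already fits the new limit */

		before = res_counter_read_u64(&memcg->res, RES_USAGE);
		mem_cgroup_reclaim(memcg, GFP_KERNEL, MEM_CGROUP_RECLAIM_SHRINK);
		after = res_counter_read_u64(&memcg->res, RES_USAGE);

		if (after >= before)			/* reclaim made no progress */
			break;
	}
	return -EBUSY;
}
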
@@ -3594,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3594 if (!ret) 3485 if (!ret)
3595 break; 3486 break;
3596 3487
3597 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3488 mem_cgroup_reclaim(memcg, GFP_KERNEL,
3598 MEM_CGROUP_RECLAIM_NOSWAP | 3489 MEM_CGROUP_RECLAIM_NOSWAP |
3599 MEM_CGROUP_RECLAIM_SHRINK, 3490 MEM_CGROUP_RECLAIM_SHRINK);
3600 NULL);
3601 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3491 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3602 /* Usage is reduced ? */ 3492 /* Usage is reduced ? */
3603 if (curusage >= oldusage) 3493 if (curusage >= oldusage)
@@ -3640,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3640 break; 3530 break;
3641 3531
3642 nr_scanned = 0; 3532 nr_scanned = 0;
3643 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3533 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone,
3644 gfp_mask, 3534 gfp_mask, &nr_scanned);
3645 MEM_CGROUP_RECLAIM_SOFT,
3646 &nr_scanned);
3647 nr_reclaimed += reclaimed; 3535 nr_reclaimed += reclaimed;
3648 *total_scanned += nr_scanned; 3536 *total_scanned += nr_scanned;
3649 spin_lock(&mctz->lock); 3537 spin_lock(&mctz->lock);
@@ -3711,22 +3599,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3711static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3599static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3712 int node, int zid, enum lru_list lru) 3600 int node, int zid, enum lru_list lru)
3713{ 3601{
3714 struct zone *zone;
3715 struct mem_cgroup_per_zone *mz; 3602 struct mem_cgroup_per_zone *mz;
3716 struct page_cgroup *pc, *busy;
3717 unsigned long flags, loop; 3603 unsigned long flags, loop;
3718 struct list_head *list; 3604 struct list_head *list;
3605 struct page *busy;
3606 struct zone *zone;
3719 int ret = 0; 3607 int ret = 0;
3720 3608
3721 zone = &NODE_DATA(node)->node_zones[zid]; 3609 zone = &NODE_DATA(node)->node_zones[zid];
3722 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3610 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3723 list = &mz->lists[lru]; 3611 list = &mz->lruvec.lists[lru];
3724 3612
3725 loop = MEM_CGROUP_ZSTAT(mz, lru); 3613 loop = MEM_CGROUP_ZSTAT(mz, lru);
3726 /* give some margin against EBUSY etc...*/ 3614 /* give some margin against EBUSY etc...*/
3727 loop += 256; 3615 loop += 256;
3728 busy = NULL; 3616 busy = NULL;
3729 while (loop--) { 3617 while (loop--) {
3618 struct page_cgroup *pc;
3730 struct page *page; 3619 struct page *page;
3731 3620
3732 ret = 0; 3621 ret = 0;
@@ -3735,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3735 spin_unlock_irqrestore(&zone->lru_lock, flags); 3624 spin_unlock_irqrestore(&zone->lru_lock, flags);
3736 break; 3625 break;
3737 } 3626 }
3738 pc = list_entry(list->prev, struct page_cgroup, lru); 3627 page = list_entry(list->prev, struct page, lru);
3739 if (busy == pc) { 3628 if (busy == page) {
3740 list_move(&pc->lru, list); 3629 list_move(&page->lru, list);
3741 busy = NULL; 3630 busy = NULL;
3742 spin_unlock_irqrestore(&zone->lru_lock, flags); 3631 spin_unlock_irqrestore(&zone->lru_lock, flags);
3743 continue; 3632 continue;
3744 } 3633 }
3745 spin_unlock_irqrestore(&zone->lru_lock, flags); 3634 spin_unlock_irqrestore(&zone->lru_lock, flags);
3746 3635
3747 page = lookup_cgroup_page(pc); 3636 pc = lookup_page_cgroup(page);
3748 3637
3749 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3638 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3750 if (ret == -ENOMEM) 3639 if (ret == -ENOMEM || ret == -EINTR)
3751 break; 3640 break;
3752 3641
3753 if (ret == -EBUSY || ret == -EINVAL) { 3642 if (ret == -EBUSY || ret == -EINVAL) {
3754 /* found lock contention or "pc" is obsolete. */ 3643 /* found lock contention or "pc" is obsolete. */
3755 busy = pc; 3644 busy = page;
3756 cond_resched(); 3645 cond_resched();
3757 } else 3646 } else
3758 busy = NULL; 3647 busy = NULL;
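
The force-empty walk now starts from struct page on mz->lruvec.lists[] and derives the page_cgroup from the page, rather than pulling pages out of a page_cgroup list, and it also gives up on -EINTR. One iteration of that pattern, stripped of the zone->lru_lock handling and the retry margin, purely as an illustration:

/* Illustration of one pass over the tail of an lruvec list (locking omitted). */
static int move_tail_page_to_parent(struct mem_cgroup *memcg, struct list_head *list)
{
	struct page *page = list_entry(list->prev, struct page, lru);
	struct page_cgroup *pc = lookup_page_cgroup(page);	/* page -> pc, not pc -> page */
	int ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);

	if (ret == -EBUSY || ret == -EINVAL)
		list_move(&page->lru, list);	/* park the contended page at the head */
	return ret;	/* -ENOMEM and now -EINTR terminate the caller's loop */
}
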
@@ -4846,7 +4735,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4846 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4735 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4847 mz = &pn->zoneinfo[zone]; 4736 mz = &pn->zoneinfo[zone];
4848 for_each_lru(l) 4737 for_each_lru(l)
4849 INIT_LIST_HEAD(&mz->lists[l]); 4738 INIT_LIST_HEAD(&mz->lruvec.lists[l]);
4850 mz->usage_in_excess = 0; 4739 mz->usage_in_excess = 0;
4851 mz->on_tree = false; 4740 mz->on_tree = false;
4852 mz->mem = memcg; 4741 mz->mem = memcg;
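
Initialization now reaches the per-LRU list heads through the embedded lruvec, matching the walk in the force-empty hunk above. The shape those accesses imply, written out as a hedged sketch; the real struct lruvec lives in the mm headers, this is only the layout inferred from the mz->lruvec.lists[] usage:

/* Inferred shape only: an embedded set of list heads, one per LRU type. */
struct lruvec_sketch {
	struct list_head lists[NR_LRU_LISTS];
};

static void lruvec_sketch_init(struct lruvec_sketch *lruvec)
{
	enum lru_list l;

	for_each_lru(l)
		INIT_LIST_HEAD(&lruvec->lists[l]);
}
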
@@ -4906,7 +4795,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4906 mem_cgroup_remove_from_trees(memcg); 4795 mem_cgroup_remove_from_trees(memcg);
4907 free_css_id(&mem_cgroup_subsys, &memcg->css); 4796 free_css_id(&mem_cgroup_subsys, &memcg->css);
4908 4797
4909 for_each_node_state(node, N_POSSIBLE) 4798 for_each_node(node)
4910 free_mem_cgroup_per_zone_info(memcg, node); 4799 free_mem_cgroup_per_zone_info(memcg, node);
4911 4800
4912 free_percpu(memcg->stat); 4801 free_percpu(memcg->stat);
@@ -4965,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void)
4965 struct mem_cgroup_tree_per_zone *rtpz; 4854 struct mem_cgroup_tree_per_zone *rtpz;
4966 int tmp, node, zone; 4855 int tmp, node, zone;
4967 4856
4968 for_each_node_state(node, N_POSSIBLE) { 4857 for_each_node(node) {
4969 tmp = node; 4858 tmp = node;
4970 if (!node_state(node, N_NORMAL_MEMORY)) 4859 if (!node_state(node, N_NORMAL_MEMORY))
4971 tmp = -1; 4860 tmp = -1;
4972 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4861 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4973 if (!rtpn) 4862 if (!rtpn)
4974 return 1; 4863 goto err_cleanup;
4975 4864
4976 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4865 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4977 4866
@@ -4982,6 +4871,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
4982 } 4871 }
4983 } 4872 }
4984 return 0; 4873 return 0;
4874
4875err_cleanup:
4876 for_each_node(node) {
4877 if (!soft_limit_tree.rb_tree_per_node[node])
4878 break;
4879 kfree(soft_limit_tree.rb_tree_per_node[node]);
4880 soft_limit_tree.rb_tree_per_node[node] = NULL;
4881 }
4882 return 1;
4883
4985} 4884}
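
The init function previously returned 1 on a failed allocation and leaked every per-node root allocated before it; the new err_cleanup label unwinds instead. The same allocate-all-or-free-everything idiom in one piece, with the NUMA-aware kzalloc_node(), spinlock, and rb-root setup of the real function elided:

/* Reduced sketch of the rollback pattern used by the err_cleanup path above. */
static int soft_limit_tree_alloc_sketch(void)
{
	int node;

	for_each_node(node) {
		soft_limit_tree.rb_tree_per_node[node] =
			kzalloc(sizeof(*soft_limit_tree.rb_tree_per_node[node]),
				GFP_KERNEL);
		if (!soft_limit_tree.rb_tree_per_node[node])
			goto err_cleanup;
	}
	return 0;

err_cleanup:
	for_each_node(node) {
		if (!soft_limit_tree.rb_tree_per_node[node])
			break;
		kfree(soft_limit_tree.rb_tree_per_node[node]);
		soft_limit_tree.rb_tree_per_node[node] = NULL;
	}
	return 1;
}
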
4986 4885
4987static struct cgroup_subsys_state * __ref 4886static struct cgroup_subsys_state * __ref
@@ -4995,7 +4894,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4995 if (!memcg) 4894 if (!memcg)
4996 return ERR_PTR(error); 4895 return ERR_PTR(error);
4997 4896
4998 for_each_node_state(node, N_POSSIBLE) 4897 for_each_node(node)
4999 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 4898 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5000 goto free_out; 4899 goto free_out;
5001 4900
@@ -5033,7 +4932,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5033 res_counter_init(&memcg->res, NULL); 4932 res_counter_init(&memcg->res, NULL);
5034 res_counter_init(&memcg->memsw, NULL); 4933 res_counter_init(&memcg->memsw, NULL);
5035 } 4934 }
5036 memcg->last_scanned_child = 0;
5037 memcg->last_scanned_node = MAX_NUMNODES; 4935 memcg->last_scanned_node = MAX_NUMNODES;
5038 INIT_LIST_HEAD(&memcg->oom_notify); 4936 INIT_LIST_HEAD(&memcg->oom_notify);
5039 4937
@@ -5129,9 +5027,9 @@ one_by_one:
5129 } 5027 }
5130 ret = __mem_cgroup_try_charge(NULL, 5028 ret = __mem_cgroup_try_charge(NULL,
5131 GFP_KERNEL, 1, &memcg, false); 5029 GFP_KERNEL, 1, &memcg, false);
5132 if (ret || !memcg) 5030 if (ret)
5133 /* mem_cgroup_clear_mc() will do uncharge later */ 5031 /* mem_cgroup_clear_mc() will do uncharge later */
5134 return -ENOMEM; 5032 return ret;
5135 mc.precharge++; 5033 mc.precharge++;
5136 } 5034 }
5137 return ret; 5035 return ret;
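
Precharge failures used to be flattened to -ENOMEM; now the raw return value of __mem_cgroup_try_charge() is handed back, which matters once that path can return -EINTR (see the error-code note in the migration hunk earlier). A hypothetical caller loop showing why the distinction is worth keeping:

/* Hypothetical loop: -EINTR (fatal signal) and -ENOMEM both stop charging,
 * but the caller can now tell an aborted move from real memory pressure. */
static int precharge_sketch(struct mem_cgroup **memcgp, unsigned long count)
{
	int ret = 0;

	while (count--) {
		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, memcgp, false);
		if (ret)
			break;
	}
	return ret;	/* 0, -EINTR, or a genuine charge failure such as -ENOMEM */
}
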
@@ -5276,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5276 } 5174 }
5277 /* There is a swap entry and a page doesn't exist or isn't charged */ 5175 /* There is a swap entry and a page doesn't exist or isn't charged */
5278 if (ent.val && !ret && 5176 if (ent.val && !ret &&
5279 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5177 css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
5280 ret = MC_TARGET_SWAP; 5178 ret = MC_TARGET_SWAP;
5281 if (target) 5179 if (target)
5282 target->ent = ent; 5180 target->ent = ent;