Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 1102 |
1 file changed, 500 insertions, 602 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d87aa3510c5e..602207be9853 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -123,16 +123,22 @@ struct mem_cgroup_stat_cpu { | |||
123 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 123 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
124 | }; | 124 | }; |
125 | 125 | ||
126 | struct mem_cgroup_reclaim_iter { | ||
127 | /* css_id of the last scanned hierarchy member */ | ||
128 | int position; | ||
129 | /* scan generation, increased every round-trip */ | ||
130 | unsigned int generation; | ||
131 | }; | ||
132 | |||
126 | /* | 133 | /* |
127 | * per-zone information in memory controller. | 134 | * per-zone information in memory controller. |
128 | */ | 135 | */ |
129 | struct mem_cgroup_per_zone { | 136 | struct mem_cgroup_per_zone { |
130 | /* | 137 | struct lruvec lruvec; |
131 | * spin_lock to protect the per cgroup LRU | ||
132 | */ | ||
133 | struct list_head lists[NR_LRU_LISTS]; | ||
134 | unsigned long count[NR_LRU_LISTS]; | 138 | unsigned long count[NR_LRU_LISTS]; |
135 | 139 | ||
140 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | ||
141 | |||
136 | struct zone_reclaim_stat reclaim_stat; | 142 | struct zone_reclaim_stat reclaim_stat; |
137 | struct rb_node tree_node; /* RB tree node */ | 143 | struct rb_node tree_node; /* RB tree node */ |
138 | unsigned long long usage_in_excess;/* Set to the value by which */ | 144 | unsigned long long usage_in_excess;/* Set to the value by which */ |
@@ -233,11 +239,6 @@ struct mem_cgroup { | |||
233 | * per zone LRU lists. | 239 | * per zone LRU lists. |
234 | */ | 240 | */ |
235 | struct mem_cgroup_lru_info info; | 241 | struct mem_cgroup_lru_info info; |
236 | /* | ||
237 | * While reclaiming in a hierarchy, we cache the last child we | ||
238 | * reclaimed from. | ||
239 | */ | ||
240 | int last_scanned_child; | ||
241 | int last_scanned_node; | 242 | int last_scanned_node; |
242 | #if MAX_NUMNODES > 1 | 243 | #if MAX_NUMNODES > 1 |
243 | nodemask_t scan_nodes; | 244 | nodemask_t scan_nodes; |
@@ -366,8 +367,6 @@ enum charge_type { | |||
366 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | 367 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) |
367 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | 368 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 |
368 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | 369 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) |
369 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | ||
370 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | ||
371 | 370 | ||
372 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 371 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
373 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 372 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
@@ -566,7 +565,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | |||
566 | struct mem_cgroup_per_zone *mz; | 565 | struct mem_cgroup_per_zone *mz; |
567 | struct mem_cgroup_tree_per_zone *mctz; | 566 | struct mem_cgroup_tree_per_zone *mctz; |
568 | 567 | ||
569 | for_each_node_state(node, N_POSSIBLE) { | 568 | for_each_node(node) { |
570 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 569 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
571 | mz = mem_cgroup_zoneinfo(memcg, node, zone); | 570 | mz = mem_cgroup_zoneinfo(memcg, node, zone); |
572 | mctz = soft_limit_tree_node_zone(node, zone); | 571 | mctz = soft_limit_tree_node_zone(node, zone); |
@@ -656,16 +655,6 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
656 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 655 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
657 | } | 656 | } |
658 | 657 | ||
659 | void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) | ||
660 | { | ||
661 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); | ||
662 | } | ||
663 | |||
664 | void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) | ||
665 | { | ||
666 | this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); | ||
667 | } | ||
668 | |||
669 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 658 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
670 | enum mem_cgroup_events_index idx) | 659 | enum mem_cgroup_events_index idx) |
671 | { | 660 | { |
@@ -749,37 +738,32 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | |||
749 | return total; | 738 | return total; |
750 | } | 739 | } |
751 | 740 | ||
752 | static bool __memcg_event_check(struct mem_cgroup *memcg, int target) | 741 | static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, |
742 | enum mem_cgroup_events_target target) | ||
753 | { | 743 | { |
754 | unsigned long val, next; | 744 | unsigned long val, next; |
755 | 745 | ||
756 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 746 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); |
757 | next = __this_cpu_read(memcg->stat->targets[target]); | 747 | next = __this_cpu_read(memcg->stat->targets[target]); |
758 | /* from time_after() in jiffies.h */ | 748 | /* from time_after() in jiffies.h */ |
759 | return ((long)next - (long)val < 0); | 749 | if ((long)next - (long)val < 0) { |
760 | } | 750 | switch (target) { |
761 | 751 | case MEM_CGROUP_TARGET_THRESH: | |
762 | static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) | 752 | next = val + THRESHOLDS_EVENTS_TARGET; |
763 | { | 753 | break; |
764 | unsigned long val, next; | 754 | case MEM_CGROUP_TARGET_SOFTLIMIT: |
765 | 755 | next = val + SOFTLIMIT_EVENTS_TARGET; | |
766 | val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); | 756 | break; |
767 | 757 | case MEM_CGROUP_TARGET_NUMAINFO: | |
768 | switch (target) { | 758 | next = val + NUMAINFO_EVENTS_TARGET; |
769 | case MEM_CGROUP_TARGET_THRESH: | 759 | break; |
770 | next = val + THRESHOLDS_EVENTS_TARGET; | 760 | default: |
771 | break; | 761 | break; |
772 | case MEM_CGROUP_TARGET_SOFTLIMIT: | 762 | } |
773 | next = val + SOFTLIMIT_EVENTS_TARGET; | 763 | __this_cpu_write(memcg->stat->targets[target], next); |
774 | break; | 764 | return true; |
775 | case MEM_CGROUP_TARGET_NUMAINFO: | ||
776 | next = val + NUMAINFO_EVENTS_TARGET; | ||
777 | break; | ||
778 | default: | ||
779 | return; | ||
780 | } | 765 | } |
781 | 766 | return false; | |
782 | __this_cpu_write(memcg->stat->targets[target], next); | ||
783 | } | 767 | } |
784 | 768 | ||
785 | /* | 769 | /* |
@@ -790,25 +774,27 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
790 | { | 774 | { |
791 | preempt_disable(); | 775 | preempt_disable(); |
792 | /* threshold event is triggered in finer grain than soft limit */ | 776 | /* threshold event is triggered in finer grain than soft limit */ |
793 | if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { | 777 | if (unlikely(mem_cgroup_event_ratelimit(memcg, |
778 | MEM_CGROUP_TARGET_THRESH))) { | ||
779 | bool do_softlimit, do_numainfo; | ||
780 | |||
781 | do_softlimit = mem_cgroup_event_ratelimit(memcg, | ||
782 | MEM_CGROUP_TARGET_SOFTLIMIT); | ||
783 | #if MAX_NUMNODES > 1 | ||
784 | do_numainfo = mem_cgroup_event_ratelimit(memcg, | ||
785 | MEM_CGROUP_TARGET_NUMAINFO); | ||
786 | #endif | ||
787 | preempt_enable(); | ||
788 | |||
794 | mem_cgroup_threshold(memcg); | 789 | mem_cgroup_threshold(memcg); |
795 | __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); | 790 | if (unlikely(do_softlimit)) |
796 | if (unlikely(__memcg_event_check(memcg, | ||
797 | MEM_CGROUP_TARGET_SOFTLIMIT))) { | ||
798 | mem_cgroup_update_tree(memcg, page); | 791 | mem_cgroup_update_tree(memcg, page); |
799 | __mem_cgroup_target_update(memcg, | ||
800 | MEM_CGROUP_TARGET_SOFTLIMIT); | ||
801 | } | ||
802 | #if MAX_NUMNODES > 1 | 792 | #if MAX_NUMNODES > 1 |
803 | if (unlikely(__memcg_event_check(memcg, | 793 | if (unlikely(do_numainfo)) |
804 | MEM_CGROUP_TARGET_NUMAINFO))) { | ||
805 | atomic_inc(&memcg->numainfo_events); | 794 | atomic_inc(&memcg->numainfo_events); |
806 | __mem_cgroup_target_update(memcg, | ||
807 | MEM_CGROUP_TARGET_NUMAINFO); | ||
808 | } | ||
809 | #endif | 795 | #endif |
810 | } | 796 | } else |
811 | preempt_enable(); | 797 | preempt_enable(); |
812 | } | 798 | } |
813 | 799 | ||
814 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 800 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
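The hunk above folds the old __memcg_event_check()/__mem_cgroup_target_update() pair into a single mem_cgroup_event_ratelimit() helper: the per-cpu event counter is compared against a per-cpu target with a time_after()-style test, and once it has passed, the target is advanced by the per-target step and the caller runs the corresponding handler. A minimal standalone sketch of that pattern follows; the names and step values (event_ratelimit, steps, 128, 1024) are illustrative, not kernel code.

enum target { TARGET_THRESH, TARGET_SOFTLIMIT, NR_TARGETS };

static unsigned long events;                       /* monotonic event counter */
static unsigned long targets[NR_TARGETS];          /* next firing point per target */
static const unsigned long steps[NR_TARGETS] = { 128, 1024 };

static int event_ratelimit(enum target t)
{
        /* same wrap-safe comparison as time_after() in jiffies.h */
        if ((long)targets[t] - (long)events < 0) {
                targets[t] = events + steps[t];
                return 1;                          /* due: run the handler */
        }
        return 0;
}

static void check_events(void)
{
        events++;
        if (event_ratelimit(TARGET_THRESH)) {
                int do_softlimit = event_ratelimit(TARGET_SOFTLIMIT);
                /* threshold handler runs here */
                if (do_softlimit) {
                        /* softlimit handler runs here */
                }
        }
}

The cheaper threshold target gates the others, which is why memcg_check_events() above only samples the softlimit and numainfo targets once the threshold target has fired.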
@@ -853,83 +839,116 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
853 | return memcg; | 839 | return memcg; |
854 | } | 840 | } |
855 | 841 | ||
856 | /* The caller has to guarantee "mem" exists before calling this */ | 842 | /** |
857 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) | 843 | * mem_cgroup_iter - iterate over memory cgroup hierarchy |
844 | * @root: hierarchy root | ||
845 | * @prev: previously returned memcg, NULL on first invocation | ||
846 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | ||
847 | * | ||
848 | * Returns references to children of the hierarchy below @root, or | ||
849 | * @root itself, or %NULL after a full round-trip. | ||
850 | * | ||
851 | * Caller must pass the return value in @prev on subsequent | ||
852 | * invocations for reference counting, or use mem_cgroup_iter_break() | ||
853 | * to cancel a hierarchy walk before the round-trip is complete. | ||
854 | * | ||
855 | * Reclaimers can specify a zone and a priority level in @reclaim to | ||
856 | * divide up the memcgs in the hierarchy among all concurrent | ||
857 | * reclaimers operating on the same zone and priority. | ||
858 | */ | ||
859 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | ||
860 | struct mem_cgroup *prev, | ||
861 | struct mem_cgroup_reclaim_cookie *reclaim) | ||
858 | { | 862 | { |
859 | struct cgroup_subsys_state *css; | 863 | struct mem_cgroup *memcg = NULL; |
860 | int found; | 864 | int id = 0; |
861 | 865 | ||
862 | if (!memcg) /* ROOT cgroup has the smallest ID */ | 866 | if (mem_cgroup_disabled()) |
863 | return root_mem_cgroup; /*css_put/get against root is ignored*/ | ||
864 | if (!memcg->use_hierarchy) { | ||
865 | if (css_tryget(&memcg->css)) | ||
866 | return memcg; | ||
867 | return NULL; | 867 | return NULL; |
868 | } | ||
869 | rcu_read_lock(); | ||
870 | /* | ||
871 | * searching a memory cgroup which has the smallest ID under given | ||
872 | * ROOT cgroup. (ID >= 1) | ||
873 | */ | ||
874 | css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); | ||
875 | if (css && css_tryget(css)) | ||
876 | memcg = container_of(css, struct mem_cgroup, css); | ||
877 | else | ||
878 | memcg = NULL; | ||
879 | rcu_read_unlock(); | ||
880 | return memcg; | ||
881 | } | ||
882 | 868 | ||
883 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | 869 | if (!root) |
884 | struct mem_cgroup *root, | 870 | root = root_mem_cgroup; |
885 | bool cond) | ||
886 | { | ||
887 | int nextid = css_id(&iter->css) + 1; | ||
888 | int found; | ||
889 | int hierarchy_used; | ||
890 | struct cgroup_subsys_state *css; | ||
891 | 871 | ||
892 | hierarchy_used = iter->use_hierarchy; | 872 | if (prev && !reclaim) |
873 | id = css_id(&prev->css); | ||
893 | 874 | ||
894 | css_put(&iter->css); | 875 | if (prev && prev != root) |
895 | /* If no ROOT, walk all, ignore hierarchy */ | 876 | css_put(&prev->css); |
896 | if (!cond || (root && !hierarchy_used)) | ||
897 | return NULL; | ||
898 | 877 | ||
899 | if (!root) | 878 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
900 | root = root_mem_cgroup; | 879 | if (prev) |
880 | return NULL; | ||
881 | return root; | ||
882 | } | ||
901 | 883 | ||
902 | do { | 884 | while (!memcg) { |
903 | iter = NULL; | 885 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); |
904 | rcu_read_lock(); | 886 | struct cgroup_subsys_state *css; |
887 | |||
888 | if (reclaim) { | ||
889 | int nid = zone_to_nid(reclaim->zone); | ||
890 | int zid = zone_idx(reclaim->zone); | ||
891 | struct mem_cgroup_per_zone *mz; | ||
905 | 892 | ||
906 | css = css_get_next(&mem_cgroup_subsys, nextid, | 893 | mz = mem_cgroup_zoneinfo(root, nid, zid); |
907 | &root->css, &found); | 894 | iter = &mz->reclaim_iter[reclaim->priority]; |
908 | if (css && css_tryget(css)) | 895 | if (prev && reclaim->generation != iter->generation) |
909 | iter = container_of(css, struct mem_cgroup, css); | 896 | return NULL; |
897 | id = iter->position; | ||
898 | } | ||
899 | |||
900 | rcu_read_lock(); | ||
901 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | ||
902 | if (css) { | ||
903 | if (css == &root->css || css_tryget(css)) | ||
904 | memcg = container_of(css, | ||
905 | struct mem_cgroup, css); | ||
906 | } else | ||
907 | id = 0; | ||
910 | rcu_read_unlock(); | 908 | rcu_read_unlock(); |
911 | /* If css is NULL, no more cgroups will be found */ | ||
912 | nextid = found + 1; | ||
913 | } while (css && !iter); | ||
914 | 909 | ||
915 | return iter; | 910 | if (reclaim) { |
911 | iter->position = id; | ||
912 | if (!css) | ||
913 | iter->generation++; | ||
914 | else if (!prev && memcg) | ||
915 | reclaim->generation = iter->generation; | ||
916 | } | ||
917 | |||
918 | if (prev && !css) | ||
919 | return NULL; | ||
920 | } | ||
921 | return memcg; | ||
916 | } | 922 | } |
917 | /* | ||
918 | * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please | ||
919 | * be careful that "break" loop is not allowed. We have reference count. | ||
920 | * Instead of that modify "cond" to be false and "continue" to exit the loop. | ||
921 | */ | ||
922 | #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ | ||
923 | for (iter = mem_cgroup_start_loop(root);\ | ||
924 | iter != NULL;\ | ||
925 | iter = mem_cgroup_get_next(iter, root, cond)) | ||
926 | 923 | ||
927 | #define for_each_mem_cgroup_tree(iter, root) \ | 924 | /** |
928 | for_each_mem_cgroup_tree_cond(iter, root, true) | 925 | * mem_cgroup_iter_break - abort a hierarchy walk prematurely |
926 | * @root: hierarchy root | ||
927 | * @prev: last visited hierarchy member as returned by mem_cgroup_iter() | ||
928 | */ | ||
929 | void mem_cgroup_iter_break(struct mem_cgroup *root, | ||
930 | struct mem_cgroup *prev) | ||
931 | { | ||
932 | if (!root) | ||
933 | root = root_mem_cgroup; | ||
934 | if (prev && prev != root) | ||
935 | css_put(&prev->css); | ||
936 | } | ||
929 | 937 | ||
930 | #define for_each_mem_cgroup_all(iter) \ | 938 | /* |
931 | for_each_mem_cgroup_tree_cond(iter, NULL, true) | 939 | * Iteration constructs for visiting all cgroups (under a tree). If |
940 | * loops are exited prematurely (break), mem_cgroup_iter_break() must | ||
941 | * be used for reference counting. | ||
942 | */ | ||
943 | #define for_each_mem_cgroup_tree(iter, root) \ | ||
944 | for (iter = mem_cgroup_iter(root, NULL, NULL); \ | ||
945 | iter != NULL; \ | ||
946 | iter = mem_cgroup_iter(root, iter, NULL)) | ||
932 | 947 | ||
948 | #define for_each_mem_cgroup(iter) \ | ||
949 | for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ | ||
950 | iter != NULL; \ | ||
951 | iter = mem_cgroup_iter(NULL, iter, NULL)) | ||
933 | 952 | ||
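The kernel-doc above gives the contract; here is a minimal sketch of the two ways the new iterator is meant to be driven. The first is a shared reclaim walk keyed by (zone, priority) through a cookie and aborted early with mem_cgroup_iter_break(); the second is the full walk that the for_each_mem_cgroup_tree() macro wraps. reclaim_from(), done_enough() and update_stat() are placeholders for per-memcg work, not functions from this patch.

static void shared_reclaim_walk(struct mem_cgroup *root,
                                struct zone *zone, int priority)
{
        struct mem_cgroup_reclaim_cookie reclaim = {
                .zone = zone,
                .priority = priority,
        };
        struct mem_cgroup *memcg;

        memcg = mem_cgroup_iter(root, NULL, &reclaim);
        do {
                reclaim_from(memcg, zone);              /* placeholder */
                if (done_enough()) {
                        /* bailing out early: drop the iterator's reference */
                        mem_cgroup_iter_break(root, memcg);
                        break;
                }
                memcg = mem_cgroup_iter(root, memcg, &reclaim);
        } while (memcg);
}

static void full_walk(struct mem_cgroup *root)
{
        struct mem_cgroup *iter;

        for_each_mem_cgroup_tree(iter, root)
                update_stat(iter);                      /* placeholder */
}

Because the cookie records the generation on the first call, a concurrent reclaimer that has already completed the round-trip makes the shared walk return NULL early instead of rescanning the same groups.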
934 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 953 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
935 | { | 954 | { |
@@ -949,11 +968,11 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | |||
949 | goto out; | 968 | goto out; |
950 | 969 | ||
951 | switch (idx) { | 970 | switch (idx) { |
952 | case PGMAJFAULT: | ||
953 | mem_cgroup_pgmajfault(memcg, 1); | ||
954 | break; | ||
955 | case PGFAULT: | 971 | case PGFAULT: |
956 | mem_cgroup_pgfault(memcg, 1); | 972 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); |
973 | break; | ||
974 | case PGMAJFAULT: | ||
975 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); | ||
957 | break; | 976 | break; |
958 | default: | 977 | default: |
959 | BUG(); | 978 | BUG(); |
@@ -963,6 +982,27 @@ out: | |||
963 | } | 982 | } |
964 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); | 983 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); |
965 | 984 | ||
985 | /** | ||
986 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | ||
987 | * @zone: zone of the wanted lruvec | ||
988 | * @mem: memcg of the wanted lruvec | ||
989 | * | ||
990 | * Returns the lru list vector holding pages for the given @zone and | ||
991 | * @mem. This can be the global zone lruvec, if the memory controller | ||
992 | * is disabled. | ||
993 | */ | ||
994 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, | ||
995 | struct mem_cgroup *memcg) | ||
996 | { | ||
997 | struct mem_cgroup_per_zone *mz; | ||
998 | |||
999 | if (mem_cgroup_disabled()) | ||
1000 | return &zone->lruvec; | ||
1001 | |||
1002 | mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); | ||
1003 | return &mz->lruvec; | ||
1004 | } | ||
1005 | |||
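A short sketch of the intended use of the new accessor: instead of reaching into zone or per-memcg lists directly, callers ask for the lruvec of a (zone, memcg) pair and index its lists. The zone and memcg variables and the LRU_INACTIVE_ANON choice are just for illustration.

struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
struct list_head *list = &lruvec->lists[LRU_INACTIVE_ANON];

/* walk or splice "list" under zone->lru_lock as before */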
966 | /* | 1006 | /* |
967 | * Following LRU functions are allowed to be used without PCG_LOCK. | 1007 | * Following LRU functions are allowed to be used without PCG_LOCK. |
968 | * Operations are called by routine of global LRU independently from memcg. | 1008 | * Operations are called by routine of global LRU independently from memcg. |
@@ -977,180 +1017,91 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event); | |||
977 | * When moving account, the page is not on LRU. It's isolated. | 1017 | * When moving account, the page is not on LRU. It's isolated. |
978 | */ | 1018 | */ |
979 | 1019 | ||
980 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 1020 | /** |
981 | { | 1021 | * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec |
982 | struct page_cgroup *pc; | 1022 | * @zone: zone of the page |
983 | struct mem_cgroup_per_zone *mz; | 1023 | * @page: the page |
984 | 1024 | * @lru: current lru | |
985 | if (mem_cgroup_disabled()) | 1025 | * |
986 | return; | 1026 | * This function accounts for @page being added to @lru, and returns |
987 | pc = lookup_page_cgroup(page); | 1027 | * the lruvec for the given @zone and the memcg @page is charged to. |
988 | /* can happen while we handle swapcache. */ | 1028 | * |
989 | if (!TestClearPageCgroupAcctLRU(pc)) | 1029 | * The callsite is then responsible for physically linking the page to |
990 | return; | 1030 | * the returned lruvec->lists[@lru]. |
991 | VM_BUG_ON(!pc->mem_cgroup); | ||
992 | /* | ||
993 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | ||
994 | * removed from global LRU. | ||
995 | */ | ||
996 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
997 | /* huge page split is done under lru_lock. so, we have no races. */ | ||
998 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | ||
999 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1000 | return; | ||
1001 | VM_BUG_ON(list_empty(&pc->lru)); | ||
1002 | list_del_init(&pc->lru); | ||
1003 | } | ||
1004 | |||
1005 | void mem_cgroup_del_lru(struct page *page) | ||
1006 | { | ||
1007 | mem_cgroup_del_lru_list(page, page_lru(page)); | ||
1008 | } | ||
1009 | |||
1010 | /* | ||
1011 | * Writeback is about to end against a page which has been marked for immediate | ||
1012 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | ||
1013 | * inactive list. | ||
1014 | */ | 1031 | */ |
1015 | void mem_cgroup_rotate_reclaimable_page(struct page *page) | 1032 | struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, |
1033 | enum lru_list lru) | ||
1016 | { | 1034 | { |
1017 | struct mem_cgroup_per_zone *mz; | 1035 | struct mem_cgroup_per_zone *mz; |
1036 | struct mem_cgroup *memcg; | ||
1018 | struct page_cgroup *pc; | 1037 | struct page_cgroup *pc; |
1019 | enum lru_list lru = page_lru(page); | ||
1020 | 1038 | ||
1021 | if (mem_cgroup_disabled()) | 1039 | if (mem_cgroup_disabled()) |
1022 | return; | 1040 | return &zone->lruvec; |
1023 | 1041 | ||
1024 | pc = lookup_page_cgroup(page); | 1042 | pc = lookup_page_cgroup(page); |
1025 | /* unused or root page is not rotated. */ | 1043 | memcg = pc->mem_cgroup; |
1026 | if (!PageCgroupUsed(pc)) | 1044 | mz = page_cgroup_zoneinfo(memcg, page); |
1027 | return; | 1045 | /* compound_order() is stabilized through lru_lock */ |
1028 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1046 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); |
1029 | smp_rmb(); | 1047 | return &mz->lruvec; |
1030 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1031 | return; | ||
1032 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1033 | list_move_tail(&pc->lru, &mz->lists[lru]); | ||
1034 | } | 1048 | } |
1035 | 1049 | ||
1036 | void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | 1050 | /** |
1051 | * mem_cgroup_lru_del_list - account for removing an lru page | ||
1052 | * @page: the page | ||
1053 | * @lru: target lru | ||
1054 | * | ||
1055 | * This function accounts for @page being removed from @lru. | ||
1056 | * | ||
1057 | * The callsite is then responsible for physically unlinking | ||
1058 | * @page->lru. | ||
1059 | */ | ||
1060 | void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | ||
1037 | { | 1061 | { |
1038 | struct mem_cgroup_per_zone *mz; | 1062 | struct mem_cgroup_per_zone *mz; |
1063 | struct mem_cgroup *memcg; | ||
1039 | struct page_cgroup *pc; | 1064 | struct page_cgroup *pc; |
1040 | 1065 | ||
1041 | if (mem_cgroup_disabled()) | 1066 | if (mem_cgroup_disabled()) |
1042 | return; | 1067 | return; |
1043 | 1068 | ||
1044 | pc = lookup_page_cgroup(page); | 1069 | pc = lookup_page_cgroup(page); |
1045 | /* unused or root page is not rotated. */ | 1070 | memcg = pc->mem_cgroup; |
1046 | if (!PageCgroupUsed(pc)) | 1071 | VM_BUG_ON(!memcg); |
1047 | return; | 1072 | mz = page_cgroup_zoneinfo(memcg, page); |
1048 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
1049 | smp_rmb(); | ||
1050 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1051 | return; | ||
1052 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1053 | list_move(&pc->lru, &mz->lists[lru]); | ||
1054 | } | ||
1055 | |||
1056 | void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | ||
1057 | { | ||
1058 | struct page_cgroup *pc; | ||
1059 | struct mem_cgroup_per_zone *mz; | ||
1060 | |||
1061 | if (mem_cgroup_disabled()) | ||
1062 | return; | ||
1063 | pc = lookup_page_cgroup(page); | ||
1064 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
1065 | /* | ||
1066 | * putback: charge: | ||
1067 | * SetPageLRU SetPageCgroupUsed | ||
1068 | * smp_mb smp_mb | ||
1069 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1070 | * | ||
1071 | * Ensure that one of the two sides adds the page to the memcg | ||
1072 | * LRU during a race. | ||
1073 | */ | ||
1074 | smp_mb(); | ||
1075 | if (!PageCgroupUsed(pc)) | ||
1076 | return; | ||
1077 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | ||
1078 | smp_rmb(); | ||
1079 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | ||
1080 | /* huge page split is done under lru_lock. so, we have no races. */ | 1073 | /* huge page split is done under lru_lock. so, we have no races. */ |
1081 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 1074 | VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); |
1082 | SetPageCgroupAcctLRU(pc); | 1075 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); |
1083 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
1084 | return; | ||
1085 | list_add(&pc->lru, &mz->lists[lru]); | ||
1086 | } | ||
1087 | |||
1088 | /* | ||
1089 | * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed | ||
1090 | * while it's linked to lru because the page may be reused after it's fully | ||
1091 | * uncharged. To handle that, unlink page_cgroup from LRU when charge it again. | ||
1092 | * It's done under lock_page and expected that zone->lru_lock isnever held. | ||
1093 | */ | ||
1094 | static void mem_cgroup_lru_del_before_commit(struct page *page) | ||
1095 | { | ||
1096 | unsigned long flags; | ||
1097 | struct zone *zone = page_zone(page); | ||
1098 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
1099 | |||
1100 | /* | ||
1101 | * Doing this check without taking ->lru_lock seems wrong but this | ||
1102 | * is safe. Because if page_cgroup's USED bit is unset, the page | ||
1103 | * will not be added to any memcg's LRU. If page_cgroup's USED bit is | ||
1104 | * set, the commit after this will fail, anyway. | ||
1105 | * This all charge/uncharge is done under some mutual execustion. | ||
1106 | * So, we don't need to taking care of changes in USED bit. | ||
1107 | */ | ||
1108 | if (likely(!PageLRU(page))) | ||
1109 | return; | ||
1110 | |||
1111 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
1112 | /* | ||
1113 | * Forget old LRU when this page_cgroup is *not* used. This Used bit | ||
1114 | * is guarded by lock_page() because the page is SwapCache. | ||
1115 | */ | ||
1116 | if (!PageCgroupUsed(pc)) | ||
1117 | mem_cgroup_del_lru_list(page, page_lru(page)); | ||
1118 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1119 | } | 1076 | } |
1120 | 1077 | ||
1121 | static void mem_cgroup_lru_add_after_commit(struct page *page) | 1078 | void mem_cgroup_lru_del(struct page *page) |
1122 | { | 1079 | { |
1123 | unsigned long flags; | 1080 | mem_cgroup_lru_del_list(page, page_lru(page)); |
1124 | struct zone *zone = page_zone(page); | ||
1125 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
1126 | /* | ||
1127 | * putback: charge: | ||
1128 | * SetPageLRU SetPageCgroupUsed | ||
1129 | * smp_mb smp_mb | ||
1130 | * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU | ||
1131 | * | ||
1132 | * Ensure that one of the two sides adds the page to the memcg | ||
1133 | * LRU during a race. | ||
1134 | */ | ||
1135 | smp_mb(); | ||
1136 | /* taking care of that the page is added to LRU while we commit it */ | ||
1137 | if (likely(!PageLRU(page))) | ||
1138 | return; | ||
1139 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
1140 | /* link when the page is linked to LRU but page_cgroup isn't */ | ||
1141 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) | ||
1142 | mem_cgroup_add_lru_list(page, page_lru(page)); | ||
1143 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
1144 | } | 1081 | } |
1145 | 1082 | ||
1146 | 1083 | /** | |
1147 | void mem_cgroup_move_lists(struct page *page, | 1084 | * mem_cgroup_lru_move_lists - account for moving a page between lrus |
1148 | enum lru_list from, enum lru_list to) | 1085 | * @zone: zone of the page |
1086 | * @page: the page | ||
1087 | * @from: current lru | ||
1088 | * @to: target lru | ||
1089 | * | ||
1090 | * This function accounts for @page being moved between the lrus @from | ||
1091 | * and @to, and returns the lruvec for the given @zone and the memcg | ||
1092 | * @page is charged to. | ||
1093 | * | ||
1094 | * The callsite is then responsible for physically relinking | ||
1095 | * @page->lru to the returned lruvec->lists[@to]. | ||
1096 | */ | ||
1097 | struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, | ||
1098 | struct page *page, | ||
1099 | enum lru_list from, | ||
1100 | enum lru_list to) | ||
1149 | { | 1101 | { |
1150 | if (mem_cgroup_disabled()) | 1102 | /* XXX: Optimize this, especially for @from == @to */ |
1151 | return; | 1103 | mem_cgroup_lru_del_list(page, from); |
1152 | mem_cgroup_del_lru_list(page, from); | 1104 | return mem_cgroup_lru_add_list(zone, page, to); |
1153 | mem_cgroup_add_lru_list(page, to); | ||
1154 | } | 1105 | } |
1155 | 1106 | ||
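As the kernel-doc above spells out, the memcg hooks now only do the accounting and hand back the list vector; the physical list manipulation stays with the callsite. A sketch of what the add/remove/move pattern looks like under this split, roughly what the generic LRU helpers do after this change; zone, page, lru, from and to stand for the caller's context, and zone->lru_lock is assumed held.

struct lruvec *lruvec;

/* add: account first, then link the page into the returned list */
lruvec = mem_cgroup_lru_add_list(zone, page, lru);
list_add(&page->lru, &lruvec->lists[lru]);

/* remove: unaccount, then unlink */
mem_cgroup_lru_del_list(page, lru);
list_del(&page->lru);

/* move between lrus: one call does del+add and returns the target list */
lruvec = mem_cgroup_lru_move_lists(zone, page, from, to);
list_move(&page->lru, &lruvec->lists[to]);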
1156 | /* | 1107 | /* |
@@ -1175,10 +1126,21 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | |||
1175 | struct task_struct *p; | 1126 | struct task_struct *p; |
1176 | 1127 | ||
1177 | p = find_lock_task_mm(task); | 1128 | p = find_lock_task_mm(task); |
1178 | if (!p) | 1129 | if (p) { |
1179 | return 0; | 1130 | curr = try_get_mem_cgroup_from_mm(p->mm); |
1180 | curr = try_get_mem_cgroup_from_mm(p->mm); | 1131 | task_unlock(p); |
1181 | task_unlock(p); | 1132 | } else { |
1133 | /* | ||
1134 | * All threads may have already detached their mm's, but the oom | ||
1135 | * killer still needs to detect if they have already been oom | ||
1136 | * killed to prevent needlessly killing additional tasks. | ||
1137 | */ | ||
1138 | task_lock(task); | ||
1139 | curr = mem_cgroup_from_task(task); | ||
1140 | if (curr) | ||
1141 | css_get(&curr->css); | ||
1142 | task_unlock(task); | ||
1143 | } | ||
1182 | if (!curr) | 1144 | if (!curr) |
1183 | return 0; | 1145 | return 0; |
1184 | /* | 1146 | /* |
@@ -1258,68 +1220,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1258 | return &mz->reclaim_stat; | 1220 | return &mz->reclaim_stat; |
1259 | } | 1221 | } |
1260 | 1222 | ||
1261 | unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | ||
1262 | struct list_head *dst, | ||
1263 | unsigned long *scanned, int order, | ||
1264 | isolate_mode_t mode, | ||
1265 | struct zone *z, | ||
1266 | struct mem_cgroup *mem_cont, | ||
1267 | int active, int file) | ||
1268 | { | ||
1269 | unsigned long nr_taken = 0; | ||
1270 | struct page *page; | ||
1271 | unsigned long scan; | ||
1272 | LIST_HEAD(pc_list); | ||
1273 | struct list_head *src; | ||
1274 | struct page_cgroup *pc, *tmp; | ||
1275 | int nid = zone_to_nid(z); | ||
1276 | int zid = zone_idx(z); | ||
1277 | struct mem_cgroup_per_zone *mz; | ||
1278 | int lru = LRU_FILE * file + active; | ||
1279 | int ret; | ||
1280 | |||
1281 | BUG_ON(!mem_cont); | ||
1282 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | ||
1283 | src = &mz->lists[lru]; | ||
1284 | |||
1285 | scan = 0; | ||
1286 | list_for_each_entry_safe_reverse(pc, tmp, src, lru) { | ||
1287 | if (scan >= nr_to_scan) | ||
1288 | break; | ||
1289 | |||
1290 | if (unlikely(!PageCgroupUsed(pc))) | ||
1291 | continue; | ||
1292 | |||
1293 | page = lookup_cgroup_page(pc); | ||
1294 | |||
1295 | if (unlikely(!PageLRU(page))) | ||
1296 | continue; | ||
1297 | |||
1298 | scan++; | ||
1299 | ret = __isolate_lru_page(page, mode, file); | ||
1300 | switch (ret) { | ||
1301 | case 0: | ||
1302 | list_move(&page->lru, dst); | ||
1303 | mem_cgroup_del_lru(page); | ||
1304 | nr_taken += hpage_nr_pages(page); | ||
1305 | break; | ||
1306 | case -EBUSY: | ||
1307 | /* we don't affect global LRU but rotate in our LRU */ | ||
1308 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | ||
1309 | break; | ||
1310 | default: | ||
1311 | break; | ||
1312 | } | ||
1313 | } | ||
1314 | |||
1315 | *scanned = scan; | ||
1316 | |||
1317 | trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, | ||
1318 | 0, 0, 0, mode); | ||
1319 | |||
1320 | return nr_taken; | ||
1321 | } | ||
1322 | |||
1323 | #define mem_cgroup_from_res_counter(counter, member) \ | 1223 | #define mem_cgroup_from_res_counter(counter, member) \ |
1324 | container_of(counter, struct mem_cgroup, member) | 1224 | container_of(counter, struct mem_cgroup, member) |
1325 | 1225 | ||
@@ -1536,41 +1436,40 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1536 | return min(limit, memsw); | 1436 | return min(limit, memsw); |
1537 | } | 1437 | } |
1538 | 1438 | ||
1539 | /* | 1439 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, |
1540 | * Visit the first child (need not be the first child as per the ordering | 1440 | gfp_t gfp_mask, |
1541 | * of the cgroup list, since we track last_scanned_child) of @mem and use | 1441 | unsigned long flags) |
1542 | * that to reclaim free pages from. | ||
1543 | */ | ||
1544 | static struct mem_cgroup * | ||
1545 | mem_cgroup_select_victim(struct mem_cgroup *root_memcg) | ||
1546 | { | 1442 | { |
1547 | struct mem_cgroup *ret = NULL; | 1443 | unsigned long total = 0; |
1548 | struct cgroup_subsys_state *css; | 1444 | bool noswap = false; |
1549 | int nextid, found; | 1445 | int loop; |
1550 | |||
1551 | if (!root_memcg->use_hierarchy) { | ||
1552 | css_get(&root_memcg->css); | ||
1553 | ret = root_memcg; | ||
1554 | } | ||
1555 | 1446 | ||
1556 | while (!ret) { | 1447 | if (flags & MEM_CGROUP_RECLAIM_NOSWAP) |
1557 | rcu_read_lock(); | 1448 | noswap = true; |
1558 | nextid = root_memcg->last_scanned_child + 1; | 1449 | if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) |
1559 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, | 1450 | noswap = true; |
1560 | &found); | ||
1561 | if (css && css_tryget(css)) | ||
1562 | ret = container_of(css, struct mem_cgroup, css); | ||
1563 | 1451 | ||
1564 | rcu_read_unlock(); | 1452 | for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { |
1565 | /* Updates scanning parameter */ | 1453 | if (loop) |
1566 | if (!css) { | 1454 | drain_all_stock_async(memcg); |
1567 | /* this means start scan from ID:1 */ | 1455 | total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); |
1568 | root_memcg->last_scanned_child = 0; | 1456 | /* |
1569 | } else | 1457 | * Allow limit shrinkers, which are triggered directly |
1570 | root_memcg->last_scanned_child = found; | 1458 | * by userspace, to catch signals and stop reclaim |
1459 | * after minimal progress, regardless of the margin. | ||
1460 | */ | ||
1461 | if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) | ||
1462 | break; | ||
1463 | if (mem_cgroup_margin(memcg)) | ||
1464 | break; | ||
1465 | /* | ||
1466 | * If nothing was reclaimed after two attempts, there | ||
1467 | * may be no reclaimable pages in this hierarchy. | ||
1468 | */ | ||
1469 | if (loop && !total) | ||
1470 | break; | ||
1571 | } | 1471 | } |
1572 | 1472 | return total; | |
1573 | return ret; | ||
1574 | } | 1473 | } |
1575 | 1474 | ||
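With the hierarchy walk gone from this path, mem_cgroup_reclaim() is a plain target-reclaim loop driven by two flags. A sketch of how the limit-shrinking paths elsewhere in this file are expected to call it; memcg and progress stand in for the caller's variables, and the flag combinations are shown for illustration.

unsigned long progress;

/* userspace lowered memory.limit_in_bytes: honour signals, stop after progress */
progress = mem_cgroup_reclaim(memcg, GFP_KERNEL, MEM_CGROUP_RECLAIM_SHRINK);

/* lowering memsw.limit_in_bytes: swapping out cannot help, so add NOSWAP */
progress = mem_cgroup_reclaim(memcg, GFP_KERNEL,
                              MEM_CGROUP_RECLAIM_NOSWAP | MEM_CGROUP_RECLAIM_SHRINK);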
1576 | /** | 1475 | /** |
@@ -1710,61 +1609,35 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | |||
1710 | } | 1609 | } |
1711 | #endif | 1610 | #endif |
1712 | 1611 | ||
1713 | /* | 1612 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
1714 | * Scan the hierarchy if needed to reclaim memory. We remember the last child | 1613 | struct zone *zone, |
1715 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1614 | gfp_t gfp_mask, |
1716 | * based on its position in the children list. | 1615 | unsigned long *total_scanned) |
1717 | * | 1616 | { |
1718 | * root_memcg is the original ancestor that we've been reclaim from. | 1617 | struct mem_cgroup *victim = NULL; |
1719 | * | 1618 | int total = 0; |
1720 | * We give up and return to the caller when we visit root_memcg twice. | ||
1721 | * (other groups can be removed while we're walking....) | ||
1722 | * | ||
1723 | * If shrink==true, for avoiding to free too much, this returns immedieately. | ||
1724 | */ | ||
1725 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, | ||
1726 | struct zone *zone, | ||
1727 | gfp_t gfp_mask, | ||
1728 | unsigned long reclaim_options, | ||
1729 | unsigned long *total_scanned) | ||
1730 | { | ||
1731 | struct mem_cgroup *victim; | ||
1732 | int ret, total = 0; | ||
1733 | int loop = 0; | 1619 | int loop = 0; |
1734 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | ||
1735 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | ||
1736 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | ||
1737 | unsigned long excess; | 1620 | unsigned long excess; |
1738 | unsigned long nr_scanned; | 1621 | unsigned long nr_scanned; |
1622 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
1623 | .zone = zone, | ||
1624 | .priority = 0, | ||
1625 | }; | ||
1739 | 1626 | ||
1740 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | 1627 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; |
1741 | 1628 | ||
1742 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | ||
1743 | if (!check_soft && !shrink && root_memcg->memsw_is_minimum) | ||
1744 | noswap = true; | ||
1745 | |||
1746 | while (1) { | 1629 | while (1) { |
1747 | victim = mem_cgroup_select_victim(root_memcg); | 1630 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); |
1748 | if (victim == root_memcg) { | 1631 | if (!victim) { |
1749 | loop++; | 1632 | loop++; |
1750 | /* | ||
1751 | * We are not draining per cpu cached charges during | ||
1752 | * soft limit reclaim because global reclaim doesn't | ||
1753 | * care about charges. It tries to free some memory and | ||
1754 | * charges will not give any. | ||
1755 | */ | ||
1756 | if (!check_soft && loop >= 1) | ||
1757 | drain_all_stock_async(root_memcg); | ||
1758 | if (loop >= 2) { | 1633 | if (loop >= 2) { |
1759 | /* | 1634 | /* |
1760 | * If we have not been able to reclaim | 1635 | * If we have not been able to reclaim |
1761 | * anything, it might be because there are | 1636 | * anything, it might be because there are |
1762 | * no reclaimable pages under this hierarchy | 1637 | * no reclaimable pages under this hierarchy |
1763 | */ | 1638 | */ |
1764 | if (!check_soft || !total) { | 1639 | if (!total) |
1765 | css_put(&victim->css); | ||
1766 | break; | 1640 | break; |
1767 | } | ||
1768 | /* | 1641 | /* |
1769 | * We want to do more targeted reclaim. | 1642 | * We want to do more targeted reclaim. |
1770 | * excess >> 2 is not too excessive so as to | 1643 | * excess >> 2 is not too excessive so as to |
@@ -1772,40 +1645,20 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, | |||
1772 | * coming back to reclaim from this cgroup | 1645 | * coming back to reclaim from this cgroup |
1773 | */ | 1646 | */ |
1774 | if (total >= (excess >> 2) || | 1647 | if (total >= (excess >> 2) || |
1775 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | 1648 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) |
1776 | css_put(&victim->css); | ||
1777 | break; | 1649 | break; |
1778 | } | ||
1779 | } | 1650 | } |
1780 | } | ||
1781 | if (!mem_cgroup_reclaimable(victim, noswap)) { | ||
1782 | /* this cgroup's local usage == 0 */ | ||
1783 | css_put(&victim->css); | ||
1784 | continue; | 1651 | continue; |
1785 | } | 1652 | } |
1786 | /* we use swappiness of local cgroup */ | 1653 | if (!mem_cgroup_reclaimable(victim, false)) |
1787 | if (check_soft) { | 1654 | continue; |
1788 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, | 1655 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, |
1789 | noswap, zone, &nr_scanned); | 1656 | zone, &nr_scanned); |
1790 | *total_scanned += nr_scanned; | 1657 | *total_scanned += nr_scanned; |
1791 | } else | 1658 | if (!res_counter_soft_limit_excess(&root_memcg->res)) |
1792 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | 1659 | break; |
1793 | noswap); | ||
1794 | css_put(&victim->css); | ||
1795 | /* | ||
1796 | * At shrinking usage, we can't check we should stop here or | ||
1797 | * reclaim more. It's depends on callers. last_scanned_child | ||
1798 | * will work enough for keeping fairness under tree. | ||
1799 | */ | ||
1800 | if (shrink) | ||
1801 | return ret; | ||
1802 | total += ret; | ||
1803 | if (check_soft) { | ||
1804 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | ||
1805 | return total; | ||
1806 | } else if (mem_cgroup_margin(root_memcg)) | ||
1807 | return total; | ||
1808 | } | 1660 | } |
1661 | mem_cgroup_iter_break(root_memcg, victim); | ||
1809 | return total; | 1662 | return total; |
1810 | } | 1663 | } |
1811 | 1664 | ||
@@ -1817,16 +1670,16 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, | |||
1817 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | 1670 | static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) |
1818 | { | 1671 | { |
1819 | struct mem_cgroup *iter, *failed = NULL; | 1672 | struct mem_cgroup *iter, *failed = NULL; |
1820 | bool cond = true; | ||
1821 | 1673 | ||
1822 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { | 1674 | for_each_mem_cgroup_tree(iter, memcg) { |
1823 | if (iter->oom_lock) { | 1675 | if (iter->oom_lock) { |
1824 | /* | 1676 | /* |
1825 | * this subtree of our hierarchy is already locked | 1677 | * this subtree of our hierarchy is already locked |
1826 | * so we cannot give a lock. | 1678 | * so we cannot give a lock. |
1827 | */ | 1679 | */ |
1828 | failed = iter; | 1680 | failed = iter; |
1829 | cond = false; | 1681 | mem_cgroup_iter_break(memcg, iter); |
1682 | break; | ||
1830 | } else | 1683 | } else |
1831 | iter->oom_lock = true; | 1684 | iter->oom_lock = true; |
1832 | } | 1685 | } |
@@ -1838,11 +1691,10 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) | |||
1838 | * OK, we failed to lock the whole subtree so we have to clean up | 1691 | * OK, we failed to lock the whole subtree so we have to clean up |
1839 | * what we set up to the failing subtree | 1692 | * what we set up to the failing subtree |
1840 | */ | 1693 | */ |
1841 | cond = true; | 1694 | for_each_mem_cgroup_tree(iter, memcg) { |
1842 | for_each_mem_cgroup_tree_cond(iter, memcg, cond) { | ||
1843 | if (iter == failed) { | 1695 | if (iter == failed) { |
1844 | cond = false; | 1696 | mem_cgroup_iter_break(memcg, iter); |
1845 | continue; | 1697 | break; |
1846 | } | 1698 | } |
1847 | iter->oom_lock = false; | 1699 | iter->oom_lock = false; |
1848 | } | 1700 | } |
@@ -2007,7 +1859,7 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
2007 | bool need_unlock = false; | 1859 | bool need_unlock = false; |
2008 | unsigned long uninitialized_var(flags); | 1860 | unsigned long uninitialized_var(flags); |
2009 | 1861 | ||
2010 | if (unlikely(!pc)) | 1862 | if (mem_cgroup_disabled()) |
2011 | return; | 1863 | return; |
2012 | 1864 | ||
2013 | rcu_read_lock(); | 1865 | rcu_read_lock(); |
@@ -2238,7 +2090,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2238 | struct mem_cgroup *iter; | 2090 | struct mem_cgroup *iter; |
2239 | 2091 | ||
2240 | if ((action == CPU_ONLINE)) { | 2092 | if ((action == CPU_ONLINE)) { |
2241 | for_each_mem_cgroup_all(iter) | 2093 | for_each_mem_cgroup(iter) |
2242 | synchronize_mem_cgroup_on_move(iter, cpu); | 2094 | synchronize_mem_cgroup_on_move(iter, cpu); |
2243 | return NOTIFY_OK; | 2095 | return NOTIFY_OK; |
2244 | } | 2096 | } |
@@ -2246,7 +2098,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2246 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) | 2098 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) |
2247 | return NOTIFY_OK; | 2099 | return NOTIFY_OK; |
2248 | 2100 | ||
2249 | for_each_mem_cgroup_all(iter) | 2101 | for_each_mem_cgroup(iter) |
2250 | mem_cgroup_drain_pcp_counter(iter, cpu); | 2102 | mem_cgroup_drain_pcp_counter(iter, cpu); |
2251 | 2103 | ||
2252 | stock = &per_cpu(memcg_stock, cpu); | 2104 | stock = &per_cpu(memcg_stock, cpu); |
@@ -2300,8 +2152,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2300 | if (!(gfp_mask & __GFP_WAIT)) | 2152 | if (!(gfp_mask & __GFP_WAIT)) |
2301 | return CHARGE_WOULDBLOCK; | 2153 | return CHARGE_WOULDBLOCK; |
2302 | 2154 | ||
2303 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, | 2155 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); |
2304 | gfp_mask, flags, NULL); | ||
2305 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2156 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2306 | return CHARGE_RETRY; | 2157 | return CHARGE_RETRY; |
2307 | /* | 2158 | /* |
@@ -2334,8 +2185,25 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2334 | } | 2185 | } |
2335 | 2186 | ||
2336 | /* | 2187 | /* |
2337 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 2188 | * __mem_cgroup_try_charge() does |
2338 | * oom-killer can be invoked. | 2189 | * 1. detect memcg to be charged against from passed *mm and *ptr, |
2190 | * 2. update res_counter | ||
2191 | * 3. call memory reclaim if necessary. | ||
2192 | * | ||
2193 | * In some special case, if the task is dying (fatal_signal_pending() or | ||
2194 | * TIF_MEMDIE set), this function returns -EINTR while writing root_mem_cgroup | ||
2195 | * to *ptr. There are two reasons for this. 1: dying threads should quit as soon | ||
2196 | * as possible without any hazards. 2: all pages should have a valid | ||
2197 | * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg | ||
2198 | * pointer, that is treated as a charge to root_mem_cgroup. | ||
2199 | * | ||
2200 | * So __mem_cgroup_try_charge() will return | ||
2201 | * 0 ... on success, filling *ptr with a valid memcg pointer. | ||
2202 | * -ENOMEM ... charge failure because of resource limits. | ||
2203 | * -EINTR ... if the thread is being killed. *ptr is filled with root_mem_cgroup. | ||
2204 | * | ||
2205 | * Unlike the exported interface, an "oom" parameter is added. if oom==true, | ||
2206 | * the oom-killer can be invoked. | ||
2339 | */ | 2207 | */ |
2340 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 2208 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
2341 | gfp_t gfp_mask, | 2209 | gfp_t gfp_mask, |
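The comment block above defines the new three-way return contract. A sketch of the caller pattern it enables; mm, gfp_mask, nr_pages, oom, page, pc and ctype are taken from the surrounding code, and whether -EINTR is converted to success or simply committed against root_mem_cgroup is up to each call site, as the later hunks show.

struct mem_cgroup *memcg = NULL;
int ret;

ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
if (ret == -ENOMEM)
        return ret;     /* hard failure: over the limit and reclaim did not help */
if (ret == -EINTR)
        ret = 0;        /* dying task: the charge was bypassed to root_mem_cgroup */
/* memcg is valid here (possibly root_mem_cgroup), so committing is safe */
__mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);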
@@ -2364,7 +2232,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2364 | * set, if so charge the init_mm (happens for pagecache usage). | 2232 | * set, if so charge the init_mm (happens for pagecache usage). |
2365 | */ | 2233 | */ |
2366 | if (!*ptr && !mm) | 2234 | if (!*ptr && !mm) |
2367 | goto bypass; | 2235 | *ptr = root_mem_cgroup; |
2368 | again: | 2236 | again: |
2369 | if (*ptr) { /* css should be a valid one */ | 2237 | if (*ptr) { /* css should be a valid one */ |
2370 | memcg = *ptr; | 2238 | memcg = *ptr; |
@@ -2390,7 +2258,9 @@ again: | |||
2390 | * task-struct. So, mm->owner can be NULL. | 2258 | * task-struct. So, mm->owner can be NULL. |
2391 | */ | 2259 | */ |
2392 | memcg = mem_cgroup_from_task(p); | 2260 | memcg = mem_cgroup_from_task(p); |
2393 | if (!memcg || mem_cgroup_is_root(memcg)) { | 2261 | if (!memcg) |
2262 | memcg = root_mem_cgroup; | ||
2263 | if (mem_cgroup_is_root(memcg)) { | ||
2394 | rcu_read_unlock(); | 2264 | rcu_read_unlock(); |
2395 | goto done; | 2265 | goto done; |
2396 | } | 2266 | } |
@@ -2465,8 +2335,8 @@ nomem: | |||
2465 | *ptr = NULL; | 2335 | *ptr = NULL; |
2466 | return -ENOMEM; | 2336 | return -ENOMEM; |
2467 | bypass: | 2337 | bypass: |
2468 | *ptr = NULL; | 2338 | *ptr = root_mem_cgroup; |
2469 | return 0; | 2339 | return -EINTR; |
2470 | } | 2340 | } |
2471 | 2341 | ||
2472 | /* | 2342 | /* |
@@ -2522,7 +2392,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
2522 | memcg = NULL; | 2392 | memcg = NULL; |
2523 | } else if (PageSwapCache(page)) { | 2393 | } else if (PageSwapCache(page)) { |
2524 | ent.val = page_private(page); | 2394 | ent.val = page_private(page); |
2525 | id = lookup_swap_cgroup(ent); | 2395 | id = lookup_swap_cgroup_id(ent); |
2526 | rcu_read_lock(); | 2396 | rcu_read_lock(); |
2527 | memcg = mem_cgroup_lookup(id); | 2397 | memcg = mem_cgroup_lookup(id); |
2528 | if (memcg && !css_tryget(&memcg->css)) | 2398 | if (memcg && !css_tryget(&memcg->css)) |
@@ -2574,6 +2444,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2574 | 2444 | ||
2575 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); | 2445 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); |
2576 | unlock_page_cgroup(pc); | 2446 | unlock_page_cgroup(pc); |
2447 | WARN_ON_ONCE(PageLRU(page)); | ||
2577 | /* | 2448 | /* |
2578 | * "charge_statistics" updated event counter. Then, check it. | 2449 | * "charge_statistics" updated event counter. Then, check it. |
2579 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 2450 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
@@ -2585,44 +2456,29 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2585 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2456 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2586 | 2457 | ||
2587 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | 2458 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ |
2588 | (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) | 2459 | (1 << PCG_MIGRATION)) |
2589 | /* | 2460 | /* |
2590 | * Because tail pages are not marked as "used", set it. We're under | 2461 | * Because tail pages are not marked as "used", set it. We're under |
2591 | * zone->lru_lock, 'splitting on pmd' and compund_lock. | 2462 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
2463 | * charge/uncharge will never happen and move_account() is done under | ||
2464 | * compound_lock(), so we don't have to take care of races. | ||
2592 | */ | 2465 | */ |
2593 | void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) | 2466 | void mem_cgroup_split_huge_fixup(struct page *head) |
2594 | { | 2467 | { |
2595 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | 2468 | struct page_cgroup *head_pc = lookup_page_cgroup(head); |
2596 | struct page_cgroup *tail_pc = lookup_page_cgroup(tail); | 2469 | struct page_cgroup *pc; |
2597 | unsigned long flags; | 2470 | int i; |
2598 | 2471 | ||
2599 | if (mem_cgroup_disabled()) | 2472 | if (mem_cgroup_disabled()) |
2600 | return; | 2473 | return; |
2601 | /* | 2474 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
2602 | * We have no races with charge/uncharge but will have races with | 2475 | pc = head_pc + i; |
2603 | * page state accounting. | 2476 | pc->mem_cgroup = head_pc->mem_cgroup; |
2604 | */ | 2477 | smp_wmb();/* see __commit_charge() */ |
2605 | move_lock_page_cgroup(head_pc, &flags); | 2478 | pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; |
2606 | |||
2607 | tail_pc->mem_cgroup = head_pc->mem_cgroup; | ||
2608 | smp_wmb(); /* see __commit_charge() */ | ||
2609 | if (PageCgroupAcctLRU(head_pc)) { | ||
2610 | enum lru_list lru; | ||
2611 | struct mem_cgroup_per_zone *mz; | ||
2612 | |||
2613 | /* | ||
2614 | * LRU flags cannot be copied because we need to add tail | ||
2615 | *.page to LRU by generic call and our hook will be called. | ||
2616 | * We hold lru_lock, then, reduce counter directly. | ||
2617 | */ | ||
2618 | lru = page_lru(head); | ||
2619 | mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); | ||
2620 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | ||
2621 | } | 2479 | } |
2622 | tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; | ||
2623 | move_unlock_page_cgroup(head_pc, &flags); | ||
2624 | } | 2480 | } |
2625 | #endif | 2481 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
2626 | 2482 | ||
2627 | /** | 2483 | /** |
2628 | * mem_cgroup_move_account - move account of the page | 2484 | * mem_cgroup_move_account - move account of the page |
@@ -2737,7 +2593,7 @@ static int mem_cgroup_move_parent(struct page *page, | |||
2737 | 2593 | ||
2738 | parent = mem_cgroup_from_cont(pcg); | 2594 | parent = mem_cgroup_from_cont(pcg); |
2739 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); | 2595 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); |
2740 | if (ret || !parent) | 2596 | if (ret) |
2741 | goto put_back; | 2597 | goto put_back; |
2742 | 2598 | ||
2743 | if (nr_pages > 1) | 2599 | if (nr_pages > 1) |
@@ -2783,12 +2639,9 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
2783 | } | 2639 | } |
2784 | 2640 | ||
2785 | pc = lookup_page_cgroup(page); | 2641 | pc = lookup_page_cgroup(page); |
2786 | BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ | ||
2787 | |||
2788 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); | 2642 | ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); |
2789 | if (ret || !memcg) | 2643 | if (ret == -ENOMEM) |
2790 | return ret; | 2644 | return ret; |
2791 | |||
2792 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); | 2645 | __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); |
2793 | return 0; | 2646 | return 0; |
2794 | } | 2647 | } |
@@ -2798,19 +2651,11 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2798 | { | 2651 | { |
2799 | if (mem_cgroup_disabled()) | 2652 | if (mem_cgroup_disabled()) |
2800 | return 0; | 2653 | return 0; |
2801 | /* | 2654 | VM_BUG_ON(page_mapped(page)); |
2802 | * If already mapped, we don't have to account. | 2655 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
2803 | * If page cache, page->mapping has address_space. | 2656 | VM_BUG_ON(!mm); |
2804 | * But page->mapping may have out-of-use anon_vma pointer, | ||
2805 | * detecit it by PageAnon() check. newly-mapped-anon's page->mapping | ||
2806 | * is NULL. | ||
2807 | */ | ||
2808 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | ||
2809 | return 0; | ||
2810 | if (unlikely(!mm)) | ||
2811 | mm = &init_mm; | ||
2812 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2657 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2813 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2658 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2814 | } | 2659 | } |
2815 | 2660 | ||
2816 | static void | 2661 | static void |
@@ -2822,14 +2667,27 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, | |||
2822 | enum charge_type ctype) | 2667 | enum charge_type ctype) |
2823 | { | 2668 | { |
2824 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2669 | struct page_cgroup *pc = lookup_page_cgroup(page); |
2670 | struct zone *zone = page_zone(page); | ||
2671 | unsigned long flags; | ||
2672 | bool removed = false; | ||
2673 | |||
2825 | /* | 2674 | /* |
2826 | * In some cases, SwapCache, FUSE(splice_buf->radixtree), the page | 2675 | * In some cases, SwapCache, FUSE(splice_buf->radixtree), the page |
2827 | * is already on LRU. It means the page may be on some other page_cgroup's | 2676 | * is already on LRU. It means the page may be on some other page_cgroup's |
2828 | * LRU. Take care of it. | 2677 | * LRU. Take care of it. |
2829 | */ | 2678 | */ |
2830 | mem_cgroup_lru_del_before_commit(page); | 2679 | spin_lock_irqsave(&zone->lru_lock, flags); |
2680 | if (PageLRU(page)) { | ||
2681 | del_page_from_lru_list(zone, page, page_lru(page)); | ||
2682 | ClearPageLRU(page); | ||
2683 | removed = true; | ||
2684 | } | ||
2831 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); | 2685 | __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); |
2832 | mem_cgroup_lru_add_after_commit(page); | 2686 | if (removed) { |
2687 | add_page_to_lru_list(zone, page, page_lru(page)); | ||
2688 | SetPageLRU(page); | ||
2689 | } | ||
2690 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
2833 | return; | 2691 | return; |
2834 | } | 2692 | } |
2835 | 2693 | ||
@@ -2837,6 +2695,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2837 | gfp_t gfp_mask) | 2695 | gfp_t gfp_mask) |
2838 | { | 2696 | { |
2839 | struct mem_cgroup *memcg = NULL; | 2697 | struct mem_cgroup *memcg = NULL; |
2698 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2840 | int ret; | 2699 | int ret; |
2841 | 2700 | ||
2842 | if (mem_cgroup_disabled()) | 2701 | if (mem_cgroup_disabled()) |
@@ -2846,31 +2705,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2846 | 2705 | ||
2847 | if (unlikely(!mm)) | 2706 | if (unlikely(!mm)) |
2848 | mm = &init_mm; | 2707 | mm = &init_mm; |
2708 | if (!page_is_file_cache(page)) | ||
2709 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2849 | 2710 | ||
2850 | if (page_is_file_cache(page)) { | 2711 | if (!PageSwapCache(page)) |
2851 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); | 2712 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); |
2852 | if (ret || !memcg) | 2713 | else { /* page is swapcache/shmem */ |
2853 | return ret; | ||
2854 | |||
2855 | /* | ||
2856 | * FUSE reuses pages without going through the final | ||
2857 | * put that would remove them from the LRU list, make | ||
2858 | * sure that they get relinked properly. | ||
2859 | */ | ||
2860 | __mem_cgroup_commit_charge_lrucare(page, memcg, | ||
2861 | MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
2862 | return ret; | ||
2863 | } | ||
2864 | /* shmem */ | ||
2865 | if (PageSwapCache(page)) { | ||
2866 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | 2714 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); |
2867 | if (!ret) | 2715 | if (!ret) |
2868 | __mem_cgroup_commit_charge_swapin(page, memcg, | 2716 | __mem_cgroup_commit_charge_swapin(page, memcg, type); |
2869 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | 2717 | } |
2870 | } else | ||
2871 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | ||
2872 | MEM_CGROUP_CHARGE_TYPE_SHMEM); | ||
2873 | |||
2874 | return ret; | 2718 | return ret; |
2875 | } | 2719 | } |
2876 | 2720 | ||
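Likewise, the simplified charge path for page cache, pieced together from the hunk above (an illustrative reconstruction, not an additional change). The FUSE relinking concern described by the removed comment appears to be addressed by the new mem_cgroup_replace_page_cache() added further down.

	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;

	if (!page_is_file_cache(page))
		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;

	if (!PageSwapCache(page))
		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
	else {	/* page is swapcache/shmem */
		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
		if (!ret)
			__mem_cgroup_commit_charge_swapin(page, memcg, type);
	}
	return ret;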
@@ -2882,12 +2726,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2882 | */ | 2726 | */ |
2883 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2727 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2884 | struct page *page, | 2728 | struct page *page, |
2885 | gfp_t mask, struct mem_cgroup **ptr) | 2729 | gfp_t mask, struct mem_cgroup **memcgp) |
2886 | { | 2730 | { |
2887 | struct mem_cgroup *memcg; | 2731 | struct mem_cgroup *memcg; |
2888 | int ret; | 2732 | int ret; |
2889 | 2733 | ||
2890 | *ptr = NULL; | 2734 | *memcgp = NULL; |
2891 | 2735 | ||
2892 | if (mem_cgroup_disabled()) | 2736 | if (mem_cgroup_disabled()) |
2893 | return 0; | 2737 | return 0; |
@@ -2905,27 +2749,32 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2905 | memcg = try_get_mem_cgroup_from_page(page); | 2749 | memcg = try_get_mem_cgroup_from_page(page); |
2906 | if (!memcg) | 2750 | if (!memcg) |
2907 | goto charge_cur_mm; | 2751 | goto charge_cur_mm; |
2908 | *ptr = memcg; | 2752 | *memcgp = memcg; |
2909 | ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); | 2753 | ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); |
2910 | css_put(&memcg->css); | 2754 | css_put(&memcg->css); |
2755 | if (ret == -EINTR) | ||
2756 | ret = 0; | ||
2911 | return ret; | 2757 | return ret; |
2912 | charge_cur_mm: | 2758 | charge_cur_mm: |
2913 | if (unlikely(!mm)) | 2759 | if (unlikely(!mm)) |
2914 | mm = &init_mm; | 2760 | mm = &init_mm; |
2915 | return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); | 2761 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2762 | if (ret == -EINTR) | ||
2763 | ret = 0; | ||
2764 | return ret; | ||
2916 | } | 2765 | } |
2917 | 2766 | ||
2918 | static void | 2767 | static void |
2919 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | 2768 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2920 | enum charge_type ctype) | 2769 | enum charge_type ctype) |
2921 | { | 2770 | { |
2922 | if (mem_cgroup_disabled()) | 2771 | if (mem_cgroup_disabled()) |
2923 | return; | 2772 | return; |
2924 | if (!ptr) | 2773 | if (!memcg) |
2925 | return; | 2774 | return; |
2926 | cgroup_exclude_rmdir(&ptr->css); | 2775 | cgroup_exclude_rmdir(&memcg->css); |
2927 | 2776 | ||
2928 | __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); | 2777 | __mem_cgroup_commit_charge_lrucare(page, memcg, ctype); |
2929 | /* | 2778 | /* |
2930 | * Now swap is on-memory. This means this page may be | 2779 | * Now swap is on-memory. This means this page may be |
2931 | * counted both as mem and swap....double count. | 2780 | * counted both as mem and swap....double count. |
@@ -2935,21 +2784,22 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2935 | */ | 2784 | */ |
2936 | if (do_swap_account && PageSwapCache(page)) { | 2785 | if (do_swap_account && PageSwapCache(page)) { |
2937 | swp_entry_t ent = {.val = page_private(page)}; | 2786 | swp_entry_t ent = {.val = page_private(page)}; |
2787 | struct mem_cgroup *swap_memcg; | ||
2938 | unsigned short id; | 2788 | unsigned short id; |
2939 | struct mem_cgroup *memcg; | ||
2940 | 2789 | ||
2941 | id = swap_cgroup_record(ent, 0); | 2790 | id = swap_cgroup_record(ent, 0); |
2942 | rcu_read_lock(); | 2791 | rcu_read_lock(); |
2943 | memcg = mem_cgroup_lookup(id); | 2792 | swap_memcg = mem_cgroup_lookup(id); |
2944 | if (memcg) { | 2793 | if (swap_memcg) { |
2945 | /* | 2794 | /* |
2946 | * This recorded memcg can be an obsolete one. So, avoid | 2795 | * This recorded memcg can be an obsolete one. So, avoid |
2947 | * calling css_tryget | 2796 | * calling css_tryget |
2948 | */ | 2797 | */ |
2949 | if (!mem_cgroup_is_root(memcg)) | 2798 | if (!mem_cgroup_is_root(swap_memcg)) |
2950 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 2799 | res_counter_uncharge(&swap_memcg->memsw, |
2951 | mem_cgroup_swap_statistics(memcg, false); | 2800 | PAGE_SIZE); |
2952 | mem_cgroup_put(memcg); | 2801 | mem_cgroup_swap_statistics(swap_memcg, false); |
2802 | mem_cgroup_put(swap_memcg); | ||
2953 | } | 2803 | } |
2954 | rcu_read_unlock(); | 2804 | rcu_read_unlock(); |
2955 | } | 2805 | } |
@@ -2958,13 +2808,14 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
2958 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 2808 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
2959 | * In that case, we need to call pre_destroy() again. Check it here. | 2809 | * In that case, we need to call pre_destroy() again. Check it here. |
2960 | */ | 2810 | */ |
2961 | cgroup_release_and_wakeup_rmdir(&ptr->css); | 2811 | cgroup_release_and_wakeup_rmdir(&memcg->css); |
2962 | } | 2812 | } |
2963 | 2813 | ||
2964 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 2814 | void mem_cgroup_commit_charge_swapin(struct page *page, |
2815 | struct mem_cgroup *memcg) | ||
2965 | { | 2816 | { |
2966 | __mem_cgroup_commit_charge_swapin(page, ptr, | 2817 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2967 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2818 | MEM_CGROUP_CHARGE_TYPE_MAPPED); |
2968 | } | 2819 | } |
2969 | 2820 | ||
2970 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | 2821 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) |
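One behavioural detail worth noting from the two swap-in hunks above: __mem_cgroup_try_charge() can now return -EINTR (see also the "we'll need to revisit this error code" note further down), and the swap-in call sites fold that into success rather than propagating it. A minimal sketch of the call-site convention, for illustration only:

	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
	if (ret == -EINTR)	/* new error code from try_charge */
		ret = 0;	/* treated as success at this call site */
	return ret;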
@@ -3054,7 +2905,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
3054 | * Check if our page_cgroup is valid | 2905 | * Check if our page_cgroup is valid |
3055 | */ | 2906 | */ |
3056 | pc = lookup_page_cgroup(page); | 2907 | pc = lookup_page_cgroup(page); |
3057 | if (unlikely(!pc || !PageCgroupUsed(pc))) | 2908 | if (unlikely(!PageCgroupUsed(pc))) |
3058 | return NULL; | 2909 | return NULL; |
3059 | 2910 | ||
3060 | lock_page_cgroup(pc); | 2911 | lock_page_cgroup(pc); |
@@ -3117,8 +2968,7 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
3117 | /* early check. */ | 2968 | /* early check. */ |
3118 | if (page_mapped(page)) | 2969 | if (page_mapped(page)) |
3119 | return; | 2970 | return; |
3120 | if (page->mapping && !PageAnon(page)) | 2971 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
3121 | return; | ||
3122 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2972 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); |
3123 | } | 2973 | } |
3124 | 2974 | ||
@@ -3176,6 +3026,23 @@ void mem_cgroup_uncharge_end(void) | |||
3176 | batch->memcg = NULL; | 3026 | batch->memcg = NULL; |
3177 | } | 3027 | } |
3178 | 3028 | ||
3029 | /* | ||
3030 | * A function for resetting pc->mem_cgroup for newly allocated pages. | ||
3031 | * This function should be called if the newpage will be added to the | ||
3032 | * LRU before accounting starts. | ||
3033 | */ | ||
3034 | void mem_cgroup_reset_owner(struct page *newpage) | ||
3035 | { | ||
3036 | struct page_cgroup *pc; | ||
3037 | |||
3038 | if (mem_cgroup_disabled()) | ||
3039 | return; | ||
3040 | |||
3041 | pc = lookup_page_cgroup(newpage); | ||
3042 | VM_BUG_ON(PageCgroupUsed(pc)); | ||
3043 | pc->mem_cgroup = root_mem_cgroup; | ||
3044 | } | ||
3045 | |||
3179 | #ifdef CONFIG_SWAP | 3046 | #ifdef CONFIG_SWAP |
3180 | /* | 3047 | /* |
3181 | * called after __delete_from_swap_cache() and drop "page" account. | 3048 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -3293,14 +3160,14 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3293 | * page belongs to. | 3160 | * page belongs to. |
3294 | */ | 3161 | */ |
3295 | int mem_cgroup_prepare_migration(struct page *page, | 3162 | int mem_cgroup_prepare_migration(struct page *page, |
3296 | struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) | 3163 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) |
3297 | { | 3164 | { |
3298 | struct mem_cgroup *memcg = NULL; | 3165 | struct mem_cgroup *memcg = NULL; |
3299 | struct page_cgroup *pc; | 3166 | struct page_cgroup *pc; |
3300 | enum charge_type ctype; | 3167 | enum charge_type ctype; |
3301 | int ret = 0; | 3168 | int ret = 0; |
3302 | 3169 | ||
3303 | *ptr = NULL; | 3170 | *memcgp = NULL; |
3304 | 3171 | ||
3305 | VM_BUG_ON(PageTransHuge(page)); | 3172 | VM_BUG_ON(PageTransHuge(page)); |
3306 | if (mem_cgroup_disabled()) | 3173 | if (mem_cgroup_disabled()) |
@@ -3351,10 +3218,10 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3351 | if (!memcg) | 3218 | if (!memcg) |
3352 | return 0; | 3219 | return 0; |
3353 | 3220 | ||
3354 | *ptr = memcg; | 3221 | *memcgp = memcg; |
3355 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); | 3222 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); |
3356 | css_put(&memcg->css);/* drop extra refcnt */ | 3223 | css_put(&memcg->css);/* drop extra refcnt */ |
3357 | if (ret || *ptr == NULL) { | 3224 | if (ret) { |
3358 | if (PageAnon(page)) { | 3225 | if (PageAnon(page)) { |
3359 | lock_page_cgroup(pc); | 3226 | lock_page_cgroup(pc); |
3360 | ClearPageCgroupMigration(pc); | 3227 | ClearPageCgroupMigration(pc); |
@@ -3364,6 +3231,7 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3364 | */ | 3231 | */ |
3365 | mem_cgroup_uncharge_page(page); | 3232 | mem_cgroup_uncharge_page(page); |
3366 | } | 3233 | } |
3234 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3367 | return -ENOMEM; | 3235 | return -ENOMEM; |
3368 | } | 3236 | } |
3369 | /* | 3237 | /* |
@@ -3432,12 +3300,51 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3432 | cgroup_release_and_wakeup_rmdir(&memcg->css); | 3300 | cgroup_release_and_wakeup_rmdir(&memcg->css); |
3433 | } | 3301 | } |
3434 | 3302 | ||
3303 | /* | ||
3304 | * At page cache replacement, newpage is not under any memcg but is | ||
3305 | * already on the LRU, so this function doesn't touch the res_counter | ||
3306 | * but handles the LRU correctly. Both pages are locked, so we cannot race with uncharge. | ||
3307 | */ | ||
3308 | void mem_cgroup_replace_page_cache(struct page *oldpage, | ||
3309 | struct page *newpage) | ||
3310 | { | ||
3311 | struct mem_cgroup *memcg; | ||
3312 | struct page_cgroup *pc; | ||
3313 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3314 | |||
3315 | if (mem_cgroup_disabled()) | ||
3316 | return; | ||
3317 | |||
3318 | pc = lookup_page_cgroup(oldpage); | ||
3319 | /* fix accounting on old pages */ | ||
3320 | lock_page_cgroup(pc); | ||
3321 | memcg = pc->mem_cgroup; | ||
3322 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); | ||
3323 | ClearPageCgroupUsed(pc); | ||
3324 | unlock_page_cgroup(pc); | ||
3325 | |||
3326 | if (PageSwapBacked(oldpage)) | ||
3327 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3328 | |||
3329 | /* | ||
3330 | * Even if newpage->mapping was NULL before starting replacement, | ||
3331 | * the newpage may already be on the LRU (or on a pagevec bound for the | ||
3332 | * LRU). We lock the LRU while we overwrite pc->mem_cgroup. | ||
3333 | */ | ||
3334 | __mem_cgroup_commit_charge_lrucare(newpage, memcg, type); | ||
3335 | } | ||
3336 | |||
3435 | #ifdef CONFIG_DEBUG_VM | 3337 | #ifdef CONFIG_DEBUG_VM |
3436 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | 3338 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) |
3437 | { | 3339 | { |
3438 | struct page_cgroup *pc; | 3340 | struct page_cgroup *pc; |
3439 | 3341 | ||
3440 | pc = lookup_page_cgroup(page); | 3342 | pc = lookup_page_cgroup(page); |
3343 | /* | ||
3344 | * Can be NULL while feeding pages into the page allocator for | ||
3345 | * the first time, i.e. during boot or memory hotplug; | ||
3346 | * or when mem_cgroup_disabled(). | ||
3347 | */ | ||
3441 | if (likely(pc) && PageCgroupUsed(pc)) | 3348 | if (likely(pc) && PageCgroupUsed(pc)) |
3442 | return pc; | 3349 | return pc; |
3443 | return NULL; | 3350 | return NULL; |
@@ -3457,23 +3364,8 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
3457 | 3364 | ||
3458 | pc = lookup_page_cgroup_used(page); | 3365 | pc = lookup_page_cgroup_used(page); |
3459 | if (pc) { | 3366 | if (pc) { |
3460 | int ret = -1; | 3367 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", |
3461 | char *path; | ||
3462 | |||
3463 | printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", | ||
3464 | pc, pc->flags, pc->mem_cgroup); | 3368 | pc, pc->flags, pc->mem_cgroup); |
3465 | |||
3466 | path = kmalloc(PATH_MAX, GFP_KERNEL); | ||
3467 | if (path) { | ||
3468 | rcu_read_lock(); | ||
3469 | ret = cgroup_path(pc->mem_cgroup->css.cgroup, | ||
3470 | path, PATH_MAX); | ||
3471 | rcu_read_unlock(); | ||
3472 | } | ||
3473 | |||
3474 | printk(KERN_CONT "(%s)\n", | ||
3475 | (ret < 0) ? "cannot get the path" : path); | ||
3476 | kfree(path); | ||
3477 | } | 3369 | } |
3478 | } | 3370 | } |
3479 | #endif | 3371 | #endif |
@@ -3534,9 +3426,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3534 | if (!ret) | 3426 | if (!ret) |
3535 | break; | 3427 | break; |
3536 | 3428 | ||
3537 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3429 | mem_cgroup_reclaim(memcg, GFP_KERNEL, |
3538 | MEM_CGROUP_RECLAIM_SHRINK, | 3430 | MEM_CGROUP_RECLAIM_SHRINK); |
3539 | NULL); | ||
3540 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3431 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
3541 | /* Usage is reduced ? */ | 3432 | /* Usage is reduced ? */ |
3542 | if (curusage >= oldusage) | 3433 | if (curusage >= oldusage) |
@@ -3594,10 +3485,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3594 | if (!ret) | 3485 | if (!ret) |
3595 | break; | 3486 | break; |
3596 | 3487 | ||
3597 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, | 3488 | mem_cgroup_reclaim(memcg, GFP_KERNEL, |
3598 | MEM_CGROUP_RECLAIM_NOSWAP | | 3489 | MEM_CGROUP_RECLAIM_NOSWAP | |
3599 | MEM_CGROUP_RECLAIM_SHRINK, | 3490 | MEM_CGROUP_RECLAIM_SHRINK); |
3600 | NULL); | ||
3601 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3491 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3602 | /* Usage is reduced ? */ | 3492 | /* Usage is reduced ? */ |
3603 | if (curusage >= oldusage) | 3493 | if (curusage >= oldusage) |
@@ -3640,10 +3530,8 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3640 | break; | 3530 | break; |
3641 | 3531 | ||
3642 | nr_scanned = 0; | 3532 | nr_scanned = 0; |
3643 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | 3533 | reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, |
3644 | gfp_mask, | 3534 | gfp_mask, &nr_scanned); |
3645 | MEM_CGROUP_RECLAIM_SOFT, | ||
3646 | &nr_scanned); | ||
3647 | nr_reclaimed += reclaimed; | 3535 | nr_reclaimed += reclaimed; |
3648 | *total_scanned += nr_scanned; | 3536 | *total_scanned += nr_scanned; |
3649 | spin_lock(&mctz->lock); | 3537 | spin_lock(&mctz->lock); |
@@ -3711,22 +3599,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3711 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3599 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3712 | int node, int zid, enum lru_list lru) | 3600 | int node, int zid, enum lru_list lru) |
3713 | { | 3601 | { |
3714 | struct zone *zone; | ||
3715 | struct mem_cgroup_per_zone *mz; | 3602 | struct mem_cgroup_per_zone *mz; |
3716 | struct page_cgroup *pc, *busy; | ||
3717 | unsigned long flags, loop; | 3603 | unsigned long flags, loop; |
3718 | struct list_head *list; | 3604 | struct list_head *list; |
3605 | struct page *busy; | ||
3606 | struct zone *zone; | ||
3719 | int ret = 0; | 3607 | int ret = 0; |
3720 | 3608 | ||
3721 | zone = &NODE_DATA(node)->node_zones[zid]; | 3609 | zone = &NODE_DATA(node)->node_zones[zid]; |
3722 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3610 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3723 | list = &mz->lists[lru]; | 3611 | list = &mz->lruvec.lists[lru]; |
3724 | 3612 | ||
3725 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3613 | loop = MEM_CGROUP_ZSTAT(mz, lru); |
3726 | /* give some margin against EBUSY etc...*/ | 3614 | /* give some margin against EBUSY etc...*/ |
3727 | loop += 256; | 3615 | loop += 256; |
3728 | busy = NULL; | 3616 | busy = NULL; |
3729 | while (loop--) { | 3617 | while (loop--) { |
3618 | struct page_cgroup *pc; | ||
3730 | struct page *page; | 3619 | struct page *page; |
3731 | 3620 | ||
3732 | ret = 0; | 3621 | ret = 0; |
@@ -3735,24 +3624,24 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3735 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3624 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3736 | break; | 3625 | break; |
3737 | } | 3626 | } |
3738 | pc = list_entry(list->prev, struct page_cgroup, lru); | 3627 | page = list_entry(list->prev, struct page, lru); |
3739 | if (busy == pc) { | 3628 | if (busy == page) { |
3740 | list_move(&pc->lru, list); | 3629 | list_move(&page->lru, list); |
3741 | busy = NULL; | 3630 | busy = NULL; |
3742 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3631 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3743 | continue; | 3632 | continue; |
3744 | } | 3633 | } |
3745 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3634 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3746 | 3635 | ||
3747 | page = lookup_cgroup_page(pc); | 3636 | pc = lookup_page_cgroup(page); |
3748 | 3637 | ||
3749 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3638 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); |
3750 | if (ret == -ENOMEM) | 3639 | if (ret == -ENOMEM || ret == -EINTR) |
3751 | break; | 3640 | break; |
3752 | 3641 | ||
3753 | if (ret == -EBUSY || ret == -EINVAL) { | 3642 | if (ret == -EBUSY || ret == -EINVAL) { |
3754 | /* found lock contention or "pc" is obsolete. */ | 3643 | /* found lock contention or "pc" is obsolete. */ |
3755 | busy = pc; | 3644 | busy = page; |
3756 | cond_resched(); | 3645 | cond_resched(); |
3757 | } else | 3646 | } else |
3758 | busy = NULL; | 3647 | busy = NULL; |
@@ -4846,7 +4735,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4846 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4735 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4847 | mz = &pn->zoneinfo[zone]; | 4736 | mz = &pn->zoneinfo[zone]; |
4848 | for_each_lru(l) | 4737 | for_each_lru(l) |
4849 | INIT_LIST_HEAD(&mz->lists[l]); | 4738 | INIT_LIST_HEAD(&mz->lruvec.lists[l]); |
4850 | mz->usage_in_excess = 0; | 4739 | mz->usage_in_excess = 0; |
4851 | mz->on_tree = false; | 4740 | mz->on_tree = false; |
4852 | mz->mem = memcg; | 4741 | mz->mem = memcg; |
@@ -4906,7 +4795,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4906 | mem_cgroup_remove_from_trees(memcg); | 4795 | mem_cgroup_remove_from_trees(memcg); |
4907 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 4796 | free_css_id(&mem_cgroup_subsys, &memcg->css); |
4908 | 4797 | ||
4909 | for_each_node_state(node, N_POSSIBLE) | 4798 | for_each_node(node) |
4910 | free_mem_cgroup_per_zone_info(memcg, node); | 4799 | free_mem_cgroup_per_zone_info(memcg, node); |
4911 | 4800 | ||
4912 | free_percpu(memcg->stat); | 4801 | free_percpu(memcg->stat); |
@@ -4965,13 +4854,13 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
4965 | struct mem_cgroup_tree_per_zone *rtpz; | 4854 | struct mem_cgroup_tree_per_zone *rtpz; |
4966 | int tmp, node, zone; | 4855 | int tmp, node, zone; |
4967 | 4856 | ||
4968 | for_each_node_state(node, N_POSSIBLE) { | 4857 | for_each_node(node) { |
4969 | tmp = node; | 4858 | tmp = node; |
4970 | if (!node_state(node, N_NORMAL_MEMORY)) | 4859 | if (!node_state(node, N_NORMAL_MEMORY)) |
4971 | tmp = -1; | 4860 | tmp = -1; |
4972 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | 4861 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); |
4973 | if (!rtpn) | 4862 | if (!rtpn) |
4974 | return 1; | 4863 | goto err_cleanup; |
4975 | 4864 | ||
4976 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 4865 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
4977 | 4866 | ||
@@ -4982,6 +4871,16 @@ static int mem_cgroup_soft_limit_tree_init(void) | |||
4982 | } | 4871 | } |
4983 | } | 4872 | } |
4984 | return 0; | 4873 | return 0; |
4874 | |||
4875 | err_cleanup: | ||
4876 | for_each_node(node) { | ||
4877 | if (!soft_limit_tree.rb_tree_per_node[node]) | ||
4878 | break; | ||
4879 | kfree(soft_limit_tree.rb_tree_per_node[node]); | ||
4880 | soft_limit_tree.rb_tree_per_node[node] = NULL; | ||
4881 | } | ||
4882 | return 1; | ||
4883 | |||
4985 | } | 4884 | } |
4986 | 4885 | ||
4987 | static struct cgroup_subsys_state * __ref | 4886 | static struct cgroup_subsys_state * __ref |
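The err_cleanup path added to mem_cgroup_soft_limit_tree_init() above replaces a bare "return 1" that leaked any per-node structures already allocated. The same allocate-or-unwind shape as a small self-contained C sketch (generic illustration with made-up names, not the kernel code itself):

#include <stdlib.h>

#define NR_NODES 4

static void *per_node[NR_NODES];

static int tree_init(void)
{
	int node;

	for (node = 0; node < NR_NODES; node++) {
		per_node[node] = calloc(1, 64);
		if (!per_node[node])
			goto err_cleanup;	/* unwind partial allocations */
	}
	return 0;

err_cleanup:
	for (node = 0; node < NR_NODES; node++) {
		if (!per_node[node])
			break;			/* the rest were never allocated */
		free(per_node[node]);
		per_node[node] = NULL;
	}
	return 1;
}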
@@ -4995,7 +4894,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4995 | if (!memcg) | 4894 | if (!memcg) |
4996 | return ERR_PTR(error); | 4895 | return ERR_PTR(error); |
4997 | 4896 | ||
4998 | for_each_node_state(node, N_POSSIBLE) | 4897 | for_each_node(node) |
4999 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) | 4898 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) |
5000 | goto free_out; | 4899 | goto free_out; |
5001 | 4900 | ||
@@ -5033,7 +4932,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
5033 | res_counter_init(&memcg->res, NULL); | 4932 | res_counter_init(&memcg->res, NULL); |
5034 | res_counter_init(&memcg->memsw, NULL); | 4933 | res_counter_init(&memcg->memsw, NULL); |
5035 | } | 4934 | } |
5036 | memcg->last_scanned_child = 0; | ||
5037 | memcg->last_scanned_node = MAX_NUMNODES; | 4935 | memcg->last_scanned_node = MAX_NUMNODES; |
5038 | INIT_LIST_HEAD(&memcg->oom_notify); | 4936 | INIT_LIST_HEAD(&memcg->oom_notify); |
5039 | 4937 | ||
@@ -5129,9 +5027,9 @@ one_by_one: | |||
5129 | } | 5027 | } |
5130 | ret = __mem_cgroup_try_charge(NULL, | 5028 | ret = __mem_cgroup_try_charge(NULL, |
5131 | GFP_KERNEL, 1, &memcg, false); | 5029 | GFP_KERNEL, 1, &memcg, false); |
5132 | if (ret || !memcg) | 5030 | if (ret) |
5133 | /* mem_cgroup_clear_mc() will do uncharge later */ | 5031 | /* mem_cgroup_clear_mc() will do uncharge later */ |
5134 | return -ENOMEM; | 5032 | return ret; |
5135 | mc.precharge++; | 5033 | mc.precharge++; |
5136 | } | 5034 | } |
5137 | return ret; | 5035 | return ret; |
@@ -5276,7 +5174,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5276 | } | 5174 | } |
5277 | /* There is a swap entry and a page doesn't exist or isn't charged */ | 5175 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
5278 | if (ent.val && !ret && | 5176 | if (ent.val && !ret && |
5279 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | 5177 | css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { |
5280 | ret = MC_TARGET_SWAP; | 5178 | ret = MC_TARGET_SWAP; |
5281 | if (target) | 5179 | if (target) |
5282 | target->ent = ent; | 5180 | target->ent = ent; |