author     Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 21:34:42 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-12-10 21:34:42 -0500
commit     b6da0076bab5a12afb19312ffee41c95490af2a0 (patch)
tree       52a5675b9c2ff95d88b981d5b9a3822f6073c112 /mm
parent     cbfe0de303a55ed96d8831c2d5f56f8131cd6612 (diff)
parent     a53b831549141aa060a8b54b76e3a42870d74cc0 (diff)
Merge branch 'akpm' (patchbomb from Andrew)
Merge first patchbomb from Andrew Morton:

 - a few minor cifs fixes
 - dma-debug updates
 - ocfs2
 - slab
 - about half of MM
 - procfs
 - kernel/exit.c
 - panic.c tweaks
 - printk updates
 - lib/ updates
 - checkpatch updates
 - fs/binfmt updates
 - the drivers/rtc tree
 - nilfs
 - kmod fixes
 - more kernel/exit.c
 - various other misc tweaks and fixes

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (190 commits)
  exit: pidns: fix/update the comments in zap_pid_ns_processes()
  exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting
  exit: exit_notify: re-use "dead" list to autoreap current
  exit: reparent: call forget_original_parent() under tasklist_lock
  exit: reparent: avoid find_new_reaper() if no children
  exit: reparent: introduce find_alive_thread()
  exit: reparent: introduce find_child_reaper()
  exit: reparent: document the ->has_child_subreaper checks
  exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper()
  exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting
  exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting
  exit: proc: don't try to flush /proc/tgid/task/tgid
  exit: release_task: fix the comment about group leader accounting
  exit: wait: drop tasklist_lock before psig->c* accounting
  exit: wait: don't use zombie->real_parent
  exit: wait: cleanup the ptrace_reparented() checks
  usermodehelper: kill the kmod_thread_locker logic
  usermodehelper: don't use CLONE_VFORK for ____call_usermodehelper()
  fs/hfs/catalog.c: fix comparison bug in hfs_cat_keycmp
  nilfs2: fix the nilfs_iget() vs. nilfs_new_inode() races
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/cma.c             |   14
-rw-r--r--  mm/compaction.c      |  139
-rw-r--r--  mm/debug.c           |    5
-rw-r--r--  mm/frontswap.c       |    2
-rw-r--r--  mm/huge_memory.c     |    1
-rw-r--r--  mm/hugetlb.c         |    4
-rw-r--r--  mm/hugetlb_cgroup.c  |  103
-rw-r--r--  mm/internal.h        |    7
-rw-r--r--  mm/memcontrol.c      | 1706
-rw-r--r--  mm/memory-failure.c  |    4
-rw-r--r--  mm/memory_hotplug.c  |    4
-rw-r--r--  mm/oom_kill.c        |    4
-rw-r--r--  mm/page-writeback.c  |    4
-rw-r--r--  mm/page_alloc.c      |  137
-rw-r--r--  mm/page_cgroup.c     |  530
-rw-r--r--  mm/page_counter.c    |  192
-rw-r--r--  mm/page_isolation.c  |    2
-rw-r--r--  mm/rmap.c            |    4
-rw-r--r--  mm/slab.c            |   23
-rw-r--r--  mm/slab.h            |    8
-rw-r--r--  mm/slab_common.c     |   40
-rw-r--r--  mm/slub.c            |   21
-rw-r--r--  mm/swap_cgroup.c     |  208
-rw-r--r--  mm/swap_state.c      |    1
-rw-r--r--  mm/swapfile.c        |    2
-rw-r--r--  mm/vmalloc.c         |    3
-rw-r--r--  mm/vmscan.c          |   18
28 files changed, 1236 insertions, 1954 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0023a9..b3c6ce932c64 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,7 +55,9 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
55obj-$(CONFIG_MIGRATION) += migrate.o 55obj-$(CONFIG_MIGRATION) += migrate.o
56obj-$(CONFIG_QUICKLIST) += quicklist.o 56obj-$(CONFIG_QUICKLIST) += quicklist.o
57obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 57obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
58obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o 58obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
59obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
60obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
59obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o 61obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
60obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 62obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
61obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 63obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/cma.c b/mm/cma.c
index fde706e1284f..8e9ec13d31db 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -215,9 +215,21 @@ int __init cma_declare_contiguous(phys_addr_t base,
215 bool fixed, struct cma **res_cma) 215 bool fixed, struct cma **res_cma)
216{ 216{
217 phys_addr_t memblock_end = memblock_end_of_DRAM(); 217 phys_addr_t memblock_end = memblock_end_of_DRAM();
218 phys_addr_t highmem_start = __pa(high_memory); 218 phys_addr_t highmem_start;
219 int ret = 0; 219 int ret = 0;
220 220
221#ifdef CONFIG_X86
222 /*
223 * high_memory isn't direct mapped memory so retrieving its physical
224 * address isn't appropriate. But it would be useful to check the
225 * physical address of the highmem boundary so it's justfiable to get
226 * the physical address from it. On x86 there is a validation check for
227 * this case, so the following workaround is needed to avoid it.
228 */
229 highmem_start = __pa_nodebug(high_memory);
230#else
231 highmem_start = __pa(high_memory);
232#endif
221 pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", 233 pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n",
222 __func__, &size, &base, &limit, &alignment); 234 __func__, &size, &base, &limit, &alignment);
223 235
diff --git a/mm/compaction.c b/mm/compaction.c
index f9792ba3537c..546e571e9d60 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
41static unsigned long release_freepages(struct list_head *freelist) 41static unsigned long release_freepages(struct list_head *freelist)
42{ 42{
43 struct page *page, *next; 43 struct page *page, *next;
44 unsigned long count = 0; 44 unsigned long high_pfn = 0;
45 45
46 list_for_each_entry_safe(page, next, freelist, lru) { 46 list_for_each_entry_safe(page, next, freelist, lru) {
47 unsigned long pfn = page_to_pfn(page);
47 list_del(&page->lru); 48 list_del(&page->lru);
48 __free_page(page); 49 __free_page(page);
49 count++; 50 if (pfn > high_pfn)
51 high_pfn = pfn;
50 } 52 }
51 53
52 return count; 54 return high_pfn;
53} 55}
54 56
55static void map_pages(struct list_head *list) 57static void map_pages(struct list_head *list)
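The hunk above changes release_freepages() to report the highest PFN it handed back to the buddy allocator instead of a page count. A minimal userspace sketch of that "track the maximum while freeing a list" contract; fake_page and free_page_stub are hypothetical stand-ins for struct page and __free_page(), not kernel code:

#include <stdio.h>
#include <stdlib.h>

/* Userspace model only: a fake "page" that remembers its PFN. */
struct fake_page {
	unsigned long pfn;
	struct fake_page *next;
};

/* Stand-in for __free_page(): here we only free the tracking struct. */
static void free_page_stub(struct fake_page *page)
{
	free(page);
}

/*
 * Mirrors the new release_freepages() contract: free every page on the
 * list and return the highest PFN seen (0 if the list was empty).
 */
static unsigned long release_freepages_model(struct fake_page *list)
{
	unsigned long high_pfn = 0;

	while (list) {
		struct fake_page *next = list->next;
		unsigned long pfn = list->pfn;

		free_page_stub(list);
		if (pfn > high_pfn)
			high_pfn = pfn;
		list = next;
	}
	return high_pfn;
}

int main(void)
{
	struct fake_page *list = NULL;
	unsigned long pfns[] = { 4096, 260096, 8192 };

	for (int i = 0; i < 3; i++) {
		struct fake_page *p = malloc(sizeof(*p));
		p->pfn = pfns[i];
		p->next = list;
		list = p;
	}
	printf("highest freed pfn: %lu\n", release_freepages_model(list));
	return 0;
}

compact_zone() later rounds this PFN down to a pageblock boundary before updating zone->compact_cached_free_pfn, as the out: hunk further down shows.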
@@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc,
195 197
196 /* Update where async and sync compaction should restart */ 198 /* Update where async and sync compaction should restart */
197 if (migrate_scanner) { 199 if (migrate_scanner) {
198 if (cc->finished_update_migrate)
199 return;
200 if (pfn > zone->compact_cached_migrate_pfn[0]) 200 if (pfn > zone->compact_cached_migrate_pfn[0])
201 zone->compact_cached_migrate_pfn[0] = pfn; 201 zone->compact_cached_migrate_pfn[0] = pfn;
202 if (cc->mode != MIGRATE_ASYNC && 202 if (cc->mode != MIGRATE_ASYNC &&
203 pfn > zone->compact_cached_migrate_pfn[1]) 203 pfn > zone->compact_cached_migrate_pfn[1])
204 zone->compact_cached_migrate_pfn[1] = pfn; 204 zone->compact_cached_migrate_pfn[1] = pfn;
205 } else { 205 } else {
206 if (cc->finished_update_free)
207 return;
208 if (pfn < zone->compact_cached_free_pfn) 206 if (pfn < zone->compact_cached_free_pfn)
209 zone->compact_cached_free_pfn = pfn; 207 zone->compact_cached_free_pfn = pfn;
210 } 208 }
@@ -715,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
715 del_page_from_lru_list(page, lruvec, page_lru(page)); 713 del_page_from_lru_list(page, lruvec, page_lru(page));
716 714
717isolate_success: 715isolate_success:
718 cc->finished_update_migrate = true;
719 list_add(&page->lru, migratelist); 716 list_add(&page->lru, migratelist);
720 cc->nr_migratepages++; 717 cc->nr_migratepages++;
721 nr_isolated++; 718 nr_isolated++;
@@ -889,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc)
889 block_start_pfn - pageblock_nr_pages; 886 block_start_pfn - pageblock_nr_pages;
890 887
891 /* 888 /*
892 * Set a flag that we successfully isolated in this pageblock.
893 * In the next loop iteration, zone->compact_cached_free_pfn
894 * will not be updated and thus it will effectively contain the
895 * highest pageblock we isolated pages from.
896 */
897 if (isolated)
898 cc->finished_update_free = true;
899
900 /*
901 * isolate_freepages_block() might have aborted due to async 889 * isolate_freepages_block() might have aborted due to async
902 * compaction being contended 890 * compaction being contended
903 */ 891 */
@@ -1086,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1086 1074
1087 /* Compaction run is not finished if the watermark is not met */ 1075 /* Compaction run is not finished if the watermark is not met */
1088 watermark = low_wmark_pages(zone); 1076 watermark = low_wmark_pages(zone);
1089 watermark += (1 << cc->order);
1090 1077
1091 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 1078 if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx,
1079 cc->alloc_flags))
1092 return COMPACT_CONTINUE; 1080 return COMPACT_CONTINUE;
1093 1081
1094 /* Direct compactor: Is a suitable page free? */ 1082 /* Direct compactor: Is a suitable page free? */
@@ -1114,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1114 * COMPACT_PARTIAL - If the allocation would succeed without compaction 1102 * COMPACT_PARTIAL - If the allocation would succeed without compaction
1115 * COMPACT_CONTINUE - If compaction should run now 1103 * COMPACT_CONTINUE - If compaction should run now
1116 */ 1104 */
1117unsigned long compaction_suitable(struct zone *zone, int order) 1105unsigned long compaction_suitable(struct zone *zone, int order,
1106 int alloc_flags, int classzone_idx)
1118{ 1107{
1119 int fragindex; 1108 int fragindex;
1120 unsigned long watermark; 1109 unsigned long watermark;
@@ -1126,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order)
1126 if (order == -1) 1115 if (order == -1)
1127 return COMPACT_CONTINUE; 1116 return COMPACT_CONTINUE;
1128 1117
1118 watermark = low_wmark_pages(zone);
1119 /*
1120 * If watermarks for high-order allocation are already met, there
1121 * should be no need for compaction at all.
1122 */
1123 if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1124 alloc_flags))
1125 return COMPACT_PARTIAL;
1126
1129 /* 1127 /*
1130 * Watermarks for order-0 must be met for compaction. Note the 2UL. 1128 * Watermarks for order-0 must be met for compaction. Note the 2UL.
1131 * This is because during migration, copies of pages need to be 1129 * This is because during migration, copies of pages need to be
1132 * allocated and for a short time, the footprint is higher 1130 * allocated and for a short time, the footprint is higher
1133 */ 1131 */
1134 watermark = low_wmark_pages(zone) + (2UL << order); 1132 watermark += (2UL << order);
1135 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1133 if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
1136 return COMPACT_SKIPPED; 1134 return COMPACT_SKIPPED;
1137 1135
1138 /* 1136 /*
1139 * fragmentation index determines if allocation failures are due to 1137 * fragmentation index determines if allocation failures are due to
1140 * low memory or external fragmentation 1138 * low memory or external fragmentation
1141 * 1139 *
1142 * index of -1000 implies allocations might succeed depending on 1140 * index of -1000 would imply allocations might succeed depending on
1143 * watermarks 1141 * watermarks, but we already failed the high-order watermark check
1144 * index towards 0 implies failure is due to lack of memory 1142 * index towards 0 implies failure is due to lack of memory
1145 * index towards 1000 implies failure is due to fragmentation 1143 * index towards 1000 implies failure is due to fragmentation
1146 * 1144 *
@@ -1150,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order)
1150 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 1148 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1151 return COMPACT_SKIPPED; 1149 return COMPACT_SKIPPED;
1152 1150
1153 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
1154 0, 0))
1155 return COMPACT_PARTIAL;
1156
1157 return COMPACT_CONTINUE; 1151 return COMPACT_CONTINUE;
1158} 1152}
1159 1153
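With this change compaction_suitable() takes alloc_flags and classzone_idx and reorders its checks: return COMPACT_PARTIAL when the high-order watermark is already met, COMPACT_SKIPPED when not even order-0 plus a 2<<order migration buffer is available, and only then consult the fragmentation index. A deliberately simplified userspace sketch of that decision order; watermark_ok_model() is a hypothetical stand-in for zone_watermark_ok() (which also handles lowmem reserves, ALLOC_* flags and per-order free lists), and the fragindex step is omitted:

#include <stdbool.h>
#include <stdio.h>

/* Userspace model of the three compaction_suitable() outcomes. */
enum compact_result { COMPACT_SKIPPED, COMPACT_PARTIAL, COMPACT_CONTINUE };

/*
 * Reduced stand-in for zone_watermark_ok(): enough free pages overall,
 * and at least one free block of the requested order.
 */
static bool watermark_ok_model(unsigned long free_pages, unsigned long watermark,
			       int order, int largest_free_order)
{
	return free_pages >= watermark && largest_free_order >= order;
}

static enum compact_result suitable_model(unsigned long free_pages,
					  int largest_free_order,
					  unsigned long low_wmark, int order)
{
	/* 1. High-order watermark already met: the allocation should just succeed. */
	if (watermark_ok_model(free_pages, low_wmark, order, largest_free_order))
		return COMPACT_PARTIAL;

	/* 2. Not even order-0 plus a 2<<order migration buffer: nothing to compact with. */
	if (!watermark_ok_model(free_pages, low_wmark + (2UL << order), 0,
				largest_free_order))
		return COMPACT_SKIPPED;

	/* 3. Enough memory but fragmented: the fragindex (omitted here) decides. */
	return COMPACT_CONTINUE;
}

int main(void)
{
	/* order-4 request against a low watermark of 128 pages */
	printf("%d\n", suitable_model(1024, 6, 128, 4));  /* 1: COMPACT_PARTIAL  */
	printf("%d\n", suitable_model(100, 0, 128, 4));   /* 0: COMPACT_SKIPPED  */
	printf("%d\n", suitable_model(1024, 2, 128, 4));  /* 2: COMPACT_CONTINUE */
	return 0;
}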
@@ -1164,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1164 unsigned long end_pfn = zone_end_pfn(zone); 1158 unsigned long end_pfn = zone_end_pfn(zone);
1165 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); 1159 const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1166 const bool sync = cc->mode != MIGRATE_ASYNC; 1160 const bool sync = cc->mode != MIGRATE_ASYNC;
1161 unsigned long last_migrated_pfn = 0;
1167 1162
1168 ret = compaction_suitable(zone, cc->order); 1163 ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1164 cc->classzone_idx);
1169 switch (ret) { 1165 switch (ret) {
1170 case COMPACT_PARTIAL: 1166 case COMPACT_PARTIAL:
1171 case COMPACT_SKIPPED: 1167 case COMPACT_SKIPPED:
@@ -1208,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1208 while ((ret = compact_finished(zone, cc, migratetype)) == 1204 while ((ret = compact_finished(zone, cc, migratetype)) ==
1209 COMPACT_CONTINUE) { 1205 COMPACT_CONTINUE) {
1210 int err; 1206 int err;
1207 unsigned long isolate_start_pfn = cc->migrate_pfn;
1211 1208
1212 switch (isolate_migratepages(zone, cc)) { 1209 switch (isolate_migratepages(zone, cc)) {
1213 case ISOLATE_ABORT: 1210 case ISOLATE_ABORT:
@@ -1216,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1216 cc->nr_migratepages = 0; 1213 cc->nr_migratepages = 0;
1217 goto out; 1214 goto out;
1218 case ISOLATE_NONE: 1215 case ISOLATE_NONE:
1219 continue; 1216 /*
1217 * We haven't isolated and migrated anything, but
1218 * there might still be unflushed migrations from
1219 * previous cc->order aligned block.
1220 */
1221 goto check_drain;
1220 case ISOLATE_SUCCESS: 1222 case ISOLATE_SUCCESS:
1221 ; 1223 ;
1222 } 1224 }
@@ -1241,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1241 goto out; 1243 goto out;
1242 } 1244 }
1243 } 1245 }
1246
1247 /*
1248 * Record where we could have freed pages by migration and not
1249 * yet flushed them to buddy allocator. We use the pfn that
1250 * isolate_migratepages() started from in this loop iteration
1251 * - this is the lowest page that could have been isolated and
1252 * then freed by migration.
1253 */
1254 if (!last_migrated_pfn)
1255 last_migrated_pfn = isolate_start_pfn;
1256
1257check_drain:
1258 /*
1259 * Has the migration scanner moved away from the previous
1260 * cc->order aligned block where we migrated from? If yes,
1261 * flush the pages that were freed, so that they can merge and
1262 * compact_finished() can detect immediately if allocation
1263 * would succeed.
1264 */
1265 if (cc->order > 0 && last_migrated_pfn) {
1266 int cpu;
1267 unsigned long current_block_start =
1268 cc->migrate_pfn & ~((1UL << cc->order) - 1);
1269
1270 if (last_migrated_pfn < current_block_start) {
1271 cpu = get_cpu();
1272 lru_add_drain_cpu(cpu);
1273 drain_local_pages(zone);
1274 put_cpu();
1275 /* No more flushing until we migrate again */
1276 last_migrated_pfn = 0;
1277 }
1278 }
1279
1244 } 1280 }
1245 1281
1246out: 1282out:
1247 /* Release free pages and check accounting */ 1283 /*
1248 cc->nr_freepages -= release_freepages(&cc->freepages); 1284 * Release free pages and update where the free scanner should restart,
1249 VM_BUG_ON(cc->nr_freepages != 0); 1285 * so we don't leave any returned pages behind in the next attempt.
1286 */
1287 if (cc->nr_freepages > 0) {
1288 unsigned long free_pfn = release_freepages(&cc->freepages);
1289
1290 cc->nr_freepages = 0;
1291 VM_BUG_ON(free_pfn == 0);
1292 /* The cached pfn is always the first in a pageblock */
1293 free_pfn &= ~(pageblock_nr_pages-1);
1294 /*
1295 * Only go back, not forward. The cached pfn might have been
1296 * already reset to zone end in compact_finished()
1297 */
1298 if (free_pfn > zone->compact_cached_free_pfn)
1299 zone->compact_cached_free_pfn = free_pfn;
1300 }
1250 1301
1251 trace_mm_compaction_end(ret); 1302 trace_mm_compaction_end(ret);
1252 1303
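The check_drain block added above only flushes the per-cpu lists once the migration scanner has left the cc->order aligned block that the last migrated pages came from. A small userspace sketch of just that trigger condition; the actual drain (lru_add_drain_cpu() plus drain_local_pages() under get_cpu()) is not modelled:

#include <stdbool.h>
#include <stdio.h>

/*
 * Model of the condition guarding the drain: only flush when there are
 * unflushed migrations (last_migrated_pfn != 0) and the scanner has
 * crossed into a later cc->order aligned block.
 */
static bool should_drain(unsigned long migrate_pfn,
			 unsigned long last_migrated_pfn, int order)
{
	unsigned long block_start;

	if (order <= 0 || !last_migrated_pfn)
		return false;
	block_start = migrate_pfn & ~((1UL << order) - 1);
	return last_migrated_pfn < block_start;
}

int main(void)
{
	/* Scanner still inside the same order-9 block: no drain yet. */
	printf("%d\n", should_drain(0x1234, 0x1200, 9));
	/* Scanner moved to the next aligned block: drain now. */
	printf("%d\n", should_drain(0x1400, 0x1200, 9));
	return 0;
}

Draining only at block boundaries lets freed pages merge early enough for compact_finished() to notice a now-possible allocation, without paying for a flush on every migration.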
@@ -1254,7 +1305,8 @@ out:
1254} 1305}
1255 1306
1256static unsigned long compact_zone_order(struct zone *zone, int order, 1307static unsigned long compact_zone_order(struct zone *zone, int order,
1257 gfp_t gfp_mask, enum migrate_mode mode, int *contended) 1308 gfp_t gfp_mask, enum migrate_mode mode, int *contended,
1309 int alloc_flags, int classzone_idx)
1258{ 1310{
1259 unsigned long ret; 1311 unsigned long ret;
1260 struct compact_control cc = { 1312 struct compact_control cc = {
@@ -1264,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order,
1264 .gfp_mask = gfp_mask, 1316 .gfp_mask = gfp_mask,
1265 .zone = zone, 1317 .zone = zone,
1266 .mode = mode, 1318 .mode = mode,
1319 .alloc_flags = alloc_flags,
1320 .classzone_idx = classzone_idx,
1267 }; 1321 };
1268 INIT_LIST_HEAD(&cc.freepages); 1322 INIT_LIST_HEAD(&cc.freepages);
1269 INIT_LIST_HEAD(&cc.migratepages); 1323 INIT_LIST_HEAD(&cc.migratepages);
@@ -1288,14 +1342,13 @@ int sysctl_extfrag_threshold = 500;
1288 * @mode: The migration mode for async, sync light, or sync migration 1342 * @mode: The migration mode for async, sync light, or sync migration
1289 * @contended: Return value that determines if compaction was aborted due to 1343 * @contended: Return value that determines if compaction was aborted due to
1290 * need_resched() or lock contention 1344 * need_resched() or lock contention
1291 * @candidate_zone: Return the zone where we think allocation should succeed
1292 * 1345 *
1293 * This is the main entry point for direct page compaction. 1346 * This is the main entry point for direct page compaction.
1294 */ 1347 */
1295unsigned long try_to_compact_pages(struct zonelist *zonelist, 1348unsigned long try_to_compact_pages(struct zonelist *zonelist,
1296 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1349 int order, gfp_t gfp_mask, nodemask_t *nodemask,
1297 enum migrate_mode mode, int *contended, 1350 enum migrate_mode mode, int *contended,
1298 struct zone **candidate_zone) 1351 int alloc_flags, int classzone_idx)
1299{ 1352{
1300 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1353 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1301 int may_enter_fs = gfp_mask & __GFP_FS; 1354 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -1303,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1303 struct zoneref *z; 1356 struct zoneref *z;
1304 struct zone *zone; 1357 struct zone *zone;
1305 int rc = COMPACT_DEFERRED; 1358 int rc = COMPACT_DEFERRED;
1306 int alloc_flags = 0;
1307 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ 1359 int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
1308 1360
1309 *contended = COMPACT_CONTENDED_NONE; 1361 *contended = COMPACT_CONTENDED_NONE;
@@ -1312,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1312 if (!order || !may_enter_fs || !may_perform_io) 1364 if (!order || !may_enter_fs || !may_perform_io)
1313 return COMPACT_SKIPPED; 1365 return COMPACT_SKIPPED;
1314 1366
1315#ifdef CONFIG_CMA
1316 if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
1317 alloc_flags |= ALLOC_CMA;
1318#endif
1319 /* Compact each zone in the list */ 1367 /* Compact each zone in the list */
1320 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1368 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1321 nodemask) { 1369 nodemask) {
@@ -1326,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1326 continue; 1374 continue;
1327 1375
1328 status = compact_zone_order(zone, order, gfp_mask, mode, 1376 status = compact_zone_order(zone, order, gfp_mask, mode,
1329 &zone_contended); 1377 &zone_contended, alloc_flags, classzone_idx);
1330 rc = max(status, rc); 1378 rc = max(status, rc);
1331 /* 1379 /*
1332 * It takes at least one zone that wasn't lock contended 1380 * It takes at least one zone that wasn't lock contended
@@ -1335,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1335 all_zones_contended &= zone_contended; 1383 all_zones_contended &= zone_contended;
1336 1384
1337 /* If a normal allocation would succeed, stop compacting */ 1385 /* If a normal allocation would succeed, stop compacting */
1338 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 1386 if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
1339 alloc_flags)) { 1387 classzone_idx, alloc_flags)) {
1340 *candidate_zone = zone;
1341 /* 1388 /*
1342 * We think the allocation will succeed in this zone, 1389 * We think the allocation will succeed in this zone,
1343 * but it is not certain, hence the false. The caller 1390 * but it is not certain, hence the false. The caller
@@ -1359,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1359 goto break_loop; 1406 goto break_loop;
1360 } 1407 }
1361 1408
1362 if (mode != MIGRATE_ASYNC) { 1409 if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
1363 /* 1410 /*
1364 * We think that allocation won't succeed in this zone 1411 * We think that allocation won't succeed in this zone
1365 * so we defer compaction there. If it ends up 1412 * so we defer compaction there. If it ends up
diff --git a/mm/debug.c b/mm/debug.c
index 5ce45c9a29b5..0e58f3211f89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason,
95 dump_flags(page->flags & badflags, 95 dump_flags(page->flags & badflags,
96 pageflag_names, ARRAY_SIZE(pageflag_names)); 96 pageflag_names, ARRAY_SIZE(pageflag_names));
97 } 97 }
98 mem_cgroup_print_bad_page(page); 98#ifdef CONFIG_MEMCG
99 if (page->mem_cgroup)
100 pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
101#endif
99} 102}
100 103
101void dump_page(struct page *page, const char *reason) 104void dump_page(struct page *page, const char *reason)
diff --git a/mm/frontswap.c b/mm/frontswap.c
index f2a3571c6e22..8d82809eb085 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map)
182 if (frontswap_ops) 182 if (frontswap_ops)
183 frontswap_ops->init(type); 183 frontswap_ops->init(type);
184 else { 184 else {
185 BUG_ON(type > MAX_SWAPFILES); 185 BUG_ON(type >= MAX_SWAPFILES);
186 set_bit(type, need_init); 186 set_bit(type, need_init);
187 } 187 }
188} 188}
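The frontswap fix above is a plain off-by-one: need_init is sized for MAX_SWAPFILES entries, so the last valid swap type is MAX_SWAPFILES - 1 and the sanity check must reject type == MAX_SWAPFILES. A tiny sketch of the bound, using an illustrative array size rather than the kernel's derived MAX_SWAPFILES value:

#include <assert.h>

#define MAX_SWAPFILES 32	/* illustrative value, not the kernel's */

static unsigned char need_init[MAX_SWAPFILES];

/* Valid indexes are 0 .. MAX_SWAPFILES - 1, hence the >= check. */
static void mark_need_init(unsigned int type)
{
	assert(type < MAX_SWAPFILES);	/* mirrors BUG_ON(type >= MAX_SWAPFILES) */
	need_init[type] = 1;
}

int main(void)
{
	mark_need_init(MAX_SWAPFILES - 1);	/* highest legal type */
	/* mark_need_init(MAX_SWAPFILES); would trip the assertion. */
	return 0;
}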
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de984159cf0b..5b2c6875fc38 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
784 if (!pmd_none(*pmd)) 784 if (!pmd_none(*pmd))
785 return false; 785 return false;
786 entry = mk_pmd(zero_page, vma->vm_page_prot); 786 entry = mk_pmd(zero_page, vma->vm_page_prot);
787 entry = pmd_wrprotect(entry);
788 entry = pmd_mkhuge(entry); 787 entry = pmd_mkhuge(entry);
789 pgtable_trans_huge_deposit(mm, pmd, pgtable); 788 pgtable_trans_huge_deposit(mm, pmd, pgtable);
790 set_pmd_at(mm, haddr, pmd, entry); 789 set_pmd_at(mm, haddr, pmd, entry);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd722769927..30cd96879152 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2638,8 +2638,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2638 2638
2639 tlb_start_vma(tlb, vma); 2639 tlb_start_vma(tlb, vma);
2640 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2640 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2641 address = start;
2641again: 2642again:
2642 for (address = start; address < end; address += sz) { 2643 for (; address < end; address += sz) {
2643 ptep = huge_pte_offset(mm, address); 2644 ptep = huge_pte_offset(mm, address);
2644 if (!ptep) 2645 if (!ptep)
2645 continue; 2646 continue;
@@ -2686,6 +2687,7 @@ again:
2686 page_remove_rmap(page); 2687 page_remove_rmap(page);
2687 force_flush = !__tlb_remove_page(tlb, page); 2688 force_flush = !__tlb_remove_page(tlb, page);
2688 if (force_flush) { 2689 if (force_flush) {
2690 address += sz;
2689 spin_unlock(ptl); 2691 spin_unlock(ptl);
2690 break; 2692 break;
2691 } 2693 }
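The hugetlb hunks above hoist the address initialisation out of the scan loop and advance it past the entry that forced the flush, so the retry after "again:" resumes where the walk stopped instead of rescanning from start. A simplified userspace model of that resume-after-flush loop shape; unmap_one() is a hypothetical stand-in for the per-PTE work and the mmu_gather/TLB details are left out:

#include <stdbool.h>
#include <stdio.h>

#define SZ 1	/* step size; the huge page size in the real code */

/* Hypothetical per-address work; returns true when a flush is forced. */
static bool unmap_one(unsigned long addr)
{
	printf("unmap %lu\n", addr);
	return addr == 3;	/* pretend address 3 fills the gather buffer */
}

static void unmap_range_model(unsigned long start, unsigned long end)
{
	unsigned long address = start;	/* initialised once, as in the fix */
	bool force_flush;

again:
	force_flush = false;
	for (; address < end; address += SZ) {
		force_flush = unmap_one(address);
		if (force_flush) {
			address += SZ;	/* don't revisit this entry after the flush */
			break;
		}
	}
	if (force_flush) {
		/* tlb_flush_mmu() would run here in the kernel */
		if (address < end)
			goto again;	/* resume where we left off, not at 'start' */
	}
}

int main(void)
{
	unmap_range_model(0, 6);	/* unmaps 0..5, flushing once after 3 */
	return 0;
}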
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a67c26e0f360..037e1c00a5b7 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -14,6 +14,7 @@
14 */ 14 */
15 15
16#include <linux/cgroup.h> 16#include <linux/cgroup.h>
17#include <linux/page_counter.h>
17#include <linux/slab.h> 18#include <linux/slab.h>
18#include <linux/hugetlb.h> 19#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h> 20#include <linux/hugetlb_cgroup.h>
@@ -23,7 +24,7 @@ struct hugetlb_cgroup {
23 /* 24 /*
24 * the counter to account for hugepages from hugetlb. 25 * the counter to account for hugepages from hugetlb.
25 */ 26 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE]; 27 struct page_counter hugepage[HUGE_MAX_HSTATE];
27}; 28};
28 29
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 30#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
@@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
60 int idx; 61 int idx;
61 62
62 for (idx = 0; idx < hugetlb_max_hstate; idx++) { 63 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
63 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) 64 if (page_counter_read(&h_cg->hugepage[idx]))
64 return true; 65 return true;
65 } 66 }
66 return false; 67 return false;
@@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
79 80
80 if (parent_h_cgroup) { 81 if (parent_h_cgroup) {
81 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 82 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
82 res_counter_init(&h_cgroup->hugepage[idx], 83 page_counter_init(&h_cgroup->hugepage[idx],
83 &parent_h_cgroup->hugepage[idx]); 84 &parent_h_cgroup->hugepage[idx]);
84 } else { 85 } else {
85 root_h_cgroup = h_cgroup; 86 root_h_cgroup = h_cgroup;
86 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) 87 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
87 res_counter_init(&h_cgroup->hugepage[idx], NULL); 88 page_counter_init(&h_cgroup->hugepage[idx], NULL);
88 } 89 }
89 return &h_cgroup->css; 90 return &h_cgroup->css;
90} 91}
@@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
108static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, 109static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
109 struct page *page) 110 struct page *page)
110{ 111{
111 int csize; 112 unsigned int nr_pages;
112 struct res_counter *counter; 113 struct page_counter *counter;
113 struct res_counter *fail_res;
114 struct hugetlb_cgroup *page_hcg; 114 struct hugetlb_cgroup *page_hcg;
115 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); 115 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg);
116 116
@@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
123 if (!page_hcg || page_hcg != h_cg) 123 if (!page_hcg || page_hcg != h_cg)
124 goto out; 124 goto out;
125 125
126 csize = PAGE_SIZE << compound_order(page); 126 nr_pages = 1 << compound_order(page);
127 if (!parent) { 127 if (!parent) {
128 parent = root_h_cgroup; 128 parent = root_h_cgroup;
129 /* root has no limit */ 129 /* root has no limit */
130 res_counter_charge_nofail(&parent->hugepage[idx], 130 page_counter_charge(&parent->hugepage[idx], nr_pages);
131 csize, &fail_res);
132 } 131 }
133 counter = &h_cg->hugepage[idx]; 132 counter = &h_cg->hugepage[idx];
134 res_counter_uncharge_until(counter, counter->parent, csize); 133 /* Take the pages off the local counter */
134 page_counter_cancel(counter, nr_pages);
135 135
136 set_hugetlb_cgroup(page, parent); 136 set_hugetlb_cgroup(page, parent);
137out: 137out:
@@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
166 struct hugetlb_cgroup **ptr) 166 struct hugetlb_cgroup **ptr)
167{ 167{
168 int ret = 0; 168 int ret = 0;
169 struct res_counter *fail_res; 169 struct page_counter *counter;
170 struct hugetlb_cgroup *h_cg = NULL; 170 struct hugetlb_cgroup *h_cg = NULL;
171 unsigned long csize = nr_pages * PAGE_SIZE;
172 171
173 if (hugetlb_cgroup_disabled()) 172 if (hugetlb_cgroup_disabled())
174 goto done; 173 goto done;
@@ -187,7 +186,7 @@ again:
187 } 186 }
188 rcu_read_unlock(); 187 rcu_read_unlock();
189 188
190 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); 189 ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter);
191 css_put(&h_cg->css); 190 css_put(&h_cg->css);
192done: 191done:
193 *ptr = h_cg; 192 *ptr = h_cg;
@@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
213 struct page *page) 212 struct page *page)
214{ 213{
215 struct hugetlb_cgroup *h_cg; 214 struct hugetlb_cgroup *h_cg;
216 unsigned long csize = nr_pages * PAGE_SIZE;
217 215
218 if (hugetlb_cgroup_disabled()) 216 if (hugetlb_cgroup_disabled())
219 return; 217 return;
@@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
222 if (unlikely(!h_cg)) 220 if (unlikely(!h_cg))
223 return; 221 return;
224 set_hugetlb_cgroup(page, NULL); 222 set_hugetlb_cgroup(page, NULL);
225 res_counter_uncharge(&h_cg->hugepage[idx], csize); 223 page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
226 return; 224 return;
227} 225}
228 226
229void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 227void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
230 struct hugetlb_cgroup *h_cg) 228 struct hugetlb_cgroup *h_cg)
231{ 229{
232 unsigned long csize = nr_pages * PAGE_SIZE;
233
234 if (hugetlb_cgroup_disabled() || !h_cg) 230 if (hugetlb_cgroup_disabled() || !h_cg)
235 return; 231 return;
236 232
237 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) 233 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
238 return; 234 return;
239 235
240 res_counter_uncharge(&h_cg->hugepage[idx], csize); 236 page_counter_uncharge(&h_cg->hugepage[idx], nr_pages);
241 return; 237 return;
242} 238}
243 239
240enum {
241 RES_USAGE,
242 RES_LIMIT,
243 RES_MAX_USAGE,
244 RES_FAILCNT,
245};
246
244static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, 247static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
245 struct cftype *cft) 248 struct cftype *cft)
246{ 249{
247 int idx, name; 250 struct page_counter *counter;
248 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); 251 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
249 252
250 idx = MEMFILE_IDX(cft->private); 253 counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
251 name = MEMFILE_ATTR(cft->private);
252 254
253 return res_counter_read_u64(&h_cg->hugepage[idx], name); 255 switch (MEMFILE_ATTR(cft->private)) {
256 case RES_USAGE:
257 return (u64)page_counter_read(counter) * PAGE_SIZE;
258 case RES_LIMIT:
259 return (u64)counter->limit * PAGE_SIZE;
260 case RES_MAX_USAGE:
261 return (u64)counter->watermark * PAGE_SIZE;
262 case RES_FAILCNT:
263 return counter->failcnt;
264 default:
265 BUG();
266 }
254} 267}
255 268
269static DEFINE_MUTEX(hugetlb_limit_mutex);
270
256static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, 271static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
257 char *buf, size_t nbytes, loff_t off) 272 char *buf, size_t nbytes, loff_t off)
258{ 273{
259 int idx, name, ret; 274 int ret, idx;
260 unsigned long long val; 275 unsigned long nr_pages;
261 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 276 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
262 277
278 if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
279 return -EINVAL;
280
263 buf = strstrip(buf); 281 buf = strstrip(buf);
282 ret = page_counter_memparse(buf, &nr_pages);
283 if (ret)
284 return ret;
285
264 idx = MEMFILE_IDX(of_cft(of)->private); 286 idx = MEMFILE_IDX(of_cft(of)->private);
265 name = MEMFILE_ATTR(of_cft(of)->private);
266 287
267 switch (name) { 288 switch (MEMFILE_ATTR(of_cft(of)->private)) {
268 case RES_LIMIT: 289 case RES_LIMIT:
269 if (hugetlb_cgroup_is_root(h_cg)) { 290 mutex_lock(&hugetlb_limit_mutex);
270 /* Can't set limit on root */ 291 ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages);
271 ret = -EINVAL; 292 mutex_unlock(&hugetlb_limit_mutex);
272 break;
273 }
274 /* This function does all necessary parse...reuse it */
275 ret = res_counter_memparse_write_strategy(buf, &val);
276 if (ret)
277 break;
278 val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx]));
279 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
280 break; 293 break;
281 default: 294 default:
282 ret = -EINVAL; 295 ret = -EINVAL;
@@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
288static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, 301static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
289 char *buf, size_t nbytes, loff_t off) 302 char *buf, size_t nbytes, loff_t off)
290{ 303{
291 int idx, name, ret = 0; 304 int ret = 0;
305 struct page_counter *counter;
292 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); 306 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
293 307
294 idx = MEMFILE_IDX(of_cft(of)->private); 308 counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)];
295 name = MEMFILE_ATTR(of_cft(of)->private);
296 309
297 switch (name) { 310 switch (MEMFILE_ATTR(of_cft(of)->private)) {
298 case RES_MAX_USAGE: 311 case RES_MAX_USAGE:
299 res_counter_reset_max(&h_cg->hugepage[idx]); 312 page_counter_reset_watermark(counter);
300 break; 313 break;
301 case RES_FAILCNT: 314 case RES_FAILCNT:
302 res_counter_reset_failcnt(&h_cg->hugepage[idx]); 315 counter->failcnt = 0;
303 break; 316 break;
304 default: 317 default:
305 ret = -EINVAL; 318 ret = -EINVAL;
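The hugetlb_cgroup conversion above swaps byte-based res_counter calls for the new page_counter, whose count, limit, watermark and failcnt fields appear throughout the hunks. A flat, single-threaded userspace model of the accounting semantics those calls rely on; the real mm/page_counter.c is atomic and hierarchical, and its try_charge returns -ENOMEM plus the failing counter rather than a bool:

#include <stdbool.h>
#include <stdio.h>

/* Simplified model of a page_counter: all values are in pages. */
struct page_counter_model {
	unsigned long count;		/* pages currently charged */
	unsigned long limit;		/* hard limit */
	unsigned long watermark;	/* historical maximum of count */
	unsigned long failcnt;		/* number of failed charges */
};

/* Unconditional charge, used by the root cgroup ("root has no limit"). */
static void counter_charge(struct page_counter_model *c, unsigned long n)
{
	c->count += n;
	if (c->count > c->watermark)
		c->watermark = c->count;
}

/* Charge that respects the limit; failures bump failcnt. */
static bool counter_try_charge(struct page_counter_model *c, unsigned long n)
{
	if (c->count + n > c->limit) {
		c->failcnt++;
		return false;
	}
	counter_charge(c, n);
	return true;
}

static void counter_uncharge(struct page_counter_model *c, unsigned long n)
{
	c->count -= n;
}

int main(void)
{
	struct page_counter_model c = { .limit = 512 };

	counter_try_charge(&c, 512);		/* one 2MB hugepage on 4K pages */
	if (!counter_try_charge(&c, 512))	/* over the limit: fails */
		printf("charge failed, failcnt=%lu\n", c.failcnt);
	counter_uncharge(&c, 512);
	printf("usage=%lu watermark=%lu\n", c.count, c.watermark);
	return 0;
}

This also explains the RES_USAGE/RES_LIMIT/RES_MAX_USAGE reads above being multiplied by PAGE_SIZE: the counter is kept in pages, while the cgroup files keep reporting bytes.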
diff --git a/mm/internal.h b/mm/internal.h
index a4f90ba7068e..efad241f7014 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -161,13 +161,10 @@ struct compact_control {
161 unsigned long migrate_pfn; /* isolate_migratepages search base */ 161 unsigned long migrate_pfn; /* isolate_migratepages search base */
162 enum migrate_mode mode; /* Async or sync migration mode */ 162 enum migrate_mode mode; /* Async or sync migration mode */
163 bool ignore_skip_hint; /* Scan blocks even if marked skip */ 163 bool ignore_skip_hint; /* Scan blocks even if marked skip */
164 bool finished_update_free; /* True when the zone cached pfns are
165 * no longer being updated
166 */
167 bool finished_update_migrate;
168
169 int order; /* order a direct compactor needs */ 164 int order; /* order a direct compactor needs */
170 const gfp_t gfp_mask; /* gfp mask of a direct compactor */ 165 const gfp_t gfp_mask; /* gfp mask of a direct compactor */
166 const int alloc_flags; /* alloc flags of a direct compactor */
167 const int classzone_idx; /* zone index of a direct compactor */
171 struct zone *zone; 168 struct zone *zone;
172 int contended; /* Signal need_sched() or lock 169 int contended; /* Signal need_sched() or lock
173 * contention detected during 170 * contention detected during
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ee48428cf8e3..85df503ec023 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28#include <linux/res_counter.h> 28#include <linux/page_counter.h>
29#include <linux/memcontrol.h> 29#include <linux/memcontrol.h>
30#include <linux/cgroup.h> 30#include <linux/cgroup.h>
31#include <linux/mm.h> 31#include <linux/mm.h>
@@ -51,7 +51,7 @@
51#include <linux/seq_file.h> 51#include <linux/seq_file.h>
52#include <linux/vmpressure.h> 52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h> 54#include <linux/swap_cgroup.h>
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu {
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 143 unsigned long targets[MEM_CGROUP_NTARGETS];
144}; 144};
145 145
146struct mem_cgroup_reclaim_iter { 146struct reclaim_iter {
147 /* 147 struct mem_cgroup *position;
148 * last scanned hierarchy member. Valid only if last_dead_count
149 * matches memcg->dead_count of the hierarchy root group.
150 */
151 struct mem_cgroup *last_visited;
152 int last_dead_count;
153
154 /* scan generation, increased every round-trip */ 148 /* scan generation, increased every round-trip */
155 unsigned int generation; 149 unsigned int generation;
156}; 150};
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone {
162 struct lruvec lruvec; 156 struct lruvec lruvec;
163 unsigned long lru_size[NR_LRU_LISTS]; 157 unsigned long lru_size[NR_LRU_LISTS];
164 158
165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 159 struct reclaim_iter iter[DEF_PRIORITY + 1];
166 160
167 struct rb_node tree_node; /* RB tree node */ 161 struct rb_node tree_node; /* RB tree node */
168 unsigned long long usage_in_excess;/* Set to the value by which */ 162 unsigned long usage_in_excess;/* Set to the value by which */
169 /* the soft limit is exceeded*/ 163 /* the soft limit is exceeded*/
170 bool on_tree; 164 bool on_tree;
171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 165 struct mem_cgroup *memcg; /* Back pointer, we cannot */
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198 192
199struct mem_cgroup_threshold { 193struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd; 194 struct eventfd_ctx *eventfd;
201 u64 threshold; 195 unsigned long threshold;
202}; 196};
203 197
204/* For threshold */ 198/* For threshold */
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
284 */ 278 */
285struct mem_cgroup { 279struct mem_cgroup {
286 struct cgroup_subsys_state css; 280 struct cgroup_subsys_state css;
287 /* 281
288 * the counter to account for memory usage 282 /* Accounted resources */
289 */ 283 struct page_counter memory;
290 struct res_counter res; 284 struct page_counter memsw;
285 struct page_counter kmem;
286
287 unsigned long soft_limit;
291 288
292 /* vmpressure notifications */ 289 /* vmpressure notifications */
293 struct vmpressure vmpressure; 290 struct vmpressure vmpressure;
@@ -296,15 +293,6 @@ struct mem_cgroup {
296 int initialized; 293 int initialized;
297 294
298 /* 295 /*
299 * the counter to account for mem+swap usage.
300 */
301 struct res_counter memsw;
302
303 /*
304 * the counter to account for kernel memory usage.
305 */
306 struct res_counter kmem;
307 /*
308 * Should the accounting and control be hierarchical, per subtree? 296 * Should the accounting and control be hierarchical, per subtree?
309 */ 297 */
310 bool use_hierarchy; 298 bool use_hierarchy;
@@ -352,7 +340,6 @@ struct mem_cgroup {
352 struct mem_cgroup_stat_cpu nocpu_base; 340 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock; 341 spinlock_t pcp_counter_lock;
354 342
355 atomic_t dead_count;
356#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 343#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem; 344 struct cg_proto tcp_mem;
358#endif 345#endif
@@ -382,7 +369,6 @@ struct mem_cgroup {
382/* internal only representation about the status of kmem accounting. */ 369/* internal only representation about the status of kmem accounting. */
383enum { 370enum {
384 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
385 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
386}; 372};
387 373
388#ifdef CONFIG_MEMCG_KMEM 374#ifdef CONFIG_MEMCG_KMEM
@@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
396 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 382 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
397} 383}
398 384
399static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
400{
401 /*
402 * Our caller must use css_get() first, because memcg_uncharge_kmem()
403 * will call css_put() if it sees the memcg is dead.
404 */
405 smp_wmb();
406 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
407 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
408}
409
410static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
411{
412 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
413 &memcg->kmem_account_flags);
414}
415#endif 385#endif
416 386
417/* Stuffs for move charges at task migration. */ 387/* Stuffs for move charges at task migration. */
@@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg)
650 * This check can't live in kmem destruction function, 620 * This check can't live in kmem destruction function,
651 * since the charges will outlive the cgroup 621 * since the charges will outlive the cgroup
652 */ 622 */
653 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 623 WARN_ON(page_counter_read(&memcg->kmem));
654} 624}
655#else 625#else
656static void disarm_kmem_keys(struct mem_cgroup *memcg) 626static void disarm_kmem_keys(struct mem_cgroup *memcg)
@@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
664 disarm_kmem_keys(memcg); 634 disarm_kmem_keys(memcg);
665} 635}
666 636
667static void drain_all_stock_async(struct mem_cgroup *memcg);
668
669static struct mem_cgroup_per_zone * 637static struct mem_cgroup_per_zone *
670mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 638mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
671{ 639{
@@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page)
706 674
707static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 675static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
708 struct mem_cgroup_tree_per_zone *mctz, 676 struct mem_cgroup_tree_per_zone *mctz,
709 unsigned long long new_usage_in_excess) 677 unsigned long new_usage_in_excess)
710{ 678{
711 struct rb_node **p = &mctz->rb_root.rb_node; 679 struct rb_node **p = &mctz->rb_root.rb_node;
712 struct rb_node *parent = NULL; 680 struct rb_node *parent = NULL;
@@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
755 spin_unlock_irqrestore(&mctz->lock, flags); 723 spin_unlock_irqrestore(&mctz->lock, flags);
756} 724}
757 725
726static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
727{
728 unsigned long nr_pages = page_counter_read(&memcg->memory);
729 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
730 unsigned long excess = 0;
731
732 if (nr_pages > soft_limit)
733 excess = nr_pages - soft_limit;
734
735 return excess;
736}
758 737
759static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 738static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
760{ 739{
761 unsigned long long excess; 740 unsigned long excess;
762 struct mem_cgroup_per_zone *mz; 741 struct mem_cgroup_per_zone *mz;
763 struct mem_cgroup_tree_per_zone *mctz; 742 struct mem_cgroup_tree_per_zone *mctz;
764 743
@@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
769 */ 748 */
770 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 749 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
771 mz = mem_cgroup_page_zoneinfo(memcg, page); 750 mz = mem_cgroup_page_zoneinfo(memcg, page);
772 excess = res_counter_soft_limit_excess(&memcg->res); 751 excess = soft_limit_excess(memcg);
773 /* 752 /*
774 * We have to update the tree if mz is on RB-tree or 753 * We have to update the tree if mz is on RB-tree or
775 * mem is over its softlimit. 754 * mem is over its softlimit.
@@ -825,7 +804,7 @@ retry:
825 * position in the tree. 804 * position in the tree.
826 */ 805 */
827 __mem_cgroup_remove_exceeded(mz, mctz); 806 __mem_cgroup_remove_exceeded(mz, mctz);
828 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 807 if (!soft_limit_excess(mz->memcg) ||
829 !css_tryget_online(&mz->memcg->css)) 808 !css_tryget_online(&mz->memcg->css))
830 goto retry; 809 goto retry;
831done: 810done:
@@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1062 return memcg; 1041 return memcg;
1063} 1042}
1064 1043
1065/*
1066 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1067 * ref. count) or NULL if the whole root's subtree has been visited.
1068 *
1069 * helper function to be used by mem_cgroup_iter
1070 */
1071static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1072 struct mem_cgroup *last_visited)
1073{
1074 struct cgroup_subsys_state *prev_css, *next_css;
1075
1076 prev_css = last_visited ? &last_visited->css : NULL;
1077skip_node:
1078 next_css = css_next_descendant_pre(prev_css, &root->css);
1079
1080 /*
1081 * Even if we found a group we have to make sure it is
1082 * alive. css && !memcg means that the groups should be
1083 * skipped and we should continue the tree walk.
1084 * last_visited css is safe to use because it is
1085 * protected by css_get and the tree walk is rcu safe.
1086 *
1087 * We do not take a reference on the root of the tree walk
1088 * because we might race with the root removal when it would
1089 * be the only node in the iterated hierarchy and mem_cgroup_iter
1090 * would end up in an endless loop because it expects that at
1091 * least one valid node will be returned. Root cannot disappear
1092 * because caller of the iterator should hold it already so
1093 * skipping css reference should be safe.
1094 */
1095 if (next_css) {
1096 struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
1097
1098 if (next_css == &root->css)
1099 return memcg;
1100
1101 if (css_tryget_online(next_css)) {
1102 /*
1103 * Make sure the memcg is initialized:
1104 * mem_cgroup_css_online() orders the the
1105 * initialization against setting the flag.
1106 */
1107 if (smp_load_acquire(&memcg->initialized))
1108 return memcg;
1109 css_put(next_css);
1110 }
1111
1112 prev_css = next_css;
1113 goto skip_node;
1114 }
1115
1116 return NULL;
1117}
1118
1119static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1120{
1121 /*
1122 * When a group in the hierarchy below root is destroyed, the
1123 * hierarchy iterator can no longer be trusted since it might
1124 * have pointed to the destroyed group. Invalidate it.
1125 */
1126 atomic_inc(&root->dead_count);
1127}
1128
1129static struct mem_cgroup *
1130mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1131 struct mem_cgroup *root,
1132 int *sequence)
1133{
1134 struct mem_cgroup *position = NULL;
1135 /*
1136 * A cgroup destruction happens in two stages: offlining and
1137 * release. They are separated by a RCU grace period.
1138 *
1139 * If the iterator is valid, we may still race with an
1140 * offlining. The RCU lock ensures the object won't be
1141 * released, tryget will fail if we lost the race.
1142 */
1143 *sequence = atomic_read(&root->dead_count);
1144 if (iter->last_dead_count == *sequence) {
1145 smp_rmb();
1146 position = iter->last_visited;
1147
1148 /*
1149 * We cannot take a reference to root because we might race
1150 * with root removal and returning NULL would end up in
1151 * an endless loop on the iterator user level when root
1152 * would be returned all the time.
1153 */
1154 if (position && position != root &&
1155 !css_tryget_online(&position->css))
1156 position = NULL;
1157 }
1158 return position;
1159}
1160
1161static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1162 struct mem_cgroup *last_visited,
1163 struct mem_cgroup *new_position,
1164 struct mem_cgroup *root,
1165 int sequence)
1166{
1167 /* root reference counting symmetric to mem_cgroup_iter_load */
1168 if (last_visited && last_visited != root)
1169 css_put(&last_visited->css);
1170 /*
1171 * We store the sequence count from the time @last_visited was
1172 * loaded successfully instead of rereading it here so that we
1173 * don't lose destruction events in between. We could have
1174 * raced with the destruction of @new_position after all.
1175 */
1176 iter->last_visited = new_position;
1177 smp_wmb();
1178 iter->last_dead_count = sequence;
1179}
1180
1181/** 1044/**
1182 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1045 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1183 * @root: hierarchy root 1046 * @root: hierarchy root
@@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1199 struct mem_cgroup *prev, 1062 struct mem_cgroup *prev,
1200 struct mem_cgroup_reclaim_cookie *reclaim) 1063 struct mem_cgroup_reclaim_cookie *reclaim)
1201{ 1064{
1065 struct reclaim_iter *uninitialized_var(iter);
1066 struct cgroup_subsys_state *css = NULL;
1202 struct mem_cgroup *memcg = NULL; 1067 struct mem_cgroup *memcg = NULL;
1203 struct mem_cgroup *last_visited = NULL; 1068 struct mem_cgroup *pos = NULL;
1204 1069
1205 if (mem_cgroup_disabled()) 1070 if (mem_cgroup_disabled())
1206 return NULL; 1071 return NULL;
@@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1209 root = root_mem_cgroup; 1074 root = root_mem_cgroup;
1210 1075
1211 if (prev && !reclaim) 1076 if (prev && !reclaim)
1212 last_visited = prev; 1077 pos = prev;
1213 1078
1214 if (!root->use_hierarchy && root != root_mem_cgroup) { 1079 if (!root->use_hierarchy && root != root_mem_cgroup) {
1215 if (prev) 1080 if (prev)
1216 goto out_css_put; 1081 goto out;
1217 return root; 1082 return root;
1218 } 1083 }
1219 1084
1220 rcu_read_lock(); 1085 rcu_read_lock();
1221 while (!memcg) {
1222 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1223 int uninitialized_var(seq);
1224
1225 if (reclaim) {
1226 struct mem_cgroup_per_zone *mz;
1227
1228 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1229 iter = &mz->reclaim_iter[reclaim->priority];
1230 if (prev && reclaim->generation != iter->generation) {
1231 iter->last_visited = NULL;
1232 goto out_unlock;
1233 }
1234 1086
1235 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1087 if (reclaim) {
1088 struct mem_cgroup_per_zone *mz;
1089
1090 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1091 iter = &mz->iter[reclaim->priority];
1092
1093 if (prev && reclaim->generation != iter->generation)
1094 goto out_unlock;
1095
1096 do {
1097 pos = ACCESS_ONCE(iter->position);
1098 /*
1099 * A racing update may change the position and
1100 * put the last reference, hence css_tryget(),
1101 * or retry to see the updated position.
1102 */
1103 } while (pos && !css_tryget(&pos->css));
1104 }
1105
1106 if (pos)
1107 css = &pos->css;
1108
1109 for (;;) {
1110 css = css_next_descendant_pre(css, &root->css);
1111 if (!css) {
1112 /*
1113 * Reclaimers share the hierarchy walk, and a
1114 * new one might jump in right at the end of
1115 * the hierarchy - make sure they see at least
1116 * one group and restart from the beginning.
1117 */
1118 if (!prev)
1119 continue;
1120 break;
1236 } 1121 }
1237 1122
1238 memcg = __mem_cgroup_iter_next(root, last_visited); 1123 /*
1124 * Verify the css and acquire a reference. The root
1125 * is provided by the caller, so we know it's alive
1126 * and kicking, and don't take an extra reference.
1127 */
1128 memcg = mem_cgroup_from_css(css);
1129
1130 if (css == &root->css)
1131 break;
1239 1132
1240 if (reclaim) { 1133 if (css_tryget(css)) {
1241 mem_cgroup_iter_update(iter, last_visited, memcg, root, 1134 /*
1242 seq); 1135 * Make sure the memcg is initialized:
1136 * mem_cgroup_css_online() orders the the
1137 * initialization against setting the flag.
1138 */
1139 if (smp_load_acquire(&memcg->initialized))
1140 break;
1243 1141
1244 if (!memcg) 1142 css_put(css);
1245 iter->generation++;
1246 else if (!prev && memcg)
1247 reclaim->generation = iter->generation;
1248 } 1143 }
1249 1144
1250 if (prev && !memcg) 1145 memcg = NULL;
1251 goto out_unlock; 1146 }
1147
1148 if (reclaim) {
1149 if (cmpxchg(&iter->position, pos, memcg) == pos) {
1150 if (memcg)
1151 css_get(&memcg->css);
1152 if (pos)
1153 css_put(&pos->css);
1154 }
1155
1156 /*
1157 * pairs with css_tryget when dereferencing iter->position
1158 * above.
1159 */
1160 if (pos)
1161 css_put(&pos->css);
1162
1163 if (!memcg)
1164 iter->generation++;
1165 else if (!prev)
1166 reclaim->generation = iter->generation;
1252 } 1167 }
1168
1253out_unlock: 1169out_unlock:
1254 rcu_read_unlock(); 1170 rcu_read_unlock();
1255out_css_put: 1171out:
1256 if (prev && prev != root) 1172 if (prev && prev != root)
1257 css_put(&prev->css); 1173 css_put(&prev->css);
1258 1174
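The rewritten mem_cgroup_iter() above publishes the shared per-zone/per-priority position with cmpxchg so that concurrent reclaimers advancing the same iterator neither overwrite a newer position nor leak references. A userspace sketch of just that lockless publish step, using C11 atomics in place of the kernel's cmpxchg(); the css_get()/css_put() reference handling and generation bookkeeping are not modelled:

#include <stdatomic.h>
#include <stdio.h>

struct node { int id; };	/* stands in for struct mem_cgroup */

static _Atomic(struct node *) iter_position;

/*
 * Equivalent of: if (cmpxchg(&iter->position, pos, memcg) == pos)
 * The store only happens if the slot still holds the position this
 * walker started from; otherwise another walker already advanced it.
 */
static void publish_position(struct node *pos, struct node *next)
{
	struct node *expected = pos;

	if (atomic_compare_exchange_strong(&iter_position, &expected, next)) {
		/* In the kernel, css_get(next) and css_put(pos) pair up here. */
	}
	/* On failure the racing walker's position stands untouched. */
}

int main(void)
{
	static struct node a = { 1 }, b = { 2 };

	atomic_store(&iter_position, &a);
	publish_position(&a, &b);	/* succeeds: position is now &b */
	publish_position(&a, NULL);	/* fails: &a is no longer current */
	printf("position id: %d\n", atomic_load(&iter_position)->id);
	return 0;
}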
@@ -1346,15 +1262,18 @@ out:
1346} 1262}
1347 1263
1348/** 1264/**
1349 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1265 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1350 * @page: the page 1266 * @page: the page
1351 * @zone: zone of the page 1267 * @zone: zone of the page
1268 *
1269 * This function is only safe when following the LRU page isolation
1270 * and putback protocol: the LRU lock must be held, and the page must
1271 * either be PageLRU() or the caller must have isolated/allocated it.
1352 */ 1272 */
1353struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1273struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1354{ 1274{
1355 struct mem_cgroup_per_zone *mz; 1275 struct mem_cgroup_per_zone *mz;
1356 struct mem_cgroup *memcg; 1276 struct mem_cgroup *memcg;
1357 struct page_cgroup *pc;
1358 struct lruvec *lruvec; 1277 struct lruvec *lruvec;
1359 1278
1360 if (mem_cgroup_disabled()) { 1279 if (mem_cgroup_disabled()) {
@@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1362 goto out; 1281 goto out;
1363 } 1282 }
1364 1283
1365 pc = lookup_page_cgroup(page); 1284 memcg = page->mem_cgroup;
1366 memcg = pc->mem_cgroup;
1367
1368 /* 1285 /*
1369 * Surreptitiously switch any uncharged offlist page to root: 1286 * Swapcache readahead pages are added to the LRU - and
1370 * an uncharged page off lru does nothing to secure 1287 * possibly migrated - before they are charged.
1371 * its former mem_cgroup from sudden removal.
1372 *
1373 * Our caller holds lru_lock, and PageCgroupUsed is updated
1374 * under page_cgroup lock: between them, they make all uses
1375 * of pc->mem_cgroup safe.
1376 */ 1288 */
1377 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1289 if (!memcg)
1378 pc->mem_cgroup = memcg = root_mem_cgroup; 1290 memcg = root_mem_cgroup;
1379 1291
1380 mz = mem_cgroup_page_zoneinfo(memcg, page); 1292 mz = mem_cgroup_page_zoneinfo(memcg, page);
1381 lruvec = &mz->lruvec; 1293 lruvec = &mz->lruvec;
@@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1414 VM_BUG_ON((long)(*lru_size) < 0); 1326 VM_BUG_ON((long)(*lru_size) < 0);
1415} 1327}
1416 1328
1417/* 1329bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
1418 * Checks whether given mem is same or in the root_mem_cgroup's
1419 * hierarchy subtree
1420 */
1421bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1422 struct mem_cgroup *memcg)
1423{ 1330{
1424 if (root_memcg == memcg) 1331 if (root == memcg)
1425 return true; 1332 return true;
1426 if (!root_memcg->use_hierarchy || !memcg) 1333 if (!root->use_hierarchy)
1427 return false; 1334 return false;
1428 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1335 return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
1429}
1430
1431static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1432 struct mem_cgroup *memcg)
1433{
1434 bool ret;
1435
1436 rcu_read_lock();
1437 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1438 rcu_read_unlock();
1439 return ret;
1440} 1336}
1441 1337
1442bool task_in_mem_cgroup(struct task_struct *task, 1338bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1443 const struct mem_cgroup *memcg)
1444{ 1339{
1445 struct mem_cgroup *curr = NULL; 1340 struct mem_cgroup *task_memcg;
1446 struct task_struct *p; 1341 struct task_struct *p;
1447 bool ret; 1342 bool ret;
1448 1343
1449 p = find_lock_task_mm(task); 1344 p = find_lock_task_mm(task);
1450 if (p) { 1345 if (p) {
1451 curr = get_mem_cgroup_from_mm(p->mm); 1346 task_memcg = get_mem_cgroup_from_mm(p->mm);
1452 task_unlock(p); 1347 task_unlock(p);
1453 } else { 1348 } else {
1454 /* 1349 /*
@@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task,
1457 * killed to prevent needlessly killing additional tasks. 1352 * killed to prevent needlessly killing additional tasks.
1458 */ 1353 */
1459 rcu_read_lock(); 1354 rcu_read_lock();
1460 curr = mem_cgroup_from_task(task); 1355 task_memcg = mem_cgroup_from_task(task);
1461 if (curr) 1356 css_get(&task_memcg->css);
1462 css_get(&curr->css);
1463 rcu_read_unlock(); 1357 rcu_read_unlock();
1464 } 1358 }
1465 /* 1359 ret = mem_cgroup_is_descendant(task_memcg, memcg);
1466 * We should check use_hierarchy of "memcg" not "curr". Because checking 1360 css_put(&task_memcg->css);
1467 * use_hierarchy of "curr" here makes this function true if hierarchy is
1468 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1469 * hierarchy(even if use_hierarchy is disabled in "memcg").
1470 */
1471 ret = mem_cgroup_same_or_subtree(memcg, curr);
1472 css_put(&curr->css);
1473 return ret; 1361 return ret;
1474} 1362}
1475 1363
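The new mem_cgroup_is_descendant() delegates to cgroup_is_descendant() after the use_hierarchy check. The sketch below is only a model of the tree walk, using a hypothetical parent-pointer struct cg; it ignores use_hierarchy, RCU and reference counting entirely.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical cgroup node with a parent pointer; the kernel walks the
 * css tree via cgroup_is_descendant() instead. */
struct cg { struct cg *parent; const char *name; };

/* true if memcg == root or memcg lies anywhere below root */
static bool is_descendant(struct cg *memcg, struct cg *root)
{
        for (; memcg; memcg = memcg->parent)
                if (memcg == root)
                        return true;
        return false;
}

int main(void)
{
        struct cg root = { NULL, "root" };
        struct cg a = { &root, "a" };
        struct cg b = { &a, "b" };

        printf("%d %d %d\n",
               is_descendant(&b, &root),  /* 1 */
               is_descendant(&b, &b),     /* 1: the same group counts */
               is_descendant(&a, &b));    /* 0 */
        return 0;
}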
@@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1492 return inactive * inactive_ratio < active; 1380 return inactive * inactive_ratio < active;
1493} 1381}
1494 1382
1495#define mem_cgroup_from_res_counter(counter, member) \ 1383#define mem_cgroup_from_counter(counter, member) \
1496 container_of(counter, struct mem_cgroup, member) 1384 container_of(counter, struct mem_cgroup, member)
1497 1385
1498/** 1386/**
@@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1504 */ 1392 */
1505static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1393static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1506{ 1394{
1507 unsigned long long margin; 1395 unsigned long margin = 0;
1396 unsigned long count;
1397 unsigned long limit;
1508 1398
1509 margin = res_counter_margin(&memcg->res); 1399 count = page_counter_read(&memcg->memory);
1510 if (do_swap_account) 1400 limit = ACCESS_ONCE(memcg->memory.limit);
1511 margin = min(margin, res_counter_margin(&memcg->memsw)); 1401 if (count < limit)
1512 return margin >> PAGE_SHIFT; 1402 margin = limit - count;
1403
1404 if (do_swap_account) {
1405 count = page_counter_read(&memcg->memsw);
1406 limit = ACCESS_ONCE(memcg->memsw.limit);
1407 if (count <= limit)
1408 margin = min(margin, limit - count);
1409 }
1410
1411 return margin;
1513} 1412}
1514 1413
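mem_cgroup_margin() now works purely in pages: headroom under memory.limit, clamped by the memsw headroom when swap accounting is enabled. A small standalone sketch of that arithmetic (plain integers, no ACCESS_ONCE or concurrency):

#include <stdio.h>

/* Simplified: counts and limits are plain page counts, read once. */
static unsigned long margin(unsigned long mem_count, unsigned long mem_limit,
                            unsigned long memsw_count, unsigned long memsw_limit,
                            int do_swap_account)
{
        unsigned long m = 0;

        if (mem_count < mem_limit)
                m = mem_limit - mem_count;

        if (do_swap_account && memsw_count <= memsw_limit) {
                unsigned long msw = memsw_limit - memsw_count;
                if (msw < m)
                        m = msw;
        }
        return m;
}

int main(void)
{
        /* memory: 300 of 512 pages used; memory+swap: 500 of 520 used. */
        printf("%lu\n", margin(300, 512, 500, 520, 1)); /* 20: memsw is tighter */
        printf("%lu\n", margin(300, 512, 500, 520, 0)); /* 212 without swap accounting */
        return 0;
}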
1515int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1414int mem_cgroup_swappiness(struct mem_cgroup *memcg)
@@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1522} 1421}
1523 1422
1524/* 1423/*
1525 * memcg->moving_account is used for checking possibility that some thread is
1526 * calling move_account(). When a thread on CPU-A starts moving pages under
1527 * a memcg, other threads should check memcg->moving_account under
1528 * rcu_read_lock(), like this:
1529 *
1530 * CPU-A CPU-B
1531 * rcu_read_lock()
1532 * memcg->moving_account+1 if (memcg->moving_account)
1533 * take heavy locks.
1534 * synchronize_rcu() update something.
1535 * rcu_read_unlock()
1536 * start move here.
1537 */
1538
1539static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1540{
1541 atomic_inc(&memcg->moving_account);
1542 synchronize_rcu();
1543}
1544
1545static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1546{
1547 /*
1548 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1549 * We check NULL in callee rather than caller.
1550 */
1551 if (memcg)
1552 atomic_dec(&memcg->moving_account);
1553}
1554
1555/*
1556 * A routine for checking "mem" is under move_account() or not. 1424 * A routine for checking "mem" is under move_account() or not.
1557 * 1425 *
1558 * Checking a cgroup is mc.from or mc.to or under hierarchy of 1426 * Checking a cgroup is mc.from or mc.to or under hierarchy of
@@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1574 if (!from) 1442 if (!from)
1575 goto unlock; 1443 goto unlock;
1576 1444
1577 ret = mem_cgroup_same_or_subtree(memcg, from) 1445 ret = mem_cgroup_is_descendant(from, memcg) ||
1578 || mem_cgroup_same_or_subtree(memcg, to); 1446 mem_cgroup_is_descendant(to, memcg);
1579unlock: 1447unlock:
1580 spin_unlock(&mc.lock); 1448 spin_unlock(&mc.lock);
1581 return ret; 1449 return ret;
@@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1597 return false; 1465 return false;
1598} 1466}
1599 1467
1600/*
1601 * Take this lock when
1602 * - a code tries to modify page's memcg while it's USED.
1603 * - a code tries to modify page state accounting in a memcg.
1604 */
1605static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1606 unsigned long *flags)
1607{
1608 spin_lock_irqsave(&memcg->move_lock, *flags);
1609}
1610
1611static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1612 unsigned long *flags)
1613{
1614 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1615}
1616
1617#define K(x) ((x) << (PAGE_SHIFT-10)) 1468#define K(x) ((x) << (PAGE_SHIFT-10))
1618/** 1469/**
1619 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1470 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
@@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1644 1495
1645 rcu_read_unlock(); 1496 rcu_read_unlock();
1646 1497
1647 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1498 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1648 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1499 K((u64)page_counter_read(&memcg->memory)),
1649 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1500 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1650 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1501 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1651 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1502 K((u64)page_counter_read(&memcg->memsw)),
1652 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1503 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1653 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1504 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1654 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1505 K((u64)page_counter_read(&memcg->kmem)),
1655 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1506 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1656 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1657 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1658 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1659 1507
1660 for_each_mem_cgroup_tree(iter, memcg) { 1508 for_each_mem_cgroup_tree(iter, memcg) {
1661 pr_info("Memory cgroup stats for "); 1509 pr_info("Memory cgroup stats for ");
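The OOM report switches from byte-valued res_counter reads shifted by 10 to page counts converted with K(x) = x << (PAGE_SHIFT - 10). A tiny standalone check of that conversion, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                       /* assume 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))     /* pages -> kilobytes */

int main(void)
{
        unsigned long long usage_pages = 25600;     /* 100 MiB worth of pages */

        printf("memory: usage %llukB\n", K(usage_pages)); /* 102400 kB */
        return 0;
}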
@@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1695/* 1543/*
1696 * Return the memory (and swap, if configured) limit for a memcg. 1544 * Return the memory (and swap, if configured) limit for a memcg.
1697 */ 1545 */
1698static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1546static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1699{ 1547{
1700 u64 limit; 1548 unsigned long limit;
1701
1702 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1703 1549
1704 /* 1550 limit = memcg->memory.limit;
1705 * Do not consider swap space if we cannot swap due to swappiness
1706 */
1707 if (mem_cgroup_swappiness(memcg)) { 1551 if (mem_cgroup_swappiness(memcg)) {
1708 u64 memsw; 1552 unsigned long memsw_limit;
1709 1553
1710 limit += total_swap_pages << PAGE_SHIFT; 1554 memsw_limit = memcg->memsw.limit;
1711 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1555 limit = min(limit + total_swap_pages, memsw_limit);
1712
1713 /*
1714 * If memsw is finite and limits the amount of swap space
1715 * available to this memcg, return that limit.
1716 */
1717 limit = min(limit, memsw);
1718 } 1556 }
1719
1720 return limit; 1557 return limit;
1721} 1558}
1722 1559
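mem_cgroup_get_limit() now returns pages: the memory limit plus all of swap, capped at the memsw limit, with swap ignored when swappiness is zero. A sketch of that formula with made-up page counts (oom_limit is an invented name for the example):

#include <stdio.h>

static unsigned long oom_limit(unsigned long memory_limit,
                               unsigned long memsw_limit,
                               unsigned long total_swap_pages,
                               int swappiness)
{
        unsigned long limit = memory_limit;

        if (swappiness) {
                unsigned long with_swap = limit + total_swap_pages;
                limit = with_swap < memsw_limit ? with_swap : memsw_limit;
        }
        return limit;
}

int main(void)
{
        /* 512 MiB memory limit, 768 MiB memory+swap limit, 1 GiB of swap,
         * all expressed in 4 KiB pages. */
        printf("%lu\n", oom_limit(131072, 196608, 262144, 60)); /* 196608: capped by memsw */
        printf("%lu\n", oom_limit(131072, 196608, 262144, 0));  /* 131072: swappiness 0 */
        return 0;
}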
@@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1740 } 1577 }
1741 1578
1742 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1579 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1743 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1580 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1744 for_each_mem_cgroup_tree(iter, memcg) { 1581 for_each_mem_cgroup_tree(iter, memcg) {
1745 struct css_task_iter it; 1582 struct css_task_iter it;
1746 struct task_struct *task; 1583 struct task_struct *task;
@@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1880 memcg->last_scanned_node = node; 1717 memcg->last_scanned_node = node;
1881 return node; 1718 return node;
1882} 1719}
1883
1884/*
1885 * Check all nodes for whether they contain reclaimable pages or not.
1886 * For quick scan, we make use of scan_nodes. This will allow us to skip
1887 * unused nodes. But scan_nodes is lazily updated and may not contain
1888 * enough new information. We need to do double check.
1889 */
1890static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1891{
1892 int nid;
1893
1894 /*
1895 * quick check...making use of scan_node.
1896 * We can skip unused nodes.
1897 */
1898 if (!nodes_empty(memcg->scan_nodes)) {
1899 for (nid = first_node(memcg->scan_nodes);
1900 nid < MAX_NUMNODES;
1901 nid = next_node(nid, memcg->scan_nodes)) {
1902
1903 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1904 return true;
1905 }
1906 }
1907 /*
1908 * Check rest of nodes.
1909 */
1910 for_each_node_state(nid, N_MEMORY) {
1911 if (node_isset(nid, memcg->scan_nodes))
1912 continue;
1913 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1914 return true;
1915 }
1916 return false;
1917}
1918
1919#else 1720#else
1920int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1721int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1921{ 1722{
1922 return 0; 1723 return 0;
1923} 1724}
1924
1925static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1926{
1927 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1928}
1929#endif 1725#endif
1930 1726
1931static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1727static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
@@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1943 .priority = 0, 1739 .priority = 0,
1944 }; 1740 };
1945 1741
1946 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 1742 excess = soft_limit_excess(root_memcg);
1947 1743
1948 while (1) { 1744 while (1) {
1949 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1745 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
@@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1969 } 1765 }
1970 continue; 1766 continue;
1971 } 1767 }
1972 if (!mem_cgroup_reclaimable(victim, false))
1973 continue;
1974 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1768 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1975 zone, &nr_scanned); 1769 zone, &nr_scanned);
1976 *total_scanned += nr_scanned; 1770 *total_scanned += nr_scanned;
1977 if (!res_counter_soft_limit_excess(&root_memcg->res)) 1771 if (!soft_limit_excess(root_memcg))
1978 break; 1772 break;
1979 } 1773 }
1980 mem_cgroup_iter_break(root_memcg, victim); 1774 mem_cgroup_iter_break(root_memcg, victim);
@@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
2081 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1875 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2082 oom_wait_memcg = oom_wait_info->memcg; 1876 oom_wait_memcg = oom_wait_info->memcg;
2083 1877
2084 /* 1878 if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
2085 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 1879 !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
2086 * Then we can use css_is_ancestor without taking care of RCU.
2087 */
2088 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2089 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2090 return 0; 1880 return 0;
2091 return autoremove_wake_function(wait, mode, sync, arg); 1881 return autoremove_wake_function(wait, mode, sync, arg);
2092} 1882}
@@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2228 unsigned long *flags) 2018 unsigned long *flags)
2229{ 2019{
2230 struct mem_cgroup *memcg; 2020 struct mem_cgroup *memcg;
2231 struct page_cgroup *pc;
2232 2021
2233 rcu_read_lock(); 2022 rcu_read_lock();
2234 2023
2235 if (mem_cgroup_disabled()) 2024 if (mem_cgroup_disabled())
2236 return NULL; 2025 return NULL;
2237
2238 pc = lookup_page_cgroup(page);
2239again: 2026again:
2240 memcg = pc->mem_cgroup; 2027 memcg = page->mem_cgroup;
2241 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2028 if (unlikely(!memcg))
2242 return NULL; 2029 return NULL;
2243 2030
2244 *locked = false; 2031 *locked = false;
2245 if (atomic_read(&memcg->moving_account) <= 0) 2032 if (atomic_read(&memcg->moving_account) <= 0)
2246 return memcg; 2033 return memcg;
2247 2034
2248 move_lock_mem_cgroup(memcg, flags); 2035 spin_lock_irqsave(&memcg->move_lock, *flags);
2249 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2036 if (memcg != page->mem_cgroup) {
2250 move_unlock_mem_cgroup(memcg, flags); 2037 spin_unlock_irqrestore(&memcg->move_lock, *flags);
2251 goto again; 2038 goto again;
2252 } 2039 }
2253 *locked = true; 2040 *locked = true;
@@ -2261,11 +2048,11 @@ again:
2261 * @locked: value received from mem_cgroup_begin_page_stat() 2048 * @locked: value received from mem_cgroup_begin_page_stat()
2262 * @flags: value received from mem_cgroup_begin_page_stat() 2049 * @flags: value received from mem_cgroup_begin_page_stat()
2263 */ 2050 */
2264void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, 2051void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
2265 unsigned long flags) 2052 unsigned long *flags)
2266{ 2053{
2267 if (memcg && locked) 2054 if (memcg && *locked)
2268 move_unlock_mem_cgroup(memcg, &flags); 2055 spin_unlock_irqrestore(&memcg->move_lock, *flags);
2269 2056
2270 rcu_read_unlock(); 2057 rcu_read_unlock();
2271} 2058}
@@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex);
2316static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2103static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2317{ 2104{
2318 struct memcg_stock_pcp *stock; 2105 struct memcg_stock_pcp *stock;
2319 bool ret = true; 2106 bool ret = false;
2320 2107
2321 if (nr_pages > CHARGE_BATCH) 2108 if (nr_pages > CHARGE_BATCH)
2322 return false; 2109 return ret;
2323 2110
2324 stock = &get_cpu_var(memcg_stock); 2111 stock = &get_cpu_var(memcg_stock);
2325 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2112 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2326 stock->nr_pages -= nr_pages; 2113 stock->nr_pages -= nr_pages;
2327 else /* need to call res_counter_charge */ 2114 ret = true;
2328 ret = false; 2115 }
2329 put_cpu_var(memcg_stock); 2116 put_cpu_var(memcg_stock);
2330 return ret; 2117 return ret;
2331} 2118}
2332 2119
2333/* 2120/*
2334 * Returns stocks cached in percpu to res_counter and reset cached information. 2121 * Returns stocks cached in percpu and reset cached information.
2335 */ 2122 */
2336static void drain_stock(struct memcg_stock_pcp *stock) 2123static void drain_stock(struct memcg_stock_pcp *stock)
2337{ 2124{
2338 struct mem_cgroup *old = stock->cached; 2125 struct mem_cgroup *old = stock->cached;
2339 2126
2340 if (stock->nr_pages) { 2127 if (stock->nr_pages) {
2341 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2128 page_counter_uncharge(&old->memory, stock->nr_pages);
2342
2343 res_counter_uncharge(&old->res, bytes);
2344 if (do_swap_account) 2129 if (do_swap_account)
2345 res_counter_uncharge(&old->memsw, bytes); 2130 page_counter_uncharge(&old->memsw, stock->nr_pages);
2131 css_put_many(&old->css, stock->nr_pages);
2346 stock->nr_pages = 0; 2132 stock->nr_pages = 0;
2347 } 2133 }
2348 stock->cached = NULL; 2134 stock->cached = NULL;
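consume_stock() now only succeeds when the cached per-CPU stock actually covers the request, and drain_stock() returns the remainder to the page counters. The sketch below is single-threaded and uses hypothetical struct names; the real code runs per CPU under get_cpu_var() and also drops css references.

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32UL

struct counter { unsigned long usage; };
struct stock { struct counter *cached; unsigned long nr_pages; };

static bool consume_stock(struct stock *st, struct counter *memcg, unsigned long nr)
{
        if (nr > CHARGE_BATCH)
                return false;
        if (st->cached == memcg && st->nr_pages >= nr) {
                st->nr_pages -= nr;
                return true;
        }
        return false;
}

static void drain_stock(struct stock *st)
{
        if (st->nr_pages && st->cached) {
                st->cached->usage -= st->nr_pages;  /* page_counter_uncharge() */
                st->nr_pages = 0;
        }
        st->cached = NULL;
}

int main(void)
{
        struct counter memcg = { .usage = 64 };        /* 64 pages charged */
        struct stock st = { .cached = &memcg, .nr_pages = 10 };

        printf("%d %lu\n", consume_stock(&st, &memcg, 4), st.nr_pages); /* 1 6 */
        printf("%d\n", consume_stock(&st, &memcg, 8));                  /* 0: not enough */
        drain_stock(&st);
        printf("%lu\n", memcg.usage);                                   /* 58 */
        return 0;
}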
@@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void)
2371} 2157}
2372 2158
2373/* 2159/*
2374 * Cache charges(val) which is from res_counter, to local per_cpu area. 2160 * Cache charges(val) to local per_cpu area.
2375 * This will be consumed by consume_stock() function, later. 2161 * This will be consumed by consume_stock() function, later.
2376 */ 2162 */
2377static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2163static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2388 2174
2389/* 2175/*
2390 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2176 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2391 * of the hierarchy under it. sync flag says whether we should block 2177 * of the hierarchy under it.
2392 * until the work is done.
2393 */ 2178 */
2394static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2179static void drain_all_stock(struct mem_cgroup *root_memcg)
2395{ 2180{
2396 int cpu, curcpu; 2181 int cpu, curcpu;
2397 2182
2183 /* If someone's already draining, avoid adding more workers. */
2184 if (!mutex_trylock(&percpu_charge_mutex))
2185 return;
2398 /* Notify other cpus that system-wide "drain" is running */ 2186 /* Notify other cpus that system-wide "drain" is running */
2399 get_online_cpus(); 2187 get_online_cpus();
2400 curcpu = get_cpu(); 2188 curcpu = get_cpu();
@@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2405 memcg = stock->cached; 2193 memcg = stock->cached;
2406 if (!memcg || !stock->nr_pages) 2194 if (!memcg || !stock->nr_pages)
2407 continue; 2195 continue;
2408 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2196 if (!mem_cgroup_is_descendant(memcg, root_memcg))
2409 continue; 2197 continue;
2410 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2198 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2411 if (cpu == curcpu) 2199 if (cpu == curcpu)
@@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2415 } 2203 }
2416 } 2204 }
2417 put_cpu(); 2205 put_cpu();
2418
2419 if (!sync)
2420 goto out;
2421
2422 for_each_online_cpu(cpu) {
2423 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2424 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2425 flush_work(&stock->work);
2426 }
2427out:
2428 put_online_cpus(); 2206 put_online_cpus();
2429}
2430
2431/*
2432 * Tries to drain stocked charges in other cpus. This function is asynchronous
2433 * and just puts a work per cpu for draining locally on each cpu. Caller can
2434 * expect some charges will be back to res_counter later but cannot wait for
2435 * it.
2436 */
2437static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2438{
2439 /*
2440 * If someone calls draining, avoid adding more kworker runs.
2441 */
2442 if (!mutex_trylock(&percpu_charge_mutex))
2443 return;
2444 drain_all_stock(root_memcg, false);
2445 mutex_unlock(&percpu_charge_mutex);
2446}
2447
2448/* This is a synchronous drain interface. */
2449static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2450{
2451 /* called when force_empty is called */
2452 mutex_lock(&percpu_charge_mutex);
2453 drain_all_stock(root_memcg, true);
2454 mutex_unlock(&percpu_charge_mutex); 2207 mutex_unlock(&percpu_charge_mutex);
2455} 2208}
2456 2209
@@ -2506,9 +2259,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2506 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2259 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2507 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2260 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2508 struct mem_cgroup *mem_over_limit; 2261 struct mem_cgroup *mem_over_limit;
2509 struct res_counter *fail_res; 2262 struct page_counter *counter;
2510 unsigned long nr_reclaimed; 2263 unsigned long nr_reclaimed;
2511 unsigned long long size;
2512 bool may_swap = true; 2264 bool may_swap = true;
2513 bool drained = false; 2265 bool drained = false;
2514 int ret = 0; 2266 int ret = 0;
@@ -2519,16 +2271,15 @@ retry:
2519 if (consume_stock(memcg, nr_pages)) 2271 if (consume_stock(memcg, nr_pages))
2520 goto done; 2272 goto done;
2521 2273
2522 size = batch * PAGE_SIZE;
2523 if (!do_swap_account || 2274 if (!do_swap_account ||
2524 !res_counter_charge(&memcg->memsw, size, &fail_res)) { 2275 !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2525 if (!res_counter_charge(&memcg->res, size, &fail_res)) 2276 if (!page_counter_try_charge(&memcg->memory, batch, &counter))
2526 goto done_restock; 2277 goto done_restock;
2527 if (do_swap_account) 2278 if (do_swap_account)
2528 res_counter_uncharge(&memcg->memsw, size); 2279 page_counter_uncharge(&memcg->memsw, batch);
2529 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2280 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2530 } else { 2281 } else {
2531 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2282 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2532 may_swap = false; 2283 may_swap = false;
2533 } 2284 }
2534 2285
@@ -2561,7 +2312,7 @@ retry:
2561 goto retry; 2312 goto retry;
2562 2313
2563 if (!drained) { 2314 if (!drained) {
2564 drain_all_stock_async(mem_over_limit); 2315 drain_all_stock(mem_over_limit);
2565 drained = true; 2316 drained = true;
2566 goto retry; 2317 goto retry;
2567 } 2318 }
@@ -2603,6 +2354,7 @@ bypass:
2603 return -EINTR; 2354 return -EINTR;
2604 2355
2605done_restock: 2356done_restock:
2357 css_get_many(&memcg->css, batch);
2606 if (batch > nr_pages) 2358 if (batch > nr_pages)
2607 refill_stock(memcg, batch - nr_pages); 2359 refill_stock(memcg, batch - nr_pages);
2608done: 2360done:
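try_charge() now charges memsw first and memory second; if memory fails, the memsw charge is rolled back and reclaim targets the memory counter, while a memsw failure clears may_swap because swapping cannot free memsw. A compact sketch of that ordering, assuming do_swap_account is on and using simplified bounded counters (try_charge_counter and charge are invented names):

#include <stdbool.h>
#include <stdio.h>

struct counter { unsigned long usage, limit; };

static bool try_charge_counter(struct counter *c, unsigned long nr)
{
        if (c->usage + nr > c->limit)
                return false;           /* page_counter_try_charge() failure */
        c->usage += nr;
        return true;
}

/* Returns 0 on success; on failure reports whether reclaim may swap. */
static int charge(struct counter *memory, struct counter *memsw,
                  unsigned long nr, bool *may_swap)
{
        *may_swap = true;
        if (!try_charge_counter(memsw, nr)) {
                *may_swap = false;      /* memsw is the bottleneck */
                return -1;
        }
        if (!try_charge_counter(memory, nr)) {
                memsw->usage -= nr;     /* roll back the memsw charge */
                return -1;
        }
        return 0;
}

int main(void)
{
        struct counter memory = { 120, 128 }, memsw = { 120, 256 };
        bool may_swap;

        printf("%d\n", charge(&memory, &memsw, 16, &may_swap)); /* -1: memory full */
        printf("%lu %d\n", memsw.usage, may_swap);              /* 120 1: rolled back */
        return 0;
}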
@@ -2611,32 +2363,14 @@ done:
2611 2363
2612static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2364static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2613{ 2365{
2614 unsigned long bytes = nr_pages * PAGE_SIZE;
2615
2616 if (mem_cgroup_is_root(memcg)) 2366 if (mem_cgroup_is_root(memcg))
2617 return; 2367 return;
2618 2368
2619 res_counter_uncharge(&memcg->res, bytes); 2369 page_counter_uncharge(&memcg->memory, nr_pages);
2620 if (do_swap_account) 2370 if (do_swap_account)
2621 res_counter_uncharge(&memcg->memsw, bytes); 2371 page_counter_uncharge(&memcg->memsw, nr_pages);
2622}
2623
2624/*
2625 * Cancel charges in this cgroup...doesn't propagate to parent cgroup.
2626 * This is useful when moving usage to parent cgroup.
2627 */
2628static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2629 unsigned int nr_pages)
2630{
2631 unsigned long bytes = nr_pages * PAGE_SIZE;
2632
2633 if (mem_cgroup_is_root(memcg))
2634 return;
2635 2372
2636 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2373 css_put_many(&memcg->css, nr_pages);
2637 if (do_swap_account)
2638 res_counter_uncharge_until(&memcg->memsw,
2639 memcg->memsw.parent, bytes);
2640} 2374}
2641 2375
2642/* 2376/*
@@ -2665,17 +2399,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2665 */ 2399 */
2666struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2400struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2667{ 2401{
2668 struct mem_cgroup *memcg = NULL; 2402 struct mem_cgroup *memcg;
2669 struct page_cgroup *pc;
2670 unsigned short id; 2403 unsigned short id;
2671 swp_entry_t ent; 2404 swp_entry_t ent;
2672 2405
2673 VM_BUG_ON_PAGE(!PageLocked(page), page); 2406 VM_BUG_ON_PAGE(!PageLocked(page), page);
2674 2407
2675 pc = lookup_page_cgroup(page); 2408 memcg = page->mem_cgroup;
2676 if (PageCgroupUsed(pc)) { 2409 if (memcg) {
2677 memcg = pc->mem_cgroup; 2410 if (!css_tryget_online(&memcg->css))
2678 if (memcg && !css_tryget_online(&memcg->css))
2679 memcg = NULL; 2411 memcg = NULL;
2680 } else if (PageSwapCache(page)) { 2412 } else if (PageSwapCache(page)) {
2681 ent.val = page_private(page); 2413 ent.val = page_private(page);
@@ -2723,14 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
2723static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2455static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2724 bool lrucare) 2456 bool lrucare)
2725{ 2457{
2726 struct page_cgroup *pc = lookup_page_cgroup(page);
2727 int isolated; 2458 int isolated;
2728 2459
2729 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2460 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2730 /*
2731 * we don't need page_cgroup_lock for tail pages, because they are not
2732 * accessed by any other context at this point.
2733 */
2734 2461
2735 /* 2462 /*
2736 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2463 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
@@ -2741,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2741 2468
2742 /* 2469 /*
2743 * Nobody should be changing or seriously looking at 2470 * Nobody should be changing or seriously looking at
2744 * pc->mem_cgroup and pc->flags at this point: 2471 * page->mem_cgroup at this point:
2745 * 2472 *
2746 * - the page is uncharged 2473 * - the page is uncharged
2747 * 2474 *
@@ -2753,15 +2480,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2753 * - a page cache insertion, a swapin fault, or a migration 2480 * - a page cache insertion, a swapin fault, or a migration
2754 * have the page locked 2481 * have the page locked
2755 */ 2482 */
2756 pc->mem_cgroup = memcg; 2483 page->mem_cgroup = memcg;
2757 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2758 2484
2759 if (lrucare) 2485 if (lrucare)
2760 unlock_page_lru(page, isolated); 2486 unlock_page_lru(page, isolated);
2761} 2487}
2762 2488
2763static DEFINE_MUTEX(set_limit_mutex);
2764
2765#ifdef CONFIG_MEMCG_KMEM 2489#ifdef CONFIG_MEMCG_KMEM
2766/* 2490/*
2767 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2491 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
@@ -2769,8 +2493,6 @@ static DEFINE_MUTEX(set_limit_mutex);
2769 */ 2493 */
2770static DEFINE_MUTEX(memcg_slab_mutex); 2494static DEFINE_MUTEX(memcg_slab_mutex);
2771 2495
2772static DEFINE_MUTEX(activate_kmem_mutex);
2773
2774/* 2496/*
2775 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2497 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2776 * in the memcg_cache_params struct. 2498 * in the memcg_cache_params struct.
@@ -2784,36 +2506,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2784 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2506 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2785} 2507}
2786 2508
2787#ifdef CONFIG_SLABINFO 2509static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2788static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2510 unsigned long nr_pages)
2789{
2790 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2791 struct memcg_cache_params *params;
2792
2793 if (!memcg_kmem_is_active(memcg))
2794 return -EIO;
2795
2796 print_slabinfo_header(m);
2797
2798 mutex_lock(&memcg_slab_mutex);
2799 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2800 cache_show(memcg_params_to_cache(params), m);
2801 mutex_unlock(&memcg_slab_mutex);
2802
2803 return 0;
2804}
2805#endif
2806
2807static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2808{ 2511{
2809 struct res_counter *fail_res; 2512 struct page_counter *counter;
2810 int ret = 0; 2513 int ret = 0;
2811 2514
2812 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2515 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2813 if (ret) 2516 if (ret < 0)
2814 return ret; 2517 return ret;
2815 2518
2816 ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); 2519 ret = try_charge(memcg, gfp, nr_pages);
2817 if (ret == -EINTR) { 2520 if (ret == -EINTR) {
2818 /* 2521 /*
2819 * try_charge() chose to bypass to root due to OOM kill or 2522 * try_charge() chose to bypass to root due to OOM kill or
@@ -2830,37 +2533,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2830 * when the allocation triggers should have been already 2533 * when the allocation triggers should have been already
2831 * directed to the root cgroup in memcontrol.h 2534 * directed to the root cgroup in memcontrol.h
2832 */ 2535 */
2833 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2536 page_counter_charge(&memcg->memory, nr_pages);
2834 if (do_swap_account) 2537 if (do_swap_account)
2835 res_counter_charge_nofail(&memcg->memsw, size, 2538 page_counter_charge(&memcg->memsw, nr_pages);
2836 &fail_res); 2539 css_get_many(&memcg->css, nr_pages);
2837 ret = 0; 2540 ret = 0;
2838 } else if (ret) 2541 } else if (ret)
2839 res_counter_uncharge(&memcg->kmem, size); 2542 page_counter_uncharge(&memcg->kmem, nr_pages);
2840 2543
2841 return ret; 2544 return ret;
2842} 2545}
2843 2546
2844static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2547static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
2548 unsigned long nr_pages)
2845{ 2549{
2846 res_counter_uncharge(&memcg->res, size); 2550 page_counter_uncharge(&memcg->memory, nr_pages);
2847 if (do_swap_account) 2551 if (do_swap_account)
2848 res_counter_uncharge(&memcg->memsw, size); 2552 page_counter_uncharge(&memcg->memsw, nr_pages);
2849 2553
2850 /* Not down to 0 */ 2554 page_counter_uncharge(&memcg->kmem, nr_pages);
2851 if (res_counter_uncharge(&memcg->kmem, size))
2852 return;
2853 2555
2854 /* 2556 css_put_many(&memcg->css, nr_pages);
2855 * Releases a reference taken in kmem_cgroup_css_offline in case
2856 * this last uncharge is racing with the offlining code or it is
2857 * outliving the memcg existence.
2858 *
2859 * The memory barrier imposed by test&clear is paired with the
2860 * explicit one in memcg_kmem_mark_dead().
2861 */
2862 if (memcg_kmem_test_and_clear_dead(memcg))
2863 css_put(&memcg->css);
2864} 2557}
2865 2558
2866/* 2559/*
@@ -3124,19 +2817,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3124 2817
3125int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 2818int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3126{ 2819{
2820 unsigned int nr_pages = 1 << order;
3127 int res; 2821 int res;
3128 2822
3129 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, 2823 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
3130 PAGE_SIZE << order);
3131 if (!res) 2824 if (!res)
3132 atomic_add(1 << order, &cachep->memcg_params->nr_pages); 2825 atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
3133 return res; 2826 return res;
3134} 2827}
3135 2828
3136void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 2829void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3137{ 2830{
3138 memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); 2831 unsigned int nr_pages = 1 << order;
3139 atomic_sub(1 << order, &cachep->memcg_params->nr_pages); 2832
2833 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
2834 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
3140} 2835}
3141 2836
3142/* 2837/*
@@ -3257,7 +2952,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3257 return true; 2952 return true;
3258 } 2953 }
3259 2954
3260 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 2955 ret = memcg_charge_kmem(memcg, gfp, 1 << order);
3261 if (!ret) 2956 if (!ret)
3262 *_memcg = memcg; 2957 *_memcg = memcg;
3263 2958
@@ -3268,46 +2963,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3268void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2963void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3269 int order) 2964 int order)
3270{ 2965{
3271 struct page_cgroup *pc;
3272
3273 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2966 VM_BUG_ON(mem_cgroup_is_root(memcg));
3274 2967
3275 /* The page allocation failed. Revert */ 2968 /* The page allocation failed. Revert */
3276 if (!page) { 2969 if (!page) {
3277 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 2970 memcg_uncharge_kmem(memcg, 1 << order);
3278 return; 2971 return;
3279 } 2972 }
3280 /* 2973 page->mem_cgroup = memcg;
3281 * The page is freshly allocated and not visible to any
3282 * outside callers yet. Set up pc non-atomically.
3283 */
3284 pc = lookup_page_cgroup(page);
3285 pc->mem_cgroup = memcg;
3286 pc->flags = PCG_USED;
3287} 2974}
3288 2975
3289void __memcg_kmem_uncharge_pages(struct page *page, int order) 2976void __memcg_kmem_uncharge_pages(struct page *page, int order)
3290{ 2977{
3291 struct mem_cgroup *memcg = NULL; 2978 struct mem_cgroup *memcg = page->mem_cgroup;
3292 struct page_cgroup *pc;
3293
3294 2979
3295 pc = lookup_page_cgroup(page);
3296 if (!PageCgroupUsed(pc))
3297 return;
3298
3299 memcg = pc->mem_cgroup;
3300 pc->flags = 0;
3301
3302 /*
3303 * We trust that only if there is a memcg associated with the page, it
3304 * is a valid allocation
3305 */
3306 if (!memcg) 2980 if (!memcg)
3307 return; 2981 return;
3308 2982
3309 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2983 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3310 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 2984
2985 memcg_uncharge_kmem(memcg, 1 << order);
2986 page->mem_cgroup = NULL;
3311} 2987}
3312#else 2988#else
3313static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2989static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
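For kmem pages the lifecycle is now: charge the counters in pages, store the memcg pointer in the page on commit, and on uncharge read the pointer back, release 1 << order pages, and clear it. A toy sketch of that flow, with one usage field standing in for the kmem/memory/memsw counters and invented helper names:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins: one usage field represents the page_counters
 * that memcg_charge_kmem()/memcg_uncharge_kmem() actually touch. */
struct memcg { unsigned long usage; };
struct page  { struct memcg *mem_cgroup; };

static void charge_kmem(struct memcg *memcg, int order)
{
        memcg->usage += 1UL << order;           /* charge in pages, not bytes */
}

static void commit_kmem(struct page *page, struct memcg *memcg)
{
        page->mem_cgroup = memcg;               /* no page_cgroup, no PCG_USED */
}

static void uncharge_kmem_page(struct page *page, int order)
{
        struct memcg *memcg = page->mem_cgroup;

        if (!memcg)                             /* never charged: nothing to undo */
                return;
        memcg->usage -= 1UL << order;
        page->mem_cgroup = NULL;
}

int main(void)
{
        struct memcg m = { 0 };
        struct page p = { NULL };

        charge_kmem(&m, 2);                     /* order-2 allocation = 4 pages */
        commit_kmem(&p, &m);
        printf("%lu\n", m.usage);               /* 4 */
        uncharge_kmem_page(&p, 2);
        printf("%lu %d\n", m.usage, p.mem_cgroup == NULL); /* 0 1 */
        return 0;
}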
@@ -3325,21 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3325 */ 3001 */
3326void mem_cgroup_split_huge_fixup(struct page *head) 3002void mem_cgroup_split_huge_fixup(struct page *head)
3327{ 3003{
3328 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3329 struct page_cgroup *pc;
3330 struct mem_cgroup *memcg;
3331 int i; 3004 int i;
3332 3005
3333 if (mem_cgroup_disabled()) 3006 if (mem_cgroup_disabled())
3334 return; 3007 return;
3335 3008
3336 memcg = head_pc->mem_cgroup; 3009 for (i = 1; i < HPAGE_PMD_NR; i++)
3337 for (i = 1; i < HPAGE_PMD_NR; i++) { 3010 head[i].mem_cgroup = head->mem_cgroup;
3338 pc = head_pc + i; 3011
3339 pc->mem_cgroup = memcg; 3012 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3340 pc->flags = head_pc->flags;
3341 }
3342 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3343 HPAGE_PMD_NR); 3013 HPAGE_PMD_NR);
3344} 3014}
3345#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3015#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
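Splitting a huge page now reduces to copying the head's mem_cgroup pointer into each tail and adjusting the RSS_HUGE statistic; there are no per-tail page_cgroup flags left to replicate. A sketch of the pointer fan-out with a small stand-in HPAGE_PMD_NR:

#include <stdio.h>

#define HPAGE_PMD_NR 8          /* stand-in; 512 for 2 MiB pages with a 4 KiB base */

struct memcg { int id; };
struct page  { struct memcg *mem_cgroup; };

static void split_huge_fixup(struct page head[HPAGE_PMD_NR])
{
        int i;

        /* tails inherit the head's charge owner; nothing else to copy */
        for (i = 1; i < HPAGE_PMD_NR; i++)
                head[i].mem_cgroup = head[0].mem_cgroup;
}

int main(void)
{
        struct memcg m = { 42 };
        struct page huge[HPAGE_PMD_NR] = { { &m } };

        split_huge_fixup(huge);
        printf("%d %d\n", huge[1].mem_cgroup->id,
               huge[HPAGE_PMD_NR - 1].mem_cgroup->id);
        return 0;   /* prints "42 42" */
}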
@@ -3348,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3348 * mem_cgroup_move_account - move account of the page 3018 * mem_cgroup_move_account - move account of the page
3349 * @page: the page 3019 * @page: the page
3350 * @nr_pages: number of regular pages (>1 for huge pages) 3020 * @nr_pages: number of regular pages (>1 for huge pages)
3351 * @pc: page_cgroup of the page.
3352 * @from: mem_cgroup which the page is moved from. 3021 * @from: mem_cgroup which the page is moved from.
3353 * @to: mem_cgroup which the page is moved to. @from != @to. 3022 * @to: mem_cgroup which the page is moved to. @from != @to.
3354 * 3023 *
@@ -3361,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3361 */ 3030 */
3362static int mem_cgroup_move_account(struct page *page, 3031static int mem_cgroup_move_account(struct page *page,
3363 unsigned int nr_pages, 3032 unsigned int nr_pages,
3364 struct page_cgroup *pc,
3365 struct mem_cgroup *from, 3033 struct mem_cgroup *from,
3366 struct mem_cgroup *to) 3034 struct mem_cgroup *to)
3367{ 3035{
@@ -3381,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
3381 goto out; 3049 goto out;
3382 3050
3383 /* 3051 /*
3384 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3052 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
3385 * of its source page while we change it: page migration takes 3053 * of its source page while we change it: page migration takes
3386 * both pages off the LRU, but page cache replacement doesn't. 3054 * both pages off the LRU, but page cache replacement doesn't.
3387 */ 3055 */
@@ -3389,10 +3057,10 @@ static int mem_cgroup_move_account(struct page *page,
3389 goto out; 3057 goto out;
3390 3058
3391 ret = -EINVAL; 3059 ret = -EINVAL;
3392 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3060 if (page->mem_cgroup != from)
3393 goto out_unlock; 3061 goto out_unlock;
3394 3062
3395 move_lock_mem_cgroup(from, &flags); 3063 spin_lock_irqsave(&from->move_lock, flags);
3396 3064
3397 if (!PageAnon(page) && page_mapped(page)) { 3065 if (!PageAnon(page) && page_mapped(page)) {
3398 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3066 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
@@ -3409,14 +3077,15 @@ static int mem_cgroup_move_account(struct page *page,
3409 } 3077 }
3410 3078
3411 /* 3079 /*
3412 * It is safe to change pc->mem_cgroup here because the page 3080 * It is safe to change page->mem_cgroup here because the page
3413 * is referenced, charged, and isolated - we can't race with 3081 * is referenced, charged, and isolated - we can't race with
3414 * uncharging, charging, migration, or LRU putback. 3082 * uncharging, charging, migration, or LRU putback.
3415 */ 3083 */
3416 3084
3417 /* caller should have done css_get */ 3085 /* caller should have done css_get */
3418 pc->mem_cgroup = to; 3086 page->mem_cgroup = to;
3419 move_unlock_mem_cgroup(from, &flags); 3087 spin_unlock_irqrestore(&from->move_lock, flags);
3088
3420 ret = 0; 3089 ret = 0;
3421 3090
3422 local_irq_disable(); 3091 local_irq_disable();
@@ -3431,72 +3100,6 @@ out:
3431 return ret; 3100 return ret;
3432} 3101}
3433 3102
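mem_cgroup_move_account() checks page->mem_cgroup while holding the page lock, shifts the per-memcg statistics under from->move_lock, and only then rewrites the pointer. The single-threaded sketch below keeps that order but elides the locks (they appear only as comments):

#include <stdio.h>

struct memcg { long file_mapped; };
struct page  { struct memcg *mem_cgroup; int mapped; };

/* Returns 0 on success, -1 if the page no longer belongs to @from.
 * The kernel holds the page lock for the check and from->move_lock
 * while rewriting the pointer; both are elided here. */
static int move_account(struct page *page, struct memcg *from, struct memcg *to)
{
        if (page->mem_cgroup != from)
                return -1;

        if (page->mapped) {             /* shift per-memcg statistics */
                from->file_mapped--;
                to->file_mapped++;
        }
        page->mem_cgroup = to;          /* the actual move */
        return 0;
}

int main(void)
{
        struct memcg a = { 1 }, b = { 0 };
        struct page p = { &a, 1 };

        printf("%d %ld %ld\n", move_account(&p, &a, &b), a.file_mapped, b.file_mapped);
        printf("%d\n", move_account(&p, &a, &b));   /* -1: already moved to b */
        return 0;   /* prints "0 0 1" then "-1" */
}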
3434/**
3435 * mem_cgroup_move_parent - moves page to the parent group
3436 * @page: the page to move
3437 * @pc: page_cgroup of the page
3438 * @child: page's cgroup
3439 *
3440 * move charges to its parent or the root cgroup if the group has no
3441 * parent (aka use_hierarchy==0).
3442 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3443 * mem_cgroup_move_account fails) the failure is always temporary and
3444 * it signals a race with a page removal/uncharge or migration. In the
3445 * first case the page is on the way out and it will vanish from the LRU
3446 * on the next attempt and the call should be retried later.
3447 * Isolation from the LRU fails only if page has been isolated from
3448 * the LRU since we looked at it and that usually means either global
3449 * reclaim or migration going on. The page will either get back to the
3450 * LRU or vanish.
3451 * Finally, mem_cgroup_move_account fails only if the page got uncharged
3452 * (!PageCgroupUsed) or moved to a different group. The page will
3453 * disappear in the next attempt.
3454 */
3455static int mem_cgroup_move_parent(struct page *page,
3456 struct page_cgroup *pc,
3457 struct mem_cgroup *child)
3458{
3459 struct mem_cgroup *parent;
3460 unsigned int nr_pages;
3461 unsigned long uninitialized_var(flags);
3462 int ret;
3463
3464 VM_BUG_ON(mem_cgroup_is_root(child));
3465
3466 ret = -EBUSY;
3467 if (!get_page_unless_zero(page))
3468 goto out;
3469 if (isolate_lru_page(page))
3470 goto put;
3471
3472 nr_pages = hpage_nr_pages(page);
3473
3474 parent = parent_mem_cgroup(child);
3475 /*
3476 * If no parent, move charges to root cgroup.
3477 */
3478 if (!parent)
3479 parent = root_mem_cgroup;
3480
3481 if (nr_pages > 1) {
3482 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3483 flags = compound_lock_irqsave(page);
3484 }
3485
3486 ret = mem_cgroup_move_account(page, nr_pages,
3487 pc, child, parent);
3488 if (!ret)
3489 __mem_cgroup_cancel_local_charge(child, nr_pages);
3490
3491 if (nr_pages > 1)
3492 compound_unlock_irqrestore(page, flags);
3493 putback_lru_page(page);
3494put:
3495 put_page(page);
3496out:
3497 return ret;
3498}
3499
3500#ifdef CONFIG_MEMCG_SWAP 3103#ifdef CONFIG_MEMCG_SWAP
3501static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3104static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3502 bool charge) 3105 bool charge)
@@ -3516,7 +3119,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3516 * 3119 *
3517 * Returns 0 on success, -EINVAL on failure. 3120 * Returns 0 on success, -EINVAL on failure.
3518 * 3121 *
3519 * The caller must have charged to @to, IOW, called res_counter_charge() about 3122 * The caller must have charged to @to, IOW, called page_counter_charge() about
3520 * both res and memsw, and called css_get(). 3123 * both res and memsw, and called css_get().
3521 */ 3124 */
3522static int mem_cgroup_move_swap_account(swp_entry_t entry, 3125static int mem_cgroup_move_swap_account(swp_entry_t entry,
@@ -3532,7 +3135,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3532 mem_cgroup_swap_statistics(to, true); 3135 mem_cgroup_swap_statistics(to, true);
3533 /* 3136 /*
3534 * This function is only called from task migration context now. 3137 * This function is only called from task migration context now.
3535 * It postpones res_counter and refcount handling till the end 3138 * It postpones page_counter and refcount handling till the end
3536 * of task migration(mem_cgroup_clear_mc()) for performance 3139 * of task migration(mem_cgroup_clear_mc()) for performance
3537 * improvement. But we cannot postpone css_get(to) because if 3140 * improvement. But we cannot postpone css_get(to) because if
3538 * the process that has been moved to @to does swap-in, the 3141 * the process that has been moved to @to does swap-in, the
@@ -3554,96 +3157,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3554} 3157}
3555#endif 3158#endif
3556 3159
3557#ifdef CONFIG_DEBUG_VM 3160static DEFINE_MUTEX(memcg_limit_mutex);
3558static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3559{
3560 struct page_cgroup *pc;
3561
3562 pc = lookup_page_cgroup(page);
3563 /*
3564 * Can be NULL while feeding pages into the page allocator for
3565 * the first time, i.e. during boot or memory hotplug;
3566 * or when mem_cgroup_disabled().
3567 */
3568 if (likely(pc) && PageCgroupUsed(pc))
3569 return pc;
3570 return NULL;
3571}
3572
3573bool mem_cgroup_bad_page_check(struct page *page)
3574{
3575 if (mem_cgroup_disabled())
3576 return false;
3577
3578 return lookup_page_cgroup_used(page) != NULL;
3579}
3580
3581void mem_cgroup_print_bad_page(struct page *page)
3582{
3583 struct page_cgroup *pc;
3584
3585 pc = lookup_page_cgroup_used(page);
3586 if (pc) {
3587 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3588 pc, pc->flags, pc->mem_cgroup);
3589 }
3590}
3591#endif
3592 3161
3593static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3162static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3594 unsigned long long val) 3163 unsigned long limit)
3595{ 3164{
3165 unsigned long curusage;
3166 unsigned long oldusage;
3167 bool enlarge = false;
3596 int retry_count; 3168 int retry_count;
3597 int ret = 0; 3169 int ret;
3598 int children = mem_cgroup_count_children(memcg);
3599 u64 curusage, oldusage;
3600 int enlarge;
3601 3170
3602 /* 3171 /*
3603 * For keeping hierarchical_reclaim simple, how long we should retry 3172 * For keeping hierarchical_reclaim simple, how long we should retry
3604 * depends on the callers. We set our retry-count to be a function 3173
3605 * of # of children which we should visit in this loop. 3174 * of # of children which we should visit in this loop.
3606 */ 3175 */
3607 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3176 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3177 mem_cgroup_count_children(memcg);
3608 3178
3609 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3179 oldusage = page_counter_read(&memcg->memory);
3610 3180
3611 enlarge = 0; 3181 do {
3612 while (retry_count) {
3613 if (signal_pending(current)) { 3182 if (signal_pending(current)) {
3614 ret = -EINTR; 3183 ret = -EINTR;
3615 break; 3184 break;
3616 } 3185 }
3617 /* 3186
3618 * Rather than hide all in some function, I do this in 3187 mutex_lock(&memcg_limit_mutex);
3619 * open coded manner. You see what this really does. 3188 if (limit > memcg->memsw.limit) {
3620 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3189 mutex_unlock(&memcg_limit_mutex);
3621 */
3622 mutex_lock(&set_limit_mutex);
3623 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
3624 ret = -EINVAL; 3190 ret = -EINVAL;
3625 mutex_unlock(&set_limit_mutex);
3626 break; 3191 break;
3627 } 3192 }
3628 3193 if (limit > memcg->memory.limit)
3629 if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) 3194 enlarge = true;
3630 enlarge = 1; 3195 ret = page_counter_limit(&memcg->memory, limit);
3631 3196 mutex_unlock(&memcg_limit_mutex);
3632 ret = res_counter_set_limit(&memcg->res, val);
3633 mutex_unlock(&set_limit_mutex);
3634 3197
3635 if (!ret) 3198 if (!ret)
3636 break; 3199 break;
3637 3200
3638 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3201 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3639 3202
3640 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3203 curusage = page_counter_read(&memcg->memory);
3641 /* Usage is reduced ? */ 3204 /* Usage is reduced ? */
3642 if (curusage >= oldusage) 3205 if (curusage >= oldusage)
3643 retry_count--; 3206 retry_count--;
3644 else 3207 else
3645 oldusage = curusage; 3208 oldusage = curusage;
3646 } 3209 } while (retry_count);
3210
3647 if (!ret && enlarge) 3211 if (!ret && enlarge)
3648 memcg_oom_recover(memcg); 3212 memcg_oom_recover(memcg);
3649 3213
@@ -3651,52 +3215,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3651} 3215}
3652 3216
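The resize path is now a bounded do/while: validate the new limit against the other counter under memcg_limit_mutex, try page_counter_limit(), and on failure reclaim and retry only while usage keeps shrinking. A sketch of that control flow with a fake reclaim step (set_limit, resize_limit and reclaim_per_round are invented for the example):

#include <stdio.h>

struct counter { unsigned long usage, limit; };

static int set_limit(struct counter *c, unsigned long limit)
{
        if (c->usage > limit)
                return -1;              /* page_counter_limit() refuses */
        c->limit = limit;
        return 0;
}

/* Mimics the retry loop: shrink usage a bit per round, give up when
 * reclaim stops making progress. */
static int resize_limit(struct counter *memory, unsigned long limit,
                        unsigned long reclaim_per_round, int retries)
{
        unsigned long oldusage = memory->usage;

        do {
                if (!set_limit(memory, limit))
                        return 0;

                /* try_to_free_mem_cgroup_pages() stand-in */
                memory->usage -= memory->usage > reclaim_per_round ?
                                 reclaim_per_round : memory->usage;

                if (memory->usage >= oldusage)
                        retries--;      /* no progress: burn a retry */
                else
                        oldusage = memory->usage;
        } while (retries);

        return -1;
}

int main(void)
{
        struct counter memory = { .usage = 100, .limit = 256 };

        printf("%d %lu\n", resize_limit(&memory, 64, 20, 5), memory.limit);
        return 0;   /* prints "0 64": two reclaim rounds get usage under the new limit */
}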
3653static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3217static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3654 unsigned long long val) 3218 unsigned long limit)
3655{ 3219{
3220 unsigned long curusage;
3221 unsigned long oldusage;
3222 bool enlarge = false;
3656 int retry_count; 3223 int retry_count;
3657 u64 oldusage, curusage; 3224 int ret;
3658 int children = mem_cgroup_count_children(memcg);
3659 int ret = -EBUSY;
3660 int enlarge = 0;
3661 3225
3662 /* see mem_cgroup_resize_res_limit */ 3226 /* see mem_cgroup_resize_res_limit */
3663 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3227 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3664 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3228 mem_cgroup_count_children(memcg);
3665 while (retry_count) { 3229
3230 oldusage = page_counter_read(&memcg->memsw);
3231
3232 do {
3666 if (signal_pending(current)) { 3233 if (signal_pending(current)) {
3667 ret = -EINTR; 3234 ret = -EINTR;
3668 break; 3235 break;
3669 } 3236 }
3670 /* 3237
3671 * Rather than hide all in some function, I do this in 3238 mutex_lock(&memcg_limit_mutex);
3672 * open coded manner. You see what this really does. 3239 if (limit < memcg->memory.limit) {
3673 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 3240 mutex_unlock(&memcg_limit_mutex);
3674 */
3675 mutex_lock(&set_limit_mutex);
3676 if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
3677 ret = -EINVAL; 3241 ret = -EINVAL;
3678 mutex_unlock(&set_limit_mutex);
3679 break; 3242 break;
3680 } 3243 }
3681 if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) 3244 if (limit > memcg->memsw.limit)
3682 enlarge = 1; 3245 enlarge = true;
3683 ret = res_counter_set_limit(&memcg->memsw, val); 3246 ret = page_counter_limit(&memcg->memsw, limit);
3684 mutex_unlock(&set_limit_mutex); 3247 mutex_unlock(&memcg_limit_mutex);
3685 3248
3686 if (!ret) 3249 if (!ret)
3687 break; 3250 break;
3688 3251
3689 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3252 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3690 3253
3691 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3254 curusage = page_counter_read(&memcg->memsw);
3692 /* Usage is reduced ? */ 3255 /* Usage is reduced ? */
3693 if (curusage >= oldusage) 3256 if (curusage >= oldusage)
3694 retry_count--; 3257 retry_count--;
3695 else 3258 else
3696 oldusage = curusage; 3259 oldusage = curusage;
3697 } 3260 } while (retry_count);
3261
3698 if (!ret && enlarge) 3262 if (!ret && enlarge)
3699 memcg_oom_recover(memcg); 3263 memcg_oom_recover(memcg);
3264
3700 return ret; 3265 return ret;
3701} 3266}
3702 3267
@@ -3709,7 +3274,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3709 unsigned long reclaimed; 3274 unsigned long reclaimed;
3710 int loop = 0; 3275 int loop = 0;
3711 struct mem_cgroup_tree_per_zone *mctz; 3276 struct mem_cgroup_tree_per_zone *mctz;
3712 unsigned long long excess; 3277 unsigned long excess;
3713 unsigned long nr_scanned; 3278 unsigned long nr_scanned;
3714 3279
3715 if (order > 0) 3280 if (order > 0)
@@ -3735,35 +3300,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3735 nr_reclaimed += reclaimed; 3300 nr_reclaimed += reclaimed;
3736 *total_scanned += nr_scanned; 3301 *total_scanned += nr_scanned;
3737 spin_lock_irq(&mctz->lock); 3302 spin_lock_irq(&mctz->lock);
3303 __mem_cgroup_remove_exceeded(mz, mctz);
3738 3304
3739 /* 3305 /*
3740 * If we failed to reclaim anything from this memory cgroup 3306 * If we failed to reclaim anything from this memory cgroup
3741 * it is time to move on to the next cgroup 3307 * it is time to move on to the next cgroup
3742 */ 3308 */
3743 next_mz = NULL; 3309 next_mz = NULL;
3744 if (!reclaimed) { 3310 if (!reclaimed)
3745 do { 3311 next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3746 /* 3312
3747 * Loop until we find yet another one. 3313 excess = soft_limit_excess(mz->memcg);
3748 *
3749 * By the time we get the soft_limit lock
3750 * again, someone might have added the
3751 * group back on the RB tree. Iterate to
3752 * make sure we get a different mem.
3753 * mem_cgroup_largest_soft_limit_node returns
3754 * NULL if no other cgroup is present on
3755 * the tree
3756 */
3757 next_mz =
3758 __mem_cgroup_largest_soft_limit_node(mctz);
3759 if (next_mz == mz)
3760 css_put(&next_mz->memcg->css);
3761 else /* next_mz == NULL or other memcg */
3762 break;
3763 } while (1);
3764 }
3765 __mem_cgroup_remove_exceeded(mz, mctz);
3766 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3767 /* 3314 /*
3768 * One school of thought says that we should not add 3315 * One school of thought says that we should not add
3769 * back the node to the tree if reclaim returns 0. 3316 * back the node to the tree if reclaim returns 0.
@@ -3792,107 +3339,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3792 return nr_reclaimed; 3339 return nr_reclaimed;
3793} 3340}
3794 3341
3795/**
3796 * mem_cgroup_force_empty_list - clears LRU of a group
3797 * @memcg: group to clear
3798 * @node: NUMA node
3799 * @zid: zone id
3800 * @lru: lru to clear
3801 *
3802 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3803 * reclaim the pages themselves - pages are moved to the parent (or root)
3804 * group.
3805 */
3806static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3807 int node, int zid, enum lru_list lru)
3808{
3809 struct lruvec *lruvec;
3810 unsigned long flags;
3811 struct list_head *list;
3812 struct page *busy;
3813 struct zone *zone;
3814
3815 zone = &NODE_DATA(node)->node_zones[zid];
3816 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3817 list = &lruvec->lists[lru];
3818
3819 busy = NULL;
3820 do {
3821 struct page_cgroup *pc;
3822 struct page *page;
3823
3824 spin_lock_irqsave(&zone->lru_lock, flags);
3825 if (list_empty(list)) {
3826 spin_unlock_irqrestore(&zone->lru_lock, flags);
3827 break;
3828 }
3829 page = list_entry(list->prev, struct page, lru);
3830 if (busy == page) {
3831 list_move(&page->lru, list);
3832 busy = NULL;
3833 spin_unlock_irqrestore(&zone->lru_lock, flags);
3834 continue;
3835 }
3836 spin_unlock_irqrestore(&zone->lru_lock, flags);
3837
3838 pc = lookup_page_cgroup(page);
3839
3840 if (mem_cgroup_move_parent(page, pc, memcg)) {
3841 /* found lock contention or "pc" is obsolete. */
3842 busy = page;
3843 } else
3844 busy = NULL;
3845 cond_resched();
3846 } while (!list_empty(list));
3847}
3848
3849/*
3850 * make mem_cgroup's charge to be 0 if there is no task by moving
3851 * all the charges and pages to the parent.
3852 * This enables deleting this mem_cgroup.
3853 *
3854 * Caller is responsible for holding css reference on the memcg.
3855 */
3856static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3857{
3858 int node, zid;
3859 u64 usage;
3860
3861 do {
3862 /* This is for making all *used* pages to be on LRU. */
3863 lru_add_drain_all();
3864 drain_all_stock_sync(memcg);
3865 mem_cgroup_start_move(memcg);
3866 for_each_node_state(node, N_MEMORY) {
3867 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3868 enum lru_list lru;
3869 for_each_lru(lru) {
3870 mem_cgroup_force_empty_list(memcg,
3871 node, zid, lru);
3872 }
3873 }
3874 }
3875 mem_cgroup_end_move(memcg);
3876 memcg_oom_recover(memcg);
3877 cond_resched();
3878
3879 /*
3880 * Kernel memory may not necessarily be trackable to a specific
3881 * process. So they are not migrated, and therefore we can't
3882 * expect their value to drop to 0 here.
3883 * Having res filled up with kmem only is enough.
3884 *
3885 * This is a safety check because mem_cgroup_force_empty_list
3886 * could have raced with mem_cgroup_replace_page_cache callers
3887 * so the lru seemed empty but the page could have been added
3888 * right after the check. RES_USAGE should be safe as we always
3889 * charge before adding to the LRU.
3890 */
3891 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
3892 res_counter_read_u64(&memcg->kmem, RES_USAGE);
3893 } while (usage > 0);
3894}
3895
3896/* 3342/*
3897 * Test whether @memcg has children, dead or alive. Note that this 3343 * Test whether @memcg has children, dead or alive. Note that this
3898 * function doesn't care whether @memcg has use_hierarchy enabled and 3344 * function doesn't care whether @memcg has use_hierarchy enabled and
@@ -3930,7 +3376,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3930 /* we call try-to-free pages for make this cgroup empty */ 3376 /* we call try-to-free pages for make this cgroup empty */
3931 lru_add_drain_all(); 3377 lru_add_drain_all();
3932 /* try to free all pages in this cgroup */ 3378 /* try to free all pages in this cgroup */
3933 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 3379 while (nr_retries && page_counter_read(&memcg->memory)) {
3934 int progress; 3380 int progress;
3935 3381
3936 if (signal_pending(current)) 3382 if (signal_pending(current))
@@ -4001,8 +3447,8 @@ out:
4001 return retval; 3447 return retval;
4002} 3448}
4003 3449
4004static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 3450static unsigned long tree_stat(struct mem_cgroup *memcg,
4005 enum mem_cgroup_stat_index idx) 3451 enum mem_cgroup_stat_index idx)
4006{ 3452{
4007 struct mem_cgroup *iter; 3453 struct mem_cgroup *iter;
4008 long val = 0; 3454 long val = 0;
@@ -4020,55 +3466,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4020{ 3466{
4021 u64 val; 3467 u64 val;
4022 3468
4023 if (!mem_cgroup_is_root(memcg)) { 3469 if (mem_cgroup_is_root(memcg)) {
3470 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
3471 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
3472 if (swap)
3473 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
3474 } else {
4024 if (!swap) 3475 if (!swap)
4025 return res_counter_read_u64(&memcg->res, RES_USAGE); 3476 val = page_counter_read(&memcg->memory);
4026 else 3477 else
4027 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 3478 val = page_counter_read(&memcg->memsw);
4028 } 3479 }
4029
4030 /*
4031 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4032 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4033 */
4034 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4035 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4036
4037 if (swap)
4038 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4039
4040 return val << PAGE_SHIFT; 3480 return val << PAGE_SHIFT;
4041} 3481}
4042 3482
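Usage reporting now has two paths: the root group sums cache + rss (+ swap) over the whole tree via tree_stat(), every other group reads its page counter directly, and both results are shifted from pages to bytes. A sketch of that split with fabricated stat values:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

struct memcg {
        int is_root;
        unsigned long long cache, rss, swap;    /* tree-wide stats (pages) */
        unsigned long long memory, memsw;       /* page_counter reads (pages) */
};

static unsigned long long usage_bytes(const struct memcg *m, int swap)
{
        unsigned long long val;

        if (m->is_root) {
                val = m->cache + m->rss;        /* tree_stat() sums */
                if (swap)
                        val += m->swap;
        } else {
                val = swap ? m->memsw : m->memory;
        }
        return val << PAGE_SHIFT;               /* pages -> bytes */
}

int main(void)
{
        struct memcg root  = { 1, 1000, 3000, 200, 0, 0 };
        struct memcg child = { 0, 0, 0, 0, 512, 640 };

        printf("%llu\n", usage_bytes(&root, 1));  /* (1000+3000+200) << 12 */
        printf("%llu\n", usage_bytes(&child, 0)); /* 512 << 12 = 2097152   */
        return 0;
}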
3483enum {
3484 RES_USAGE,
3485 RES_LIMIT,
3486 RES_MAX_USAGE,
3487 RES_FAILCNT,
3488 RES_SOFT_LIMIT,
3489};
4043 3490
4044static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3491static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4045 struct cftype *cft) 3492 struct cftype *cft)
4046{ 3493{
4047 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3494 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4048 enum res_type type = MEMFILE_TYPE(cft->private); 3495 struct page_counter *counter;
4049 int name = MEMFILE_ATTR(cft->private);
4050 3496
4051 switch (type) { 3497 switch (MEMFILE_TYPE(cft->private)) {
4052 case _MEM: 3498 case _MEM:
4053 if (name == RES_USAGE) 3499 counter = &memcg->memory;
4054 return mem_cgroup_usage(memcg, false); 3500 break;
4055 return res_counter_read_u64(&memcg->res, name);
4056 case _MEMSWAP: 3501 case _MEMSWAP:
4057 if (name == RES_USAGE) 3502 counter = &memcg->memsw;
4058 return mem_cgroup_usage(memcg, true); 3503 break;
4059 return res_counter_read_u64(&memcg->memsw, name);
4060 case _KMEM: 3504 case _KMEM:
4061 return res_counter_read_u64(&memcg->kmem, name); 3505 counter = &memcg->kmem;
4062 break; 3506 break;
4063 default: 3507 default:
4064 BUG(); 3508 BUG();
4065 } 3509 }
3510
3511 switch (MEMFILE_ATTR(cft->private)) {
3512 case RES_USAGE:
3513 if (counter == &memcg->memory)
3514 return mem_cgroup_usage(memcg, false);
3515 if (counter == &memcg->memsw)
3516 return mem_cgroup_usage(memcg, true);
3517 return (u64)page_counter_read(counter) * PAGE_SIZE;
3518 case RES_LIMIT:
3519 return (u64)counter->limit * PAGE_SIZE;
3520 case RES_MAX_USAGE:
3521 return (u64)counter->watermark * PAGE_SIZE;
3522 case RES_FAILCNT:
3523 return counter->failcnt;
3524 case RES_SOFT_LIMIT:
3525 return (u64)memcg->soft_limit * PAGE_SIZE;
3526 default:
3527 BUG();
3528 }
4066} 3529}
4067 3530
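mem_cgroup_read_u64() now reads page_counter fields, which count pages, and scales them back to bytes so the cgroup files keep their byte-based interface (the old res_counter kept bytes internally). A toy, compilable illustration of that pages-to-bytes view, assuming 4 KiB pages and invented names (toy_page_counter, SKETCH_PAGE_SIZE):

#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL         /* assume 4 KiB pages for the example */

/* A toy stand-in for struct page_counter: everything is kept in pages. */
struct toy_page_counter {
        unsigned long count;            /* current usage, in pages */
        unsigned long limit;            /* hard limit, in pages */
        unsigned long watermark;        /* maximum usage seen, in pages */
        unsigned long failcnt;          /* number of failed charges */
};

/* The cgroup files keep reporting bytes, so readers scale by the page size. */
static unsigned long long pages_to_bytes(unsigned long pages)
{
        return (unsigned long long)pages * SKETCH_PAGE_SIZE;
}

int main(void)
{
        struct toy_page_counter memory = {
                .count = 300, .limit = 1024, .watermark = 512, .failcnt = 2,
        };

        printf("usage_in_bytes:     %llu\n", pages_to_bytes(memory.count));
        printf("limit_in_bytes:     %llu\n", pages_to_bytes(memory.limit));
        printf("max_usage_in_bytes: %llu\n", pages_to_bytes(memory.watermark));
        printf("failcnt:            %lu\n", memory.failcnt);
        return 0;
}

After this change the internal arithmetic is done in pages; only the user-visible files keep reporting bytes.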
4068#ifdef CONFIG_MEMCG_KMEM 3531#ifdef CONFIG_MEMCG_KMEM
4069/* should be called with activate_kmem_mutex held */ 3532static int memcg_activate_kmem(struct mem_cgroup *memcg,
4070static int __memcg_activate_kmem(struct mem_cgroup *memcg, 3533 unsigned long nr_pages)
4071 unsigned long long limit)
4072{ 3534{
4073 int err = 0; 3535 int err = 0;
4074 int memcg_id; 3536 int memcg_id;
@@ -4115,7 +3577,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
4115 * We couldn't have accounted to this cgroup, because it hasn't got the 3577 * We couldn't have accounted to this cgroup, because it hasn't got the
4116 * active bit set yet, so this should succeed. 3578 * active bit set yet, so this should succeed.
4117 */ 3579 */
4118 err = res_counter_set_limit(&memcg->kmem, limit); 3580 err = page_counter_limit(&memcg->kmem, nr_pages);
4119 VM_BUG_ON(err); 3581 VM_BUG_ON(err);
4120 3582
4121 static_key_slow_inc(&memcg_kmem_enabled_key); 3583 static_key_slow_inc(&memcg_kmem_enabled_key);
@@ -4130,26 +3592,17 @@ out:
4130 return err; 3592 return err;
4131} 3593}
4132 3594
4133static int memcg_activate_kmem(struct mem_cgroup *memcg,
4134 unsigned long long limit)
4135{
4136 int ret;
4137
4138 mutex_lock(&activate_kmem_mutex);
4139 ret = __memcg_activate_kmem(memcg, limit);
4140 mutex_unlock(&activate_kmem_mutex);
4141 return ret;
4142}
4143
4144static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3595static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4145 unsigned long long val) 3596 unsigned long limit)
4146{ 3597{
4147 int ret; 3598 int ret;
4148 3599
3600 mutex_lock(&memcg_limit_mutex);
4149 if (!memcg_kmem_is_active(memcg)) 3601 if (!memcg_kmem_is_active(memcg))
4150 ret = memcg_activate_kmem(memcg, val); 3602 ret = memcg_activate_kmem(memcg, limit);
4151 else 3603 else
4152 ret = res_counter_set_limit(&memcg->kmem, val); 3604 ret = page_counter_limit(&memcg->kmem, limit);
3605 mutex_unlock(&memcg_limit_mutex);
4153 return ret; 3606 return ret;
4154} 3607}
4155 3608
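memcg_update_kmem_limit() folds the old activate_kmem_mutex into memcg_limit_mutex: the first limit write activates kmem accounting, later writes only resize the limit, all under one lock. A loose userspace sketch of that activate-once-then-resize pattern (toy names, a pthread mutex standing in for the kernel mutex):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model of "activate on first limit write, then only adjust the limit". */
static pthread_mutex_t limit_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool kmem_active;
static unsigned long kmem_limit_pages;

static int toy_update_kmem_limit(unsigned long limit)
{
        pthread_mutex_lock(&limit_mutex);
        if (!kmem_active) {
                /* one-time setup would happen here (id allocation, caches, ...) */
                kmem_active = true;
        }
        kmem_limit_pages = limit;
        pthread_mutex_unlock(&limit_mutex);
        return 0;
}

int main(void)
{
        toy_update_kmem_limit(1 << 18);     /* first write activates */
        toy_update_kmem_limit(1 << 16);     /* later writes only resize */
        printf("active=%d limit=%lu pages\n", kmem_active, kmem_limit_pages);
        return 0;
}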
@@ -4161,19 +3614,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4161 if (!parent) 3614 if (!parent)
4162 return 0; 3615 return 0;
4163 3616
4164 mutex_lock(&activate_kmem_mutex); 3617 mutex_lock(&memcg_limit_mutex);
4165 /* 3618 /*
4166 * If the parent cgroup is not kmem-active now, it cannot be activated 3619 * If the parent cgroup is not kmem-active now, it cannot be activated
4167 * after this point, because it has at least one child already. 3620 * after this point, because it has at least one child already.
4168 */ 3621 */
4169 if (memcg_kmem_is_active(parent)) 3622 if (memcg_kmem_is_active(parent))
4170 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 3623 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
4171 mutex_unlock(&activate_kmem_mutex); 3624 mutex_unlock(&memcg_limit_mutex);
4172 return ret; 3625 return ret;
4173} 3626}
4174#else 3627#else
4175static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 3628static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4176 unsigned long long val) 3629 unsigned long limit)
4177{ 3630{
4178 return -EINVAL; 3631 return -EINVAL;
4179} 3632}
@@ -4187,110 +3640,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
4187 char *buf, size_t nbytes, loff_t off) 3640 char *buf, size_t nbytes, loff_t off)
4188{ 3641{
4189 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3642 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4190 enum res_type type; 3643 unsigned long nr_pages;
4191 int name;
4192 unsigned long long val;
4193 int ret; 3644 int ret;
4194 3645
4195 buf = strstrip(buf); 3646 buf = strstrip(buf);
4196 type = MEMFILE_TYPE(of_cft(of)->private); 3647 ret = page_counter_memparse(buf, &nr_pages);
4197 name = MEMFILE_ATTR(of_cft(of)->private); 3648 if (ret)
3649 return ret;
4198 3650
4199 switch (name) { 3651 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4200 case RES_LIMIT: 3652 case RES_LIMIT:
4201 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3653 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4202 ret = -EINVAL; 3654 ret = -EINVAL;
4203 break; 3655 break;
4204 } 3656 }
4205 /* This function does all necessary parse...reuse it */ 3657 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4206 ret = res_counter_memparse_write_strategy(buf, &val); 3658 case _MEM:
4207 if (ret) 3659 ret = mem_cgroup_resize_limit(memcg, nr_pages);
4208 break; 3660 break;
4209 if (type == _MEM) 3661 case _MEMSWAP:
4210 ret = mem_cgroup_resize_limit(memcg, val); 3662 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
4211 else if (type == _MEMSWAP)
4212 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4213 else if (type == _KMEM)
4214 ret = memcg_update_kmem_limit(memcg, val);
4215 else
4216 return -EINVAL;
4217 break;
4218 case RES_SOFT_LIMIT:
4219 ret = res_counter_memparse_write_strategy(buf, &val);
4220 if (ret)
4221 break; 3663 break;
4222 /* 3664 case _KMEM:
4223 * For memsw, soft limits are hard to implement in terms 3665 ret = memcg_update_kmem_limit(memcg, nr_pages);
4224 * of semantics, for now, we support soft limits for 3666 break;
4225 * control without swap 3667 }
4226 */
4227 if (type == _MEM)
4228 ret = res_counter_set_soft_limit(&memcg->res, val);
4229 else
4230 ret = -EINVAL;
4231 break; 3668 break;
4232 default: 3669 case RES_SOFT_LIMIT:
4233 ret = -EINVAL; /* should be BUG() ? */ 3670 memcg->soft_limit = nr_pages;
3671 ret = 0;
4234 break; 3672 break;
4235 } 3673 }
4236 return ret ?: nbytes; 3674 return ret ?: nbytes;
4237} 3675}
4238 3676
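mem_cgroup_write() now parses the user string once with page_counter_memparse(), which accepts "-1" for "no limit" and otherwise converts a byte string into a number of pages. A rough userspace imitation of that parsing step (the kernel uses memparse(); the helper below and its K/M/G handling are only an approximation):

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096UL
#define SKETCH_COUNTER_MAX (ULONG_MAX / SKETCH_PAGE_SIZE)

/* Rough imitation of page_counter_memparse(): "-1" means no limit, anything
 * else is a byte count (optional K/M/G suffix) converted to whole pages. */
static int toy_memparse(const char *buf, unsigned long *nr_pages)
{
        unsigned long long bytes;
        char *end;

        if (!strcmp(buf, "-1")) {
                *nr_pages = SKETCH_COUNTER_MAX;
                return 0;
        }

        errno = 0;
        bytes = strtoull(buf, &end, 0);
        if (errno || end == buf)
                return -EINVAL;
        switch (*end) {
        case 'G': case 'g': bytes <<= 10;       /* fall through */
        case 'M': case 'm': bytes <<= 10;       /* fall through */
        case 'K': case 'k': bytes <<= 10; end++; break;
        }
        if (*end != '\0')
                return -EINVAL;

        *nr_pages = bytes / SKETCH_PAGE_SIZE;   /* round down to whole pages */
        return 0;
}

int main(void)
{
        unsigned long pages;

        if (!toy_memparse("512M", &pages))
                printf("512M -> %lu pages\n", pages);
        if (!toy_memparse("-1", &pages))
                printf("-1   -> %lu pages (unlimited)\n", pages);
        return 0;
}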
4239static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4240 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4241{
4242 unsigned long long min_limit, min_memsw_limit, tmp;
4243
4244 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4245 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4246 if (!memcg->use_hierarchy)
4247 goto out;
4248
4249 while (memcg->css.parent) {
4250 memcg = mem_cgroup_from_css(memcg->css.parent);
4251 if (!memcg->use_hierarchy)
4252 break;
4253 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4254 min_limit = min(min_limit, tmp);
4255 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4256 min_memsw_limit = min(min_memsw_limit, tmp);
4257 }
4258out:
4259 *mem_limit = min_limit;
4260 *memsw_limit = min_memsw_limit;
4261}
4262
4263static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 3677static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
4264 size_t nbytes, loff_t off) 3678 size_t nbytes, loff_t off)
4265{ 3679{
4266 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3680 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4267 int name; 3681 struct page_counter *counter;
4268 enum res_type type;
4269 3682
4270 type = MEMFILE_TYPE(of_cft(of)->private); 3683 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4271 name = MEMFILE_ATTR(of_cft(of)->private); 3684 case _MEM:
3685 counter = &memcg->memory;
3686 break;
3687 case _MEMSWAP:
3688 counter = &memcg->memsw;
3689 break;
3690 case _KMEM:
3691 counter = &memcg->kmem;
3692 break;
3693 default:
3694 BUG();
3695 }
4272 3696
4273 switch (name) { 3697 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4274 case RES_MAX_USAGE: 3698 case RES_MAX_USAGE:
4275 if (type == _MEM) 3699 page_counter_reset_watermark(counter);
4276 res_counter_reset_max(&memcg->res);
4277 else if (type == _MEMSWAP)
4278 res_counter_reset_max(&memcg->memsw);
4279 else if (type == _KMEM)
4280 res_counter_reset_max(&memcg->kmem);
4281 else
4282 return -EINVAL;
4283 break; 3700 break;
4284 case RES_FAILCNT: 3701 case RES_FAILCNT:
4285 if (type == _MEM) 3702 counter->failcnt = 0;
4286 res_counter_reset_failcnt(&memcg->res);
4287 else if (type == _MEMSWAP)
4288 res_counter_reset_failcnt(&memcg->memsw);
4289 else if (type == _KMEM)
4290 res_counter_reset_failcnt(&memcg->kmem);
4291 else
4292 return -EINVAL;
4293 break; 3703 break;
3704 default:
3705 BUG();
4294 } 3706 }
4295 3707
4296 return nbytes; 3708 return nbytes;
@@ -4387,6 +3799,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4387static int memcg_stat_show(struct seq_file *m, void *v) 3799static int memcg_stat_show(struct seq_file *m, void *v)
4388{ 3800{
4389 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 3801 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3802 unsigned long memory, memsw;
4390 struct mem_cgroup *mi; 3803 struct mem_cgroup *mi;
4391 unsigned int i; 3804 unsigned int i;
4392 3805
@@ -4406,14 +3819,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
4406 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 3819 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4407 3820
4408 /* Hierarchical information */ 3821 /* Hierarchical information */
4409 { 3822 memory = memsw = PAGE_COUNTER_MAX;
4410 unsigned long long limit, memsw_limit; 3823 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4411 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 3824 memory = min(memory, mi->memory.limit);
4412 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 3825 memsw = min(memsw, mi->memsw.limit);
4413 if (do_swap_account)
4414 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4415 memsw_limit);
4416 } 3826 }
3827 seq_printf(m, "hierarchical_memory_limit %llu\n",
3828 (u64)memory * PAGE_SIZE);
3829 if (do_swap_account)
3830 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3831 (u64)memsw * PAGE_SIZE);
4417 3832
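The hunk above drops memcg_get_hierarchical_limit() and instead walks up the parent chain inline, taking the minimum memory and memsw limits along the way. A small standalone sketch of that minimum-over-ancestors walk, with toy structs in place of struct mem_cgroup:

#include <stdio.h>

#define TOY_COUNTER_MAX (~0UL)

/* Toy memcg with a parent pointer and a page limit, enough to show the walk. */
struct toy_memcg {
        const char *name;
        unsigned long limit;            /* in pages */
        struct toy_memcg *parent;
};

/* The effective hierarchical limit is the smallest limit on the path to the root. */
static unsigned long hierarchical_limit(struct toy_memcg *memcg)
{
        unsigned long min_limit = TOY_COUNTER_MAX;

        for (; memcg; memcg = memcg->parent)
                if (memcg->limit < min_limit)
                        min_limit = memcg->limit;
        return min_limit;
}

int main(void)
{
        struct toy_memcg root  = { "root",  TOY_COUNTER_MAX, NULL };
        struct toy_memcg mid   = { "mid",   1 << 18,         &root };
        struct toy_memcg child = { "child", 1 << 20,         &mid };

        /* child's own limit is larger, but the ancestor's 2^18 pages win */
        printf("hierarchical limit: %lu pages\n", hierarchical_limit(&child));
        return 0;
}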
4418 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3833 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4419 long long val = 0; 3834 long long val = 0;
@@ -4497,7 +3912,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4497static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3912static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4498{ 3913{
4499 struct mem_cgroup_threshold_ary *t; 3914 struct mem_cgroup_threshold_ary *t;
4500 u64 usage; 3915 unsigned long usage;
4501 int i; 3916 int i;
4502 3917
4503 rcu_read_lock(); 3918 rcu_read_lock();
@@ -4596,10 +4011,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4596{ 4011{
4597 struct mem_cgroup_thresholds *thresholds; 4012 struct mem_cgroup_thresholds *thresholds;
4598 struct mem_cgroup_threshold_ary *new; 4013 struct mem_cgroup_threshold_ary *new;
4599 u64 threshold, usage; 4014 unsigned long threshold;
4015 unsigned long usage;
4600 int i, size, ret; 4016 int i, size, ret;
4601 4017
4602 ret = res_counter_memparse_write_strategy(args, &threshold); 4018 ret = page_counter_memparse(args, &threshold);
4603 if (ret) 4019 if (ret)
4604 return ret; 4020 return ret;
4605 4021
@@ -4689,7 +4105,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4689{ 4105{
4690 struct mem_cgroup_thresholds *thresholds; 4106 struct mem_cgroup_thresholds *thresholds;
4691 struct mem_cgroup_threshold_ary *new; 4107 struct mem_cgroup_threshold_ary *new;
4692 u64 usage; 4108 unsigned long usage;
4693 int i, j, size; 4109 int i, j, size;
4694 4110
4695 mutex_lock(&memcg->thresholds_lock); 4111 mutex_lock(&memcg->thresholds_lock);
@@ -4855,40 +4271,6 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4855{ 4271{
4856 mem_cgroup_sockets_destroy(memcg); 4272 mem_cgroup_sockets_destroy(memcg);
4857} 4273}
4858
4859static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4860{
4861 if (!memcg_kmem_is_active(memcg))
4862 return;
4863
4864 /*
4865 * kmem charges can outlive the cgroup. In the case of slab
4866 * pages, for instance, a page contain objects from various
4867 * processes. As we prevent from taking a reference for every
4868 * such allocation we have to be careful when doing uncharge
4869 * (see memcg_uncharge_kmem) and here during offlining.
4870 *
4871 * The idea is that only the _last_ uncharge which sees
4872 * the dead memcg will drop the last reference. An additional
4873 * reference is taken here before the group is marked dead
4874 * which is then paired with css_put during uncharge resp. here.
4875 *
4876 * Although this might sound strange as this path is called from
4877 * css_offline() when the reference might have dropped down to 0 and
4878 * shouldn't be incremented anymore (css_tryget_online() would
4879 * fail) we do not have other options because of the kmem
4880 * allocations lifetime.
4881 */
4882 css_get(&memcg->css);
4883
4884 memcg_kmem_mark_dead(memcg);
4885
4886 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
4887 return;
4888
4889 if (memcg_kmem_test_and_clear_dead(memcg))
4890 css_put(&memcg->css);
4891}
4892#else 4274#else
4893static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4275static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4894{ 4276{
@@ -4898,10 +4280,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4898static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4280static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4899{ 4281{
4900} 4282}
4901
4902static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4903{
4904}
4905#endif 4283#endif
4906 4284
4907/* 4285/*
@@ -5228,7 +4606,10 @@ static struct cftype mem_cgroup_files[] = {
5228#ifdef CONFIG_SLABINFO 4606#ifdef CONFIG_SLABINFO
5229 { 4607 {
5230 .name = "kmem.slabinfo", 4608 .name = "kmem.slabinfo",
5231 .seq_show = mem_cgroup_slabinfo_read, 4609 .seq_start = slab_start,
4610 .seq_next = slab_next,
4611 .seq_stop = slab_stop,
4612 .seq_show = memcg_slab_show,
5232 }, 4613 },
5233#endif 4614#endif
5234#endif 4615#endif
@@ -5363,9 +4744,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
5363 */ 4744 */
5364struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 4745struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5365{ 4746{
5366 if (!memcg->res.parent) 4747 if (!memcg->memory.parent)
5367 return NULL; 4748 return NULL;
5368 return mem_cgroup_from_res_counter(memcg->res.parent, res); 4749 return mem_cgroup_from_counter(memcg->memory.parent, memory);
5369} 4750}
5370EXPORT_SYMBOL(parent_mem_cgroup); 4751EXPORT_SYMBOL(parent_mem_cgroup);
5371 4752
@@ -5410,9 +4791,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5410 /* root ? */ 4791 /* root ? */
5411 if (parent_css == NULL) { 4792 if (parent_css == NULL) {
5412 root_mem_cgroup = memcg; 4793 root_mem_cgroup = memcg;
5413 res_counter_init(&memcg->res, NULL); 4794 page_counter_init(&memcg->memory, NULL);
5414 res_counter_init(&memcg->memsw, NULL); 4795 page_counter_init(&memcg->memsw, NULL);
5415 res_counter_init(&memcg->kmem, NULL); 4796 page_counter_init(&memcg->kmem, NULL);
5416 } 4797 }
5417 4798
5418 memcg->last_scanned_node = MAX_NUMNODES; 4799 memcg->last_scanned_node = MAX_NUMNODES;
@@ -5451,18 +4832,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5451 memcg->swappiness = mem_cgroup_swappiness(parent); 4832 memcg->swappiness = mem_cgroup_swappiness(parent);
5452 4833
5453 if (parent->use_hierarchy) { 4834 if (parent->use_hierarchy) {
5454 res_counter_init(&memcg->res, &parent->res); 4835 page_counter_init(&memcg->memory, &parent->memory);
5455 res_counter_init(&memcg->memsw, &parent->memsw); 4836 page_counter_init(&memcg->memsw, &parent->memsw);
5456 res_counter_init(&memcg->kmem, &parent->kmem); 4837 page_counter_init(&memcg->kmem, &parent->kmem);
5457 4838
5458 /* 4839 /*
5459 * No need to take a reference to the parent because cgroup 4840 * No need to take a reference to the parent because cgroup
5460 * core guarantees its existence. 4841 * core guarantees its existence.
5461 */ 4842 */
5462 } else { 4843 } else {
5463 res_counter_init(&memcg->res, NULL); 4844 page_counter_init(&memcg->memory, NULL);
5464 res_counter_init(&memcg->memsw, NULL); 4845 page_counter_init(&memcg->memsw, NULL);
5465 res_counter_init(&memcg->kmem, NULL); 4846 page_counter_init(&memcg->kmem, NULL);
5466 /* 4847 /*
5467 * Deeper hierachy with use_hierarchy == false doesn't make 4848 * Deeper hierachy with use_hierarchy == false doesn't make
5468 * much sense so let cgroup subsystem know about this 4849 * much sense so let cgroup subsystem know about this
@@ -5487,29 +4868,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
5487 return 0; 4868 return 0;
5488} 4869}
5489 4870
5490/*
5491 * Announce all parents that a group from their hierarchy is gone.
5492 */
5493static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
5494{
5495 struct mem_cgroup *parent = memcg;
5496
5497 while ((parent = parent_mem_cgroup(parent)))
5498 mem_cgroup_iter_invalidate(parent);
5499
5500 /*
5501 * if the root memcg is not hierarchical we have to check it
5502 * explicitely.
5503 */
5504 if (!root_mem_cgroup->use_hierarchy)
5505 mem_cgroup_iter_invalidate(root_mem_cgroup);
5506}
5507
5508static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 4871static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5509{ 4872{
5510 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4873 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5511 struct mem_cgroup_event *event, *tmp; 4874 struct mem_cgroup_event *event, *tmp;
5512 struct cgroup_subsys_state *iter;
5513 4875
5514 /* 4876 /*
5515 * Unregister events and notify userspace. 4877 * Unregister events and notify userspace.
@@ -5523,17 +4885,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5523 } 4885 }
5524 spin_unlock(&memcg->event_list_lock); 4886 spin_unlock(&memcg->event_list_lock);
5525 4887
5526 kmem_cgroup_css_offline(memcg);
5527
5528 mem_cgroup_invalidate_reclaim_iterators(memcg);
5529
5530 /*
5531 * This requires that offlining is serialized. Right now that is
5532 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
5533 */
5534 css_for_each_descendant_post(iter, css)
5535 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
5536
5537 memcg_unregister_all_caches(memcg); 4888 memcg_unregister_all_caches(memcg);
5538 vmpressure_cleanup(&memcg->vmpressure); 4889 vmpressure_cleanup(&memcg->vmpressure);
5539} 4890}
@@ -5541,42 +4892,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5541static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4892static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5542{ 4893{
5543 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4894 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5544 /*
5545 * XXX: css_offline() would be where we should reparent all
5546 * memory to prepare the cgroup for destruction. However,
5547 * memcg does not do css_tryget_online() and res_counter charging
5548 * under the same RCU lock region, which means that charging
5549 * could race with offlining. Offlining only happens to
5550 * cgroups with no tasks in them but charges can show up
5551 * without any tasks from the swapin path when the target
5552 * memcg is looked up from the swapout record and not from the
5553 * current task as it usually is. A race like this can leak
5554 * charges and put pages with stale cgroup pointers into
5555 * circulation:
5556 *
5557 * #0 #1
5558 * lookup_swap_cgroup_id()
5559 * rcu_read_lock()
5560 * mem_cgroup_lookup()
5561 * css_tryget_online()
5562 * rcu_read_unlock()
5563 * disable css_tryget_online()
5564 * call_rcu()
5565 * offline_css()
5566 * reparent_charges()
5567 * res_counter_charge()
5568 * css_put()
5569 * css_free()
5570 * pc->mem_cgroup = dead memcg
5571 * add page to lru
5572 *
5573 * The bulk of the charges are still moved in offline_css() to
5574 * avoid pinning a lot of pages in case a long-term reference
5575 * like a swapout record is deferring the css_free() to long
5576 * after offlining. But this makes sure we catch any charges
5577 * made after offlining:
5578 */
5579 mem_cgroup_reparent_charges(memcg);
5580 4895
5581 memcg_destroy_kmem(memcg); 4896 memcg_destroy_kmem(memcg);
5582 __mem_cgroup_free(memcg); 4897 __mem_cgroup_free(memcg);
@@ -5599,10 +4914,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5599{ 4914{
5600 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4915 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5601 4916
5602 mem_cgroup_resize_limit(memcg, ULLONG_MAX); 4917 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
5603 mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); 4918 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
5604 memcg_update_kmem_limit(memcg, ULLONG_MAX); 4919 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
5605 res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); 4920 memcg->soft_limit = 0;
5606} 4921}
5607 4922
5608#ifdef CONFIG_MMU 4923#ifdef CONFIG_MMU
@@ -5758,7 +5073,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5758 unsigned long addr, pte_t ptent, union mc_target *target) 5073 unsigned long addr, pte_t ptent, union mc_target *target)
5759{ 5074{
5760 struct page *page = NULL; 5075 struct page *page = NULL;
5761 struct page_cgroup *pc;
5762 enum mc_target_type ret = MC_TARGET_NONE; 5076 enum mc_target_type ret = MC_TARGET_NONE;
5763 swp_entry_t ent = { .val = 0 }; 5077 swp_entry_t ent = { .val = 0 };
5764 5078
@@ -5772,13 +5086,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5772 if (!page && !ent.val) 5086 if (!page && !ent.val)
5773 return ret; 5087 return ret;
5774 if (page) { 5088 if (page) {
5775 pc = lookup_page_cgroup(page);
5776 /* 5089 /*
5777 * Do only loose check w/o serialization. 5090 * Do only loose check w/o serialization.
5778 * mem_cgroup_move_account() checks the pc is valid or 5091 * mem_cgroup_move_account() checks the page is valid or
5779 * not under LRU exclusion. 5092 * not under LRU exclusion.
5780 */ 5093 */
5781 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5094 if (page->mem_cgroup == mc.from) {
5782 ret = MC_TARGET_PAGE; 5095 ret = MC_TARGET_PAGE;
5783 if (target) 5096 if (target)
5784 target->page = page; 5097 target->page = page;
@@ -5806,15 +5119,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5806 unsigned long addr, pmd_t pmd, union mc_target *target) 5119 unsigned long addr, pmd_t pmd, union mc_target *target)
5807{ 5120{
5808 struct page *page = NULL; 5121 struct page *page = NULL;
5809 struct page_cgroup *pc;
5810 enum mc_target_type ret = MC_TARGET_NONE; 5122 enum mc_target_type ret = MC_TARGET_NONE;
5811 5123
5812 page = pmd_page(pmd); 5124 page = pmd_page(pmd);
5813 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5125 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5814 if (!move_anon()) 5126 if (!move_anon())
5815 return ret; 5127 return ret;
5816 pc = lookup_page_cgroup(page); 5128 if (page->mem_cgroup == mc.from) {
5817 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5818 ret = MC_TARGET_PAGE; 5129 ret = MC_TARGET_PAGE;
5819 if (target) { 5130 if (target) {
5820 get_page(page); 5131 get_page(page);
@@ -5897,7 +5208,6 @@ static void __mem_cgroup_clear_mc(void)
5897{ 5208{
5898 struct mem_cgroup *from = mc.from; 5209 struct mem_cgroup *from = mc.from;
5899 struct mem_cgroup *to = mc.to; 5210 struct mem_cgroup *to = mc.to;
5900 int i;
5901 5211
5902 /* we must uncharge all the leftover precharges from mc.to */ 5212 /* we must uncharge all the leftover precharges from mc.to */
5903 if (mc.precharge) { 5213 if (mc.precharge) {
@@ -5916,19 +5226,17 @@ static void __mem_cgroup_clear_mc(void)
5916 if (mc.moved_swap) { 5226 if (mc.moved_swap) {
5917 /* uncharge swap account from the old cgroup */ 5227 /* uncharge swap account from the old cgroup */
5918 if (!mem_cgroup_is_root(mc.from)) 5228 if (!mem_cgroup_is_root(mc.from))
5919 res_counter_uncharge(&mc.from->memsw, 5229 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5920 PAGE_SIZE * mc.moved_swap);
5921
5922 for (i = 0; i < mc.moved_swap; i++)
5923 css_put(&mc.from->css);
5924 5230
5925 /* 5231 /*
5926 * we charged both to->res and to->memsw, so we should 5232 * we charged both to->memory and to->memsw, so we
5927 * uncharge to->res. 5233 * should uncharge to->memory.
5928 */ 5234 */
5929 if (!mem_cgroup_is_root(mc.to)) 5235 if (!mem_cgroup_is_root(mc.to))
5930 res_counter_uncharge(&mc.to->res, 5236 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5931 PAGE_SIZE * mc.moved_swap); 5237
5238 css_put_many(&mc.from->css, mc.moved_swap);
5239
5932 /* we've already done css_get(mc.to) */ 5240 /* we've already done css_get(mc.to) */
5933 mc.moved_swap = 0; 5241 mc.moved_swap = 0;
5934 } 5242 }
@@ -5939,8 +5247,6 @@ static void __mem_cgroup_clear_mc(void)
5939 5247
5940static void mem_cgroup_clear_mc(void) 5248static void mem_cgroup_clear_mc(void)
5941{ 5249{
5942 struct mem_cgroup *from = mc.from;
5943
5944 /* 5250 /*
5945 * we must clear moving_task before waking up waiters at the end of 5251 * we must clear moving_task before waking up waiters at the end of
5946 * task migration. 5252 * task migration.
@@ -5951,7 +5257,6 @@ static void mem_cgroup_clear_mc(void)
5951 mc.from = NULL; 5257 mc.from = NULL;
5952 mc.to = NULL; 5258 mc.to = NULL;
5953 spin_unlock(&mc.lock); 5259 spin_unlock(&mc.lock);
5954 mem_cgroup_end_move(from);
5955} 5260}
5956 5261
5957static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5262static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
@@ -5984,7 +5289,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5984 VM_BUG_ON(mc.precharge); 5289 VM_BUG_ON(mc.precharge);
5985 VM_BUG_ON(mc.moved_charge); 5290 VM_BUG_ON(mc.moved_charge);
5986 VM_BUG_ON(mc.moved_swap); 5291 VM_BUG_ON(mc.moved_swap);
5987 mem_cgroup_start_move(from); 5292
5988 spin_lock(&mc.lock); 5293 spin_lock(&mc.lock);
5989 mc.from = from; 5294 mc.from = from;
5990 mc.to = memcg; 5295 mc.to = memcg;
@@ -6004,7 +5309,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6004static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5309static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6005 struct cgroup_taskset *tset) 5310 struct cgroup_taskset *tset)
6006{ 5311{
6007 mem_cgroup_clear_mc(); 5312 if (mc.to)
5313 mem_cgroup_clear_mc();
6008} 5314}
6009 5315
6010static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5316static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
@@ -6018,7 +5324,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6018 enum mc_target_type target_type; 5324 enum mc_target_type target_type;
6019 union mc_target target; 5325 union mc_target target;
6020 struct page *page; 5326 struct page *page;
6021 struct page_cgroup *pc;
6022 5327
6023 /* 5328 /*
6024 * We don't take compound_lock() here but no race with splitting thp 5329 * We don't take compound_lock() here but no race with splitting thp
@@ -6039,9 +5344,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6039 if (target_type == MC_TARGET_PAGE) { 5344 if (target_type == MC_TARGET_PAGE) {
6040 page = target.page; 5345 page = target.page;
6041 if (!isolate_lru_page(page)) { 5346 if (!isolate_lru_page(page)) {
6042 pc = lookup_page_cgroup(page);
6043 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5347 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6044 pc, mc.from, mc.to)) { 5348 mc.from, mc.to)) {
6045 mc.precharge -= HPAGE_PMD_NR; 5349 mc.precharge -= HPAGE_PMD_NR;
6046 mc.moved_charge += HPAGE_PMD_NR; 5350 mc.moved_charge += HPAGE_PMD_NR;
6047 } 5351 }
@@ -6069,9 +5373,7 @@ retry:
6069 page = target.page; 5373 page = target.page;
6070 if (isolate_lru_page(page)) 5374 if (isolate_lru_page(page))
6071 goto put; 5375 goto put;
6072 pc = lookup_page_cgroup(page); 5376 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
6073 if (!mem_cgroup_move_account(page, 1, pc,
6074 mc.from, mc.to)) {
6075 mc.precharge--; 5377 mc.precharge--;
6076 /* we uncharge from mc.from later. */ 5378 /* we uncharge from mc.from later. */
6077 mc.moved_charge++; 5379 mc.moved_charge++;
@@ -6115,6 +5417,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
6115 struct vm_area_struct *vma; 5417 struct vm_area_struct *vma;
6116 5418
6117 lru_add_drain_all(); 5419 lru_add_drain_all();
5420 /*
5421 * Signal mem_cgroup_begin_page_stat() to take the memcg's
5422 * move_lock while we're moving its pages to another memcg.
5423 * Then wait for already started RCU-only updates to finish.
5424 */
5425 atomic_inc(&mc.from->moving_account);
5426 synchronize_rcu();
6118retry: 5427retry:
6119 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5428 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
6120 /* 5429 /*
@@ -6147,6 +5456,7 @@ retry:
6147 break; 5456 break;
6148 } 5457 }
6149 up_read(&mm->mmap_sem); 5458 up_read(&mm->mmap_sem);
5459 atomic_dec(&mc.from->moving_account);
6150} 5460}
6151 5461
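The added moving_account/synchronize_rcu() pair signals statistics updaters that a charge move is in flight so they take the move_lock instead of updating locklessly; the atomic_dec() at the end lets them return to the fast path. A loose userspace analogue of that handshake (atomics plus a pthread mutex; the real code pairs the flag with RCU, which has no direct equivalent in this sketch):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Loose analogue of the moving_account handshake: stat updaters stay lockless
 * unless a charge move is in flight, in which case they take the move lock. */
static atomic_int moving_account;
static pthread_mutex_t move_lock = PTHREAD_MUTEX_INITIALIZER;
static long dirty_pages;

static void update_stat(long delta)
{
        int locked = 0;

        if (atomic_load(&moving_account)) {     /* slow path only during a move */
                pthread_mutex_lock(&move_lock);
                locked = 1;
        }
        dirty_pages += delta;
        if (locked)
                pthread_mutex_unlock(&move_lock);
}

static void move_charges(void)
{
        atomic_fetch_add(&moving_account, 1);
        /* the kernel: synchronize_rcu() waits out updaters that missed the flag */
        pthread_mutex_lock(&move_lock);
        /* ... walk the address space and move pages here ... */
        pthread_mutex_unlock(&move_lock);
        atomic_fetch_sub(&moving_account, 1);
}

int main(void)
{
        update_stat(+1);
        move_charges();
        update_stat(-1);
        printf("dirty_pages = %ld\n", dirty_pages);
        return 0;
}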
6152static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 5462static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
@@ -6250,7 +5560,7 @@ static void __init enable_swap_cgroup(void)
6250 */ 5560 */
6251void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5561void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6252{ 5562{
6253 struct page_cgroup *pc; 5563 struct mem_cgroup *memcg;
6254 unsigned short oldid; 5564 unsigned short oldid;
6255 5565
6256 VM_BUG_ON_PAGE(PageLRU(page), page); 5566 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -6259,20 +5569,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6259 if (!do_swap_account) 5569 if (!do_swap_account)
6260 return; 5570 return;
6261 5571
6262 pc = lookup_page_cgroup(page); 5572 memcg = page->mem_cgroup;
6263 5573
6264 /* Readahead page, never charged */ 5574 /* Readahead page, never charged */
6265 if (!PageCgroupUsed(pc)) 5575 if (!memcg)
6266 return; 5576 return;
6267 5577
6268 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); 5578 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
6269
6270 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6271 VM_BUG_ON_PAGE(oldid, page); 5579 VM_BUG_ON_PAGE(oldid, page);
5580 mem_cgroup_swap_statistics(memcg, true);
5581
5582 page->mem_cgroup = NULL;
6272 5583
6273 pc->flags &= ~PCG_MEMSW; 5584 if (!mem_cgroup_is_root(memcg))
6274 css_get(&pc->mem_cgroup->css); 5585 page_counter_uncharge(&memcg->memory, 1);
6275 mem_cgroup_swap_statistics(pc->mem_cgroup, true); 5586
5587 /* XXX: caller holds IRQ-safe mapping->tree_lock */
5588 VM_BUG_ON(!irqs_disabled());
5589
5590 mem_cgroup_charge_statistics(memcg, page, -1);
5591 memcg_check_events(memcg, page);
6276} 5592}
6277 5593
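The reworked mem_cgroup_swapout() hands the charge over to the swap slot: it records the memcg id for the entry, clears page->mem_cgroup, and releases the plain memory charge while leaving memsw charged until the slot is freed. A toy model of that hand-over (invented types, fixed-size swap table):

#include <assert.h>
#include <stdio.h>

#define TOY_SWAP_SLOTS 8

/* Toy bookkeeping: which cgroup id owns each swap slot (0 = nobody). */
static unsigned short swap_owner[TOY_SWAP_SLOTS];

struct toy_memcg { unsigned short id; unsigned long memory; unsigned long memsw; };
struct toy_page  { struct toy_memcg *memcg; };

/* Sketch of the swap-out hand-over: the page drops its memcg pointer and its
 * "memory" charge, while "memsw" stays charged until the swap slot is freed. */
static void toy_swapout(struct toy_page *page, int slot)
{
        struct toy_memcg *memcg = page->memcg;

        if (!memcg)                     /* readahead page, never charged */
                return;
        assert(swap_owner[slot] == 0);
        swap_owner[slot] = memcg->id;   /* remember the owner for swap-in/uncharge */
        page->memcg = NULL;
        memcg->memory--;                /* memsw keeps covering the swapped page */
}

int main(void)
{
        struct toy_memcg m = { .id = 7, .memory = 4, .memsw = 4 };
        struct toy_page p = { .memcg = &m };

        toy_swapout(&p, 3);
        printf("slot 3 owner=%u memory=%lu memsw=%lu\n",
               swap_owner[3], m.memory, m.memsw);
        return 0;
}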
6278/** 5594/**
@@ -6294,7 +5610,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
6294 memcg = mem_cgroup_lookup(id); 5610 memcg = mem_cgroup_lookup(id);
6295 if (memcg) { 5611 if (memcg) {
6296 if (!mem_cgroup_is_root(memcg)) 5612 if (!mem_cgroup_is_root(memcg))
6297 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 5613 page_counter_uncharge(&memcg->memsw, 1);
6298 mem_cgroup_swap_statistics(memcg, false); 5614 mem_cgroup_swap_statistics(memcg, false);
6299 css_put(&memcg->css); 5615 css_put(&memcg->css);
6300 } 5616 }
@@ -6330,7 +5646,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6330 goto out; 5646 goto out;
6331 5647
6332 if (PageSwapCache(page)) { 5648 if (PageSwapCache(page)) {
6333 struct page_cgroup *pc = lookup_page_cgroup(page);
6334 /* 5649 /*
6335 * Every swap fault against a single page tries to charge the 5650 * Every swap fault against a single page tries to charge the
6336 * page, bail as early as possible. shmem_unuse() encounters 5651 * page, bail as early as possible. shmem_unuse() encounters
@@ -6338,7 +5653,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6338 * the page lock, which serializes swap cache removal, which 5653 * the page lock, which serializes swap cache removal, which
6339 * in turn serializes uncharging. 5654 * in turn serializes uncharging.
6340 */ 5655 */
6341 if (PageCgroupUsed(pc)) 5656 if (page->mem_cgroup)
6342 goto out; 5657 goto out;
6343 } 5658 }
6344 5659
@@ -6452,19 +5767,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6452} 5767}
6453 5768
6454static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5769static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6455 unsigned long nr_mem, unsigned long nr_memsw,
6456 unsigned long nr_anon, unsigned long nr_file, 5770 unsigned long nr_anon, unsigned long nr_file,
6457 unsigned long nr_huge, struct page *dummy_page) 5771 unsigned long nr_huge, struct page *dummy_page)
6458{ 5772{
5773 unsigned long nr_pages = nr_anon + nr_file;
6459 unsigned long flags; 5774 unsigned long flags;
6460 5775
6461 if (!mem_cgroup_is_root(memcg)) { 5776 if (!mem_cgroup_is_root(memcg)) {
6462 if (nr_mem) 5777 page_counter_uncharge(&memcg->memory, nr_pages);
6463 res_counter_uncharge(&memcg->res, 5778 if (do_swap_account)
6464 nr_mem * PAGE_SIZE); 5779 page_counter_uncharge(&memcg->memsw, nr_pages);
6465 if (nr_memsw)
6466 res_counter_uncharge(&memcg->memsw,
6467 nr_memsw * PAGE_SIZE);
6468 memcg_oom_recover(memcg); 5780 memcg_oom_recover(memcg);
6469 } 5781 }
6470 5782
@@ -6473,27 +5785,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6473 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 5785 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
6474 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 5786 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
6475 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 5787 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
6476 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); 5788 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
6477 memcg_check_events(memcg, dummy_page); 5789 memcg_check_events(memcg, dummy_page);
6478 local_irq_restore(flags); 5790 local_irq_restore(flags);
5791
5792 if (!mem_cgroup_is_root(memcg))
5793 css_put_many(&memcg->css, nr_pages);
6479} 5794}
6480 5795
6481static void uncharge_list(struct list_head *page_list) 5796static void uncharge_list(struct list_head *page_list)
6482{ 5797{
6483 struct mem_cgroup *memcg = NULL; 5798 struct mem_cgroup *memcg = NULL;
6484 unsigned long nr_memsw = 0;
6485 unsigned long nr_anon = 0; 5799 unsigned long nr_anon = 0;
6486 unsigned long nr_file = 0; 5800 unsigned long nr_file = 0;
6487 unsigned long nr_huge = 0; 5801 unsigned long nr_huge = 0;
6488 unsigned long pgpgout = 0; 5802 unsigned long pgpgout = 0;
6489 unsigned long nr_mem = 0;
6490 struct list_head *next; 5803 struct list_head *next;
6491 struct page *page; 5804 struct page *page;
6492 5805
6493 next = page_list->next; 5806 next = page_list->next;
6494 do { 5807 do {
6495 unsigned int nr_pages = 1; 5808 unsigned int nr_pages = 1;
6496 struct page_cgroup *pc;
6497 5809
6498 page = list_entry(next, struct page, lru); 5810 page = list_entry(next, struct page, lru);
6499 next = page->lru.next; 5811 next = page->lru.next;
@@ -6501,24 +5813,22 @@ static void uncharge_list(struct list_head *page_list)
6501 VM_BUG_ON_PAGE(PageLRU(page), page); 5813 VM_BUG_ON_PAGE(PageLRU(page), page);
6502 VM_BUG_ON_PAGE(page_count(page), page); 5814 VM_BUG_ON_PAGE(page_count(page), page);
6503 5815
6504 pc = lookup_page_cgroup(page); 5816 if (!page->mem_cgroup)
6505 if (!PageCgroupUsed(pc))
6506 continue; 5817 continue;
6507 5818
6508 /* 5819 /*
6509 * Nobody should be changing or seriously looking at 5820 * Nobody should be changing or seriously looking at
6510 * pc->mem_cgroup and pc->flags at this point, we have 5821 * page->mem_cgroup at this point, we have fully
6511 * fully exclusive access to the page. 5822 * exclusive access to the page.
6512 */ 5823 */
6513 5824
6514 if (memcg != pc->mem_cgroup) { 5825 if (memcg != page->mem_cgroup) {
6515 if (memcg) { 5826 if (memcg) {
6516 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 5827 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
6517 nr_anon, nr_file, nr_huge, page); 5828 nr_huge, page);
6518 pgpgout = nr_mem = nr_memsw = 0; 5829 pgpgout = nr_anon = nr_file = nr_huge = 0;
6519 nr_anon = nr_file = nr_huge = 0;
6520 } 5830 }
6521 memcg = pc->mem_cgroup; 5831 memcg = page->mem_cgroup;
6522 } 5832 }
6523 5833
6524 if (PageTransHuge(page)) { 5834 if (PageTransHuge(page)) {
@@ -6532,18 +5842,14 @@ static void uncharge_list(struct list_head *page_list)
6532 else 5842 else
6533 nr_file += nr_pages; 5843 nr_file += nr_pages;
6534 5844
6535 if (pc->flags & PCG_MEM) 5845 page->mem_cgroup = NULL;
6536 nr_mem += nr_pages;
6537 if (pc->flags & PCG_MEMSW)
6538 nr_memsw += nr_pages;
6539 pc->flags = 0;
6540 5846
6541 pgpgout++; 5847 pgpgout++;
6542 } while (next != page_list); 5848 } while (next != page_list);
6543 5849
6544 if (memcg) 5850 if (memcg)
6545 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 5851 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
6546 nr_anon, nr_file, nr_huge, page); 5852 nr_huge, page);
6547} 5853}
6548 5854
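uncharge_list() batches pages by their owning memcg: counts accumulate while consecutive pages share an owner and are flushed through uncharge_batch() whenever the owner changes, plus once at the end. A self-contained sketch of that batching shape (toy types, printf standing in for the real counter updates):

#include <stdio.h>

struct toy_memcg { const char *name; };
struct toy_page  { struct toy_memcg *memcg; unsigned int nr_pages; };

/* Stand-in for uncharge_batch(): one counter update per run of same-owner pages. */
static void toy_flush(struct toy_memcg *memcg, unsigned long nr_pages)
{
        if (memcg && nr_pages)
                printf("uncharge %lu pages from %s\n", nr_pages, memcg->name);
}

/* Sketch of the batching loop: accumulate while the owner stays the same,
 * flush whenever it changes, and once more after the last page. */
static void toy_uncharge_list(struct toy_page *pages, int n)
{
        struct toy_memcg *memcg = NULL;
        unsigned long batch = 0;

        for (int i = 0; i < n; i++) {
                if (!pages[i].memcg)            /* never charged, skip */
                        continue;
                if (memcg != pages[i].memcg) {
                        toy_flush(memcg, batch);
                        batch = 0;
                        memcg = pages[i].memcg;
                }
                batch += pages[i].nr_pages;
                pages[i].memcg = NULL;          /* page no longer charged */
        }
        toy_flush(memcg, batch);
}

int main(void)
{
        struct toy_memcg a = { "A" }, b = { "B" };
        struct toy_page list[] = {
                { &a, 1 }, { &a, 512 }, { &b, 1 }, { NULL, 1 }, { &b, 1 },
        };

        toy_uncharge_list(list, 5);
        return 0;
}

Batching this way keeps the expensive counter and statistics updates to one per run of same-owner pages instead of one per page.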
6549/** 5855/**
@@ -6555,14 +5861,11 @@ static void uncharge_list(struct list_head *page_list)
6555 */ 5861 */
6556void mem_cgroup_uncharge(struct page *page) 5862void mem_cgroup_uncharge(struct page *page)
6557{ 5863{
6558 struct page_cgroup *pc;
6559
6560 if (mem_cgroup_disabled()) 5864 if (mem_cgroup_disabled())
6561 return; 5865 return;
6562 5866
6563 /* Don't touch page->lru of any random page, pre-check: */ 5867 /* Don't touch page->lru of any random page, pre-check: */
6564 pc = lookup_page_cgroup(page); 5868 if (!page->mem_cgroup)
6565 if (!PageCgroupUsed(pc))
6566 return; 5869 return;
6567 5870
6568 INIT_LIST_HEAD(&page->lru); 5871 INIT_LIST_HEAD(&page->lru);
@@ -6598,7 +5901,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
6598void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 5901void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6599 bool lrucare) 5902 bool lrucare)
6600{ 5903{
6601 struct page_cgroup *pc; 5904 struct mem_cgroup *memcg;
6602 int isolated; 5905 int isolated;
6603 5906
6604 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5907 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -6613,27 +5916,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6613 return; 5916 return;
6614 5917
6615 /* Page cache replacement: new page already charged? */ 5918 /* Page cache replacement: new page already charged? */
6616 pc = lookup_page_cgroup(newpage); 5919 if (newpage->mem_cgroup)
6617 if (PageCgroupUsed(pc))
6618 return; 5920 return;
6619 5921
6620 /* Re-entrant migration: old page already uncharged? */ 5922 /*
6621 pc = lookup_page_cgroup(oldpage); 5923 * Swapcache readahead pages can get migrated before being
6622 if (!PageCgroupUsed(pc)) 5924 * charged, and migration from compaction can happen to an
5925 * uncharged page when the PFN walker finds a page that
5926 * reclaim just put back on the LRU but has not released yet.
5927 */
5928 memcg = oldpage->mem_cgroup;
5929 if (!memcg)
6623 return; 5930 return;
6624 5931
6625 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6626 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6627
6628 if (lrucare) 5932 if (lrucare)
6629 lock_page_lru(oldpage, &isolated); 5933 lock_page_lru(oldpage, &isolated);
6630 5934
6631 pc->flags = 0; 5935 oldpage->mem_cgroup = NULL;
6632 5936
6633 if (lrucare) 5937 if (lrucare)
6634 unlock_page_lru(oldpage, isolated); 5938 unlock_page_lru(oldpage, isolated);
6635 5939
6636 commit_charge(newpage, pc->mem_cgroup, lrucare); 5940 commit_charge(newpage, memcg, lrucare);
6637} 5941}
6638 5942
6639/* 5943/*
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b852b10ec76d..e5ee0ca7ae85 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -233,7 +233,7 @@ void shake_page(struct page *p, int access)
233 lru_add_drain_all(); 233 lru_add_drain_all();
234 if (PageLRU(p)) 234 if (PageLRU(p))
235 return; 235 return;
236 drain_all_pages(); 236 drain_all_pages(page_zone(p));
237 if (PageLRU(p) || is_free_buddy_page(p)) 237 if (PageLRU(p) || is_free_buddy_page(p))
238 return; 238 return;
239 } 239 }
@@ -1661,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags)
1661 if (!is_free_buddy_page(page)) 1661 if (!is_free_buddy_page(page))
1662 lru_add_drain_all(); 1662 lru_add_drain_all();
1663 if (!is_free_buddy_page(page)) 1663 if (!is_free_buddy_page(page))
1664 drain_all_pages(); 1664 drain_all_pages(page_zone(page));
1665 SetPageHWPoison(page); 1665 SetPageHWPoison(page);
1666 if (!is_free_buddy_page(page)) 1666 if (!is_free_buddy_page(page))
1667 pr_info("soft offline: %#lx: page leaked\n", 1667 pr_info("soft offline: %#lx: page leaked\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1bf4807cb21e..9fab10795bea 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1725,7 +1725,7 @@ repeat:
1725 if (drain) { 1725 if (drain) {
1726 lru_add_drain_all(); 1726 lru_add_drain_all();
1727 cond_resched(); 1727 cond_resched();
1728 drain_all_pages(); 1728 drain_all_pages(zone);
1729 } 1729 }
1730 1730
1731 pfn = scan_movable_pages(start_pfn, end_pfn); 1731 pfn = scan_movable_pages(start_pfn, end_pfn);
@@ -1747,7 +1747,7 @@ repeat:
1747 lru_add_drain_all(); 1747 lru_add_drain_all();
1748 yield(); 1748 yield();
1749 /* drain pcp pages, this is synchronous. */ 1749 /* drain pcp pages, this is synchronous. */
1750 drain_all_pages(); 1750 drain_all_pages(zone);
1751 /* 1751 /*
1752 * dissolve free hugepages in the memory block before doing offlining 1752 * dissolve free hugepages in the memory block before doing offlining
1753 * actually in order to make hugetlbfs's object counting consistent. 1753 * actually in order to make hugetlbfs's object counting consistent.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5340f6b91312..3b014d326151 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -119,7 +119,7 @@ found:
119 119
120/* return true if the task is not adequate as candidate victim task. */ 120/* return true if the task is not adequate as candidate victim task. */
121static bool oom_unkillable_task(struct task_struct *p, 121static bool oom_unkillable_task(struct task_struct *p,
122 const struct mem_cgroup *memcg, const nodemask_t *nodemask) 122 struct mem_cgroup *memcg, const nodemask_t *nodemask)
123{ 123{
124 if (is_global_init(p)) 124 if (is_global_init(p))
125 return true; 125 return true;
@@ -353,7 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, 353 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
354 * swapents, oom_score_adj value, and name. 354 * swapents, oom_score_adj value, and name.
355 */ 355 */
356static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 356static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
357{ 357{
358 struct task_struct *p; 358 struct task_struct *p;
359 struct task_struct *task; 359 struct task_struct *task;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 19ceae87522d..d5d81f5384d1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page)
2357 dec_zone_page_state(page, NR_WRITEBACK); 2357 dec_zone_page_state(page, NR_WRITEBACK);
2358 inc_zone_page_state(page, NR_WRITTEN); 2358 inc_zone_page_state(page, NR_WRITTEN);
2359 } 2359 }
2360 mem_cgroup_end_page_stat(memcg, locked, memcg_flags); 2360 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
2361 return ret; 2361 return ret;
2362} 2362}
2363 2363
@@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
2400 inc_zone_page_state(page, NR_WRITEBACK); 2400 inc_zone_page_state(page, NR_WRITEBACK);
2401 } 2401 }
2402 mem_cgroup_end_page_stat(memcg, locked, memcg_flags); 2402 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
2403 return ret; 2403 return ret;
2404 2404
2405} 2405}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 616a2c956b4b..a7198c065999 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,6 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 51#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 52#include <linux/kmemleak.h>
54#include <linux/compaction.h> 53#include <linux/compaction.h>
@@ -641,8 +640,10 @@ static inline int free_pages_check(struct page *page)
641 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; 640 bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
642 bad_flags = PAGE_FLAGS_CHECK_AT_FREE; 641 bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
643 } 642 }
644 if (unlikely(mem_cgroup_bad_page_check(page))) 643#ifdef CONFIG_MEMCG
645 bad_reason = "cgroup check failed"; 644 if (unlikely(page->mem_cgroup))
645 bad_reason = "page still charged to cgroup";
646#endif
646 if (unlikely(bad_reason)) { 647 if (unlikely(bad_reason)) {
647 bad_page(page, bad_reason, bad_flags); 648 bad_page(page, bad_reason, bad_flags);
648 return 1; 649 return 1;
@@ -741,6 +742,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
741 int i; 742 int i;
742 int bad = 0; 743 int bad = 0;
743 744
745 VM_BUG_ON_PAGE(PageTail(page), page);
746 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
747
744 trace_mm_page_free(page, order); 748 trace_mm_page_free(page, order);
745 kmemcheck_free_shadow(page, order); 749 kmemcheck_free_shadow(page, order);
746 750
@@ -898,8 +902,10 @@ static inline int check_new_page(struct page *page)
898 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; 902 bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
899 bad_flags = PAGE_FLAGS_CHECK_AT_PREP; 903 bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
900 } 904 }
901 if (unlikely(mem_cgroup_bad_page_check(page))) 905#ifdef CONFIG_MEMCG
902 bad_reason = "cgroup check failed"; 906 if (unlikely(page->mem_cgroup))
907 bad_reason = "page still charged to cgroup";
908#endif
903 if (unlikely(bad_reason)) { 909 if (unlikely(bad_reason)) {
904 bad_page(page, bad_reason, bad_flags); 910 bad_page(page, bad_reason, bad_flags);
905 return 1; 911 return 1;
@@ -1267,55 +1273,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1267#endif 1273#endif
1268 1274
1269/* 1275/*
1270 * Drain pages of the indicated processor. 1276 * Drain pcplists of the indicated processor and zone.
1271 * 1277 *
1272 * The processor must either be the current processor and the 1278 * The processor must either be the current processor and the
1273 * thread pinned to the current processor or a processor that 1279 * thread pinned to the current processor or a processor that
1274 * is not online. 1280 * is not online.
1275 */ 1281 */
1276static void drain_pages(unsigned int cpu) 1282static void drain_pages_zone(unsigned int cpu, struct zone *zone)
1277{ 1283{
1278 unsigned long flags; 1284 unsigned long flags;
1279 struct zone *zone; 1285 struct per_cpu_pageset *pset;
1286 struct per_cpu_pages *pcp;
1280 1287
1281 for_each_populated_zone(zone) { 1288 local_irq_save(flags);
1282 struct per_cpu_pageset *pset; 1289 pset = per_cpu_ptr(zone->pageset, cpu);
1283 struct per_cpu_pages *pcp;
1284 1290
1285 local_irq_save(flags); 1291 pcp = &pset->pcp;
1286 pset = per_cpu_ptr(zone->pageset, cpu); 1292 if (pcp->count) {
1293 free_pcppages_bulk(zone, pcp->count, pcp);
1294 pcp->count = 0;
1295 }
1296 local_irq_restore(flags);
1297}
1287 1298
1288 pcp = &pset->pcp; 1299/*
1289 if (pcp->count) { 1300 * Drain pcplists of all zones on the indicated processor.
1290 free_pcppages_bulk(zone, pcp->count, pcp); 1301 *
1291 pcp->count = 0; 1302 * The processor must either be the current processor and the
1292 } 1303 * thread pinned to the current processor or a processor that
1293 local_irq_restore(flags); 1304 * is not online.
1305 */
1306static void drain_pages(unsigned int cpu)
1307{
1308 struct zone *zone;
1309
1310 for_each_populated_zone(zone) {
1311 drain_pages_zone(cpu, zone);
1294 } 1312 }
1295} 1313}
1296 1314
1297/* 1315/*
1298 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 1316 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
1317 *
1318 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
1319 * the single zone's pages.
1299 */ 1320 */
1300void drain_local_pages(void *arg) 1321void drain_local_pages(struct zone *zone)
1301{ 1322{
1302 drain_pages(smp_processor_id()); 1323 int cpu = smp_processor_id();
1324
1325 if (zone)
1326 drain_pages_zone(cpu, zone);
1327 else
1328 drain_pages(cpu);
1303} 1329}
1304 1330
1305/* 1331/*
1306 * Spill all the per-cpu pages from all CPUs back into the buddy allocator. 1332 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
1307 * 1333 *
1334 * When zone parameter is non-NULL, spill just the single zone's pages.
1335 *
1308 * Note that this code is protected against sending an IPI to an offline 1336 * Note that this code is protected against sending an IPI to an offline
1309 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: 1337 * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
1310 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but 1338 * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
1311 * nothing keeps CPUs from showing up after we populated the cpumask and 1339 * nothing keeps CPUs from showing up after we populated the cpumask and
1312 * before the call to on_each_cpu_mask(). 1340 * before the call to on_each_cpu_mask().
1313 */ 1341 */
1314void drain_all_pages(void) 1342void drain_all_pages(struct zone *zone)
1315{ 1343{
1316 int cpu; 1344 int cpu;
1317 struct per_cpu_pageset *pcp;
1318 struct zone *zone;
1319 1345
1320 /* 1346 /*
1321 * Allocate in the BSS so we won't require allocation in 1347 * Allocate in the BSS so we won't require allocation in
@@ -1330,20 +1356,31 @@ void drain_all_pages(void)
1330 * disables preemption as part of its processing 1356 * disables preemption as part of its processing
1331 */ 1357 */
1332 for_each_online_cpu(cpu) { 1358 for_each_online_cpu(cpu) {
1359 struct per_cpu_pageset *pcp;
1360 struct zone *z;
1333 bool has_pcps = false; 1361 bool has_pcps = false;
1334 for_each_populated_zone(zone) { 1362
1363 if (zone) {
1335 pcp = per_cpu_ptr(zone->pageset, cpu); 1364 pcp = per_cpu_ptr(zone->pageset, cpu);
1336 if (pcp->pcp.count) { 1365 if (pcp->pcp.count)
1337 has_pcps = true; 1366 has_pcps = true;
1338 break; 1367 } else {
1368 for_each_populated_zone(z) {
1369 pcp = per_cpu_ptr(z->pageset, cpu);
1370 if (pcp->pcp.count) {
1371 has_pcps = true;
1372 break;
1373 }
1339 } 1374 }
1340 } 1375 }
1376
1341 if (has_pcps) 1377 if (has_pcps)
1342 cpumask_set_cpu(cpu, &cpus_with_pcps); 1378 cpumask_set_cpu(cpu, &cpus_with_pcps);
1343 else 1379 else
1344 cpumask_clear_cpu(cpu, &cpus_with_pcps); 1380 cpumask_clear_cpu(cpu, &cpus_with_pcps);
1345 } 1381 }
1346 on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); 1382 on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages,
1383 zone, 1);
1347} 1384}
1348 1385
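drain_local_pages() and drain_all_pages() now take an optional zone: a non-NULL zone drains only that zone's pcplists, NULL keeps the old drain-everything behaviour, and the IPI is still sent only to CPUs that actually have queued pages. A toy model of that calling convention (a negative index plays the role of the NULL zone):

#include <stdio.h>

#define TOY_CPUS  2
#define TOY_ZONES 3

/* Toy per-CPU, per-zone pcplist page counts. */
static unsigned int pcp_count[TOY_CPUS][TOY_ZONES] = {
        { 5, 0, 7 },
        { 0, 3, 1 },
};

/* Mirrors the new calling convention: a negative zone means "all zones",
 * like passing a NULL zone to drain_all_pages()/drain_local_pages(). */
static void toy_drain_cpu(int cpu, int zone)
{
        if (zone >= 0) {
                pcp_count[cpu][zone] = 0;
                return;
        }
        for (int z = 0; z < TOY_ZONES; z++)
                pcp_count[cpu][z] = 0;
}

static void toy_drain_all(int zone)
{
        for (int cpu = 0; cpu < TOY_CPUS; cpu++) {
                /* the kernel first checks whether this CPU has anything queued
                 * (for the requested zone, or any zone) before sending an IPI */
                toy_drain_cpu(cpu, zone);
        }
}

int main(void)
{
        toy_drain_all(2);       /* drain a single zone everywhere */
        printf("cpu0 zone0=%u zone2=%u\n", pcp_count[0][0], pcp_count[0][2]);
        toy_drain_all(-1);      /* NULL zone: drain everything */
        printf("cpu0 zone0=%u cpu1 zone1=%u\n", pcp_count[0][0], pcp_count[1][1]);
        return 0;
}

The per-zone variant matters to callers such as memory offlining and memory-failure handling, which care about a single zone and can skip draining the rest.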
1349#ifdef CONFIG_HIBERNATION 1386#ifdef CONFIG_HIBERNATION
@@ -1705,7 +1742,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
1705 unsigned long mark, int classzone_idx, int alloc_flags, 1742 unsigned long mark, int classzone_idx, int alloc_flags,
1706 long free_pages) 1743 long free_pages)
1707{ 1744{
1708 /* free_pages my go negative - that's OK */ 1745 /* free_pages may go negative - that's OK */
1709 long min = mark; 1746 long min = mark;
1710 int o; 1747 int o;
1711 long free_cma = 0; 1748 long free_cma = 0;
@@ -2296,7 +2333,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2296 int classzone_idx, int migratetype, enum migrate_mode mode, 2333 int classzone_idx, int migratetype, enum migrate_mode mode,
2297 int *contended_compaction, bool *deferred_compaction) 2334 int *contended_compaction, bool *deferred_compaction)
2298{ 2335{
2299 struct zone *last_compact_zone = NULL;
2300 unsigned long compact_result; 2336 unsigned long compact_result;
2301 struct page *page; 2337 struct page *page;
2302 2338
@@ -2307,7 +2343,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2307 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2343 compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
2308 nodemask, mode, 2344 nodemask, mode,
2309 contended_compaction, 2345 contended_compaction,
2310 &last_compact_zone); 2346 alloc_flags, classzone_idx);
2311 current->flags &= ~PF_MEMALLOC; 2347 current->flags &= ~PF_MEMALLOC;
2312 2348
2313 switch (compact_result) { 2349 switch (compact_result) {
@@ -2326,10 +2362,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2326 */ 2362 */
2327 count_vm_event(COMPACTSTALL); 2363 count_vm_event(COMPACTSTALL);
2328 2364
2329 /* Page migration frees to the PCP lists but we want merging */
2330 drain_pages(get_cpu());
2331 put_cpu();
2332
2333 page = get_page_from_freelist(gfp_mask, nodemask, 2365 page = get_page_from_freelist(gfp_mask, nodemask,
2334 order, zonelist, high_zoneidx, 2366 order, zonelist, high_zoneidx,
2335 alloc_flags & ~ALLOC_NO_WATERMARKS, 2367 alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2345,14 +2377,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2345 } 2377 }
2346 2378
2347 /* 2379 /*
2348 * last_compact_zone is where try_to_compact_pages thought allocation
2349 * should succeed, so it did not defer compaction. But here we know
2350 * that it didn't succeed, so we do the defer.
2351 */
2352 if (last_compact_zone && mode != MIGRATE_ASYNC)
2353 defer_compaction(last_compact_zone, order);
2354
2355 /*
2356 * It's bad if compaction run occurs and fails. The most likely reason 2380 * It's bad if compaction run occurs and fails. The most likely reason
2357 * is that pages exist, but not enough to satisfy watermarks. 2381 * is that pages exist, but not enough to satisfy watermarks.
2358 */ 2382 */
@@ -2433,7 +2457,7 @@ retry:
2433 * pages are pinned on the per-cpu lists. Drain them and try again 2457 * pages are pinned on the per-cpu lists. Drain them and try again
2434 */ 2458 */
2435 if (!page && !drained) { 2459 if (!page && !drained) {
2436 drain_all_pages(); 2460 drain_all_pages(NULL);
2437 drained = true; 2461 drained = true;
2438 goto retry; 2462 goto retry;
2439 } 2463 }
@@ -3893,14 +3917,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3893 else 3917 else
3894 page_group_by_mobility_disabled = 0; 3918 page_group_by_mobility_disabled = 0;
3895 3919
3896 printk("Built %i zonelists in %s order, mobility grouping %s. " 3920 pr_info("Built %i zonelists in %s order, mobility grouping %s. "
3897 "Total pages: %ld\n", 3921 "Total pages: %ld\n",
3898 nr_online_nodes, 3922 nr_online_nodes,
3899 zonelist_order_name[current_zonelist_order], 3923 zonelist_order_name[current_zonelist_order],
3900 page_group_by_mobility_disabled ? "off" : "on", 3924 page_group_by_mobility_disabled ? "off" : "on",
3901 vm_total_pages); 3925 vm_total_pages);
3902#ifdef CONFIG_NUMA 3926#ifdef CONFIG_NUMA
3903 printk("Policy zone: %s\n", zone_names[policy_zone]); 3927 pr_info("Policy zone: %s\n", zone_names[policy_zone]);
3904#endif 3928#endif
3905} 3929}
3906 3930
@@ -4832,7 +4856,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4832#endif 4856#endif
4833 init_waitqueue_head(&pgdat->kswapd_wait); 4857 init_waitqueue_head(&pgdat->kswapd_wait);
4834 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4858 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4835 pgdat_page_cgroup_init(pgdat);
4836 4859
4837 for (j = 0; j < MAX_NR_ZONES; j++) { 4860 for (j = 0; j < MAX_NR_ZONES; j++) {
4838 struct zone *zone = pgdat->node_zones + j; 4861 struct zone *zone = pgdat->node_zones + j;
@@ -5334,33 +5357,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5334 find_zone_movable_pfns_for_nodes(); 5357 find_zone_movable_pfns_for_nodes();
5335 5358
5336 /* Print out the zone ranges */ 5359 /* Print out the zone ranges */
5337 printk("Zone ranges:\n"); 5360 pr_info("Zone ranges:\n");
5338 for (i = 0; i < MAX_NR_ZONES; i++) { 5361 for (i = 0; i < MAX_NR_ZONES; i++) {
5339 if (i == ZONE_MOVABLE) 5362 if (i == ZONE_MOVABLE)
5340 continue; 5363 continue;
5341 printk(KERN_CONT " %-8s ", zone_names[i]); 5364 pr_info(" %-8s ", zone_names[i]);
5342 if (arch_zone_lowest_possible_pfn[i] == 5365 if (arch_zone_lowest_possible_pfn[i] ==
5343 arch_zone_highest_possible_pfn[i]) 5366 arch_zone_highest_possible_pfn[i])
5344 printk(KERN_CONT "empty\n"); 5367 pr_cont("empty\n");
5345 else 5368 else
5346 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", 5369 pr_cont("[mem %0#10lx-%0#10lx]\n",
5347 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5370 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
5348 (arch_zone_highest_possible_pfn[i] 5371 (arch_zone_highest_possible_pfn[i]
5349 << PAGE_SHIFT) - 1); 5372 << PAGE_SHIFT) - 1);
5350 } 5373 }
5351 5374
5352 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 5375 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
5353 printk("Movable zone start for each node\n"); 5376 pr_info("Movable zone start for each node\n");
5354 for (i = 0; i < MAX_NUMNODES; i++) { 5377 for (i = 0; i < MAX_NUMNODES; i++) {
5355 if (zone_movable_pfn[i]) 5378 if (zone_movable_pfn[i])
5356 printk(" Node %d: %#010lx\n", i, 5379 pr_info(" Node %d: %#010lx\n", i,
5357 zone_movable_pfn[i] << PAGE_SHIFT); 5380 zone_movable_pfn[i] << PAGE_SHIFT);
5358 } 5381 }
5359 5382
5360 /* Print out the early node map */ 5383 /* Print out the early node map */
5361 printk("Early memory node ranges\n"); 5384 pr_info("Early memory node ranges\n");
5362 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5385 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5363 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5386 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
5364 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5387 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
5365 5388
5366 /* Initialise every node */ 5389 /* Initialise every node */
@@ -5496,7 +5519,7 @@ void __init mem_init_print_info(const char *str)
5496 5519
5497#undef adj_init_size 5520#undef adj_init_size
5498 5521
5499 printk("Memory: %luK/%luK available " 5522 pr_info("Memory: %luK/%luK available "
5500 "(%luK kernel code, %luK rwdata, %luK rodata, " 5523 "(%luK kernel code, %luK rwdata, %luK rodata, "
5501 "%luK init, %luK bss, %luK reserved" 5524 "%luK init, %luK bss, %luK reserved"
5502#ifdef CONFIG_HIGHMEM 5525#ifdef CONFIG_HIGHMEM
@@ -6385,7 +6408,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
6385 */ 6408 */
6386 6409
6387 lru_add_drain_all(); 6410 lru_add_drain_all();
6388 drain_all_pages(); 6411 drain_all_pages(cc.zone);
6389 6412
6390 order = 0; 6413 order = 0;
6391 outer_start = start; 6414 outer_start = start;
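
Note on the drain_all_pages() changes in this file: the helper now takes a zone argument, so callers that know which zone they care about (alloc_contig_range() above with cc.zone, the page isolation path further down) can restrict the drain, while passing NULL in the -2433/+2457 hunk keeps the old drain-everything behaviour for the allocation retry. As a loose illustration of that drain-and-retry shape, here is a minimal userspace sketch; the names and the single-threaded "pool"/"cache" pair are invented for the demo and are not kernel code:

#include <stdbool.h>
#include <stdio.h>

static int shared_pool;    /* free blocks visible to everyone      */
static int local_cache;    /* blocks parked in a pcp-style cache   */

static bool take_from_pool(void)
{
	if (shared_pool > 0) {
		shared_pool--;
		return true;
	}
	return false;
}

static void drain_local_cache(void)
{
	/* give cached blocks back so they can be handed out (or merged) */
	shared_pool += local_cache;
	local_cache = 0;
}

static bool alloc_block(void)
{
	bool drained = false;

retry:
	if (take_from_pool())
		return true;
	if (!drained) {            /* same shape as the !page && !drained check */
		drain_local_cache();
		drained = true;
		goto retry;
	}
	return false;
}

int main(void)
{
	shared_pool = 0;
	local_cache = 2;
	printf("allocation %s\n", alloc_block() ? "succeeded" : "failed");
	return 0;
}

The point of the toy is only the control flow: drain once, retry once, then give up and fall through to the slower paths.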
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
deleted file mode 100644
index 5331c2bd85a2..000000000000
--- a/mm/page_cgroup.c
+++ /dev/null
@@ -1,530 +0,0 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11#include <linux/swapops.h>
12#include <linux/kmemleak.h>
13
14static unsigned long total_usage;
15
16#if !defined(CONFIG_SPARSEMEM)
17
18
19void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
20{
21 pgdat->node_page_cgroup = NULL;
22}
23
24struct page_cgroup *lookup_page_cgroup(struct page *page)
25{
26 unsigned long pfn = page_to_pfn(page);
27 unsigned long offset;
28 struct page_cgroup *base;
29
30 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
31#ifdef CONFIG_DEBUG_VM
32 /*
33 * The sanity checks the page allocator does upon freeing a
34 * page can reach here before the page_cgroup arrays are
35 * allocated when feeding a range of pages to the allocator
36 * for the first time during bootup or memory hotplug.
37 */
38 if (unlikely(!base))
39 return NULL;
40#endif
41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
42 return base + offset;
43}
44
45static int __init alloc_node_page_cgroup(int nid)
46{
47 struct page_cgroup *base;
48 unsigned long table_size;
49 unsigned long nr_pages;
50
51 nr_pages = NODE_DATA(nid)->node_spanned_pages;
52 if (!nr_pages)
53 return 0;
54
55 table_size = sizeof(struct page_cgroup) * nr_pages;
56
57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
60 if (!base)
61 return -ENOMEM;
62 NODE_DATA(nid)->node_page_cgroup = base;
63 total_usage += table_size;
64 return 0;
65}
66
67void __init page_cgroup_init_flatmem(void)
68{
69
70 int nid, fail;
71
72 if (mem_cgroup_disabled())
73 return;
74
75 for_each_online_node(nid) {
76 fail = alloc_node_page_cgroup(nid);
77 if (fail)
78 goto fail;
79 }
80 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
81 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
82 " don't want memory cgroups\n");
83 return;
84fail:
85 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
86 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
87 panic("Out of memory");
88}
89
90#else /* CONFIG_FLAT_NODE_MEM_MAP */
91
92struct page_cgroup *lookup_page_cgroup(struct page *page)
93{
94 unsigned long pfn = page_to_pfn(page);
95 struct mem_section *section = __pfn_to_section(pfn);
96#ifdef CONFIG_DEBUG_VM
97 /*
98 * The sanity checks the page allocator does upon freeing a
99 * page can reach here before the page_cgroup arrays are
100 * allocated when feeding a range of pages to the allocator
101 * for the first time during bootup or memory hotplug.
102 */
103 if (!section->page_cgroup)
104 return NULL;
105#endif
106 return section->page_cgroup + pfn;
107}
108
109static void *__meminit alloc_page_cgroup(size_t size, int nid)
110{
111 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
112 void *addr = NULL;
113
114 addr = alloc_pages_exact_nid(nid, size, flags);
115 if (addr) {
116 kmemleak_alloc(addr, size, 1, flags);
117 return addr;
118 }
119
120 if (node_state(nid, N_HIGH_MEMORY))
121 addr = vzalloc_node(size, nid);
122 else
123 addr = vzalloc(size);
124
125 return addr;
126}
127
128static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
129{
130 struct mem_section *section;
131 struct page_cgroup *base;
132 unsigned long table_size;
133
134 section = __pfn_to_section(pfn);
135
136 if (section->page_cgroup)
137 return 0;
138
139 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
140 base = alloc_page_cgroup(table_size, nid);
141
142 /*
143 * The value stored in section->page_cgroup is (base - pfn)
144 * and it does not point to the memory block allocated above,
145 * causing kmemleak false positives.
146 */
147 kmemleak_not_leak(base);
148
149 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n");
151 return -ENOMEM;
152 }
153
154 /*
155 * The passed "pfn" may not be aligned to SECTION. For the calculation
156 * we need to apply a mask.
157 */
158 pfn &= PAGE_SECTION_MASK;
159 section->page_cgroup = base - pfn;
160 total_usage += table_size;
161 return 0;
162}
163#ifdef CONFIG_MEMORY_HOTPLUG
164static void free_page_cgroup(void *addr)
165{
166 if (is_vmalloc_addr(addr)) {
167 vfree(addr);
168 } else {
169 struct page *page = virt_to_page(addr);
170 size_t table_size =
171 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
172
173 BUG_ON(PageReserved(page));
174 kmemleak_free(addr);
175 free_pages_exact(addr, table_size);
176 }
177}
178
179static void __free_page_cgroup(unsigned long pfn)
180{
181 struct mem_section *ms;
182 struct page_cgroup *base;
183
184 ms = __pfn_to_section(pfn);
185 if (!ms || !ms->page_cgroup)
186 return;
187 base = ms->page_cgroup + pfn;
188 free_page_cgroup(base);
189 ms->page_cgroup = NULL;
190}
191
192static int __meminit online_page_cgroup(unsigned long start_pfn,
193 unsigned long nr_pages,
194 int nid)
195{
196 unsigned long start, end, pfn;
197 int fail = 0;
198
199 start = SECTION_ALIGN_DOWN(start_pfn);
200 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
201
202 if (nid == -1) {
203 /*
204 * In this case, "nid" already exists and contains valid memory.
205 * "start_pfn" passed to us is a pfn which is an arg for
206 * online__pages(), and start_pfn should exist.
207 */
208 nid = pfn_to_nid(start_pfn);
209 VM_BUG_ON(!node_state(nid, N_ONLINE));
210 }
211
212 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
213 if (!pfn_present(pfn))
214 continue;
215 fail = init_section_page_cgroup(pfn, nid);
216 }
217 if (!fail)
218 return 0;
219
220 /* rollback */
221 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
222 __free_page_cgroup(pfn);
223
224 return -ENOMEM;
225}
226
227static int __meminit offline_page_cgroup(unsigned long start_pfn,
228 unsigned long nr_pages, int nid)
229{
230 unsigned long start, end, pfn;
231
232 start = SECTION_ALIGN_DOWN(start_pfn);
233 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
234
235 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
236 __free_page_cgroup(pfn);
237 return 0;
238
239}
240
241static int __meminit page_cgroup_callback(struct notifier_block *self,
242 unsigned long action, void *arg)
243{
244 struct memory_notify *mn = arg;
245 int ret = 0;
246 switch (action) {
247 case MEM_GOING_ONLINE:
248 ret = online_page_cgroup(mn->start_pfn,
249 mn->nr_pages, mn->status_change_nid);
250 break;
251 case MEM_OFFLINE:
252 offline_page_cgroup(mn->start_pfn,
253 mn->nr_pages, mn->status_change_nid);
254 break;
255 case MEM_CANCEL_ONLINE:
256 offline_page_cgroup(mn->start_pfn,
257 mn->nr_pages, mn->status_change_nid);
258 break;
259 case MEM_GOING_OFFLINE:
260 break;
261 case MEM_ONLINE:
262 case MEM_CANCEL_OFFLINE:
263 break;
264 }
265
266 return notifier_from_errno(ret);
267}
268
269#endif
270
271void __init page_cgroup_init(void)
272{
273 unsigned long pfn;
274 int nid;
275
276 if (mem_cgroup_disabled())
277 return;
278
279 for_each_node_state(nid, N_MEMORY) {
280 unsigned long start_pfn, end_pfn;
281
282 start_pfn = node_start_pfn(nid);
283 end_pfn = node_end_pfn(nid);
284 /*
285 * start_pfn and end_pfn may not be aligned to SECTION and the
286 * page->flags of out of node pages are not initialized. So we
287 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
288 */
289 for (pfn = start_pfn;
290 pfn < end_pfn;
291 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
292
293 if (!pfn_valid(pfn))
294 continue;
295 /*
296 * Nodes's pfns can be overlapping.
297 * We know some arch can have a nodes layout such as
298 * -------------pfn-------------->
299 * N0 | N1 | N2 | N0 | N1 | N2|....
300 */
301 if (pfn_to_nid(pfn) != nid)
302 continue;
303 if (init_section_page_cgroup(pfn, nid))
304 goto oom;
305 }
306 }
307 hotplug_memory_notifier(page_cgroup_callback, 0);
308 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
309 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
310 "don't want memory cgroups\n");
311 return;
312oom:
313 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
314 panic("Out of memory");
315}
316
317void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
318{
319 return;
320}
321
322#endif
323
324
325#ifdef CONFIG_MEMCG_SWAP
326
327static DEFINE_MUTEX(swap_cgroup_mutex);
328struct swap_cgroup_ctrl {
329 struct page **map;
330 unsigned long length;
331 spinlock_t lock;
332};
333
334static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
335
336struct swap_cgroup {
337 unsigned short id;
338};
339#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
340
341/*
342 * SwapCgroup implements "lookup" and "exchange" operations.
343 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
344 * against SwapCache. At swap_free(), this is accessed directly from swap.
345 *
346 * This means,
347 * - we have no race in "exchange" when we're accessed via SwapCache because
348 * SwapCache(and its swp_entry) is under lock.
349 * - When called via swap_free(), there is no user of this entry and no race.
350 * Then, we don't need lock around "exchange".
351 *
352 * TODO: we can push these buffers out to HIGHMEM.
353 */
354
355/*
356 * allocate buffer for swap_cgroup.
357 */
358static int swap_cgroup_prepare(int type)
359{
360 struct page *page;
361 struct swap_cgroup_ctrl *ctrl;
362 unsigned long idx, max;
363
364 ctrl = &swap_cgroup_ctrl[type];
365
366 for (idx = 0; idx < ctrl->length; idx++) {
367 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
368 if (!page)
369 goto not_enough_page;
370 ctrl->map[idx] = page;
371 }
372 return 0;
373not_enough_page:
374 max = idx;
375 for (idx = 0; idx < max; idx++)
376 __free_page(ctrl->map[idx]);
377
378 return -ENOMEM;
379}
380
381static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
382 struct swap_cgroup_ctrl **ctrlp)
383{
384 pgoff_t offset = swp_offset(ent);
385 struct swap_cgroup_ctrl *ctrl;
386 struct page *mappage;
387 struct swap_cgroup *sc;
388
389 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
390 if (ctrlp)
391 *ctrlp = ctrl;
392
393 mappage = ctrl->map[offset / SC_PER_PAGE];
394 sc = page_address(mappage);
395 return sc + offset % SC_PER_PAGE;
396}
397
398/**
399 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
400 * @ent: swap entry to be cmpxchged
401 * @old: old id
402 * @new: new id
403 *
404 * Returns old id at success, 0 at failure.
405 * (There is no mem_cgroup using 0 as its id)
406 */
407unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
408 unsigned short old, unsigned short new)
409{
410 struct swap_cgroup_ctrl *ctrl;
411 struct swap_cgroup *sc;
412 unsigned long flags;
413 unsigned short retval;
414
415 sc = lookup_swap_cgroup(ent, &ctrl);
416
417 spin_lock_irqsave(&ctrl->lock, flags);
418 retval = sc->id;
419 if (retval == old)
420 sc->id = new;
421 else
422 retval = 0;
423 spin_unlock_irqrestore(&ctrl->lock, flags);
424 return retval;
425}
426
427/**
428 * swap_cgroup_record - record mem_cgroup for this swp_entry.
429 * @ent: swap entry to be recorded into
430 * @id: mem_cgroup to be recorded
431 *
432 * Returns old value at success, 0 at failure.
433 * (Of course, old value can be 0.)
434 */
435unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
436{
437 struct swap_cgroup_ctrl *ctrl;
438 struct swap_cgroup *sc;
439 unsigned short old;
440 unsigned long flags;
441
442 sc = lookup_swap_cgroup(ent, &ctrl);
443
444 spin_lock_irqsave(&ctrl->lock, flags);
445 old = sc->id;
446 sc->id = id;
447 spin_unlock_irqrestore(&ctrl->lock, flags);
448
449 return old;
450}
451
452/**
453 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
454 * @ent: swap entry to be looked up.
455 *
456 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
457 */
458unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
459{
460 return lookup_swap_cgroup(ent, NULL)->id;
461}
462
463int swap_cgroup_swapon(int type, unsigned long max_pages)
464{
465 void *array;
466 unsigned long array_size;
467 unsigned long length;
468 struct swap_cgroup_ctrl *ctrl;
469
470 if (!do_swap_account)
471 return 0;
472
473 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
474 array_size = length * sizeof(void *);
475
476 array = vzalloc(array_size);
477 if (!array)
478 goto nomem;
479
480 ctrl = &swap_cgroup_ctrl[type];
481 mutex_lock(&swap_cgroup_mutex);
482 ctrl->length = length;
483 ctrl->map = array;
484 spin_lock_init(&ctrl->lock);
485 if (swap_cgroup_prepare(type)) {
486 /* memory shortage */
487 ctrl->map = NULL;
488 ctrl->length = 0;
489 mutex_unlock(&swap_cgroup_mutex);
490 vfree(array);
491 goto nomem;
492 }
493 mutex_unlock(&swap_cgroup_mutex);
494
495 return 0;
496nomem:
497 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
498 printk(KERN_INFO
499 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
500 return -ENOMEM;
501}
502
503void swap_cgroup_swapoff(int type)
504{
505 struct page **map;
506 unsigned long i, length;
507 struct swap_cgroup_ctrl *ctrl;
508
509 if (!do_swap_account)
510 return;
511
512 mutex_lock(&swap_cgroup_mutex);
513 ctrl = &swap_cgroup_ctrl[type];
514 map = ctrl->map;
515 length = ctrl->length;
516 ctrl->map = NULL;
517 ctrl->length = 0;
518 mutex_unlock(&swap_cgroup_mutex);
519
520 if (map) {
521 for (i = 0; i < length; i++) {
522 struct page *page = map[i];
523 if (page)
524 __free_page(page);
525 }
526 vfree(map);
527 }
528}
529
530#endif
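
A note on the lookup scheme the deleted file used: each memory section kept a flat array of struct page_cgroup, and the stored pointer was pre-biased by the section's first pfn ("section->page_cgroup = base - pfn"), so lookup_page_cgroup() could index with the raw pfn; that bias is also why the kmemleak_not_leak() annotation was needed, since the stored value does not point into the allocation. A small userspace sketch of the same idea follows; the names, the section size, and the explicit (pfn - section_start) form are all demo choices, kept portable rather than reproducing the kernel's biased pointer:

#include <stdio.h>
#include <stdlib.h>

#define PAGES_PER_SECTION 1024UL        /* hypothetical section size */

struct meta { unsigned long flags; };   /* stand-in for struct page_cgroup */

static struct meta *section_base;       /* table covering one section */
static unsigned long section_start;     /* first pfn of that section  */

static int init_section(unsigned long pfn)
{
	section_start = pfn & ~(PAGES_PER_SECTION - 1);   /* section alignment */
	section_base = calloc(PAGES_PER_SECTION, sizeof(*section_base));
	return section_base ? 0 : -1;
}

static struct meta *lookup(unsigned long pfn)
{
	/*
	 * The deleted kernel code stored "base - start_pfn" directly, so
	 * the lookup was just "section->page_cgroup + pfn".  The demo
	 * keeps the bias explicit to stay within portable C.
	 */
	return section_base + (pfn - section_start);
}

int main(void)
{
	if (init_section(5000))
		return 1;
	lookup(5003)->flags = 0x1;
	printf("flags at pfn 5003: %#lx\n", lookup(5003)->flags);
	return 0;
}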
diff --git a/mm/page_counter.c b/mm/page_counter.c
new file mode 100644
index 000000000000..a009574fbba9
--- /dev/null
+++ b/mm/page_counter.c
@@ -0,0 +1,192 @@
1/*
2 * Lockless hierarchical page accounting & limiting
3 *
4 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
5 */
6
7#include <linux/page_counter.h>
8#include <linux/atomic.h>
9#include <linux/kernel.h>
10#include <linux/string.h>
11#include <linux/sched.h>
12#include <linux/bug.h>
13#include <asm/page.h>
14
15/**
16 * page_counter_cancel - take pages out of the local counter
17 * @counter: counter
18 * @nr_pages: number of pages to cancel
19 */
20void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
21{
22 long new;
23
24 new = atomic_long_sub_return(nr_pages, &counter->count);
25 /* More uncharges than charges? */
26 WARN_ON_ONCE(new < 0);
27}
28
29/**
30 * page_counter_charge - hierarchically charge pages
31 * @counter: counter
32 * @nr_pages: number of pages to charge
33 *
34 * NOTE: This does not consider any configured counter limits.
35 */
36void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
37{
38 struct page_counter *c;
39
40 for (c = counter; c; c = c->parent) {
41 long new;
42
43 new = atomic_long_add_return(nr_pages, &c->count);
44 /*
45 * This is indeed racy, but we can live with some
46 * inaccuracy in the watermark.
47 */
48 if (new > c->watermark)
49 c->watermark = new;
50 }
51}
52
53/**
54 * page_counter_try_charge - try to hierarchically charge pages
55 * @counter: counter
56 * @nr_pages: number of pages to charge
57 * @fail: points first counter to hit its limit, if any
58 *
59 * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
60 * its ancestors has hit its configured limit.
61 */
62int page_counter_try_charge(struct page_counter *counter,
63 unsigned long nr_pages,
64 struct page_counter **fail)
65{
66 struct page_counter *c;
67
68 for (c = counter; c; c = c->parent) {
69 long new;
70 /*
71 * Charge speculatively to avoid an expensive CAS. If
72 * a bigger charge fails, it might falsely lock out a
73 * racing smaller charge and send it into reclaim
74 * early, but the error is limited to the difference
75 * between the two sizes, which is less than 2M/4M in
76 * case of a THP locking out a regular page charge.
77 *
78 * The atomic_long_add_return() implies a full memory
79 * barrier between incrementing the count and reading
80 * the limit. When racing with page_counter_limit(),
81 * we either see the new limit or the setter sees the
82 * counter has changed and retries.
83 */
84 new = atomic_long_add_return(nr_pages, &c->count);
85 if (new > c->limit) {
86 atomic_long_sub(nr_pages, &c->count);
87 /*
88 * This is racy, but we can live with some
89 * inaccuracy in the failcnt.
90 */
91 c->failcnt++;
92 *fail = c;
93 goto failed;
94 }
95 /*
96 * Just like with failcnt, we can live with some
97 * inaccuracy in the watermark.
98 */
99 if (new > c->watermark)
100 c->watermark = new;
101 }
102 return 0;
103
104failed:
105 for (c = counter; c != *fail; c = c->parent)
106 page_counter_cancel(c, nr_pages);
107
108 return -ENOMEM;
109}
110
111/**
112 * page_counter_uncharge - hierarchically uncharge pages
113 * @counter: counter
114 * @nr_pages: number of pages to uncharge
115 */
116void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
117{
118 struct page_counter *c;
119
120 for (c = counter; c; c = c->parent)
121 page_counter_cancel(c, nr_pages);
122}
123
124/**
125 * page_counter_limit - limit the number of pages allowed
126 * @counter: counter
127 * @limit: limit to set
128 *
129 * Returns 0 on success, -EBUSY if the current number of pages on the
130 * counter already exceeds the specified limit.
131 *
132 * The caller must serialize invocations on the same counter.
133 */
134int page_counter_limit(struct page_counter *counter, unsigned long limit)
135{
136 for (;;) {
137 unsigned long old;
138 long count;
139
140 /*
141 * Update the limit while making sure that it's not
142 * below the concurrently-changing counter value.
143 *
144 * The xchg implies two full memory barriers before
145 * and after, so the read-swap-read is ordered and
146 * ensures coherency with page_counter_try_charge():
147 * that function modifies the count before checking
148 * the limit, so if it sees the old limit, we see the
149 * modified counter and retry.
150 */
151 count = atomic_long_read(&counter->count);
152
153 if (count > limit)
154 return -EBUSY;
155
156 old = xchg(&counter->limit, limit);
157
158 if (atomic_long_read(&counter->count) <= count)
159 return 0;
160
161 counter->limit = old;
162 cond_resched();
163 }
164}
165
166/**
167 * page_counter_memparse - memparse() for page counter limits
168 * @buf: string to parse
169 * @nr_pages: returns the result in number of pages
170 *
171 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
172 * limited to %PAGE_COUNTER_MAX.
173 */
174int page_counter_memparse(const char *buf, unsigned long *nr_pages)
175{
176 char unlimited[] = "-1";
177 char *end;
178 u64 bytes;
179
180 if (!strncmp(buf, unlimited, sizeof(unlimited))) {
181 *nr_pages = PAGE_COUNTER_MAX;
182 return 0;
183 }
184
185 bytes = memparse(buf, &end);
186 if (*end != '\0')
187 return -EINVAL;
188
189 *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
190
191 return 0;
192}
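
The comment block in page_counter_try_charge() above describes the central trick: charge speculatively with one atomic add, compare the post-add value against the limit, and unwind on failure. A compact userspace rendering of that flow, using C11 atomics; the struct and function names are invented for the sketch and it is not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

struct counter {
	atomic_long count;
	atomic_long limit;
	struct counter *parent;
};

static int try_charge(struct counter *c, long nr, struct counter **fail)
{
	struct counter *p;

	for (p = c; p; p = p->parent) {
		long new = atomic_fetch_add(&p->count, nr) + nr;

		if (new > atomic_load(&p->limit)) {
			atomic_fetch_sub(&p->count, nr);   /* undo this level */
			*fail = p;
			goto unwind;
		}
	}
	return 0;

unwind:
	/* back out of the ancestors that were already charged */
	for (p = c; p != *fail; p = p->parent)
		atomic_fetch_sub(&p->count, nr);
	return -1;
}

int main(void)
{
	struct counter root  = { .limit = 4,   .count = 0, .parent = NULL };
	struct counter child = { .limit = 100, .count = 0, .parent = &root };
	struct counter *fail = NULL;

	printf("first charge:  %d\n", try_charge(&child, 3, &fail)); /* 0: fits      */
	printf("second charge: %d\n", try_charge(&child, 3, &fail)); /* -1: root full */
	printf("child count after rollback: %ld\n",
	       atomic_load(&child.count));                           /* back to 3    */
	return 0;
}

The same toy types can express the page_counter_limit() retry loop described in its comment: snapshot the count, publish the new limit with an exchange, then re-read the count and revert if a concurrent charge overtook the snapshot (the "read-swap-read" ordering). Again a sketch only, with -1 standing in for -EBUSY:

static int set_limit(struct counter *c, long limit)
{
	for (;;) {
		long old, count;

		count = atomic_load(&c->count);
		if (count > limit)
			return -1;                 /* -EBUSY in the kernel      */

		old = atomic_exchange(&c->limit, limit);

		if (atomic_load(&c->count) <= count)
			return 0;                  /* no charge raced past us   */

		atomic_store(&c->limit, old);      /* revert and retry          */
	}
}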
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c8778f7e208e..72f5ac381ab3 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -68,7 +68,7 @@ out:
68 68
69 spin_unlock_irqrestore(&zone->lock, flags); 69 spin_unlock_irqrestore(&zone->lock, flags);
70 if (!ret) 70 if (!ret)
71 drain_all_pages(); 71 drain_all_pages(zone);
72 return ret; 72 return ret;
73} 73}
74 74
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e4c7213210c..45eba36fd673 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1053,7 +1053,7 @@ void page_add_file_rmap(struct page *page)
1053 __inc_zone_page_state(page, NR_FILE_MAPPED); 1053 __inc_zone_page_state(page, NR_FILE_MAPPED);
1054 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1054 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1055 } 1055 }
1056 mem_cgroup_end_page_stat(memcg, locked, flags); 1056 mem_cgroup_end_page_stat(memcg, &locked, &flags);
1057} 1057}
1058 1058
1059static void page_remove_file_rmap(struct page *page) 1059static void page_remove_file_rmap(struct page *page)
@@ -1083,7 +1083,7 @@ static void page_remove_file_rmap(struct page *page)
1083 if (unlikely(PageMlocked(page))) 1083 if (unlikely(PageMlocked(page)))
1084 clear_page_mlock(page); 1084 clear_page_mlock(page);
1085out: 1085out:
1086 mem_cgroup_end_page_stat(memcg, locked, flags); 1086 mem_cgroup_end_page_stat(memcg, &locked, &flags);
1087} 1087}
1088 1088
1089/** 1089/**
diff --git a/mm/slab.c b/mm/slab.c
index f34e053ec46e..79e15f0a2a6e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep,
2590 * Be lazy and only check for valid flags here, keeping it out of the 2590 * Be lazy and only check for valid flags here, keeping it out of the
2591 * critical path in kmem_cache_alloc(). 2591 * critical path in kmem_cache_alloc().
2592 */ 2592 */
2593 BUG_ON(flags & GFP_SLAB_BUG_MASK); 2593 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
2594 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
2595 BUG();
2596 }
2594 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2597 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2595 2598
2596 /* Take the node list lock to change the colour_next on this node */ 2599 /* Take the node list lock to change the colour_next on this node */
@@ -3580,11 +3583,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
3580 3583
3581 for_each_online_node(node) { 3584 for_each_online_node(node) {
3582 3585
3583 if (use_alien_caches) { 3586 if (use_alien_caches) {
3584 new_alien = alloc_alien_cache(node, cachep->limit, gfp); 3587 new_alien = alloc_alien_cache(node, cachep->limit, gfp);
3585 if (!new_alien) 3588 if (!new_alien)
3586 goto fail; 3589 goto fail;
3587 } 3590 }
3588 3591
3589 new_shared = NULL; 3592 new_shared = NULL;
3590 if (cachep->shared) { 3593 if (cachep->shared) {
@@ -4043,12 +4046,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4043 4046
4044#ifdef CONFIG_DEBUG_SLAB_LEAK 4047#ifdef CONFIG_DEBUG_SLAB_LEAK
4045 4048
4046static void *leaks_start(struct seq_file *m, loff_t *pos)
4047{
4048 mutex_lock(&slab_mutex);
4049 return seq_list_start(&slab_caches, *pos);
4050}
4051
4052static inline int add_caller(unsigned long *n, unsigned long v) 4049static inline int add_caller(unsigned long *n, unsigned long v)
4053{ 4050{
4054 unsigned long *p; 4051 unsigned long *p;
@@ -4170,7 +4167,7 @@ static int leaks_show(struct seq_file *m, void *p)
4170} 4167}
4171 4168
4172static const struct seq_operations slabstats_op = { 4169static const struct seq_operations slabstats_op = {
4173 .start = leaks_start, 4170 .start = slab_start,
4174 .next = slab_next, 4171 .next = slab_next,
4175 .stop = slab_stop, 4172 .stop = slab_stop,
4176 .show = leaks_show, 4173 .show = leaks_show,
diff --git a/mm/slab.h b/mm/slab.h
index ab019e63e3c2..1cf4005482dd 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx)
209 209
210 rcu_read_lock(); 210 rcu_read_lock();
211 params = rcu_dereference(s->memcg_params); 211 params = rcu_dereference(s->memcg_params);
212 cachep = params->memcg_caches[idx];
213 rcu_read_unlock();
214 212
215 /* 213 /*
216 * Make sure we will access the up-to-date value. The code updating 214 * Make sure we will access the up-to-date value. The code updating
217 * memcg_caches issues a write barrier to match this (see 215 * memcg_caches issues a write barrier to match this (see
218 * memcg_register_cache()). 216 * memcg_register_cache()).
219 */ 217 */
220 smp_read_barrier_depends(); 218 cachep = lockless_dereference(params->memcg_caches[idx]);
219 rcu_read_unlock();
220
221 return cachep; 221 return cachep;
222} 222}
223 223
@@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
357 357
358#endif 358#endif
359 359
360void *slab_start(struct seq_file *m, loff_t *pos);
360void *slab_next(struct seq_file *m, void *p, loff_t *pos); 361void *slab_next(struct seq_file *m, void *p, loff_t *pos);
361void slab_stop(struct seq_file *m, void *p); 362void slab_stop(struct seq_file *m, void *p);
363int memcg_slab_show(struct seq_file *m, void *p);
362 364
363#endif /* MM_SLAB_H */ 365#endif /* MM_SLAB_H */
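
The slab.h hunk above replaces an open-coded pointer load followed by smp_read_barrier_depends() with lockless_dereference(), which bundles the READ_ONCE()-style load and the dependency barrier into one step so the two cannot drift apart (and keeps both inside the RCU read-side section). A rough userspace analogue, offered only as an illustration and not as the kernel macro, is a dependency-ordered load of a published pointer:

#include <stdatomic.h>
#include <stdio.h>

struct payload { int value; };

static _Atomic(struct payload *) shared;

static void publish(struct payload *p)
{
	/* writer side: initialise the object, then publish the pointer */
	atomic_store_explicit(&shared, p, memory_order_release);
}

static int read_value(void)
{
	/*
	 * Reader side: one dependency-ordered load.  memory_order_consume
	 * (promoted to acquire by most compilers) plays the role of the
	 * load and the dependency barrier taken together.
	 */
	struct payload *p = atomic_load_explicit(&shared, memory_order_consume);

	return p ? p->value : -1;
}

int main(void)
{
	static struct payload pl = { .value = 42 };

	publish(&pl);                 /* single-threaded here, just so it runs */
	printf("%d\n", read_value());
	return 0;
}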
diff --git a/mm/slab_common.c b/mm/slab_common.c
index dcdab81bd240..e03dd6f2a272 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
240 size = ALIGN(size, align); 240 size = ALIGN(size, align);
241 flags = kmem_cache_flags(size, flags, name, NULL); 241 flags = kmem_cache_flags(size, flags, name, NULL);
242 242
243 list_for_each_entry(s, &slab_caches, list) { 243 list_for_each_entry_reverse(s, &slab_caches, list) {
244 if (slab_unmergeable(s)) 244 if (slab_unmergeable(s))
245 continue; 245 continue;
246 246
@@ -811,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace);
811#define SLABINFO_RIGHTS S_IRUSR 811#define SLABINFO_RIGHTS S_IRUSR
812#endif 812#endif
813 813
814void print_slabinfo_header(struct seq_file *m) 814static void print_slabinfo_header(struct seq_file *m)
815{ 815{
816 /* 816 /*
817 * Output format version, so at least we can change it 817 * Output format version, so at least we can change it
@@ -834,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m)
834 seq_putc(m, '\n'); 834 seq_putc(m, '\n');
835} 835}
836 836
837static void *s_start(struct seq_file *m, loff_t *pos) 837void *slab_start(struct seq_file *m, loff_t *pos)
838{ 838{
839 loff_t n = *pos;
840
841 mutex_lock(&slab_mutex); 839 mutex_lock(&slab_mutex);
842 if (!n)
843 print_slabinfo_header(m);
844
845 return seq_list_start(&slab_caches, *pos); 840 return seq_list_start(&slab_caches, *pos);
846} 841}
847 842
@@ -881,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
881 } 876 }
882} 877}
883 878
884int cache_show(struct kmem_cache *s, struct seq_file *m) 879static void cache_show(struct kmem_cache *s, struct seq_file *m)
885{ 880{
886 struct slabinfo sinfo; 881 struct slabinfo sinfo;
887 882
@@ -900,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m)
900 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); 895 sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
901 slabinfo_show_stats(m, s); 896 slabinfo_show_stats(m, s);
902 seq_putc(m, '\n'); 897 seq_putc(m, '\n');
898}
899
900static int slab_show(struct seq_file *m, void *p)
901{
902 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
903
904 if (p == slab_caches.next)
905 print_slabinfo_header(m);
906 if (is_root_cache(s))
907 cache_show(s, m);
903 return 0; 908 return 0;
904} 909}
905 910
906static int s_show(struct seq_file *m, void *p) 911#ifdef CONFIG_MEMCG_KMEM
912int memcg_slab_show(struct seq_file *m, void *p)
907{ 913{
908 struct kmem_cache *s = list_entry(p, struct kmem_cache, list); 914 struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
915 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
909 916
910 if (!is_root_cache(s)) 917 if (p == slab_caches.next)
911 return 0; 918 print_slabinfo_header(m);
912 return cache_show(s, m); 919 if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
920 cache_show(s, m);
921 return 0;
913} 922}
923#endif
914 924
915/* 925/*
916 * slabinfo_op - iterator that generates /proc/slabinfo 926 * slabinfo_op - iterator that generates /proc/slabinfo
@@ -926,10 +936,10 @@ static int s_show(struct seq_file *m, void *p)
926 * + further values on SMP and with statistics enabled 936 * + further values on SMP and with statistics enabled
927 */ 937 */
928static const struct seq_operations slabinfo_op = { 938static const struct seq_operations slabinfo_op = {
929 .start = s_start, 939 .start = slab_start,
930 .next = slab_next, 940 .next = slab_next,
931 .stop = slab_stop, 941 .stop = slab_stop,
932 .show = s_show, 942 .show = slab_show,
933}; 943};
934 944
935static int slabinfo_open(struct inode *inode, struct file *file) 945static int slabinfo_open(struct inode *inode, struct file *file)
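
With s_start() gone, /proc/slabinfo and the new per-memcg reader share slab_start(), and the header is now emitted from the ->show callback by checking whether the element being shown is the first entry of slab_caches. A minimal userspace sketch of that idiom, using a plain linked list with made-up entries instead of the kernel's seq_file machinery:

#include <stdio.h>

struct entry {
	const char *name;
	int objects;
	struct entry *next;
};

/* stand-in for the slab_caches list: head -> "dentry" -> "kmalloc-64" */
static struct entry b    = { "kmalloc-64", 12, NULL };
static struct entry a    = { "dentry",     40, &b };
static struct entry head = { NULL,          0, &a };

/* mirrors slab_show(): emit the header only for the first real entry */
static int show(struct entry *e)
{
	if (e == head.next)
		printf("# name            <active_objs>\n");
	printf("%-16s  %d\n", e->name, e->objects);
	return 0;
}

int main(void)
{
	struct entry *e;

	for (e = head.next; e; e = e->next)
		show(e);
	return 0;
}

Moving the header decision into ->show is what lets both seq_operations tables reuse the same start/next/stop trio while filtering or formatting differently per entry.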
diff --git a/mm/slub.c b/mm/slub.c
index ae7b9f1ad394..386bbed76e94 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page)
849 maxobj = order_objects(compound_order(page), s->size, s->reserved); 849 maxobj = order_objects(compound_order(page), s->size, s->reserved);
850 if (page->objects > maxobj) { 850 if (page->objects > maxobj) {
851 slab_err(s, page, "objects %u > max %u", 851 slab_err(s, page, "objects %u > max %u",
852 s->name, page->objects, maxobj); 852 page->objects, maxobj);
853 return 0; 853 return 0;
854 } 854 }
855 if (page->inuse > page->objects) { 855 if (page->inuse > page->objects) {
856 slab_err(s, page, "inuse %u > max %u", 856 slab_err(s, page, "inuse %u > max %u",
857 s->name, page->inuse, page->objects); 857 page->inuse, page->objects);
858 return 0; 858 return 0;
859 } 859 }
860 /* Slab_pad_check fixes things up after itself */ 860 /* Slab_pad_check fixes things up after itself */
@@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
871 int nr = 0; 871 int nr = 0;
872 void *fp; 872 void *fp;
873 void *object = NULL; 873 void *object = NULL;
874 unsigned long max_objects; 874 int max_objects;
875 875
876 fp = page->freelist; 876 fp = page->freelist;
877 while (fp && nr <= page->objects) { 877 while (fp && nr <= page->objects) {
@@ -1377,7 +1377,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1377 int order; 1377 int order;
1378 int idx; 1378 int idx;
1379 1379
1380 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1380 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1381 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK);
1382 BUG();
1383 }
1381 1384
1382 page = allocate_slab(s, 1385 page = allocate_slab(s,
1383 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 1386 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -2554,7 +2557,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2554 2557
2555 } else { /* Needs to be taken off a list */ 2558 } else { /* Needs to be taken off a list */
2556 2559
2557 n = get_node(s, page_to_nid(page)); 2560 n = get_node(s, page_to_nid(page));
2558 /* 2561 /*
2559 * Speculatively acquire the list_lock. 2562 * Speculatively acquire the list_lock.
2560 * If the cmpxchg does not succeed then we may 2563 * If the cmpxchg does not succeed then we may
@@ -2587,10 +2590,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2587 * The list lock was not taken therefore no list 2590 * The list lock was not taken therefore no list
2588 * activity can be necessary. 2591 * activity can be necessary.
2589 */ 2592 */
2590 if (was_frozen) 2593 if (was_frozen)
2591 stat(s, FREE_FROZEN); 2594 stat(s, FREE_FROZEN);
2592 return; 2595 return;
2593 } 2596 }
2594 2597
2595 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) 2598 if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
2596 goto slab_empty; 2599 goto slab_empty;
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
new file mode 100644
index 000000000000..b5f7f24b8dd1
--- /dev/null
+++ b/mm/swap_cgroup.c
@@ -0,0 +1,208 @@
1#include <linux/swap_cgroup.h>
2#include <linux/vmalloc.h>
3#include <linux/mm.h>
4
5#include <linux/swapops.h> /* depends on mm.h include */
6
7static DEFINE_MUTEX(swap_cgroup_mutex);
8struct swap_cgroup_ctrl {
9 struct page **map;
10 unsigned long length;
11 spinlock_t lock;
12};
13
14static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
15
16struct swap_cgroup {
17 unsigned short id;
18};
19#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
20
21/*
22 * SwapCgroup implements "lookup" and "exchange" operations.
23 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
24 * against SwapCache. At swap_free(), this is accessed directly from swap.
25 *
26 * This means,
27 * - we have no race in "exchange" when we're accessed via SwapCache because
28 * SwapCache(and its swp_entry) is under lock.
29 * - When called via swap_free(), there is no user of this entry and no race.
30 * Then, we don't need lock around "exchange".
31 *
32 * TODO: we can push these buffers out to HIGHMEM.
33 */
34
35/*
36 * allocate buffer for swap_cgroup.
37 */
38static int swap_cgroup_prepare(int type)
39{
40 struct page *page;
41 struct swap_cgroup_ctrl *ctrl;
42 unsigned long idx, max;
43
44 ctrl = &swap_cgroup_ctrl[type];
45
46 for (idx = 0; idx < ctrl->length; idx++) {
47 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
48 if (!page)
49 goto not_enough_page;
50 ctrl->map[idx] = page;
51 }
52 return 0;
53not_enough_page:
54 max = idx;
55 for (idx = 0; idx < max; idx++)
56 __free_page(ctrl->map[idx]);
57
58 return -ENOMEM;
59}
60
61static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
62 struct swap_cgroup_ctrl **ctrlp)
63{
64 pgoff_t offset = swp_offset(ent);
65 struct swap_cgroup_ctrl *ctrl;
66 struct page *mappage;
67 struct swap_cgroup *sc;
68
69 ctrl = &swap_cgroup_ctrl[swp_type(ent)];
70 if (ctrlp)
71 *ctrlp = ctrl;
72
73 mappage = ctrl->map[offset / SC_PER_PAGE];
74 sc = page_address(mappage);
75 return sc + offset % SC_PER_PAGE;
76}
77
78/**
79 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
80 * @ent: swap entry to be cmpxchged
81 * @old: old id
82 * @new: new id
83 *
84 * Returns old id at success, 0 at failure.
85 * (There is no mem_cgroup using 0 as its id)
86 */
87unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
88 unsigned short old, unsigned short new)
89{
90 struct swap_cgroup_ctrl *ctrl;
91 struct swap_cgroup *sc;
92 unsigned long flags;
93 unsigned short retval;
94
95 sc = lookup_swap_cgroup(ent, &ctrl);
96
97 spin_lock_irqsave(&ctrl->lock, flags);
98 retval = sc->id;
99 if (retval == old)
100 sc->id = new;
101 else
102 retval = 0;
103 spin_unlock_irqrestore(&ctrl->lock, flags);
104 return retval;
105}
106
107/**
108 * swap_cgroup_record - record mem_cgroup for this swp_entry.
109 * @ent: swap entry to be recorded into
110 * @id: mem_cgroup to be recorded
111 *
112 * Returns old value at success, 0 at failure.
113 * (Of course, old value can be 0.)
114 */
115unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
116{
117 struct swap_cgroup_ctrl *ctrl;
118 struct swap_cgroup *sc;
119 unsigned short old;
120 unsigned long flags;
121
122 sc = lookup_swap_cgroup(ent, &ctrl);
123
124 spin_lock_irqsave(&ctrl->lock, flags);
125 old = sc->id;
126 sc->id = id;
127 spin_unlock_irqrestore(&ctrl->lock, flags);
128
129 return old;
130}
131
132/**
133 * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry
134 * @ent: swap entry to be looked up.
135 *
136 * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
137 */
138unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
139{
140 return lookup_swap_cgroup(ent, NULL)->id;
141}
142
143int swap_cgroup_swapon(int type, unsigned long max_pages)
144{
145 void *array;
146 unsigned long array_size;
147 unsigned long length;
148 struct swap_cgroup_ctrl *ctrl;
149
150 if (!do_swap_account)
151 return 0;
152
153 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
154 array_size = length * sizeof(void *);
155
156 array = vzalloc(array_size);
157 if (!array)
158 goto nomem;
159
160 ctrl = &swap_cgroup_ctrl[type];
161 mutex_lock(&swap_cgroup_mutex);
162 ctrl->length = length;
163 ctrl->map = array;
164 spin_lock_init(&ctrl->lock);
165 if (swap_cgroup_prepare(type)) {
166 /* memory shortage */
167 ctrl->map = NULL;
168 ctrl->length = 0;
169 mutex_unlock(&swap_cgroup_mutex);
170 vfree(array);
171 goto nomem;
172 }
173 mutex_unlock(&swap_cgroup_mutex);
174
175 return 0;
176nomem:
177 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
178 printk(KERN_INFO
179 "swap_cgroup can be disabled by swapaccount=0 boot option\n");
180 return -ENOMEM;
181}
182
183void swap_cgroup_swapoff(int type)
184{
185 struct page **map;
186 unsigned long i, length;
187 struct swap_cgroup_ctrl *ctrl;
188
189 if (!do_swap_account)
190 return;
191
192 mutex_lock(&swap_cgroup_mutex);
193 ctrl = &swap_cgroup_ctrl[type];
194 map = ctrl->map;
195 length = ctrl->length;
196 ctrl->map = NULL;
197 ctrl->length = 0;
198 mutex_unlock(&swap_cgroup_mutex);
199
200 if (map) {
201 for (i = 0; i < length; i++) {
202 struct page *page = map[i];
203 if (page)
204 __free_page(page);
205 }
206 vfree(map);
207 }
208}
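
The new file keeps the old data structure: per swap type, a vector of page-sized chunks of unsigned short ids, addressed as map[offset / SC_PER_PAGE] plus offset % SC_PER_PAGE, so the table grows with the swap device instead of being one large allocation. A userspace sketch of that two-level store; the names and sizes are invented for the demo, and the kernel additionally takes ctrl->lock around the update:

#include <stdio.h>
#include <stdlib.h>

#define CHUNK_SIZE    4096UL
#define IDS_PER_CHUNK (CHUNK_SIZE / sizeof(unsigned short))

struct id_store {
	unsigned short **chunks;    /* one page-sized chunk per slot */
	size_t nr_chunks;
};

static int id_store_init(struct id_store *s, size_t max_entries)
{
	size_t i;

	s->nr_chunks = (max_entries + IDS_PER_CHUNK - 1) / IDS_PER_CHUNK;
	s->chunks = calloc(s->nr_chunks, sizeof(*s->chunks));
	if (!s->chunks)
		return -1;
	for (i = 0; i < s->nr_chunks; i++) {
		s->chunks[i] = calloc(1, CHUNK_SIZE);
		if (!s->chunks[i])
			return -1;    /* a real version would unwind here */
	}
	return 0;
}

static unsigned short *id_slot(struct id_store *s, size_t offset)
{
	/* two-level addressing, as in lookup_swap_cgroup() above */
	return &s->chunks[offset / IDS_PER_CHUNK][offset % IDS_PER_CHUNK];
}

/* mirrors swap_cgroup_record(): store a new id, hand back the old one */
static unsigned short id_record(struct id_store *s, size_t offset,
				unsigned short id)
{
	unsigned short *slot = id_slot(s, offset);
	unsigned short old = *slot;

	*slot = id;
	return old;
}

int main(void)
{
	struct id_store s;

	if (id_store_init(&s, 100000))
		return 1;
	printf("old id: %u\n", (unsigned)id_record(&s, 54321, 7));  /* 0 */
	printf("old id: %u\n", (unsigned)id_record(&s, 54321, 9));  /* 7 */
	return 0;
}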
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 154444918685..9711342987a0 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,7 +17,6 @@
17#include <linux/blkdev.h> 17#include <linux/blkdev.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/migrate.h> 19#include <linux/migrate.h>
20#include <linux/page_cgroup.h>
21 20
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23 22
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8798b2e0ac59..63f55ccb9b26 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -38,7 +38,7 @@
38#include <asm/pgtable.h> 38#include <asm/pgtable.h>
39#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
40#include <linux/swapops.h> 40#include <linux/swapops.h>
41#include <linux/page_cgroup.h> 41#include <linux/swap_cgroup.h>
42 42
43static bool swap_count_continued(struct swap_info_struct *, pgoff_t, 43static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
44 unsigned char); 44 unsigned char);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 90520af7f186..8a18196fcdff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -463,8 +463,7 @@ overflow:
463 goto retry; 463 goto retry;
464 } 464 }
465 if (printk_ratelimit()) 465 if (printk_ratelimit())
466 printk(KERN_WARNING 466 pr_warn("vmap allocation for size %lu failed: "
467 "vmap allocation for size %lu failed: "
468 "use vmalloc=<size> to increase size.\n", size); 467 "use vmalloc=<size> to increase size.\n", size);
469 kfree(va); 468 kfree(va);
470 return ERR_PTR(-EBUSY); 469 return ERR_PTR(-EBUSY);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcb47074ae03..4636d9e822c1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -260,8 +260,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
260 do_div(delta, lru_pages + 1); 260 do_div(delta, lru_pages + 1);
261 total_scan += delta; 261 total_scan += delta;
262 if (total_scan < 0) { 262 if (total_scan < 0) {
263 printk(KERN_ERR 263 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
264 "shrink_slab: %pF negative objects to delete nr=%ld\n",
265 shrinker->scan_objects, total_scan); 264 shrinker->scan_objects, total_scan);
266 total_scan = freeable; 265 total_scan = freeable;
267 } 266 }
@@ -875,7 +874,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
875 * end of the LRU a second time. 874 * end of the LRU a second time.
876 */ 875 */
877 mapping = page_mapping(page); 876 mapping = page_mapping(page);
878 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || 877 if (((dirty || writeback) && mapping &&
878 bdi_write_congested(mapping->backing_dev_info)) ||
879 (writeback && PageReclaim(page))) 879 (writeback && PageReclaim(page)))
880 nr_congested++; 880 nr_congested++;
881 881
@@ -2249,7 +2249,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
2249 return true; 2249 return true;
2250 2250
2251 /* If compaction would go ahead or the allocation would succeed, stop */ 2251 /* If compaction would go ahead or the allocation would succeed, stop */
2252 switch (compaction_suitable(zone, sc->order)) { 2252 switch (compaction_suitable(zone, sc->order, 0, 0)) {
2253 case COMPACT_PARTIAL: 2253 case COMPACT_PARTIAL:
2254 case COMPACT_CONTINUE: 2254 case COMPACT_CONTINUE:
2255 return false; 2255 return false;
@@ -2346,7 +2346,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
2346 * If compaction is not ready to start and allocation is not likely 2346 * If compaction is not ready to start and allocation is not likely
2347 * to succeed without it, then keep reclaiming. 2347 * to succeed without it, then keep reclaiming.
2348 */ 2348 */
2349 if (compaction_suitable(zone, order) == COMPACT_SKIPPED) 2349 if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED)
2350 return false; 2350 return false;
2351 2351
2352 return watermark_ok; 2352 return watermark_ok;
@@ -2824,8 +2824,8 @@ static bool zone_balanced(struct zone *zone, int order,
2824 balance_gap, classzone_idx, 0)) 2824 balance_gap, classzone_idx, 0))
2825 return false; 2825 return false;
2826 2826
2827 if (IS_ENABLED(CONFIG_COMPACTION) && order && 2827 if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
2828 compaction_suitable(zone, order) == COMPACT_SKIPPED) 2828 order, 0, classzone_idx) == COMPACT_SKIPPED)
2829 return false; 2829 return false;
2830 2830
2831 return true; 2831 return true;
@@ -2952,8 +2952,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
2952 * from memory. Do not reclaim more than needed for compaction. 2952 * from memory. Do not reclaim more than needed for compaction.
2953 */ 2953 */
2954 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && 2954 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2955 compaction_suitable(zone, sc->order) != 2955 compaction_suitable(zone, sc->order, 0, classzone_idx)
2956 COMPACT_SKIPPED) 2956 != COMPACT_SKIPPED)
2957 testorder = 0; 2957 testorder = 0;
2958 2958
2959 /* 2959 /*