author		Linus Torvalds <torvalds@linux-foundation.org>	2014-12-10 21:34:42 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-12-10 21:34:42 -0500
commit		b6da0076bab5a12afb19312ffee41c95490af2a0 (patch)
tree		52a5675b9c2ff95d88b981d5b9a3822f6073c112 /mm
parent		cbfe0de303a55ed96d8831c2d5f56f8131cd6612 (diff)
parent		a53b831549141aa060a8b54b76e3a42870d74cc0 (diff)
Merge branch 'akpm' (patchbomb from Andrew)
Merge first patchbomb from Andrew Morton:
- a few minor cifs fixes
- dma-debug updates
- ocfs2
- slab
- about half of MM
- procfs
- kernel/exit.c
- panic.c tweaks
- printk updates
- lib/ updates
- checkpatch updates
- fs/binfmt updates
- the drivers/rtc tree
- nilfs
- kmod fixes
- more kernel/exit.c
- various other misc tweaks and fixes
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (190 commits)
exit: pidns: fix/update the comments in zap_pid_ns_processes()
exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting
exit: exit_notify: re-use "dead" list to autoreap current
exit: reparent: call forget_original_parent() under tasklist_lock
exit: reparent: avoid find_new_reaper() if no children
exit: reparent: introduce find_alive_thread()
exit: reparent: introduce find_child_reaper()
exit: reparent: document the ->has_child_subreaper checks
exit: reparent: s/while_each_thread/for_each_thread/ in find_new_reaper()
exit: reparent: fix the cross-namespace PR_SET_CHILD_SUBREAPER reparenting
exit: reparent: fix the dead-parent PR_SET_CHILD_SUBREAPER reparenting
exit: proc: don't try to flush /proc/tgid/task/tgid
exit: release_task: fix the comment about group leader accounting
exit: wait: drop tasklist_lock before psig->c* accounting
exit: wait: don't use zombie->real_parent
exit: wait: cleanup the ptrace_reparented() checks
usermodehelper: kill the kmod_thread_locker logic
usermodehelper: don't use CLONE_VFORK for ____call_usermodehelper()
fs/hfs/catalog.c: fix comparison bug in hfs_cat_keycmp
nilfs2: fix the nilfs_iget() vs. nilfs_new_inode() races
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 4
-rw-r--r-- | mm/cma.c | 14
-rw-r--r-- | mm/compaction.c | 139
-rw-r--r-- | mm/debug.c | 5
-rw-r--r-- | mm/frontswap.c | 2
-rw-r--r-- | mm/huge_memory.c | 1
-rw-r--r-- | mm/hugetlb.c | 4
-rw-r--r-- | mm/hugetlb_cgroup.c | 103
-rw-r--r-- | mm/internal.h | 7
-rw-r--r-- | mm/memcontrol.c | 1706
-rw-r--r-- | mm/memory-failure.c | 4
-rw-r--r-- | mm/memory_hotplug.c | 4
-rw-r--r-- | mm/oom_kill.c | 4
-rw-r--r-- | mm/page-writeback.c | 4
-rw-r--r-- | mm/page_alloc.c | 137
-rw-r--r-- | mm/page_cgroup.c | 530
-rw-r--r-- | mm/page_counter.c | 192
-rw-r--r-- | mm/page_isolation.c | 2
-rw-r--r-- | mm/rmap.c | 4
-rw-r--r-- | mm/slab.c | 23
-rw-r--r-- | mm/slab.h | 8
-rw-r--r-- | mm/slab_common.c | 40
-rw-r--r-- | mm/slub.c | 21
-rw-r--r-- | mm/swap_cgroup.c | 208
-rw-r--r-- | mm/swap_state.c | 1
-rw-r--r-- | mm/swapfile.c | 2
-rw-r--r-- | mm/vmalloc.c | 3
-rw-r--r-- | mm/vmscan.c | 18
28 files changed, 1236 insertions, 1954 deletions
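Most of the mm/ churn in this merge comes from replacing the byte-based res_counter with the new page-based page_counter (added as mm/page_counter.c above) in memcontrol.c and hugetlb_cgroup.c. As a rough, non-authoritative sketch of the charge/uncharge pattern the converted code follows — the wrapper functions and the static counter here are illustrative, only the page_counter calls themselves are taken from the diff:

```c
#include <linux/page_counter.h>

/* Hypothetical counter, set up once; NULL parent means no hierarchy above. */
static struct page_counter example_counter;

static void example_setup(void)
{
	page_counter_init(&example_counter, NULL);
}

static int example_charge(unsigned long nr_pages)
{
	struct page_counter *fail;
	int ret;

	/* Counts are in pages, not bytes as with the old res_counter. */
	ret = page_counter_try_charge(&example_counter, nr_pages, &fail);
	if (ret)
		return ret;	/* over limit; "fail" reports the counter that hit it */

	/* ... pages stay accounted while in use ... */

	page_counter_uncharge(&example_counter, nr_pages);
	return 0;
}
```

Limits are likewise set in pages (page_counter_limit()), which is why hugetlb_cgroup_write() below now parses the user string with page_counter_memparse() and serializes limit updates with its own mutex instead of relying on res_counter's internal locking.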
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0023a9..b3c6ce932c64 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,7 +55,9 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
55 | obj-$(CONFIG_MIGRATION) += migrate.o | 55 | obj-$(CONFIG_MIGRATION) += migrate.o |
56 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 56 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
57 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 57 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
58 | obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o | 58 | obj-$(CONFIG_PAGE_COUNTER) += page_counter.o |
59 | obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o | ||
60 | obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o | ||
59 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o | 61 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o |
60 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 62 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
61 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 63 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
diff --git a/mm/cma.c b/mm/cma.c
@@ -215,9 +215,21 @@ int __init cma_declare_contiguous(phys_addr_t base,
215 | bool fixed, struct cma **res_cma) | 215 | bool fixed, struct cma **res_cma) |
216 | { | 216 | { |
217 | phys_addr_t memblock_end = memblock_end_of_DRAM(); | 217 | phys_addr_t memblock_end = memblock_end_of_DRAM(); |
218 | phys_addr_t highmem_start = __pa(high_memory); | 218 | phys_addr_t highmem_start; |
219 | int ret = 0; | 219 | int ret = 0; |
220 | 220 | ||
221 | #ifdef CONFIG_X86 | ||
222 | /* | ||
223 | * high_memory isn't direct mapped memory so retrieving its physical | ||
224 | * address isn't appropriate. But it would be useful to check the | ||
225 | * physical address of the highmem boundary so it's justfiable to get | ||
226 | * the physical address from it. On x86 there is a validation check for | ||
227 | * this case, so the following workaround is needed to avoid it. | ||
228 | */ | ||
229 | highmem_start = __pa_nodebug(high_memory); | ||
230 | #else | ||
231 | highmem_start = __pa(high_memory); | ||
232 | #endif | ||
221 | pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", | 233 | pr_debug("%s(size %pa, base %pa, limit %pa alignment %pa)\n", |
222 | __func__, &size, &base, &limit, &alignment); | 234 | __func__, &size, &base, &limit, &alignment); |
223 | 235 | ||
diff --git a/mm/compaction.c b/mm/compaction.c
index f9792ba3537c..546e571e9d60 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -41,15 +41,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) | |||
41 | static unsigned long release_freepages(struct list_head *freelist) | 41 | static unsigned long release_freepages(struct list_head *freelist) |
42 | { | 42 | { |
43 | struct page *page, *next; | 43 | struct page *page, *next; |
44 | unsigned long count = 0; | 44 | unsigned long high_pfn = 0; |
45 | 45 | ||
46 | list_for_each_entry_safe(page, next, freelist, lru) { | 46 | list_for_each_entry_safe(page, next, freelist, lru) { |
47 | unsigned long pfn = page_to_pfn(page); | ||
47 | list_del(&page->lru); | 48 | list_del(&page->lru); |
48 | __free_page(page); | 49 | __free_page(page); |
49 | count++; | 50 | if (pfn > high_pfn) |
51 | high_pfn = pfn; | ||
50 | } | 52 | } |
51 | 53 | ||
52 | return count; | 54 | return high_pfn; |
53 | } | 55 | } |
54 | 56 | ||
55 | static void map_pages(struct list_head *list) | 57 | static void map_pages(struct list_head *list) |
@@ -195,16 +197,12 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
195 | 197 | ||
196 | /* Update where async and sync compaction should restart */ | 198 | /* Update where async and sync compaction should restart */ |
197 | if (migrate_scanner) { | 199 | if (migrate_scanner) { |
198 | if (cc->finished_update_migrate) | ||
199 | return; | ||
200 | if (pfn > zone->compact_cached_migrate_pfn[0]) | 200 | if (pfn > zone->compact_cached_migrate_pfn[0]) |
201 | zone->compact_cached_migrate_pfn[0] = pfn; | 201 | zone->compact_cached_migrate_pfn[0] = pfn; |
202 | if (cc->mode != MIGRATE_ASYNC && | 202 | if (cc->mode != MIGRATE_ASYNC && |
203 | pfn > zone->compact_cached_migrate_pfn[1]) | 203 | pfn > zone->compact_cached_migrate_pfn[1]) |
204 | zone->compact_cached_migrate_pfn[1] = pfn; | 204 | zone->compact_cached_migrate_pfn[1] = pfn; |
205 | } else { | 205 | } else { |
206 | if (cc->finished_update_free) | ||
207 | return; | ||
208 | if (pfn < zone->compact_cached_free_pfn) | 206 | if (pfn < zone->compact_cached_free_pfn) |
209 | zone->compact_cached_free_pfn = pfn; | 207 | zone->compact_cached_free_pfn = pfn; |
210 | } | 208 | } |
@@ -715,7 +713,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
715 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 713 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
716 | 714 | ||
717 | isolate_success: | 715 | isolate_success: |
718 | cc->finished_update_migrate = true; | ||
719 | list_add(&page->lru, migratelist); | 716 | list_add(&page->lru, migratelist); |
720 | cc->nr_migratepages++; | 717 | cc->nr_migratepages++; |
721 | nr_isolated++; | 718 | nr_isolated++; |
@@ -889,15 +886,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
889 | block_start_pfn - pageblock_nr_pages; | 886 | block_start_pfn - pageblock_nr_pages; |
890 | 887 | ||
891 | /* | 888 | /* |
892 | * Set a flag that we successfully isolated in this pageblock. | ||
893 | * In the next loop iteration, zone->compact_cached_free_pfn | ||
894 | * will not be updated and thus it will effectively contain the | ||
895 | * highest pageblock we isolated pages from. | ||
896 | */ | ||
897 | if (isolated) | ||
898 | cc->finished_update_free = true; | ||
899 | |||
900 | /* | ||
901 | * isolate_freepages_block() might have aborted due to async | 889 | * isolate_freepages_block() might have aborted due to async |
902 | * compaction being contended | 890 | * compaction being contended |
903 | */ | 891 | */ |
@@ -1086,9 +1074,9 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1086 | 1074 | ||
1087 | /* Compaction run is not finished if the watermark is not met */ | 1075 | /* Compaction run is not finished if the watermark is not met */ |
1088 | watermark = low_wmark_pages(zone); | 1076 | watermark = low_wmark_pages(zone); |
1089 | watermark += (1 << cc->order); | ||
1090 | 1077 | ||
1091 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | 1078 | if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, |
1079 | cc->alloc_flags)) | ||
1092 | return COMPACT_CONTINUE; | 1080 | return COMPACT_CONTINUE; |
1093 | 1081 | ||
1094 | /* Direct compactor: Is a suitable page free? */ | 1082 | /* Direct compactor: Is a suitable page free? */ |
@@ -1114,7 +1102,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1114 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | 1102 | * COMPACT_PARTIAL - If the allocation would succeed without compaction |
1115 | * COMPACT_CONTINUE - If compaction should run now | 1103 | * COMPACT_CONTINUE - If compaction should run now |
1116 | */ | 1104 | */ |
1117 | unsigned long compaction_suitable(struct zone *zone, int order) | 1105 | unsigned long compaction_suitable(struct zone *zone, int order, |
1106 | int alloc_flags, int classzone_idx) | ||
1118 | { | 1107 | { |
1119 | int fragindex; | 1108 | int fragindex; |
1120 | unsigned long watermark; | 1109 | unsigned long watermark; |
@@ -1126,21 +1115,30 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
1126 | if (order == -1) | 1115 | if (order == -1) |
1127 | return COMPACT_CONTINUE; | 1116 | return COMPACT_CONTINUE; |
1128 | 1117 | ||
1118 | watermark = low_wmark_pages(zone); | ||
1119 | /* | ||
1120 | * If watermarks for high-order allocation are already met, there | ||
1121 | * should be no need for compaction at all. | ||
1122 | */ | ||
1123 | if (zone_watermark_ok(zone, order, watermark, classzone_idx, | ||
1124 | alloc_flags)) | ||
1125 | return COMPACT_PARTIAL; | ||
1126 | |||
1129 | /* | 1127 | /* |
1130 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | 1128 | * Watermarks for order-0 must be met for compaction. Note the 2UL. |
1131 | * This is because during migration, copies of pages need to be | 1129 | * This is because during migration, copies of pages need to be |
1132 | * allocated and for a short time, the footprint is higher | 1130 | * allocated and for a short time, the footprint is higher |
1133 | */ | 1131 | */ |
1134 | watermark = low_wmark_pages(zone) + (2UL << order); | 1132 | watermark += (2UL << order); |
1135 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1133 | if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) |
1136 | return COMPACT_SKIPPED; | 1134 | return COMPACT_SKIPPED; |
1137 | 1135 | ||
1138 | /* | 1136 | /* |
1139 | * fragmentation index determines if allocation failures are due to | 1137 | * fragmentation index determines if allocation failures are due to |
1140 | * low memory or external fragmentation | 1138 | * low memory or external fragmentation |
1141 | * | 1139 | * |
1142 | * index of -1000 implies allocations might succeed depending on | 1140 | * index of -1000 would imply allocations might succeed depending on |
1143 | * watermarks | 1141 | * watermarks, but we already failed the high-order watermark check |
1144 | * index towards 0 implies failure is due to lack of memory | 1142 | * index towards 0 implies failure is due to lack of memory |
1145 | * index towards 1000 implies failure is due to fragmentation | 1143 | * index towards 1000 implies failure is due to fragmentation |
1146 | * | 1144 | * |
@@ -1150,10 +1148,6 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
1150 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 1148 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
1151 | return COMPACT_SKIPPED; | 1149 | return COMPACT_SKIPPED; |
1152 | 1150 | ||
1153 | if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, | ||
1154 | 0, 0)) | ||
1155 | return COMPACT_PARTIAL; | ||
1156 | |||
1157 | return COMPACT_CONTINUE; | 1151 | return COMPACT_CONTINUE; |
1158 | } | 1152 | } |
1159 | 1153 | ||
@@ -1164,8 +1158,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1164 | unsigned long end_pfn = zone_end_pfn(zone); | 1158 | unsigned long end_pfn = zone_end_pfn(zone); |
1165 | const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); | 1159 | const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); |
1166 | const bool sync = cc->mode != MIGRATE_ASYNC; | 1160 | const bool sync = cc->mode != MIGRATE_ASYNC; |
1161 | unsigned long last_migrated_pfn = 0; | ||
1167 | 1162 | ||
1168 | ret = compaction_suitable(zone, cc->order); | 1163 | ret = compaction_suitable(zone, cc->order, cc->alloc_flags, |
1164 | cc->classzone_idx); | ||
1169 | switch (ret) { | 1165 | switch (ret) { |
1170 | case COMPACT_PARTIAL: | 1166 | case COMPACT_PARTIAL: |
1171 | case COMPACT_SKIPPED: | 1167 | case COMPACT_SKIPPED: |
@@ -1208,6 +1204,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1208 | while ((ret = compact_finished(zone, cc, migratetype)) == | 1204 | while ((ret = compact_finished(zone, cc, migratetype)) == |
1209 | COMPACT_CONTINUE) { | 1205 | COMPACT_CONTINUE) { |
1210 | int err; | 1206 | int err; |
1207 | unsigned long isolate_start_pfn = cc->migrate_pfn; | ||
1211 | 1208 | ||
1212 | switch (isolate_migratepages(zone, cc)) { | 1209 | switch (isolate_migratepages(zone, cc)) { |
1213 | case ISOLATE_ABORT: | 1210 | case ISOLATE_ABORT: |
@@ -1216,7 +1213,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1216 | cc->nr_migratepages = 0; | 1213 | cc->nr_migratepages = 0; |
1217 | goto out; | 1214 | goto out; |
1218 | case ISOLATE_NONE: | 1215 | case ISOLATE_NONE: |
1219 | continue; | 1216 | /* |
1217 | * We haven't isolated and migrated anything, but | ||
1218 | * there might still be unflushed migrations from | ||
1219 | * previous cc->order aligned block. | ||
1220 | */ | ||
1221 | goto check_drain; | ||
1220 | case ISOLATE_SUCCESS: | 1222 | case ISOLATE_SUCCESS: |
1221 | ; | 1223 | ; |
1222 | } | 1224 | } |
@@ -1241,12 +1243,61 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1241 | goto out; | 1243 | goto out; |
1242 | } | 1244 | } |
1243 | } | 1245 | } |
1246 | |||
1247 | /* | ||
1248 | * Record where we could have freed pages by migration and not | ||
1249 | * yet flushed them to buddy allocator. We use the pfn that | ||
1250 | * isolate_migratepages() started from in this loop iteration | ||
1251 | * - this is the lowest page that could have been isolated and | ||
1252 | * then freed by migration. | ||
1253 | */ | ||
1254 | if (!last_migrated_pfn) | ||
1255 | last_migrated_pfn = isolate_start_pfn; | ||
1256 | |||
1257 | check_drain: | ||
1258 | /* | ||
1259 | * Has the migration scanner moved away from the previous | ||
1260 | * cc->order aligned block where we migrated from? If yes, | ||
1261 | * flush the pages that were freed, so that they can merge and | ||
1262 | * compact_finished() can detect immediately if allocation | ||
1263 | * would succeed. | ||
1264 | */ | ||
1265 | if (cc->order > 0 && last_migrated_pfn) { | ||
1266 | int cpu; | ||
1267 | unsigned long current_block_start = | ||
1268 | cc->migrate_pfn & ~((1UL << cc->order) - 1); | ||
1269 | |||
1270 | if (last_migrated_pfn < current_block_start) { | ||
1271 | cpu = get_cpu(); | ||
1272 | lru_add_drain_cpu(cpu); | ||
1273 | drain_local_pages(zone); | ||
1274 | put_cpu(); | ||
1275 | /* No more flushing until we migrate again */ | ||
1276 | last_migrated_pfn = 0; | ||
1277 | } | ||
1278 | } | ||
1279 | |||
1244 | } | 1280 | } |
1245 | 1281 | ||
1246 | out: | 1282 | out: |
1247 | /* Release free pages and check accounting */ | 1283 | /* |
1248 | cc->nr_freepages -= release_freepages(&cc->freepages); | 1284 | * Release free pages and update where the free scanner should restart, |
1249 | VM_BUG_ON(cc->nr_freepages != 0); | 1285 | * so we don't leave any returned pages behind in the next attempt. |
1286 | */ | ||
1287 | if (cc->nr_freepages > 0) { | ||
1288 | unsigned long free_pfn = release_freepages(&cc->freepages); | ||
1289 | |||
1290 | cc->nr_freepages = 0; | ||
1291 | VM_BUG_ON(free_pfn == 0); | ||
1292 | /* The cached pfn is always the first in a pageblock */ | ||
1293 | free_pfn &= ~(pageblock_nr_pages-1); | ||
1294 | /* | ||
1295 | * Only go back, not forward. The cached pfn might have been | ||
1296 | * already reset to zone end in compact_finished() | ||
1297 | */ | ||
1298 | if (free_pfn > zone->compact_cached_free_pfn) | ||
1299 | zone->compact_cached_free_pfn = free_pfn; | ||
1300 | } | ||
1250 | 1301 | ||
1251 | trace_mm_compaction_end(ret); | 1302 | trace_mm_compaction_end(ret); |
1252 | 1303 | ||
@@ -1254,7 +1305,8 @@ out: | |||
1254 | } | 1305 | } |
1255 | 1306 | ||
1256 | static unsigned long compact_zone_order(struct zone *zone, int order, | 1307 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1257 | gfp_t gfp_mask, enum migrate_mode mode, int *contended) | 1308 | gfp_t gfp_mask, enum migrate_mode mode, int *contended, |
1309 | int alloc_flags, int classzone_idx) | ||
1258 | { | 1310 | { |
1259 | unsigned long ret; | 1311 | unsigned long ret; |
1260 | struct compact_control cc = { | 1312 | struct compact_control cc = { |
@@ -1264,6 +1316,8 @@ static unsigned long compact_zone_order(struct zone *zone, int order, | |||
1264 | .gfp_mask = gfp_mask, | 1316 | .gfp_mask = gfp_mask, |
1265 | .zone = zone, | 1317 | .zone = zone, |
1266 | .mode = mode, | 1318 | .mode = mode, |
1319 | .alloc_flags = alloc_flags, | ||
1320 | .classzone_idx = classzone_idx, | ||
1267 | }; | 1321 | }; |
1268 | INIT_LIST_HEAD(&cc.freepages); | 1322 | INIT_LIST_HEAD(&cc.freepages); |
1269 | INIT_LIST_HEAD(&cc.migratepages); | 1323 | INIT_LIST_HEAD(&cc.migratepages); |
@@ -1288,14 +1342,13 @@ int sysctl_extfrag_threshold = 500; | |||
1288 | * @mode: The migration mode for async, sync light, or sync migration | 1342 | * @mode: The migration mode for async, sync light, or sync migration |
1289 | * @contended: Return value that determines if compaction was aborted due to | 1343 | * @contended: Return value that determines if compaction was aborted due to |
1290 | * need_resched() or lock contention | 1344 | * need_resched() or lock contention |
1291 | * @candidate_zone: Return the zone where we think allocation should succeed | ||
1292 | * | 1345 | * |
1293 | * This is the main entry point for direct page compaction. | 1346 | * This is the main entry point for direct page compaction. |
1294 | */ | 1347 | */ |
1295 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1348 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1296 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1349 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1297 | enum migrate_mode mode, int *contended, | 1350 | enum migrate_mode mode, int *contended, |
1298 | struct zone **candidate_zone) | 1351 | int alloc_flags, int classzone_idx) |
1299 | { | 1352 | { |
1300 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1353 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1301 | int may_enter_fs = gfp_mask & __GFP_FS; | 1354 | int may_enter_fs = gfp_mask & __GFP_FS; |
@@ -1303,7 +1356,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1303 | struct zoneref *z; | 1356 | struct zoneref *z; |
1304 | struct zone *zone; | 1357 | struct zone *zone; |
1305 | int rc = COMPACT_DEFERRED; | 1358 | int rc = COMPACT_DEFERRED; |
1306 | int alloc_flags = 0; | ||
1307 | int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ | 1359 | int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ |
1308 | 1360 | ||
1309 | *contended = COMPACT_CONTENDED_NONE; | 1361 | *contended = COMPACT_CONTENDED_NONE; |
@@ -1312,10 +1364,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1312 | if (!order || !may_enter_fs || !may_perform_io) | 1364 | if (!order || !may_enter_fs || !may_perform_io) |
1313 | return COMPACT_SKIPPED; | 1365 | return COMPACT_SKIPPED; |
1314 | 1366 | ||
1315 | #ifdef CONFIG_CMA | ||
1316 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
1317 | alloc_flags |= ALLOC_CMA; | ||
1318 | #endif | ||
1319 | /* Compact each zone in the list */ | 1367 | /* Compact each zone in the list */ |
1320 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1368 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
1321 | nodemask) { | 1369 | nodemask) { |
@@ -1326,7 +1374,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1326 | continue; | 1374 | continue; |
1327 | 1375 | ||
1328 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1376 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1329 | &zone_contended); | 1377 | &zone_contended, alloc_flags, classzone_idx); |
1330 | rc = max(status, rc); | 1378 | rc = max(status, rc); |
1331 | /* | 1379 | /* |
1332 | * It takes at least one zone that wasn't lock contended | 1380 | * It takes at least one zone that wasn't lock contended |
@@ -1335,9 +1383,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1335 | all_zones_contended &= zone_contended; | 1383 | all_zones_contended &= zone_contended; |
1336 | 1384 | ||
1337 | /* If a normal allocation would succeed, stop compacting */ | 1385 | /* If a normal allocation would succeed, stop compacting */ |
1338 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, | 1386 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), |
1339 | alloc_flags)) { | 1387 | classzone_idx, alloc_flags)) { |
1340 | *candidate_zone = zone; | ||
1341 | /* | 1388 | /* |
1342 | * We think the allocation will succeed in this zone, | 1389 | * We think the allocation will succeed in this zone, |
1343 | * but it is not certain, hence the false. The caller | 1390 | * but it is not certain, hence the false. The caller |
@@ -1359,7 +1406,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1359 | goto break_loop; | 1406 | goto break_loop; |
1360 | } | 1407 | } |
1361 | 1408 | ||
1362 | if (mode != MIGRATE_ASYNC) { | 1409 | if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { |
1363 | /* | 1410 | /* |
1364 | * We think that allocation won't succeed in this zone | 1411 | * We think that allocation won't succeed in this zone |
1365 | * so we defer compaction there. If it ends up | 1412 | * so we defer compaction there. If it ends up |
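Worth noting before moving on: the compaction entry points above now take the allocation context from the caller rather than recomputing it. A hedged sketch of the new calling convention — the wrapper function and its parameters are hypothetical, while the try_to_compact_pages() prototype matches the diff:

```c
/* Sketch only: shows the reworked calling convention after this series. */
static unsigned long example_direct_compact(struct zonelist *zonelist,
		int order, gfp_t gfp_mask, nodemask_t *nodemask,
		int alloc_flags, int classzone_idx)
{
	int contended;

	/* alloc_flags and classzone_idx are passed down to the watermark checks */
	return try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
				    MIGRATE_ASYNC, &contended,
				    alloc_flags, classzone_idx);
}
```

Inside each zone, compaction_suitable(zone, order, alloc_flags, classzone_idx) can now return COMPACT_PARTIAL up front when the high-order watermark is already met at that classzone_idx, so callers skip compaction entirely in that case.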
diff --git a/mm/debug.c b/mm/debug.c
index 5ce45c9a29b5..0e58f3211f89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -95,7 +95,10 @@ void dump_page_badflags(struct page *page, const char *reason, | |||
95 | dump_flags(page->flags & badflags, | 95 | dump_flags(page->flags & badflags, |
96 | pageflag_names, ARRAY_SIZE(pageflag_names)); | 96 | pageflag_names, ARRAY_SIZE(pageflag_names)); |
97 | } | 97 | } |
98 | mem_cgroup_print_bad_page(page); | 98 | #ifdef CONFIG_MEMCG |
99 | if (page->mem_cgroup) | ||
100 | pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); | ||
101 | #endif | ||
99 | } | 102 | } |
100 | 103 | ||
101 | void dump_page(struct page *page, const char *reason) | 104 | void dump_page(struct page *page, const char *reason) |
diff --git a/mm/frontswap.c b/mm/frontswap.c
index f2a3571c6e22..8d82809eb085 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -182,7 +182,7 @@ void __frontswap_init(unsigned type, unsigned long *map) | |||
182 | if (frontswap_ops) | 182 | if (frontswap_ops) |
183 | frontswap_ops->init(type); | 183 | frontswap_ops->init(type); |
184 | else { | 184 | else { |
185 | BUG_ON(type > MAX_SWAPFILES); | 185 | BUG_ON(type >= MAX_SWAPFILES); |
186 | set_bit(type, need_init); | 186 | set_bit(type, need_init); |
187 | } | 187 | } |
188 | } | 188 | } |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de984159cf0b..5b2c6875fc38 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -784,7 +784,6 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
784 | if (!pmd_none(*pmd)) | 784 | if (!pmd_none(*pmd)) |
785 | return false; | 785 | return false; |
786 | entry = mk_pmd(zero_page, vma->vm_page_prot); | 786 | entry = mk_pmd(zero_page, vma->vm_page_prot); |
787 | entry = pmd_wrprotect(entry); | ||
788 | entry = pmd_mkhuge(entry); | 787 | entry = pmd_mkhuge(entry); |
789 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 788 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
790 | set_pmd_at(mm, haddr, pmd, entry); | 789 | set_pmd_at(mm, haddr, pmd, entry); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fd722769927..30cd96879152 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2638,8 +2638,9 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
2638 | 2638 | ||
2639 | tlb_start_vma(tlb, vma); | 2639 | tlb_start_vma(tlb, vma); |
2640 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2640 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
2641 | address = start; | ||
2641 | again: | 2642 | again: |
2642 | for (address = start; address < end; address += sz) { | 2643 | for (; address < end; address += sz) { |
2643 | ptep = huge_pte_offset(mm, address); | 2644 | ptep = huge_pte_offset(mm, address); |
2644 | if (!ptep) | 2645 | if (!ptep) |
2645 | continue; | 2646 | continue; |
@@ -2686,6 +2687,7 @@ again: | |||
2686 | page_remove_rmap(page); | 2687 | page_remove_rmap(page); |
2687 | force_flush = !__tlb_remove_page(tlb, page); | 2688 | force_flush = !__tlb_remove_page(tlb, page); |
2688 | if (force_flush) { | 2689 | if (force_flush) { |
2690 | address += sz; | ||
2689 | spin_unlock(ptl); | 2691 | spin_unlock(ptl); |
2690 | break; | 2692 | break; |
2691 | } | 2693 | } |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a67c26e0f360..037e1c00a5b7 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/cgroup.h> | 16 | #include <linux/cgroup.h> |
17 | #include <linux/page_counter.h> | ||
17 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
18 | #include <linux/hugetlb.h> | 19 | #include <linux/hugetlb.h> |
19 | #include <linux/hugetlb_cgroup.h> | 20 | #include <linux/hugetlb_cgroup.h> |
@@ -23,7 +24,7 @@ struct hugetlb_cgroup { | |||
23 | /* | 24 | /* |
24 | * the counter to account for hugepages from hugetlb. | 25 | * the counter to account for hugepages from hugetlb. |
25 | */ | 26 | */ |
26 | struct res_counter hugepage[HUGE_MAX_HSTATE]; | 27 | struct page_counter hugepage[HUGE_MAX_HSTATE]; |
27 | }; | 28 | }; |
28 | 29 | ||
29 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 30 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
@@ -60,7 +61,7 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) | |||
60 | int idx; | 61 | int idx; |
61 | 62 | ||
62 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { | 63 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { |
63 | if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) | 64 | if (page_counter_read(&h_cg->hugepage[idx])) |
64 | return true; | 65 | return true; |
65 | } | 66 | } |
66 | return false; | 67 | return false; |
@@ -79,12 +80,12 @@ hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
79 | 80 | ||
80 | if (parent_h_cgroup) { | 81 | if (parent_h_cgroup) { |
81 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | 82 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) |
82 | res_counter_init(&h_cgroup->hugepage[idx], | 83 | page_counter_init(&h_cgroup->hugepage[idx], |
83 | &parent_h_cgroup->hugepage[idx]); | 84 | &parent_h_cgroup->hugepage[idx]); |
84 | } else { | 85 | } else { |
85 | root_h_cgroup = h_cgroup; | 86 | root_h_cgroup = h_cgroup; |
86 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | 87 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) |
87 | res_counter_init(&h_cgroup->hugepage[idx], NULL); | 88 | page_counter_init(&h_cgroup->hugepage[idx], NULL); |
88 | } | 89 | } |
89 | return &h_cgroup->css; | 90 | return &h_cgroup->css; |
90 | } | 91 | } |
@@ -108,9 +109,8 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) | |||
108 | static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, | 109 | static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, |
109 | struct page *page) | 110 | struct page *page) |
110 | { | 111 | { |
111 | int csize; | 112 | unsigned int nr_pages; |
112 | struct res_counter *counter; | 113 | struct page_counter *counter; |
113 | struct res_counter *fail_res; | ||
114 | struct hugetlb_cgroup *page_hcg; | 114 | struct hugetlb_cgroup *page_hcg; |
115 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); | 115 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); |
116 | 116 | ||
@@ -123,15 +123,15 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, | |||
123 | if (!page_hcg || page_hcg != h_cg) | 123 | if (!page_hcg || page_hcg != h_cg) |
124 | goto out; | 124 | goto out; |
125 | 125 | ||
126 | csize = PAGE_SIZE << compound_order(page); | 126 | nr_pages = 1 << compound_order(page); |
127 | if (!parent) { | 127 | if (!parent) { |
128 | parent = root_h_cgroup; | 128 | parent = root_h_cgroup; |
129 | /* root has no limit */ | 129 | /* root has no limit */ |
130 | res_counter_charge_nofail(&parent->hugepage[idx], | 130 | page_counter_charge(&parent->hugepage[idx], nr_pages); |
131 | csize, &fail_res); | ||
132 | } | 131 | } |
133 | counter = &h_cg->hugepage[idx]; | 132 | counter = &h_cg->hugepage[idx]; |
134 | res_counter_uncharge_until(counter, counter->parent, csize); | 133 | /* Take the pages off the local counter */ |
134 | page_counter_cancel(counter, nr_pages); | ||
135 | 135 | ||
136 | set_hugetlb_cgroup(page, parent); | 136 | set_hugetlb_cgroup(page, parent); |
137 | out: | 137 | out: |
@@ -166,9 +166,8 @@ int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | |||
166 | struct hugetlb_cgroup **ptr) | 166 | struct hugetlb_cgroup **ptr) |
167 | { | 167 | { |
168 | int ret = 0; | 168 | int ret = 0; |
169 | struct res_counter *fail_res; | 169 | struct page_counter *counter; |
170 | struct hugetlb_cgroup *h_cg = NULL; | 170 | struct hugetlb_cgroup *h_cg = NULL; |
171 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
172 | 171 | ||
173 | if (hugetlb_cgroup_disabled()) | 172 | if (hugetlb_cgroup_disabled()) |
174 | goto done; | 173 | goto done; |
@@ -187,7 +186,7 @@ again: | |||
187 | } | 186 | } |
188 | rcu_read_unlock(); | 187 | rcu_read_unlock(); |
189 | 188 | ||
190 | ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); | 189 | ret = page_counter_try_charge(&h_cg->hugepage[idx], nr_pages, &counter); |
191 | css_put(&h_cg->css); | 190 | css_put(&h_cg->css); |
192 | done: | 191 | done: |
193 | *ptr = h_cg; | 192 | *ptr = h_cg; |
@@ -213,7 +212,6 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | |||
213 | struct page *page) | 212 | struct page *page) |
214 | { | 213 | { |
215 | struct hugetlb_cgroup *h_cg; | 214 | struct hugetlb_cgroup *h_cg; |
216 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
217 | 215 | ||
218 | if (hugetlb_cgroup_disabled()) | 216 | if (hugetlb_cgroup_disabled()) |
219 | return; | 217 | return; |
@@ -222,61 +220,76 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | |||
222 | if (unlikely(!h_cg)) | 220 | if (unlikely(!h_cg)) |
223 | return; | 221 | return; |
224 | set_hugetlb_cgroup(page, NULL); | 222 | set_hugetlb_cgroup(page, NULL); |
225 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | 223 | page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); |
226 | return; | 224 | return; |
227 | } | 225 | } |
228 | 226 | ||
229 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | 227 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, |
230 | struct hugetlb_cgroup *h_cg) | 228 | struct hugetlb_cgroup *h_cg) |
231 | { | 229 | { |
232 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
233 | |||
234 | if (hugetlb_cgroup_disabled() || !h_cg) | 230 | if (hugetlb_cgroup_disabled() || !h_cg) |
235 | return; | 231 | return; |
236 | 232 | ||
237 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | 233 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) |
238 | return; | 234 | return; |
239 | 235 | ||
240 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | 236 | page_counter_uncharge(&h_cg->hugepage[idx], nr_pages); |
241 | return; | 237 | return; |
242 | } | 238 | } |
243 | 239 | ||
240 | enum { | ||
241 | RES_USAGE, | ||
242 | RES_LIMIT, | ||
243 | RES_MAX_USAGE, | ||
244 | RES_FAILCNT, | ||
245 | }; | ||
246 | |||
244 | static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, | 247 | static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, |
245 | struct cftype *cft) | 248 | struct cftype *cft) |
246 | { | 249 | { |
247 | int idx, name; | 250 | struct page_counter *counter; |
248 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); | 251 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); |
249 | 252 | ||
250 | idx = MEMFILE_IDX(cft->private); | 253 | counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; |
251 | name = MEMFILE_ATTR(cft->private); | ||
252 | 254 | ||
253 | return res_counter_read_u64(&h_cg->hugepage[idx], name); | 255 | switch (MEMFILE_ATTR(cft->private)) { |
256 | case RES_USAGE: | ||
257 | return (u64)page_counter_read(counter) * PAGE_SIZE; | ||
258 | case RES_LIMIT: | ||
259 | return (u64)counter->limit * PAGE_SIZE; | ||
260 | case RES_MAX_USAGE: | ||
261 | return (u64)counter->watermark * PAGE_SIZE; | ||
262 | case RES_FAILCNT: | ||
263 | return counter->failcnt; | ||
264 | default: | ||
265 | BUG(); | ||
266 | } | ||
254 | } | 267 | } |
255 | 268 | ||
269 | static DEFINE_MUTEX(hugetlb_limit_mutex); | ||
270 | |||
256 | static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | 271 | static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, |
257 | char *buf, size_t nbytes, loff_t off) | 272 | char *buf, size_t nbytes, loff_t off) |
258 | { | 273 | { |
259 | int idx, name, ret; | 274 | int ret, idx; |
260 | unsigned long long val; | 275 | unsigned long nr_pages; |
261 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); | 276 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); |
262 | 277 | ||
278 | if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ | ||
279 | return -EINVAL; | ||
280 | |||
263 | buf = strstrip(buf); | 281 | buf = strstrip(buf); |
282 | ret = page_counter_memparse(buf, &nr_pages); | ||
283 | if (ret) | ||
284 | return ret; | ||
285 | |||
264 | idx = MEMFILE_IDX(of_cft(of)->private); | 286 | idx = MEMFILE_IDX(of_cft(of)->private); |
265 | name = MEMFILE_ATTR(of_cft(of)->private); | ||
266 | 287 | ||
267 | switch (name) { | 288 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
268 | case RES_LIMIT: | 289 | case RES_LIMIT: |
269 | if (hugetlb_cgroup_is_root(h_cg)) { | 290 | mutex_lock(&hugetlb_limit_mutex); |
270 | /* Can't set limit on root */ | 291 | ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); |
271 | ret = -EINVAL; | 292 | mutex_unlock(&hugetlb_limit_mutex); |
272 | break; | ||
273 | } | ||
274 | /* This function does all necessary parse...reuse it */ | ||
275 | ret = res_counter_memparse_write_strategy(buf, &val); | ||
276 | if (ret) | ||
277 | break; | ||
278 | val = ALIGN(val, 1ULL << huge_page_shift(&hstates[idx])); | ||
279 | ret = res_counter_set_limit(&h_cg->hugepage[idx], val); | ||
280 | break; | 293 | break; |
281 | default: | 294 | default: |
282 | ret = -EINVAL; | 295 | ret = -EINVAL; |
@@ -288,18 +301,18 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | |||
288 | static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, | 301 | static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, |
289 | char *buf, size_t nbytes, loff_t off) | 302 | char *buf, size_t nbytes, loff_t off) |
290 | { | 303 | { |
291 | int idx, name, ret = 0; | 304 | int ret = 0; |
305 | struct page_counter *counter; | ||
292 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); | 306 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); |
293 | 307 | ||
294 | idx = MEMFILE_IDX(of_cft(of)->private); | 308 | counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; |
295 | name = MEMFILE_ATTR(of_cft(of)->private); | ||
296 | 309 | ||
297 | switch (name) { | 310 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
298 | case RES_MAX_USAGE: | 311 | case RES_MAX_USAGE: |
299 | res_counter_reset_max(&h_cg->hugepage[idx]); | 312 | page_counter_reset_watermark(counter); |
300 | break; | 313 | break; |
301 | case RES_FAILCNT: | 314 | case RES_FAILCNT: |
302 | res_counter_reset_failcnt(&h_cg->hugepage[idx]); | 315 | counter->failcnt = 0; |
303 | break; | 316 | break; |
304 | default: | 317 | default: |
305 | ret = -EINVAL; | 318 | ret = -EINVAL; |
diff --git a/mm/internal.h b/mm/internal.h
index a4f90ba7068e..efad241f7014 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -161,13 +161,10 @@ struct compact_control { | |||
161 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 161 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
162 | enum migrate_mode mode; /* Async or sync migration mode */ | 162 | enum migrate_mode mode; /* Async or sync migration mode */ |
163 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 163 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
164 | bool finished_update_free; /* True when the zone cached pfns are | ||
165 | * no longer being updated | ||
166 | */ | ||
167 | bool finished_update_migrate; | ||
168 | |||
169 | int order; /* order a direct compactor needs */ | 164 | int order; /* order a direct compactor needs */ |
170 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ | 165 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ |
166 | const int alloc_flags; /* alloc flags of a direct compactor */ | ||
167 | const int classzone_idx; /* zone index of a direct compactor */ | ||
171 | struct zone *zone; | 168 | struct zone *zone; |
172 | int contended; /* Signal need_sched() or lock | 169 | int contended; /* Signal need_sched() or lock |
173 | * contention detected during | 170 | * contention detected during |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ee48428cf8e3..85df503ec023 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,7 +25,7 @@ | |||
25 | * GNU General Public License for more details. | 25 | * GNU General Public License for more details. |
26 | */ | 26 | */ |
27 | 27 | ||
28 | #include <linux/res_counter.h> | 28 | #include <linux/page_counter.h> |
29 | #include <linux/memcontrol.h> | 29 | #include <linux/memcontrol.h> |
30 | #include <linux/cgroup.h> | 30 | #include <linux/cgroup.h> |
31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
@@ -51,7 +51,7 @@ | |||
51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/vmpressure.h> | 52 | #include <linux/vmpressure.h> |
53 | #include <linux/mm_inline.h> | 53 | #include <linux/mm_inline.h> |
54 | #include <linux/page_cgroup.h> | 54 | #include <linux/swap_cgroup.h> |
55 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
56 | #include <linux/oom.h> | 56 | #include <linux/oom.h> |
57 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
@@ -143,14 +143,8 @@ struct mem_cgroup_stat_cpu { | |||
143 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 143 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
144 | }; | 144 | }; |
145 | 145 | ||
146 | struct mem_cgroup_reclaim_iter { | 146 | struct reclaim_iter { |
147 | /* | 147 | struct mem_cgroup *position; |
148 | * last scanned hierarchy member. Valid only if last_dead_count | ||
149 | * matches memcg->dead_count of the hierarchy root group. | ||
150 | */ | ||
151 | struct mem_cgroup *last_visited; | ||
152 | int last_dead_count; | ||
153 | |||
154 | /* scan generation, increased every round-trip */ | 148 | /* scan generation, increased every round-trip */ |
155 | unsigned int generation; | 149 | unsigned int generation; |
156 | }; | 150 | }; |
@@ -162,10 +156,10 @@ struct mem_cgroup_per_zone { | |||
162 | struct lruvec lruvec; | 156 | struct lruvec lruvec; |
163 | unsigned long lru_size[NR_LRU_LISTS]; | 157 | unsigned long lru_size[NR_LRU_LISTS]; |
164 | 158 | ||
165 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 159 | struct reclaim_iter iter[DEF_PRIORITY + 1]; |
166 | 160 | ||
167 | struct rb_node tree_node; /* RB tree node */ | 161 | struct rb_node tree_node; /* RB tree node */ |
168 | unsigned long long usage_in_excess;/* Set to the value by which */ | 162 | unsigned long usage_in_excess;/* Set to the value by which */ |
169 | /* the soft limit is exceeded*/ | 163 | /* the soft limit is exceeded*/ |
170 | bool on_tree; | 164 | bool on_tree; |
171 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 165 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
@@ -198,7 +192,7 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly; | |||
198 | 192 | ||
199 | struct mem_cgroup_threshold { | 193 | struct mem_cgroup_threshold { |
200 | struct eventfd_ctx *eventfd; | 194 | struct eventfd_ctx *eventfd; |
201 | u64 threshold; | 195 | unsigned long threshold; |
202 | }; | 196 | }; |
203 | 197 | ||
204 | /* For threshold */ | 198 | /* For threshold */ |
@@ -284,10 +278,13 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | |||
284 | */ | 278 | */ |
285 | struct mem_cgroup { | 279 | struct mem_cgroup { |
286 | struct cgroup_subsys_state css; | 280 | struct cgroup_subsys_state css; |
287 | /* | 281 | |
288 | * the counter to account for memory usage | 282 | /* Accounted resources */ |
289 | */ | 283 | struct page_counter memory; |
290 | struct res_counter res; | 284 | struct page_counter memsw; |
285 | struct page_counter kmem; | ||
286 | |||
287 | unsigned long soft_limit; | ||
291 | 288 | ||
292 | /* vmpressure notifications */ | 289 | /* vmpressure notifications */ |
293 | struct vmpressure vmpressure; | 290 | struct vmpressure vmpressure; |
@@ -296,15 +293,6 @@ struct mem_cgroup { | |||
296 | int initialized; | 293 | int initialized; |
297 | 294 | ||
298 | /* | 295 | /* |
299 | * the counter to account for mem+swap usage. | ||
300 | */ | ||
301 | struct res_counter memsw; | ||
302 | |||
303 | /* | ||
304 | * the counter to account for kernel memory usage. | ||
305 | */ | ||
306 | struct res_counter kmem; | ||
307 | /* | ||
308 | * Should the accounting and control be hierarchical, per subtree? | 296 | * Should the accounting and control be hierarchical, per subtree? |
309 | */ | 297 | */ |
310 | bool use_hierarchy; | 298 | bool use_hierarchy; |
@@ -352,7 +340,6 @@ struct mem_cgroup { | |||
352 | struct mem_cgroup_stat_cpu nocpu_base; | 340 | struct mem_cgroup_stat_cpu nocpu_base; |
353 | spinlock_t pcp_counter_lock; | 341 | spinlock_t pcp_counter_lock; |
354 | 342 | ||
355 | atomic_t dead_count; | ||
356 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 343 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
357 | struct cg_proto tcp_mem; | 344 | struct cg_proto tcp_mem; |
358 | #endif | 345 | #endif |
@@ -382,7 +369,6 @@ struct mem_cgroup { | |||
382 | /* internal only representation about the status of kmem accounting. */ | 369 | /* internal only representation about the status of kmem accounting. */ |
383 | enum { | 370 | enum { |
384 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ | 371 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ |
385 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | ||
386 | }; | 372 | }; |
387 | 373 | ||
388 | #ifdef CONFIG_MEMCG_KMEM | 374 | #ifdef CONFIG_MEMCG_KMEM |
@@ -396,22 +382,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | |||
396 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 382 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); |
397 | } | 383 | } |
398 | 384 | ||
399 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | ||
400 | { | ||
401 | /* | ||
402 | * Our caller must use css_get() first, because memcg_uncharge_kmem() | ||
403 | * will call css_put() if it sees the memcg is dead. | ||
404 | */ | ||
405 | smp_wmb(); | ||
406 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) | ||
407 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); | ||
408 | } | ||
409 | |||
410 | static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | ||
411 | { | ||
412 | return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, | ||
413 | &memcg->kmem_account_flags); | ||
414 | } | ||
415 | #endif | 385 | #endif |
416 | 386 | ||
417 | /* Stuffs for move charges at task migration. */ | 387 | /* Stuffs for move charges at task migration. */ |
@@ -650,7 +620,7 @@ static void disarm_kmem_keys(struct mem_cgroup *memcg) | |||
650 | * This check can't live in kmem destruction function, | 620 | * This check can't live in kmem destruction function, |
651 | * since the charges will outlive the cgroup | 621 | * since the charges will outlive the cgroup |
652 | */ | 622 | */ |
653 | WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); | 623 | WARN_ON(page_counter_read(&memcg->kmem)); |
654 | } | 624 | } |
655 | #else | 625 | #else |
656 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | 626 | static void disarm_kmem_keys(struct mem_cgroup *memcg) |
@@ -664,8 +634,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg) | |||
664 | disarm_kmem_keys(memcg); | 634 | disarm_kmem_keys(memcg); |
665 | } | 635 | } |
666 | 636 | ||
667 | static void drain_all_stock_async(struct mem_cgroup *memcg); | ||
668 | |||
669 | static struct mem_cgroup_per_zone * | 637 | static struct mem_cgroup_per_zone * |
670 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | 638 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) |
671 | { | 639 | { |
@@ -706,7 +674,7 @@ soft_limit_tree_from_page(struct page *page) | |||
706 | 674 | ||
707 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, | 675 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, |
708 | struct mem_cgroup_tree_per_zone *mctz, | 676 | struct mem_cgroup_tree_per_zone *mctz, |
709 | unsigned long long new_usage_in_excess) | 677 | unsigned long new_usage_in_excess) |
710 | { | 678 | { |
711 | struct rb_node **p = &mctz->rb_root.rb_node; | 679 | struct rb_node **p = &mctz->rb_root.rb_node; |
712 | struct rb_node *parent = NULL; | 680 | struct rb_node *parent = NULL; |
@@ -755,10 +723,21 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | |||
755 | spin_unlock_irqrestore(&mctz->lock, flags); | 723 | spin_unlock_irqrestore(&mctz->lock, flags); |
756 | } | 724 | } |
757 | 725 | ||
726 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | ||
727 | { | ||
728 | unsigned long nr_pages = page_counter_read(&memcg->memory); | ||
729 | unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); | ||
730 | unsigned long excess = 0; | ||
731 | |||
732 | if (nr_pages > soft_limit) | ||
733 | excess = nr_pages - soft_limit; | ||
734 | |||
735 | return excess; | ||
736 | } | ||
758 | 737 | ||
759 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | 738 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) |
760 | { | 739 | { |
761 | unsigned long long excess; | 740 | unsigned long excess; |
762 | struct mem_cgroup_per_zone *mz; | 741 | struct mem_cgroup_per_zone *mz; |
763 | struct mem_cgroup_tree_per_zone *mctz; | 742 | struct mem_cgroup_tree_per_zone *mctz; |
764 | 743 | ||
@@ -769,7 +748,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | |||
769 | */ | 748 | */ |
770 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | 749 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
771 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 750 | mz = mem_cgroup_page_zoneinfo(memcg, page); |
772 | excess = res_counter_soft_limit_excess(&memcg->res); | 751 | excess = soft_limit_excess(memcg); |
773 | /* | 752 | /* |
774 | * We have to update the tree if mz is on RB-tree or | 753 | * We have to update the tree if mz is on RB-tree or |
775 | * mem is over its softlimit. | 754 | * mem is over its softlimit. |
@@ -825,7 +804,7 @@ retry: | |||
825 | * position in the tree. | 804 | * position in the tree. |
826 | */ | 805 | */ |
827 | __mem_cgroup_remove_exceeded(mz, mctz); | 806 | __mem_cgroup_remove_exceeded(mz, mctz); |
828 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || | 807 | if (!soft_limit_excess(mz->memcg) || |
829 | !css_tryget_online(&mz->memcg->css)) | 808 | !css_tryget_online(&mz->memcg->css)) |
830 | goto retry; | 809 | goto retry; |
831 | done: | 810 | done: |
@@ -1062,122 +1041,6 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
1062 | return memcg; | 1041 | return memcg; |
1063 | } | 1042 | } |
1064 | 1043 | ||
1065 | /* | ||
1066 | * Returns a next (in a pre-order walk) alive memcg (with elevated css | ||
1067 | * ref. count) or NULL if the whole root's subtree has been visited. | ||
1068 | * | ||
1069 | * helper function to be used by mem_cgroup_iter | ||
1070 | */ | ||
1071 | static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, | ||
1072 | struct mem_cgroup *last_visited) | ||
1073 | { | ||
1074 | struct cgroup_subsys_state *prev_css, *next_css; | ||
1075 | |||
1076 | prev_css = last_visited ? &last_visited->css : NULL; | ||
1077 | skip_node: | ||
1078 | next_css = css_next_descendant_pre(prev_css, &root->css); | ||
1079 | |||
1080 | /* | ||
1081 | * Even if we found a group we have to make sure it is | ||
1082 | * alive. css && !memcg means that the groups should be | ||
1083 | * skipped and we should continue the tree walk. | ||
1084 | * last_visited css is safe to use because it is | ||
1085 | * protected by css_get and the tree walk is rcu safe. | ||
1086 | * | ||
1087 | * We do not take a reference on the root of the tree walk | ||
1088 | * because we might race with the root removal when it would | ||
1089 | * be the only node in the iterated hierarchy and mem_cgroup_iter | ||
1090 | * would end up in an endless loop because it expects that at | ||
1091 | * least one valid node will be returned. Root cannot disappear | ||
1092 | * because caller of the iterator should hold it already so | ||
1093 | * skipping css reference should be safe. | ||
1094 | */ | ||
1095 | if (next_css) { | ||
1096 | struct mem_cgroup *memcg = mem_cgroup_from_css(next_css); | ||
1097 | |||
1098 | if (next_css == &root->css) | ||
1099 | return memcg; | ||
1100 | |||
1101 | if (css_tryget_online(next_css)) { | ||
1102 | /* | ||
1103 | * Make sure the memcg is initialized: | ||
1104 | * mem_cgroup_css_online() orders the the | ||
1105 | * initialization against setting the flag. | ||
1106 | */ | ||
1107 | if (smp_load_acquire(&memcg->initialized)) | ||
1108 | return memcg; | ||
1109 | css_put(next_css); | ||
1110 | } | ||
1111 | |||
1112 | prev_css = next_css; | ||
1113 | goto skip_node; | ||
1114 | } | ||
1115 | |||
1116 | return NULL; | ||
1117 | } | ||
1118 | |||
1119 | static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) | ||
1120 | { | ||
1121 | /* | ||
1122 | * When a group in the hierarchy below root is destroyed, the | ||
1123 | * hierarchy iterator can no longer be trusted since it might | ||
1124 | * have pointed to the destroyed group. Invalidate it. | ||
1125 | */ | ||
1126 | atomic_inc(&root->dead_count); | ||
1127 | } | ||
1128 | |||
1129 | static struct mem_cgroup * | ||
1130 | mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, | ||
1131 | struct mem_cgroup *root, | ||
1132 | int *sequence) | ||
1133 | { | ||
1134 | struct mem_cgroup *position = NULL; | ||
1135 | /* | ||
1136 | * A cgroup destruction happens in two stages: offlining and | ||
1137 | * release. They are separated by a RCU grace period. | ||
1138 | * | ||
1139 | * If the iterator is valid, we may still race with an | ||
1140 | * offlining. The RCU lock ensures the object won't be | ||
1141 | * released, tryget will fail if we lost the race. | ||
1142 | */ | ||
1143 | *sequence = atomic_read(&root->dead_count); | ||
1144 | if (iter->last_dead_count == *sequence) { | ||
1145 | smp_rmb(); | ||
1146 | position = iter->last_visited; | ||
1147 | |||
1148 | /* | ||
1149 | * We cannot take a reference to root because we might race | ||
1150 | * with root removal and returning NULL would end up in | ||
1151 | * an endless loop on the iterator user level when root | ||
1152 | * would be returned all the time. | ||
1153 | */ | ||
1154 | if (position && position != root && | ||
1155 | !css_tryget_online(&position->css)) | ||
1156 | position = NULL; | ||
1157 | } | ||
1158 | return position; | ||
1159 | } | ||
1160 | |||
1161 | static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | ||
1162 | struct mem_cgroup *last_visited, | ||
1163 | struct mem_cgroup *new_position, | ||
1164 | struct mem_cgroup *root, | ||
1165 | int sequence) | ||
1166 | { | ||
1167 | /* root reference counting symmetric to mem_cgroup_iter_load */ | ||
1168 | if (last_visited && last_visited != root) | ||
1169 | css_put(&last_visited->css); | ||
1170 | /* | ||
1171 | * We store the sequence count from the time @last_visited was | ||
1172 | * loaded successfully instead of rereading it here so that we | ||
1173 | * don't lose destruction events in between. We could have | ||
1174 | * raced with the destruction of @new_position after all. | ||
1175 | */ | ||
1176 | iter->last_visited = new_position; | ||
1177 | smp_wmb(); | ||
1178 | iter->last_dead_count = sequence; | ||
1179 | } | ||
1180 | |||
1181 | /** | 1044 | /** |
1182 | * mem_cgroup_iter - iterate over memory cgroup hierarchy | 1045 | * mem_cgroup_iter - iterate over memory cgroup hierarchy |
1183 | * @root: hierarchy root | 1046 | * @root: hierarchy root |
@@ -1199,8 +1062,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1199 | struct mem_cgroup *prev, | 1062 | struct mem_cgroup *prev, |
1200 | struct mem_cgroup_reclaim_cookie *reclaim) | 1063 | struct mem_cgroup_reclaim_cookie *reclaim) |
1201 | { | 1064 | { |
1065 | struct reclaim_iter *uninitialized_var(iter); | ||
1066 | struct cgroup_subsys_state *css = NULL; | ||
1202 | struct mem_cgroup *memcg = NULL; | 1067 | struct mem_cgroup *memcg = NULL; |
1203 | struct mem_cgroup *last_visited = NULL; | 1068 | struct mem_cgroup *pos = NULL; |
1204 | 1069 | ||
1205 | if (mem_cgroup_disabled()) | 1070 | if (mem_cgroup_disabled()) |
1206 | return NULL; | 1071 | return NULL; |
@@ -1209,50 +1074,101 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1209 | root = root_mem_cgroup; | 1074 | root = root_mem_cgroup; |
1210 | 1075 | ||
1211 | if (prev && !reclaim) | 1076 | if (prev && !reclaim) |
1212 | last_visited = prev; | 1077 | pos = prev; |
1213 | 1078 | ||
1214 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1079 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1215 | if (prev) | 1080 | if (prev) |
1216 | goto out_css_put; | 1081 | goto out; |
1217 | return root; | 1082 | return root; |
1218 | } | 1083 | } |
1219 | 1084 | ||
1220 | rcu_read_lock(); | 1085 | rcu_read_lock(); |
1221 | while (!memcg) { | ||
1222 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); | ||
1223 | int uninitialized_var(seq); | ||
1224 | |||
1225 | if (reclaim) { | ||
1226 | struct mem_cgroup_per_zone *mz; | ||
1227 | |||
1228 | mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); | ||
1229 | iter = &mz->reclaim_iter[reclaim->priority]; | ||
1230 | if (prev && reclaim->generation != iter->generation) { | ||
1231 | iter->last_visited = NULL; | ||
1232 | goto out_unlock; | ||
1233 | } | ||
1234 | 1086 | ||
1235 | last_visited = mem_cgroup_iter_load(iter, root, &seq); | 1087 | if (reclaim) { |
1088 | struct mem_cgroup_per_zone *mz; | ||
1089 | |||
1090 | mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); | ||
1091 | iter = &mz->iter[reclaim->priority]; | ||
1092 | |||
1093 | if (prev && reclaim->generation != iter->generation) | ||
1094 | goto out_unlock; | ||
1095 | |||
1096 | do { | ||
1097 | pos = ACCESS_ONCE(iter->position); | ||
1098 | /* | ||
1099 | * A racing update may change the position and | ||
1100 | * put the last reference, hence css_tryget(), | ||
1101 | * or retry to see the updated position. | ||
1102 | */ | ||
1103 | } while (pos && !css_tryget(&pos->css)); | ||
1104 | } | ||
1105 | |||
1106 | if (pos) | ||
1107 | css = &pos->css; | ||
1108 | |||
1109 | for (;;) { | ||
1110 | css = css_next_descendant_pre(css, &root->css); | ||
1111 | if (!css) { | ||
1112 | /* | ||
1113 | * Reclaimers share the hierarchy walk, and a | ||
1114 | * new one might jump in right at the end of | ||
1115 | * the hierarchy - make sure they see at least | ||
1116 | * one group and restart from the beginning. | ||
1117 | */ | ||
1118 | if (!prev) | ||
1119 | continue; | ||
1120 | break; | ||
1236 | } | 1121 | } |
1237 | 1122 | ||
1238 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1123 | /* |
1124 | * Verify the css and acquire a reference. The root | ||
1125 | * is provided by the caller, so we know it's alive | ||
1126 | * and kicking, and don't take an extra reference. | ||
1127 | */ | ||
1128 | memcg = mem_cgroup_from_css(css); | ||
1129 | |||
1130 | if (css == &root->css) | ||
1131 | break; | ||
1239 | 1132 | ||
1240 | if (reclaim) { | 1133 | if (css_tryget(css)) { |
1241 | mem_cgroup_iter_update(iter, last_visited, memcg, root, | 1134 | /* |
1242 | seq); | 1135 | * Make sure the memcg is initialized: |
1136 | * mem_cgroup_css_online() orders the | ||
1137 | * initialization against setting the flag. | ||
1138 | */ | ||
1139 | if (smp_load_acquire(&memcg->initialized)) | ||
1140 | break; | ||
1243 | 1141 | ||
1244 | if (!memcg) | 1142 | css_put(css); |
1245 | iter->generation++; | ||
1246 | else if (!prev && memcg) | ||
1247 | reclaim->generation = iter->generation; | ||
1248 | } | 1143 | } |
1249 | 1144 | ||
1250 | if (prev && !memcg) | 1145 | memcg = NULL; |
1251 | goto out_unlock; | 1146 | } |
1147 | |||
1148 | if (reclaim) { | ||
1149 | if (cmpxchg(&iter->position, pos, memcg) == pos) { | ||
1150 | if (memcg) | ||
1151 | css_get(&memcg->css); | ||
1152 | if (pos) | ||
1153 | css_put(&pos->css); | ||
1154 | } | ||
1155 | |||
1156 | /* | ||
1157 | * pairs with css_tryget when dereferencing iter->position | ||
1158 | * above. | ||
1159 | */ | ||
1160 | if (pos) | ||
1161 | css_put(&pos->css); | ||
1162 | |||
1163 | if (!memcg) | ||
1164 | iter->generation++; | ||
1165 | else if (!prev) | ||
1166 | reclaim->generation = iter->generation; | ||
1252 | } | 1167 | } |
1168 | |||
1253 | out_unlock: | 1169 | out_unlock: |
1254 | rcu_read_unlock(); | 1170 | rcu_read_unlock(); |
1255 | out_css_put: | 1171 | out: |
1256 | if (prev && prev != root) | 1172 | if (prev && prev != root) |
1257 | css_put(&prev->css); | 1173 | css_put(&prev->css); |
1258 | 1174 | ||
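
The hunk above replaces the dead_count/memory-barrier scheme (the removed mem_cgroup_iter_load()/mem_cgroup_iter_update() pair) with a shared iter->position that holds a css reference and is advanced with cmpxchg. Below is a minimal user-space sketch of just that publish step, using C11 atomics and a plain refcount; struct node, iter_advance and the refcount helpers are illustrative stand-ins, not kernel APIs, and the lookup-side css_tryget()/css_put() pairing is omitted.

/*
 * User-space sketch (not kernel code) of the shared-cursor update the
 * new mem_cgroup_iter() performs: publish the next position with a
 * compare-and-swap, and only the winner takes a reference on the new
 * position and drops the one pinning the displaced position.
 */
#include <stdatomic.h>
#include <stdio.h>

struct node {
        atomic_int refcount;
        int id;
};

struct shared_iter {
        _Atomic(struct node *) position;
};

static void node_get(struct node *n) { if (n) atomic_fetch_add(&n->refcount, 1); }
static void node_put(struct node *n) { if (n) atomic_fetch_sub(&n->refcount, 1); }

/* Advance the shared cursor from @old to @new, mirroring the cmpxchg above. */
static void iter_advance(struct shared_iter *iter, struct node *old, struct node *new)
{
        struct node *expected = old;

        if (atomic_compare_exchange_strong(&iter->position, &expected, new)) {
                node_get(new);   /* reference held on behalf of the iterator */
                node_put(old);   /* iterator no longer pins the old position */
        }
        /* If the exchange failed, someone else already moved the cursor. */
}

int main(void)
{
        struct node a = { .refcount = 1, .id = 1 }, b = { .refcount = 1, .id = 2 };
        struct shared_iter it = { .position = &a };

        iter_advance(&it, &a, &b);
        printf("cursor now at %d, refs a=%d b=%d\n",
               atomic_load(&it.position)->id,
               atomic_load(&a.refcount), atomic_load(&b.refcount));
        return 0;
}

Taking the new reference and dropping the displaced one only when the compare-and-swap wins is what lets concurrent reclaimers share one cursor without a lock.
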
@@ -1346,15 +1262,18 @@ out: | |||
1346 | } | 1262 | } |
1347 | 1263 | ||
1348 | /** | 1264 | /** |
1349 | * mem_cgroup_page_lruvec - return lruvec for adding an lru page | 1265 | * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page |
1350 | * @page: the page | 1266 | * @page: the page |
1351 | * @zone: zone of the page | 1267 | * @zone: zone of the page |
1268 | * | ||
1269 | * This function is only safe when following the LRU page isolation | ||
1270 | * and putback protocol: the LRU lock must be held, and the page must | ||
1271 | * either be PageLRU() or the caller must have isolated/allocated it. | ||
1352 | */ | 1272 | */ |
1353 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) | 1273 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) |
1354 | { | 1274 | { |
1355 | struct mem_cgroup_per_zone *mz; | 1275 | struct mem_cgroup_per_zone *mz; |
1356 | struct mem_cgroup *memcg; | 1276 | struct mem_cgroup *memcg; |
1357 | struct page_cgroup *pc; | ||
1358 | struct lruvec *lruvec; | 1277 | struct lruvec *lruvec; |
1359 | 1278 | ||
1360 | if (mem_cgroup_disabled()) { | 1279 | if (mem_cgroup_disabled()) { |
@@ -1362,20 +1281,13 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) | |||
1362 | goto out; | 1281 | goto out; |
1363 | } | 1282 | } |
1364 | 1283 | ||
1365 | pc = lookup_page_cgroup(page); | 1284 | memcg = page->mem_cgroup; |
1366 | memcg = pc->mem_cgroup; | ||
1367 | |||
1368 | /* | 1285 | /* |
1369 | * Surreptitiously switch any uncharged offlist page to root: | 1286 | * Swapcache readahead pages are added to the LRU - and |
1370 | * an uncharged page off lru does nothing to secure | 1287 | * possibly migrated - before they are charged. |
1371 | * its former mem_cgroup from sudden removal. | ||
1372 | * | ||
1373 | * Our caller holds lru_lock, and PageCgroupUsed is updated | ||
1374 | * under page_cgroup lock: between them, they make all uses | ||
1375 | * of pc->mem_cgroup safe. | ||
1376 | */ | 1288 | */ |
1377 | if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) | 1289 | if (!memcg) |
1378 | pc->mem_cgroup = memcg = root_mem_cgroup; | 1290 | memcg = root_mem_cgroup; |
1379 | 1291 | ||
1380 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 1292 | mz = mem_cgroup_page_zoneinfo(memcg, page); |
1381 | lruvec = &mz->lruvec; | 1293 | lruvec = &mz->lruvec; |
@@ -1414,41 +1326,24 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, | |||
1414 | VM_BUG_ON((long)(*lru_size) < 0); | 1326 | VM_BUG_ON((long)(*lru_size) < 0); |
1415 | } | 1327 | } |
1416 | 1328 | ||
1417 | /* | 1329 | bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) |
1418 | * Checks whether given mem is same or in the root_mem_cgroup's | ||
1419 | * hierarchy subtree | ||
1420 | */ | ||
1421 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1422 | struct mem_cgroup *memcg) | ||
1423 | { | 1330 | { |
1424 | if (root_memcg == memcg) | 1331 | if (root == memcg) |
1425 | return true; | 1332 | return true; |
1426 | if (!root_memcg->use_hierarchy || !memcg) | 1333 | if (!root->use_hierarchy) |
1427 | return false; | 1334 | return false; |
1428 | return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); | 1335 | return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); |
1429 | } | ||
1430 | |||
1431 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1432 | struct mem_cgroup *memcg) | ||
1433 | { | ||
1434 | bool ret; | ||
1435 | |||
1436 | rcu_read_lock(); | ||
1437 | ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); | ||
1438 | rcu_read_unlock(); | ||
1439 | return ret; | ||
1440 | } | 1336 | } |
1441 | 1337 | ||
1442 | bool task_in_mem_cgroup(struct task_struct *task, | 1338 | bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) |
1443 | const struct mem_cgroup *memcg) | ||
1444 | { | 1339 | { |
1445 | struct mem_cgroup *curr = NULL; | 1340 | struct mem_cgroup *task_memcg; |
1446 | struct task_struct *p; | 1341 | struct task_struct *p; |
1447 | bool ret; | 1342 | bool ret; |
1448 | 1343 | ||
1449 | p = find_lock_task_mm(task); | 1344 | p = find_lock_task_mm(task); |
1450 | if (p) { | 1345 | if (p) { |
1451 | curr = get_mem_cgroup_from_mm(p->mm); | 1346 | task_memcg = get_mem_cgroup_from_mm(p->mm); |
1452 | task_unlock(p); | 1347 | task_unlock(p); |
1453 | } else { | 1348 | } else { |
1454 | /* | 1349 | /* |
@@ -1457,19 +1352,12 @@ bool task_in_mem_cgroup(struct task_struct *task, | |||
1457 | * killed to prevent needlessly killing additional tasks. | 1352 | * killed to prevent needlessly killing additional tasks. |
1458 | */ | 1353 | */ |
1459 | rcu_read_lock(); | 1354 | rcu_read_lock(); |
1460 | curr = mem_cgroup_from_task(task); | 1355 | task_memcg = mem_cgroup_from_task(task); |
1461 | if (curr) | 1356 | css_get(&task_memcg->css); |
1462 | css_get(&curr->css); | ||
1463 | rcu_read_unlock(); | 1357 | rcu_read_unlock(); |
1464 | } | 1358 | } |
1465 | /* | 1359 | ret = mem_cgroup_is_descendant(task_memcg, memcg); |
1466 | * We should check use_hierarchy of "memcg" not "curr". Because checking | 1360 | css_put(&task_memcg->css); |
1467 | * use_hierarchy of "curr" here make this function true if hierarchy is | ||
1468 | * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* | ||
1469 | * hierarchy(even if use_hierarchy is disabled in "memcg"). | ||
1470 | */ | ||
1471 | ret = mem_cgroup_same_or_subtree(memcg, curr); | ||
1472 | css_put(&curr->css); | ||
1473 | return ret; | 1361 | return ret; |
1474 | } | 1362 | } |
1475 | 1363 | ||
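
mem_cgroup_is_descendant() collapses the old __mem_cgroup_same_or_subtree()/mem_cgroup_same_or_subtree() pair into a single ancestry test that defers to cgroup_is_descendant(). As a rough illustration only (not the cgroup core implementation), such an ancestry test can be sketched in user space as a walk up parent pointers; struct group and its fields are hypothetical.

/* Stand-alone sketch: is @g equal to @root or somewhere below it? */
#include <stdbool.h>
#include <stdio.h>

struct group {
        struct group *parent;
        const char *name;
};

static bool group_is_descendant(const struct group *g, const struct group *root)
{
        for (; g; g = g->parent)
                if (g == root)
                        return true;
        return false;
}

int main(void)
{
        struct group root = { NULL, "root" };
        struct group child = { &root, "child" };
        struct group grandchild = { &child, "grandchild" };

        printf("%d %d\n",
               group_is_descendant(&grandchild, &root),   /* 1 */
               group_is_descendant(&root, &grandchild));  /* 0 */
        return 0;
}
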
@@ -1492,7 +1380,7 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1492 | return inactive * inactive_ratio < active; | 1380 | return inactive * inactive_ratio < active; |
1493 | } | 1381 | } |
1494 | 1382 | ||
1495 | #define mem_cgroup_from_res_counter(counter, member) \ | 1383 | #define mem_cgroup_from_counter(counter, member) \ |
1496 | container_of(counter, struct mem_cgroup, member) | 1384 | container_of(counter, struct mem_cgroup, member) |
1497 | 1385 | ||
1498 | /** | 1386 | /** |
@@ -1504,12 +1392,23 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1504 | */ | 1392 | */ |
1505 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) | 1393 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) |
1506 | { | 1394 | { |
1507 | unsigned long long margin; | 1395 | unsigned long margin = 0; |
1396 | unsigned long count; | ||
1397 | unsigned long limit; | ||
1508 | 1398 | ||
1509 | margin = res_counter_margin(&memcg->res); | 1399 | count = page_counter_read(&memcg->memory); |
1510 | if (do_swap_account) | 1400 | limit = ACCESS_ONCE(memcg->memory.limit); |
1511 | margin = min(margin, res_counter_margin(&memcg->memsw)); | 1401 | if (count < limit) |
1512 | return margin >> PAGE_SHIFT; | 1402 | margin = limit - count; |
1403 | |||
1404 | if (do_swap_account) { | ||
1405 | count = page_counter_read(&memcg->memsw); | ||
1406 | limit = ACCESS_ONCE(memcg->memsw.limit); | ||
1407 | if (count <= limit) | ||
1408 | margin = min(margin, limit - count); | ||
1409 | } | ||
1410 | |||
1411 | return margin; | ||
1513 | } | 1412 | } |
1514 | 1413 | ||
1515 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) | 1414 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) |
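
The reworked mem_cgroup_margin() above derives the headroom directly from page_counter reads instead of res_counter_margin(), and returns pages rather than bytes. A stand-alone sketch of that arithmetic, with plain unsigned longs standing in for page_counter_read() and ->limit:

/* Margin = free memory pages, clamped by memory+swap when swap is accounted. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long margin(unsigned long mem_count, unsigned long mem_limit,
                            unsigned long memsw_count, unsigned long memsw_limit,
                            bool do_swap_account)
{
        unsigned long m = 0;

        if (mem_count < mem_limit)
                m = mem_limit - mem_count;

        if (do_swap_account && memsw_count <= memsw_limit) {
                unsigned long msw = memsw_limit - memsw_count;

                if (msw < m)
                        m = msw;
        }
        return m;
}

int main(void)
{
        /* 300 pages of headroom on memory, but only 100 on memory+swap. */
        printf("%lu\n", margin(700, 1000, 1900, 2000, true)); /* prints 100 */
        return 0;
}

With swap accounting enabled the smaller of the two headrooms wins, as the worked numbers in main() show.
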
@@ -1522,37 +1421,6 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
1522 | } | 1421 | } |
1523 | 1422 | ||
1524 | /* | 1423 | /* |
1525 | * memcg->moving_account is used for checking possibility that some thread is | ||
1526 | * calling move_account(). When a thread on CPU-A starts moving pages under | ||
1527 | * a memcg, other threads should check memcg->moving_account under | ||
1528 | * rcu_read_lock(), like this: | ||
1529 | * | ||
1530 | * CPU-A CPU-B | ||
1531 | * rcu_read_lock() | ||
1532 | * memcg->moving_account+1 if (memcg->mocing_account) | ||
1533 | * take heavy locks. | ||
1534 | * synchronize_rcu() update something. | ||
1535 | * rcu_read_unlock() | ||
1536 | * start move here. | ||
1537 | */ | ||
1538 | |||
1539 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | ||
1540 | { | ||
1541 | atomic_inc(&memcg->moving_account); | ||
1542 | synchronize_rcu(); | ||
1543 | } | ||
1544 | |||
1545 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) | ||
1546 | { | ||
1547 | /* | ||
1548 | * Now, mem_cgroup_clear_mc() may call this function with NULL. | ||
1549 | * We check NULL in callee rather than caller. | ||
1550 | */ | ||
1551 | if (memcg) | ||
1552 | atomic_dec(&memcg->moving_account); | ||
1553 | } | ||
1554 | |||
1555 | /* | ||
1556 | * A routine for checking "mem" is under move_account() or not. | 1424 | * A routine for checking "mem" is under move_account() or not. |
1557 | * | 1425 | * |
1558 | * Checking a cgroup is mc.from or mc.to or under hierarchy of | 1426 | * Checking a cgroup is mc.from or mc.to or under hierarchy of |
@@ -1574,8 +1442,8 @@ static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | |||
1574 | if (!from) | 1442 | if (!from) |
1575 | goto unlock; | 1443 | goto unlock; |
1576 | 1444 | ||
1577 | ret = mem_cgroup_same_or_subtree(memcg, from) | 1445 | ret = mem_cgroup_is_descendant(from, memcg) || |
1578 | || mem_cgroup_same_or_subtree(memcg, to); | 1446 | mem_cgroup_is_descendant(to, memcg); |
1579 | unlock: | 1447 | unlock: |
1580 | spin_unlock(&mc.lock); | 1448 | spin_unlock(&mc.lock); |
1581 | return ret; | 1449 | return ret; |
@@ -1597,23 +1465,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1597 | return false; | 1465 | return false; |
1598 | } | 1466 | } |
1599 | 1467 | ||
1600 | /* | ||
1601 | * Take this lock when | ||
1602 | * - a code tries to modify page's memcg while it's USED. | ||
1603 | * - a code tries to modify page state accounting in a memcg. | ||
1604 | */ | ||
1605 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | ||
1606 | unsigned long *flags) | ||
1607 | { | ||
1608 | spin_lock_irqsave(&memcg->move_lock, *flags); | ||
1609 | } | ||
1610 | |||
1611 | static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | ||
1612 | unsigned long *flags) | ||
1613 | { | ||
1614 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | ||
1615 | } | ||
1616 | |||
1617 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1468 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
1618 | /** | 1469 | /** |
1619 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. | 1470 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. |
@@ -1644,18 +1495,15 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1644 | 1495 | ||
1645 | rcu_read_unlock(); | 1496 | rcu_read_unlock(); |
1646 | 1497 | ||
1647 | pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", | 1498 | pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", |
1648 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, | 1499 | K((u64)page_counter_read(&memcg->memory)), |
1649 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, | 1500 | K((u64)memcg->memory.limit), memcg->memory.failcnt); |
1650 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); | 1501 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", |
1651 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", | 1502 | K((u64)page_counter_read(&memcg->memsw)), |
1652 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1503 | K((u64)memcg->memsw.limit), memcg->memsw.failcnt); |
1653 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1504 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", |
1654 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1505 | K((u64)page_counter_read(&memcg->kmem)), |
1655 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", | 1506 | K((u64)memcg->kmem.limit), memcg->kmem.failcnt); |
1656 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | ||
1657 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | ||
1658 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | ||
1659 | 1507 | ||
1660 | for_each_mem_cgroup_tree(iter, memcg) { | 1508 | for_each_mem_cgroup_tree(iter, memcg) { |
1661 | pr_info("Memory cgroup stats for "); | 1509 | pr_info("Memory cgroup stats for "); |
@@ -1695,28 +1543,17 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1695 | /* | 1543 | /* |
1696 | * Return the memory (and swap, if configured) limit for a memcg. | 1544 | * Return the memory (and swap, if configured) limit for a memcg. |
1697 | */ | 1545 | */ |
1698 | static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1546 | static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1699 | { | 1547 | { |
1700 | u64 limit; | 1548 | unsigned long limit; |
1701 | |||
1702 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
1703 | 1549 | ||
1704 | /* | 1550 | limit = memcg->memory.limit; |
1705 | * Do not consider swap space if we cannot swap due to swappiness | ||
1706 | */ | ||
1707 | if (mem_cgroup_swappiness(memcg)) { | 1551 | if (mem_cgroup_swappiness(memcg)) { |
1708 | u64 memsw; | 1552 | unsigned long memsw_limit; |
1709 | 1553 | ||
1710 | limit += total_swap_pages << PAGE_SHIFT; | 1554 | memsw_limit = memcg->memsw.limit; |
1711 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 1555 | limit = min(limit + total_swap_pages, memsw_limit); |
1712 | |||
1713 | /* | ||
1714 | * If memsw is finite and limits the amount of swap space | ||
1715 | * available to this memcg, return that limit. | ||
1716 | */ | ||
1717 | limit = min(limit, memsw); | ||
1718 | } | 1556 | } |
1719 | |||
1720 | return limit; | 1557 | return limit; |
1721 | } | 1558 | } |
1722 | 1559 | ||
@@ -1740,7 +1577,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1740 | } | 1577 | } |
1741 | 1578 | ||
1742 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 1579 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
1743 | totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | 1580 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; |
1744 | for_each_mem_cgroup_tree(iter, memcg) { | 1581 | for_each_mem_cgroup_tree(iter, memcg) { |
1745 | struct css_task_iter it; | 1582 | struct css_task_iter it; |
1746 | struct task_struct *task; | 1583 | struct task_struct *task; |
@@ -1880,52 +1717,11 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | |||
1880 | memcg->last_scanned_node = node; | 1717 | memcg->last_scanned_node = node; |
1881 | return node; | 1718 | return node; |
1882 | } | 1719 | } |
1883 | |||
1884 | /* | ||
1885 | * Check all nodes whether it contains reclaimable pages or not. | ||
1886 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1887 | * unused nodes. But scan_nodes is lazily updated and may not cotain | ||
1888 | * enough new information. We need to do double check. | ||
1889 | */ | ||
1890 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1891 | { | ||
1892 | int nid; | ||
1893 | |||
1894 | /* | ||
1895 | * quick check...making use of scan_node. | ||
1896 | * We can skip unused nodes. | ||
1897 | */ | ||
1898 | if (!nodes_empty(memcg->scan_nodes)) { | ||
1899 | for (nid = first_node(memcg->scan_nodes); | ||
1900 | nid < MAX_NUMNODES; | ||
1901 | nid = next_node(nid, memcg->scan_nodes)) { | ||
1902 | |||
1903 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1904 | return true; | ||
1905 | } | ||
1906 | } | ||
1907 | /* | ||
1908 | * Check rest of nodes. | ||
1909 | */ | ||
1910 | for_each_node_state(nid, N_MEMORY) { | ||
1911 | if (node_isset(nid, memcg->scan_nodes)) | ||
1912 | continue; | ||
1913 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | ||
1914 | return true; | ||
1915 | } | ||
1916 | return false; | ||
1917 | } | ||
1918 | |||
1919 | #else | 1720 | #else |
1920 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1721 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1921 | { | 1722 | { |
1922 | return 0; | 1723 | return 0; |
1923 | } | 1724 | } |
1924 | |||
1925 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | ||
1926 | { | ||
1927 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | ||
1928 | } | ||
1929 | #endif | 1725 | #endif |
1930 | 1726 | ||
1931 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 1727 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
@@ -1943,7 +1739,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | |||
1943 | .priority = 0, | 1739 | .priority = 0, |
1944 | }; | 1740 | }; |
1945 | 1741 | ||
1946 | excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; | 1742 | excess = soft_limit_excess(root_memcg); |
1947 | 1743 | ||
1948 | while (1) { | 1744 | while (1) { |
1949 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | 1745 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); |
@@ -1969,12 +1765,10 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | |||
1969 | } | 1765 | } |
1970 | continue; | 1766 | continue; |
1971 | } | 1767 | } |
1972 | if (!mem_cgroup_reclaimable(victim, false)) | ||
1973 | continue; | ||
1974 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | 1768 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, |
1975 | zone, &nr_scanned); | 1769 | zone, &nr_scanned); |
1976 | *total_scanned += nr_scanned; | 1770 | *total_scanned += nr_scanned; |
1977 | if (!res_counter_soft_limit_excess(&root_memcg->res)) | 1771 | if (!soft_limit_excess(root_memcg)) |
1978 | break; | 1772 | break; |
1979 | } | 1773 | } |
1980 | mem_cgroup_iter_break(root_memcg, victim); | 1774 | mem_cgroup_iter_break(root_memcg, victim); |
@@ -2081,12 +1875,8 @@ static int memcg_oom_wake_function(wait_queue_t *wait, | |||
2081 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1875 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
2082 | oom_wait_memcg = oom_wait_info->memcg; | 1876 | oom_wait_memcg = oom_wait_info->memcg; |
2083 | 1877 | ||
2084 | /* | 1878 | if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && |
2085 | * Both of oom_wait_info->memcg and wake_memcg are stable under us. | 1879 | !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) |
2086 | * Then we can use css_is_ancestor without taking care of RCU. | ||
2087 | */ | ||
2088 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) | ||
2089 | && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) | ||
2090 | return 0; | 1880 | return 0; |
2091 | return autoremove_wake_function(wait, mode, sync, arg); | 1881 | return autoremove_wake_function(wait, mode, sync, arg); |
2092 | } | 1882 | } |
@@ -2228,26 +2018,23 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | |||
2228 | unsigned long *flags) | 2018 | unsigned long *flags) |
2229 | { | 2019 | { |
2230 | struct mem_cgroup *memcg; | 2020 | struct mem_cgroup *memcg; |
2231 | struct page_cgroup *pc; | ||
2232 | 2021 | ||
2233 | rcu_read_lock(); | 2022 | rcu_read_lock(); |
2234 | 2023 | ||
2235 | if (mem_cgroup_disabled()) | 2024 | if (mem_cgroup_disabled()) |
2236 | return NULL; | 2025 | return NULL; |
2237 | |||
2238 | pc = lookup_page_cgroup(page); | ||
2239 | again: | 2026 | again: |
2240 | memcg = pc->mem_cgroup; | 2027 | memcg = page->mem_cgroup; |
2241 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 2028 | if (unlikely(!memcg)) |
2242 | return NULL; | 2029 | return NULL; |
2243 | 2030 | ||
2244 | *locked = false; | 2031 | *locked = false; |
2245 | if (atomic_read(&memcg->moving_account) <= 0) | 2032 | if (atomic_read(&memcg->moving_account) <= 0) |
2246 | return memcg; | 2033 | return memcg; |
2247 | 2034 | ||
2248 | move_lock_mem_cgroup(memcg, flags); | 2035 | spin_lock_irqsave(&memcg->move_lock, *flags); |
2249 | if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { | 2036 | if (memcg != page->mem_cgroup) { |
2250 | move_unlock_mem_cgroup(memcg, flags); | 2037 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
2251 | goto again; | 2038 | goto again; |
2252 | } | 2039 | } |
2253 | *locked = true; | 2040 | *locked = true; |
@@ -2261,11 +2048,11 @@ again: | |||
2261 | * @locked: value received from mem_cgroup_begin_page_stat() | 2048 | * @locked: value received from mem_cgroup_begin_page_stat() |
2262 | * @flags: value received from mem_cgroup_begin_page_stat() | 2049 | * @flags: value received from mem_cgroup_begin_page_stat() |
2263 | */ | 2050 | */ |
2264 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, | 2051 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, |
2265 | unsigned long flags) | 2052 | unsigned long *flags) |
2266 | { | 2053 | { |
2267 | if (memcg && locked) | 2054 | if (memcg && *locked) |
2268 | move_unlock_mem_cgroup(memcg, &flags); | 2055 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
2269 | 2056 | ||
2270 | rcu_read_unlock(); | 2057 | rcu_read_unlock(); |
2271 | } | 2058 | } |
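
With the move_lock helpers gone, mem_cgroup_begin_page_stat() open-codes spin_lock_irqsave() on ->move_lock and revalidates page->mem_cgroup after acquiring it. The following user-space sketch shows only that optimistic-read / lock / recheck / retry shape: a pthread mutex stands in for the irq-safe spinlock, the RCU section and the moving_account fast path are left out, and all names are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct owner {
        pthread_mutex_t move_lock;
        const char *name;
};

struct object {
        _Atomic(struct owner *) owner;
};

/* Returns with @owner->move_lock held and the ownership of @obj stable. */
static struct owner *begin_stat(struct object *obj)
{
        struct owner *o;

again:
        o = atomic_load(&obj->owner);
        if (!o)
                return NULL;

        pthread_mutex_lock(&o->move_lock);
        if (o != atomic_load(&obj->owner)) {
                /* lost a race with a move; drop the stale lock and retry */
                pthread_mutex_unlock(&o->move_lock);
                goto again;
        }
        return o;
}

static void end_stat(struct owner *o)
{
        if (o)
                pthread_mutex_unlock(&o->move_lock);
}

int main(void)
{
        struct owner a = { PTHREAD_MUTEX_INITIALIZER, "A" };
        struct object obj = { .owner = &a };
        struct owner *o = begin_stat(&obj);

        printf("stats stable against owner %s\n", o ? o->name : "none");
        end_stat(o);
        return 0;
}
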
@@ -2316,33 +2103,32 @@ static DEFINE_MUTEX(percpu_charge_mutex); | |||
2316 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 2103 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2317 | { | 2104 | { |
2318 | struct memcg_stock_pcp *stock; | 2105 | struct memcg_stock_pcp *stock; |
2319 | bool ret = true; | 2106 | bool ret = false; |
2320 | 2107 | ||
2321 | if (nr_pages > CHARGE_BATCH) | 2108 | if (nr_pages > CHARGE_BATCH) |
2322 | return false; | 2109 | return ret; |
2323 | 2110 | ||
2324 | stock = &get_cpu_var(memcg_stock); | 2111 | stock = &get_cpu_var(memcg_stock); |
2325 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) | 2112 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) { |
2326 | stock->nr_pages -= nr_pages; | 2113 | stock->nr_pages -= nr_pages; |
2327 | else /* need to call res_counter_charge */ | 2114 | ret = true; |
2328 | ret = false; | 2115 | } |
2329 | put_cpu_var(memcg_stock); | 2116 | put_cpu_var(memcg_stock); |
2330 | return ret; | 2117 | return ret; |
2331 | } | 2118 | } |
2332 | 2119 | ||
2333 | /* | 2120 | /* |
2334 | * Returns stocks cached in percpu to res_counter and reset cached information. | 2121 | * Returns stocks cached in percpu and reset cached information. |
2335 | */ | 2122 | */ |
2336 | static void drain_stock(struct memcg_stock_pcp *stock) | 2123 | static void drain_stock(struct memcg_stock_pcp *stock) |
2337 | { | 2124 | { |
2338 | struct mem_cgroup *old = stock->cached; | 2125 | struct mem_cgroup *old = stock->cached; |
2339 | 2126 | ||
2340 | if (stock->nr_pages) { | 2127 | if (stock->nr_pages) { |
2341 | unsigned long bytes = stock->nr_pages * PAGE_SIZE; | 2128 | page_counter_uncharge(&old->memory, stock->nr_pages); |
2342 | |||
2343 | res_counter_uncharge(&old->res, bytes); | ||
2344 | if (do_swap_account) | 2129 | if (do_swap_account) |
2345 | res_counter_uncharge(&old->memsw, bytes); | 2130 | page_counter_uncharge(&old->memsw, stock->nr_pages); |
2131 | css_put_many(&old->css, stock->nr_pages); | ||
2346 | stock->nr_pages = 0; | 2132 | stock->nr_pages = 0; |
2347 | } | 2133 | } |
2348 | stock->cached = NULL; | 2134 | stock->cached = NULL; |
@@ -2371,7 +2157,7 @@ static void __init memcg_stock_init(void) | |||
2371 | } | 2157 | } |
2372 | 2158 | ||
2373 | /* | 2159 | /* |
2374 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 2160 | * Cache charges(val) to local per_cpu area. |
2375 | * This will be consumed by consume_stock() function, later. | 2161 | * This will be consumed by consume_stock() function, later. |
2376 | */ | 2162 | */ |
2377 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 2163 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
@@ -2388,13 +2174,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
2388 | 2174 | ||
2389 | /* | 2175 | /* |
2390 | * Drains all per-CPU charge caches for given root_memcg resp. subtree | 2176 | * Drains all per-CPU charge caches for given root_memcg resp. subtree |
2391 | * of the hierarchy under it. sync flag says whether we should block | 2177 | * of the hierarchy under it. |
2392 | * until the work is done. | ||
2393 | */ | 2178 | */ |
2394 | static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | 2179 | static void drain_all_stock(struct mem_cgroup *root_memcg) |
2395 | { | 2180 | { |
2396 | int cpu, curcpu; | 2181 | int cpu, curcpu; |
2397 | 2182 | ||
2183 | /* If someone's already draining, avoid adding more workers. */ | ||
2184 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2185 | return; | ||
2398 | /* Notify other cpus that system-wide "drain" is running */ | 2186 | /* Notify other cpus that system-wide "drain" is running */ |
2399 | get_online_cpus(); | 2187 | get_online_cpus(); |
2400 | curcpu = get_cpu(); | 2188 | curcpu = get_cpu(); |
@@ -2405,7 +2193,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | |||
2405 | memcg = stock->cached; | 2193 | memcg = stock->cached; |
2406 | if (!memcg || !stock->nr_pages) | 2194 | if (!memcg || !stock->nr_pages) |
2407 | continue; | 2195 | continue; |
2408 | if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) | 2196 | if (!mem_cgroup_is_descendant(memcg, root_memcg)) |
2409 | continue; | 2197 | continue; |
2410 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { | 2198 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2411 | if (cpu == curcpu) | 2199 | if (cpu == curcpu) |
@@ -2415,42 +2203,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | |||
2415 | } | 2203 | } |
2416 | } | 2204 | } |
2417 | put_cpu(); | 2205 | put_cpu(); |
2418 | |||
2419 | if (!sync) | ||
2420 | goto out; | ||
2421 | |||
2422 | for_each_online_cpu(cpu) { | ||
2423 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
2424 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2425 | flush_work(&stock->work); | ||
2426 | } | ||
2427 | out: | ||
2428 | put_online_cpus(); | 2206 | put_online_cpus(); |
2429 | } | ||
2430 | |||
2431 | /* | ||
2432 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
2433 | * and just put a work per cpu for draining localy on each cpu. Caller can | ||
2434 | * expects some charges will be back to res_counter later but cannot wait for | ||
2435 | * it. | ||
2436 | */ | ||
2437 | static void drain_all_stock_async(struct mem_cgroup *root_memcg) | ||
2438 | { | ||
2439 | /* | ||
2440 | * If someone calls draining, avoid adding more kworker runs. | ||
2441 | */ | ||
2442 | if (!mutex_trylock(&percpu_charge_mutex)) | ||
2443 | return; | ||
2444 | drain_all_stock(root_memcg, false); | ||
2445 | mutex_unlock(&percpu_charge_mutex); | ||
2446 | } | ||
2447 | |||
2448 | /* This is a synchronous drain interface. */ | ||
2449 | static void drain_all_stock_sync(struct mem_cgroup *root_memcg) | ||
2450 | { | ||
2451 | /* called when force_empty is called */ | ||
2452 | mutex_lock(&percpu_charge_mutex); | ||
2453 | drain_all_stock(root_memcg, true); | ||
2454 | mutex_unlock(&percpu_charge_mutex); | 2207 | mutex_unlock(&percpu_charge_mutex); |
2455 | } | 2208 | } |
2456 | 2209 | ||
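
The stock machinery keeps its shape but now talks to page counters, and the separate async/sync drain entry points collapse into drain_all_stock() guarded by mutex_trylock(). A single-threaded sketch of the consume/refill/drain mechanics, with per-CPU variables, workqueues and locking omitted and a plain counter standing in for the page counter:

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32UL

struct group { unsigned long charged; };

struct stock {
        struct group *cached;
        unsigned long nr_pages;
};

static struct stock stock;   /* stands in for the per-CPU variable */

/* Fast path: take pages from the cache if it is filled for the same group. */
static bool consume_stock(struct group *g, unsigned long nr_pages)
{
        bool ret = false;

        if (nr_pages > CHARGE_BATCH)
                return ret;
        if (g == stock.cached && stock.nr_pages >= nr_pages) {
                stock.nr_pages -= nr_pages;
                ret = true;
        }
        return ret;
}

/* Return whatever is cached to the counter and reset the cache. */
static void drain_stock(void)
{
        if (stock.nr_pages) {
                stock.cached->charged -= stock.nr_pages;
                stock.nr_pages = 0;
        }
        stock.cached = NULL;
}

/* Park leftover batch charges for later fast-path consumption. */
static void refill_stock(struct group *g, unsigned long nr_pages)
{
        if (stock.cached != g) {
                drain_stock();
                stock.cached = g;
        }
        stock.nr_pages += nr_pages;
}

int main(void)
{
        struct group g = { .charged = CHARGE_BATCH };  /* batch already charged */

        refill_stock(&g, CHARGE_BATCH);
        printf("fast path: %d, left in stock: %lu\n",
               consume_stock(&g, 8), stock.nr_pages);       /* 1, 24 */
        drain_stock();
        printf("after drain: charged=%lu\n", g.charged);    /* 8 */
        return 0;
}
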
@@ -2506,9 +2259,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2506 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 2259 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2507 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2260 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2508 | struct mem_cgroup *mem_over_limit; | 2261 | struct mem_cgroup *mem_over_limit; |
2509 | struct res_counter *fail_res; | 2262 | struct page_counter *counter; |
2510 | unsigned long nr_reclaimed; | 2263 | unsigned long nr_reclaimed; |
2511 | unsigned long long size; | ||
2512 | bool may_swap = true; | 2264 | bool may_swap = true; |
2513 | bool drained = false; | 2265 | bool drained = false; |
2514 | int ret = 0; | 2266 | int ret = 0; |
@@ -2519,16 +2271,15 @@ retry: | |||
2519 | if (consume_stock(memcg, nr_pages)) | 2271 | if (consume_stock(memcg, nr_pages)) |
2520 | goto done; | 2272 | goto done; |
2521 | 2273 | ||
2522 | size = batch * PAGE_SIZE; | ||
2523 | if (!do_swap_account || | 2274 | if (!do_swap_account || |
2524 | !res_counter_charge(&memcg->memsw, size, &fail_res)) { | 2275 | !page_counter_try_charge(&memcg->memsw, batch, &counter)) { |
2525 | if (!res_counter_charge(&memcg->res, size, &fail_res)) | 2276 | if (!page_counter_try_charge(&memcg->memory, batch, &counter)) |
2526 | goto done_restock; | 2277 | goto done_restock; |
2527 | if (do_swap_account) | 2278 | if (do_swap_account) |
2528 | res_counter_uncharge(&memcg->memsw, size); | 2279 | page_counter_uncharge(&memcg->memsw, batch); |
2529 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2280 | mem_over_limit = mem_cgroup_from_counter(counter, memory); |
2530 | } else { | 2281 | } else { |
2531 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 2282 | mem_over_limit = mem_cgroup_from_counter(counter, memsw); |
2532 | may_swap = false; | 2283 | may_swap = false; |
2533 | } | 2284 | } |
2534 | 2285 | ||
@@ -2561,7 +2312,7 @@ retry: | |||
2561 | goto retry; | 2312 | goto retry; |
2562 | 2313 | ||
2563 | if (!drained) { | 2314 | if (!drained) { |
2564 | drain_all_stock_async(mem_over_limit); | 2315 | drain_all_stock(mem_over_limit); |
2565 | drained = true; | 2316 | drained = true; |
2566 | goto retry; | 2317 | goto retry; |
2567 | } | 2318 | } |
@@ -2603,6 +2354,7 @@ bypass: | |||
2603 | return -EINTR; | 2354 | return -EINTR; |
2604 | 2355 | ||
2605 | done_restock: | 2356 | done_restock: |
2357 | css_get_many(&memcg->css, batch); | ||
2606 | if (batch > nr_pages) | 2358 | if (batch > nr_pages) |
2607 | refill_stock(memcg, batch - nr_pages); | 2359 | refill_stock(memcg, batch - nr_pages); |
2608 | done: | 2360 | done: |
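
In try_charge() the two res_counter_charge() calls become page_counter_try_charge() on memsw and then memory: memory+swap is charged first, and if the plain memory counter is the one that refuses, the memsw charge is rolled back and reclaim may still swap; if memsw itself is full, swapping cannot help and may_swap is cleared. A stand-alone sketch of that ordering follows; note it returns true on success, whereas page_counter_try_charge() returns 0, and the counter structs are simplified stand-ins.

#include <stdbool.h>
#include <stdio.h>

struct counter { unsigned long count, limit; };

static bool try_charge_counter(struct counter *c, unsigned long pages)
{
        if (c->count + pages > c->limit)
                return false;
        c->count += pages;
        return true;
}

static void uncharge_counter(struct counter *c, unsigned long pages)
{
        c->count -= pages;
}

/* Returns 0 on success; on failure *may_swap says whether swapping can help. */
static int charge(struct counter *memory, struct counter *memsw,
                  unsigned long pages, bool do_swap_account, bool *may_swap)
{
        *may_swap = true;

        if (do_swap_account && !try_charge_counter(memsw, pages)) {
                *may_swap = false;      /* memory+swap is full: swap won't help */
                return -1;
        }
        if (!try_charge_counter(memory, pages)) {
                if (do_swap_account)
                        uncharge_counter(memsw, pages);
                return -1;              /* memory is full: reclaim, may swap */
        }
        return 0;
}

int main(void)
{
        struct counter memory = { 90, 100 }, memsw = { 90, 200 };
        bool may_swap;
        int ret = charge(&memory, &memsw, 32, true, &may_swap);

        printf("ret=%d may_swap=%d memsw.count=%lu\n",
               ret, may_swap, memsw.count);   /* ret=-1 may_swap=1 memsw.count=90 */
        return 0;
}
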
@@ -2611,32 +2363,14 @@ done: | |||
2611 | 2363 | ||
2612 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | 2364 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) |
2613 | { | 2365 | { |
2614 | unsigned long bytes = nr_pages * PAGE_SIZE; | ||
2615 | |||
2616 | if (mem_cgroup_is_root(memcg)) | 2366 | if (mem_cgroup_is_root(memcg)) |
2617 | return; | 2367 | return; |
2618 | 2368 | ||
2619 | res_counter_uncharge(&memcg->res, bytes); | 2369 | page_counter_uncharge(&memcg->memory, nr_pages); |
2620 | if (do_swap_account) | 2370 | if (do_swap_account) |
2621 | res_counter_uncharge(&memcg->memsw, bytes); | 2371 | page_counter_uncharge(&memcg->memsw, nr_pages); |
2622 | } | ||
2623 | |||
2624 | /* | ||
2625 | * Cancel chrages in this cgroup....doesn't propagate to parent cgroup. | ||
2626 | * This is useful when moving usage to parent cgroup. | ||
2627 | */ | ||
2628 | static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, | ||
2629 | unsigned int nr_pages) | ||
2630 | { | ||
2631 | unsigned long bytes = nr_pages * PAGE_SIZE; | ||
2632 | |||
2633 | if (mem_cgroup_is_root(memcg)) | ||
2634 | return; | ||
2635 | 2372 | ||
2636 | res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); | 2373 | css_put_many(&memcg->css, nr_pages); |
2637 | if (do_swap_account) | ||
2638 | res_counter_uncharge_until(&memcg->memsw, | ||
2639 | memcg->memsw.parent, bytes); | ||
2640 | } | 2374 | } |
2641 | 2375 | ||
2642 | /* | 2376 | /* |
@@ -2665,17 +2399,15 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2665 | */ | 2399 | */ |
2666 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2400 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
2667 | { | 2401 | { |
2668 | struct mem_cgroup *memcg = NULL; | 2402 | struct mem_cgroup *memcg; |
2669 | struct page_cgroup *pc; | ||
2670 | unsigned short id; | 2403 | unsigned short id; |
2671 | swp_entry_t ent; | 2404 | swp_entry_t ent; |
2672 | 2405 | ||
2673 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 2406 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2674 | 2407 | ||
2675 | pc = lookup_page_cgroup(page); | 2408 | memcg = page->mem_cgroup; |
2676 | if (PageCgroupUsed(pc)) { | 2409 | if (memcg) { |
2677 | memcg = pc->mem_cgroup; | 2410 | if (!css_tryget_online(&memcg->css)) |
2678 | if (memcg && !css_tryget_online(&memcg->css)) | ||
2679 | memcg = NULL; | 2411 | memcg = NULL; |
2680 | } else if (PageSwapCache(page)) { | 2412 | } else if (PageSwapCache(page)) { |
2681 | ent.val = page_private(page); | 2413 | ent.val = page_private(page); |
@@ -2723,14 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated) | |||
2723 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, | 2455 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, |
2724 | bool lrucare) | 2456 | bool lrucare) |
2725 | { | 2457 | { |
2726 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
2727 | int isolated; | 2458 | int isolated; |
2728 | 2459 | ||
2729 | VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); | 2460 | VM_BUG_ON_PAGE(page->mem_cgroup, page); |
2730 | /* | ||
2731 | * we don't need page_cgroup_lock about tail pages, becase they are not | ||
2732 | * accessed by any other context at this point. | ||
2733 | */ | ||
2734 | 2461 | ||
2735 | /* | 2462 | /* |
2736 | * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page | 2463 | * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page |
@@ -2741,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
2741 | 2468 | ||
2742 | /* | 2469 | /* |
2743 | * Nobody should be changing or seriously looking at | 2470 | * Nobody should be changing or seriously looking at |
2744 | * pc->mem_cgroup and pc->flags at this point: | 2471 | * page->mem_cgroup at this point: |
2745 | * | 2472 | * |
2746 | * - the page is uncharged | 2473 | * - the page is uncharged |
2747 | * | 2474 | * |
@@ -2753,15 +2480,12 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
2753 | * - a page cache insertion, a swapin fault, or a migration | 2480 | * - a page cache insertion, a swapin fault, or a migration |
2754 | * have the page locked | 2481 | * have the page locked |
2755 | */ | 2482 | */ |
2756 | pc->mem_cgroup = memcg; | 2483 | page->mem_cgroup = memcg; |
2757 | pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); | ||
2758 | 2484 | ||
2759 | if (lrucare) | 2485 | if (lrucare) |
2760 | unlock_page_lru(page, isolated); | 2486 | unlock_page_lru(page, isolated); |
2761 | } | 2487 | } |
2762 | 2488 | ||
2763 | static DEFINE_MUTEX(set_limit_mutex); | ||
2764 | |||
2765 | #ifdef CONFIG_MEMCG_KMEM | 2489 | #ifdef CONFIG_MEMCG_KMEM |
2766 | /* | 2490 | /* |
2767 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | 2491 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or |
@@ -2769,8 +2493,6 @@ static DEFINE_MUTEX(set_limit_mutex); | |||
2769 | */ | 2493 | */ |
2770 | static DEFINE_MUTEX(memcg_slab_mutex); | 2494 | static DEFINE_MUTEX(memcg_slab_mutex); |
2771 | 2495 | ||
2772 | static DEFINE_MUTEX(activate_kmem_mutex); | ||
2773 | |||
2774 | /* | 2496 | /* |
2775 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | 2497 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer |
2776 | * in the memcg_cache_params struct. | 2498 | * in the memcg_cache_params struct. |
@@ -2784,36 +2506,17 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | |||
2784 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); | 2506 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); |
2785 | } | 2507 | } |
2786 | 2508 | ||
2787 | #ifdef CONFIG_SLABINFO | 2509 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, |
2788 | static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | 2510 | unsigned long nr_pages) |
2789 | { | ||
2790 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
2791 | struct memcg_cache_params *params; | ||
2792 | |||
2793 | if (!memcg_kmem_is_active(memcg)) | ||
2794 | return -EIO; | ||
2795 | |||
2796 | print_slabinfo_header(m); | ||
2797 | |||
2798 | mutex_lock(&memcg_slab_mutex); | ||
2799 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | ||
2800 | cache_show(memcg_params_to_cache(params), m); | ||
2801 | mutex_unlock(&memcg_slab_mutex); | ||
2802 | |||
2803 | return 0; | ||
2804 | } | ||
2805 | #endif | ||
2806 | |||
2807 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | ||
2808 | { | 2511 | { |
2809 | struct res_counter *fail_res; | 2512 | struct page_counter *counter; |
2810 | int ret = 0; | 2513 | int ret = 0; |
2811 | 2514 | ||
2812 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | 2515 | ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); |
2813 | if (ret) | 2516 | if (ret < 0) |
2814 | return ret; | 2517 | return ret; |
2815 | 2518 | ||
2816 | ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); | 2519 | ret = try_charge(memcg, gfp, nr_pages); |
2817 | if (ret == -EINTR) { | 2520 | if (ret == -EINTR) { |
2818 | /* | 2521 | /* |
2819 | * try_charge() chose to bypass to root due to OOM kill or | 2522 | * try_charge() chose to bypass to root due to OOM kill or |
@@ -2830,37 +2533,27 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | |||
2830 | * when the allocation triggers should have been already | 2533 | * when the allocation triggers should have been already |
2831 | * directed to the root cgroup in memcontrol.h | 2534 | * directed to the root cgroup in memcontrol.h |
2832 | */ | 2535 | */ |
2833 | res_counter_charge_nofail(&memcg->res, size, &fail_res); | 2536 | page_counter_charge(&memcg->memory, nr_pages); |
2834 | if (do_swap_account) | 2537 | if (do_swap_account) |
2835 | res_counter_charge_nofail(&memcg->memsw, size, | 2538 | page_counter_charge(&memcg->memsw, nr_pages); |
2836 | &fail_res); | 2539 | css_get_many(&memcg->css, nr_pages); |
2837 | ret = 0; | 2540 | ret = 0; |
2838 | } else if (ret) | 2541 | } else if (ret) |
2839 | res_counter_uncharge(&memcg->kmem, size); | 2542 | page_counter_uncharge(&memcg->kmem, nr_pages); |
2840 | 2543 | ||
2841 | return ret; | 2544 | return ret; |
2842 | } | 2545 | } |
2843 | 2546 | ||
2844 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | 2547 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, |
2548 | unsigned long nr_pages) | ||
2845 | { | 2549 | { |
2846 | res_counter_uncharge(&memcg->res, size); | 2550 | page_counter_uncharge(&memcg->memory, nr_pages); |
2847 | if (do_swap_account) | 2551 | if (do_swap_account) |
2848 | res_counter_uncharge(&memcg->memsw, size); | 2552 | page_counter_uncharge(&memcg->memsw, nr_pages); |
2849 | 2553 | ||
2850 | /* Not down to 0 */ | 2554 | page_counter_uncharge(&memcg->kmem, nr_pages); |
2851 | if (res_counter_uncharge(&memcg->kmem, size)) | ||
2852 | return; | ||
2853 | 2555 | ||
2854 | /* | 2556 | css_put_many(&memcg->css, nr_pages); |
2855 | * Releases a reference taken in kmem_cgroup_css_offline in case | ||
2856 | * this last uncharge is racing with the offlining code or it is | ||
2857 | * outliving the memcg existence. | ||
2858 | * | ||
2859 | * The memory barrier imposed by test&clear is paired with the | ||
2860 | * explicit one in memcg_kmem_mark_dead(). | ||
2861 | */ | ||
2862 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
2863 | css_put(&memcg->css); | ||
2864 | } | 2557 | } |
2865 | 2558 | ||
2866 | /* | 2559 | /* |
@@ -3124,19 +2817,21 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
3124 | 2817 | ||
3125 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | 2818 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) |
3126 | { | 2819 | { |
2820 | unsigned int nr_pages = 1 << order; | ||
3127 | int res; | 2821 | int res; |
3128 | 2822 | ||
3129 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, | 2823 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); |
3130 | PAGE_SIZE << order); | ||
3131 | if (!res) | 2824 | if (!res) |
3132 | atomic_add(1 << order, &cachep->memcg_params->nr_pages); | 2825 | atomic_add(nr_pages, &cachep->memcg_params->nr_pages); |
3133 | return res; | 2826 | return res; |
3134 | } | 2827 | } |
3135 | 2828 | ||
3136 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | 2829 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) |
3137 | { | 2830 | { |
3138 | memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); | 2831 | unsigned int nr_pages = 1 << order; |
3139 | atomic_sub(1 << order, &cachep->memcg_params->nr_pages); | 2832 | |
2833 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | ||
2834 | atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); | ||
3140 | } | 2835 | } |
3141 | 2836 | ||
3142 | /* | 2837 | /* |
@@ -3257,7 +2952,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3257 | return true; | 2952 | return true; |
3258 | } | 2953 | } |
3259 | 2954 | ||
3260 | ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); | 2955 | ret = memcg_charge_kmem(memcg, gfp, 1 << order); |
3261 | if (!ret) | 2956 | if (!ret) |
3262 | *_memcg = memcg; | 2957 | *_memcg = memcg; |
3263 | 2958 | ||
@@ -3268,46 +2963,27 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3268 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | 2963 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, |
3269 | int order) | 2964 | int order) |
3270 | { | 2965 | { |
3271 | struct page_cgroup *pc; | ||
3272 | |||
3273 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | 2966 | VM_BUG_ON(mem_cgroup_is_root(memcg)); |
3274 | 2967 | ||
3275 | /* The page allocation failed. Revert */ | 2968 | /* The page allocation failed. Revert */ |
3276 | if (!page) { | 2969 | if (!page) { |
3277 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 2970 | memcg_uncharge_kmem(memcg, 1 << order); |
3278 | return; | 2971 | return; |
3279 | } | 2972 | } |
3280 | /* | 2973 | page->mem_cgroup = memcg; |
3281 | * The page is freshly allocated and not visible to any | ||
3282 | * outside callers yet. Set up pc non-atomically. | ||
3283 | */ | ||
3284 | pc = lookup_page_cgroup(page); | ||
3285 | pc->mem_cgroup = memcg; | ||
3286 | pc->flags = PCG_USED; | ||
3287 | } | 2974 | } |
3288 | 2975 | ||
3289 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | 2976 | void __memcg_kmem_uncharge_pages(struct page *page, int order) |
3290 | { | 2977 | { |
3291 | struct mem_cgroup *memcg = NULL; | 2978 | struct mem_cgroup *memcg = page->mem_cgroup; |
3292 | struct page_cgroup *pc; | ||
3293 | |||
3294 | 2979 | ||
3295 | pc = lookup_page_cgroup(page); | ||
3296 | if (!PageCgroupUsed(pc)) | ||
3297 | return; | ||
3298 | |||
3299 | memcg = pc->mem_cgroup; | ||
3300 | pc->flags = 0; | ||
3301 | |||
3302 | /* | ||
3303 | * We trust that only if there is a memcg associated with the page, it | ||
3304 | * is a valid allocation | ||
3305 | */ | ||
3306 | if (!memcg) | 2980 | if (!memcg) |
3307 | return; | 2981 | return; |
3308 | 2982 | ||
3309 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); | 2983 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); |
3310 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | 2984 | |
2985 | memcg_uncharge_kmem(memcg, 1 << order); | ||
2986 | page->mem_cgroup = NULL; | ||
3311 | } | 2987 | } |
3312 | #else | 2988 | #else |
3313 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) | 2989 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
@@ -3325,21 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) | |||
3325 | */ | 3001 | */ |
3326 | void mem_cgroup_split_huge_fixup(struct page *head) | 3002 | void mem_cgroup_split_huge_fixup(struct page *head) |
3327 | { | 3003 | { |
3328 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | ||
3329 | struct page_cgroup *pc; | ||
3330 | struct mem_cgroup *memcg; | ||
3331 | int i; | 3004 | int i; |
3332 | 3005 | ||
3333 | if (mem_cgroup_disabled()) | 3006 | if (mem_cgroup_disabled()) |
3334 | return; | 3007 | return; |
3335 | 3008 | ||
3336 | memcg = head_pc->mem_cgroup; | 3009 | for (i = 1; i < HPAGE_PMD_NR; i++) |
3337 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 3010 | head[i].mem_cgroup = head->mem_cgroup; |
3338 | pc = head_pc + i; | 3011 | |
3339 | pc->mem_cgroup = memcg; | 3012 | __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], |
3340 | pc->flags = head_pc->flags; | ||
3341 | } | ||
3342 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | ||
3343 | HPAGE_PMD_NR); | 3013 | HPAGE_PMD_NR); |
3344 | } | 3014 | } |
3345 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3015 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
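
mem_cgroup_split_huge_fixup() shrinks to a loop that points every tail page's mem_cgroup at the head's group and pulls HPAGE_PMD_NR out of the huge-RSS statistic. A toy stand-alone version, where HPAGE_NR, struct group and the plain counter are stand-ins for HPAGE_PMD_NR, struct mem_cgroup and the per-cpu MEM_CGROUP_STAT_RSS_HUGE counter:

#include <stdio.h>

#define HPAGE_NR 8   /* stand-in for HPAGE_PMD_NR */

struct group { long rss_huge; };
struct page  { struct group *mem_cgroup; };

static void split_huge_fixup(struct page head[HPAGE_NR])
{
        struct group *g = head[0].mem_cgroup;
        int i;

        for (i = 1; i < HPAGE_NR; i++)
                head[i].mem_cgroup = g;     /* tails inherit the owner */

        g->rss_huge -= HPAGE_NR;            /* no longer one huge mapping */
}

int main(void)
{
        struct group g = { .rss_huge = HPAGE_NR };
        struct page pages[HPAGE_NR] = { { &g } };

        split_huge_fixup(pages);
        printf("tail owner ok: %d, rss_huge=%ld\n",
               pages[HPAGE_NR - 1].mem_cgroup == &g, g.rss_huge);
        return 0;
}
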
@@ -3348,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3348 | * mem_cgroup_move_account - move account of the page | 3018 | * mem_cgroup_move_account - move account of the page |
3349 | * @page: the page | 3019 | * @page: the page |
3350 | * @nr_pages: number of regular pages (>1 for huge pages) | 3020 | * @nr_pages: number of regular pages (>1 for huge pages) |
3351 | * @pc: page_cgroup of the page. | ||
3352 | * @from: mem_cgroup which the page is moved from. | 3021 | * @from: mem_cgroup which the page is moved from. |
3353 | * @to: mem_cgroup which the page is moved to. @from != @to. | 3022 | * @to: mem_cgroup which the page is moved to. @from != @to. |
3354 | * | 3023 | * |
@@ -3361,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
3361 | */ | 3030 | */ |
3362 | static int mem_cgroup_move_account(struct page *page, | 3031 | static int mem_cgroup_move_account(struct page *page, |
3363 | unsigned int nr_pages, | 3032 | unsigned int nr_pages, |
3364 | struct page_cgroup *pc, | ||
3365 | struct mem_cgroup *from, | 3033 | struct mem_cgroup *from, |
3366 | struct mem_cgroup *to) | 3034 | struct mem_cgroup *to) |
3367 | { | 3035 | { |
@@ -3381,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
3381 | goto out; | 3049 | goto out; |
3382 | 3050 | ||
3383 | /* | 3051 | /* |
3384 | * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup | 3052 | * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup |
3385 | * of its source page while we change it: page migration takes | 3053 | * of its source page while we change it: page migration takes |
3386 | * both pages off the LRU, but page cache replacement doesn't. | 3054 | * both pages off the LRU, but page cache replacement doesn't. |
3387 | */ | 3055 | */ |
@@ -3389,10 +3057,10 @@ static int mem_cgroup_move_account(struct page *page, | |||
3389 | goto out; | 3057 | goto out; |
3390 | 3058 | ||
3391 | ret = -EINVAL; | 3059 | ret = -EINVAL; |
3392 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | 3060 | if (page->mem_cgroup != from) |
3393 | goto out_unlock; | 3061 | goto out_unlock; |
3394 | 3062 | ||
3395 | move_lock_mem_cgroup(from, &flags); | 3063 | spin_lock_irqsave(&from->move_lock, flags); |
3396 | 3064 | ||
3397 | if (!PageAnon(page) && page_mapped(page)) { | 3065 | if (!PageAnon(page) && page_mapped(page)) { |
3398 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | 3066 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], |
@@ -3409,14 +3077,15 @@ static int mem_cgroup_move_account(struct page *page, | |||
3409 | } | 3077 | } |
3410 | 3078 | ||
3411 | /* | 3079 | /* |
3412 | * It is safe to change pc->mem_cgroup here because the page | 3080 | * It is safe to change page->mem_cgroup here because the page |
3413 | * is referenced, charged, and isolated - we can't race with | 3081 | * is referenced, charged, and isolated - we can't race with |
3414 | * uncharging, charging, migration, or LRU putback. | 3082 | * uncharging, charging, migration, or LRU putback. |
3415 | */ | 3083 | */ |
3416 | 3084 | ||
3417 | /* caller should have done css_get */ | 3085 | /* caller should have done css_get */ |
3418 | pc->mem_cgroup = to; | 3086 | page->mem_cgroup = to; |
3419 | move_unlock_mem_cgroup(from, &flags); | 3087 | spin_unlock_irqrestore(&from->move_lock, flags); |
3088 | |||
3420 | ret = 0; | 3089 | ret = 0; |
3421 | 3090 | ||
3422 | local_irq_disable(); | 3091 | local_irq_disable(); |
@@ -3431,72 +3100,6 @@ out: | |||
3431 | return ret; | 3100 | return ret; |
3432 | } | 3101 | } |
3433 | 3102 | ||
3434 | /** | ||
3435 | * mem_cgroup_move_parent - moves page to the parent group | ||
3436 | * @page: the page to move | ||
3437 | * @pc: page_cgroup of the page | ||
3438 | * @child: page's cgroup | ||
3439 | * | ||
3440 | * move charges to its parent or the root cgroup if the group has no | ||
3441 | * parent (aka use_hierarchy==0). | ||
3442 | * Although this might fail (get_page_unless_zero, isolate_lru_page or | ||
3443 | * mem_cgroup_move_account fails) the failure is always temporary and | ||
3444 | * it signals a race with a page removal/uncharge or migration. In the | ||
3445 | * first case the page is on the way out and it will vanish from the LRU | ||
3446 | * on the next attempt and the call should be retried later. | ||
3447 | * Isolation from the LRU fails only if page has been isolated from | ||
3448 | * the LRU since we looked at it and that usually means either global | ||
3449 | * reclaim or migration going on. The page will either get back to the | ||
3450 | * LRU or vanish. | ||
3451 | * Finaly mem_cgroup_move_account fails only if the page got uncharged | ||
3452 | * (!PageCgroupUsed) or moved to a different group. The page will | ||
3453 | * disappear in the next attempt. | ||
3454 | */ | ||
3455 | static int mem_cgroup_move_parent(struct page *page, | ||
3456 | struct page_cgroup *pc, | ||
3457 | struct mem_cgroup *child) | ||
3458 | { | ||
3459 | struct mem_cgroup *parent; | ||
3460 | unsigned int nr_pages; | ||
3461 | unsigned long uninitialized_var(flags); | ||
3462 | int ret; | ||
3463 | |||
3464 | VM_BUG_ON(mem_cgroup_is_root(child)); | ||
3465 | |||
3466 | ret = -EBUSY; | ||
3467 | if (!get_page_unless_zero(page)) | ||
3468 | goto out; | ||
3469 | if (isolate_lru_page(page)) | ||
3470 | goto put; | ||
3471 | |||
3472 | nr_pages = hpage_nr_pages(page); | ||
3473 | |||
3474 | parent = parent_mem_cgroup(child); | ||
3475 | /* | ||
3476 | * If no parent, move charges to root cgroup. | ||
3477 | */ | ||
3478 | if (!parent) | ||
3479 | parent = root_mem_cgroup; | ||
3480 | |||
3481 | if (nr_pages > 1) { | ||
3482 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
3483 | flags = compound_lock_irqsave(page); | ||
3484 | } | ||
3485 | |||
3486 | ret = mem_cgroup_move_account(page, nr_pages, | ||
3487 | pc, child, parent); | ||
3488 | if (!ret) | ||
3489 | __mem_cgroup_cancel_local_charge(child, nr_pages); | ||
3490 | |||
3491 | if (nr_pages > 1) | ||
3492 | compound_unlock_irqrestore(page, flags); | ||
3493 | putback_lru_page(page); | ||
3494 | put: | ||
3495 | put_page(page); | ||
3496 | out: | ||
3497 | return ret; | ||
3498 | } | ||
3499 | |||
3500 | #ifdef CONFIG_MEMCG_SWAP | 3103 | #ifdef CONFIG_MEMCG_SWAP |
3501 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | 3104 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, |
3502 | bool charge) | 3105 | bool charge) |
@@ -3516,7 +3119,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
3516 | * | 3119 | * |
3517 | * Returns 0 on success, -EINVAL on failure. | 3120 | * Returns 0 on success, -EINVAL on failure. |
3518 | * | 3121 | * |
3519 | * The caller must have charged to @to, IOW, called res_counter_charge() about | 3122 | * The caller must have charged to @to, IOW, called page_counter_charge() about |
3520 | * both res and memsw, and called css_get(). | 3123 | * both res and memsw, and called css_get(). |
3521 | */ | 3124 | */ |
3522 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 3125 | static int mem_cgroup_move_swap_account(swp_entry_t entry, |
@@ -3532,7 +3135,7 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3532 | mem_cgroup_swap_statistics(to, true); | 3135 | mem_cgroup_swap_statistics(to, true); |
3533 | /* | 3136 | /* |
3534 | * This function is only called from task migration context now. | 3137 | * This function is only called from task migration context now. |
3535 | * It postpones res_counter and refcount handling till the end | 3138 | * It postpones page_counter and refcount handling till the end |
3536 | * of task migration(mem_cgroup_clear_mc()) for performance | 3139 | * of task migration(mem_cgroup_clear_mc()) for performance |
3537 | * improvement. But we cannot postpone css_get(to) because if | 3140 | * improvement. But we cannot postpone css_get(to) because if |
3538 | * the process that has been moved to @to does swap-in, the | 3141 | * the process that has been moved to @to does swap-in, the |
@@ -3554,96 +3157,57 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3554 | } | 3157 | } |
3555 | #endif | 3158 | #endif |
3556 | 3159 | ||
3557 | #ifdef CONFIG_DEBUG_VM | 3160 | static DEFINE_MUTEX(memcg_limit_mutex); |
3558 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | ||
3559 | { | ||
3560 | struct page_cgroup *pc; | ||
3561 | |||
3562 | pc = lookup_page_cgroup(page); | ||
3563 | /* | ||
3564 | * Can be NULL while feeding pages into the page allocator for | ||
3565 | * the first time, i.e. during boot or memory hotplug; | ||
3566 | * or when mem_cgroup_disabled(). | ||
3567 | */ | ||
3568 | if (likely(pc) && PageCgroupUsed(pc)) | ||
3569 | return pc; | ||
3570 | return NULL; | ||
3571 | } | ||
3572 | |||
3573 | bool mem_cgroup_bad_page_check(struct page *page) | ||
3574 | { | ||
3575 | if (mem_cgroup_disabled()) | ||
3576 | return false; | ||
3577 | |||
3578 | return lookup_page_cgroup_used(page) != NULL; | ||
3579 | } | ||
3580 | |||
3581 | void mem_cgroup_print_bad_page(struct page *page) | ||
3582 | { | ||
3583 | struct page_cgroup *pc; | ||
3584 | |||
3585 | pc = lookup_page_cgroup_used(page); | ||
3586 | if (pc) { | ||
3587 | pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", | ||
3588 | pc, pc->flags, pc->mem_cgroup); | ||
3589 | } | ||
3590 | } | ||
3591 | #endif | ||
3592 | 3161 | ||
3593 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 3162 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
3594 | unsigned long long val) | 3163 | unsigned long limit) |
3595 | { | 3164 | { |
3165 | unsigned long curusage; | ||
3166 | unsigned long oldusage; | ||
3167 | bool enlarge = false; | ||
3596 | int retry_count; | 3168 | int retry_count; |
3597 | int ret = 0; | 3169 | int ret; |
3598 | int children = mem_cgroup_count_children(memcg); | ||
3599 | u64 curusage, oldusage; | ||
3600 | int enlarge; | ||
3601 | 3170 | ||
3602 | /* | 3171 | /* |
3603 | * For keeping hierarchical_reclaim simple, how long we should retry | 3172 | * For keeping hierarchical_reclaim simple, how long we should retry |
3604 | * depends on callers. We set our retry-count to be a function | 3173 | * depends on callers. We set our retry-count to be a function |
3605 | * of # of children which we should visit in this loop. | 3174 | * of # of children which we should visit in this loop. |
3606 | */ | 3175 | */ |
3607 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; | 3176 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * |
3177 | mem_cgroup_count_children(memcg); | ||
3608 | 3178 | ||
3609 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3179 | oldusage = page_counter_read(&memcg->memory); |
3610 | 3180 | ||
3611 | enlarge = 0; | 3181 | do { |
3612 | while (retry_count) { | ||
3613 | if (signal_pending(current)) { | 3182 | if (signal_pending(current)) { |
3614 | ret = -EINTR; | 3183 | ret = -EINTR; |
3615 | break; | 3184 | break; |
3616 | } | 3185 | } |
3617 | /* | 3186 | |
3618 | * Rather than hide all in some function, I do this in | 3187 | mutex_lock(&memcg_limit_mutex); |
3619 | * open coded manner. You see what this really does. | 3188 | if (limit > memcg->memsw.limit) { |
3620 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3189 | mutex_unlock(&memcg_limit_mutex); |
3621 | */ | ||
3622 | mutex_lock(&set_limit_mutex); | ||
3623 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { | ||
3624 | ret = -EINVAL; | 3190 | ret = -EINVAL; |
3625 | mutex_unlock(&set_limit_mutex); | ||
3626 | break; | 3191 | break; |
3627 | } | 3192 | } |
3628 | 3193 | if (limit > memcg->memory.limit) | |
3629 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) | 3194 | enlarge = true; |
3630 | enlarge = 1; | 3195 | ret = page_counter_limit(&memcg->memory, limit); |
3631 | 3196 | mutex_unlock(&memcg_limit_mutex); | |
3632 | ret = res_counter_set_limit(&memcg->res, val); | ||
3633 | mutex_unlock(&set_limit_mutex); | ||
3634 | 3197 | ||
3635 | if (!ret) | 3198 | if (!ret) |
3636 | break; | 3199 | break; |
3637 | 3200 | ||
3638 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); | 3201 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); |
3639 | 3202 | ||
3640 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3203 | curusage = page_counter_read(&memcg->memory); |
3641 | /* Usage is reduced ? */ | 3204 | /* Usage is reduced ? */ |
3642 | if (curusage >= oldusage) | 3205 | if (curusage >= oldusage) |
3643 | retry_count--; | 3206 | retry_count--; |
3644 | else | 3207 | else |
3645 | oldusage = curusage; | 3208 | oldusage = curusage; |
3646 | } | 3209 | } while (retry_count); |
3210 | |||
3647 | if (!ret && enlarge) | 3211 | if (!ret && enlarge) |
3648 | memcg_oom_recover(memcg); | 3212 | memcg_oom_recover(memcg); |
3649 | 3213 | ||
@@ -3651,52 +3215,53 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3651 | } | 3215 | } |
3652 | 3216 | ||
3653 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | 3217 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, |
3654 | unsigned long long val) | 3218 | unsigned long limit) |
3655 | { | 3219 | { |
3220 | unsigned long curusage; | ||
3221 | unsigned long oldusage; | ||
3222 | bool enlarge = false; | ||
3656 | int retry_count; | 3223 | int retry_count; |
3657 | u64 oldusage, curusage; | 3224 | int ret; |
3658 | int children = mem_cgroup_count_children(memcg); | ||
3659 | int ret = -EBUSY; | ||
3660 | int enlarge = 0; | ||
3661 | 3225 | ||
3662 | /* see mem_cgroup_resize_limit */ | 3226 | /* see mem_cgroup_resize_limit */ |
3663 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 3227 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * |
3664 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3228 | mem_cgroup_count_children(memcg); |
3665 | while (retry_count) { | 3229 | |
3230 | oldusage = page_counter_read(&memcg->memsw); | ||
3231 | |||
3232 | do { | ||
3666 | if (signal_pending(current)) { | 3233 | if (signal_pending(current)) { |
3667 | ret = -EINTR; | 3234 | ret = -EINTR; |
3668 | break; | 3235 | break; |
3669 | } | 3236 | } |
3670 | /* | 3237 | |
3671 | * Rather than hide all in some function, I do this in | 3238 | mutex_lock(&memcg_limit_mutex); |
3672 | * open coded manner. You see what this really does. | 3239 | if (limit < memcg->memory.limit) { |
3673 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3240 | mutex_unlock(&memcg_limit_mutex); |
3674 | */ | ||
3675 | mutex_lock(&set_limit_mutex); | ||
3676 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { | ||
3677 | ret = -EINVAL; | 3241 | ret = -EINVAL; |
3678 | mutex_unlock(&set_limit_mutex); | ||
3679 | break; | 3242 | break; |
3680 | } | 3243 | } |
3681 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) | 3244 | if (limit > memcg->memsw.limit) |
3682 | enlarge = 1; | 3245 | enlarge = true; |
3683 | ret = res_counter_set_limit(&memcg->memsw, val); | 3246 | ret = page_counter_limit(&memcg->memsw, limit); |
3684 | mutex_unlock(&set_limit_mutex); | 3247 | mutex_unlock(&memcg_limit_mutex); |
3685 | 3248 | ||
3686 | if (!ret) | 3249 | if (!ret) |
3687 | break; | 3250 | break; |
3688 | 3251 | ||
3689 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); | 3252 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); |
3690 | 3253 | ||
3691 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3254 | curusage = page_counter_read(&memcg->memsw); |
3692 | /* Usage is reduced ? */ | 3255 | /* Usage is reduced ? */ |
3693 | if (curusage >= oldusage) | 3256 | if (curusage >= oldusage) |
3694 | retry_count--; | 3257 | retry_count--; |
3695 | else | 3258 | else |
3696 | oldusage = curusage; | 3259 | oldusage = curusage; |
3697 | } | 3260 | } while (retry_count); |
3261 | |||
3698 | if (!ret && enlarge) | 3262 | if (!ret && enlarge) |
3699 | memcg_oom_recover(memcg); | 3263 | memcg_oom_recover(memcg); |
3264 | |||
3700 | return ret; | 3265 | return ret; |
3701 | } | 3266 | } |
3702 | 3267 | ||
@@ -3709,7 +3274,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3709 | unsigned long reclaimed; | 3274 | unsigned long reclaimed; |
3710 | int loop = 0; | 3275 | int loop = 0; |
3711 | struct mem_cgroup_tree_per_zone *mctz; | 3276 | struct mem_cgroup_tree_per_zone *mctz; |
3712 | unsigned long long excess; | 3277 | unsigned long excess; |
3713 | unsigned long nr_scanned; | 3278 | unsigned long nr_scanned; |
3714 | 3279 | ||
3715 | if (order > 0) | 3280 | if (order > 0) |
@@ -3735,35 +3300,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3735 | nr_reclaimed += reclaimed; | 3300 | nr_reclaimed += reclaimed; |
3736 | *total_scanned += nr_scanned; | 3301 | *total_scanned += nr_scanned; |
3737 | spin_lock_irq(&mctz->lock); | 3302 | spin_lock_irq(&mctz->lock); |
3303 | __mem_cgroup_remove_exceeded(mz, mctz); | ||
3738 | 3304 | ||
3739 | /* | 3305 | /* |
3740 | * If we failed to reclaim anything from this memory cgroup | 3306 | * If we failed to reclaim anything from this memory cgroup |
3741 | * it is time to move on to the next cgroup | 3307 | * it is time to move on to the next cgroup |
3742 | */ | 3308 | */ |
3743 | next_mz = NULL; | 3309 | next_mz = NULL; |
3744 | if (!reclaimed) { | 3310 | if (!reclaimed) |
3745 | do { | 3311 | next_mz = __mem_cgroup_largest_soft_limit_node(mctz); |
3746 | /* | 3312 | |
3747 | * Loop until we find yet another one. | 3313 | excess = soft_limit_excess(mz->memcg); |
3748 | * | ||
3749 | * By the time we get the soft_limit lock | ||
3750 | * again, someone might have added the | ||
3751 | * group back on the RB tree. Iterate to | ||
3752 | * make sure we get a different mem. | ||
3753 | * mem_cgroup_largest_soft_limit_node returns | ||
3754 | * NULL if no other cgroup is present on | ||
3755 | * the tree | ||
3756 | */ | ||
3757 | next_mz = | ||
3758 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
3759 | if (next_mz == mz) | ||
3760 | css_put(&next_mz->memcg->css); | ||
3761 | else /* next_mz == NULL or other memcg */ | ||
3762 | break; | ||
3763 | } while (1); | ||
3764 | } | ||
3765 | __mem_cgroup_remove_exceeded(mz, mctz); | ||
3766 | excess = res_counter_soft_limit_excess(&mz->memcg->res); | ||
3767 | /* | 3314 | /* |
3768 | * One school of thought says that we should not add | 3315 | * One school of thought says that we should not add |
3769 | * back the node to the tree if reclaim returns 0. | 3316 | * back the node to the tree if reclaim returns 0. |
@@ -3792,107 +3339,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3792 | return nr_reclaimed; | 3339 | return nr_reclaimed; |
3793 | } | 3340 | } |
3794 | 3341 | ||
3795 | /** | ||
3796 | * mem_cgroup_force_empty_list - clears LRU of a group | ||
3797 | * @memcg: group to clear | ||
3798 | * @node: NUMA node | ||
3799 | * @zid: zone id | ||
3800 | * @lru: lru to clear | ||
3801 | * | ||
3802 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't | ||
3803 | * reclaim the pages themselves - pages are moved to the parent (or root) | ||
3804 | * group. | ||
3805 | */ | ||
3806 | static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | ||
3807 | int node, int zid, enum lru_list lru) | ||
3808 | { | ||
3809 | struct lruvec *lruvec; | ||
3810 | unsigned long flags; | ||
3811 | struct list_head *list; | ||
3812 | struct page *busy; | ||
3813 | struct zone *zone; | ||
3814 | |||
3815 | zone = &NODE_DATA(node)->node_zones[zid]; | ||
3816 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | ||
3817 | list = &lruvec->lists[lru]; | ||
3818 | |||
3819 | busy = NULL; | ||
3820 | do { | ||
3821 | struct page_cgroup *pc; | ||
3822 | struct page *page; | ||
3823 | |||
3824 | spin_lock_irqsave(&zone->lru_lock, flags); | ||
3825 | if (list_empty(list)) { | ||
3826 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3827 | break; | ||
3828 | } | ||
3829 | page = list_entry(list->prev, struct page, lru); | ||
3830 | if (busy == page) { | ||
3831 | list_move(&page->lru, list); | ||
3832 | busy = NULL; | ||
3833 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3834 | continue; | ||
3835 | } | ||
3836 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
3837 | |||
3838 | pc = lookup_page_cgroup(page); | ||
3839 | |||
3840 | if (mem_cgroup_move_parent(page, pc, memcg)) { | ||
3841 | /* found lock contention or "pc" is obsolete. */ | ||
3842 | busy = page; | ||
3843 | } else | ||
3844 | busy = NULL; | ||
3845 | cond_resched(); | ||
3846 | } while (!list_empty(list)); | ||
3847 | } | ||
3848 | |||
3849 | /* | ||
3850 | * make mem_cgroup's charge to be 0 if there is no task by moving | ||
3851 | * all the charges and pages to the parent. | ||
3852 | * This enables deleting this mem_cgroup. | ||
3853 | * | ||
3854 | * Caller is responsible for holding css reference on the memcg. | ||
3855 | */ | ||
3856 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | ||
3857 | { | ||
3858 | int node, zid; | ||
3859 | u64 usage; | ||
3860 | |||
3861 | do { | ||
3862 | /* This makes sure all *used* pages are on the LRU. */ | ||
3863 | lru_add_drain_all(); | ||
3864 | drain_all_stock_sync(memcg); | ||
3865 | mem_cgroup_start_move(memcg); | ||
3866 | for_each_node_state(node, N_MEMORY) { | ||
3867 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
3868 | enum lru_list lru; | ||
3869 | for_each_lru(lru) { | ||
3870 | mem_cgroup_force_empty_list(memcg, | ||
3871 | node, zid, lru); | ||
3872 | } | ||
3873 | } | ||
3874 | } | ||
3875 | mem_cgroup_end_move(memcg); | ||
3876 | memcg_oom_recover(memcg); | ||
3877 | cond_resched(); | ||
3878 | |||
3879 | /* | ||
3880 | * Kernel memory may not necessarily be trackable to a specific | ||
3881 | * process. Such pages are not migrated, and therefore we can't | ||
3882 | * expect their value to drop to 0 here. | ||
3883 | * Having res filled up with kmem only is enough. | ||
3884 | * | ||
3885 | * This is a safety check because mem_cgroup_force_empty_list | ||
3886 | * could have raced with mem_cgroup_replace_page_cache callers | ||
3887 | * so the lru seemed empty but the page could have been added | ||
3888 | * right after the check. RES_USAGE should be safe as we always | ||
3889 | * charge before adding to the LRU. | ||
3890 | */ | ||
3891 | usage = res_counter_read_u64(&memcg->res, RES_USAGE) - | ||
3892 | res_counter_read_u64(&memcg->kmem, RES_USAGE); | ||
3893 | } while (usage > 0); | ||
3894 | } | ||
3895 | |||
3896 | /* | 3342 | /* |
3897 | * Test whether @memcg has children, dead or alive. Note that this | 3343 | * Test whether @memcg has children, dead or alive. Note that this |
3898 | * function doesn't care whether @memcg has use_hierarchy enabled and | 3344 | * function doesn't care whether @memcg has use_hierarchy enabled and |
@@ -3930,7 +3376,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | |||
3930 | /* we call try-to-free pages for make this cgroup empty */ | 3376 | /* we call try-to-free pages for make this cgroup empty */ |
3931 | lru_add_drain_all(); | 3377 | lru_add_drain_all(); |
3932 | /* try to free all pages in this cgroup */ | 3378 | /* try to free all pages in this cgroup */ |
3933 | while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { | 3379 | while (nr_retries && page_counter_read(&memcg->memory)) { |
3934 | int progress; | 3380 | int progress; |
3935 | 3381 | ||
3936 | if (signal_pending(current)) | 3382 | if (signal_pending(current)) |
@@ -4001,8 +3447,8 @@ out: | |||
4001 | return retval; | 3447 | return retval; |
4002 | } | 3448 | } |
4003 | 3449 | ||
4004 | static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, | 3450 | static unsigned long tree_stat(struct mem_cgroup *memcg, |
4005 | enum mem_cgroup_stat_index idx) | 3451 | enum mem_cgroup_stat_index idx) |
4006 | { | 3452 | { |
4007 | struct mem_cgroup *iter; | 3453 | struct mem_cgroup *iter; |
4008 | long val = 0; | 3454 | long val = 0; |
@@ -4020,55 +3466,71 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
4020 | { | 3466 | { |
4021 | u64 val; | 3467 | u64 val; |
4022 | 3468 | ||
4023 | if (!mem_cgroup_is_root(memcg)) { | 3469 | if (mem_cgroup_is_root(memcg)) { |
3470 | val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
3471 | val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
3472 | if (swap) | ||
3473 | val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); | ||
3474 | } else { | ||
4024 | if (!swap) | 3475 | if (!swap) |
4025 | return res_counter_read_u64(&memcg->res, RES_USAGE); | 3476 | val = page_counter_read(&memcg->memory); |
4026 | else | 3477 | else |
4027 | return res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3478 | val = page_counter_read(&memcg->memsw); |
4028 | } | 3479 | } |
4029 | |||
4030 | /* | ||
4031 | * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS | ||
4032 | * as well as in MEM_CGROUP_STAT_RSS_HUGE. | ||
4033 | */ | ||
4034 | val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); | ||
4035 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | ||
4036 | |||
4037 | if (swap) | ||
4038 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); | ||
4039 | |||
4040 | return val << PAGE_SHIFT; | 3480 | return val << PAGE_SHIFT; |
4041 | } | 3481 | } |
4042 | 3482 | ||
3483 | enum { | ||
3484 | RES_USAGE, | ||
3485 | RES_LIMIT, | ||
3486 | RES_MAX_USAGE, | ||
3487 | RES_FAILCNT, | ||
3488 | RES_SOFT_LIMIT, | ||
3489 | }; | ||
4043 | 3490 | ||
4044 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | 3491 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
4045 | struct cftype *cft) | 3492 | struct cftype *cft) |
4046 | { | 3493 | { |
4047 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3494 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4048 | enum res_type type = MEMFILE_TYPE(cft->private); | 3495 | struct page_counter *counter; |
4049 | int name = MEMFILE_ATTR(cft->private); | ||
4050 | 3496 | ||
4051 | switch (type) { | 3497 | switch (MEMFILE_TYPE(cft->private)) { |
4052 | case _MEM: | 3498 | case _MEM: |
4053 | if (name == RES_USAGE) | 3499 | counter = &memcg->memory; |
4054 | return mem_cgroup_usage(memcg, false); | 3500 | break; |
4055 | return res_counter_read_u64(&memcg->res, name); | ||
4056 | case _MEMSWAP: | 3501 | case _MEMSWAP: |
4057 | if (name == RES_USAGE) | 3502 | counter = &memcg->memsw; |
4058 | return mem_cgroup_usage(memcg, true); | 3503 | break; |
4059 | return res_counter_read_u64(&memcg->memsw, name); | ||
4060 | case _KMEM: | 3504 | case _KMEM: |
4061 | return res_counter_read_u64(&memcg->kmem, name); | 3505 | counter = &memcg->kmem; |
4062 | break; | 3506 | break; |
4063 | default: | 3507 | default: |
4064 | BUG(); | 3508 | BUG(); |
4065 | } | 3509 | } |
3510 | |||
3511 | switch (MEMFILE_ATTR(cft->private)) { | ||
3512 | case RES_USAGE: | ||
3513 | if (counter == &memcg->memory) | ||
3514 | return mem_cgroup_usage(memcg, false); | ||
3515 | if (counter == &memcg->memsw) | ||
3516 | return mem_cgroup_usage(memcg, true); | ||
3517 | return (u64)page_counter_read(counter) * PAGE_SIZE; | ||
3518 | case RES_LIMIT: | ||
3519 | return (u64)counter->limit * PAGE_SIZE; | ||
3520 | case RES_MAX_USAGE: | ||
3521 | return (u64)counter->watermark * PAGE_SIZE; | ||
3522 | case RES_FAILCNT: | ||
3523 | return counter->failcnt; | ||
3524 | case RES_SOFT_LIMIT: | ||
3525 | return (u64)memcg->soft_limit * PAGE_SIZE; | ||
3526 | default: | ||
3527 | BUG(); | ||
3528 | } | ||
4066 | } | 3529 | } |
4067 | 3530 | ||
4068 | #ifdef CONFIG_MEMCG_KMEM | 3531 | #ifdef CONFIG_MEMCG_KMEM |
4069 | /* should be called with activate_kmem_mutex held */ | 3532 | static int memcg_activate_kmem(struct mem_cgroup *memcg, |
4070 | static int __memcg_activate_kmem(struct mem_cgroup *memcg, | 3533 | unsigned long nr_pages) |
4071 | unsigned long long limit) | ||
4072 | { | 3534 | { |
4073 | int err = 0; | 3535 | int err = 0; |
4074 | int memcg_id; | 3536 | int memcg_id; |
@@ -4115,7 +3577,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
4115 | * We couldn't have accounted to this cgroup, because it hasn't got the | 3577 | * We couldn't have accounted to this cgroup, because it hasn't got the |
4116 | * active bit set yet, so this should succeed. | 3578 | * active bit set yet, so this should succeed. |
4117 | */ | 3579 | */ |
4118 | err = res_counter_set_limit(&memcg->kmem, limit); | 3580 | err = page_counter_limit(&memcg->kmem, nr_pages); |
4119 | VM_BUG_ON(err); | 3581 | VM_BUG_ON(err); |
4120 | 3582 | ||
4121 | static_key_slow_inc(&memcg_kmem_enabled_key); | 3583 | static_key_slow_inc(&memcg_kmem_enabled_key); |
@@ -4130,26 +3592,17 @@ out: | |||
4130 | return err; | 3592 | return err; |
4131 | } | 3593 | } |
4132 | 3594 | ||
4133 | static int memcg_activate_kmem(struct mem_cgroup *memcg, | ||
4134 | unsigned long long limit) | ||
4135 | { | ||
4136 | int ret; | ||
4137 | |||
4138 | mutex_lock(&activate_kmem_mutex); | ||
4139 | ret = __memcg_activate_kmem(memcg, limit); | ||
4140 | mutex_unlock(&activate_kmem_mutex); | ||
4141 | return ret; | ||
4142 | } | ||
4143 | |||
4144 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | 3595 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, |
4145 | unsigned long long val) | 3596 | unsigned long limit) |
4146 | { | 3597 | { |
4147 | int ret; | 3598 | int ret; |
4148 | 3599 | ||
3600 | mutex_lock(&memcg_limit_mutex); | ||
4149 | if (!memcg_kmem_is_active(memcg)) | 3601 | if (!memcg_kmem_is_active(memcg)) |
4150 | ret = memcg_activate_kmem(memcg, val); | 3602 | ret = memcg_activate_kmem(memcg, limit); |
4151 | else | 3603 | else |
4152 | ret = res_counter_set_limit(&memcg->kmem, val); | 3604 | ret = page_counter_limit(&memcg->kmem, limit); |
3605 | mutex_unlock(&memcg_limit_mutex); | ||
4153 | return ret; | 3606 | return ret; |
4154 | } | 3607 | } |
4155 | 3608 | ||
@@ -4161,19 +3614,19 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
4161 | if (!parent) | 3614 | if (!parent) |
4162 | return 0; | 3615 | return 0; |
4163 | 3616 | ||
4164 | mutex_lock(&activate_kmem_mutex); | 3617 | mutex_lock(&memcg_limit_mutex); |
4165 | /* | 3618 | /* |
4166 | * If the parent cgroup is not kmem-active now, it cannot be activated | 3619 | * If the parent cgroup is not kmem-active now, it cannot be activated |
4167 | * after this point, because it has at least one child already. | 3620 | * after this point, because it has at least one child already. |
4168 | */ | 3621 | */ |
4169 | if (memcg_kmem_is_active(parent)) | 3622 | if (memcg_kmem_is_active(parent)) |
4170 | ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); | 3623 | ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); |
4171 | mutex_unlock(&activate_kmem_mutex); | 3624 | mutex_unlock(&memcg_limit_mutex); |
4172 | return ret; | 3625 | return ret; |
4173 | } | 3626 | } |
4174 | #else | 3627 | #else |
4175 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | 3628 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, |
4176 | unsigned long long val) | 3629 | unsigned long limit) |
4177 | { | 3630 | { |
4178 | return -EINVAL; | 3631 | return -EINVAL; |
4179 | } | 3632 | } |
@@ -4187,110 +3640,69 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |||
4187 | char *buf, size_t nbytes, loff_t off) | 3640 | char *buf, size_t nbytes, loff_t off) |
4188 | { | 3641 | { |
4189 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 3642 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
4190 | enum res_type type; | 3643 | unsigned long nr_pages; |
4191 | int name; | ||
4192 | unsigned long long val; | ||
4193 | int ret; | 3644 | int ret; |
4194 | 3645 | ||
4195 | buf = strstrip(buf); | 3646 | buf = strstrip(buf); |
4196 | type = MEMFILE_TYPE(of_cft(of)->private); | 3647 | ret = page_counter_memparse(buf, &nr_pages); |
4197 | name = MEMFILE_ATTR(of_cft(of)->private); | 3648 | if (ret) |
3649 | return ret; | ||
4198 | 3650 | ||
4199 | switch (name) { | 3651 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
4200 | case RES_LIMIT: | 3652 | case RES_LIMIT: |
4201 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 3653 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ |
4202 | ret = -EINVAL; | 3654 | ret = -EINVAL; |
4203 | break; | 3655 | break; |
4204 | } | 3656 | } |
4205 | /* This function does all necessary parse...reuse it */ | 3657 | switch (MEMFILE_TYPE(of_cft(of)->private)) { |
4206 | ret = res_counter_memparse_write_strategy(buf, &val); | 3658 | case _MEM: |
4207 | if (ret) | 3659 | ret = mem_cgroup_resize_limit(memcg, nr_pages); |
4208 | break; | 3660 | break; |
4209 | if (type == _MEM) | 3661 | case _MEMSWAP: |
4210 | ret = mem_cgroup_resize_limit(memcg, val); | 3662 | ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); |
4211 | else if (type == _MEMSWAP) | ||
4212 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | ||
4213 | else if (type == _KMEM) | ||
4214 | ret = memcg_update_kmem_limit(memcg, val); | ||
4215 | else | ||
4216 | return -EINVAL; | ||
4217 | break; | ||
4218 | case RES_SOFT_LIMIT: | ||
4219 | ret = res_counter_memparse_write_strategy(buf, &val); | ||
4220 | if (ret) | ||
4221 | break; | 3663 | break; |
4222 | /* | 3664 | case _KMEM: |
4223 | * For memsw, soft limits are hard to implement in terms | 3665 | ret = memcg_update_kmem_limit(memcg, nr_pages); |
4224 | * of semantics, for now, we support soft limits for | 3666 | break; |
4225 | * control without swap | 3667 | } |
4226 | */ | ||
4227 | if (type == _MEM) | ||
4228 | ret = res_counter_set_soft_limit(&memcg->res, val); | ||
4229 | else | ||
4230 | ret = -EINVAL; | ||
4231 | break; | 3668 | break; |
4232 | default: | 3669 | case RES_SOFT_LIMIT: |
4233 | ret = -EINVAL; /* should be BUG() ? */ | 3670 | memcg->soft_limit = nr_pages; |
3671 | ret = 0; | ||
4234 | break; | 3672 | break; |
4235 | } | 3673 | } |
4236 | return ret ?: nbytes; | 3674 | return ret ?: nbytes; |
4237 | } | 3675 | } |
4238 | 3676 | ||
4239 | static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | ||
4240 | unsigned long long *mem_limit, unsigned long long *memsw_limit) | ||
4241 | { | ||
4242 | unsigned long long min_limit, min_memsw_limit, tmp; | ||
4243 | |||
4244 | min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
4245 | min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
4246 | if (!memcg->use_hierarchy) | ||
4247 | goto out; | ||
4248 | |||
4249 | while (memcg->css.parent) { | ||
4250 | memcg = mem_cgroup_from_css(memcg->css.parent); | ||
4251 | if (!memcg->use_hierarchy) | ||
4252 | break; | ||
4253 | tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
4254 | min_limit = min(min_limit, tmp); | ||
4255 | tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
4256 | min_memsw_limit = min(min_memsw_limit, tmp); | ||
4257 | } | ||
4258 | out: | ||
4259 | *mem_limit = min_limit; | ||
4260 | *memsw_limit = min_memsw_limit; | ||
4261 | } | ||
4262 | |||
4263 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, | 3677 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, |
4264 | size_t nbytes, loff_t off) | 3678 | size_t nbytes, loff_t off) |
4265 | { | 3679 | { |
4266 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 3680 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
4267 | int name; | 3681 | struct page_counter *counter; |
4268 | enum res_type type; | ||
4269 | 3682 | ||
4270 | type = MEMFILE_TYPE(of_cft(of)->private); | 3683 | switch (MEMFILE_TYPE(of_cft(of)->private)) { |
4271 | name = MEMFILE_ATTR(of_cft(of)->private); | 3684 | case _MEM: |
3685 | counter = &memcg->memory; | ||
3686 | break; | ||
3687 | case _MEMSWAP: | ||
3688 | counter = &memcg->memsw; | ||
3689 | break; | ||
3690 | case _KMEM: | ||
3691 | counter = &memcg->kmem; | ||
3692 | break; | ||
3693 | default: | ||
3694 | BUG(); | ||
3695 | } | ||
4272 | 3696 | ||
4273 | switch (name) { | 3697 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
4274 | case RES_MAX_USAGE: | 3698 | case RES_MAX_USAGE: |
4275 | if (type == _MEM) | 3699 | page_counter_reset_watermark(counter); |
4276 | res_counter_reset_max(&memcg->res); | ||
4277 | else if (type == _MEMSWAP) | ||
4278 | res_counter_reset_max(&memcg->memsw); | ||
4279 | else if (type == _KMEM) | ||
4280 | res_counter_reset_max(&memcg->kmem); | ||
4281 | else | ||
4282 | return -EINVAL; | ||
4283 | break; | 3700 | break; |
4284 | case RES_FAILCNT: | 3701 | case RES_FAILCNT: |
4285 | if (type == _MEM) | 3702 | counter->failcnt = 0; |
4286 | res_counter_reset_failcnt(&memcg->res); | ||
4287 | else if (type == _MEMSWAP) | ||
4288 | res_counter_reset_failcnt(&memcg->memsw); | ||
4289 | else if (type == _KMEM) | ||
4290 | res_counter_reset_failcnt(&memcg->kmem); | ||
4291 | else | ||
4292 | return -EINVAL; | ||
4293 | break; | 3703 | break; |
3704 | default: | ||
3705 | BUG(); | ||
4294 | } | 3706 | } |
4295 | 3707 | ||
4296 | return nbytes; | 3708 | return nbytes; |
@@ -4387,6 +3799,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
4387 | static int memcg_stat_show(struct seq_file *m, void *v) | 3799 | static int memcg_stat_show(struct seq_file *m, void *v) |
4388 | { | 3800 | { |
4389 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 3801 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
3802 | unsigned long memory, memsw; | ||
4390 | struct mem_cgroup *mi; | 3803 | struct mem_cgroup *mi; |
4391 | unsigned int i; | 3804 | unsigned int i; |
4392 | 3805 | ||
@@ -4406,14 +3819,16 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
4406 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); | 3819 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); |
4407 | 3820 | ||
4408 | /* Hierarchical information */ | 3821 | /* Hierarchical information */ |
4409 | { | 3822 | memory = memsw = PAGE_COUNTER_MAX; |
4410 | unsigned long long limit, memsw_limit; | 3823 | for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { |
4411 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); | 3824 | memory = min(memory, mi->memory.limit); |
4412 | seq_printf(m, "hierarchical_memory_limit %llu\n", limit); | 3825 | memsw = min(memsw, mi->memsw.limit); |
4413 | if (do_swap_account) | ||
4414 | seq_printf(m, "hierarchical_memsw_limit %llu\n", | ||
4415 | memsw_limit); | ||
4416 | } | 3826 | } |
3827 | seq_printf(m, "hierarchical_memory_limit %llu\n", | ||
3828 | (u64)memory * PAGE_SIZE); | ||
3829 | if (do_swap_account) | ||
3830 | seq_printf(m, "hierarchical_memsw_limit %llu\n", | ||
3831 | (u64)memsw * PAGE_SIZE); | ||
4417 | 3832 | ||
4418 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3833 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4419 | long long val = 0; | 3834 | long long val = 0; |
@@ -4497,7 +3912,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | |||
4497 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | 3912 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) |
4498 | { | 3913 | { |
4499 | struct mem_cgroup_threshold_ary *t; | 3914 | struct mem_cgroup_threshold_ary *t; |
4500 | u64 usage; | 3915 | unsigned long usage; |
4501 | int i; | 3916 | int i; |
4502 | 3917 | ||
4503 | rcu_read_lock(); | 3918 | rcu_read_lock(); |
@@ -4596,10 +4011,11 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
4596 | { | 4011 | { |
4597 | struct mem_cgroup_thresholds *thresholds; | 4012 | struct mem_cgroup_thresholds *thresholds; |
4598 | struct mem_cgroup_threshold_ary *new; | 4013 | struct mem_cgroup_threshold_ary *new; |
4599 | u64 threshold, usage; | 4014 | unsigned long threshold; |
4015 | unsigned long usage; | ||
4600 | int i, size, ret; | 4016 | int i, size, ret; |
4601 | 4017 | ||
4602 | ret = res_counter_memparse_write_strategy(args, &threshold); | 4018 | ret = page_counter_memparse(args, &threshold); |
4603 | if (ret) | 4019 | if (ret) |
4604 | return ret; | 4020 | return ret; |
4605 | 4021 | ||
@@ -4689,7 +4105,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |||
4689 | { | 4105 | { |
4690 | struct mem_cgroup_thresholds *thresholds; | 4106 | struct mem_cgroup_thresholds *thresholds; |
4691 | struct mem_cgroup_threshold_ary *new; | 4107 | struct mem_cgroup_threshold_ary *new; |
4692 | u64 usage; | 4108 | unsigned long usage; |
4693 | int i, j, size; | 4109 | int i, j, size; |
4694 | 4110 | ||
4695 | mutex_lock(&memcg->thresholds_lock); | 4111 | mutex_lock(&memcg->thresholds_lock); |
@@ -4855,40 +4271,6 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) | |||
4855 | { | 4271 | { |
4856 | mem_cgroup_sockets_destroy(memcg); | 4272 | mem_cgroup_sockets_destroy(memcg); |
4857 | } | 4273 | } |
4858 | |||
4859 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | ||
4860 | { | ||
4861 | if (!memcg_kmem_is_active(memcg)) | ||
4862 | return; | ||
4863 | |||
4864 | /* | ||
4865 | * kmem charges can outlive the cgroup. In the case of slab | ||
4866 | * pages, for instance, a page may contain objects from various | ||
4867 | * processes. As we do not take a reference for every | ||
4868 | * such allocation we have to be careful when doing uncharge | ||
4869 | * (see memcg_uncharge_kmem) and here during offlining. | ||
4870 | * | ||
4871 | * The idea is that only the _last_ uncharge which sees | ||
4872 | * the dead memcg will drop the last reference. An additional | ||
4873 | * reference is taken here before the group is marked dead | ||
4874 | * which is then paired with css_put during uncharge resp. here. | ||
4875 | * | ||
4876 | * Although this might sound strange as this path is called from | ||
4877 | * css_offline() when the reference might have dropped down to 0 and | ||
4878 | * shouldn't be incremented anymore (css_tryget_online() would | ||
4879 | * fail) we do not have other options because of the kmem | ||
4880 | * allocations lifetime. | ||
4881 | */ | ||
4882 | css_get(&memcg->css); | ||
4883 | |||
4884 | memcg_kmem_mark_dead(memcg); | ||
4885 | |||
4886 | if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) | ||
4887 | return; | ||
4888 | |||
4889 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
4890 | css_put(&memcg->css); | ||
4891 | } | ||
4892 | #else | 4274 | #else |
4893 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4275 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4894 | { | 4276 | { |
@@ -4898,10 +4280,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
4898 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4280 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4899 | { | 4281 | { |
4900 | } | 4282 | } |
4901 | |||
4902 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | ||
4903 | { | ||
4904 | } | ||
4905 | #endif | 4283 | #endif |
4906 | 4284 | ||
4907 | /* | 4285 | /* |
@@ -5228,7 +4606,10 @@ static struct cftype mem_cgroup_files[] = { | |||
5228 | #ifdef CONFIG_SLABINFO | 4606 | #ifdef CONFIG_SLABINFO |
5229 | { | 4607 | { |
5230 | .name = "kmem.slabinfo", | 4608 | .name = "kmem.slabinfo", |
5231 | .seq_show = mem_cgroup_slabinfo_read, | 4609 | .seq_start = slab_start, |
4610 | .seq_next = slab_next, | ||
4611 | .seq_stop = slab_stop, | ||
4612 | .seq_show = memcg_slab_show, | ||
5232 | }, | 4613 | }, |
5233 | #endif | 4614 | #endif |
5234 | #endif | 4615 | #endif |
@@ -5363,9 +4744,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
5363 | */ | 4744 | */ |
5364 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | 4745 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) |
5365 | { | 4746 | { |
5366 | if (!memcg->res.parent) | 4747 | if (!memcg->memory.parent) |
5367 | return NULL; | 4748 | return NULL; |
5368 | return mem_cgroup_from_res_counter(memcg->res.parent, res); | 4749 | return mem_cgroup_from_counter(memcg->memory.parent, memory); |
5369 | } | 4750 | } |
5370 | EXPORT_SYMBOL(parent_mem_cgroup); | 4751 | EXPORT_SYMBOL(parent_mem_cgroup); |
5371 | 4752 | ||
@@ -5410,9 +4791,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
5410 | /* root ? */ | 4791 | /* root ? */ |
5411 | if (parent_css == NULL) { | 4792 | if (parent_css == NULL) { |
5412 | root_mem_cgroup = memcg; | 4793 | root_mem_cgroup = memcg; |
5413 | res_counter_init(&memcg->res, NULL); | 4794 | page_counter_init(&memcg->memory, NULL); |
5414 | res_counter_init(&memcg->memsw, NULL); | 4795 | page_counter_init(&memcg->memsw, NULL); |
5415 | res_counter_init(&memcg->kmem, NULL); | 4796 | page_counter_init(&memcg->kmem, NULL); |
5416 | } | 4797 | } |
5417 | 4798 | ||
5418 | memcg->last_scanned_node = MAX_NUMNODES; | 4799 | memcg->last_scanned_node = MAX_NUMNODES; |
@@ -5451,18 +4832,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5451 | memcg->swappiness = mem_cgroup_swappiness(parent); | 4832 | memcg->swappiness = mem_cgroup_swappiness(parent); |
5452 | 4833 | ||
5453 | if (parent->use_hierarchy) { | 4834 | if (parent->use_hierarchy) { |
5454 | res_counter_init(&memcg->res, &parent->res); | 4835 | page_counter_init(&memcg->memory, &parent->memory); |
5455 | res_counter_init(&memcg->memsw, &parent->memsw); | 4836 | page_counter_init(&memcg->memsw, &parent->memsw); |
5456 | res_counter_init(&memcg->kmem, &parent->kmem); | 4837 | page_counter_init(&memcg->kmem, &parent->kmem); |
5457 | 4838 | ||
5458 | /* | 4839 | /* |
5459 | * No need to take a reference to the parent because cgroup | 4840 | * No need to take a reference to the parent because cgroup |
5460 | * core guarantees its existence. | 4841 | * core guarantees its existence. |
5461 | */ | 4842 | */ |
5462 | } else { | 4843 | } else { |
5463 | res_counter_init(&memcg->res, NULL); | 4844 | page_counter_init(&memcg->memory, NULL); |
5464 | res_counter_init(&memcg->memsw, NULL); | 4845 | page_counter_init(&memcg->memsw, NULL); |
5465 | res_counter_init(&memcg->kmem, NULL); | 4846 | page_counter_init(&memcg->kmem, NULL); |
5466 | /* | 4847 | /* |
5467 | * Deeper hierarchy with use_hierarchy == false doesn't make | 4848 | * Deeper hierarchy with use_hierarchy == false doesn't make |
5468 | * much sense so let the cgroup subsystem know about this | 4849 | * much sense so let the cgroup subsystem know about this |
@@ -5487,29 +4868,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
5487 | return 0; | 4868 | return 0; |
5488 | } | 4869 | } |
5489 | 4870 | ||
5490 | /* | ||
5491 | * Announce all parents that a group from their hierarchy is gone. | ||
5492 | */ | ||
5493 | static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | ||
5494 | { | ||
5495 | struct mem_cgroup *parent = memcg; | ||
5496 | |||
5497 | while ((parent = parent_mem_cgroup(parent))) | ||
5498 | mem_cgroup_iter_invalidate(parent); | ||
5499 | |||
5500 | /* | ||
5501 | * if the root memcg is not hierarchical we have to check it | ||
5502 | * explicitely. | ||
5503 | */ | ||
5504 | if (!root_mem_cgroup->use_hierarchy) | ||
5505 | mem_cgroup_iter_invalidate(root_mem_cgroup); | ||
5506 | } | ||
5507 | |||
5508 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 4871 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
5509 | { | 4872 | { |
5510 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4873 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5511 | struct mem_cgroup_event *event, *tmp; | 4874 | struct mem_cgroup_event *event, *tmp; |
5512 | struct cgroup_subsys_state *iter; | ||
5513 | 4875 | ||
5514 | /* | 4876 | /* |
5515 | * Unregister events and notify userspace. | 4877 | * Unregister events and notify userspace. |
@@ -5523,17 +4885,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
5523 | } | 4885 | } |
5524 | spin_unlock(&memcg->event_list_lock); | 4886 | spin_unlock(&memcg->event_list_lock); |
5525 | 4887 | ||
5526 | kmem_cgroup_css_offline(memcg); | ||
5527 | |||
5528 | mem_cgroup_invalidate_reclaim_iterators(memcg); | ||
5529 | |||
5530 | /* | ||
5531 | * This requires that offlining is serialized. Right now that is | ||
5532 | * guaranteed because css_killed_work_fn() holds the cgroup_mutex. | ||
5533 | */ | ||
5534 | css_for_each_descendant_post(iter, css) | ||
5535 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); | ||
5536 | |||
5537 | memcg_unregister_all_caches(memcg); | 4888 | memcg_unregister_all_caches(memcg); |
5538 | vmpressure_cleanup(&memcg->vmpressure); | 4889 | vmpressure_cleanup(&memcg->vmpressure); |
5539 | } | 4890 | } |
@@ -5541,42 +4892,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
5541 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) | 4892 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) |
5542 | { | 4893 | { |
5543 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4894 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5544 | /* | ||
5545 | * XXX: css_offline() would be where we should reparent all | ||
5546 | * memory to prepare the cgroup for destruction. However, | ||
5547 | * memcg does not do css_tryget_online() and res_counter charging | ||
5548 | * under the same RCU lock region, which means that charging | ||
5549 | * could race with offlining. Offlining only happens to | ||
5550 | * cgroups with no tasks in them but charges can show up | ||
5551 | * without any tasks from the swapin path when the target | ||
5552 | * memcg is looked up from the swapout record and not from the | ||
5553 | * current task as it usually is. A race like this can leak | ||
5554 | * charges and put pages with stale cgroup pointers into | ||
5555 | * circulation: | ||
5556 | * | ||
5557 | * #0 #1 | ||
5558 | * lookup_swap_cgroup_id() | ||
5559 | * rcu_read_lock() | ||
5560 | * mem_cgroup_lookup() | ||
5561 | * css_tryget_online() | ||
5562 | * rcu_read_unlock() | ||
5563 | * disable css_tryget_online() | ||
5564 | * call_rcu() | ||
5565 | * offline_css() | ||
5566 | * reparent_charges() | ||
5567 | * res_counter_charge() | ||
5568 | * css_put() | ||
5569 | * css_free() | ||
5570 | * pc->mem_cgroup = dead memcg | ||
5571 | * add page to lru | ||
5572 | * | ||
5573 | * The bulk of the charges are still moved in offline_css() to | ||
5574 | * avoid pinning a lot of pages in case a long-term reference | ||
5575 | * like a swapout record is deferring the css_free() to long | ||
5576 | * after offlining. But this makes sure we catch any charges | ||
5577 | * made after offlining: | ||
5578 | */ | ||
5579 | mem_cgroup_reparent_charges(memcg); | ||
5580 | 4895 | ||
5581 | memcg_destroy_kmem(memcg); | 4896 | memcg_destroy_kmem(memcg); |
5582 | __mem_cgroup_free(memcg); | 4897 | __mem_cgroup_free(memcg); |
@@ -5599,10 +4914,10 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | |||
5599 | { | 4914 | { |
5600 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4915 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5601 | 4916 | ||
5602 | mem_cgroup_resize_limit(memcg, ULLONG_MAX); | 4917 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); |
5603 | mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX); | 4918 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); |
5604 | memcg_update_kmem_limit(memcg, ULLONG_MAX); | 4919 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); |
5605 | res_counter_set_soft_limit(&memcg->res, ULLONG_MAX); | 4920 | memcg->soft_limit = 0; |
5606 | } | 4921 | } |
5607 | 4922 | ||
5608 | #ifdef CONFIG_MMU | 4923 | #ifdef CONFIG_MMU |
@@ -5758,7 +5073,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
5758 | unsigned long addr, pte_t ptent, union mc_target *target) | 5073 | unsigned long addr, pte_t ptent, union mc_target *target) |
5759 | { | 5074 | { |
5760 | struct page *page = NULL; | 5075 | struct page *page = NULL; |
5761 | struct page_cgroup *pc; | ||
5762 | enum mc_target_type ret = MC_TARGET_NONE; | 5076 | enum mc_target_type ret = MC_TARGET_NONE; |
5763 | swp_entry_t ent = { .val = 0 }; | 5077 | swp_entry_t ent = { .val = 0 }; |
5764 | 5078 | ||
@@ -5772,13 +5086,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
5772 | if (!page && !ent.val) | 5086 | if (!page && !ent.val) |
5773 | return ret; | 5087 | return ret; |
5774 | if (page) { | 5088 | if (page) { |
5775 | pc = lookup_page_cgroup(page); | ||
5776 | /* | 5089 | /* |
5777 | * Do only loose check w/o serialization. | 5090 | * Do only loose check w/o serialization. |
5778 | * mem_cgroup_move_account() checks the pc is valid or | 5091 | * mem_cgroup_move_account() checks the page is valid or |
5779 | * not under LRU exclusion. | 5092 | * not under LRU exclusion. |
5780 | */ | 5093 | */ |
5781 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | 5094 | if (page->mem_cgroup == mc.from) { |
5782 | ret = MC_TARGET_PAGE; | 5095 | ret = MC_TARGET_PAGE; |
5783 | if (target) | 5096 | if (target) |
5784 | target->page = page; | 5097 | target->page = page; |
@@ -5806,15 +5119,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
5806 | unsigned long addr, pmd_t pmd, union mc_target *target) | 5119 | unsigned long addr, pmd_t pmd, union mc_target *target) |
5807 | { | 5120 | { |
5808 | struct page *page = NULL; | 5121 | struct page *page = NULL; |
5809 | struct page_cgroup *pc; | ||
5810 | enum mc_target_type ret = MC_TARGET_NONE; | 5122 | enum mc_target_type ret = MC_TARGET_NONE; |
5811 | 5123 | ||
5812 | page = pmd_page(pmd); | 5124 | page = pmd_page(pmd); |
5813 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 5125 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
5814 | if (!move_anon()) | 5126 | if (!move_anon()) |
5815 | return ret; | 5127 | return ret; |
5816 | pc = lookup_page_cgroup(page); | 5128 | if (page->mem_cgroup == mc.from) { |
5817 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
5818 | ret = MC_TARGET_PAGE; | 5129 | ret = MC_TARGET_PAGE; |
5819 | if (target) { | 5130 | if (target) { |
5820 | get_page(page); | 5131 | get_page(page); |
@@ -5897,7 +5208,6 @@ static void __mem_cgroup_clear_mc(void) | |||
5897 | { | 5208 | { |
5898 | struct mem_cgroup *from = mc.from; | 5209 | struct mem_cgroup *from = mc.from; |
5899 | struct mem_cgroup *to = mc.to; | 5210 | struct mem_cgroup *to = mc.to; |
5900 | int i; | ||
5901 | 5211 | ||
5902 | /* we must uncharge all the leftover precharges from mc.to */ | 5212 | /* we must uncharge all the leftover precharges from mc.to */ |
5903 | if (mc.precharge) { | 5213 | if (mc.precharge) { |
@@ -5916,19 +5226,17 @@ static void __mem_cgroup_clear_mc(void) | |||
5916 | if (mc.moved_swap) { | 5226 | if (mc.moved_swap) { |
5917 | /* uncharge swap account from the old cgroup */ | 5227 | /* uncharge swap account from the old cgroup */ |
5918 | if (!mem_cgroup_is_root(mc.from)) | 5228 | if (!mem_cgroup_is_root(mc.from)) |
5919 | res_counter_uncharge(&mc.from->memsw, | 5229 | page_counter_uncharge(&mc.from->memsw, mc.moved_swap); |
5920 | PAGE_SIZE * mc.moved_swap); | ||
5921 | |||
5922 | for (i = 0; i < mc.moved_swap; i++) | ||
5923 | css_put(&mc.from->css); | ||
5924 | 5230 | ||
5925 | /* | 5231 | /* |
5926 | * we charged both to->res and to->memsw, so we should | 5232 | * we charged both to->memory and to->memsw, so we |
5927 | * uncharge to->res. | 5233 | * should uncharge to->memory. |
5928 | */ | 5234 | */ |
5929 | if (!mem_cgroup_is_root(mc.to)) | 5235 | if (!mem_cgroup_is_root(mc.to)) |
5930 | res_counter_uncharge(&mc.to->res, | 5236 | page_counter_uncharge(&mc.to->memory, mc.moved_swap); |
5931 | PAGE_SIZE * mc.moved_swap); | 5237 | |
5238 | css_put_many(&mc.from->css, mc.moved_swap); | ||
5239 | |||
5932 | /* we've already done css_get(mc.to) */ | 5240 | /* we've already done css_get(mc.to) */ |
5933 | mc.moved_swap = 0; | 5241 | mc.moved_swap = 0; |
5934 | } | 5242 | } |
@@ -5939,8 +5247,6 @@ static void __mem_cgroup_clear_mc(void) | |||
5939 | 5247 | ||
5940 | static void mem_cgroup_clear_mc(void) | 5248 | static void mem_cgroup_clear_mc(void) |
5941 | { | 5249 | { |
5942 | struct mem_cgroup *from = mc.from; | ||
5943 | |||
5944 | /* | 5250 | /* |
5945 | * we must clear moving_task before waking up waiters at the end of | 5251 | * we must clear moving_task before waking up waiters at the end of |
5946 | * task migration. | 5252 | * task migration. |
@@ -5951,7 +5257,6 @@ static void mem_cgroup_clear_mc(void) | |||
5951 | mc.from = NULL; | 5257 | mc.from = NULL; |
5952 | mc.to = NULL; | 5258 | mc.to = NULL; |
5953 | spin_unlock(&mc.lock); | 5259 | spin_unlock(&mc.lock); |
5954 | mem_cgroup_end_move(from); | ||
5955 | } | 5260 | } |
5956 | 5261 | ||
5957 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | 5262 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, |
@@ -5984,7 +5289,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
5984 | VM_BUG_ON(mc.precharge); | 5289 | VM_BUG_ON(mc.precharge); |
5985 | VM_BUG_ON(mc.moved_charge); | 5290 | VM_BUG_ON(mc.moved_charge); |
5986 | VM_BUG_ON(mc.moved_swap); | 5291 | VM_BUG_ON(mc.moved_swap); |
5987 | mem_cgroup_start_move(from); | 5292 | |
5988 | spin_lock(&mc.lock); | 5293 | spin_lock(&mc.lock); |
5989 | mc.from = from; | 5294 | mc.from = from; |
5990 | mc.to = memcg; | 5295 | mc.to = memcg; |
@@ -6004,7 +5309,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
6004 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, | 5309 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, |
6005 | struct cgroup_taskset *tset) | 5310 | struct cgroup_taskset *tset) |
6006 | { | 5311 | { |
6007 | mem_cgroup_clear_mc(); | 5312 | if (mc.to) |
5313 | mem_cgroup_clear_mc(); | ||
6008 | } | 5314 | } |
6009 | 5315 | ||
6010 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | 5316 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, |
@@ -6018,7 +5324,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
6018 | enum mc_target_type target_type; | 5324 | enum mc_target_type target_type; |
6019 | union mc_target target; | 5325 | union mc_target target; |
6020 | struct page *page; | 5326 | struct page *page; |
6021 | struct page_cgroup *pc; | ||
6022 | 5327 | ||
6023 | /* | 5328 | /* |
6024 | * We don't take compound_lock() here but no race with splitting thp | 5329 | * We don't take compound_lock() here but no race with splitting thp |
@@ -6039,9 +5344,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
6039 | if (target_type == MC_TARGET_PAGE) { | 5344 | if (target_type == MC_TARGET_PAGE) { |
6040 | page = target.page; | 5345 | page = target.page; |
6041 | if (!isolate_lru_page(page)) { | 5346 | if (!isolate_lru_page(page)) { |
6042 | pc = lookup_page_cgroup(page); | ||
6043 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | 5347 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, |
6044 | pc, mc.from, mc.to)) { | 5348 | mc.from, mc.to)) { |
6045 | mc.precharge -= HPAGE_PMD_NR; | 5349 | mc.precharge -= HPAGE_PMD_NR; |
6046 | mc.moved_charge += HPAGE_PMD_NR; | 5350 | mc.moved_charge += HPAGE_PMD_NR; |
6047 | } | 5351 | } |
@@ -6069,9 +5373,7 @@ retry: | |||
6069 | page = target.page; | 5373 | page = target.page; |
6070 | if (isolate_lru_page(page)) | 5374 | if (isolate_lru_page(page)) |
6071 | goto put; | 5375 | goto put; |
6072 | pc = lookup_page_cgroup(page); | 5376 | if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { |
6073 | if (!mem_cgroup_move_account(page, 1, pc, | ||
6074 | mc.from, mc.to)) { | ||
6075 | mc.precharge--; | 5377 | mc.precharge--; |
6076 | /* we uncharge from mc.from later. */ | 5378 | /* we uncharge from mc.from later. */ |
6077 | mc.moved_charge++; | 5379 | mc.moved_charge++; |
@@ -6115,6 +5417,13 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
6115 | struct vm_area_struct *vma; | 5417 | struct vm_area_struct *vma; |
6116 | 5418 | ||
6117 | lru_add_drain_all(); | 5419 | lru_add_drain_all(); |
5420 | /* | ||
5421 | * Signal mem_cgroup_begin_page_stat() to take the memcg's | ||
5422 | * move_lock while we're moving its pages to another memcg. | ||
5423 | * Then wait for already started RCU-only updates to finish. | ||
5424 | */ | ||
5425 | atomic_inc(&mc.from->moving_account); | ||
5426 | synchronize_rcu(); | ||
6118 | retry: | 5427 | retry: |
6119 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | 5428 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
6120 | /* | 5429 | /* |
@@ -6147,6 +5456,7 @@ retry: | |||
6147 | break; | 5456 | break; |
6148 | } | 5457 | } |
6149 | up_read(&mm->mmap_sem); | 5458 | up_read(&mm->mmap_sem); |
5459 | atomic_dec(&mc.from->moving_account); | ||
6150 | } | 5460 | } |
6151 | 5461 | ||
6152 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, | 5462 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, |
@@ -6250,7 +5560,7 @@ static void __init enable_swap_cgroup(void) | |||
6250 | */ | 5560 | */ |
6251 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | 5561 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) |
6252 | { | 5562 | { |
6253 | struct page_cgroup *pc; | 5563 | struct mem_cgroup *memcg; |
6254 | unsigned short oldid; | 5564 | unsigned short oldid; |
6255 | 5565 | ||
6256 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5566 | VM_BUG_ON_PAGE(PageLRU(page), page); |
@@ -6259,20 +5569,26 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | |||
6259 | if (!do_swap_account) | 5569 | if (!do_swap_account) |
6260 | return; | 5570 | return; |
6261 | 5571 | ||
6262 | pc = lookup_page_cgroup(page); | 5572 | memcg = page->mem_cgroup; |
6263 | 5573 | ||
6264 | /* Readahead page, never charged */ | 5574 | /* Readahead page, never charged */ |
6265 | if (!PageCgroupUsed(pc)) | 5575 | if (!memcg) |
6266 | return; | 5576 | return; |
6267 | 5577 | ||
6268 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); | 5578 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); |
6269 | |||
6270 | oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); | ||
6271 | VM_BUG_ON_PAGE(oldid, page); | 5579 | VM_BUG_ON_PAGE(oldid, page); |
5580 | mem_cgroup_swap_statistics(memcg, true); | ||
5581 | |||
5582 | page->mem_cgroup = NULL; | ||
6272 | 5583 | ||
6273 | pc->flags &= ~PCG_MEMSW; | 5584 | if (!mem_cgroup_is_root(memcg)) |
6274 | css_get(&pc->mem_cgroup->css); | 5585 | page_counter_uncharge(&memcg->memory, 1); |
6275 | mem_cgroup_swap_statistics(pc->mem_cgroup, true); | 5586 | |
5587 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | ||
5588 | VM_BUG_ON(!irqs_disabled()); | ||
5589 | |||
5590 | mem_cgroup_charge_statistics(memcg, page, -1); | ||
5591 | memcg_check_events(memcg, page); | ||
6276 | } | 5592 | } |
6277 | 5593 | ||
6278 | /** | 5594 | /** |
@@ -6294,7 +5610,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) | |||
6294 | memcg = mem_cgroup_lookup(id); | 5610 | memcg = mem_cgroup_lookup(id); |
6295 | if (memcg) { | 5611 | if (memcg) { |
6296 | if (!mem_cgroup_is_root(memcg)) | 5612 | if (!mem_cgroup_is_root(memcg)) |
6297 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 5613 | page_counter_uncharge(&memcg->memsw, 1); |
6298 | mem_cgroup_swap_statistics(memcg, false); | 5614 | mem_cgroup_swap_statistics(memcg, false); |
6299 | css_put(&memcg->css); | 5615 | css_put(&memcg->css); |
6300 | } | 5616 | } |
@@ -6330,7 +5646,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
6330 | goto out; | 5646 | goto out; |
6331 | 5647 | ||
6332 | if (PageSwapCache(page)) { | 5648 | if (PageSwapCache(page)) { |
6333 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
6334 | /* | 5649 | /* |
6335 | * Every swap fault against a single page tries to charge the | 5650 | * Every swap fault against a single page tries to charge the |
6336 | * page, bail as early as possible. shmem_unuse() encounters | 5651 | * page, bail as early as possible. shmem_unuse() encounters |
@@ -6338,7 +5653,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
6338 | * the page lock, which serializes swap cache removal, which | 5653 | * the page lock, which serializes swap cache removal, which |
6339 | * in turn serializes uncharging. | 5654 | * in turn serializes uncharging. |
6340 | */ | 5655 | */ |
6341 | if (PageCgroupUsed(pc)) | 5656 | if (page->mem_cgroup) |
6342 | goto out; | 5657 | goto out; |
6343 | } | 5658 | } |
6344 | 5659 | ||
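Most of the memcontrol.c churn in this series is the same mechanical substitution: per-page ownership moves from the out-of-line page_cgroup array into a mem_cgroup pointer on struct page, so the old lookup-and-test-flags dance collapses into a single pointer check. Schematically (old form on top, new form below, both taken from the hunks in this file):

	/* old: indirection through the page_cgroup array */
	pc = lookup_page_cgroup(page);
	if (!PageCgroupUsed(pc))
		return;
	memcg = pc->mem_cgroup;

	/* new: the page carries its owner directly */
	memcg = page->mem_cgroup;
	if (!memcg)
		return;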
@@ -6452,19 +5767,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) | |||
6452 | } | 5767 | } |
6453 | 5768 | ||
6454 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | 5769 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, |
6455 | unsigned long nr_mem, unsigned long nr_memsw, | ||
6456 | unsigned long nr_anon, unsigned long nr_file, | 5770 | unsigned long nr_anon, unsigned long nr_file, |
6457 | unsigned long nr_huge, struct page *dummy_page) | 5771 | unsigned long nr_huge, struct page *dummy_page) |
6458 | { | 5772 | { |
5773 | unsigned long nr_pages = nr_anon + nr_file; | ||
6459 | unsigned long flags; | 5774 | unsigned long flags; |
6460 | 5775 | ||
6461 | if (!mem_cgroup_is_root(memcg)) { | 5776 | if (!mem_cgroup_is_root(memcg)) { |
6462 | if (nr_mem) | 5777 | page_counter_uncharge(&memcg->memory, nr_pages); |
6463 | res_counter_uncharge(&memcg->res, | 5778 | if (do_swap_account) |
6464 | nr_mem * PAGE_SIZE); | 5779 | page_counter_uncharge(&memcg->memsw, nr_pages); |
6465 | if (nr_memsw) | ||
6466 | res_counter_uncharge(&memcg->memsw, | ||
6467 | nr_memsw * PAGE_SIZE); | ||
6468 | memcg_oom_recover(memcg); | 5780 | memcg_oom_recover(memcg); |
6469 | } | 5781 | } |
6470 | 5782 | ||
@@ -6473,27 +5785,27 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | |||
6473 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); | 5785 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); |
6474 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); | 5786 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); |
6475 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); | 5787 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); |
6476 | __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); | 5788 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
6477 | memcg_check_events(memcg, dummy_page); | 5789 | memcg_check_events(memcg, dummy_page); |
6478 | local_irq_restore(flags); | 5790 | local_irq_restore(flags); |
5791 | |||
5792 | if (!mem_cgroup_is_root(memcg)) | ||
5793 | css_put_many(&memcg->css, nr_pages); | ||
6479 | } | 5794 | } |
6480 | 5795 | ||
6481 | static void uncharge_list(struct list_head *page_list) | 5796 | static void uncharge_list(struct list_head *page_list) |
6482 | { | 5797 | { |
6483 | struct mem_cgroup *memcg = NULL; | 5798 | struct mem_cgroup *memcg = NULL; |
6484 | unsigned long nr_memsw = 0; | ||
6485 | unsigned long nr_anon = 0; | 5799 | unsigned long nr_anon = 0; |
6486 | unsigned long nr_file = 0; | 5800 | unsigned long nr_file = 0; |
6487 | unsigned long nr_huge = 0; | 5801 | unsigned long nr_huge = 0; |
6488 | unsigned long pgpgout = 0; | 5802 | unsigned long pgpgout = 0; |
6489 | unsigned long nr_mem = 0; | ||
6490 | struct list_head *next; | 5803 | struct list_head *next; |
6491 | struct page *page; | 5804 | struct page *page; |
6492 | 5805 | ||
6493 | next = page_list->next; | 5806 | next = page_list->next; |
6494 | do { | 5807 | do { |
6495 | unsigned int nr_pages = 1; | 5808 | unsigned int nr_pages = 1; |
6496 | struct page_cgroup *pc; | ||
6497 | 5809 | ||
6498 | page = list_entry(next, struct page, lru); | 5810 | page = list_entry(next, struct page, lru); |
6499 | next = page->lru.next; | 5811 | next = page->lru.next; |
@@ -6501,24 +5813,22 @@ static void uncharge_list(struct list_head *page_list) | |||
6501 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5813 | VM_BUG_ON_PAGE(PageLRU(page), page); |
6502 | VM_BUG_ON_PAGE(page_count(page), page); | 5814 | VM_BUG_ON_PAGE(page_count(page), page); |
6503 | 5815 | ||
6504 | pc = lookup_page_cgroup(page); | 5816 | if (!page->mem_cgroup) |
6505 | if (!PageCgroupUsed(pc)) | ||
6506 | continue; | 5817 | continue; |
6507 | 5818 | ||
6508 | /* | 5819 | /* |
6509 | * Nobody should be changing or seriously looking at | 5820 | * Nobody should be changing or seriously looking at |
6510 | * pc->mem_cgroup and pc->flags at this point, we have | 5821 | * page->mem_cgroup at this point, we have fully |
6511 | * fully exclusive access to the page. | 5822 | * exclusive access to the page. |
6512 | */ | 5823 | */ |
6513 | 5824 | ||
6514 | if (memcg != pc->mem_cgroup) { | 5825 | if (memcg != page->mem_cgroup) { |
6515 | if (memcg) { | 5826 | if (memcg) { |
6516 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, | 5827 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
6517 | nr_anon, nr_file, nr_huge, page); | 5828 | nr_huge, page); |
6518 | pgpgout = nr_mem = nr_memsw = 0; | 5829 | pgpgout = nr_anon = nr_file = nr_huge = 0; |
6519 | nr_anon = nr_file = nr_huge = 0; | ||
6520 | } | 5830 | } |
6521 | memcg = pc->mem_cgroup; | 5831 | memcg = page->mem_cgroup; |
6522 | } | 5832 | } |
6523 | 5833 | ||
6524 | if (PageTransHuge(page)) { | 5834 | if (PageTransHuge(page)) { |
@@ -6532,18 +5842,14 @@ static void uncharge_list(struct list_head *page_list) | |||
6532 | else | 5842 | else |
6533 | nr_file += nr_pages; | 5843 | nr_file += nr_pages; |
6534 | 5844 | ||
6535 | if (pc->flags & PCG_MEM) | 5845 | page->mem_cgroup = NULL; |
6536 | nr_mem += nr_pages; | ||
6537 | if (pc->flags & PCG_MEMSW) | ||
6538 | nr_memsw += nr_pages; | ||
6539 | pc->flags = 0; | ||
6540 | 5846 | ||
6541 | pgpgout++; | 5847 | pgpgout++; |
6542 | } while (next != page_list); | 5848 | } while (next != page_list); |
6543 | 5849 | ||
6544 | if (memcg) | 5850 | if (memcg) |
6545 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, | 5851 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
6546 | nr_anon, nr_file, nr_huge, page); | 5852 | nr_huge, page); |
6547 | } | 5853 | } |
6548 | 5854 | ||
6549 | /** | 5855 | /** |
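The uncharge_list() loop batches by owner: counts are accumulated for the memcg that owns the current run of pages and flushed through uncharge_batch() whenever ownership changes, plus once at the end. A schematic of that shape (the real loop open-codes the list walk and also handles huge pages; this is illustrative only):

	memcg = NULL;
	list_for_each_entry(page, page_list, lru) {
		if (!page->mem_cgroup)
			continue;			/* never charged */
		if (memcg != page->mem_cgroup) {
			if (memcg)			/* flush the previous run */
				uncharge_batch(memcg, pgpgout, nr_anon,
					       nr_file, nr_huge, page);
			pgpgout = nr_anon = nr_file = nr_huge = 0;
			memcg = page->mem_cgroup;
		}
		/* accumulate nr_anon/nr_file, clear page->mem_cgroup, pgpgout++ */
	}
	if (memcg)
		uncharge_batch(memcg, pgpgout, nr_anon, nr_file, nr_huge, page);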
@@ -6555,14 +5861,11 @@ static void uncharge_list(struct list_head *page_list) | |||
6555 | */ | 5861 | */ |
6556 | void mem_cgroup_uncharge(struct page *page) | 5862 | void mem_cgroup_uncharge(struct page *page) |
6557 | { | 5863 | { |
6558 | struct page_cgroup *pc; | ||
6559 | |||
6560 | if (mem_cgroup_disabled()) | 5864 | if (mem_cgroup_disabled()) |
6561 | return; | 5865 | return; |
6562 | 5866 | ||
6563 | /* Don't touch page->lru of any random page, pre-check: */ | 5867 | /* Don't touch page->lru of any random page, pre-check: */ |
6564 | pc = lookup_page_cgroup(page); | 5868 | if (!page->mem_cgroup) |
6565 | if (!PageCgroupUsed(pc)) | ||
6566 | return; | 5869 | return; |
6567 | 5870 | ||
6568 | INIT_LIST_HEAD(&page->lru); | 5871 | INIT_LIST_HEAD(&page->lru); |
@@ -6598,7 +5901,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) | |||
6598 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | 5901 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, |
6599 | bool lrucare) | 5902 | bool lrucare) |
6600 | { | 5903 | { |
6601 | struct page_cgroup *pc; | 5904 | struct mem_cgroup *memcg; |
6602 | int isolated; | 5905 | int isolated; |
6603 | 5906 | ||
6604 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); | 5907 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); |
@@ -6613,27 +5916,28 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
6613 | return; | 5916 | return; |
6614 | 5917 | ||
6615 | /* Page cache replacement: new page already charged? */ | 5918 | /* Page cache replacement: new page already charged? */ |
6616 | pc = lookup_page_cgroup(newpage); | 5919 | if (newpage->mem_cgroup) |
6617 | if (PageCgroupUsed(pc)) | ||
6618 | return; | 5920 | return; |
6619 | 5921 | ||
6620 | /* Re-entrant migration: old page already uncharged? */ | 5922 | /* |
6621 | pc = lookup_page_cgroup(oldpage); | 5923 | * Swapcache readahead pages can get migrated before being |
6622 | if (!PageCgroupUsed(pc)) | 5924 | * charged, and migration from compaction can happen to an |
5925 | * uncharged page when the PFN walker finds a page that | ||
5926 | * reclaim just put back on the LRU but has not released yet. | ||
5927 | */ | ||
5928 | memcg = oldpage->mem_cgroup; | ||
5929 | if (!memcg) | ||
6623 | return; | 5930 | return; |
6624 | 5931 | ||
6625 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); | ||
6626 | VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); | ||
6627 | |||
6628 | if (lrucare) | 5932 | if (lrucare) |
6629 | lock_page_lru(oldpage, &isolated); | 5933 | lock_page_lru(oldpage, &isolated); |
6630 | 5934 | ||
6631 | pc->flags = 0; | 5935 | oldpage->mem_cgroup = NULL; |
6632 | 5936 | ||
6633 | if (lrucare) | 5937 | if (lrucare) |
6634 | unlock_page_lru(oldpage, isolated); | 5938 | unlock_page_lru(oldpage, isolated); |
6635 | 5939 | ||
6636 | commit_charge(newpage, pc->mem_cgroup, lrucare); | 5940 | commit_charge(newpage, memcg, lrucare); |
6637 | } | 5941 | } |
6638 | 5942 | ||
6639 | /* | 5943 | /* |
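The rewritten mem_cgroup_migrate() depends on ordering rather than flags: bail out if the new page is already charged (page cache replacement), read the old page's memcg and tolerate it being NULL (the uncharged readahead and compaction windows the new comment describes), clear the old page under the LRU lock when lrucare is set, and only then commit the charge to the new page. Condensed for illustration:

	if (newpage->mem_cgroup)		/* already charged */
		return;
	memcg = oldpage->mem_cgroup;
	if (!memcg)				/* never charged, nothing to move */
		return;
	if (lrucare)
		lock_page_lru(oldpage, &isolated);
	oldpage->mem_cgroup = NULL;
	if (lrucare)
		unlock_page_lru(oldpage, isolated);
	commit_charge(newpage, memcg, lrucare);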
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b852b10ec76d..e5ee0ca7ae85 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -233,7 +233,7 @@ void shake_page(struct page *p, int access) | |||
233 | lru_add_drain_all(); | 233 | lru_add_drain_all(); |
234 | if (PageLRU(p)) | 234 | if (PageLRU(p)) |
235 | return; | 235 | return; |
236 | drain_all_pages(); | 236 | drain_all_pages(page_zone(p)); |
237 | if (PageLRU(p) || is_free_buddy_page(p)) | 237 | if (PageLRU(p) || is_free_buddy_page(p)) |
238 | return; | 238 | return; |
239 | } | 239 | } |
@@ -1661,7 +1661,7 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1661 | if (!is_free_buddy_page(page)) | 1661 | if (!is_free_buddy_page(page)) |
1662 | lru_add_drain_all(); | 1662 | lru_add_drain_all(); |
1663 | if (!is_free_buddy_page(page)) | 1663 | if (!is_free_buddy_page(page)) |
1664 | drain_all_pages(); | 1664 | drain_all_pages(page_zone(page)); |
1665 | SetPageHWPoison(page); | 1665 | SetPageHWPoison(page); |
1666 | if (!is_free_buddy_page(page)) | 1666 | if (!is_free_buddy_page(page)) |
1667 | pr_info("soft offline: %#lx: page leaked\n", | 1667 | pr_info("soft offline: %#lx: page leaked\n", |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1bf4807cb21e..9fab10795bea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1725,7 +1725,7 @@ repeat: | |||
1725 | if (drain) { | 1725 | if (drain) { |
1726 | lru_add_drain_all(); | 1726 | lru_add_drain_all(); |
1727 | cond_resched(); | 1727 | cond_resched(); |
1728 | drain_all_pages(); | 1728 | drain_all_pages(zone); |
1729 | } | 1729 | } |
1730 | 1730 | ||
1731 | pfn = scan_movable_pages(start_pfn, end_pfn); | 1731 | pfn = scan_movable_pages(start_pfn, end_pfn); |
@@ -1747,7 +1747,7 @@ repeat: | |||
1747 | lru_add_drain_all(); | 1747 | lru_add_drain_all(); |
1748 | yield(); | 1748 | yield(); |
1749 | /* drain pcp pages, this is synchronous. */ | 1749 | /* drain pcp pages, this is synchronous. */ |
1750 | drain_all_pages(); | 1750 | drain_all_pages(zone); |
1751 | /* | 1751 | /* |
1752 | * dissolve free hugepages in the memory block before doing offlining | 1752 | * dissolve free hugepages in the memory block before doing offlining |
1753 | * actually in order to make hugetlbfs's object counting consistent. | 1753 | * actually in order to make hugetlbfs's object counting consistent. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5340f6b91312..3b014d326151 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -119,7 +119,7 @@ found: | |||
119 | 119 | ||
120 | /* return true if the task is not adequate as candidate victim task. */ | 120 | /* return true if the task is not adequate as candidate victim task. */ |
121 | static bool oom_unkillable_task(struct task_struct *p, | 121 | static bool oom_unkillable_task(struct task_struct *p, |
122 | const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 122 | struct mem_cgroup *memcg, const nodemask_t *nodemask) |
123 | { | 123 | { |
124 | if (is_global_init(p)) | 124 | if (is_global_init(p)) |
125 | return true; | 125 | return true; |
@@ -353,7 +353,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
353 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, | 353 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, |
354 | * swapents, oom_score_adj value, and name. | 354 | * swapents, oom_score_adj value, and name. |
355 | */ | 355 | */ |
356 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 356 | static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) |
357 | { | 357 | { |
358 | struct task_struct *p; | 358 | struct task_struct *p; |
359 | struct task_struct *task; | 359 | struct task_struct *task; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 19ceae87522d..d5d81f5384d1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2357,7 +2357,7 @@ int test_clear_page_writeback(struct page *page) | |||
2357 | dec_zone_page_state(page, NR_WRITEBACK); | 2357 | dec_zone_page_state(page, NR_WRITEBACK); |
2358 | inc_zone_page_state(page, NR_WRITTEN); | 2358 | inc_zone_page_state(page, NR_WRITTEN); |
2359 | } | 2359 | } |
2360 | mem_cgroup_end_page_stat(memcg, locked, memcg_flags); | 2360 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); |
2361 | return ret; | 2361 | return ret; |
2362 | } | 2362 | } |
2363 | 2363 | ||
@@ -2399,7 +2399,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2399 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2399 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); |
2400 | inc_zone_page_state(page, NR_WRITEBACK); | 2400 | inc_zone_page_state(page, NR_WRITEBACK); |
2401 | } | 2401 | } |
2402 | mem_cgroup_end_page_stat(memcg, locked, memcg_flags); | 2402 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); |
2403 | return ret; | 2403 | return ret; |
2404 | 2404 | ||
2405 | } | 2405 | } |
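These two callers reflect a signature change: mem_cgroup_end_page_stat() now takes the locked flag and the saved IRQ flags by pointer (the rmap.c hunks further down get the same treatment), so the end side can undo exactly what the begin side did. The pairing looks roughly like this; the begin-side signature is assumed from these call sites rather than shown in this diff:

	bool locked;
	unsigned long flags;
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	/* ... bump or drop the per-memcg page statistic ... */
	mem_cgroup_end_page_stat(memcg, &locked, &flags);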
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 616a2c956b4b..a7198c065999 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -48,7 +48,6 @@ | |||
48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
49 | #include <linux/fault-inject.h> | 49 | #include <linux/fault-inject.h> |
50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
51 | #include <linux/page_cgroup.h> | ||
52 | #include <linux/debugobjects.h> | 51 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 52 | #include <linux/kmemleak.h> |
54 | #include <linux/compaction.h> | 53 | #include <linux/compaction.h> |
@@ -641,8 +640,10 @@ static inline int free_pages_check(struct page *page) | |||
641 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; | 640 | bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
642 | bad_flags = PAGE_FLAGS_CHECK_AT_FREE; | 641 | bad_flags = PAGE_FLAGS_CHECK_AT_FREE; |
643 | } | 642 | } |
644 | if (unlikely(mem_cgroup_bad_page_check(page))) | 643 | #ifdef CONFIG_MEMCG |
645 | bad_reason = "cgroup check failed"; | 644 | if (unlikely(page->mem_cgroup)) |
645 | bad_reason = "page still charged to cgroup"; | ||
646 | #endif | ||
646 | if (unlikely(bad_reason)) { | 647 | if (unlikely(bad_reason)) { |
647 | bad_page(page, bad_reason, bad_flags); | 648 | bad_page(page, bad_reason, bad_flags); |
648 | return 1; | 649 | return 1; |
@@ -741,6 +742,9 @@ static bool free_pages_prepare(struct page *page, unsigned int order) | |||
741 | int i; | 742 | int i; |
742 | int bad = 0; | 743 | int bad = 0; |
743 | 744 | ||
745 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
746 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | ||
747 | |||
744 | trace_mm_page_free(page, order); | 748 | trace_mm_page_free(page, order); |
745 | kmemcheck_free_shadow(page, order); | 749 | kmemcheck_free_shadow(page, order); |
746 | 750 | ||
@@ -898,8 +902,10 @@ static inline int check_new_page(struct page *page) | |||
898 | bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; | 902 | bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; |
899 | bad_flags = PAGE_FLAGS_CHECK_AT_PREP; | 903 | bad_flags = PAGE_FLAGS_CHECK_AT_PREP; |
900 | } | 904 | } |
901 | if (unlikely(mem_cgroup_bad_page_check(page))) | 905 | #ifdef CONFIG_MEMCG |
902 | bad_reason = "cgroup check failed"; | 906 | if (unlikely(page->mem_cgroup)) |
907 | bad_reason = "page still charged to cgroup"; | ||
908 | #endif | ||
903 | if (unlikely(bad_reason)) { | 909 | if (unlikely(bad_reason)) { |
904 | bad_page(page, bad_reason, bad_flags); | 910 | bad_page(page, bad_reason, bad_flags); |
905 | return 1; | 911 | return 1; |
@@ -1267,55 +1273,75 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1267 | #endif | 1273 | #endif |
1268 | 1274 | ||
1269 | /* | 1275 | /* |
1270 | * Drain pages of the indicated processor. | 1276 | * Drain pcplists of the indicated processor and zone. |
1271 | * | 1277 | * |
1272 | * The processor must either be the current processor and the | 1278 | * The processor must either be the current processor and the |
1273 | * thread pinned to the current processor or a processor that | 1279 | * thread pinned to the current processor or a processor that |
1274 | * is not online. | 1280 | * is not online. |
1275 | */ | 1281 | */ |
1276 | static void drain_pages(unsigned int cpu) | 1282 | static void drain_pages_zone(unsigned int cpu, struct zone *zone) |
1277 | { | 1283 | { |
1278 | unsigned long flags; | 1284 | unsigned long flags; |
1279 | struct zone *zone; | 1285 | struct per_cpu_pageset *pset; |
1286 | struct per_cpu_pages *pcp; | ||
1280 | 1287 | ||
1281 | for_each_populated_zone(zone) { | 1288 | local_irq_save(flags); |
1282 | struct per_cpu_pageset *pset; | 1289 | pset = per_cpu_ptr(zone->pageset, cpu); |
1283 | struct per_cpu_pages *pcp; | ||
1284 | 1290 | ||
1285 | local_irq_save(flags); | 1291 | pcp = &pset->pcp; |
1286 | pset = per_cpu_ptr(zone->pageset, cpu); | 1292 | if (pcp->count) { |
1293 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
1294 | pcp->count = 0; | ||
1295 | } | ||
1296 | local_irq_restore(flags); | ||
1297 | } | ||
1287 | 1298 | ||
1288 | pcp = &pset->pcp; | 1299 | /* |
1289 | if (pcp->count) { | 1300 | * Drain pcplists of all zones on the indicated processor. |
1290 | free_pcppages_bulk(zone, pcp->count, pcp); | 1301 | * |
1291 | pcp->count = 0; | 1302 | * The processor must either be the current processor and the |
1292 | } | 1303 | * thread pinned to the current processor or a processor that |
1293 | local_irq_restore(flags); | 1304 | * is not online. |
1305 | */ | ||
1306 | static void drain_pages(unsigned int cpu) | ||
1307 | { | ||
1308 | struct zone *zone; | ||
1309 | |||
1310 | for_each_populated_zone(zone) { | ||
1311 | drain_pages_zone(cpu, zone); | ||
1294 | } | 1312 | } |
1295 | } | 1313 | } |
1296 | 1314 | ||
1297 | /* | 1315 | /* |
1298 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 1316 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
1317 | * | ||
1318 | * The CPU has to be pinned. When zone parameter is non-NULL, spill just | ||
1319 | * the single zone's pages. | ||
1299 | */ | 1320 | */ |
1300 | void drain_local_pages(void *arg) | 1321 | void drain_local_pages(struct zone *zone) |
1301 | { | 1322 | { |
1302 | drain_pages(smp_processor_id()); | 1323 | int cpu = smp_processor_id(); |
1324 | |||
1325 | if (zone) | ||
1326 | drain_pages_zone(cpu, zone); | ||
1327 | else | ||
1328 | drain_pages(cpu); | ||
1303 | } | 1329 | } |
1304 | 1330 | ||
1305 | /* | 1331 | /* |
1306 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. | 1332 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
1307 | * | 1333 | * |
1334 | * When zone parameter is non-NULL, spill just the single zone's pages. | ||
1335 | * | ||
1308 | * Note that this code is protected against sending an IPI to an offline | 1336 | * Note that this code is protected against sending an IPI to an offline |
1309 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: | 1337 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: |
1310 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but | 1338 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but |
1311 | * nothing keeps CPUs from showing up after we populated the cpumask and | 1339 | * nothing keeps CPUs from showing up after we populated the cpumask and |
1312 | * before the call to on_each_cpu_mask(). | 1340 | * before the call to on_each_cpu_mask(). |
1313 | */ | 1341 | */ |
1314 | void drain_all_pages(void) | 1342 | void drain_all_pages(struct zone *zone) |
1315 | { | 1343 | { |
1316 | int cpu; | 1344 | int cpu; |
1317 | struct per_cpu_pageset *pcp; | ||
1318 | struct zone *zone; | ||
1319 | 1345 | ||
1320 | /* | 1346 | /* |
1321 | * Allocate in the BSS so we wont require allocation in | 1347 | * Allocate in the BSS so we wont require allocation in |
@@ -1330,20 +1356,31 @@ void drain_all_pages(void) | |||
1330 | * disables preemption as part of its processing | 1356 | * disables preemption as part of its processing |
1331 | */ | 1357 | */ |
1332 | for_each_online_cpu(cpu) { | 1358 | for_each_online_cpu(cpu) { |
1359 | struct per_cpu_pageset *pcp; | ||
1360 | struct zone *z; | ||
1333 | bool has_pcps = false; | 1361 | bool has_pcps = false; |
1334 | for_each_populated_zone(zone) { | 1362 | |
1363 | if (zone) { | ||
1335 | pcp = per_cpu_ptr(zone->pageset, cpu); | 1364 | pcp = per_cpu_ptr(zone->pageset, cpu); |
1336 | if (pcp->pcp.count) { | 1365 | if (pcp->pcp.count) |
1337 | has_pcps = true; | 1366 | has_pcps = true; |
1338 | break; | 1367 | } else { |
1368 | for_each_populated_zone(z) { | ||
1369 | pcp = per_cpu_ptr(z->pageset, cpu); | ||
1370 | if (pcp->pcp.count) { | ||
1371 | has_pcps = true; | ||
1372 | break; | ||
1373 | } | ||
1339 | } | 1374 | } |
1340 | } | 1375 | } |
1376 | |||
1341 | if (has_pcps) | 1377 | if (has_pcps) |
1342 | cpumask_set_cpu(cpu, &cpus_with_pcps); | 1378 | cpumask_set_cpu(cpu, &cpus_with_pcps); |
1343 | else | 1379 | else |
1344 | cpumask_clear_cpu(cpu, &cpus_with_pcps); | 1380 | cpumask_clear_cpu(cpu, &cpus_with_pcps); |
1345 | } | 1381 | } |
1346 | on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); | 1382 | on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, |
1383 | zone, 1); | ||
1347 | } | 1384 | } |
1348 | 1385 | ||
1349 | #ifdef CONFIG_HIBERNATION | 1386 | #ifdef CONFIG_HIBERNATION |
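drain_local_pages() and drain_all_pages() gain a zone argument: a non-NULL zone drains only that zone's pcplists, NULL preserves the old drain-everything behaviour, and the zone pointer rides to other CPUs through the (smp_call_func_t) cast on drain_local_pages. Callers that know which zone they are freeing into pass it; the allocator slow path keeps the global form. Illustrative call sites, both visible later in this diff:

	/* targeted: memory hotplug, page isolation, alloc_contig_range */
	drain_all_pages(zone);

	/* global: direct reclaim retry in the allocator slow path */
	if (!page && !drained) {
		drain_all_pages(NULL);
		drained = true;
		goto retry;
	}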
@@ -1705,7 +1742,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, | |||
1705 | unsigned long mark, int classzone_idx, int alloc_flags, | 1742 | unsigned long mark, int classzone_idx, int alloc_flags, |
1706 | long free_pages) | 1743 | long free_pages) |
1707 | { | 1744 | { |
1708 | /* free_pages my go negative - that's OK */ | 1745 | /* free_pages may go negative - that's OK */ |
1709 | long min = mark; | 1746 | long min = mark; |
1710 | int o; | 1747 | int o; |
1711 | long free_cma = 0; | 1748 | long free_cma = 0; |
@@ -2296,7 +2333,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2296 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2333 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2297 | int *contended_compaction, bool *deferred_compaction) | 2334 | int *contended_compaction, bool *deferred_compaction) |
2298 | { | 2335 | { |
2299 | struct zone *last_compact_zone = NULL; | ||
2300 | unsigned long compact_result; | 2336 | unsigned long compact_result; |
2301 | struct page *page; | 2337 | struct page *page; |
2302 | 2338 | ||
@@ -2307,7 +2343,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2307 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2343 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, |
2308 | nodemask, mode, | 2344 | nodemask, mode, |
2309 | contended_compaction, | 2345 | contended_compaction, |
2310 | &last_compact_zone); | 2346 | alloc_flags, classzone_idx); |
2311 | current->flags &= ~PF_MEMALLOC; | 2347 | current->flags &= ~PF_MEMALLOC; |
2312 | 2348 | ||
2313 | switch (compact_result) { | 2349 | switch (compact_result) { |
@@ -2326,10 +2362,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2326 | */ | 2362 | */ |
2327 | count_vm_event(COMPACTSTALL); | 2363 | count_vm_event(COMPACTSTALL); |
2328 | 2364 | ||
2329 | /* Page migration frees to the PCP lists but we want merging */ | ||
2330 | drain_pages(get_cpu()); | ||
2331 | put_cpu(); | ||
2332 | |||
2333 | page = get_page_from_freelist(gfp_mask, nodemask, | 2365 | page = get_page_from_freelist(gfp_mask, nodemask, |
2334 | order, zonelist, high_zoneidx, | 2366 | order, zonelist, high_zoneidx, |
2335 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2367 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
@@ -2345,14 +2377,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2345 | } | 2377 | } |
2346 | 2378 | ||
2347 | /* | 2379 | /* |
2348 | * last_compact_zone is where try_to_compact_pages thought allocation | ||
2349 | * should succeed, so it did not defer compaction. But here we know | ||
2350 | * that it didn't succeed, so we do the defer. | ||
2351 | */ | ||
2352 | if (last_compact_zone && mode != MIGRATE_ASYNC) | ||
2353 | defer_compaction(last_compact_zone, order); | ||
2354 | |||
2355 | /* | ||
2356 | * It's bad if compaction run occurs and fails. The most likely reason | 2380 | * It's bad if compaction run occurs and fails. The most likely reason |
2357 | * is that pages exist, but not enough to satisfy watermarks. | 2381 | * is that pages exist, but not enough to satisfy watermarks. |
2358 | */ | 2382 | */ |
@@ -2433,7 +2457,7 @@ retry: | |||
2433 | * pages are pinned on the per-cpu lists. Drain them and try again | 2457 | * pages are pinned on the per-cpu lists. Drain them and try again |
2434 | */ | 2458 | */ |
2435 | if (!page && !drained) { | 2459 | if (!page && !drained) { |
2436 | drain_all_pages(); | 2460 | drain_all_pages(NULL); |
2437 | drained = true; | 2461 | drained = true; |
2438 | goto retry; | 2462 | goto retry; |
2439 | } | 2463 | } |
@@ -3893,14 +3917,14 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
3893 | else | 3917 | else |
3894 | page_group_by_mobility_disabled = 0; | 3918 | page_group_by_mobility_disabled = 0; |
3895 | 3919 | ||
3896 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 3920 | pr_info("Built %i zonelists in %s order, mobility grouping %s. " |
3897 | "Total pages: %ld\n", | 3921 | "Total pages: %ld\n", |
3898 | nr_online_nodes, | 3922 | nr_online_nodes, |
3899 | zonelist_order_name[current_zonelist_order], | 3923 | zonelist_order_name[current_zonelist_order], |
3900 | page_group_by_mobility_disabled ? "off" : "on", | 3924 | page_group_by_mobility_disabled ? "off" : "on", |
3901 | vm_total_pages); | 3925 | vm_total_pages); |
3902 | #ifdef CONFIG_NUMA | 3926 | #ifdef CONFIG_NUMA |
3903 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 3927 | pr_info("Policy zone: %s\n", zone_names[policy_zone]); |
3904 | #endif | 3928 | #endif |
3905 | } | 3929 | } |
3906 | 3930 | ||
@@ -4832,7 +4856,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4832 | #endif | 4856 | #endif |
4833 | init_waitqueue_head(&pgdat->kswapd_wait); | 4857 | init_waitqueue_head(&pgdat->kswapd_wait); |
4834 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4858 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4835 | pgdat_page_cgroup_init(pgdat); | ||
4836 | 4859 | ||
4837 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4860 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4838 | struct zone *zone = pgdat->node_zones + j; | 4861 | struct zone *zone = pgdat->node_zones + j; |
@@ -5334,33 +5357,33 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5334 | find_zone_movable_pfns_for_nodes(); | 5357 | find_zone_movable_pfns_for_nodes(); |
5335 | 5358 | ||
5336 | /* Print out the zone ranges */ | 5359 | /* Print out the zone ranges */ |
5337 | printk("Zone ranges:\n"); | 5360 | pr_info("Zone ranges:\n"); |
5338 | for (i = 0; i < MAX_NR_ZONES; i++) { | 5361 | for (i = 0; i < MAX_NR_ZONES; i++) { |
5339 | if (i == ZONE_MOVABLE) | 5362 | if (i == ZONE_MOVABLE) |
5340 | continue; | 5363 | continue; |
5341 | printk(KERN_CONT " %-8s ", zone_names[i]); | 5364 | pr_info(" %-8s ", zone_names[i]); |
5342 | if (arch_zone_lowest_possible_pfn[i] == | 5365 | if (arch_zone_lowest_possible_pfn[i] == |
5343 | arch_zone_highest_possible_pfn[i]) | 5366 | arch_zone_highest_possible_pfn[i]) |
5344 | printk(KERN_CONT "empty\n"); | 5367 | pr_cont("empty\n"); |
5345 | else | 5368 | else |
5346 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", | 5369 | pr_cont("[mem %0#10lx-%0#10lx]\n", |
5347 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5370 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
5348 | (arch_zone_highest_possible_pfn[i] | 5371 | (arch_zone_highest_possible_pfn[i] |
5349 | << PAGE_SHIFT) - 1); | 5372 | << PAGE_SHIFT) - 1); |
5350 | } | 5373 | } |
5351 | 5374 | ||
5352 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 5375 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
5353 | printk("Movable zone start for each node\n"); | 5376 | pr_info("Movable zone start for each node\n"); |
5354 | for (i = 0; i < MAX_NUMNODES; i++) { | 5377 | for (i = 0; i < MAX_NUMNODES; i++) { |
5355 | if (zone_movable_pfn[i]) | 5378 | if (zone_movable_pfn[i]) |
5356 | printk(" Node %d: %#010lx\n", i, | 5379 | pr_info(" Node %d: %#010lx\n", i, |
5357 | zone_movable_pfn[i] << PAGE_SHIFT); | 5380 | zone_movable_pfn[i] << PAGE_SHIFT); |
5358 | } | 5381 | } |
5359 | 5382 | ||
5360 | /* Print out the early node map */ | 5383 | /* Print out the early node map */ |
5361 | printk("Early memory node ranges\n"); | 5384 | pr_info("Early memory node ranges\n"); |
5362 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5385 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
5363 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5386 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
5364 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5387 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); |
5365 | 5388 | ||
5366 | /* Initialise every node */ | 5389 | /* Initialise every node */ |
@@ -5496,7 +5519,7 @@ void __init mem_init_print_info(const char *str) | |||
5496 | 5519 | ||
5497 | #undef adj_init_size | 5520 | #undef adj_init_size |
5498 | 5521 | ||
5499 | printk("Memory: %luK/%luK available " | 5522 | pr_info("Memory: %luK/%luK available " |
5500 | "(%luK kernel code, %luK rwdata, %luK rodata, " | 5523 | "(%luK kernel code, %luK rwdata, %luK rodata, " |
5501 | "%luK init, %luK bss, %luK reserved" | 5524 | "%luK init, %luK bss, %luK reserved" |
5502 | #ifdef CONFIG_HIGHMEM | 5525 | #ifdef CONFIG_HIGHMEM |
@@ -6385,7 +6408,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
6385 | */ | 6408 | */ |
6386 | 6409 | ||
6387 | lru_add_drain_all(); | 6410 | lru_add_drain_all(); |
6388 | drain_all_pages(); | 6411 | drain_all_pages(cc.zone); |
6389 | 6412 | ||
6390 | order = 0; | 6413 | order = 0; |
6391 | outer_start = start; | 6414 | outer_start = start; |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c deleted file mode 100644 index 5331c2bd85a2..000000000000 --- a/mm/page_cgroup.c +++ /dev/null | |||
@@ -1,530 +0,0 @@ | |||
1 | #include <linux/mm.h> | ||
2 | #include <linux/mmzone.h> | ||
3 | #include <linux/bootmem.h> | ||
4 | #include <linux/bit_spinlock.h> | ||
5 | #include <linux/page_cgroup.h> | ||
6 | #include <linux/hash.h> | ||
7 | #include <linux/slab.h> | ||
8 | #include <linux/memory.h> | ||
9 | #include <linux/vmalloc.h> | ||
10 | #include <linux/cgroup.h> | ||
11 | #include <linux/swapops.h> | ||
12 | #include <linux/kmemleak.h> | ||
13 | |||
14 | static unsigned long total_usage; | ||
15 | |||
16 | #if !defined(CONFIG_SPARSEMEM) | ||
17 | |||
18 | |||
19 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
20 | { | ||
21 | pgdat->node_page_cgroup = NULL; | ||
22 | } | ||
23 | |||
24 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
25 | { | ||
26 | unsigned long pfn = page_to_pfn(page); | ||
27 | unsigned long offset; | ||
28 | struct page_cgroup *base; | ||
29 | |||
30 | base = NODE_DATA(page_to_nid(page))->node_page_cgroup; | ||
31 | #ifdef CONFIG_DEBUG_VM | ||
32 | /* | ||
33 | * The sanity checks the page allocator does upon freeing a | ||
34 | * page can reach here before the page_cgroup arrays are | ||
35 | * allocated when feeding a range of pages to the allocator | ||
36 | * for the first time during bootup or memory hotplug. | ||
37 | */ | ||
38 | if (unlikely(!base)) | ||
39 | return NULL; | ||
40 | #endif | ||
41 | offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; | ||
42 | return base + offset; | ||
43 | } | ||
44 | |||
45 | static int __init alloc_node_page_cgroup(int nid) | ||
46 | { | ||
47 | struct page_cgroup *base; | ||
48 | unsigned long table_size; | ||
49 | unsigned long nr_pages; | ||
50 | |||
51 | nr_pages = NODE_DATA(nid)->node_spanned_pages; | ||
52 | if (!nr_pages) | ||
53 | return 0; | ||
54 | |||
55 | table_size = sizeof(struct page_cgroup) * nr_pages; | ||
56 | |||
57 | base = memblock_virt_alloc_try_nid_nopanic( | ||
58 | table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS), | ||
59 | BOOTMEM_ALLOC_ACCESSIBLE, nid); | ||
60 | if (!base) | ||
61 | return -ENOMEM; | ||
62 | NODE_DATA(nid)->node_page_cgroup = base; | ||
63 | total_usage += table_size; | ||
64 | return 0; | ||
65 | } | ||
66 | |||
67 | void __init page_cgroup_init_flatmem(void) | ||
68 | { | ||
69 | |||
70 | int nid, fail; | ||
71 | |||
72 | if (mem_cgroup_disabled()) | ||
73 | return; | ||
74 | |||
75 | for_each_online_node(nid) { | ||
76 | fail = alloc_node_page_cgroup(nid); | ||
77 | if (fail) | ||
78 | goto fail; | ||
79 | } | ||
80 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
81 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" | ||
82 | " don't want memory cgroups\n"); | ||
83 | return; | ||
84 | fail: | ||
85 | printk(KERN_CRIT "allocation of page_cgroup failed.\n"); | ||
86 | printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); | ||
87 | panic("Out of memory"); | ||
88 | } | ||
89 | |||
90 | #else /* CONFIG_FLAT_NODE_MEM_MAP */ | ||
91 | |||
92 | struct page_cgroup *lookup_page_cgroup(struct page *page) | ||
93 | { | ||
94 | unsigned long pfn = page_to_pfn(page); | ||
95 | struct mem_section *section = __pfn_to_section(pfn); | ||
96 | #ifdef CONFIG_DEBUG_VM | ||
97 | /* | ||
98 | * The sanity checks the page allocator does upon freeing a | ||
99 | * page can reach here before the page_cgroup arrays are | ||
100 | * allocated when feeding a range of pages to the allocator | ||
101 | * for the first time during bootup or memory hotplug. | ||
102 | */ | ||
103 | if (!section->page_cgroup) | ||
104 | return NULL; | ||
105 | #endif | ||
106 | return section->page_cgroup + pfn; | ||
107 | } | ||
108 | |||
109 | static void *__meminit alloc_page_cgroup(size_t size, int nid) | ||
110 | { | ||
111 | gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; | ||
112 | void *addr = NULL; | ||
113 | |||
114 | addr = alloc_pages_exact_nid(nid, size, flags); | ||
115 | if (addr) { | ||
116 | kmemleak_alloc(addr, size, 1, flags); | ||
117 | return addr; | ||
118 | } | ||
119 | |||
120 | if (node_state(nid, N_HIGH_MEMORY)) | ||
121 | addr = vzalloc_node(size, nid); | ||
122 | else | ||
123 | addr = vzalloc(size); | ||
124 | |||
125 | return addr; | ||
126 | } | ||
127 | |||
128 | static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) | ||
129 | { | ||
130 | struct mem_section *section; | ||
131 | struct page_cgroup *base; | ||
132 | unsigned long table_size; | ||
133 | |||
134 | section = __pfn_to_section(pfn); | ||
135 | |||
136 | if (section->page_cgroup) | ||
137 | return 0; | ||
138 | |||
139 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
140 | base = alloc_page_cgroup(table_size, nid); | ||
141 | |||
142 | /* | ||
143 | * The value stored in section->page_cgroup is (base - pfn) | ||
144 | * and it does not point to the memory block allocated above, | ||
145 | * causing kmemleak false positives. | ||
146 | */ | ||
147 | kmemleak_not_leak(base); | ||
148 | |||
149 | if (!base) { | ||
150 | printk(KERN_ERR "page cgroup allocation failure\n"); | ||
151 | return -ENOMEM; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * The passed "pfn" may not be aligned to SECTION. For the calculation | ||
156 | * we need to apply a mask. | ||
157 | */ | ||
158 | pfn &= PAGE_SECTION_MASK; | ||
159 | section->page_cgroup = base - pfn; | ||
160 | total_usage += table_size; | ||
161 | return 0; | ||
162 | } | ||
163 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
164 | static void free_page_cgroup(void *addr) | ||
165 | { | ||
166 | if (is_vmalloc_addr(addr)) { | ||
167 | vfree(addr); | ||
168 | } else { | ||
169 | struct page *page = virt_to_page(addr); | ||
170 | size_t table_size = | ||
171 | sizeof(struct page_cgroup) * PAGES_PER_SECTION; | ||
172 | |||
173 | BUG_ON(PageReserved(page)); | ||
174 | kmemleak_free(addr); | ||
175 | free_pages_exact(addr, table_size); | ||
176 | } | ||
177 | } | ||
178 | |||
179 | static void __free_page_cgroup(unsigned long pfn) | ||
180 | { | ||
181 | struct mem_section *ms; | ||
182 | struct page_cgroup *base; | ||
183 | |||
184 | ms = __pfn_to_section(pfn); | ||
185 | if (!ms || !ms->page_cgroup) | ||
186 | return; | ||
187 | base = ms->page_cgroup + pfn; | ||
188 | free_page_cgroup(base); | ||
189 | ms->page_cgroup = NULL; | ||
190 | } | ||
191 | |||
192 | static int __meminit online_page_cgroup(unsigned long start_pfn, | ||
193 | unsigned long nr_pages, | ||
194 | int nid) | ||
195 | { | ||
196 | unsigned long start, end, pfn; | ||
197 | int fail = 0; | ||
198 | |||
199 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
200 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
201 | |||
202 | if (nid == -1) { | ||
203 | /* | ||
204 | * In this case, "nid" already exists and contains valid memory. | ||
205 | * "start_pfn" passed to us is a pfn which is an arg for | ||
206 | * online__pages(), and start_pfn should exist. | ||
207 | */ | ||
208 | nid = pfn_to_nid(start_pfn); | ||
209 | VM_BUG_ON(!node_state(nid, N_ONLINE)); | ||
210 | } | ||
211 | |||
212 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | ||
213 | if (!pfn_present(pfn)) | ||
214 | continue; | ||
215 | fail = init_section_page_cgroup(pfn, nid); | ||
216 | } | ||
217 | if (!fail) | ||
218 | return 0; | ||
219 | |||
220 | /* rollback */ | ||
221 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
222 | __free_page_cgroup(pfn); | ||
223 | |||
224 | return -ENOMEM; | ||
225 | } | ||
226 | |||
227 | static int __meminit offline_page_cgroup(unsigned long start_pfn, | ||
228 | unsigned long nr_pages, int nid) | ||
229 | { | ||
230 | unsigned long start, end, pfn; | ||
231 | |||
232 | start = SECTION_ALIGN_DOWN(start_pfn); | ||
233 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | ||
234 | |||
235 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) | ||
236 | __free_page_cgroup(pfn); | ||
237 | return 0; | ||
238 | |||
239 | } | ||
240 | |||
241 | static int __meminit page_cgroup_callback(struct notifier_block *self, | ||
242 | unsigned long action, void *arg) | ||
243 | { | ||
244 | struct memory_notify *mn = arg; | ||
245 | int ret = 0; | ||
246 | switch (action) { | ||
247 | case MEM_GOING_ONLINE: | ||
248 | ret = online_page_cgroup(mn->start_pfn, | ||
249 | mn->nr_pages, mn->status_change_nid); | ||
250 | break; | ||
251 | case MEM_OFFLINE: | ||
252 | offline_page_cgroup(mn->start_pfn, | ||
253 | mn->nr_pages, mn->status_change_nid); | ||
254 | break; | ||
255 | case MEM_CANCEL_ONLINE: | ||
256 | offline_page_cgroup(mn->start_pfn, | ||
257 | mn->nr_pages, mn->status_change_nid); | ||
258 | break; | ||
259 | case MEM_GOING_OFFLINE: | ||
260 | break; | ||
261 | case MEM_ONLINE: | ||
262 | case MEM_CANCEL_OFFLINE: | ||
263 | break; | ||
264 | } | ||
265 | |||
266 | return notifier_from_errno(ret); | ||
267 | } | ||
268 | |||
269 | #endif | ||
270 | |||
271 | void __init page_cgroup_init(void) | ||
272 | { | ||
273 | unsigned long pfn; | ||
274 | int nid; | ||
275 | |||
276 | if (mem_cgroup_disabled()) | ||
277 | return; | ||
278 | |||
279 | for_each_node_state(nid, N_MEMORY) { | ||
280 | unsigned long start_pfn, end_pfn; | ||
281 | |||
282 | start_pfn = node_start_pfn(nid); | ||
283 | end_pfn = node_end_pfn(nid); | ||
284 | /* | ||
285 | * start_pfn and end_pfn may not be aligned to SECTION and the | ||
286 | * page->flags of out of node pages are not initialized. So we | ||
287 | * scan [start_pfn, the biggest section's pfn < end_pfn) here. | ||
288 | */ | ||
289 | for (pfn = start_pfn; | ||
290 | pfn < end_pfn; | ||
291 | pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { | ||
292 | |||
293 | if (!pfn_valid(pfn)) | ||
294 | continue; | ||
295 | /* | ||
296 | * Nodes's pfns can be overlapping. | ||
297 | * We know some arch can have a nodes layout such as | ||
298 | * -------------pfn--------------> | ||
299 | * N0 | N1 | N2 | N0 | N1 | N2|.... | ||
300 | */ | ||
301 | if (pfn_to_nid(pfn) != nid) | ||
302 | continue; | ||
303 | if (init_section_page_cgroup(pfn, nid)) | ||
304 | goto oom; | ||
305 | } | ||
306 | } | ||
307 | hotplug_memory_notifier(page_cgroup_callback, 0); | ||
308 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | ||
309 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " | ||
310 | "don't want memory cgroups\n"); | ||
311 | return; | ||
312 | oom: | ||
313 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); | ||
314 | panic("Out of memory"); | ||
315 | } | ||
316 | |||
317 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | ||
318 | { | ||
319 | return; | ||
320 | } | ||
321 | |||
322 | #endif | ||
323 | |||
324 | |||
325 | #ifdef CONFIG_MEMCG_SWAP | ||
326 | |||
327 | static DEFINE_MUTEX(swap_cgroup_mutex); | ||
328 | struct swap_cgroup_ctrl { | ||
329 | struct page **map; | ||
330 | unsigned long length; | ||
331 | spinlock_t lock; | ||
332 | }; | ||
333 | |||
334 | static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | ||
335 | |||
336 | struct swap_cgroup { | ||
337 | unsigned short id; | ||
338 | }; | ||
339 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | ||
340 | |||
341 | /* | ||
342 | * SwapCgroup implements "lookup" and "exchange" operations. | ||
343 | * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge | ||
344 | * against SwapCache. At swap_free(), this is accessed directly from swap. | ||
345 | * | ||
346 | * This means, | ||
347 | * - we have no race in "exchange" when we're accessed via SwapCache because | ||
348 | * SwapCache(and its swp_entry) is under lock. | ||
349 | * - When called via swap_free(), there is no user of this entry and no race. | ||
350 | * Then, we don't need lock around "exchange". | ||
351 | * | ||
352 | * TODO: we can push these buffers out to HIGHMEM. | ||
353 | */ | ||
354 | |||
355 | /* | ||
356 | * allocate buffer for swap_cgroup. | ||
357 | */ | ||
358 | static int swap_cgroup_prepare(int type) | ||
359 | { | ||
360 | struct page *page; | ||
361 | struct swap_cgroup_ctrl *ctrl; | ||
362 | unsigned long idx, max; | ||
363 | |||
364 | ctrl = &swap_cgroup_ctrl[type]; | ||
365 | |||
366 | for (idx = 0; idx < ctrl->length; idx++) { | ||
367 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
368 | if (!page) | ||
369 | goto not_enough_page; | ||
370 | ctrl->map[idx] = page; | ||
371 | } | ||
372 | return 0; | ||
373 | not_enough_page: | ||
374 | max = idx; | ||
375 | for (idx = 0; idx < max; idx++) | ||
376 | __free_page(ctrl->map[idx]); | ||
377 | |||
378 | return -ENOMEM; | ||
379 | } | ||
380 | |||
381 | static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | ||
382 | struct swap_cgroup_ctrl **ctrlp) | ||
383 | { | ||
384 | pgoff_t offset = swp_offset(ent); | ||
385 | struct swap_cgroup_ctrl *ctrl; | ||
386 | struct page *mappage; | ||
387 | struct swap_cgroup *sc; | ||
388 | |||
389 | ctrl = &swap_cgroup_ctrl[swp_type(ent)]; | ||
390 | if (ctrlp) | ||
391 | *ctrlp = ctrl; | ||
392 | |||
393 | mappage = ctrl->map[offset / SC_PER_PAGE]; | ||
394 | sc = page_address(mappage); | ||
395 | return sc + offset % SC_PER_PAGE; | ||
396 | } | ||
397 | |||
398 | /** | ||
399 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
400 | * @ent: swap entry to be cmpxchged | ||
401 | * @old: old id | ||
402 | * @new: new id | ||
403 | * | ||
404 | * Returns old id at success, 0 at failure. | ||
405 | * (There is no mem_cgroup using 0 as its id) | ||
406 | */ | ||
407 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
408 | unsigned short old, unsigned short new) | ||
409 | { | ||
410 | struct swap_cgroup_ctrl *ctrl; | ||
411 | struct swap_cgroup *sc; | ||
412 | unsigned long flags; | ||
413 | unsigned short retval; | ||
414 | |||
415 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
416 | |||
417 | spin_lock_irqsave(&ctrl->lock, flags); | ||
418 | retval = sc->id; | ||
419 | if (retval == old) | ||
420 | sc->id = new; | ||
421 | else | ||
422 | retval = 0; | ||
423 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
424 | return retval; | ||
425 | } | ||
426 | |||
427 | /** | ||
428 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | ||
429 | * @ent: swap entry to be recorded into | ||
430 | * @id: mem_cgroup to be recorded | ||
431 | * | ||
432 | * Returns old value at success, 0 at failure. | ||
433 | * (Of course, old value can be 0.) | ||
434 | */ | ||
435 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | ||
436 | { | ||
437 | struct swap_cgroup_ctrl *ctrl; | ||
438 | struct swap_cgroup *sc; | ||
439 | unsigned short old; | ||
440 | unsigned long flags; | ||
441 | |||
442 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
443 | |||
444 | spin_lock_irqsave(&ctrl->lock, flags); | ||
445 | old = sc->id; | ||
446 | sc->id = id; | ||
447 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
448 | |||
449 | return old; | ||
450 | } | ||
451 | |||
452 | /** | ||
453 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry | ||
454 | * @ent: swap entry to be looked up. | ||
455 | * | ||
456 | * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) | ||
457 | */ | ||
458 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | ||
459 | { | ||
460 | return lookup_swap_cgroup(ent, NULL)->id; | ||
461 | } | ||
462 | |||
463 | int swap_cgroup_swapon(int type, unsigned long max_pages) | ||
464 | { | ||
465 | void *array; | ||
466 | unsigned long array_size; | ||
467 | unsigned long length; | ||
468 | struct swap_cgroup_ctrl *ctrl; | ||
469 | |||
470 | if (!do_swap_account) | ||
471 | return 0; | ||
472 | |||
473 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); | ||
474 | array_size = length * sizeof(void *); | ||
475 | |||
476 | array = vzalloc(array_size); | ||
477 | if (!array) | ||
478 | goto nomem; | ||
479 | |||
480 | ctrl = &swap_cgroup_ctrl[type]; | ||
481 | mutex_lock(&swap_cgroup_mutex); | ||
482 | ctrl->length = length; | ||
483 | ctrl->map = array; | ||
484 | spin_lock_init(&ctrl->lock); | ||
485 | if (swap_cgroup_prepare(type)) { | ||
486 | /* memory shortage */ | ||
487 | ctrl->map = NULL; | ||
488 | ctrl->length = 0; | ||
489 | mutex_unlock(&swap_cgroup_mutex); | ||
490 | vfree(array); | ||
491 | goto nomem; | ||
492 | } | ||
493 | mutex_unlock(&swap_cgroup_mutex); | ||
494 | |||
495 | return 0; | ||
496 | nomem: | ||
497 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | ||
498 | printk(KERN_INFO | ||
499 | "swap_cgroup can be disabled by swapaccount=0 boot option\n"); | ||
500 | return -ENOMEM; | ||
501 | } | ||
502 | |||
503 | void swap_cgroup_swapoff(int type) | ||
504 | { | ||
505 | struct page **map; | ||
506 | unsigned long i, length; | ||
507 | struct swap_cgroup_ctrl *ctrl; | ||
508 | |||
509 | if (!do_swap_account) | ||
510 | return; | ||
511 | |||
512 | mutex_lock(&swap_cgroup_mutex); | ||
513 | ctrl = &swap_cgroup_ctrl[type]; | ||
514 | map = ctrl->map; | ||
515 | length = ctrl->length; | ||
516 | ctrl->map = NULL; | ||
517 | ctrl->length = 0; | ||
518 | mutex_unlock(&swap_cgroup_mutex); | ||
519 | |||
520 | if (map) { | ||
521 | for (i = 0; i < length; i++) { | ||
522 | struct page *page = map[i]; | ||
523 | if (page) | ||
524 | __free_page(page); | ||
525 | } | ||
526 | vfree(map); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | #endif | ||
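Only the first half of this file really disappears: the swap_cgroup code at the bottom moves essentially unchanged into the new mm/swap_cgroup.c added elsewhere in this series, because a swapped-out page has no struct page to hang a mem_cgroup pointer off and still needs the out-of-line id array. Its usage pattern, as exercised by the memcontrol.c hunks above (sketch only; old_id, new_id and moved are placeholders):

	/* swap-out: remember the owning memcg for this entry */
	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));

	/* charge migration between memcgs: succeeds only if nobody raced */
	if (swap_cgroup_cmpxchg(entry, old_id, new_id))
		moved = true;			/* old_id still owned the entry */

	/* entry freed or swapped back in: look the owner up again */
	id = lookup_swap_cgroup_id(entry);
	memcg = mem_cgroup_lookup(id);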
diff --git a/mm/page_counter.c b/mm/page_counter.c new file mode 100644 index 000000000000..a009574fbba9 --- /dev/null +++ b/mm/page_counter.c | |||
@@ -0,0 +1,192 @@ | |||
1 | /* | ||
2 | * Lockless hierarchical page accounting & limiting | ||
3 | * | ||
4 | * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner | ||
5 | */ | ||
6 | |||
7 | #include <linux/page_counter.h> | ||
8 | #include <linux/atomic.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/sched.h> | ||
12 | #include <linux/bug.h> | ||
13 | #include <asm/page.h> | ||
14 | |||
15 | /** | ||
16 | * page_counter_cancel - take pages out of the local counter | ||
17 | * @counter: counter | ||
18 | * @nr_pages: number of pages to cancel | ||
19 | */ | ||
20 | void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) | ||
21 | { | ||
22 | long new; | ||
23 | |||
24 | new = atomic_long_sub_return(nr_pages, &counter->count); | ||
25 | /* More uncharges than charges? */ | ||
26 | WARN_ON_ONCE(new < 0); | ||
27 | } | ||
28 | |||
29 | /** | ||
30 | * page_counter_charge - hierarchically charge pages | ||
31 | * @counter: counter | ||
32 | * @nr_pages: number of pages to charge | ||
33 | * | ||
34 | * NOTE: This does not consider any configured counter limits. | ||
35 | */ | ||
36 | void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) | ||
37 | { | ||
38 | struct page_counter *c; | ||
39 | |||
40 | for (c = counter; c; c = c->parent) { | ||
41 | long new; | ||
42 | |||
43 | new = atomic_long_add_return(nr_pages, &c->count); | ||
44 | /* | ||
45 | * This is indeed racy, but we can live with some | ||
46 | * inaccuracy in the watermark. | ||
47 | */ | ||
48 | if (new > c->watermark) | ||
49 | c->watermark = new; | ||
50 | } | ||
51 | } | ||
52 | |||
53 | /** | ||
54 | * page_counter_try_charge - try to hierarchically charge pages | ||
55 | * @counter: counter | ||
56 | * @nr_pages: number of pages to charge | ||
57 | * @fail: points first counter to hit its limit, if any | ||
58 | * | ||
59 | * Returns 0 on success, or -ENOMEM and @fail if the counter or one of | ||
60 | * its ancestors has hit its configured limit. | ||
61 | */ | ||
62 | int page_counter_try_charge(struct page_counter *counter, | ||
63 | unsigned long nr_pages, | ||
64 | struct page_counter **fail) | ||
65 | { | ||
66 | struct page_counter *c; | ||
67 | |||
68 | for (c = counter; c; c = c->parent) { | ||
69 | long new; | ||
70 | /* | ||
71 | * Charge speculatively to avoid an expensive CAS. If | ||
72 | * a bigger charge fails, it might falsely lock out a | ||
73 | * racing smaller charge and send it into reclaim | ||
74 | * early, but the error is limited to the difference | ||
75 | * between the two sizes, which is less than 2M/4M in | ||
76 | * case of a THP locking out a regular page charge. | ||
77 | * | ||
78 | * The atomic_long_add_return() implies a full memory | ||
79 | * barrier between incrementing the count and reading | ||
80 | * the limit. When racing with page_counter_limit(), | ||
81 | * we either see the new limit or the setter sees the | ||
82 | * counter has changed and retries. | ||
83 | */ | ||
84 | new = atomic_long_add_return(nr_pages, &c->count); | ||
85 | if (new > c->limit) { | ||
86 | atomic_long_sub(nr_pages, &c->count); | ||
87 | /* | ||
88 | * This is racy, but we can live with some | ||
89 | * inaccuracy in the failcnt. | ||
90 | */ | ||
91 | c->failcnt++; | ||
92 | *fail = c; | ||
93 | goto failed; | ||
94 | } | ||
95 | /* | ||
96 | * Just like with failcnt, we can live with some | ||
97 | * inaccuracy in the watermark. | ||
98 | */ | ||
99 | if (new > c->watermark) | ||
100 | c->watermark = new; | ||
101 | } | ||
102 | return 0; | ||
103 | |||
104 | failed: | ||
105 | for (c = counter; c != *fail; c = c->parent) | ||
106 | page_counter_cancel(c, nr_pages); | ||
107 | |||
108 | return -ENOMEM; | ||
109 | } | ||
110 | |||
111 | /** | ||
112 | * page_counter_uncharge - hierarchically uncharge pages | ||
113 | * @counter: counter | ||
114 | * @nr_pages: number of pages to uncharge | ||
115 | */ | ||
116 | void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) | ||
117 | { | ||
118 | struct page_counter *c; | ||
119 | |||
120 | for (c = counter; c; c = c->parent) | ||
121 | page_counter_cancel(c, nr_pages); | ||
122 | } | ||
123 | |||
124 | /** | ||
125 | * page_counter_limit - limit the number of pages allowed | ||
126 | * @counter: counter | ||
127 | * @limit: limit to set | ||
128 | * | ||
129 | * Returns 0 on success, -EBUSY if the current number of pages on the | ||
130 | * counter already exceeds the specified limit. | ||
131 | * | ||
132 | * The caller must serialize invocations on the same counter. | ||
133 | */ | ||
134 | int page_counter_limit(struct page_counter *counter, unsigned long limit) | ||
135 | { | ||
136 | for (;;) { | ||
137 | unsigned long old; | ||
138 | long count; | ||
139 | |||
140 | /* | ||
141 | * Update the limit while making sure that it's not | ||
142 | * below the concurrently-changing counter value. | ||
143 | * | ||
144 | * The xchg implies two full memory barriers before | ||
145 | * and after, so the read-swap-read is ordered and | ||
146 | * ensures coherency with page_counter_try_charge(): | ||
147 | * that function modifies the count before checking | ||
148 | * the limit, so if it sees the old limit, we see the | ||
149 | * modified counter and retry. | ||
150 | */ | ||
151 | count = atomic_long_read(&counter->count); | ||
152 | |||
153 | if (count > limit) | ||
154 | return -EBUSY; | ||
155 | |||
156 | old = xchg(&counter->limit, limit); | ||
157 | |||
158 | if (atomic_long_read(&counter->count) <= count) | ||
159 | return 0; | ||
160 | |||
161 | counter->limit = old; | ||
162 | cond_resched(); | ||
163 | } | ||
164 | } | ||
165 | |||
166 | /** | ||
167 | * page_counter_memparse - memparse() for page counter limits | ||
168 | * @buf: string to parse | ||
169 | * @nr_pages: returns the result in number of pages | ||
170 | * | ||
171 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | ||
172 | * limited to %PAGE_COUNTER_MAX. | ||
173 | */ | ||
174 | int page_counter_memparse(const char *buf, unsigned long *nr_pages) | ||
175 | { | ||
176 | char unlimited[] = "-1"; | ||
177 | char *end; | ||
178 | u64 bytes; | ||
179 | |||
180 | if (!strncmp(buf, unlimited, sizeof(unlimited))) { | ||
181 | *nr_pages = PAGE_COUNTER_MAX; | ||
182 | return 0; | ||
183 | } | ||
184 | |||
185 | bytes = memparse(buf, &end); | ||
186 | if (*end != '\0') | ||
187 | return -EINVAL; | ||
188 | |||
189 | *nr_pages = min(bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX); | ||
190 | |||
191 | return 0; | ||
192 | } | ||
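The new page_counter is the res_counter replacement that the memcontrol.c hunks above charge and uncharge against: one atomic_long_t per level, walked up the parent chain, no spinlock. A minimal usage sketch follows; page_counter_init() is declared in the accompanying header rather than in this file, so its exact form here is an assumption:

	struct page_counter parent, child;
	struct page_counter *fail;

	page_counter_init(&parent, NULL);	/* assumed helper from the header */
	page_counter_init(&child, &parent);
	page_counter_limit(&parent, 1024);	/* cap the whole hierarchy at 1024 pages */

	if (page_counter_try_charge(&child, 32, &fail))
		return -ENOMEM;			/* *fail is the level that hit its limit */

	/* ... memory is in use ... */

	page_counter_uncharge(&child, 32);	/* unwinds both child and parent */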
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c8778f7e208e..72f5ac381ab3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -68,7 +68,7 @@ out: | |||
68 | 68 | ||
69 | spin_unlock_irqrestore(&zone->lock, flags); | 69 | spin_unlock_irqrestore(&zone->lock, flags); |
70 | if (!ret) | 70 | if (!ret) |
71 | drain_all_pages(); | 71 | drain_all_pages(zone); |
72 | return ret; | 72 | return ret; |
73 | } | 73 | } |
74 | 74 | ||
@@ -1053,7 +1053,7 @@ void page_add_file_rmap(struct page *page) | |||
1053 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1053 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1054 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1054 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
1055 | } | 1055 | } |
1056 | mem_cgroup_end_page_stat(memcg, locked, flags); | 1056 | mem_cgroup_end_page_stat(memcg, &locked, &flags); |
1057 | } | 1057 | } |
1058 | 1058 | ||
1059 | static void page_remove_file_rmap(struct page *page) | 1059 | static void page_remove_file_rmap(struct page *page) |
@@ -1083,7 +1083,7 @@ static void page_remove_file_rmap(struct page *page) | |||
1083 | if (unlikely(PageMlocked(page))) | 1083 | if (unlikely(PageMlocked(page))) |
1084 | clear_page_mlock(page); | 1084 | clear_page_mlock(page); |
1085 | out: | 1085 | out: |
1086 | mem_cgroup_end_page_stat(memcg, locked, flags); | 1086 | mem_cgroup_end_page_stat(memcg, &locked, &flags); |
1087 | } | 1087 | } |
1088 | 1088 | ||
1089 | /** | 1089 | /** |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -2590,7 +2590,10 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2590 | * Be lazy and only check for valid flags here, keeping it out of the | 2590 | * Be lazy and only check for valid flags here, keeping it out of the |
2591 | * critical path in kmem_cache_alloc(). | 2591 | * critical path in kmem_cache_alloc(). |
2592 | */ | 2592 | */ |
2593 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 2593 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { |
2594 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | ||
2595 | BUG(); | ||
2596 | } | ||
2594 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 2597 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
2595 | 2598 | ||
2596 | /* Take the node list lock to change the colour_next on this node */ | 2599 | /* Take the node list lock to change the colour_next on this node */ |
@@ -3580,11 +3583,11 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) | |||
3580 | 3583 | ||
3581 | for_each_online_node(node) { | 3584 | for_each_online_node(node) { |
3582 | 3585 | ||
3583 | if (use_alien_caches) { | 3586 | if (use_alien_caches) { |
3584 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); | 3587 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); |
3585 | if (!new_alien) | 3588 | if (!new_alien) |
3586 | goto fail; | 3589 | goto fail; |
3587 | } | 3590 | } |
3588 | 3591 | ||
3589 | new_shared = NULL; | 3592 | new_shared = NULL; |
3590 | if (cachep->shared) { | 3593 | if (cachep->shared) { |
@@ -4043,12 +4046,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4043 | 4046 | ||
4044 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 4047 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
4045 | 4048 | ||
4046 | static void *leaks_start(struct seq_file *m, loff_t *pos) | ||
4047 | { | ||
4048 | mutex_lock(&slab_mutex); | ||
4049 | return seq_list_start(&slab_caches, *pos); | ||
4050 | } | ||
4051 | |||
4052 | static inline int add_caller(unsigned long *n, unsigned long v) | 4049 | static inline int add_caller(unsigned long *n, unsigned long v) |
4053 | { | 4050 | { |
4054 | unsigned long *p; | 4051 | unsigned long *p; |
@@ -4170,7 +4167,7 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4170 | } | 4167 | } |
4171 | 4168 | ||
4172 | static const struct seq_operations slabstats_op = { | 4169 | static const struct seq_operations slabstats_op = { |
4173 | .start = leaks_start, | 4170 | .start = slab_start, |
4174 | .next = slab_next, | 4171 | .next = slab_next, |
4175 | .stop = slab_stop, | 4172 | .stop = slab_stop, |
4176 | .show = leaks_show, | 4173 | .show = leaks_show, |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -209,15 +209,15 @@ cache_from_memcg_idx(struct kmem_cache *s, int idx) | |||
209 | 209 | ||
210 | rcu_read_lock(); | 210 | rcu_read_lock(); |
211 | params = rcu_dereference(s->memcg_params); | 211 | params = rcu_dereference(s->memcg_params); |
212 | cachep = params->memcg_caches[idx]; | ||
213 | rcu_read_unlock(); | ||
214 | 212 | ||
215 | /* | 213 | /* |
216 | * Make sure we will access the up-to-date value. The code updating | 214 | * Make sure we will access the up-to-date value. The code updating |
217 | * memcg_caches issues a write barrier to match this (see | 215 | * memcg_caches issues a write barrier to match this (see |
218 | * memcg_register_cache()). | 216 | * memcg_register_cache()). |
219 | */ | 217 | */ |
220 | smp_read_barrier_depends(); | 218 | cachep = lockless_dereference(params->memcg_caches[idx]); |
219 | rcu_read_unlock(); | ||
220 | |||
221 | return cachep; | 221 | return cachep; |
222 | } | 222 | } |
223 | 223 | ||
@@ -357,7 +357,9 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
357 | 357 | ||
358 | #endif | 358 | #endif |
359 | 359 | ||
360 | void *slab_start(struct seq_file *m, loff_t *pos); | ||
360 | void *slab_next(struct seq_file *m, void *p, loff_t *pos); | 361 | void *slab_next(struct seq_file *m, void *p, loff_t *pos); |
361 | void slab_stop(struct seq_file *m, void *p); | 362 | void slab_stop(struct seq_file *m, void *p); |
363 | int memcg_slab_show(struct seq_file *m, void *p); | ||
362 | 364 | ||
363 | #endif /* MM_SLAB_H */ | 365 | #endif /* MM_SLAB_H */ |
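
The cache_from_memcg_idx() hunk above also repositions the dependent-load barrier: instead of dereferencing memcg_caches[idx] and only afterwards issuing smp_read_barrier_depends(), the new code folds the barrier into the load via lockless_dereference(), so the ordering sits between reading the pointer and any use of it. A rough userspace analogue of this publish/consume pairing, using a C11 release store and acquire load, is sketched below; struct cache, g_caches, publish_cache and lookup_cache_size are names invented for the illustration, not kernel interfaces.

/*
 * Userspace model of the publish/consume pairing referred to by the
 * "write barrier ... matches this" comment in the hunk above.
 */
#include <stdatomic.h>
#include <stddef.h>

struct cache { int object_size; };

#define NR_IDX 16
static _Atomic(struct cache *) g_caches[NR_IDX];

/* Writer: initialize the object fully, then publish it with release. */
static void publish_cache(int idx, struct cache *c)
{
	atomic_store_explicit(&g_caches[idx], c, memory_order_release);
}

/*
 * Reader: an acquire load guarantees that the dereference below sees the
 * fully initialized object -- the role lockless_dereference() (or the
 * older smp_read_barrier_depends()) plays in the kernel code.
 */
static int lookup_cache_size(int idx)
{
	struct cache *c = atomic_load_explicit(&g_caches[idx],
					       memory_order_acquire);
	return c ? c->object_size : -1;
}

int main(void)
{
	static struct cache kmalloc64 = { .object_size = 64 };

	publish_cache(3, &kmalloc64);
	return lookup_cache_size(3) == 64 ? 0 : 1;
}
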
diff --git a/mm/slab_common.c b/mm/slab_common.c index dcdab81bd240..e03dd6f2a272 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -240,7 +240,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, | |||
240 | size = ALIGN(size, align); | 240 | size = ALIGN(size, align); |
241 | flags = kmem_cache_flags(size, flags, name, NULL); | 241 | flags = kmem_cache_flags(size, flags, name, NULL); |
242 | 242 | ||
243 | list_for_each_entry(s, &slab_caches, list) { | 243 | list_for_each_entry_reverse(s, &slab_caches, list) { |
244 | if (slab_unmergeable(s)) | 244 | if (slab_unmergeable(s)) |
245 | continue; | 245 | continue; |
246 | 246 | ||
@@ -811,7 +811,7 @@ EXPORT_SYMBOL(kmalloc_order_trace); | |||
811 | #define SLABINFO_RIGHTS S_IRUSR | 811 | #define SLABINFO_RIGHTS S_IRUSR |
812 | #endif | 812 | #endif |
813 | 813 | ||
814 | void print_slabinfo_header(struct seq_file *m) | 814 | static void print_slabinfo_header(struct seq_file *m) |
815 | { | 815 | { |
816 | /* | 816 | /* |
817 | * Output format version, so at least we can change it | 817 | * Output format version, so at least we can change it |
@@ -834,14 +834,9 @@ void print_slabinfo_header(struct seq_file *m) | |||
834 | seq_putc(m, '\n'); | 834 | seq_putc(m, '\n'); |
835 | } | 835 | } |
836 | 836 | ||
837 | static void *s_start(struct seq_file *m, loff_t *pos) | 837 | void *slab_start(struct seq_file *m, loff_t *pos) |
838 | { | 838 | { |
839 | loff_t n = *pos; | ||
840 | |||
841 | mutex_lock(&slab_mutex); | 839 | mutex_lock(&slab_mutex); |
842 | if (!n) | ||
843 | print_slabinfo_header(m); | ||
844 | |||
845 | return seq_list_start(&slab_caches, *pos); | 840 | return seq_list_start(&slab_caches, *pos); |
846 | } | 841 | } |
847 | 842 | ||
@@ -881,7 +876,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | |||
881 | } | 876 | } |
882 | } | 877 | } |
883 | 878 | ||
884 | int cache_show(struct kmem_cache *s, struct seq_file *m) | 879 | static void cache_show(struct kmem_cache *s, struct seq_file *m) |
885 | { | 880 | { |
886 | struct slabinfo sinfo; | 881 | struct slabinfo sinfo; |
887 | 882 | ||
@@ -900,17 +895,32 @@ int cache_show(struct kmem_cache *s, struct seq_file *m) | |||
900 | sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); | 895 | sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail); |
901 | slabinfo_show_stats(m, s); | 896 | slabinfo_show_stats(m, s); |
902 | seq_putc(m, '\n'); | 897 | seq_putc(m, '\n'); |
898 | } | ||
899 | |||
900 | static int slab_show(struct seq_file *m, void *p) | ||
901 | { | ||
902 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | ||
903 | |||
904 | if (p == slab_caches.next) | ||
905 | print_slabinfo_header(m); | ||
906 | if (is_root_cache(s)) | ||
907 | cache_show(s, m); | ||
903 | return 0; | 908 | return 0; |
904 | } | 909 | } |
905 | 910 | ||
906 | static int s_show(struct seq_file *m, void *p) | 911 | #ifdef CONFIG_MEMCG_KMEM |
912 | int memcg_slab_show(struct seq_file *m, void *p) | ||
907 | { | 913 | { |
908 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | 914 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); |
915 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
909 | 916 | ||
910 | if (!is_root_cache(s)) | 917 | if (p == slab_caches.next) |
911 | return 0; | 918 | print_slabinfo_header(m); |
912 | return cache_show(s, m); | 919 | if (!is_root_cache(s) && s->memcg_params->memcg == memcg) |
920 | cache_show(s, m); | ||
921 | return 0; | ||
913 | } | 922 | } |
923 | #endif | ||
914 | 924 | ||
915 | /* | 925 | /* |
916 | * slabinfo_op - iterator that generates /proc/slabinfo | 926 | * slabinfo_op - iterator that generates /proc/slabinfo |
@@ -926,10 +936,10 @@ static int s_show(struct seq_file *m, void *p) | |||
926 | * + further values on SMP and with statistics enabled | 936 | * + further values on SMP and with statistics enabled |
927 | */ | 937 | */ |
928 | static const struct seq_operations slabinfo_op = { | 938 | static const struct seq_operations slabinfo_op = { |
929 | .start = s_start, | 939 | .start = slab_start, |
930 | .next = slab_next, | 940 | .next = slab_next, |
931 | .stop = slab_stop, | 941 | .stop = slab_stop, |
932 | .show = s_show, | 942 | .show = slab_show, |
933 | }; | 943 | }; |
934 | 944 | ||
935 | static int slabinfo_open(struct inode *inode, struct file *file) | 945 | static int slabinfo_open(struct inode *inode, struct file *file) |
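
The slab_common.c change above lets one slab_start()/slab_next()/slab_stop() iterator back both /proc/slabinfo and the per-memcg file, moving the header printing into the individual ->show() callbacks and keying it on whether the current element is the first list entry. A minimal stand-alone sketch of that "print the header from show()" pattern follows; struct item, print_header and show are invented names for the illustration, not kernel code.

/* Emit the header only when showing the first element of the list,
 * mirroring the "p == slab_caches.next" test in the hunk above. */
#include <stdio.h>

struct item { const char *name; struct item *next; };

static struct item c = { "dentry", NULL };
static struct item b = { "inode_cache", &c };
static struct item head = { "kmalloc-64", &b };	/* first list entry */

static void print_header(void)
{
	puts("slabinfo - version: 2.x");
}

static int show(struct item *p)
{
	if (p == &head)		/* first entry: print the header once */
		print_header();
	printf("%s ...\n", p->name);
	return 0;
}

int main(void)
{
	for (struct item *p = &head; p; p = p->next)
		show(p);
	return 0;
}
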
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -849,12 +849,12 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
849 | maxobj = order_objects(compound_order(page), s->size, s->reserved); | 849 | maxobj = order_objects(compound_order(page), s->size, s->reserved); |
850 | if (page->objects > maxobj) { | 850 | if (page->objects > maxobj) { |
851 | slab_err(s, page, "objects %u > max %u", | 851 | slab_err(s, page, "objects %u > max %u", |
852 | s->name, page->objects, maxobj); | 852 | page->objects, maxobj); |
853 | return 0; | 853 | return 0; |
854 | } | 854 | } |
855 | if (page->inuse > page->objects) { | 855 | if (page->inuse > page->objects) { |
856 | slab_err(s, page, "inuse %u > max %u", | 856 | slab_err(s, page, "inuse %u > max %u", |
857 | s->name, page->inuse, page->objects); | 857 | page->inuse, page->objects); |
858 | return 0; | 858 | return 0; |
859 | } | 859 | } |
860 | /* Slab_pad_check fixes things up after itself */ | 860 | /* Slab_pad_check fixes things up after itself */ |
@@ -871,7 +871,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
871 | int nr = 0; | 871 | int nr = 0; |
872 | void *fp; | 872 | void *fp; |
873 | void *object = NULL; | 873 | void *object = NULL; |
874 | unsigned long max_objects; | 874 | int max_objects; |
875 | 875 | ||
876 | fp = page->freelist; | 876 | fp = page->freelist; |
877 | while (fp && nr <= page->objects) { | 877 | while (fp && nr <= page->objects) { |
@@ -1377,7 +1377,10 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1377 | int order; | 1377 | int order; |
1378 | int idx; | 1378 | int idx; |
1379 | 1379 | ||
1380 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1380 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { |
1381 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | ||
1382 | BUG(); | ||
1383 | } | ||
1381 | 1384 | ||
1382 | page = allocate_slab(s, | 1385 | page = allocate_slab(s, |
1383 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | 1386 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); |
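
Both the slab.c and slub.c hunks above replace a bare BUG_ON(flags & GFP_SLAB_BUG_MASK) with a print of the offending bits followed by BUG(), so the resulting oops records which GFP flags were invalid. A trivial userspace sketch of the same report-then-die pattern; BAD_FLAGS_MASK and check_flags are invented stand-ins, not kernel symbols.

#include <stdio.h>
#include <stdlib.h>

#define BAD_FLAGS_MASK 0xf0000000u	/* stand-in for GFP_SLAB_BUG_MASK */

static void check_flags(unsigned int flags)
{
	if (flags & BAD_FLAGS_MASK) {
		/* Report the offending bits first so the crash is debuggable. */
		fprintf(stderr, "gfp: %x\n", flags & BAD_FLAGS_MASK);
		abort();	/* stands in for BUG() */
	}
}

int main(void)
{
	check_flags(0x000000d0u);	/* valid: passes silently */
	check_flags(0x800000d0u);	/* invalid bit: reported, then abort() */
	return 0;
}
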
@@ -2554,7 +2557,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2554 | 2557 | ||
2555 | } else { /* Needs to be taken off a list */ | 2558 | } else { /* Needs to be taken off a list */ |
2556 | 2559 | ||
2557 | n = get_node(s, page_to_nid(page)); | 2560 | n = get_node(s, page_to_nid(page)); |
2558 | /* | 2561 | /* |
2559 | * Speculatively acquire the list_lock. | 2562 | * Speculatively acquire the list_lock. |
2560 | * If the cmpxchg does not succeed then we may | 2563 | * If the cmpxchg does not succeed then we may |
@@ -2587,10 +2590,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2587 | * The list lock was not taken therefore no list | 2590 | * The list lock was not taken therefore no list |
2588 | * activity can be necessary. | 2591 | * activity can be necessary. |
2589 | */ | 2592 | */ |
2590 | if (was_frozen) | 2593 | if (was_frozen) |
2591 | stat(s, FREE_FROZEN); | 2594 | stat(s, FREE_FROZEN); |
2592 | return; | 2595 | return; |
2593 | } | 2596 | } |
2594 | 2597 | ||
2595 | if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) | 2598 | if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) |
2596 | goto slab_empty; | 2599 | goto slab_empty; |
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c new file mode 100644 index 000000000000..b5f7f24b8dd1 --- /dev/null +++ b/mm/swap_cgroup.c | |||
@@ -0,0 +1,208 @@ | |||
1 | #include <linux/swap_cgroup.h> | ||
2 | #include <linux/vmalloc.h> | ||
3 | #include <linux/mm.h> | ||
4 | |||
5 | #include <linux/swapops.h> /* depends on mm.h include */ | ||
6 | |||
7 | static DEFINE_MUTEX(swap_cgroup_mutex); | ||
8 | struct swap_cgroup_ctrl { | ||
9 | struct page **map; | ||
10 | unsigned long length; | ||
11 | spinlock_t lock; | ||
12 | }; | ||
13 | |||
14 | static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | ||
15 | |||
16 | struct swap_cgroup { | ||
17 | unsigned short id; | ||
18 | }; | ||
19 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | ||
20 | |||
21 | /* | ||
22 | * SwapCgroup implements "lookup" and "exchange" operations. | ||
23 | * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge | ||
24 | * against SwapCache. At swap_free(), this is accessed directly from swap. | ||
25 | * | ||
26 | * This means, | ||
27 | * - we have no race in "exchange" when we're accessed via SwapCache because | ||
28 | * SwapCache(and its swp_entry) is under lock. | ||
29 | * - When called via swap_free(), there is no user of this entry and no race. | ||
30 | * Then, we don't need lock around "exchange". | ||
31 | * | ||
32 | * TODO: we can push these buffers out to HIGHMEM. | ||
33 | */ | ||
34 | |||
35 | /* | ||
36 | * allocate buffer for swap_cgroup. | ||
37 | */ | ||
38 | static int swap_cgroup_prepare(int type) | ||
39 | { | ||
40 | struct page *page; | ||
41 | struct swap_cgroup_ctrl *ctrl; | ||
42 | unsigned long idx, max; | ||
43 | |||
44 | ctrl = &swap_cgroup_ctrl[type]; | ||
45 | |||
46 | for (idx = 0; idx < ctrl->length; idx++) { | ||
47 | page = alloc_page(GFP_KERNEL | __GFP_ZERO); | ||
48 | if (!page) | ||
49 | goto not_enough_page; | ||
50 | ctrl->map[idx] = page; | ||
51 | } | ||
52 | return 0; | ||
53 | not_enough_page: | ||
54 | max = idx; | ||
55 | for (idx = 0; idx < max; idx++) | ||
56 | __free_page(ctrl->map[idx]); | ||
57 | |||
58 | return -ENOMEM; | ||
59 | } | ||
60 | |||
61 | static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | ||
62 | struct swap_cgroup_ctrl **ctrlp) | ||
63 | { | ||
64 | pgoff_t offset = swp_offset(ent); | ||
65 | struct swap_cgroup_ctrl *ctrl; | ||
66 | struct page *mappage; | ||
67 | struct swap_cgroup *sc; | ||
68 | |||
69 | ctrl = &swap_cgroup_ctrl[swp_type(ent)]; | ||
70 | if (ctrlp) | ||
71 | *ctrlp = ctrl; | ||
72 | |||
73 | mappage = ctrl->map[offset / SC_PER_PAGE]; | ||
74 | sc = page_address(mappage); | ||
75 | return sc + offset % SC_PER_PAGE; | ||
76 | } | ||
77 | |||
78 | /** | ||
79 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
80 | * @ent: swap entry to be cmpxchged | ||
81 | * @old: old id | ||
82 | * @new: new id | ||
83 | * | ||
84 | * Returns old id at success, 0 at failure. | ||
85 | * (There is no mem_cgroup using 0 as its id) | ||
86 | */ | ||
87 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
88 | unsigned short old, unsigned short new) | ||
89 | { | ||
90 | struct swap_cgroup_ctrl *ctrl; | ||
91 | struct swap_cgroup *sc; | ||
92 | unsigned long flags; | ||
93 | unsigned short retval; | ||
94 | |||
95 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
96 | |||
97 | spin_lock_irqsave(&ctrl->lock, flags); | ||
98 | retval = sc->id; | ||
99 | if (retval == old) | ||
100 | sc->id = new; | ||
101 | else | ||
102 | retval = 0; | ||
103 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
104 | return retval; | ||
105 | } | ||
106 | |||
107 | /** | ||
108 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | ||
109 | * @ent: swap entry to be recorded into | ||
110 | * @id: mem_cgroup to be recorded | ||
111 | * | ||
112 | * Returns old value at success, 0 at failure. | ||
113 | * (Of course, old value can be 0.) | ||
114 | */ | ||
115 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | ||
116 | { | ||
117 | struct swap_cgroup_ctrl *ctrl; | ||
118 | struct swap_cgroup *sc; | ||
119 | unsigned short old; | ||
120 | unsigned long flags; | ||
121 | |||
122 | sc = lookup_swap_cgroup(ent, &ctrl); | ||
123 | |||
124 | spin_lock_irqsave(&ctrl->lock, flags); | ||
125 | old = sc->id; | ||
126 | sc->id = id; | ||
127 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
128 | |||
129 | return old; | ||
130 | } | ||
131 | |||
132 | /** | ||
133 | * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry | ||
134 | * @ent: swap entry to be looked up. | ||
135 | * | ||
136 | * Returns ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) | ||
137 | */ | ||
138 | unsigned short lookup_swap_cgroup_id(swp_entry_t ent) | ||
139 | { | ||
140 | return lookup_swap_cgroup(ent, NULL)->id; | ||
141 | } | ||
142 | |||
143 | int swap_cgroup_swapon(int type, unsigned long max_pages) | ||
144 | { | ||
145 | void *array; | ||
146 | unsigned long array_size; | ||
147 | unsigned long length; | ||
148 | struct swap_cgroup_ctrl *ctrl; | ||
149 | |||
150 | if (!do_swap_account) | ||
151 | return 0; | ||
152 | |||
153 | length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); | ||
154 | array_size = length * sizeof(void *); | ||
155 | |||
156 | array = vzalloc(array_size); | ||
157 | if (!array) | ||
158 | goto nomem; | ||
159 | |||
160 | ctrl = &swap_cgroup_ctrl[type]; | ||
161 | mutex_lock(&swap_cgroup_mutex); | ||
162 | ctrl->length = length; | ||
163 | ctrl->map = array; | ||
164 | spin_lock_init(&ctrl->lock); | ||
165 | if (swap_cgroup_prepare(type)) { | ||
166 | /* memory shortage */ | ||
167 | ctrl->map = NULL; | ||
168 | ctrl->length = 0; | ||
169 | mutex_unlock(&swap_cgroup_mutex); | ||
170 | vfree(array); | ||
171 | goto nomem; | ||
172 | } | ||
173 | mutex_unlock(&swap_cgroup_mutex); | ||
174 | |||
175 | return 0; | ||
176 | nomem: | ||
177 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | ||
178 | printk(KERN_INFO | ||
179 | "swap_cgroup can be disabled by swapaccount=0 boot option\n"); | ||
180 | return -ENOMEM; | ||
181 | } | ||
182 | |||
183 | void swap_cgroup_swapoff(int type) | ||
184 | { | ||
185 | struct page **map; | ||
186 | unsigned long i, length; | ||
187 | struct swap_cgroup_ctrl *ctrl; | ||
188 | |||
189 | if (!do_swap_account) | ||
190 | return; | ||
191 | |||
192 | mutex_lock(&swap_cgroup_mutex); | ||
193 | ctrl = &swap_cgroup_ctrl[type]; | ||
194 | map = ctrl->map; | ||
195 | length = ctrl->length; | ||
196 | ctrl->map = NULL; | ||
197 | ctrl->length = 0; | ||
198 | mutex_unlock(&swap_cgroup_mutex); | ||
199 | |||
200 | if (map) { | ||
201 | for (i = 0; i < length; i++) { | ||
202 | struct page *page = map[i]; | ||
203 | if (page) | ||
204 | __free_page(page); | ||
205 | } | ||
206 | vfree(map); | ||
207 | } | ||
208 | } | ||
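
The new mm/swap_cgroup.c above keeps one unsigned short memcg id per swap slot in a vzalloc'ed array of pages, addressing a slot as offset / SC_PER_PAGE and offset % SC_PER_PAGE. The sketch below models that two-level lookup in plain userspace C, with the per-controller spinlock omitted; chunk_table, table_init, id_slot and id_record are names invented for the example.

/*
 * Userspace model of the swap_cgroup table: fixed-size chunks ("pages")
 * of unsigned short ids, indexed by offset / per-chunk and offset % per-chunk.
 */
#include <stdio.h>
#include <stdlib.h>

#define CHUNK_BYTES	4096				/* stands in for PAGE_SIZE */
#define IDS_PER_CHUNK	(CHUNK_BYTES / sizeof(unsigned short))

struct chunk_table {
	unsigned short **chunks;
	unsigned long nr_chunks;
};

static int table_init(struct chunk_table *t, unsigned long max_entries)
{
	t->nr_chunks = (max_entries + IDS_PER_CHUNK - 1) / IDS_PER_CHUNK;
	t->chunks = calloc(t->nr_chunks, sizeof(*t->chunks));
	if (!t->chunks)
		return -1;
	for (unsigned long i = 0; i < t->nr_chunks; i++) {
		t->chunks[i] = calloc(IDS_PER_CHUNK, sizeof(unsigned short));
		if (!t->chunks[i])
			return -1;	/* a real implementation would unwind here */
	}
	return 0;
}

static unsigned short *id_slot(struct chunk_table *t, unsigned long offset)
{
	return &t->chunks[offset / IDS_PER_CHUNK][offset % IDS_PER_CHUNK];
}

/* Record an id and return the previous one (done under ctrl->lock in the kernel). */
static unsigned short id_record(struct chunk_table *t, unsigned long offset,
				unsigned short id)
{
	unsigned short *slot = id_slot(t, offset), old = *slot;

	*slot = id;
	return old;
}

int main(void)
{
	struct chunk_table t;

	if (table_init(&t, 1UL << 20))
		return 1;
	id_record(&t, 123456, 7);
	printf("id at 123456 = %u\n", (unsigned)*id_slot(&t, 123456));
	return 0;
}
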
diff --git a/mm/swap_state.c b/mm/swap_state.c index 154444918685..9711342987a0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -17,7 +17,6 @@ | |||
17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
20 | #include <linux/page_cgroup.h> | ||
21 | 20 | ||
22 | #include <asm/pgtable.h> | 21 | #include <asm/pgtable.h> |
23 | 22 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 8798b2e0ac59..63f55ccb9b26 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -38,7 +38,7 @@ | |||
38 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
39 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
40 | #include <linux/swapops.h> | 40 | #include <linux/swapops.h> |
41 | #include <linux/page_cgroup.h> | 41 | #include <linux/swap_cgroup.h> |
42 | 42 | ||
43 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | 43 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, |
44 | unsigned char); | 44 | unsigned char); |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 90520af7f186..8a18196fcdff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -463,8 +463,7 @@ overflow: | |||
463 | goto retry; | 463 | goto retry; |
464 | } | 464 | } |
465 | if (printk_ratelimit()) | 465 | if (printk_ratelimit()) |
466 | printk(KERN_WARNING | 466 | pr_warn("vmap allocation for size %lu failed: " |
467 | "vmap allocation for size %lu failed: " | ||
468 | "use vmalloc=<size> to increase size.\n", size); | 467 | "use vmalloc=<size> to increase size.\n", size); |
469 | kfree(va); | 468 | kfree(va); |
470 | return ERR_PTR(-EBUSY); | 469 | return ERR_PTR(-EBUSY); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index dcb47074ae03..4636d9e822c1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -260,8 +260,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, | |||
260 | do_div(delta, lru_pages + 1); | 260 | do_div(delta, lru_pages + 1); |
261 | total_scan += delta; | 261 | total_scan += delta; |
262 | if (total_scan < 0) { | 262 | if (total_scan < 0) { |
263 | printk(KERN_ERR | 263 | pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", |
264 | "shrink_slab: %pF negative objects to delete nr=%ld\n", | ||
265 | shrinker->scan_objects, total_scan); | 264 | shrinker->scan_objects, total_scan); |
266 | total_scan = freeable; | 265 | total_scan = freeable; |
267 | } | 266 | } |
@@ -875,7 +874,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
875 | * end of the LRU a second time. | 874 | * end of the LRU a second time. |
876 | */ | 875 | */ |
877 | mapping = page_mapping(page); | 876 | mapping = page_mapping(page); |
878 | if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || | 877 | if (((dirty || writeback) && mapping && |
878 | bdi_write_congested(mapping->backing_dev_info)) || | ||
879 | (writeback && PageReclaim(page))) | 879 | (writeback && PageReclaim(page))) |
880 | nr_congested++; | 880 | nr_congested++; |
881 | 881 | ||
@@ -2249,7 +2249,7 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2249 | return true; | 2249 | return true; |
2250 | 2250 | ||
2251 | /* If compaction would go ahead or the allocation would succeed, stop */ | 2251 | /* If compaction would go ahead or the allocation would succeed, stop */ |
2252 | switch (compaction_suitable(zone, sc->order)) { | 2252 | switch (compaction_suitable(zone, sc->order, 0, 0)) { |
2253 | case COMPACT_PARTIAL: | 2253 | case COMPACT_PARTIAL: |
2254 | case COMPACT_CONTINUE: | 2254 | case COMPACT_CONTINUE: |
2255 | return false; | 2255 | return false; |
@@ -2346,7 +2346,7 @@ static inline bool compaction_ready(struct zone *zone, int order) | |||
2346 | * If compaction is not ready to start and allocation is not likely | 2346 | * If compaction is not ready to start and allocation is not likely |
2347 | * to succeed without it, then keep reclaiming. | 2347 | * to succeed without it, then keep reclaiming. |
2348 | */ | 2348 | */ |
2349 | if (compaction_suitable(zone, order) == COMPACT_SKIPPED) | 2349 | if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) |
2350 | return false; | 2350 | return false; |
2351 | 2351 | ||
2352 | return watermark_ok; | 2352 | return watermark_ok; |
@@ -2824,8 +2824,8 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2824 | balance_gap, classzone_idx, 0)) | 2824 | balance_gap, classzone_idx, 0)) |
2825 | return false; | 2825 | return false; |
2826 | 2826 | ||
2827 | if (IS_ENABLED(CONFIG_COMPACTION) && order && | 2827 | if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, |
2828 | compaction_suitable(zone, order) == COMPACT_SKIPPED) | 2828 | order, 0, classzone_idx) == COMPACT_SKIPPED) |
2829 | return false; | 2829 | return false; |
2830 | 2830 | ||
2831 | return true; | 2831 | return true; |
@@ -2952,8 +2952,8 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2952 | * from memory. Do not reclaim more than needed for compaction. | 2952 | * from memory. Do not reclaim more than needed for compaction. |
2953 | */ | 2953 | */ |
2954 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && | 2954 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && |
2955 | compaction_suitable(zone, sc->order) != | 2955 | compaction_suitable(zone, sc->order, 0, classzone_idx) |
2956 | COMPACT_SKIPPED) | 2956 | != COMPACT_SKIPPED) |
2957 | testorder = 0; | 2957 | testorder = 0; |
2958 | 2958 | ||
2959 | /* | 2959 | /* |