Diffstat (limited to 'mm')
-rw-r--r--  mm/compaction.c       76
-rw-r--r--  mm/huge_memory.c       5
-rw-r--r--  mm/hugetlb.c           8
-rw-r--r--  mm/ksm.c               6
-rw-r--r--  mm/memcontrol.c      222
-rw-r--r--  mm/memory-failure.c   25
-rw-r--r--  mm/memory.c           33
-rw-r--r--  mm/memory_hotplug.c   10
-rw-r--r--  mm/migrate.c           2
-rw-r--r--  mm/mmap.c             12
-rw-r--r--  mm/nommu.c             9
-rw-r--r--  mm/page_cgroup.c      71
-rw-r--r--  mm/rmap.c            111
-rw-r--r--  mm/shmem.c            74
-rw-r--r--  mm/slab.c              9
-rw-r--r--  mm/slub.c             12
-rw-r--r--  mm/swapfile.c          2
-rw-r--r--  mm/thrash.c          105
-rw-r--r--  mm/truncate.c         29
-rw-r--r--  mm/vmscan.c          106
20 files changed, 634 insertions, 293 deletions
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9..6cc604bd564 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone, | |||
144 | int nr_freepages = cc->nr_freepages; | 144 | int nr_freepages = cc->nr_freepages; |
145 | struct list_head *freelist = &cc->freepages; | 145 | struct list_head *freelist = &cc->freepages; |
146 | 146 | ||
147 | /* | ||
148 | * Initialise the free scanner. The starting point is where we last | ||
149 | * scanned from (or the end of the zone if starting). The low point | ||
150 | * is the end of the pageblock the migration scanner is using. | ||
151 | */ | ||
147 | pfn = cc->free_pfn; | 152 | pfn = cc->free_pfn; |
148 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | 153 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; |
149 | high_pfn = low_pfn; | 154 | |
155 | /* | ||
156 | * Take care that if the migration scanner is at the end of the zone | ||
157 | * that the free scanner does not accidentally move to the next zone | ||
158 | * in the next isolation cycle. | ||
159 | */ | ||
160 | high_pfn = min(low_pfn, pfn); | ||
150 | 161 | ||
151 | /* | 162 | /* |
152 | * Isolate free pages until enough are available to migrate the | 163 | * Isolate free pages until enough are available to migrate the |
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone) | |||
240 | return isolated > (inactive + active) / 2; | 251 | return isolated > (inactive + active) / 2; |
241 | } | 252 | } |
242 | 253 | ||
254 | /* possible outcome of isolate_migratepages */ | ||
255 | typedef enum { | ||
256 | ISOLATE_ABORT, /* Abort compaction now */ | ||
257 | ISOLATE_NONE, /* No pages isolated, continue scanning */ | ||
258 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ | ||
259 | } isolate_migrate_t; | ||
260 | |||
243 | /* | 261 | /* |
244 | * Isolate all pages that can be migrated from the block pointed to by | 262 | * Isolate all pages that can be migrated from the block pointed to by |
245 | * the migrate scanner within compact_control. | 263 | * the migrate scanner within compact_control. |
246 | */ | 264 | */ |
247 | static unsigned long isolate_migratepages(struct zone *zone, | 265 | static isolate_migrate_t isolate_migratepages(struct zone *zone, |
248 | struct compact_control *cc) | 266 | struct compact_control *cc) |
249 | { | 267 | { |
250 | unsigned long low_pfn, end_pfn; | 268 | unsigned long low_pfn, end_pfn; |
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
261 | /* Do not cross the free scanner or scan within a memory hole */ | 279 | /* Do not cross the free scanner or scan within a memory hole */ |
262 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 280 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { |
263 | cc->migrate_pfn = end_pfn; | 281 | cc->migrate_pfn = end_pfn; |
264 | return 0; | 282 | return ISOLATE_NONE; |
265 | } | 283 | } |
266 | 284 | ||
267 | /* | 285 | /* |
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
270 | * delay for some time until fewer pages are isolated | 288 | * delay for some time until fewer pages are isolated |
271 | */ | 289 | */ |
272 | while (unlikely(too_many_isolated(zone))) { | 290 | while (unlikely(too_many_isolated(zone))) { |
291 | /* async migration should just abort */ | ||
292 | if (!cc->sync) | ||
293 | return ISOLATE_ABORT; | ||
294 | |||
273 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 295 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
274 | 296 | ||
275 | if (fatal_signal_pending(current)) | 297 | if (fatal_signal_pending(current)) |
276 | return 0; | 298 | return ISOLATE_ABORT; |
277 | } | 299 | } |
278 | 300 | ||
279 | /* Time to isolate some pages for migration */ | 301 | /* Time to isolate some pages for migration */ |
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone, | |||
358 | 380 | ||
359 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 381 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
360 | 382 | ||
361 | return cc->nr_migratepages; | 383 | return ISOLATE_SUCCESS; |
362 | } | 384 | } |
363 | 385 | ||
364 | /* | 386 | /* |
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone, | |||
420 | if (cc->free_pfn <= cc->migrate_pfn) | 442 | if (cc->free_pfn <= cc->migrate_pfn) |
421 | return COMPACT_COMPLETE; | 443 | return COMPACT_COMPLETE; |
422 | 444 | ||
423 | /* Compaction run is not finished if the watermark is not met */ | ||
424 | watermark = low_wmark_pages(zone); | ||
425 | watermark += (1 << cc->order); | ||
426 | |||
427 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | ||
428 | return COMPACT_CONTINUE; | ||
429 | |||
430 | /* | 445 | /* |
431 | * order == -1 is expected when compacting via | 446 | * order == -1 is expected when compacting via |
432 | * /proc/sys/vm/compact_memory | 447 | * /proc/sys/vm/compact_memory |
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone, | |||
434 | if (cc->order == -1) | 449 | if (cc->order == -1) |
435 | return COMPACT_CONTINUE; | 450 | return COMPACT_CONTINUE; |
436 | 451 | ||
452 | /* Compaction run is not finished if the watermark is not met */ | ||
453 | watermark = low_wmark_pages(zone); | ||
454 | watermark += (1 << cc->order); | ||
455 | |||
456 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | ||
457 | return COMPACT_CONTINUE; | ||
458 | |||
437 | /* Direct compactor: Is a suitable page free? */ | 459 | /* Direct compactor: Is a suitable page free? */ |
438 | for (order = cc->order; order < MAX_ORDER; order++) { | 460 | for (order = cc->order; order < MAX_ORDER; order++) { |
439 | /* Job done if page is free of the right migratetype */ | 461 | /* Job done if page is free of the right migratetype */ |
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
461 | unsigned long watermark; | 483 | unsigned long watermark; |
462 | 484 | ||
463 | /* | 485 | /* |
486 | * order == -1 is expected when compacting via | ||
487 | * /proc/sys/vm/compact_memory | ||
488 | */ | ||
489 | if (order == -1) | ||
490 | return COMPACT_CONTINUE; | ||
491 | |||
492 | /* | ||
464 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | 493 | * Watermarks for order-0 must be met for compaction. Note the 2UL. |
465 | * This is because during migration, copies of pages need to be | 494 | * This is because during migration, copies of pages need to be |
466 | * allocated and for a short time, the footprint is higher | 495 | * allocated and for a short time, the footprint is higher |
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
470 | return COMPACT_SKIPPED; | 499 | return COMPACT_SKIPPED; |
471 | 500 | ||
472 | /* | 501 | /* |
473 | * order == -1 is expected when compacting via | ||
474 | * /proc/sys/vm/compact_memory | ||
475 | */ | ||
476 | if (order == -1) | ||
477 | return COMPACT_CONTINUE; | ||
478 | |||
479 | /* | ||
480 | * fragmentation index determines if allocation failures are due to | 502 | * fragmentation index determines if allocation failures are due to |
481 | * low memory or external fragmentation | 503 | * low memory or external fragmentation |
482 | * | 504 | * |
483 | * index of -1 implies allocations might succeed dependingon watermarks | 505 | * index of -1000 implies allocations might succeed depending on |
506 | * watermarks | ||
484 | * index towards 0 implies failure is due to lack of memory | 507 | * index towards 0 implies failure is due to lack of memory |
485 | * index towards 1000 implies failure is due to fragmentation | 508 | * index towards 1000 implies failure is due to fragmentation |
486 | * | 509 | * |
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
490 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 513 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
491 | return COMPACT_SKIPPED; | 514 | return COMPACT_SKIPPED; |
492 | 515 | ||
493 | if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) | 516 | if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, |
517 | 0, 0)) | ||
494 | return COMPACT_PARTIAL; | 518 | return COMPACT_PARTIAL; |
495 | 519 | ||
496 | return COMPACT_CONTINUE; | 520 | return COMPACT_CONTINUE; |
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
522 | unsigned long nr_migrate, nr_remaining; | 546 | unsigned long nr_migrate, nr_remaining; |
523 | int err; | 547 | int err; |
524 | 548 | ||
525 | if (!isolate_migratepages(zone, cc)) | 549 | switch (isolate_migratepages(zone, cc)) { |
550 | case ISOLATE_ABORT: | ||
551 | ret = COMPACT_PARTIAL; | ||
552 | goto out; | ||
553 | case ISOLATE_NONE: | ||
526 | continue; | 554 | continue; |
555 | case ISOLATE_SUCCESS: | ||
556 | ; | ||
557 | } | ||
527 | 558 | ||
528 | nr_migrate = cc->nr_migratepages; | 559 | nr_migrate = cc->nr_migratepages; |
529 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 560 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
547 | 578 | ||
548 | } | 579 | } |
549 | 580 | ||
581 | out: | ||
550 | /* Release free pages and check accounting */ | 582 | /* Release free pages and check accounting */ |
551 | cc->nr_freepages -= release_freepages(&cc->freepages); | 583 | cc->nr_freepages -= release_freepages(&cc->freepages); |
552 | VM_BUG_ON(cc->nr_freepages != 0); | 584 | VM_BUG_ON(cc->nr_freepages != 0); |
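The compaction.c hunks above stop returning a raw page count from isolate_migratepages() and instead return a three-way result, so async compaction can abort cleanly when too many pages are already isolated. A minimal userspace sketch of that tri-state control flow, with illustrative names rather than the kernel functions:

/* Tri-state result pattern: 0 pages no longer has to mean both
 * "abort the run" and "keep scanning". */
#include <stdio.h>

typedef enum {
	ISOLATE_ABORT,		/* stop the whole compaction run */
	ISOLATE_NONE,		/* nothing this cycle, keep scanning */
	ISOLATE_SUCCESS,	/* work queued, go migrate it */
} isolate_result_t;

static isolate_result_t isolate_block(int block, int async)
{
	if (async && block == 2)	/* pretend the zone is congested */
		return ISOLATE_ABORT;
	if (block % 2)			/* pretend odd blocks are empty */
		return ISOLATE_NONE;
	return ISOLATE_SUCCESS;
}

int main(void)
{
	for (int block = 0; block < 4; block++) {
		switch (isolate_block(block, 1)) {
		case ISOLATE_ABORT:
			printf("block %d: abort run\n", block);
			return 0;
		case ISOLATE_NONE:
			continue;
		case ISOLATE_SUCCESS:
			printf("block %d: migrate pages\n", block);
		}
	}
	return 0;
}

The caller in compact_zone() can then treat ISOLATE_ABORT as a partial success instead of spinning on congestion_wait() in the async path.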
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3c..81532f297fd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2234,11 +2234,8 @@ static void khugepaged_loop(void) | |||
2234 | while (likely(khugepaged_enabled())) { | 2234 | while (likely(khugepaged_enabled())) { |
2235 | #ifndef CONFIG_NUMA | 2235 | #ifndef CONFIG_NUMA |
2236 | hpage = khugepaged_alloc_hugepage(); | 2236 | hpage = khugepaged_alloc_hugepage(); |
2237 | if (unlikely(!hpage)) { | 2237 | if (unlikely(!hpage)) |
2238 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2239 | break; | 2238 | break; |
2240 | } | ||
2241 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2242 | #else | 2239 | #else |
2243 | if (IS_ERR(hpage)) { | 2240 | if (IS_ERR(hpage)) { |
2244 | khugepaged_alloc_sleep(); | 2241 | khugepaged_alloc_sleep(); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6402458fee3..bfcf153bc82 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1111,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void) | |||
1111 | WARN_ON(page_count(page) != 1); | 1111 | WARN_ON(page_count(page) != 1); |
1112 | prep_compound_huge_page(page, h->order); | 1112 | prep_compound_huge_page(page, h->order); |
1113 | prep_new_huge_page(h, page, page_to_nid(page)); | 1113 | prep_new_huge_page(h, page, page_to_nid(page)); |
1114 | /* | ||
1115 | * If we had gigantic hugepages allocated at boot time, we need | ||
1116 | * to restore the 'stolen' pages to totalram_pages in order to | ||
1117 | * fix confusing memory reports from free(1) and other | ||
1118 | * side-effects, like CommitLimit going negative. | ||
1119 | */ | ||
1120 | if (h->order > (MAX_ORDER - 1)) | ||
1121 | totalram_pages += 1 << h->order; | ||
1114 | } | 1122 | } |
1115 | } | 1123 | } |
1116 | 1124 | ||
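The hugetlb.c hunk gives back to totalram_pages the pages "stolen" at boot for gigantic hugepages, so free(1) and CommitLimit stop looking wrong. A back-of-the-envelope check of what that adjustment amounts to, assuming 4 KiB base pages and an order-18 (1 GiB) gigantic page; this is just the arithmetic the hunk performs, not kernel code:

#include <stdio.h>

int main(void)
{
	unsigned int order = 18;		/* 2^18 * 4 KiB = 1 GiB */
	unsigned long pages = 1UL << order;	/* pages restored per hugepage */

	printf("restore %lu pages (%lu MiB) per gigantic hugepage\n",
	       pages, pages * 4096 / (1024 * 1024));
	return 0;
}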
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
1302 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); | 1302 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); |
1303 | ksm_scan.mm_slot = slot; | 1303 | ksm_scan.mm_slot = slot; |
1304 | spin_unlock(&ksm_mmlist_lock); | 1304 | spin_unlock(&ksm_mmlist_lock); |
1305 | /* | ||
1306 | * Although we tested list_empty() above, a racing __ksm_exit | ||
1307 | * of the last mm on the list may have removed it since then. | ||
1308 | */ | ||
1309 | if (slot == &ksm_mm_head) | ||
1310 | return NULL; | ||
1305 | next_mm: | 1311 | next_mm: |
1306 | ksm_scan.address = 0; | 1312 | ksm_scan.address = 0; |
1307 | ksm_scan.rmap_list = &slot->rmap_list; | 1313 | ksm_scan.rmap_list = &slot->rmap_list; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3a..e013b8e57d2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/limits.h> | 35 | #include <linux/limits.h> |
36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
37 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
38 | #include <linux/shmem_fs.h> | ||
38 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
39 | #include <linux/swap.h> | 40 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
@@ -107,10 +108,12 @@ enum mem_cgroup_events_index { | |||
107 | enum mem_cgroup_events_target { | 108 | enum mem_cgroup_events_target { |
108 | MEM_CGROUP_TARGET_THRESH, | 109 | MEM_CGROUP_TARGET_THRESH, |
109 | MEM_CGROUP_TARGET_SOFTLIMIT, | 110 | MEM_CGROUP_TARGET_SOFTLIMIT, |
111 | MEM_CGROUP_TARGET_NUMAINFO, | ||
110 | MEM_CGROUP_NTARGETS, | 112 | MEM_CGROUP_NTARGETS, |
111 | }; | 113 | }; |
112 | #define THRESHOLDS_EVENTS_TARGET (128) | 114 | #define THRESHOLDS_EVENTS_TARGET (128) |
113 | #define SOFTLIMIT_EVENTS_TARGET (1024) | 115 | #define SOFTLIMIT_EVENTS_TARGET (1024) |
116 | #define NUMAINFO_EVENTS_TARGET (1024) | ||
114 | 117 | ||
115 | struct mem_cgroup_stat_cpu { | 118 | struct mem_cgroup_stat_cpu { |
116 | long count[MEM_CGROUP_STAT_NSTATS]; | 119 | long count[MEM_CGROUP_STAT_NSTATS]; |
@@ -236,7 +239,8 @@ struct mem_cgroup { | |||
236 | int last_scanned_node; | 239 | int last_scanned_node; |
237 | #if MAX_NUMNODES > 1 | 240 | #if MAX_NUMNODES > 1 |
238 | nodemask_t scan_nodes; | 241 | nodemask_t scan_nodes; |
239 | unsigned long next_scan_node_update; | 242 | atomic_t numainfo_events; |
243 | atomic_t numainfo_updating; | ||
240 | #endif | 244 | #endif |
241 | /* | 245 | /* |
242 | * Should the accounting and control be hierarchical, per subtree? | 246 | * Should the accounting and control be hierarchical, per subtree? |
@@ -359,7 +363,7 @@ enum charge_type { | |||
359 | static void mem_cgroup_get(struct mem_cgroup *mem); | 363 | static void mem_cgroup_get(struct mem_cgroup *mem); |
360 | static void mem_cgroup_put(struct mem_cgroup *mem); | 364 | static void mem_cgroup_put(struct mem_cgroup *mem); |
361 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 365 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
362 | static void drain_all_stock_async(void); | 366 | static void drain_all_stock_async(struct mem_cgroup *mem); |
363 | 367 | ||
364 | static struct mem_cgroup_per_zone * | 368 | static struct mem_cgroup_per_zone * |
365 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 369 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, | |||
576 | return val; | 580 | return val; |
577 | } | 581 | } |
578 | 582 | ||
579 | static long mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
580 | { | ||
581 | long ret; | ||
582 | |||
583 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
584 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
585 | return ret; | ||
586 | } | ||
587 | |||
588 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 583 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
589 | bool charge) | 584 | bool charge) |
590 | { | 585 | { |
@@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) | |||
688 | case MEM_CGROUP_TARGET_SOFTLIMIT: | 683 | case MEM_CGROUP_TARGET_SOFTLIMIT: |
689 | next = val + SOFTLIMIT_EVENTS_TARGET; | 684 | next = val + SOFTLIMIT_EVENTS_TARGET; |
690 | break; | 685 | break; |
686 | case MEM_CGROUP_TARGET_NUMAINFO: | ||
687 | next = val + NUMAINFO_EVENTS_TARGET; | ||
688 | break; | ||
691 | default: | 689 | default: |
692 | return; | 690 | return; |
693 | } | 691 | } |
@@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | |||
706 | mem_cgroup_threshold(mem); | 704 | mem_cgroup_threshold(mem); |
707 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); | 705 | __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); |
708 | if (unlikely(__memcg_event_check(mem, | 706 | if (unlikely(__memcg_event_check(mem, |
709 | MEM_CGROUP_TARGET_SOFTLIMIT))){ | 707 | MEM_CGROUP_TARGET_SOFTLIMIT))) { |
710 | mem_cgroup_update_tree(mem, page); | 708 | mem_cgroup_update_tree(mem, page); |
711 | __mem_cgroup_target_update(mem, | 709 | __mem_cgroup_target_update(mem, |
712 | MEM_CGROUP_TARGET_SOFTLIMIT); | 710 | MEM_CGROUP_TARGET_SOFTLIMIT); |
713 | } | 711 | } |
712 | #if MAX_NUMNODES > 1 | ||
713 | if (unlikely(__memcg_event_check(mem, | ||
714 | MEM_CGROUP_TARGET_NUMAINFO))) { | ||
715 | atomic_inc(&mem->numainfo_events); | ||
716 | __mem_cgroup_target_update(mem, | ||
717 | MEM_CGROUP_TARGET_NUMAINFO); | ||
718 | } | ||
719 | #endif | ||
714 | } | 720 | } |
715 | } | 721 | } |
716 | 722 | ||
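The memcg_check_events() hunk above extends the existing events/target scheme with a MEM_CGROUP_TARGET_NUMAINFO slot: a cheap counter is bumped on every charge/uncharge event, and only when it crosses the precomputed target is the NUMA scan information flagged for a refresh. A rough userspace sketch of that pattern, with illustrative names rather than the memcg implementation:

#include <stdio.h>

#define NUMAINFO_EVENTS_TARGET 1024

static unsigned long events;		/* bumped on every page event */
static unsigned long numainfo_target = NUMAINFO_EVENTS_TARGET;

static void check_events(void)
{
	events++;
	if (events >= numainfo_target) {	/* crossed the target */
		printf("recompute NUMA info at event %lu\n", events);
		numainfo_target = events + NUMAINFO_EVENTS_TARGET;
	}
}

int main(void)
{
	for (int i = 0; i < 3000; i++)
		check_events();
	return 0;
}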
@@ -735,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
735 | struct mem_cgroup, css); | 741 | struct mem_cgroup, css); |
736 | } | 742 | } |
737 | 743 | ||
738 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 744 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
739 | { | 745 | { |
740 | struct mem_cgroup *mem = NULL; | 746 | struct mem_cgroup *mem = NULL; |
741 | 747 | ||
@@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, | |||
1128 | return MEM_CGROUP_ZSTAT(mz, lru); | 1134 | return MEM_CGROUP_ZSTAT(mz, lru); |
1129 | } | 1135 | } |
1130 | 1136 | ||
1131 | #ifdef CONFIG_NUMA | ||
1132 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | 1137 | static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, |
1133 | int nid) | 1138 | int nid) |
1134 | { | 1139 | { |
@@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, | |||
1140 | return ret; | 1145 | return ret; |
1141 | } | 1146 | } |
1142 | 1147 | ||
1148 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1149 | int nid) | ||
1150 | { | ||
1151 | unsigned long ret; | ||
1152 | |||
1153 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1154 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1155 | return ret; | ||
1156 | } | ||
1157 | |||
1158 | #if MAX_NUMNODES > 1 | ||
1143 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | 1159 | static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) |
1144 | { | 1160 | { |
1145 | u64 total = 0; | 1161 | u64 total = 0; |
@@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) | |||
1151 | return total; | 1167 | return total; |
1152 | } | 1168 | } |
1153 | 1169 | ||
1154 | static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, | ||
1155 | int nid) | ||
1156 | { | ||
1157 | unsigned long ret; | ||
1158 | |||
1159 | ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + | ||
1160 | mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); | ||
1161 | |||
1162 | return ret; | ||
1163 | } | ||
1164 | |||
1165 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) | 1170 | static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) |
1166 | { | 1171 | { |
1167 | u64 total = 0; | 1172 | u64 total = 0; |
@@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1558 | return ret; | 1563 | return ret; |
1559 | } | 1564 | } |
1560 | 1565 | ||
1566 | /** | ||
1567 | * test_mem_cgroup_node_reclaimable | ||
1568 | * @mem: the target memcg | ||
1569 | * @nid: the node ID to be checked. | ||
1570 | * @noswap: specify true here if the user wants file-only information. | ||
1571 | * | ||
1572 | * This function returns whether the specified memcg contains any | ||
1573 | * reclaimable pages on a node. Returns true if there are any reclaimable | ||
1574 | * pages in the node. | ||
1575 | */ | ||
1576 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, | ||
1577 | int nid, bool noswap) | ||
1578 | { | ||
1579 | if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) | ||
1580 | return true; | ||
1581 | if (noswap || !total_swap_pages) | ||
1582 | return false; | ||
1583 | if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) | ||
1584 | return true; | ||
1585 | return false; | ||
1586 | |||
1587 | } | ||
1561 | #if MAX_NUMNODES > 1 | 1588 | #if MAX_NUMNODES > 1 |
1562 | 1589 | ||
1563 | /* | 1590 | /* |
@@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1569 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | 1596 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) |
1570 | { | 1597 | { |
1571 | int nid; | 1598 | int nid; |
1572 | 1599 | /* | |
1573 | if (time_after(mem->next_scan_node_update, jiffies)) | 1600 | * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET |
1601 | * pagein/pageout changes since the last update. | ||
1602 | */ | ||
1603 | if (!atomic_read(&mem->numainfo_events)) | ||
1604 | return; | ||
1605 | if (atomic_inc_return(&mem->numainfo_updating) > 1) | ||
1574 | return; | 1606 | return; |
1575 | 1607 | ||
1576 | mem->next_scan_node_update = jiffies + 10*HZ; | ||
1577 | /* make a nodemask where this memcg uses memory from */ | 1608 | /* make a nodemask where this memcg uses memory from */ |
1578 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; | 1609 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; |
1579 | 1610 | ||
1580 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1611 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { |
1581 | 1612 | ||
1582 | if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || | 1613 | if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) |
1583 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) | 1614 | node_clear(nid, mem->scan_nodes); |
1584 | continue; | ||
1585 | |||
1586 | if (total_swap_pages && | ||
1587 | (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || | ||
1588 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) | ||
1589 | continue; | ||
1590 | node_clear(nid, mem->scan_nodes); | ||
1591 | } | 1615 | } |
1616 | |||
1617 | atomic_set(&mem->numainfo_events, 0); | ||
1618 | atomic_set(&mem->numainfo_updating, 0); | ||
1592 | } | 1619 | } |
1593 | 1620 | ||
1594 | /* | 1621 | /* |
@@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | |||
1626 | return node; | 1653 | return node; |
1627 | } | 1654 | } |
1628 | 1655 | ||
1656 | /* | ||
1657 | * Check all nodes whether it contains reclaimable pages or not. | ||
1658 | * For quick scan, we make use of scan_nodes. This will allow us to skip | ||
1659 | * unused nodes. But scan_nodes is lazily updated and may not contain | ||
1660 | * enough new information. We need to double-check. | ||
1661 | */ | ||
1662 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | ||
1663 | { | ||
1664 | int nid; | ||
1665 | |||
1666 | /* | ||
1667 | * quick check...making use of scan_node. | ||
1668 | * We can skip unused nodes. | ||
1669 | */ | ||
1670 | if (!nodes_empty(mem->scan_nodes)) { | ||
1671 | for (nid = first_node(mem->scan_nodes); | ||
1672 | nid < MAX_NUMNODES; | ||
1673 | nid = next_node(nid, mem->scan_nodes)) { | ||
1674 | |||
1675 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | ||
1676 | return true; | ||
1677 | } | ||
1678 | } | ||
1679 | /* | ||
1680 | * Check rest of nodes. | ||
1681 | */ | ||
1682 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
1683 | if (node_isset(nid, mem->scan_nodes)) | ||
1684 | continue; | ||
1685 | if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) | ||
1686 | return true; | ||
1687 | } | ||
1688 | return false; | ||
1689 | } | ||
1690 | |||
1629 | #else | 1691 | #else |
1630 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | 1692 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) |
1631 | { | 1693 | { |
1632 | return 0; | 1694 | return 0; |
1633 | } | 1695 | } |
1696 | |||
1697 | bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) | ||
1698 | { | ||
1699 | return test_mem_cgroup_node_reclaimable(mem, 0, noswap); | ||
1700 | } | ||
1634 | #endif | 1701 | #endif |
1635 | 1702 | ||
1636 | /* | 1703 | /* |
@@ -1663,15 +1730,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1663 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; | 1730 | excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; |
1664 | 1731 | ||
1665 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1732 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
1666 | if (root_mem->memsw_is_minimum) | 1733 | if (!check_soft && root_mem->memsw_is_minimum) |
1667 | noswap = true; | 1734 | noswap = true; |
1668 | 1735 | ||
1669 | while (1) { | 1736 | while (1) { |
1670 | victim = mem_cgroup_select_victim(root_mem); | 1737 | victim = mem_cgroup_select_victim(root_mem); |
1671 | if (victim == root_mem) { | 1738 | if (victim == root_mem) { |
1672 | loop++; | 1739 | loop++; |
1673 | if (loop >= 1) | 1740 | /* |
1674 | drain_all_stock_async(); | 1741 | * We are not draining per cpu cached charges during |
1742 | * soft limit reclaim because global reclaim doesn't | ||
1743 | * care about charges. It tries to free some memory and | ||
1744 | * charges will not give any. | ||
1745 | */ | ||
1746 | if (!check_soft && loop >= 1) | ||
1747 | drain_all_stock_async(root_mem); | ||
1675 | if (loop >= 2) { | 1748 | if (loop >= 2) { |
1676 | /* | 1749 | /* |
1677 | * If we have not been able to reclaim | 1750 | * If we have not been able to reclaim |
@@ -1695,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1695 | } | 1768 | } |
1696 | } | 1769 | } |
1697 | } | 1770 | } |
1698 | if (!mem_cgroup_local_usage(victim)) { | 1771 | if (!mem_cgroup_reclaimable(victim, noswap)) { |
1699 | /* this cgroup's local usage == 0 */ | 1772 | /* this cgroup's local usage == 0 */ |
1700 | css_put(&victim->css); | 1773 | css_put(&victim->css); |
1701 | continue; | 1774 | continue; |
@@ -1934,9 +2007,11 @@ struct memcg_stock_pcp { | |||
1934 | struct mem_cgroup *cached; /* this never be root cgroup */ | 2007 | struct mem_cgroup *cached; /* this never be root cgroup */ |
1935 | unsigned int nr_pages; | 2008 | unsigned int nr_pages; |
1936 | struct work_struct work; | 2009 | struct work_struct work; |
2010 | unsigned long flags; | ||
2011 | #define FLUSHING_CACHED_CHARGE (0) | ||
1937 | }; | 2012 | }; |
1938 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 2013 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
1939 | static atomic_t memcg_drain_count; | 2014 | static DEFINE_MUTEX(percpu_charge_mutex); |
1940 | 2015 | ||
1941 | /* | 2016 | /* |
1942 | * Try to consume stocked charge on this cpu. If success, one page is consumed | 2017 | * Try to consume stocked charge on this cpu. If success, one page is consumed |
@@ -1984,6 +2059,7 @@ static void drain_local_stock(struct work_struct *dummy) | |||
1984 | { | 2059 | { |
1985 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | 2060 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); |
1986 | drain_stock(stock); | 2061 | drain_stock(stock); |
2062 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | ||
1987 | } | 2063 | } |
1988 | 2064 | ||
1989 | /* | 2065 | /* |
@@ -2008,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) | |||
2008 | * expects some charges will be back to res_counter later but cannot wait for | 2084 | * expects some charges will be back to res_counter later but cannot wait for |
2009 | * it. | 2085 | * it. |
2010 | */ | 2086 | */ |
2011 | static void drain_all_stock_async(void) | 2087 | static void drain_all_stock_async(struct mem_cgroup *root_mem) |
2012 | { | 2088 | { |
2013 | int cpu; | 2089 | int cpu, curcpu; |
2014 | /* This function is for scheduling "drain" in asynchronous way. | 2090 | /* |
2015 | * The result of "drain" is not directly handled by callers. Then, | 2091 | * If someone calls draining, avoid adding more kworker runs. |
2016 | * if someone is calling drain, we don't have to call drain more. | ||
2017 | * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if | ||
2018 | * there is a race. We just do loose check here. | ||
2019 | */ | 2092 | */ |
2020 | if (atomic_read(&memcg_drain_count)) | 2093 | if (!mutex_trylock(&percpu_charge_mutex)) |
2021 | return; | 2094 | return; |
2022 | /* Notify other cpus that system-wide "drain" is running */ | 2095 | /* Notify other cpus that system-wide "drain" is running */ |
2023 | atomic_inc(&memcg_drain_count); | ||
2024 | get_online_cpus(); | 2096 | get_online_cpus(); |
2097 | /* | ||
2098 | * Get a hint for avoiding draining charges on the current cpu, | ||
2099 | * which must be exhausted by our charging. It is not required that | ||
2100 | * this be a precise check, so we use raw_smp_processor_id() instead of | ||
2101 | * getcpu()/putcpu(). | ||
2102 | */ | ||
2103 | curcpu = raw_smp_processor_id(); | ||
2025 | for_each_online_cpu(cpu) { | 2104 | for_each_online_cpu(cpu) { |
2026 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2105 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2027 | schedule_work_on(cpu, &stock->work); | 2106 | struct mem_cgroup *mem; |
2107 | |||
2108 | if (cpu == curcpu) | ||
2109 | continue; | ||
2110 | |||
2111 | mem = stock->cached; | ||
2112 | if (!mem) | ||
2113 | continue; | ||
2114 | if (mem != root_mem) { | ||
2115 | if (!root_mem->use_hierarchy) | ||
2116 | continue; | ||
2117 | /* check whether "mem" is under tree of "root_mem" */ | ||
2118 | if (!css_is_ancestor(&mem->css, &root_mem->css)) | ||
2119 | continue; | ||
2120 | } | ||
2121 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | ||
2122 | schedule_work_on(cpu, &stock->work); | ||
2028 | } | 2123 | } |
2029 | put_online_cpus(); | 2124 | put_online_cpus(); |
2030 | atomic_dec(&memcg_drain_count); | 2125 | mutex_unlock(&percpu_charge_mutex); |
2031 | /* We don't wait for flush_work */ | 2126 | /* We don't wait for flush_work */ |
2032 | } | 2127 | } |
2033 | 2128 | ||
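The drain_all_stock_async() rework above replaces the global memcg_drain_count heuristic with a percpu_charge_mutex trylock plus a per-stock FLUSHING_CACHED_CHARGE bit, and it skips both the caller's own CPU and stocks outside the target hierarchy. A condensed userspace sketch of the queueing discipline (the hierarchy check is omitted and all names are illustrative, not the kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

struct stock {
	int cached_pages;
	atomic_bool flushing;	/* set while a drain is queued */
};

static struct stock stocks[NR_CPUS];
static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

static void drain_all_async(int curcpu)
{
	if (pthread_mutex_trylock(&drain_mutex))
		return;				/* someone is already draining */

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == curcpu || !stocks[cpu].cached_pages)
			continue;		/* our cpu is already exhausted */
		if (!atomic_exchange(&stocks[cpu].flushing, true))
			printf("queue drain work on cpu %d\n", cpu);
	}
	pthread_mutex_unlock(&drain_mutex);
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		stocks[cpu].cached_pages = cpu;	/* cpu 0 has nothing cached */
	drain_all_async(2);			/* caller runs on cpu 2 */
	return 0;
}

The flag is cleared by the worker itself (as drain_local_stock() now does), so a stock can be queued at most once per drain.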
@@ -2035,9 +2130,9 @@ static void drain_all_stock_async(void) | |||
2035 | static void drain_all_stock_sync(void) | 2130 | static void drain_all_stock_sync(void) |
2036 | { | 2131 | { |
2037 | /* called when force_empty is called */ | 2132 | /* called when force_empty is called */ |
2038 | atomic_inc(&memcg_drain_count); | 2133 | mutex_lock(&percpu_charge_mutex); |
2039 | schedule_on_each_cpu(drain_local_stock); | 2134 | schedule_on_each_cpu(drain_local_stock); |
2040 | atomic_dec(&memcg_drain_count); | 2135 | mutex_unlock(&percpu_charge_mutex); |
2041 | } | 2136 | } |
2042 | 2137 | ||
2043 | /* | 2138 | /* |
@@ -4640,6 +4735,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4640 | { | 4735 | { |
4641 | .name = "numa_stat", | 4736 | .name = "numa_stat", |
4642 | .open = mem_control_numa_stat_open, | 4737 | .open = mem_control_numa_stat_open, |
4738 | .mode = S_IRUGO, | ||
4643 | }, | 4739 | }, |
4644 | #endif | 4740 | #endif |
4645 | }; | 4741 | }; |
@@ -5414,18 +5510,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
5414 | struct cgroup *old_cont, | 5510 | struct cgroup *old_cont, |
5415 | struct task_struct *p) | 5511 | struct task_struct *p) |
5416 | { | 5512 | { |
5417 | struct mm_struct *mm; | 5513 | struct mm_struct *mm = get_task_mm(p); |
5418 | 5514 | ||
5419 | if (!mc.to) | ||
5420 | /* no need to move charge */ | ||
5421 | return; | ||
5422 | |||
5423 | mm = get_task_mm(p); | ||
5424 | if (mm) { | 5515 | if (mm) { |
5425 | mem_cgroup_move_charge(mm); | 5516 | if (mc.to) |
5517 | mem_cgroup_move_charge(mm); | ||
5518 | put_swap_token(mm); | ||
5426 | mmput(mm); | 5519 | mmput(mm); |
5427 | } | 5520 | } |
5428 | mem_cgroup_clear_mc(); | 5521 | if (mc.to) |
5522 | mem_cgroup_clear_mc(); | ||
5429 | } | 5523 | } |
5430 | #else /* !CONFIG_MMU */ | 5524 | #else /* !CONFIG_MMU */ |
5431 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | 5525 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928..740c4f52059 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@ | |||
52 | #include <linux/swapops.h> | 52 | #include <linux/swapops.h> |
53 | #include <linux/hugetlb.h> | 53 | #include <linux/hugetlb.h> |
54 | #include <linux/memory_hotplug.h> | 54 | #include <linux/memory_hotplug.h> |
55 | #include <linux/mm_inline.h> | ||
55 | #include "internal.h" | 56 | #include "internal.h" |
56 | 57 | ||
57 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 58 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
@@ -390,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
390 | struct task_struct *tsk; | 391 | struct task_struct *tsk; |
391 | struct anon_vma *av; | 392 | struct anon_vma *av; |
392 | 393 | ||
393 | read_lock(&tasklist_lock); | ||
394 | av = page_lock_anon_vma(page); | 394 | av = page_lock_anon_vma(page); |
395 | if (av == NULL) /* Not actually mapped anymore */ | 395 | if (av == NULL) /* Not actually mapped anymore */ |
396 | goto out; | 396 | return; |
397 | |||
398 | read_lock(&tasklist_lock); | ||
397 | for_each_process (tsk) { | 399 | for_each_process (tsk) { |
398 | struct anon_vma_chain *vmac; | 400 | struct anon_vma_chain *vmac; |
399 | 401 | ||
@@ -407,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
407 | add_to_kill(tsk, page, vma, to_kill, tkc); | 409 | add_to_kill(tsk, page, vma, to_kill, tkc); |
408 | } | 410 | } |
409 | } | 411 | } |
410 | page_unlock_anon_vma(av); | ||
411 | out: | ||
412 | read_unlock(&tasklist_lock); | 412 | read_unlock(&tasklist_lock); |
413 | page_unlock_anon_vma(av); | ||
413 | } | 414 | } |
414 | 415 | ||
415 | /* | 416 | /* |
@@ -423,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
423 | struct prio_tree_iter iter; | 424 | struct prio_tree_iter iter; |
424 | struct address_space *mapping = page->mapping; | 425 | struct address_space *mapping = page->mapping; |
425 | 426 | ||
426 | /* | ||
427 | * A note on the locking order between the two locks. | ||
428 | * We don't rely on this particular order. | ||
429 | * If you have some other code that needs a different order | ||
430 | * feel free to switch them around. Or add a reverse link | ||
431 | * from mm_struct to task_struct, then this could be all | ||
432 | * done without taking tasklist_lock and looping over all tasks. | ||
433 | */ | ||
434 | |||
435 | read_lock(&tasklist_lock); | ||
436 | mutex_lock(&mapping->i_mmap_mutex); | 427 | mutex_lock(&mapping->i_mmap_mutex); |
428 | read_lock(&tasklist_lock); | ||
437 | for_each_process(tsk) { | 429 | for_each_process(tsk) { |
438 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 430 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
439 | 431 | ||
@@ -453,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, | |||
453 | add_to_kill(tsk, page, vma, to_kill, tkc); | 445 | add_to_kill(tsk, page, vma, to_kill, tkc); |
454 | } | 446 | } |
455 | } | 447 | } |
456 | mutex_unlock(&mapping->i_mmap_mutex); | ||
457 | read_unlock(&tasklist_lock); | 448 | read_unlock(&tasklist_lock); |
449 | mutex_unlock(&mapping->i_mmap_mutex); | ||
458 | } | 450 | } |
459 | 451 | ||
460 | /* | 452 | /* |
@@ -1468,7 +1460,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1468 | put_page(page); | 1460 | put_page(page); |
1469 | if (!ret) { | 1461 | if (!ret) { |
1470 | LIST_HEAD(pagelist); | 1462 | LIST_HEAD(pagelist); |
1471 | 1463 | inc_zone_page_state(page, NR_ISOLATED_ANON + | |
1464 | page_is_file_cache(page)); | ||
1472 | list_add(&page->lru, &pagelist); | 1465 | list_add(&page->lru, &pagelist); |
1473 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1466 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1474 | 0, true); | 1467 | 0, true); |
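The collect_procs_anon()/collect_procs_file() hunks move tasklist_lock inside the anon_vma lock and i_mmap_mutex, so every path takes the two locks in the single order now documented in mm/rmap.c. A tiny illustration of why one agreed order matters, with plain userspace mutexes standing in for the kernel locks:

#include <pthread.h>

static pthread_mutex_t mapping_lock = PTHREAD_MUTEX_INITIALIZER;  /* outer */
static pthread_mutex_t tasklist_lock = PTHREAD_MUTEX_INITIALIZER; /* inner */

static void collect_procs(void)
{
	/* Always outer lock first, inner lock second; two threads taking
	 * them in opposite orders can deadlock (ABBA). */
	pthread_mutex_lock(&mapping_lock);
	pthread_mutex_lock(&tasklist_lock);
	/* ... walk tasks against the mapping ... */
	pthread_mutex_unlock(&tasklist_lock);
	pthread_mutex_unlock(&mapping_lock);
}

int main(void)
{
	collect_procs();
	return 0;
}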
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e0..9b8a01d941c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
305 | if (batch->nr == batch->max) { | 305 | if (batch->nr == batch->max) { |
306 | if (!tlb_next_batch(tlb)) | 306 | if (!tlb_next_batch(tlb)) |
307 | return 0; | 307 | return 0; |
308 | batch = tlb->active; | ||
308 | } | 309 | } |
309 | VM_BUG_ON(batch->nr > batch->max); | 310 | VM_BUG_ON(batch->nr > batch->max); |
310 | 311 | ||
@@ -1112,11 +1113,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
1112 | int force_flush = 0; | 1113 | int force_flush = 0; |
1113 | int rss[NR_MM_COUNTERS]; | 1114 | int rss[NR_MM_COUNTERS]; |
1114 | spinlock_t *ptl; | 1115 | spinlock_t *ptl; |
1116 | pte_t *start_pte; | ||
1115 | pte_t *pte; | 1117 | pte_t *pte; |
1116 | 1118 | ||
1117 | again: | 1119 | again: |
1118 | init_rss_vec(rss); | 1120 | init_rss_vec(rss); |
1119 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 1121 | start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
1122 | pte = start_pte; | ||
1120 | arch_enter_lazy_mmu_mode(); | 1123 | arch_enter_lazy_mmu_mode(); |
1121 | do { | 1124 | do { |
1122 | pte_t ptent = *pte; | 1125 | pte_t ptent = *pte; |
@@ -1196,7 +1199,7 @@ again: | |||
1196 | 1199 | ||
1197 | add_mm_rss_vec(mm, rss); | 1200 | add_mm_rss_vec(mm, rss); |
1198 | arch_leave_lazy_mmu_mode(); | 1201 | arch_leave_lazy_mmu_mode(); |
1199 | pte_unmap_unlock(pte - 1, ptl); | 1202 | pte_unmap_unlock(start_pte, ptl); |
1200 | 1203 | ||
1201 | /* | 1204 | /* |
1202 | * mmu_gather ran out of room to batch pages, we break out of | 1205 | * mmu_gather ran out of room to batch pages, we break out of |
@@ -1296,7 +1299,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
1296 | 1299 | ||
1297 | /** | 1300 | /** |
1298 | * unmap_vmas - unmap a range of memory covered by a list of vma's | 1301 | * unmap_vmas - unmap a range of memory covered by a list of vma's |
1299 | * @tlbp: address of the caller's struct mmu_gather | 1302 | * @tlb: address of the caller's struct mmu_gather |
1300 | * @vma: the starting vma | 1303 | * @vma: the starting vma |
1301 | * @start_addr: virtual address at which to start unmapping | 1304 | * @start_addr: virtual address at which to start unmapping |
1302 | * @end_addr: virtual address at which to end unmapping | 1305 | * @end_addr: virtual address at which to end unmapping |
@@ -2796,30 +2799,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2796 | } | 2799 | } |
2797 | EXPORT_SYMBOL(unmap_mapping_range); | 2800 | EXPORT_SYMBOL(unmap_mapping_range); |
2798 | 2801 | ||
2799 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | ||
2800 | { | ||
2801 | struct address_space *mapping = inode->i_mapping; | ||
2802 | |||
2803 | /* | ||
2804 | * If the underlying filesystem is not going to provide | ||
2805 | * a way to truncate a range of blocks (punch a hole) - | ||
2806 | * we should return failure right now. | ||
2807 | */ | ||
2808 | if (!inode->i_op->truncate_range) | ||
2809 | return -ENOSYS; | ||
2810 | |||
2811 | mutex_lock(&inode->i_mutex); | ||
2812 | down_write(&inode->i_alloc_sem); | ||
2813 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
2814 | truncate_inode_pages_range(mapping, offset, end); | ||
2815 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
2816 | inode->i_op->truncate_range(inode, offset, end); | ||
2817 | up_write(&inode->i_alloc_sem); | ||
2818 | mutex_unlock(&inode->i_mutex); | ||
2819 | |||
2820 | return 0; | ||
2821 | } | ||
2822 | |||
2823 | /* | 2802 | /* |
2824 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 2803 | * We enter with non-exclusive mmap_sem (to exclude vma changes, |
2825 | * but allow concurrent faults), and pte mapped but not yet locked. | 2804 | * but allow concurrent faults), and pte mapped but not yet locked. |
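In the zap_pte_range() hunks, the unlock now uses the pointer returned by pte_offset_map_lock() (start_pte) rather than pte - 1, because the loop can break out early when the mmu_gather batch fills up, leaving pte somewhere in the middle of the range. A plain-C sketch of that pairing rule, with stand-in map/unmap helpers rather than the kernel API:

#include <stdio.h>

static int table[8];

static int *map_range(void)  { printf("map %p\n",   (void *)table); return table; }
static void unmap_range(int *p) { printf("unmap %p\n", (void *)p); }

int main(void)
{
	int *start = map_range();	/* keep what map_range() returned */
	int *cur = start;

	for (int i = 0; i < 8; i++, cur++) {
		if (i == 3)
			break;		/* early exit: cur != start + 8 */
	}
	unmap_range(start);		/* correct; cur - 1 would be wrong here */
	return 0;
}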
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32..c46887b5a11 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -494,6 +494,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
494 | /* init node's zones as empty zones, we don't have any present pages.*/ | 494 | /* init node's zones as empty zones, we don't have any present pages.*/ |
495 | free_area_init_node(nid, zones_size, start_pfn, zholes_size); | 495 | free_area_init_node(nid, zones_size, start_pfn, zholes_size); |
496 | 496 | ||
497 | /* | ||
498 | * The node we allocated has no zone fallback lists. For avoiding | ||
499 | * to access not-initialized zonelist, build here. | ||
500 | */ | ||
501 | mutex_lock(&zonelists_mutex); | ||
502 | build_all_zonelists(NULL); | ||
503 | mutex_unlock(&zonelists_mutex); | ||
504 | |||
497 | return pgdat; | 505 | return pgdat; |
498 | } | 506 | } |
499 | 507 | ||
@@ -515,7 +523,7 @@ int mem_online_node(int nid) | |||
515 | 523 | ||
516 | lock_memory_hotplug(); | 524 | lock_memory_hotplug(); |
517 | pgdat = hotadd_new_pgdat(nid, 0); | 525 | pgdat = hotadd_new_pgdat(nid, 0); |
518 | if (pgdat) { | 526 | if (!pgdat) { |
519 | ret = -ENOMEM; | 527 | ret = -ENOMEM; |
520 | goto out; | 528 | goto out; |
521 | } | 529 | } |
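Besides building zonelists for a newly added node, the memory_hotplug.c diff fixes an inverted error check in mem_online_node(): the old code returned -ENOMEM precisely when hotadd_new_pgdat() succeeded. A minimal illustration of the corrected idiom with a stand-in allocator:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *pgdat = malloc(64);	/* stand-in for hotadd_new_pgdat() */

	if (!pgdat) {			/* fail only when allocation failed */
		fprintf(stderr, "out of memory\n");
		return 1;
	}
	printf("node data at %p\n", pgdat);
	free(pgdat);
	return 0;
}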
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983..666e4e67741 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
288 | */ | 288 | */ |
289 | __dec_zone_page_state(page, NR_FILE_PAGES); | 289 | __dec_zone_page_state(page, NR_FILE_PAGES); |
290 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 290 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
291 | if (PageSwapBacked(page)) { | 291 | if (!PageSwapCache(page) && PageSwapBacked(page)) { |
292 | __dec_zone_page_state(page, NR_SHMEM); | 292 | __dec_zone_page_state(page, NR_SHMEM); |
293 | __inc_zone_page_state(newpage, NR_SHMEM); | 293 | __inc_zone_page_state(newpage, NR_SHMEM); |
294 | } | 294 | } |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | |||
906 | if (anon_vma) | 906 | if (anon_vma) |
907 | return anon_vma; | 907 | return anon_vma; |
908 | try_prev: | 908 | try_prev: |
909 | /* | 909 | near = vma->vm_prev; |
910 | * It is potentially slow to have to call find_vma_prev here. | ||
911 | * But it's only on the first write fault on the vma, not | ||
912 | * every time, and we could devise a way to avoid it later | ||
913 | * (e.g. stash info in next's anon_vma_node when assigning | ||
914 | * an anon_vma, or when trying vma_merge). Another time. | ||
915 | */ | ||
916 | BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma); | ||
917 | if (!near) | 910 | if (!near) |
918 | goto none; | 911 | goto none; |
919 | 912 | ||
@@ -2044,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2044 | return -EINVAL; | 2037 | return -EINVAL; |
2045 | 2038 | ||
2046 | /* Find the first overlapping VMA */ | 2039 | /* Find the first overlapping VMA */ |
2047 | vma = find_vma_prev(mm, start, &prev); | 2040 | vma = find_vma(mm, start); |
2048 | if (!vma) | 2041 | if (!vma) |
2049 | return 0; | 2042 | return 0; |
2043 | prev = vma->vm_prev; | ||
2050 | /* we have start < vma->vm_end */ | 2044 | /* we have start < vma->vm_end */ |
2051 | 2045 | ||
2052 | /* if it doesn't overlap, we have nothing.. */ | 2046 | /* if it doesn't overlap, we have nothing.. */ |
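Both mmap.c hunks rely on the vm_prev back-pointer that VMAs now carry, so the predecessor VMA becomes a field read instead of a find_vma_prev() lookup. An illustrative sketch (not the kernel's vm_area_struct) of what that buys:

#include <stdio.h>
#include <stddef.h>

struct vma {
	unsigned long start, end;
	struct vma *vm_next, *vm_prev;
};

int main(void)
{
	struct vma a = { 0x1000, 0x2000, NULL, NULL };
	struct vma b = { 0x3000, 0x4000, NULL, NULL };

	a.vm_next = &b;
	b.vm_prev = &a;			/* maintained whenever the list changes */

	struct vma *prev = b.vm_prev;	/* O(1); no list/tree walk needed */
	printf("prev of [%lx,%lx) is [%lx,%lx)\n",
	       b.start, b.end, prev->start, prev->end);
	return 0;
}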
diff --git a/mm/nommu.c b/mm/nommu.c
index 1fd0c51b10a..9edc897a397 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1813,10 +1813,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1813 | return NULL; | 1813 | return NULL; |
1814 | } | 1814 | } |
1815 | 1815 | ||
1816 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, | 1816 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, |
1817 | unsigned long to, unsigned long size, pgprot_t prot) | 1817 | unsigned long pfn, unsigned long size, pgprot_t prot) |
1818 | { | 1818 | { |
1819 | vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; | 1819 | if (addr != (pfn << PAGE_SHIFT)) |
1820 | return -EINVAL; | ||
1821 | |||
1822 | vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; | ||
1820 | return 0; | 1823 | return 0; |
1821 | } | 1824 | } |
1822 | EXPORT_SYMBOL(remap_pfn_range); | 1825 | EXPORT_SYMBOL(remap_pfn_range); |
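On nommu, remap_pfn_range() no longer rewrites vma->vm_start; it only accepts requests that already describe the identity mapping. A small sketch of the sanity check the new code enforces, assuming PAGE_SHIFT is 12 (illustrative, not the kernel implementation):

#include <stdio.h>

#define PAGE_SHIFT 12

static int check_identity_map(unsigned long addr, unsigned long pfn)
{
	if (addr != (pfn << PAGE_SHIFT))
		return -1;	/* would need real translation: reject */
	return 0;
}

int main(void)
{
	printf("%d\n", check_identity_map(0x10000000, 0x10000)); /* ok */
	printf("%d\n", check_identity_map(0x10001000, 0x10000)); /* rejected */
	return 0;
}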
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1b..53bffc6c293 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr) | |||
162 | } | 162 | } |
163 | #endif | 163 | #endif |
164 | 164 | ||
165 | static int __meminit init_section_page_cgroup(unsigned long pfn) | 165 | static int __meminit init_section_page_cgroup(unsigned long pfn, int nid) |
166 | { | 166 | { |
167 | struct page_cgroup *base, *pc; | 167 | struct page_cgroup *base, *pc; |
168 | struct mem_section *section; | 168 | struct mem_section *section; |
169 | unsigned long table_size; | 169 | unsigned long table_size; |
170 | unsigned long nr; | 170 | unsigned long nr; |
171 | int nid, index; | 171 | int index; |
172 | 172 | ||
173 | nr = pfn_to_section_nr(pfn); | 173 | nr = pfn_to_section_nr(pfn); |
174 | section = __nr_to_section(nr); | 174 | section = __nr_to_section(nr); |
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn) | |||
176 | if (section->page_cgroup) | 176 | if (section->page_cgroup) |
177 | return 0; | 177 | return 0; |
178 | 178 | ||
179 | nid = page_to_nid(pfn_to_page(pfn)); | ||
180 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 179 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
181 | base = alloc_page_cgroup(table_size, nid); | 180 | base = alloc_page_cgroup(table_size, nid); |
182 | 181 | ||
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn) | |||
196 | pc = base + index; | 195 | pc = base + index; |
197 | init_page_cgroup(pc, nr); | 196 | init_page_cgroup(pc, nr); |
198 | } | 197 | } |
199 | 198 | /* | |
199 | * The passed "pfn" may not be aligned to SECTION. For the calculation | ||
200 | * we need to apply a mask. | ||
201 | */ | ||
202 | pfn &= PAGE_SECTION_MASK; | ||
200 | section->page_cgroup = base - pfn; | 203 | section->page_cgroup = base - pfn; |
201 | total_usage += table_size; | 204 | total_usage += table_size; |
202 | return 0; | 205 | return 0; |
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn, | |||
225 | start = start_pfn & ~(PAGES_PER_SECTION - 1); | 228 | start = start_pfn & ~(PAGES_PER_SECTION - 1); |
226 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); | 229 | end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); |
227 | 230 | ||
231 | if (nid == -1) { | ||
232 | /* | ||
233 | * In this case, "nid" already exists and contains valid memory. | ||
234 | * "start_pfn" passed to us is a pfn which is an arg for | ||
235 | * online__pages(), and start_pfn should exist. | ||
236 | */ | ||
237 | nid = pfn_to_nid(start_pfn); | ||
238 | VM_BUG_ON(!node_state(nid, N_ONLINE)); | ||
239 | } | ||
240 | |||
228 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { | 241 | for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { |
229 | if (!pfn_present(pfn)) | 242 | if (!pfn_present(pfn)) |
230 | continue; | 243 | continue; |
231 | fail = init_section_page_cgroup(pfn); | 244 | fail = init_section_page_cgroup(pfn, nid); |
232 | } | 245 | } |
233 | if (!fail) | 246 | if (!fail) |
234 | return 0; | 247 | return 0; |
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, | |||
284 | void __init page_cgroup_init(void) | 297 | void __init page_cgroup_init(void) |
285 | { | 298 | { |
286 | unsigned long pfn; | 299 | unsigned long pfn; |
287 | int fail = 0; | 300 | int nid; |
288 | 301 | ||
289 | if (mem_cgroup_disabled()) | 302 | if (mem_cgroup_disabled()) |
290 | return; | 303 | return; |
291 | 304 | ||
292 | for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { | 305 | for_each_node_state(nid, N_HIGH_MEMORY) { |
293 | if (!pfn_present(pfn)) | 306 | unsigned long start_pfn, end_pfn; |
294 | continue; | 307 | |
295 | fail = init_section_page_cgroup(pfn); | 308 | start_pfn = node_start_pfn(nid); |
296 | } | 309 | end_pfn = node_end_pfn(nid); |
297 | if (fail) { | 310 | /* |
298 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); | 311 | * start_pfn and end_pfn may not be aligned to SECTION and the |
299 | panic("Out of memory"); | 312 | * page->flags of out of node pages are not initialized. So we |
300 | } else { | 313 | * scan [start_pfn, the biggest section's pfn < end_pfn) here. |
301 | hotplug_memory_notifier(page_cgroup_callback, 0); | 314 | */ |
315 | for (pfn = start_pfn; | ||
316 | pfn < end_pfn; | ||
317 | pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) { | ||
318 | |||
319 | if (!pfn_valid(pfn)) | ||
320 | continue; | ||
321 | /* | ||
322 | * Nodes' pfns can be overlapping. | ||
323 | * We know some arch can have a nodes layout such as | ||
324 | * -------------pfn--------------> | ||
325 | * N0 | N1 | N2 | N0 | N1 | N2|.... | ||
326 | */ | ||
327 | if (pfn_to_nid(pfn) != nid) | ||
328 | continue; | ||
329 | if (init_section_page_cgroup(pfn, nid)) | ||
330 | goto oom; | ||
331 | } | ||
302 | } | 332 | } |
333 | hotplug_memory_notifier(page_cgroup_callback, 0); | ||
303 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | 334 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); |
304 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" | 335 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you " |
305 | " want memory cgroups\n"); | 336 | "don't want memory cgroups\n"); |
337 | return; | ||
338 | oom: | ||
339 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); | ||
340 | panic("Out of memory"); | ||
306 | } | 341 | } |
307 | 342 | ||
308 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | 343 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) |
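The init_section_page_cgroup() hunk masks the pfn with PAGE_SECTION_MASK before storing "base - pfn", because the pfn handed in by the new per-node loop is no longer guaranteed to be section aligned. A small sketch of that alignment arithmetic, assuming 32768 pages per section as on common x86_64 configs (illustrative, not the kernel code):

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL
#define PAGE_SECTION_MASK (~(PAGES_PER_SECTION - 1))

static int table[PAGES_PER_SECTION];	/* one entry per page in the section */

int main(void)
{
	unsigned long pfn = 100000;			/* not section-aligned */
	unsigned long start = pfn & PAGE_SECTION_MASK;	/* 98304: section start */

	/* Store the base so that "base + pfn" lands on the right entry... */
	int *base = table - start;

	/* ...then any pfn inside the section indexes its own slot. */
	printf("pfn %lu -> slot %ld\n", pfn, (long)(&base[pfn] - table));
	return 0;
}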
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -38,9 +38,8 @@ | |||
38 | * in arch-dependent flush_dcache_mmap_lock, | 38 | * in arch-dependent flush_dcache_mmap_lock, |
39 | * within inode_wb_list_lock in __sync_single_inode) | 39 | * within inode_wb_list_lock in __sync_single_inode) |
40 | * | 40 | * |
41 | * (code doesn't rely on that order so it could be switched around) | 41 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) |
42 | * ->tasklist_lock | 42 | * ->tasklist_lock |
43 | * anon_vma->mutex (memory_failure, collect_procs_anon) | ||
44 | * pte map lock | 43 | * pte map lock |
45 | */ | 44 | */ |
46 | 45 | ||
@@ -112,9 +111,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
112 | kmem_cache_free(anon_vma_cachep, anon_vma); | 111 | kmem_cache_free(anon_vma_cachep, anon_vma); |
113 | } | 112 | } |
114 | 113 | ||
115 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | 114 | static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) |
116 | { | 115 | { |
117 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | 116 | return kmem_cache_alloc(anon_vma_chain_cachep, gfp); |
118 | } | 117 | } |
119 | 118 | ||
120 | static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | 119 | static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) |
@@ -159,7 +158,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
159 | struct mm_struct *mm = vma->vm_mm; | 158 | struct mm_struct *mm = vma->vm_mm; |
160 | struct anon_vma *allocated; | 159 | struct anon_vma *allocated; |
161 | 160 | ||
162 | avc = anon_vma_chain_alloc(); | 161 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
163 | if (!avc) | 162 | if (!avc) |
164 | goto out_enomem; | 163 | goto out_enomem; |
165 | 164 | ||
@@ -200,6 +199,32 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
200 | return -ENOMEM; | 199 | return -ENOMEM; |
201 | } | 200 | } |
202 | 201 | ||
202 | /* | ||
203 | * This is a useful helper function for locking the anon_vma root as | ||
204 | * we traverse the vma->anon_vma_chain, looping over anon_vma's that | ||
205 | * have the same vma. | ||
206 | * | ||
207 | * Such anon_vma's should have the same root, so you'd expect to see | ||
208 | * just a single mutex_lock for the whole traversal. | ||
209 | */ | ||
210 | static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) | ||
211 | { | ||
212 | struct anon_vma *new_root = anon_vma->root; | ||
213 | if (new_root != root) { | ||
214 | if (WARN_ON_ONCE(root)) | ||
215 | mutex_unlock(&root->mutex); | ||
216 | root = new_root; | ||
217 | mutex_lock(&root->mutex); | ||
218 | } | ||
219 | return root; | ||
220 | } | ||
221 | |||
222 | static inline void unlock_anon_vma_root(struct anon_vma *root) | ||
223 | { | ||
224 | if (root) | ||
225 | mutex_unlock(&root->mutex); | ||
226 | } | ||
227 | |||
203 | static void anon_vma_chain_link(struct vm_area_struct *vma, | 228 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
204 | struct anon_vma_chain *avc, | 229 | struct anon_vma_chain *avc, |
205 | struct anon_vma *anon_vma) | 230 | struct anon_vma *anon_vma) |
@@ -208,13 +233,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
208 | avc->anon_vma = anon_vma; | 233 | avc->anon_vma = anon_vma; |
209 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 234 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
210 | 235 | ||
211 | anon_vma_lock(anon_vma); | ||
212 | /* | 236 | /* |
213 | * It's critical to add new vmas to the tail of the anon_vma, | 237 | * It's critical to add new vmas to the tail of the anon_vma, |
214 | * see comment in huge_memory.c:__split_huge_page(). | 238 | * see comment in huge_memory.c:__split_huge_page(). |
215 | */ | 239 | */ |
216 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | 240 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); |
217 | anon_vma_unlock(anon_vma); | ||
218 | } | 241 | } |
219 | 242 | ||
220 | /* | 243 | /* |
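The new lock_anon_vma_root()/unlock_anon_vma_root() helpers let anon_vma_clone() and unlink_anon_vmas() hold one root mutex across a whole chain walk, re-locking only when an entry hangs off a different root, instead of locking and unlocking once per entry. A userspace sketch of that batching idea, with illustrative types rather than the kernel structures:

#include <pthread.h>
#include <stdio.h>
#include <stddef.h>

struct root  { pthread_mutex_t lock; };
struct entry { struct root *root; };

static struct root *lock_root(struct root *held, struct root *want)
{
	if (want != held) {
		if (held)
			pthread_mutex_unlock(&held->lock);
		pthread_mutex_lock(&want->lock);
		held = want;
	}
	return held;		/* caller unlocks whatever is still held */
}

int main(void)
{
	struct root r;
	pthread_mutex_init(&r.lock, NULL);

	struct entry chain[3] = { { &r }, { &r }, { &r } };
	struct root *held = NULL;

	for (int i = 0; i < 3; i++) {
		held = lock_root(held, chain[i].root);	/* locked once, reused */
		printf("entry %d handled under its root lock\n", i);
	}
	if (held)
		pthread_mutex_unlock(&held->lock);
	return 0;
}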
@@ -224,13 +247,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
224 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | 247 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) |
225 | { | 248 | { |
226 | struct anon_vma_chain *avc, *pavc; | 249 | struct anon_vma_chain *avc, *pavc; |
250 | struct anon_vma *root = NULL; | ||
227 | 251 | ||
228 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { | 252 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { |
229 | avc = anon_vma_chain_alloc(); | 253 | struct anon_vma *anon_vma; |
230 | if (!avc) | 254 | |
231 | goto enomem_failure; | 255 | avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN); |
232 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | 256 | if (unlikely(!avc)) { |
257 | unlock_anon_vma_root(root); | ||
258 | root = NULL; | ||
259 | avc = anon_vma_chain_alloc(GFP_KERNEL); | ||
260 | if (!avc) | ||
261 | goto enomem_failure; | ||
262 | } | ||
263 | anon_vma = pavc->anon_vma; | ||
264 | root = lock_anon_vma_root(root, anon_vma); | ||
265 | anon_vma_chain_link(dst, avc, anon_vma); | ||
233 | } | 266 | } |
267 | unlock_anon_vma_root(root); | ||
234 | return 0; | 268 | return 0; |
235 | 269 | ||
236 | enomem_failure: | 270 | enomem_failure: |
@@ -263,7 +297,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
263 | anon_vma = anon_vma_alloc(); | 297 | anon_vma = anon_vma_alloc(); |
264 | if (!anon_vma) | 298 | if (!anon_vma) |
265 | goto out_error; | 299 | goto out_error; |
266 | avc = anon_vma_chain_alloc(); | 300 | avc = anon_vma_chain_alloc(GFP_KERNEL); |
267 | if (!avc) | 301 | if (!avc) |
268 | goto out_error_free_anon_vma; | 302 | goto out_error_free_anon_vma; |
269 | 303 | ||
@@ -280,7 +314,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
280 | get_anon_vma(anon_vma->root); | 314 | get_anon_vma(anon_vma->root); |
281 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 315 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
282 | vma->anon_vma = anon_vma; | 316 | vma->anon_vma = anon_vma; |
317 | anon_vma_lock(anon_vma); | ||
283 | anon_vma_chain_link(vma, avc, anon_vma); | 318 | anon_vma_chain_link(vma, avc, anon_vma); |
319 | anon_vma_unlock(anon_vma); | ||
284 | 320 | ||
285 | return 0; | 321 | return 0; |
286 | 322 | ||
@@ -291,36 +327,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
291 | return -ENOMEM; | 327 | return -ENOMEM; |
292 | } | 328 | } |
293 | 329 | ||
294 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) | ||
295 | { | ||
296 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; | ||
297 | int empty; | ||
298 | |||
299 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
300 | if (!anon_vma) | ||
301 | return; | ||
302 | |||
303 | anon_vma_lock(anon_vma); | ||
304 | list_del(&anon_vma_chain->same_anon_vma); | ||
305 | |||
306 | /* We must garbage collect the anon_vma if it's empty */ | ||
307 | empty = list_empty(&anon_vma->head); | ||
308 | anon_vma_unlock(anon_vma); | ||
309 | |||
310 | if (empty) | ||
311 | put_anon_vma(anon_vma); | ||
312 | } | ||
313 | |||
314 | void unlink_anon_vmas(struct vm_area_struct *vma) | 330 | void unlink_anon_vmas(struct vm_area_struct *vma) |
315 | { | 331 | { |
316 | struct anon_vma_chain *avc, *next; | 332 | struct anon_vma_chain *avc, *next; |
333 | struct anon_vma *root = NULL; | ||
317 | 334 | ||
318 | /* | 335 | /* |
319 | * Unlink each anon_vma chained to the VMA. This list is ordered | 336 | * Unlink each anon_vma chained to the VMA. This list is ordered |
320 | * from newest to oldest, ensuring the root anon_vma gets freed last. | 337 | * from newest to oldest, ensuring the root anon_vma gets freed last. |
321 | */ | 338 | */ |
322 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 339 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
323 | anon_vma_unlink(avc); | 340 | struct anon_vma *anon_vma = avc->anon_vma; |
341 | |||
342 | root = lock_anon_vma_root(root, anon_vma); | ||
343 | list_del(&avc->same_anon_vma); | ||
344 | |||
345 | /* | ||
346 | * Leave empty anon_vmas on the list - we'll need | ||
347 | * to free them outside the lock. | ||
348 | */ | ||
349 | if (list_empty(&anon_vma->head)) | ||
350 | continue; | ||
351 | |||
352 | list_del(&avc->same_vma); | ||
353 | anon_vma_chain_free(avc); | ||
354 | } | ||
355 | unlock_anon_vma_root(root); | ||
356 | |||
357 | /* | ||
358 | * Iterate the list once more, it now only contains empty and unlinked | ||
359 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | ||
360 | * needing to acquire the anon_vma->root->mutex. | ||
361 | */ | ||
362 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
363 | struct anon_vma *anon_vma = avc->anon_vma; | ||
364 | |||
365 | put_anon_vma(anon_vma); | ||
366 | |||
324 | list_del(&avc->same_vma); | 367 | list_del(&avc->same_vma); |
325 | anon_vma_chain_free(avc); | 368 | anon_vma_chain_free(avc); |
326 | } | 369 | } |
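The mm/rmap.c changes above batch the anon_vma locking: instead of taking and dropping each anon_vma's lock once per chain entry, lock_anon_vma_root() caches the root that is currently held and only switches mutexes when a chain entry points at a different root, so a traversal whose anon_vmas all share one root pays for a single lock/unlock pair. Below is a minimal userspace sketch of the same pattern using pthread mutexes; struct root, struct item, lock_root(), unlock_root() and the main() driver are invented names for illustration, not kernel API.

#include <pthread.h>
#include <stdio.h>

struct root { pthread_mutex_t mutex; };
struct item { struct root *root; };

/* Lock item->root, but only switch locks when the root changes. */
static struct root *lock_root(struct root *held, struct item *it)
{
	struct root *new_root = it->root;

	if (new_root != held) {
		if (held)
			pthread_mutex_unlock(&held->mutex);
		held = new_root;
		pthread_mutex_lock(&held->mutex);
	}
	return held;
}

static void unlock_root(struct root *held)
{
	if (held)
		pthread_mutex_unlock(&held->mutex);
}

int main(void)
{
	struct root r = { PTHREAD_MUTEX_INITIALIZER };
	struct item items[3] = { { &r }, { &r }, { &r } };
	struct root *held = NULL;
	int i, switches = 0;

	for (i = 0; i < 3; i++) {
		struct root *prev = held;

		held = lock_root(held, &items[i]);
		if (held != prev)
			switches++;
		/* ... operate on items[i] under its root's mutex ... */
	}
	unlock_root(held);
	printf("lock switches for 3 items sharing one root: %d\n", switches);
	return 0;
}

The kernel version additionally has WARN_ON_ONCE(root) to flag the unexpected case of two different roots turning up on one VMA's chain; the sketch just switches silently.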
diff --git a/mm/shmem.c b/mm/shmem.c index d221a1cfd7b..fcedf5464eb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -539,7 +539,7 @@ static void shmem_free_pages(struct list_head *next) | |||
539 | } while (next); | 539 | } while (next); |
540 | } | 540 | } |
541 | 541 | ||
542 | static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | 542 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) |
543 | { | 543 | { |
544 | struct shmem_inode_info *info = SHMEM_I(inode); | 544 | struct shmem_inode_info *info = SHMEM_I(inode); |
545 | unsigned long idx; | 545 | unsigned long idx; |
@@ -562,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | |||
562 | spinlock_t *punch_lock; | 562 | spinlock_t *punch_lock; |
563 | unsigned long upper_limit; | 563 | unsigned long upper_limit; |
564 | 564 | ||
565 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
566 | |||
565 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 567 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
566 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 568 | idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
567 | if (idx >= info->next_index) | 569 | if (idx >= info->next_index) |
@@ -738,16 +740,8 @@ done2: | |||
738 | * lowered next_index. Also, though shmem_getpage checks | 740 | * lowered next_index. Also, though shmem_getpage checks |
739 | * i_size before adding to cache, no recheck after: so fix the | 741 | * i_size before adding to cache, no recheck after: so fix the |
740 | * narrow window there too. | 742 | * narrow window there too. |
741 | * | ||
742 | * Recalling truncate_inode_pages_range and unmap_mapping_range | ||
743 | * every time for punch_hole (which never got a chance to clear | ||
744 | * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive, | ||
745 | * yet hardly ever necessary: try to optimize them out later. | ||
746 | */ | 743 | */ |
747 | truncate_inode_pages_range(inode->i_mapping, start, end); | 744 | truncate_inode_pages_range(inode->i_mapping, start, end); |
748 | if (punch_hole) | ||
749 | unmap_mapping_range(inode->i_mapping, start, | ||
750 | end - start, 1); | ||
751 | } | 745 | } |
752 | 746 | ||
753 | spin_lock(&info->lock); | 747 | spin_lock(&info->lock); |
@@ -766,22 +760,23 @@ done2: | |||
766 | shmem_free_pages(pages_to_free.next); | 760 | shmem_free_pages(pages_to_free.next); |
767 | } | 761 | } |
768 | } | 762 | } |
763 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | ||
769 | 764 | ||
770 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | 765 | static int shmem_setattr(struct dentry *dentry, struct iattr *attr) |
771 | { | 766 | { |
772 | struct inode *inode = dentry->d_inode; | 767 | struct inode *inode = dentry->d_inode; |
773 | loff_t newsize = attr->ia_size; | ||
774 | int error; | 768 | int error; |
775 | 769 | ||
776 | error = inode_change_ok(inode, attr); | 770 | error = inode_change_ok(inode, attr); |
777 | if (error) | 771 | if (error) |
778 | return error; | 772 | return error; |
779 | 773 | ||
780 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) | 774 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
781 | && newsize != inode->i_size) { | 775 | loff_t oldsize = inode->i_size; |
776 | loff_t newsize = attr->ia_size; | ||
782 | struct page *page = NULL; | 777 | struct page *page = NULL; |
783 | 778 | ||
784 | if (newsize < inode->i_size) { | 779 | if (newsize < oldsize) { |
785 | /* | 780 | /* |
786 | * If truncating down to a partial page, then | 781 | * If truncating down to a partial page, then |
787 | * if that page is already allocated, hold it | 782 | * if that page is already allocated, hold it |
@@ -810,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
810 | spin_unlock(&info->lock); | 805 | spin_unlock(&info->lock); |
811 | } | 806 | } |
812 | } | 807 | } |
813 | 808 | if (newsize != oldsize) { | |
814 | /* XXX(truncate): truncate_setsize should be called last */ | 809 | i_size_write(inode, newsize); |
815 | truncate_setsize(inode, newsize); | 810 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
811 | } | ||
812 | if (newsize < oldsize) { | ||
813 | loff_t holebegin = round_up(newsize, PAGE_SIZE); | ||
814 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | ||
815 | shmem_truncate_range(inode, newsize, (loff_t)-1); | ||
816 | /* unmap again to remove racily COWed private pages */ | ||
817 | unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); | ||
818 | } | ||
816 | if (page) | 819 | if (page) |
817 | page_cache_release(page); | 820 | page_cache_release(page); |
818 | shmem_truncate_range(inode, newsize, (loff_t)-1); | ||
819 | } | 821 | } |
820 | 822 | ||
821 | setattr_copy(inode, attr); | 823 | setattr_copy(inode, attr); |
@@ -832,7 +834,6 @@ static void shmem_evict_inode(struct inode *inode) | |||
832 | struct shmem_xattr *xattr, *nxattr; | 834 | struct shmem_xattr *xattr, *nxattr; |
833 | 835 | ||
834 | if (inode->i_mapping->a_ops == &shmem_aops) { | 836 | if (inode->i_mapping->a_ops == &shmem_aops) { |
835 | truncate_inode_pages(inode->i_mapping, 0); | ||
836 | shmem_unacct_size(info->flags, inode->i_size); | 837 | shmem_unacct_size(info->flags, inode->i_size); |
837 | inode->i_size = 0; | 838 | inode->i_size = 0; |
838 | shmem_truncate_range(inode, 0, (loff_t)-1); | 839 | shmem_truncate_range(inode, 0, (loff_t)-1); |
@@ -2706,7 +2707,7 @@ static const struct file_operations shmem_file_operations = { | |||
2706 | }; | 2707 | }; |
2707 | 2708 | ||
2708 | static const struct inode_operations shmem_inode_operations = { | 2709 | static const struct inode_operations shmem_inode_operations = { |
2709 | .setattr = shmem_notify_change, | 2710 | .setattr = shmem_setattr, |
2710 | .truncate_range = shmem_truncate_range, | 2711 | .truncate_range = shmem_truncate_range, |
2711 | #ifdef CONFIG_TMPFS_XATTR | 2712 | #ifdef CONFIG_TMPFS_XATTR |
2712 | .setxattr = shmem_setxattr, | 2713 | .setxattr = shmem_setxattr, |
@@ -2739,7 +2740,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2739 | .removexattr = shmem_removexattr, | 2740 | .removexattr = shmem_removexattr, |
2740 | #endif | 2741 | #endif |
2741 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2742 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2742 | .setattr = shmem_notify_change, | 2743 | .setattr = shmem_setattr, |
2743 | .check_acl = generic_check_acl, | 2744 | .check_acl = generic_check_acl, |
2744 | #endif | 2745 | #endif |
2745 | }; | 2746 | }; |
@@ -2752,7 +2753,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2752 | .removexattr = shmem_removexattr, | 2753 | .removexattr = shmem_removexattr, |
2753 | #endif | 2754 | #endif |
2754 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2755 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2755 | .setattr = shmem_notify_change, | 2756 | .setattr = shmem_setattr, |
2756 | .check_acl = generic_check_acl, | 2757 | .check_acl = generic_check_acl, |
2757 | #endif | 2758 | #endif |
2758 | }; | 2759 | }; |
@@ -2908,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2908 | return 0; | 2909 | return 0; |
2909 | } | 2910 | } |
2910 | 2911 | ||
2912 | void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) | ||
2913 | { | ||
2914 | truncate_inode_pages_range(inode->i_mapping, start, end); | ||
2915 | } | ||
2916 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | ||
2917 | |||
2911 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2918 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
2912 | /** | 2919 | /** |
2913 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | 2920 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file |
@@ -3028,3 +3035,26 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3028 | vma->vm_flags |= VM_CAN_NONLINEAR; | 3035 | vma->vm_flags |= VM_CAN_NONLINEAR; |
3029 | return 0; | 3036 | return 0; |
3030 | } | 3037 | } |
3038 | |||
3039 | /** | ||
3040 | * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. | ||
3041 | * @mapping: the page's address_space | ||
3042 | * @index: the page index | ||
3043 | * @gfp: the page allocator flags to use if allocating | ||
3044 | * | ||
3045 | * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", | ||
3046 | * with any new page allocations done using the specified allocation flags. | ||
3047 | * But read_cache_page_gfp() uses the ->readpage() method: which does not | ||
3048 | * suit tmpfs, since it may have pages in swapcache, and needs to find those | ||
3049 | * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. | ||
3050 | * | ||
3051 | * Provide a stub for those callers to start using now, then later | ||
3052 | * flesh it out to call shmem_getpage() with additional gfp mask, when | ||
3053 | * shmem_file_splice_read() is added and shmem_readpage() is removed. | ||
3054 | */ | ||
3055 | struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | ||
3056 | pgoff_t index, gfp_t gfp) | ||
3057 | { | ||
3058 | return read_cache_page_gfp(mapping, index, gfp); | ||
3059 | } | ||
3060 | EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); | ||
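shmem_read_mapping_page_gfp() above is deliberately a thin stub around read_cache_page_gfp() for now; per its comment, the point is to give drivers/gpu/drm (i915, ttm) an interface they can switch to immediately and that can later learn to find swapcache pages via shmem_getpage(). The sketch below is a hedged illustration of how a caller might use it; touch_object_page() is a hypothetical helper, while shmem_read_mapping_page_gfp(), mapping_gfp_mask(), IS_ERR()/PTR_ERR() and page_cache_release() are the interfaces assumed from this patch and the surrounding kernel.

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

/*
 * Hypothetical caller: bring one backing page of a tmpfs file into the
 * page cache, use it, and drop the reference.
 */
static int touch_object_page(struct file *filp, pgoff_t index)
{
	struct address_space *mapping = filp->f_mapping;
	struct page *page;

	page = shmem_read_mapping_page_gfp(mapping, index,
					   mapping_gfp_mask(mapping));
	if (IS_ERR(page))
		return PTR_ERR(page);

	/* ... read from or map the page here ... */

	page_cache_release(page);	/* drop the reference taken above */
	return 0;
}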
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3604,13 +3604,14 @@ free_done: | |||
3604 | * Release an obj back to its cache. If the obj has a constructed state, it must | 3604 | * Release an obj back to its cache. If the obj has a constructed state, it must |
3605 | * be in this state _before_ it is released. Called with disabled ints. | 3605 | * be in this state _before_ it is released. Called with disabled ints. |
3606 | */ | 3606 | */ |
3607 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) | 3607 | static inline void __cache_free(struct kmem_cache *cachep, void *objp, |
3608 | void *caller) | ||
3608 | { | 3609 | { |
3609 | struct array_cache *ac = cpu_cache_get(cachep); | 3610 | struct array_cache *ac = cpu_cache_get(cachep); |
3610 | 3611 | ||
3611 | check_irq_off(); | 3612 | check_irq_off(); |
3612 | kmemleak_free_recursive(objp, cachep->flags); | 3613 | kmemleak_free_recursive(objp, cachep->flags); |
3613 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3614 | objp = cache_free_debugcheck(cachep, objp, caller); |
3614 | 3615 | ||
3615 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | 3616 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); |
3616 | 3617 | ||
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3801 | debug_check_no_locks_freed(objp, obj_size(cachep)); | 3802 | debug_check_no_locks_freed(objp, obj_size(cachep)); |
3802 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3803 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3803 | debug_check_no_obj_freed(objp, obj_size(cachep)); | 3804 | debug_check_no_obj_freed(objp, obj_size(cachep)); |
3804 | __cache_free(cachep, objp); | 3805 | __cache_free(cachep, objp, __builtin_return_address(0)); |
3805 | local_irq_restore(flags); | 3806 | local_irq_restore(flags); |
3806 | 3807 | ||
3807 | trace_kmem_cache_free(_RET_IP_, objp); | 3808 | trace_kmem_cache_free(_RET_IP_, objp); |
@@ -3831,7 +3832,7 @@ void kfree(const void *objp) | |||
3831 | c = virt_to_cache(objp); | 3832 | c = virt_to_cache(objp); |
3832 | debug_check_no_locks_freed(objp, obj_size(c)); | 3833 | debug_check_no_locks_freed(objp, obj_size(c)); |
3833 | debug_check_no_obj_freed(objp, obj_size(c)); | 3834 | debug_check_no_obj_freed(objp, obj_size(c)); |
3834 | __cache_free(c, (void *)objp); | 3835 | __cache_free(c, (void *)objp, __builtin_return_address(0)); |
3835 | local_irq_restore(flags); | 3836 | local_irq_restore(flags); |
3836 | } | 3837 | } |
3837 | EXPORT_SYMBOL(kfree); | 3838 | EXPORT_SYMBOL(kfree); |
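The mm/slab.c hunks above change __cache_free() to take the caller address as an argument instead of computing __builtin_return_address(0) itself; since __cache_free() is inlined into kmem_cache_free() and kfree(), capturing the address at those exported entry points makes the debug free-checking report the user's call site rather than an internal frame. The same idea in a self-contained userspace sketch, with invented names debug_free()/record_free():

#include <stdio.h>
#include <stdlib.h>

/* Inline helper: takes the caller explicitly instead of looking it up. */
static inline void record_free(void *ptr, void *caller)
{
	fprintf(stderr, "free of %p requested from %p\n", ptr, caller);
	free(ptr);
}

/* Exported wrapper: capture the real call site once, at the boundary. */
void debug_free(void *ptr)
{
	record_free(ptr, __builtin_return_address(0));
}

int main(void)
{
	void *p = malloc(32);

	debug_free(p);	/* the report points at main(), not at record_free() */
	return 0;
}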
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) | |||
2320 | BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < | 2320 | BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < |
2321 | SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); | 2321 | SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); |
2322 | 2322 | ||
2323 | #ifdef CONFIG_CMPXCHG_LOCAL | ||
2324 | /* | 2323 | /* |
2325 | * Must align to double word boundary for the double cmpxchg instructions | 2324 | * Must align to double word boundary for the double cmpxchg |
2326 | * to work. | 2325 | * instructions to work; see __pcpu_double_call_return_bool(). |
2327 | */ | 2326 | */ |
2328 | s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *)); | 2327 | s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), |
2329 | #else | 2328 | 2 * sizeof(void *)); |
2330 | /* Regular alignment is sufficient */ | ||
2331 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | ||
2332 | #endif | ||
2333 | 2329 | ||
2334 | if (!s->cpu_slab) | 2330 | if (!s->cpu_slab) |
2335 | return 0; | 2331 | return 0; |
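The mm/slub.c hunk drops the CONFIG_CMPXCHG_LOCAL special case: the per-CPU kmem_cache_cpu is now always allocated with 2 * sizeof(void *) alignment, because the double-word cmpxchg used on the fast path only operates on naturally aligned pointer pairs. The sketch below just demonstrates that alignment requirement in plain C; aligned_alloc() stands in for __alloc_percpu() with an explicit alignment, and the struct name is invented.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* A freelist pointer / transaction id pair, updated with one double-word CAS. */
struct kmem_cache_cpu_like {
	void *freelist;
	unsigned long tid;
} __attribute__((aligned(2 * sizeof(void *))));

int main(void)
{
	/* aligned_alloc stands in for __alloc_percpu(size, 2 * sizeof(void *)) */
	struct kmem_cache_cpu_like *c =
		aligned_alloc(2 * sizeof(void *), sizeof(*c));

	if (!c)
		return 1;

	/* cmpxchg16b-style instructions fault unless this is 0. */
	printf("misalignment: %lu\n",
	       (unsigned long)((uintptr_t)c % (2 * sizeof(void *))));
	free(c);
	return 0;
}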
diff --git a/mm/swapfile.c b/mm/swapfile.c index d537d29e9b7..ff8dc1a18cb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -14,7 +14,7 @@ | |||
14 | #include <linux/vmalloc.h> | 14 | #include <linux/vmalloc.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/namei.h> | 16 | #include <linux/namei.h> |
17 | #include <linux/shm.h> | 17 | #include <linux/shmem_fs.h> |
18 | #include <linux/blkdev.h> | 18 | #include <linux/blkdev.h> |
19 | #include <linux/random.h> | 19 | #include <linux/random.h> |
20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
diff --git a/mm/thrash.c b/mm/thrash.c index 2372d4ed5dd..fabf2d0f516 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -21,14 +21,40 @@ | |||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
23 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
24 | #include <linux/memcontrol.h> | ||
25 | |||
26 | #include <trace/events/vmscan.h> | ||
27 | |||
28 | #define TOKEN_AGING_INTERVAL (0xFF) | ||
24 | 29 | ||
25 | static DEFINE_SPINLOCK(swap_token_lock); | 30 | static DEFINE_SPINLOCK(swap_token_lock); |
26 | struct mm_struct *swap_token_mm; | 31 | struct mm_struct *swap_token_mm; |
32 | struct mem_cgroup *swap_token_memcg; | ||
27 | static unsigned int global_faults; | 33 | static unsigned int global_faults; |
34 | static unsigned int last_aging; | ||
35 | |||
36 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
37 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
38 | { | ||
39 | struct mem_cgroup *memcg; | ||
40 | |||
41 | memcg = try_get_mem_cgroup_from_mm(mm); | ||
42 | if (memcg) | ||
43 | css_put(mem_cgroup_css(memcg)); | ||
44 | |||
45 | return memcg; | ||
46 | } | ||
47 | #else | ||
48 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
49 | { | ||
50 | return NULL; | ||
51 | } | ||
52 | #endif | ||
28 | 53 | ||
29 | void grab_swap_token(struct mm_struct *mm) | 54 | void grab_swap_token(struct mm_struct *mm) |
30 | { | 55 | { |
31 | int current_interval; | 56 | int current_interval; |
57 | unsigned int old_prio = mm->token_priority; | ||
32 | 58 | ||
33 | global_faults++; | 59 | global_faults++; |
34 | 60 | ||
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm) | |||
38 | return; | 64 | return; |
39 | 65 | ||
40 | /* First come first served */ | 66 | /* First come first served */ |
41 | if (swap_token_mm == NULL) { | 67 | if (!swap_token_mm) |
42 | mm->token_priority = mm->token_priority + 2; | 68 | goto replace_token; |
43 | swap_token_mm = mm; | 69 | |
44 | goto out; | 70 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { |
71 | swap_token_mm->token_priority /= 2; | ||
72 | last_aging = global_faults; | ||
45 | } | 73 | } |
46 | 74 | ||
47 | if (mm != swap_token_mm) { | 75 | if (mm == swap_token_mm) { |
48 | if (current_interval < mm->last_interval) | ||
49 | mm->token_priority++; | ||
50 | else { | ||
51 | if (likely(mm->token_priority > 0)) | ||
52 | mm->token_priority--; | ||
53 | } | ||
54 | /* Check if we deserve the token */ | ||
55 | if (mm->token_priority > swap_token_mm->token_priority) { | ||
56 | mm->token_priority += 2; | ||
57 | swap_token_mm = mm; | ||
58 | } | ||
59 | } else { | ||
60 | /* Token holder came in again! */ | ||
61 | mm->token_priority += 2; | 76 | mm->token_priority += 2; |
77 | goto update_priority; | ||
78 | } | ||
79 | |||
80 | if (current_interval < mm->last_interval) | ||
81 | mm->token_priority++; | ||
82 | else { | ||
83 | if (likely(mm->token_priority > 0)) | ||
84 | mm->token_priority--; | ||
62 | } | 85 | } |
63 | 86 | ||
87 | /* Check if we deserve the token */ | ||
88 | if (mm->token_priority > swap_token_mm->token_priority) | ||
89 | goto replace_token; | ||
90 | |||
91 | update_priority: | ||
92 | trace_update_swap_token_priority(mm, old_prio, swap_token_mm); | ||
93 | |||
64 | out: | 94 | out: |
65 | mm->faultstamp = global_faults; | 95 | mm->faultstamp = global_faults; |
66 | mm->last_interval = current_interval; | 96 | mm->last_interval = current_interval; |
67 | spin_unlock(&swap_token_lock); | 97 | spin_unlock(&swap_token_lock); |
98 | return; | ||
99 | |||
100 | replace_token: | ||
101 | mm->token_priority += 2; | ||
102 | trace_replace_swap_token(swap_token_mm, mm); | ||
103 | swap_token_mm = mm; | ||
104 | swap_token_memcg = swap_token_memcg_from_mm(mm); | ||
105 | last_aging = global_faults; | ||
106 | goto out; | ||
68 | } | 107 | } |
69 | 108 | ||
70 | /* Called on process exit. */ | 109 | /* Called on process exit. */ |
71 | void __put_swap_token(struct mm_struct *mm) | 110 | void __put_swap_token(struct mm_struct *mm) |
72 | { | 111 | { |
73 | spin_lock(&swap_token_lock); | 112 | spin_lock(&swap_token_lock); |
74 | if (likely(mm == swap_token_mm)) | 113 | if (likely(mm == swap_token_mm)) { |
114 | trace_put_swap_token(swap_token_mm); | ||
75 | swap_token_mm = NULL; | 115 | swap_token_mm = NULL; |
116 | swap_token_memcg = NULL; | ||
117 | } | ||
76 | spin_unlock(&swap_token_lock); | 118 | spin_unlock(&swap_token_lock); |
77 | } | 119 | } |
120 | |||
121 | static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) | ||
122 | { | ||
123 | if (!a) | ||
124 | return true; | ||
125 | if (!b) | ||
126 | return true; | ||
127 | if (a == b) | ||
128 | return true; | ||
129 | return false; | ||
130 | } | ||
131 | |||
132 | void disable_swap_token(struct mem_cgroup *memcg) | ||
133 | { | ||
134 | /* memcg reclaim must not disable an unrelated mm's token. */ | ||
135 | if (match_memcg(memcg, swap_token_memcg)) { | ||
136 | spin_lock(&swap_token_lock); | ||
137 | if (match_memcg(memcg, swap_token_memcg)) { | ||
138 | trace_disable_swap_token(swap_token_mm); | ||
139 | swap_token_mm = NULL; | ||
140 | swap_token_memcg = NULL; | ||
141 | } | ||
142 | spin_unlock(&swap_token_lock); | ||
143 | } | ||
144 | } | ||
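The mm/thrash.c rewrite keeps the swap token policy but adds ageing (the holder's priority is halved every TOKEN_AGING_INTERVAL global faults) and remembers which memcg the holder belongs to so memcg reclaim only clears a related token. The ageing and hand-over logic can be shown on its own; the sketch below is a simplified userspace model (it always bumps a competitor's priority, whereas grab_swap_token() first compares the fault interval against mm->last_interval, and it ignores memcg entirely). struct task, fault() and token_holder are invented names.

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 0xFF

struct task { unsigned int priority; };

static struct task *token_holder;
static unsigned int global_faults, last_aging;

/* Called on a major fault by 'tsk'; mirrors grab_swap_token()'s policy. */
static void fault(struct task *tsk)
{
	global_faults++;

	if (!token_holder)
		goto take;

	/* Age the current holder so a busy newcomer can eventually win. */
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		token_holder->priority /= 2;
		last_aging = global_faults;
	}

	if (tsk == token_holder) {
		tsk->priority += 2;		/* holder faulted again */
		return;
	}

	tsk->priority++;			/* competitor builds priority */
	if (tsk->priority <= token_holder->priority)
		return;
take:
	tsk->priority += 2;
	token_holder = tsk;
	last_aging = global_faults;
}

int main(void)
{
	struct task a = {0}, b = {0};
	int i;

	fault(&a);				/* a takes the token */
	for (i = 0; i < 600; i++)
		fault(&b);			/* b out-faults a and wins it */
	printf("holder is %s\n", token_holder == &b ? "b" : "a");
	return 0;
}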
diff --git a/mm/truncate.c b/mm/truncate.c index 3a29a618021..e13f22efaad 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range); | |||
304 | * @lstart: offset from which to truncate | 304 | * @lstart: offset from which to truncate |
305 | * | 305 | * |
306 | * Called under (and serialised by) inode->i_mutex. | 306 | * Called under (and serialised by) inode->i_mutex. |
307 | * | ||
308 | * Note: When this function returns, there can be a page in the process of | ||
309 | * deletion (inside __delete_from_page_cache()) in the specified range. Thus | ||
310 | * mapping->nrpages can be non-zero when this function returns even after | ||
311 | * truncation of the whole mapping. | ||
307 | */ | 312 | */ |
308 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | 313 | void truncate_inode_pages(struct address_space *mapping, loff_t lstart) |
309 | { | 314 | { |
@@ -603,3 +608,27 @@ int vmtruncate(struct inode *inode, loff_t offset) | |||
603 | return 0; | 608 | return 0; |
604 | } | 609 | } |
605 | EXPORT_SYMBOL(vmtruncate); | 610 | EXPORT_SYMBOL(vmtruncate); |
611 | |||
612 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | ||
613 | { | ||
614 | struct address_space *mapping = inode->i_mapping; | ||
615 | |||
616 | /* | ||
617 | * If the underlying filesystem is not going to provide | ||
618 | * a way to truncate a range of blocks (punch a hole) - | ||
619 | * we should return failure right now. | ||
620 | */ | ||
621 | if (!inode->i_op->truncate_range) | ||
622 | return -ENOSYS; | ||
623 | |||
624 | mutex_lock(&inode->i_mutex); | ||
625 | down_write(&inode->i_alloc_sem); | ||
626 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
627 | inode->i_op->truncate_range(inode, offset, end); | ||
628 | /* unmap again to remove racily COWed private pages */ | ||
629 | unmap_mapping_range(mapping, offset, (end - offset), 1); | ||
630 | up_write(&inode->i_alloc_sem); | ||
631 | mutex_unlock(&inode->i_mutex); | ||
632 | |||
633 | return 0; | ||
634 | } | ||
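vmtruncate_range() above is the hole-punch entry point: it refuses filesystems without ->truncate_range, unmaps the affected range, asks the filesystem to drop the blocks, and unmaps again to catch private pages COWed in the race window. From userspace this path is reached through madvise(MADV_REMOVE) on a shared mapping of a filesystem that supports it (tmpfs here); the small test below exercises exactly that, assuming a writable tmpfs mount at /dev/shm and a kernel that accepts MADV_REMOVE.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 4096 * 4;
	int fd = open("/dev/shm/punch-test", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;

	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return 1;

	memset(map, 0xaa, len);			/* populate all four pages */

	/* Punch the middle two pages: ends up in shmem_truncate_range(). */
	if (madvise(map + 4096, 4096 * 2, MADV_REMOVE) != 0)
		perror("MADV_REMOVE");

	printf("byte after the hole: %#x\n", map[4096]);	/* reads back 0 */

	munmap(map, len);
	close(fd);
	unlink("/dev/shm/punch-test");
	return 0;
}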
diff --git a/mm/vmscan.c b/mm/vmscan.c index faa0a088f9c..5ed24b94c5e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1124,8 +1124,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1124 | nr_lumpy_dirty++; | 1124 | nr_lumpy_dirty++; |
1125 | scan++; | 1125 | scan++; |
1126 | } else { | 1126 | } else { |
1127 | /* the page is freed already. */ | 1127 | /* |
1128 | if (!page_count(cursor_page)) | 1128 | * Check if the page is freed already. |
1129 | * | ||
1130 | * We can't use page_count() as that | ||
1131 | * requires compound_head and we don't | ||
1132 | * have a pin on the page here. If a | ||
1133 | * page is tail, we may or may not | ||
1134 | * have isolated the head, so assume | ||
1135 | * it's not free, it'd be tricky to | ||
1136 | * track the head status without a | ||
1137 | * page pin. | ||
1138 | */ | ||
1139 | if (!PageTail(cursor_page) && | ||
1140 | !atomic_read(&cursor_page->_count)) | ||
1129 | continue; | 1141 | continue; |
1130 | break; | 1142 | break; |
1131 | } | 1143 | } |
@@ -1983,14 +1995,13 @@ restart: | |||
1983 | * If a zone is deemed to be full of pinned pages then just give it a light | 1995 | * If a zone is deemed to be full of pinned pages then just give it a light |
1984 | * scan then give up on it. | 1996 | * scan then give up on it. |
1985 | */ | 1997 | */ |
1986 | static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | 1998 | static void shrink_zones(int priority, struct zonelist *zonelist, |
1987 | struct scan_control *sc) | 1999 | struct scan_control *sc) |
1988 | { | 2000 | { |
1989 | struct zoneref *z; | 2001 | struct zoneref *z; |
1990 | struct zone *zone; | 2002 | struct zone *zone; |
1991 | unsigned long nr_soft_reclaimed; | 2003 | unsigned long nr_soft_reclaimed; |
1992 | unsigned long nr_soft_scanned; | 2004 | unsigned long nr_soft_scanned; |
1993 | unsigned long total_scanned = 0; | ||
1994 | 2005 | ||
1995 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2006 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1996 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2007 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
@@ -2005,19 +2016,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, | |||
2005 | continue; | 2016 | continue; |
2006 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2017 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2007 | continue; /* Let kswapd poll it */ | 2018 | continue; /* Let kswapd poll it */ |
2019 | /* | ||
2020 | * This steals pages from memory cgroups over softlimit | ||
2021 | * and returns the number of reclaimed pages and | ||
2022 | * scanned pages. This works for global memory pressure | ||
2023 | * and balancing, not for a memcg's limit. | ||
2024 | */ | ||
2025 | nr_soft_scanned = 0; | ||
2026 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2027 | sc->order, sc->gfp_mask, | ||
2028 | &nr_soft_scanned); | ||
2029 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2030 | sc->nr_scanned += nr_soft_scanned; | ||
2031 | /* need some check to avoid calling shrink_zone() more than necessary */ | ||
2008 | } | 2032 | } |
2009 | 2033 | ||
2010 | nr_soft_scanned = 0; | ||
2011 | nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, | ||
2012 | sc->order, sc->gfp_mask, | ||
2013 | &nr_soft_scanned); | ||
2014 | sc->nr_reclaimed += nr_soft_reclaimed; | ||
2015 | total_scanned += nr_soft_scanned; | ||
2016 | |||
2017 | shrink_zone(priority, zone, sc); | 2034 | shrink_zone(priority, zone, sc); |
2018 | } | 2035 | } |
2019 | |||
2020 | return total_scanned; | ||
2021 | } | 2036 | } |
2022 | 2037 | ||
2023 | static bool zone_reclaimable(struct zone *zone) | 2038 | static bool zone_reclaimable(struct zone *zone) |
@@ -2081,8 +2096,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2081 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2096 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2082 | sc->nr_scanned = 0; | 2097 | sc->nr_scanned = 0; |
2083 | if (!priority) | 2098 | if (!priority) |
2084 | disable_swap_token(); | 2099 | disable_swap_token(sc->mem_cgroup); |
2085 | total_scanned += shrink_zones(priority, zonelist, sc); | 2100 | shrink_zones(priority, zonelist, sc); |
2086 | /* | 2101 | /* |
2087 | * Don't shrink slabs when reclaiming memory from | 2102 | * Don't shrink slabs when reclaiming memory from |
2088 | * over limit cgroups | 2103 | * over limit cgroups |
@@ -2311,7 +2326,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2311 | return true; | 2326 | return true; |
2312 | 2327 | ||
2313 | /* Check the watermark levels */ | 2328 | /* Check the watermark levels */ |
2314 | for (i = 0; i < pgdat->nr_zones; i++) { | 2329 | for (i = 0; i <= classzone_idx; i++) { |
2315 | struct zone *zone = pgdat->node_zones + i; | 2330 | struct zone *zone = pgdat->node_zones + i; |
2316 | 2331 | ||
2317 | if (!populated_zone(zone)) | 2332 | if (!populated_zone(zone)) |
@@ -2329,7 +2344,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2329 | } | 2344 | } |
2330 | 2345 | ||
2331 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), | 2346 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2332 | classzone_idx, 0)) | 2347 | i, 0)) |
2333 | all_zones_ok = false; | 2348 | all_zones_ok = false; |
2334 | else | 2349 | else |
2335 | balanced += zone->present_pages; | 2350 | balanced += zone->present_pages; |
@@ -2407,7 +2422,7 @@ loop_again: | |||
2407 | 2422 | ||
2408 | /* The swap token gets in the way of swapout... */ | 2423 | /* The swap token gets in the way of swapout... */ |
2409 | if (!priority) | 2424 | if (!priority) |
2410 | disable_swap_token(); | 2425 | disable_swap_token(NULL); |
2411 | 2426 | ||
2412 | all_zones_ok = 1; | 2427 | all_zones_ok = 1; |
2413 | balanced = 0; | 2428 | balanced = 0; |
@@ -2436,7 +2451,6 @@ loop_again: | |||
2436 | if (!zone_watermark_ok_safe(zone, order, | 2451 | if (!zone_watermark_ok_safe(zone, order, |
2437 | high_wmark_pages(zone), 0, 0)) { | 2452 | high_wmark_pages(zone), 0, 0)) { |
2438 | end_zone = i; | 2453 | end_zone = i; |
2439 | *classzone_idx = i; | ||
2440 | break; | 2454 | break; |
2441 | } | 2455 | } |
2442 | } | 2456 | } |
@@ -2495,18 +2509,18 @@ loop_again: | |||
2495 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2509 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2496 | if (!zone_watermark_ok_safe(zone, order, | 2510 | if (!zone_watermark_ok_safe(zone, order, |
2497 | high_wmark_pages(zone) + balance_gap, | 2511 | high_wmark_pages(zone) + balance_gap, |
2498 | end_zone, 0)) | 2512 | end_zone, 0)) { |
2499 | shrink_zone(priority, zone, &sc); | 2513 | shrink_zone(priority, zone, &sc); |
2500 | reclaim_state->reclaimed_slab = 0; | ||
2501 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | ||
2502 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2503 | total_scanned += sc.nr_scanned; | ||
2504 | 2514 | ||
2505 | if (zone->all_unreclaimable) | 2515 | reclaim_state->reclaimed_slab = 0; |
2506 | continue; | 2516 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); |
2507 | if (nr_slab == 0 && | 2517 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2508 | !zone_reclaimable(zone)) | 2518 | total_scanned += sc.nr_scanned; |
2509 | zone->all_unreclaimable = 1; | 2519 | |
2520 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2521 | zone->all_unreclaimable = 1; | ||
2522 | } | ||
2523 | |||
2510 | /* | 2524 | /* |
2511 | * If we've done a decent amount of scanning and | 2525 | * If we've done a decent amount of scanning and |
2512 | * the reclaim ratio is low, start doing writepage | 2526 | * the reclaim ratio is low, start doing writepage |
@@ -2516,6 +2530,12 @@ loop_again: | |||
2516 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2530 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2517 | sc.may_writepage = 1; | 2531 | sc.may_writepage = 1; |
2518 | 2532 | ||
2533 | if (zone->all_unreclaimable) { | ||
2534 | if (end_zone && end_zone == i) | ||
2535 | end_zone--; | ||
2536 | continue; | ||
2537 | } | ||
2538 | |||
2519 | if (!zone_watermark_ok_safe(zone, order, | 2539 | if (!zone_watermark_ok_safe(zone, order, |
2520 | high_wmark_pages(zone), end_zone, 0)) { | 2540 | high_wmark_pages(zone), end_zone, 0)) { |
2521 | all_zones_ok = 0; | 2541 | all_zones_ok = 0; |
@@ -2694,8 +2714,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2694 | */ | 2714 | */ |
2695 | static int kswapd(void *p) | 2715 | static int kswapd(void *p) |
2696 | { | 2716 | { |
2697 | unsigned long order; | 2717 | unsigned long order, new_order; |
2698 | int classzone_idx; | 2718 | int classzone_idx, new_classzone_idx; |
2699 | pg_data_t *pgdat = (pg_data_t*)p; | 2719 | pg_data_t *pgdat = (pg_data_t*)p; |
2700 | struct task_struct *tsk = current; | 2720 | struct task_struct *tsk = current; |
2701 | 2721 | ||
@@ -2725,17 +2745,23 @@ static int kswapd(void *p) | |||
2725 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 2745 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
2726 | set_freezable(); | 2746 | set_freezable(); |
2727 | 2747 | ||
2728 | order = 0; | 2748 | order = new_order = 0; |
2729 | classzone_idx = MAX_NR_ZONES - 1; | 2749 | classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; |
2730 | for ( ; ; ) { | 2750 | for ( ; ; ) { |
2731 | unsigned long new_order; | ||
2732 | int new_classzone_idx; | ||
2733 | int ret; | 2751 | int ret; |
2734 | 2752 | ||
2735 | new_order = pgdat->kswapd_max_order; | 2753 | /* |
2736 | new_classzone_idx = pgdat->classzone_idx; | 2754 | * If the last balance_pgdat was unsuccessful it's unlikely a |
2737 | pgdat->kswapd_max_order = 0; | 2755 | * new request of a similar or harder type will succeed soon |
2738 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | 2756 | * so consider going to sleep on the basis we reclaimed at |
2757 | */ | ||
2758 | if (classzone_idx >= new_classzone_idx && order == new_order) { | ||
2759 | new_order = pgdat->kswapd_max_order; | ||
2760 | new_classzone_idx = pgdat->classzone_idx; | ||
2761 | pgdat->kswapd_max_order = 0; | ||
2762 | pgdat->classzone_idx = pgdat->nr_zones - 1; | ||
2763 | } | ||
2764 | |||
2739 | if (order < new_order || classzone_idx > new_classzone_idx) { | 2765 | if (order < new_order || classzone_idx > new_classzone_idx) { |
2740 | /* | 2766 | /* |
2741 | * Don't sleep if someone wants a larger 'order' | 2767 | * Don't sleep if someone wants a larger 'order' |
@@ -2748,7 +2774,7 @@ static int kswapd(void *p) | |||
2748 | order = pgdat->kswapd_max_order; | 2774 | order = pgdat->kswapd_max_order; |
2749 | classzone_idx = pgdat->classzone_idx; | 2775 | classzone_idx = pgdat->classzone_idx; |
2750 | pgdat->kswapd_max_order = 0; | 2776 | pgdat->kswapd_max_order = 0; |
2751 | pgdat->classzone_idx = MAX_NR_ZONES - 1; | 2777 | pgdat->classzone_idx = pgdat->nr_zones - 1; |
2752 | } | 2778 | } |
2753 | 2779 | ||
2754 | ret = try_to_freeze(); | 2780 | ret = try_to_freeze(); |
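Finally, the kswapd changes make the wake-up parameters stickier (a new order/classzone_idx request is only picked up once the previous one has been serviced, and the reset value becomes pgdat->nr_zones - 1 instead of MAX_NR_ZONES - 1) and make sleeping_prematurely() walk only zones 0..classzone_idx, testing each zone's watermark against its own index. The snippet below is a hedged userspace model of that balance check: the struct layout, node_looks_balanced() and the plain free-vs-watermark comparison are simplifications standing in for zone_watermark_ok_safe(), and the real loop also skips all_unreclaimable zones.

#include <stdbool.h>
#include <stdio.h>

struct zone {
	unsigned long free_pages;
	unsigned long high_wmark;
	unsigned long present_pages;
	bool populated;
};

/* Only zones up to classzone_idx are checked, each against its own index. */
static bool node_looks_balanced(struct zone *zones, int classzone_idx,
				unsigned long *balanced_pages)
{
	bool all_zones_ok = true;
	int i;

	*balanced_pages = 0;
	for (i = 0; i <= classzone_idx; i++) {
		struct zone *z = &zones[i];

		if (!z->populated)
			continue;
		/* stands in for zone_watermark_ok_safe(z, order, high_wmark, i, 0) */
		if (z->free_pages < z->high_wmark)
			all_zones_ok = false;
		else
			*balanced_pages += z->present_pages;
	}
	return all_zones_ok;
}

int main(void)
{
	struct zone zones[3] = {
		{ .free_pages = 900, .high_wmark = 512, .present_pages = 4096, .populated = true },
		{ .free_pages = 100, .high_wmark = 512, .present_pages = 8192, .populated = true },
		{ .free_pages = 0,   .high_wmark = 0,   .present_pages = 0,    .populated = false },
	};
	unsigned long balanced;
	bool ok = node_looks_balanced(zones, 1, &balanced);

	printf("all_zones_ok=%d balanced_pages=%lu\n", ok, balanced);
	return 0;
}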