author:    David S. Miller <davem@davemloft.net>  2011-06-21 01:29:08 -0400
committer: David S. Miller <davem@davemloft.net>  2011-06-21 01:29:08 -0400
commit:    9f6ec8d697c08963d83880ccd35c13c5ace716ea (patch)
tree:      ad8d93cf6fcdd09b86ade09f5fcbbc66cdb1cca2 /mm
parent:    4aa3a715551c93eda32d79bd52042ce500bd5383 (diff)
parent:    56299378726d5f2ba8d3c8cbbd13cb280ba45e4f (diff)
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
Conflicts:
drivers/net/wireless/iwlwifi/iwl-agn-rxon.c
drivers/net/wireless/rtlwifi/pci.c
net/netfilter/ipvs/ip_vs_core.c
Diffstat (limited to 'mm')
 mm/compaction.c     |  76
 mm/filemap.c        |   2
 mm/huge_memory.c    |   5
 mm/hugetlb.c        |  12
 mm/ksm.c            |   6
 mm/memcontrol.c     |  81
 mm/memory-failure.c |   4
 mm/memory.c         |   8
 mm/memory_hotplug.c |   6
 mm/migrate.c        |   2
 mm/mmap.c           |  12
 mm/page_cgroup.c    |  71
 mm/rmap.c           | 106
 mm/slab.c           |   9
 mm/slub.c           |  12
 mm/thrash.c         | 105
 mm/vmscan.c         |  20
 17 files changed, 383 insertions(+), 154 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 021a2960ef9e..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -144,9 +144,20 @@ static void isolate_freepages(struct zone *zone,
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
+	/*
+	 * Initialise the free scanner. The starting point is where we last
+	 * scanned from (or the end of the zone if starting). The low point
+	 * is the end of the pageblock the migration scanner is using.
+	 */
 	pfn = cc->free_pfn;
 	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
-	high_pfn = low_pfn;
+
+	/*
+	 * Take care that if the migration scanner is at the end of the zone
+	 * that the free scanner does not accidentally move to the next zone
+	 * in the next isolation cycle.
+	 */
+	high_pfn = min(low_pfn, pfn);
 
 	/*
 	 * Isolate free pages until enough are available to migrate the
@@ -240,11 +251,18 @@ static bool too_many_isolated(struct zone *zone)
 	return isolated > (inactive + active) / 2;
 }
 
+/* possible outcome of isolate_migratepages */
+typedef enum {
+	ISOLATE_ABORT,		/* Abort compaction now */
+	ISOLATE_NONE,		/* No pages isolated, continue scanning */
+	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
+} isolate_migrate_t;
+
 /*
  * Isolate all pages that can be migrated from the block pointed to by
  * the migrate scanner within compact_control.
  */
-static unsigned long isolate_migratepages(struct zone *zone,
+static isolate_migrate_t isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
 	unsigned long low_pfn, end_pfn;
@@ -261,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 	/* Do not cross the free scanner or scan within a memory hole */
 	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
 		cc->migrate_pfn = end_pfn;
-		return 0;
+		return ISOLATE_NONE;
 	}
 
 	/*
@@ -270,10 +288,14 @@ static unsigned long isolate_migratepages(struct zone *zone,
 	 * delay for some time until fewer pages are isolated
 	 */
 	while (unlikely(too_many_isolated(zone))) {
+		/* async migration should just abort */
+		if (!cc->sync)
+			return ISOLATE_ABORT;
+
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
 
 		if (fatal_signal_pending(current))
-			return 0;
+			return ISOLATE_ABORT;
 	}
 
 	/* Time to isolate some pages for migration */
@@ -358,7 +380,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
 
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
-	return cc->nr_migratepages;
+	return ISOLATE_SUCCESS;
 }
 
 /*
@@ -420,13 +442,6 @@ static int compact_finished(struct zone *zone,
 	if (cc->free_pfn <= cc->migrate_pfn)
 		return COMPACT_COMPLETE;
 
-	/* Compaction run is not finished if the watermark is not met */
-	watermark = low_wmark_pages(zone);
-	watermark += (1 << cc->order);
-
-	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
-		return COMPACT_CONTINUE;
-
 	/*
 	 * order == -1 is expected when compacting via
 	 * /proc/sys/vm/compact_memory
@@ -434,6 +449,13 @@ static int compact_finished(struct zone *zone,
 	if (cc->order == -1)
 		return COMPACT_CONTINUE;
 
+	/* Compaction run is not finished if the watermark is not met */
+	watermark = low_wmark_pages(zone);
+	watermark += (1 << cc->order);
+
+	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+		return COMPACT_CONTINUE;
+
 	/* Direct compactor: Is a suitable page free? */
 	for (order = cc->order; order < MAX_ORDER; order++) {
 		/* Job done if page is free of the right migratetype */
@@ -461,6 +483,13 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	unsigned long watermark;
 
 	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
 	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
 	 * This is because during migration, copies of pages need to be
 	 * allocated and for a short time, the footprint is higher
@@ -470,17 +499,11 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 		return COMPACT_SKIPPED;
 
 	/*
-	 * order == -1 is expected when compacting via
-	 * /proc/sys/vm/compact_memory
-	 */
-	if (order == -1)
-		return COMPACT_CONTINUE;
-
-	/*
 	 * fragmentation index determines if allocation failures are due to
 	 * low memory or external fragmentation
 	 *
-	 * index of -1 implies allocations might succeed dependingon watermarks
+	 * index of -1000 implies allocations might succeed depending on
+	 * watermarks
 	 * index towards 0 implies failure is due to lack of memory
 	 * index towards 1000 implies failure is due to fragmentation
 	 *
@@ -490,7 +513,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
 		return COMPACT_SKIPPED;
 
-	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+	if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
+			0, 0))
 		return COMPACT_PARTIAL;
 
 	return COMPACT_CONTINUE;
@@ -522,8 +546,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		unsigned long nr_migrate, nr_remaining;
 		int err;
 
-		if (!isolate_migratepages(zone, cc))
+		switch (isolate_migratepages(zone, cc)) {
+		case ISOLATE_ABORT:
+			ret = COMPACT_PARTIAL;
+			goto out;
+		case ISOLATE_NONE:
 			continue;
+		case ISOLATE_SUCCESS:
+			;
+		}
 
 		nr_migrate = cc->nr_migratepages;
 		err = migrate_pages(&cc->migratepages, compaction_alloc,
@@ -547,6 +578,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 	}
 
+out:
 	/* Release free pages and check accounting */
 	cc->nr_freepages -= release_freepages(&cc->freepages);
 	VM_BUG_ON(cc->nr_freepages != 0);
diff --git a/mm/filemap.c b/mm/filemap.c
index d7b10578a64b..a8251a8d3457 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2000,7 +2000,7 @@ int file_remove_suid(struct file *file)
 	error = security_inode_killpriv(dentry);
 	if (!error && killsuid)
 		error = __remove_suid(dentry, killsuid);
-	if (!error)
+	if (!error && (inode->i_sb->s_flags & MS_NOSEC))
 		inode->i_flags |= S_NOSEC;
 
 	return error;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 615d9743a3cb..81532f297fd2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2234,11 +2234,8 @@ static void khugepaged_loop(void)
 	while (likely(khugepaged_enabled())) {
 #ifndef CONFIG_NUMA
 		hpage = khugepaged_alloc_hugepage();
-		if (unlikely(!hpage)) {
-			count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+		if (unlikely(!hpage))
 			break;
-		}
-		count_vm_event(THP_COLLAPSE_ALLOC);
 #else
 		if (IS_ERR(hpage)) {
 			khugepaged_alloc_sleep();
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f33bb319b73f..bfcf153bc829 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1033,10 +1033,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	 */
 	chg = vma_needs_reservation(h, vma, addr);
 	if (chg < 0)
-		return ERR_PTR(chg);
+		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
 		if (hugetlb_get_quota(inode->i_mapping, chg))
-			return ERR_PTR(-ENOSPC);
+			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
@@ -1111,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
 		WARN_ON(page_count(page) != 1);
 		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
+		/*
+		 * If we had gigantic hugepages allocated at boot time, we need
+		 * to restore the 'stolen' pages to totalram_pages in order to
+		 * fix confusing memory reports from free(1) and another
+		 * side-effects, like CommitLimit going negative.
+		 */
+		if (h->order > (MAX_ORDER - 1))
+			totalram_pages += 1 << h->order;
 	}
 }
 
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1302,6 +1302,12 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
 		slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
 		ksm_scan.mm_slot = slot;
 		spin_unlock(&ksm_mmlist_lock);
+		/*
+		 * Although we tested list_empty() above, a racing __ksm_exit
+		 * of the last mm on the list may have removed it since then.
+		 */
+		if (slot == &ksm_mm_head)
+			return NULL;
 next_mm:
 	ksm_scan.address = 0;
 	ksm_scan.rmap_list = &slot->rmap_list;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bd9052a5d3ad..cf7d027a8844 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -359,7 +359,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
-static void drain_all_stock_async(void);
+static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -735,7 +735,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 				struct mem_cgroup, css);
 }
 
-static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
 
@@ -1663,15 +1663,21 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
-	if (root_mem->memsw_is_minimum)
+	if (!check_soft && root_mem->memsw_is_minimum)
 		noswap = true;
 
 	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
-			if (loop >= 1)
-				drain_all_stock_async();
+			/*
+			 * We are not draining per cpu cached charges during
+			 * soft limit reclaim because global reclaim doesn't
+			 * care about charges. It tries to free some memory and
+			 * charges will not give any.
+			 */
+			if (!check_soft && loop >= 1)
+				drain_all_stock_async(root_mem);
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1934,9 +1940,11 @@ struct memcg_stock_pcp {
 	struct mem_cgroup *cached; /* this never be root cgroup */
 	unsigned int nr_pages;
 	struct work_struct work;
+	unsigned long flags;
+#define FLUSHING_CACHED_CHARGE	(0)
 };
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
-static atomic_t memcg_drain_count;
+static DEFINE_MUTEX(percpu_charge_mutex);
 
 /*
  * Try to consume stocked charge on this cpu. If success, one page is consumed
@@ -1984,6 +1992,7 @@ static void drain_local_stock(struct work_struct *dummy)
 {
 	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
 	drain_stock(stock);
+	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
 /*
@@ -2008,26 +2017,45 @@ static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
  * expects some charges will be back to res_counter later but cannot wait for
  * it.
  */
-static void drain_all_stock_async(void)
+static void drain_all_stock_async(struct mem_cgroup *root_mem)
 {
-	int cpu;
-	/* This function is for scheduling "drain" in asynchronous way.
-	 * The result of "drain" is not directly handled by callers. Then,
-	 * if someone is calling drain, we don't have to call drain more.
-	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
-	 * there is a race. We just do loose check here.
+	int cpu, curcpu;
+	/*
+	 * If someone calls draining, avoid adding more kworker runs.
 	 */
-	if (atomic_read(&memcg_drain_count))
+	if (!mutex_trylock(&percpu_charge_mutex))
 		return;
 	/* Notify other cpus that system-wide "drain" is running */
-	atomic_inc(&memcg_drain_count);
 	get_online_cpus();
+	/*
+	 * Get a hint for avoiding draining charges on the current cpu,
+	 * which must be exhausted by our charging. It is not required that
+	 * this be a precise check, so we use raw_smp_processor_id() instead of
+	 * getcpu()/putcpu().
+	 */
+	curcpu = raw_smp_processor_id();
 	for_each_online_cpu(cpu) {
 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
-		schedule_work_on(cpu, &stock->work);
+		struct mem_cgroup *mem;
+
+		if (cpu == curcpu)
+			continue;
+
+		mem = stock->cached;
+		if (!mem)
+			continue;
+		if (mem != root_mem) {
+			if (!root_mem->use_hierarchy)
+				continue;
+			/* check whether "mem" is under tree of "root_mem" */
+			if (!css_is_ancestor(&mem->css, &root_mem->css))
+				continue;
+		}
+		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
+			schedule_work_on(cpu, &stock->work);
 	}
 	put_online_cpus();
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 	/* We don't wait for flush_work */
 }
 
@@ -2035,9 +2063,9 @@ static void drain_all_stock_async(void)
 static void drain_all_stock_sync(void)
 {
 	/* called when force_empty is called */
-	atomic_inc(&memcg_drain_count);
+	mutex_lock(&percpu_charge_mutex);
 	schedule_on_each_cpu(drain_local_stock);
-	atomic_dec(&memcg_drain_count);
+	mutex_unlock(&percpu_charge_mutex);
 }
 
 /*
@@ -4640,6 +4668,7 @@ static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "numa_stat",
 		.open = mem_control_numa_stat_open,
+		.mode = S_IRUGO,
 	},
 #endif
 };
@@ -5414,18 +5443,16 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *old_cont,
 				struct task_struct *p)
 {
-	struct mm_struct *mm;
+	struct mm_struct *mm = get_task_mm(p);
 
-	if (!mc.to)
-		/* no need to move charge */
-		return;
-
-	mm = get_task_mm(p);
 	if (mm) {
-		mem_cgroup_move_charge(mm);
+		if (mc.to)
+			mem_cgroup_move_charge(mm);
+		put_swap_token(mm);
 		mmput(mm);
 	}
-	mem_cgroup_clear_mc();
+	if (mc.to)
+		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5c8f7e08928d..eac0ba561491 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -52,6 +52,7 @@
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
 #include <linux/memory_hotplug.h>
+#include <linux/mm_inline.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1468,7 +1469,8 @@ int soft_offline_page(struct page *page, int flags)
 	put_page(page);
 	if (!ret) {
 		LIST_HEAD(pagelist);
-
+		inc_zone_page_state(page, NR_ISOLATED_ANON +
+					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
 		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
 								0, true);
diff --git a/mm/memory.c b/mm/memory.c
index 6953d3926e01..87d935333f0d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1112,11 +1112,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	int force_flush = 0;
 	int rss[NR_MM_COUNTERS];
 	spinlock_t *ptl;
+	pte_t *start_pte;
 	pte_t *pte;
 
 again:
 	init_rss_vec(rss);
-	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	pte = start_pte;
 	arch_enter_lazy_mmu_mode();
 	do {
 		pte_t ptent = *pte;
@@ -1196,7 +1198,7 @@ again:
 
 	add_mm_rss_vec(mm, rss);
 	arch_leave_lazy_mmu_mode();
-	pte_unmap_unlock(pte - 1, ptl);
+	pte_unmap_unlock(start_pte, ptl);
 
 	/*
 	 * mmu_gather ran out of room to batch pages, we break out of
@@ -1296,7 +1298,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 
 /**
  * unmap_vmas - unmap a range of memory covered by a list of vma's
- * @tlbp: address of the caller's struct mmu_gather
+ * @tlb: address of the caller's struct mmu_gather
  * @vma: the starting vma
  * @start_addr: virtual address at which to start unmapping
  * @end_addr: virtual address at which to end unmapping
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9f646374e32f..02159c755136 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -494,6 +494,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 	/* init node's zones as empty zones, we don't have any present pages.*/
 	free_area_init_node(nid, zones_size, start_pfn, zholes_size);
 
+	/*
+	 * The node we allocated has no zone fallback lists. For avoiding
+	 * to access not-initialized zonelist, build here.
+	 */
+	build_all_zonelists(NULL);
+
 	return pgdat;
 }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index e4a5c912983d..666e4e677414 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -288,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	__inc_zone_page_state(newpage, NR_FILE_PAGES);
-	if (PageSwapBacked(page)) {
+	if (!PageSwapCache(page) && PageSwapBacked(page)) {
 		__dec_zone_page_state(page, NR_SHMEM);
 		__inc_zone_page_state(newpage, NR_SHMEM);
 	}
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -906,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
 	if (anon_vma)
 		return anon_vma;
 try_prev:
-	/*
-	 * It is potentially slow to have to call find_vma_prev here.
-	 * But it's only on the first write fault on the vma, not
-	 * every time, and we could devise a way to avoid it later
-	 * (e.g. stash info in next's anon_vma_node when assigning
-	 * an anon_vma, or when trying vma_merge). Another time.
-	 */
-	BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
+	near = vma->vm_prev;
 	if (!near)
 		goto none;
 
@@ -2044,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 		return -EINVAL;
 
 	/* Find the first overlapping VMA */
-	vma = find_vma_prev(mm, start, &prev);
+	vma = find_vma(mm, start);
 	if (!vma)
 		return 0;
+	prev = vma->vm_prev;
 	/* we have start < vma->vm_end */
 
 	/* if it doesn't overlap, we have nothing.. */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 74ccff61d1be..53bffc6c293e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -162,13 +162,13 @@ static void free_page_cgroup(void *addr)
 }
 #endif
 
-static int __meminit init_section_page_cgroup(unsigned long pfn)
+static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
 {
 	struct page_cgroup *base, *pc;
 	struct mem_section *section;
 	unsigned long table_size;
 	unsigned long nr;
-	int nid, index;
+	int index;
 
 	nr = pfn_to_section_nr(pfn);
 	section = __nr_to_section(nr);
@@ -176,7 +176,6 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 	if (section->page_cgroup)
 		return 0;
 
-	nid = page_to_nid(pfn_to_page(pfn));
 	table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
 	base = alloc_page_cgroup(table_size, nid);
 
@@ -196,7 +195,11 @@ static int __meminit init_section_page_cgroup(unsigned long pfn)
 		pc = base + index;
 		init_page_cgroup(pc, nr);
 	}
-
+	/*
+	 * The passed "pfn" may not be aligned to SECTION. For the calculation
+	 * we need to apply a mask.
+	 */
+	pfn &= PAGE_SECTION_MASK;
 	section->page_cgroup = base - pfn;
 	total_usage += table_size;
 	return 0;
@@ -225,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
 	start = start_pfn & ~(PAGES_PER_SECTION - 1);
 	end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
 
+	if (nid == -1) {
+		/*
+		 * In this case, "nid" already exists and contains valid memory.
+		 * "start_pfn" passed to us is a pfn which is an arg for
+		 * online__pages(), and start_pfn should exist.
+		 */
+		nid = pfn_to_nid(start_pfn);
+		VM_BUG_ON(!node_state(nid, N_ONLINE));
+	}
+
 	for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
 		if (!pfn_present(pfn))
 			continue;
-		fail = init_section_page_cgroup(pfn);
+		fail = init_section_page_cgroup(pfn, nid);
 	}
 	if (!fail)
 		return 0;
@@ -284,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
 void __init page_cgroup_init(void)
 {
 	unsigned long pfn;
-	int fail = 0;
+	int nid;
 
 	if (mem_cgroup_disabled())
 		return;
 
-	for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
-		if (!pfn_present(pfn))
-			continue;
-		fail = init_section_page_cgroup(pfn);
-	}
-	if (fail) {
-		printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
-		panic("Out of memory");
-	} else {
-		hotplug_memory_notifier(page_cgroup_callback, 0);
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		unsigned long start_pfn, end_pfn;
+
+		start_pfn = node_start_pfn(nid);
+		end_pfn = node_end_pfn(nid);
+		/*
+		 * start_pfn and end_pfn may not be aligned to SECTION and the
+		 * page->flags of out of node pages are not initialized. So we
+		 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
+		 */
+		for (pfn = start_pfn;
+		     pfn < end_pfn;
+		     pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
+
+			if (!pfn_valid(pfn))
+				continue;
+			/*
+			 * Nodes's pfns can be overlapping.
+			 * We know some arch can have a nodes layout such as
+			 * -------------pfn-------------->
+			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 */
+			if (pfn_to_nid(pfn) != nid)
+				continue;
+			if (init_section_page_cgroup(pfn, nid))
+				goto oom;
+		}
 	}
+	hotplug_memory_notifier(page_cgroup_callback, 0);
 	printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
-	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't"
-	" want memory cgroups\n");
+	printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
+	"don't want memory cgroups\n");
+	return;
+oom:
+	printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
+	panic("Out of memory");
 }
 
 void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -112,9 +112,9 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
 	kmem_cache_free(anon_vma_cachep, anon_vma);
 }
 
-static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
+static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
 {
-	return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
+	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
 }
 
 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
@@ -159,7 +159,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 		struct mm_struct *mm = vma->vm_mm;
 		struct anon_vma *allocated;
 
-		avc = anon_vma_chain_alloc();
+		avc = anon_vma_chain_alloc(GFP_KERNEL);
 		if (!avc)
 			goto out_enomem;
 
@@ -200,6 +200,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 	return -ENOMEM;
 }
 
+/*
+ * This is a useful helper function for locking the anon_vma root as
+ * we traverse the vma->anon_vma_chain, looping over anon_vma's that
+ * have the same vma.
+ *
+ * Such anon_vma's should have the same root, so you'd expect to see
+ * just a single mutex_lock for the whole traversal.
+ */
+static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
+{
+	struct anon_vma *new_root = anon_vma->root;
+	if (new_root != root) {
+		if (WARN_ON_ONCE(root))
+			mutex_unlock(&root->mutex);
+		root = new_root;
+		mutex_lock(&root->mutex);
+	}
+	return root;
+}
+
+static inline void unlock_anon_vma_root(struct anon_vma *root)
+{
+	if (root)
+		mutex_unlock(&root->mutex);
+}
+
 static void anon_vma_chain_link(struct vm_area_struct *vma,
 				struct anon_vma_chain *avc,
 				struct anon_vma *anon_vma)
@@ -208,13 +234,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	avc->anon_vma = anon_vma;
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
 
-	anon_vma_lock(anon_vma);
 	/*
 	 * It's critical to add new vmas to the tail of the anon_vma,
 	 * see comment in huge_memory.c:__split_huge_page().
 	 */
 	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
-	anon_vma_unlock(anon_vma);
 }
 
 /*
@@ -224,13 +248,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
 	struct anon_vma_chain *avc, *pavc;
+	struct anon_vma *root = NULL;
 
 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
-		avc = anon_vma_chain_alloc();
-		if (!avc)
-			goto enomem_failure;
-		anon_vma_chain_link(dst, avc, pavc->anon_vma);
+		struct anon_vma *anon_vma;
+
+		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
+		if (unlikely(!avc)) {
+			unlock_anon_vma_root(root);
+			root = NULL;
+			avc = anon_vma_chain_alloc(GFP_KERNEL);
+			if (!avc)
+				goto enomem_failure;
+		}
+		anon_vma = pavc->anon_vma;
+		root = lock_anon_vma_root(root, anon_vma);
+		anon_vma_chain_link(dst, avc, anon_vma);
 	}
+	unlock_anon_vma_root(root);
 	return 0;
 
 enomem_failure:
@@ -263,7 +298,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	anon_vma = anon_vma_alloc();
 	if (!anon_vma)
 		goto out_error;
-	avc = anon_vma_chain_alloc();
+	avc = anon_vma_chain_alloc(GFP_KERNEL);
 	if (!avc)
 		goto out_error_free_anon_vma;
 
@@ -280,7 +315,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	get_anon_vma(anon_vma->root);
 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
 	vma->anon_vma = anon_vma;
+	anon_vma_lock(anon_vma);
 	anon_vma_chain_link(vma, avc, anon_vma);
+	anon_vma_unlock(anon_vma);
 
 	return 0;
 
@@ -291,36 +328,43 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 	return -ENOMEM;
 }
 
-static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
-{
-	struct anon_vma *anon_vma = anon_vma_chain->anon_vma;
-	int empty;
-
-	/* If anon_vma_fork fails, we can get an empty anon_vma_chain. */
-	if (!anon_vma)
-		return;
-
-	anon_vma_lock(anon_vma);
-	list_del(&anon_vma_chain->same_anon_vma);
-
-	/* We must garbage collect the anon_vma if it's empty */
-	empty = list_empty(&anon_vma->head);
-	anon_vma_unlock(anon_vma);
-
-	if (empty)
-		put_anon_vma(anon_vma);
-}
-
 void unlink_anon_vmas(struct vm_area_struct *vma)
 {
 	struct anon_vma_chain *avc, *next;
+	struct anon_vma *root = NULL;
 
 	/*
 	 * Unlink each anon_vma chained to the VMA. This list is ordered
 	 * from newest to oldest, ensuring the root anon_vma gets freed last.
 	 */
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
-		anon_vma_unlink(avc);
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		root = lock_anon_vma_root(root, anon_vma);
+		list_del(&avc->same_anon_vma);
+
+		/*
+		 * Leave empty anon_vmas on the list - we'll need
+		 * to free them outside the lock.
+		 */
+		if (list_empty(&anon_vma->head))
+			continue;
+
+		list_del(&avc->same_vma);
+		anon_vma_chain_free(avc);
+	}
+	unlock_anon_vma_root(root);
+
+	/*
+	 * Iterate the list once more, it now only contains empty and unlinked
+	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
+	 * needing to acquire the anon_vma->root->mutex.
+	 */
+	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
+		struct anon_vma *anon_vma = avc->anon_vma;
+
+		put_anon_vma(anon_vma);
+
 		list_del(&avc->same_vma);
 		anon_vma_chain_free(avc);
 	}
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3604,13 +3604,14 @@ free_done:
  * Release an obj back to its cache. If the obj has a constructed state, it must
  * be in this state _before_ it is released. Called with disabled ints.
  */
-static inline void __cache_free(struct kmem_cache *cachep, void *objp)
+static inline void __cache_free(struct kmem_cache *cachep, void *objp,
+    void *caller)
 {
 	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
 	kmemleak_free_recursive(objp, cachep->flags);
-	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
+	objp = cache_free_debugcheck(cachep, objp, caller);
 
 	kmemcheck_slab_free(cachep, objp, obj_size(cachep));
 
@@ -3801,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 	debug_check_no_locks_freed(objp, obj_size(cachep));
 	if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
 		debug_check_no_obj_freed(objp, obj_size(cachep));
-	__cache_free(cachep, objp);
+	__cache_free(cachep, objp, __builtin_return_address(0));
 	local_irq_restore(flags);
 
 	trace_kmem_cache_free(_RET_IP_, objp);
@@ -3831,7 +3832,7 @@ void kfree(const void *objp)
 	c = virt_to_cache(objp);
 	debug_check_no_locks_freed(objp, obj_size(c));
 	debug_check_no_obj_freed(objp, obj_size(c));
-	__cache_free(c, (void *)objp);
+	__cache_free(c, (void *)objp, __builtin_return_address(0));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL(kfree);
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2320,16 +2320,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
 	BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
 			SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
-#ifdef CONFIG_CMPXCHG_LOCAL
 	/*
-	 * Must align to double word boundary for the double cmpxchg instructions
-	 * to work.
+	 * Must align to double word boundary for the double cmpxchg
+	 * instructions to work; see __pcpu_double_call_return_bool().
 	 */
-	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
-#else
-	/* Regular alignment is sufficient */
-	s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
-#endif
+	s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
+				     2 * sizeof(void *));
 
 	if (!s->cpu_slab)
 		return 0;
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd8..fabf2d0f5169 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
+#include <linux/memcontrol.h>
+
+#include <trace/events/vmscan.h>
+
+#define TOKEN_AGING_INTERVAL (0xFF)
 
 static DEFINE_SPINLOCK(swap_token_lock);
 struct mm_struct *swap_token_mm;
+struct mem_cgroup *swap_token_memcg;
 static unsigned int global_faults;
+static unsigned int last_aging;
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+
+	memcg = try_get_mem_cgroup_from_mm(mm);
+	if (memcg)
+		css_put(mem_cgroup_css(memcg));
+
+	return memcg;
+}
+#else
+static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
+{
+	return NULL;
+}
+#endif
 
 void grab_swap_token(struct mm_struct *mm)
 {
 	int current_interval;
+	unsigned int old_prio = mm->token_priority;
 
 	global_faults++;
 
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
 		return;
 
 	/* First come first served */
-	if (swap_token_mm == NULL) {
-		mm->token_priority = mm->token_priority + 2;
-		swap_token_mm = mm;
-		goto out;
+	if (!swap_token_mm)
+		goto replace_token;
+
+	if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
+		swap_token_mm->token_priority /= 2;
+		last_aging = global_faults;
 	}
 
-	if (mm != swap_token_mm) {
-		if (current_interval < mm->last_interval)
-			mm->token_priority++;
-		else {
-			if (likely(mm->token_priority > 0))
-				mm->token_priority--;
-		}
-		/* Check if we deserve the token */
-		if (mm->token_priority > swap_token_mm->token_priority) {
-			mm->token_priority += 2;
-			swap_token_mm = mm;
-		}
-	} else {
-		/* Token holder came in again! */
+	if (mm == swap_token_mm) {
 		mm->token_priority += 2;
+		goto update_priority;
+	}
+
+	if (current_interval < mm->last_interval)
+		mm->token_priority++;
+	else {
+		if (likely(mm->token_priority > 0))
+			mm->token_priority--;
 	}
 
+	/* Check if we deserve the token */
+	if (mm->token_priority > swap_token_mm->token_priority)
+		goto replace_token;
+
+update_priority:
+	trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
+
 out:
 	mm->faultstamp = global_faults;
 	mm->last_interval = current_interval;
 	spin_unlock(&swap_token_lock);
+	return;
+
+replace_token:
+	mm->token_priority += 2;
+	trace_replace_swap_token(swap_token_mm, mm);
+	swap_token_mm = mm;
+	swap_token_memcg = swap_token_memcg_from_mm(mm);
+	last_aging = global_faults;
+	goto out;
 }
 
 /* Called on process exit. */
 void __put_swap_token(struct mm_struct *mm)
 {
 	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm))
+	if (likely(mm == swap_token_mm)) {
+		trace_put_swap_token(swap_token_mm);
 		swap_token_mm = NULL;
+		swap_token_memcg = NULL;
+	}
 	spin_unlock(&swap_token_lock);
 }
+
+static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
+{
+	if (!a)
+		return true;
+	if (!b)
+		return true;
+	if (a == b)
+		return true;
+	return false;
+}
+
+void disable_swap_token(struct mem_cgroup *memcg)
+{
+	/* memcg reclaim don't disable unrelated mm token. */
+	if (match_memcg(memcg, swap_token_memcg)) {
+		spin_lock(&swap_token_lock);
+		if (match_memcg(memcg, swap_token_memcg)) {
+			trace_disable_swap_token(swap_token_mm);
+			swap_token_mm = NULL;
+			swap_token_memcg = NULL;
+		}
+		spin_unlock(&swap_token_lock);
+	}
+}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index faa0a088f9cc..8ff834e19c24 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1124,8 +1124,20 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 					nr_lumpy_dirty++;
 				scan++;
 			} else {
-				/* the page is freed already. */
-				if (!page_count(cursor_page))
+				/*
+				 * Check if the page is freed already.
+				 *
+				 * We can't use page_count() as that
+				 * requires compound_head and we don't
+				 * have a pin on the page here. If a
+				 * page is tail, we may or may not
+				 * have isolated the head, so assume
+				 * it's not free, it'd be tricky to
+				 * track the head status without a
+				 * page pin.
+				 */
+				if (!PageTail(cursor_page) &&
+				    !atomic_read(&cursor_page->_count))
 					continue;
 				break;
 			}
@@ -2081,7 +2093,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc->nr_scanned = 0;
 		if (!priority)
-			disable_swap_token();
+			disable_swap_token(sc->mem_cgroup);
 		total_scanned += shrink_zones(priority, zonelist, sc);
 		/*
 		 * Don't shrink slabs when reclaiming memory from
@@ -2407,7 +2419,7 @@ loop_again:
 
 		/* The swap token gets in the way of swapout... */
 		if (!priority)
-			disable_swap_token();
+			disable_swap_token(NULL);
 
 		all_zones_ok = 1;
 		balanced = 0;