author     Alexander Graf <agraf@suse.de>   2013-08-28 18:41:59 -0400
committer  Alexander Graf <agraf@suse.de>   2013-08-28 18:41:59 -0400
commit     bf550fc93d9855872a95e69e4002256110d89858 (patch)
tree       10876bb4304bffe54c4160a132e7b8de6577ac4e /mm
parent     7e48c101e0c53e6095c5f4f5e63d14df50aae8fc (diff)
parent     cc2df20c7c4ce594c3e17e9cc260c330646012c8 (diff)
Merge remote-tracking branch 'origin/next' into kvm-ppc-next
Conflicts:
mm/Kconfig
CMA DMA split and ZSWAP introduction were conflicting, fix up manually.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  42
-rw-r--r--  mm/Makefile           |   2
-rw-r--r--  mm/backing-dev.c      |   5
-rw-r--r--  mm/bootmem.c          |  39
-rw-r--r--  mm/filemap.c          |   6
-rw-r--r--  mm/huge_memory.c      |  30
-rw-r--r--  mm/hugetlb.c          |   4
-rw-r--r--  mm/internal.h         |   5
-rw-r--r--  mm/memblock.c         |   2
-rw-r--r--  mm/memcontrol.c       | 363
-rw-r--r--  mm/memory-failure.c   |  22
-rw-r--r--  mm/memory.c           |  15
-rw-r--r--  mm/memory_hotplug.c   | 139
-rw-r--r--  mm/mm_init.c          |  47
-rw-r--r--  mm/mmap.c             |  40
-rw-r--r--  mm/mmu_notifier.c     |   2
-rw-r--r--  mm/mremap.c           |  20
-rw-r--r--  mm/nobootmem.c        |  35
-rw-r--r--  mm/nommu.c            |  10
-rw-r--r--  mm/page_alloc.c       | 384
-rw-r--r--  mm/page_io.c          |  50
-rw-r--r--  mm/pgtable-generic.c  |   5
-rw-r--r--  mm/rmap.c             |   9
-rw-r--r--  mm/shmem.c            |  16
-rw-r--r--  mm/slab.c             |  51
-rw-r--r--  mm/slab.h             |   3
-rw-r--r--  mm/slab_common.c      |  18
-rw-r--r--  mm/slob.c             |   4
-rw-r--r--  mm/slub.c             |  38
-rw-r--r--  mm/sparse.c           |   8
-rw-r--r--  mm/swap.c             | 106
-rw-r--r--  mm/swapfile.c         |  55
-rw-r--r--  mm/util.c             |   1
-rw-r--r--  mm/vmalloc.c          | 164
-rw-r--r--  mm/vmscan.c           | 605
-rw-r--r--  mm/zbud.c             | 527
-rw-r--r--  mm/zswap.c            | 943
37 files changed, 2862 insertions(+), 953 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 81bcb4bd422d..6cdd27043303 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -501,3 +501,45 @@ config CMA_DEBUG | |||
501 | messages for every CMA call as well as various messages while | 501 | messages for every CMA call as well as various messages while |
502 | processing calls such as dma_alloc_from_contiguous(). | 502 | processing calls such as dma_alloc_from_contiguous(). |
503 | This option does not affect warning and error messages. | 503 | This option does not affect warning and error messages. |
504 | |||
505 | config ZBUD | ||
506 | tristate | ||
507 | default n | ||
508 | help | ||
509 | A special purpose allocator for storing compressed pages. | ||
510 | It is designed to store up to two compressed pages per physical | ||
511 | page. While this design limits storage density, it has simple and | ||
512 | deterministic reclaim properties that make it preferable to a higher | ||
513 | density approach when reclaim will be used. | ||
514 | |||
515 | config ZSWAP | ||
516 | bool "Compressed cache for swap pages (EXPERIMENTAL)" | ||
517 | depends on FRONTSWAP && CRYPTO=y | ||
518 | select CRYPTO_LZO | ||
519 | select ZBUD | ||
520 | default n | ||
521 | help | ||
522 | A lightweight compressed cache for swap pages. It takes | ||
523 | pages that are in the process of being swapped out and attempts to | ||
524 | compress them into a dynamically allocated RAM-based memory pool. | ||
525 | This can result in a significant I/O reduction on swap device and, | ||
526 | in the case where decompressing from RAM is faster that swap device | ||
527 | reads, can also improve workload performance. | ||
528 | |||
529 | This is marked experimental because it is a new feature (as of | ||
530 | v3.11) that interacts heavily with memory reclaim. While these | ||
531 | interactions don't cause any known issues on simple memory setups, | ||
532 | they have not be fully explored on the large set of potential | ||
533 | configurations and workloads that exist. | ||
534 | |||
535 | config MEM_SOFT_DIRTY | ||
536 | bool "Track memory changes" | ||
537 | depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY | ||
538 | select PROC_PAGE_MONITOR | ||
539 | help | ||
540 | This option enables memory changes tracking by introducing a | ||
541 | soft-dirty bit on pte-s. This bit it set when someone writes | ||
542 | into a page just as regular dirty bit, but unlike the latter | ||
543 | it can be cleared by hands. | ||
544 | |||
545 | See Documentation/vm/soft-dirty.txt for more details. | ||
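The ZSWAP entry added above only makes the feature buildable; the option still has to be selected in the kernel configuration and the cache switched on at runtime. A minimal sketch of such a setup, assuming a v3.11-era .config and the zswap.enabled module parameter exposed by the mm/zswap.c code this merge brings in (the exact option set shown is an illustration, not part of this commit):

    # .config fragment: enable the dependencies and the new option
    CONFIG_FRONTSWAP=y
    CONFIG_CRYPTO=y
    CONFIG_ZSWAP=y
    # CONFIG_ZBUD and CONFIG_CRYPTO_LZO are pulled in by the 'select' lines above

    # kernel command line: activate the compressed cache at boot
    zswap.enabled=1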
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..f00803386a67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | |||
32 | obj-$(CONFIG_BOUNCE) += bounce.o | 32 | obj-$(CONFIG_BOUNCE) += bounce.o |
33 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 33 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
34 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | 34 | obj-$(CONFIG_FRONTSWAP) += frontswap.o |
35 | obj-$(CONFIG_ZSWAP) += zswap.o | ||
35 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 36 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
36 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 37 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
37 | obj-$(CONFIG_NUMA) += mempolicy.o | 38 | obj-$(CONFIG_NUMA) += mempolicy.o |
@@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | |||
58 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 59 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
59 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 60 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
60 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | 61 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o |
62 | obj-$(CONFIG_ZBUD) += zbud.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy); | |||
515 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | 515 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, |
516 | unsigned int cap) | 516 | unsigned int cap) |
517 | { | 517 | { |
518 | char tmp[32]; | ||
519 | int err; | 518 | int err; |
520 | 519 | ||
521 | bdi->name = name; | 520 | bdi->name = name; |
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | |||
524 | if (err) | 523 | if (err) |
525 | return err; | 524 | return err; |
526 | 525 | ||
527 | sprintf(tmp, "%.28s%s", name, "-%d"); | 526 | err = bdi_register(bdi, NULL, "%.28s-%ld", name, |
528 | err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); | 527 | atomic_long_inc_return(&bdi_seq)); |
529 | if (err) { | 528 | if (err) { |
530 | bdi_destroy(bdi); | 529 | bdi_destroy(bdi); |
531 | return err; | 530 | return err; |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
241 | return count; | 241 | return count; |
242 | } | 242 | } |
243 | 243 | ||
244 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | 244 | static int reset_managed_pages_done __initdata; |
245 | |||
246 | static inline void __init reset_node_managed_pages(pg_data_t *pgdat) | ||
245 | { | 247 | { |
246 | struct zone *z; | 248 | struct zone *z; |
247 | 249 | ||
248 | /* | 250 | if (reset_managed_pages_done) |
249 | * In free_area_init_core(), highmem zone's managed_pages is set to | 251 | return; |
250 | * present_pages, and bootmem allocator doesn't allocate from highmem | 252 | |
251 | * zones. So there's no need to recalculate managed_pages because all | ||
252 | * highmem pages will be managed by the buddy system. Here highmem | ||
253 | * zone also includes highmem movable zone. | ||
254 | */ | ||
255 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | 253 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) |
256 | if (!is_highmem(z)) | 254 | z->managed_pages = 0; |
257 | z->managed_pages = 0; | ||
258 | } | 255 | } |
259 | 256 | ||
260 | /** | 257 | void __init reset_all_zones_managed_pages(void) |
261 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | ||
262 | * @pgdat: node to be released | ||
263 | * | ||
264 | * Returns the number of pages actually released. | ||
265 | */ | ||
266 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | ||
267 | { | 258 | { |
268 | register_page_bootmem_info_node(pgdat); | 259 | struct pglist_data *pgdat; |
269 | reset_node_lowmem_managed_pages(pgdat); | 260 | |
270 | return free_all_bootmem_core(pgdat->bdata); | 261 | for_each_online_pgdat(pgdat) |
262 | reset_node_managed_pages(pgdat); | ||
263 | reset_managed_pages_done = 1; | ||
271 | } | 264 | } |
272 | 265 | ||
273 | /** | 266 | /** |
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void) | |||
279 | { | 272 | { |
280 | unsigned long total_pages = 0; | 273 | unsigned long total_pages = 0; |
281 | bootmem_data_t *bdata; | 274 | bootmem_data_t *bdata; |
282 | struct pglist_data *pgdat; | ||
283 | 275 | ||
284 | for_each_online_pgdat(pgdat) | 276 | reset_all_zones_managed_pages(); |
285 | reset_node_lowmem_managed_pages(pgdat); | ||
286 | 277 | ||
287 | list_for_each_entry(bdata, &bdata_list, list) | 278 | list_for_each_entry(bdata, &bdata_list, list) |
288 | total_pages += free_all_bootmem_core(bdata); | 279 | total_pages += free_all_bootmem_core(bdata); |
289 | 280 | ||
281 | totalram_pages += total_pages; | ||
282 | |||
290 | return total_pages; | 283 | return total_pages; |
291 | } | 284 | } |
292 | 285 | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe721aa8..4b51ac1acae7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, | |||
1539 | struct address_space *mapping = file->f_mapping; | 1539 | struct address_space *mapping = file->f_mapping; |
1540 | 1540 | ||
1541 | /* If we don't want any read-ahead, don't bother */ | 1541 | /* If we don't want any read-ahead, don't bother */ |
1542 | if (VM_RandomReadHint(vma)) | 1542 | if (vma->vm_flags & VM_RAND_READ) |
1543 | return; | 1543 | return; |
1544 | if (!ra->ra_pages) | 1544 | if (!ra->ra_pages) |
1545 | return; | 1545 | return; |
1546 | 1546 | ||
1547 | if (VM_SequentialReadHint(vma)) { | 1547 | if (vma->vm_flags & VM_SEQ_READ) { |
1548 | page_cache_sync_readahead(mapping, ra, file, offset, | 1548 | page_cache_sync_readahead(mapping, ra, file, offset, |
1549 | ra->ra_pages); | 1549 | ra->ra_pages); |
1550 | return; | 1550 | return; |
@@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, | |||
1584 | struct address_space *mapping = file->f_mapping; | 1584 | struct address_space *mapping = file->f_mapping; |
1585 | 1585 | ||
1586 | /* If we don't want any read-ahead, don't bother */ | 1586 | /* If we don't want any read-ahead, don't bother */ |
1587 | if (VM_RandomReadHint(vma)) | 1587 | if (vma->vm_flags & VM_RAND_READ) |
1588 | return; | 1588 | return; |
1589 | if (ra->mmap_miss > 0) | 1589 | if (ra->mmap_miss > 0) |
1590 | ra->mmap_miss--; | 1590 | ra->mmap_miss--; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..243e710c6039 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
729 | pmd_t entry; | 729 | pmd_t entry; |
730 | entry = mk_huge_pmd(page, vma); | 730 | entry = mk_huge_pmd(page, vma); |
731 | page_add_new_anon_rmap(page, vma, haddr); | 731 | page_add_new_anon_rmap(page, vma, haddr); |
732 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | ||
732 | set_pmd_at(mm, haddr, pmd, entry); | 733 | set_pmd_at(mm, haddr, pmd, entry); |
733 | pgtable_trans_huge_deposit(mm, pgtable); | ||
734 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 734 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
735 | mm->nr_ptes++; | 735 | mm->nr_ptes++; |
736 | spin_unlock(&mm->page_table_lock); | 736 | spin_unlock(&mm->page_table_lock); |
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
771 | entry = mk_pmd(zero_page, vma->vm_page_prot); | 771 | entry = mk_pmd(zero_page, vma->vm_page_prot); |
772 | entry = pmd_wrprotect(entry); | 772 | entry = pmd_wrprotect(entry); |
773 | entry = pmd_mkhuge(entry); | 773 | entry = pmd_mkhuge(entry); |
774 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | ||
774 | set_pmd_at(mm, haddr, pmd, entry); | 775 | set_pmd_at(mm, haddr, pmd, entry); |
775 | pgtable_trans_huge_deposit(mm, pgtable); | ||
776 | mm->nr_ptes++; | 776 | mm->nr_ptes++; |
777 | return true; | 777 | return true; |
778 | } | 778 | } |
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
916 | 916 | ||
917 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 917 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
918 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 918 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
919 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | ||
919 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 920 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
920 | pgtable_trans_huge_deposit(dst_mm, pgtable); | ||
921 | dst_mm->nr_ptes++; | 921 | dst_mm->nr_ptes++; |
922 | 922 | ||
923 | ret = 0; | 923 | ret = 0; |
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | |||
987 | pmdp_clear_flush(vma, haddr, pmd); | 987 | pmdp_clear_flush(vma, haddr, pmd); |
988 | /* leave pmd empty until pte is filled */ | 988 | /* leave pmd empty until pte is filled */ |
989 | 989 | ||
990 | pgtable = pgtable_trans_huge_withdraw(mm); | 990 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
991 | pmd_populate(mm, &_pmd, pgtable); | 991 | pmd_populate(mm, &_pmd, pgtable); |
992 | 992 | ||
993 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 993 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1085 | pmdp_clear_flush(vma, haddr, pmd); | 1085 | pmdp_clear_flush(vma, haddr, pmd); |
1086 | /* leave pmd empty until pte is filled */ | 1086 | /* leave pmd empty until pte is filled */ |
1087 | 1087 | ||
1088 | pgtable = pgtable_trans_huge_withdraw(mm); | 1088 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1089 | pmd_populate(mm, &_pmd, pgtable); | 1089 | pmd_populate(mm, &_pmd, pgtable); |
1090 | 1090 | ||
1091 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 1091 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1265 | * young bit, instead of the current set_pmd_at. | 1265 | * young bit, instead of the current set_pmd_at. |
1266 | */ | 1266 | */ |
1267 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 1267 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); |
1268 | set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); | 1268 | if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, |
1269 | pmd, _pmd, 1)) | ||
1270 | update_mmu_cache_pmd(vma, addr, pmd); | ||
1269 | } | 1271 | } |
1270 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 1272 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { |
1271 | if (page->mapping && trylock_page(page)) { | 1273 | if (page->mapping && trylock_page(page)) { |
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1358 | struct page *page; | 1360 | struct page *page; |
1359 | pgtable_t pgtable; | 1361 | pgtable_t pgtable; |
1360 | pmd_t orig_pmd; | 1362 | pmd_t orig_pmd; |
1361 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); | 1363 | /* |
1364 | * For architectures like ppc64 we look at deposited pgtable | ||
1365 | * when calling pmdp_get_and_clear. So do the | ||
1366 | * pgtable_trans_huge_withdraw after finishing pmdp related | ||
1367 | * operations. | ||
1368 | */ | ||
1362 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); | 1369 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
1363 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1370 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1371 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); | ||
1364 | if (is_huge_zero_pmd(orig_pmd)) { | 1372 | if (is_huge_zero_pmd(orig_pmd)) { |
1365 | tlb->mm->nr_ptes--; | 1373 | tlb->mm->nr_ptes--; |
1366 | spin_unlock(&tlb->mm->page_table_lock); | 1374 | spin_unlock(&tlb->mm->page_table_lock); |
@@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1429 | if (ret == 1) { | 1437 | if (ret == 1) { |
1430 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | 1438 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1431 | VM_BUG_ON(!pmd_none(*new_pmd)); | 1439 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1432 | set_pmd_at(mm, new_addr, new_pmd, pmd); | 1440 | set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); |
1433 | spin_unlock(&mm->page_table_lock); | 1441 | spin_unlock(&mm->page_table_lock); |
1434 | } | 1442 | } |
1435 | out: | 1443 | out: |
@@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page, | |||
1691 | pmd = page_check_address_pmd(page, mm, address, | 1699 | pmd = page_check_address_pmd(page, mm, address, |
1692 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); | 1700 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); |
1693 | if (pmd) { | 1701 | if (pmd) { |
1694 | pgtable = pgtable_trans_huge_withdraw(mm); | 1702 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
1695 | pmd_populate(mm, &_pmd, pgtable); | 1703 | pmd_populate(mm, &_pmd, pgtable); |
1696 | 1704 | ||
1697 | haddr = address; | 1705 | haddr = address; |
@@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2359 | spin_lock(&mm->page_table_lock); | 2367 | spin_lock(&mm->page_table_lock); |
2360 | BUG_ON(!pmd_none(*pmd)); | 2368 | BUG_ON(!pmd_none(*pmd)); |
2361 | page_add_new_anon_rmap(new_page, vma, address); | 2369 | page_add_new_anon_rmap(new_page, vma, address); |
2370 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | ||
2362 | set_pmd_at(mm, address, pmd, _pmd); | 2371 | set_pmd_at(mm, address, pmd, _pmd); |
2363 | update_mmu_cache_pmd(vma, address, pmd); | 2372 | update_mmu_cache_pmd(vma, address, pmd); |
2364 | pgtable_trans_huge_deposit(mm, pgtable); | ||
2365 | spin_unlock(&mm->page_table_lock); | 2373 | spin_unlock(&mm->page_table_lock); |
2366 | 2374 | ||
2367 | *hpage = NULL; | 2375 | *hpage = NULL; |
@@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | |||
2667 | pmdp_clear_flush(vma, haddr, pmd); | 2675 | pmdp_clear_flush(vma, haddr, pmd); |
2668 | /* leave pmd empty until pte is filled */ | 2676 | /* leave pmd empty until pte is filled */ |
2669 | 2677 | ||
2670 | pgtable = pgtable_trans_huge_withdraw(mm); | 2678 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
2671 | pmd_populate(mm, &_pmd, pgtable); | 2679 | pmd_populate(mm, &_pmd, pgtable); |
2672 | 2680 | ||
2673 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 2681 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aed085ad11a8..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | |||
319 | 319 | ||
320 | hstate = hstate_vma(vma); | 320 | hstate = hstate_vma(vma); |
321 | 321 | ||
322 | return 1UL << (hstate->order + PAGE_SHIFT); | 322 | return 1UL << huge_page_shift(hstate); |
323 | } | 323 | } |
324 | EXPORT_SYMBOL_GPL(vma_kernel_pagesize); | 324 | EXPORT_SYMBOL_GPL(vma_kernel_pagesize); |
325 | 325 | ||
@@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void) | |||
1263 | * side-effects, like CommitLimit going negative. | 1263 | * side-effects, like CommitLimit going negative. |
1264 | */ | 1264 | */ |
1265 | if (h->order > (MAX_ORDER - 1)) | 1265 | if (h->order > (MAX_ORDER - 1)) |
1266 | totalram_pages += 1 << h->order; | 1266 | adjust_managed_page_count(page, 1 << h->order); |
1267 | } | 1267 | } |
1268 | } | 1268 | } |
1269 | 1269 | ||
diff --git a/mm/internal.h b/mm/internal.h
index 8562de0a5197..4390ac6c106e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page) | |||
32 | set_page_count(page, 1); | 32 | set_page_count(page, 1); |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void __put_page(struct page *page) | ||
36 | { | ||
37 | atomic_dec(&page->_count); | ||
38 | } | ||
39 | |||
40 | static inline void __get_page_tail_foll(struct page *page, | 35 | static inline void __get_page_tail_foll(struct page *page, |
41 | bool get_page_head) | 36 | bool get_page_head) |
42 | { | 37 | { |
diff --git a/mm/memblock.c b/mm/memblock.c
index c5fad932fa51..a847bfe6f3ba 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
566 | /** | 566 | /** |
567 | * __next_free_mem_range - next function for for_each_free_mem_range() | 567 | * __next_free_mem_range - next function for for_each_free_mem_range() |
568 | * @idx: pointer to u64 loop variable | 568 | * @idx: pointer to u64 loop variable |
569 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 569 | * @nid: node selector, %MAX_NUMNODES for all nodes |
570 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL | 570 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
571 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL | 571 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
572 | * @out_nid: ptr to int for nid of the range, can be %NULL | 572 | * @out_nid: ptr to int for nid of the range, can be %NULL |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..d12ca6f3c293 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node { | |||
187 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 187 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
188 | }; | 188 | }; |
189 | 189 | ||
190 | struct mem_cgroup_lru_info { | ||
191 | struct mem_cgroup_per_node *nodeinfo[0]; | ||
192 | }; | ||
193 | |||
194 | /* | 190 | /* |
195 | * Cgroups above their limits are maintained in a RB-Tree, independent of | 191 | * Cgroups above their limits are maintained in a RB-Tree, independent of |
196 | * their hierarchy representation | 192 | * their hierarchy representation |
@@ -267,28 +263,10 @@ struct mem_cgroup { | |||
267 | /* vmpressure notifications */ | 263 | /* vmpressure notifications */ |
268 | struct vmpressure vmpressure; | 264 | struct vmpressure vmpressure; |
269 | 265 | ||
270 | union { | 266 | /* |
271 | /* | 267 | * the counter to account for mem+swap usage. |
272 | * the counter to account for mem+swap usage. | 268 | */ |
273 | */ | 269 | struct res_counter memsw; |
274 | struct res_counter memsw; | ||
275 | |||
276 | /* | ||
277 | * rcu_freeing is used only when freeing struct mem_cgroup, | ||
278 | * so put it into a union to avoid wasting more memory. | ||
279 | * It must be disjoint from the css field. It could be | ||
280 | * in a union with the res field, but res plays a much | ||
281 | * larger part in mem_cgroup life than memsw, and might | ||
282 | * be of interest, even at time of free, when debugging. | ||
283 | * So share rcu_head with the less interesting memsw. | ||
284 | */ | ||
285 | struct rcu_head rcu_freeing; | ||
286 | /* | ||
287 | * We also need some space for a worker in deferred freeing. | ||
288 | * By the time we call it, rcu_freeing is no longer in use. | ||
289 | */ | ||
290 | struct work_struct work_freeing; | ||
291 | }; | ||
292 | 270 | ||
293 | /* | 271 | /* |
294 | * the counter to account for kernel memory usage. | 272 | * the counter to account for kernel memory usage. |
@@ -303,8 +281,6 @@ struct mem_cgroup { | |||
303 | bool oom_lock; | 281 | bool oom_lock; |
304 | atomic_t under_oom; | 282 | atomic_t under_oom; |
305 | 283 | ||
306 | atomic_t refcnt; | ||
307 | |||
308 | int swappiness; | 284 | int swappiness; |
309 | /* OOM-Killer disable */ | 285 | /* OOM-Killer disable */ |
310 | int oom_kill_disable; | 286 | int oom_kill_disable; |
@@ -366,14 +342,8 @@ struct mem_cgroup { | |||
366 | atomic_t numainfo_updating; | 342 | atomic_t numainfo_updating; |
367 | #endif | 343 | #endif |
368 | 344 | ||
369 | /* | 345 | struct mem_cgroup_per_node *nodeinfo[0]; |
370 | * Per cgroup active and inactive list, similar to the | 346 | /* WARNING: nodeinfo must be the last member here */ |
371 | * per zone LRU lists. | ||
372 | * | ||
373 | * WARNING: This has to be the last element of the struct. Don't | ||
374 | * add new fields after this point. | ||
375 | */ | ||
376 | struct mem_cgroup_lru_info info; | ||
377 | }; | 347 | }; |
378 | 348 | ||
379 | static size_t memcg_size(void) | 349 | static size_t memcg_size(void) |
@@ -416,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) | |||
416 | 386 | ||
417 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | 387 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) |
418 | { | 388 | { |
389 | /* | ||
390 | * Our caller must use css_get() first, because memcg_uncharge_kmem() | ||
391 | * will call css_put() if it sees the memcg is dead. | ||
392 | */ | ||
393 | smp_wmb(); | ||
419 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) | 394 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) |
420 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); | 395 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); |
421 | } | 396 | } |
@@ -508,9 +483,6 @@ enum res_type { | |||
508 | */ | 483 | */ |
509 | static DEFINE_MUTEX(memcg_create_mutex); | 484 | static DEFINE_MUTEX(memcg_create_mutex); |
510 | 485 | ||
511 | static void mem_cgroup_get(struct mem_cgroup *memcg); | ||
512 | static void mem_cgroup_put(struct mem_cgroup *memcg); | ||
513 | |||
514 | static inline | 486 | static inline |
515 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | 487 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) |
516 | { | 488 | { |
@@ -561,15 +533,15 @@ void sock_update_memcg(struct sock *sk) | |||
561 | */ | 533 | */ |
562 | if (sk->sk_cgrp) { | 534 | if (sk->sk_cgrp) { |
563 | BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); | 535 | BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); |
564 | mem_cgroup_get(sk->sk_cgrp->memcg); | 536 | css_get(&sk->sk_cgrp->memcg->css); |
565 | return; | 537 | return; |
566 | } | 538 | } |
567 | 539 | ||
568 | rcu_read_lock(); | 540 | rcu_read_lock(); |
569 | memcg = mem_cgroup_from_task(current); | 541 | memcg = mem_cgroup_from_task(current); |
570 | cg_proto = sk->sk_prot->proto_cgroup(memcg); | 542 | cg_proto = sk->sk_prot->proto_cgroup(memcg); |
571 | if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { | 543 | if (!mem_cgroup_is_root(memcg) && |
572 | mem_cgroup_get(memcg); | 544 | memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { |
573 | sk->sk_cgrp = cg_proto; | 545 | sk->sk_cgrp = cg_proto; |
574 | } | 546 | } |
575 | rcu_read_unlock(); | 547 | rcu_read_unlock(); |
@@ -583,7 +555,7 @@ void sock_release_memcg(struct sock *sk) | |||
583 | struct mem_cgroup *memcg; | 555 | struct mem_cgroup *memcg; |
584 | WARN_ON(!sk->sk_cgrp->memcg); | 556 | WARN_ON(!sk->sk_cgrp->memcg); |
585 | memcg = sk->sk_cgrp->memcg; | 557 | memcg = sk->sk_cgrp->memcg; |
586 | mem_cgroup_put(memcg); | 558 | css_put(&sk->sk_cgrp->memcg->css); |
587 | } | 559 | } |
588 | } | 560 | } |
589 | 561 | ||
@@ -683,7 +655,7 @@ static struct mem_cgroup_per_zone * | |||
683 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) | 655 | mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) |
684 | { | 656 | { |
685 | VM_BUG_ON((unsigned)nid >= nr_node_ids); | 657 | VM_BUG_ON((unsigned)nid >= nr_node_ids); |
686 | return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; | 658 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; |
687 | } | 659 | } |
688 | 660 | ||
689 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) | 661 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) |
@@ -1148,6 +1120,58 @@ skip_node: | |||
1148 | return NULL; | 1120 | return NULL; |
1149 | } | 1121 | } |
1150 | 1122 | ||
1123 | static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) | ||
1124 | { | ||
1125 | /* | ||
1126 | * When a group in the hierarchy below root is destroyed, the | ||
1127 | * hierarchy iterator can no longer be trusted since it might | ||
1128 | * have pointed to the destroyed group. Invalidate it. | ||
1129 | */ | ||
1130 | atomic_inc(&root->dead_count); | ||
1131 | } | ||
1132 | |||
1133 | static struct mem_cgroup * | ||
1134 | mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, | ||
1135 | struct mem_cgroup *root, | ||
1136 | int *sequence) | ||
1137 | { | ||
1138 | struct mem_cgroup *position = NULL; | ||
1139 | /* | ||
1140 | * A cgroup destruction happens in two stages: offlining and | ||
1141 | * release. They are separated by a RCU grace period. | ||
1142 | * | ||
1143 | * If the iterator is valid, we may still race with an | ||
1144 | * offlining. The RCU lock ensures the object won't be | ||
1145 | * released, tryget will fail if we lost the race. | ||
1146 | */ | ||
1147 | *sequence = atomic_read(&root->dead_count); | ||
1148 | if (iter->last_dead_count == *sequence) { | ||
1149 | smp_rmb(); | ||
1150 | position = iter->last_visited; | ||
1151 | if (position && !css_tryget(&position->css)) | ||
1152 | position = NULL; | ||
1153 | } | ||
1154 | return position; | ||
1155 | } | ||
1156 | |||
1157 | static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, | ||
1158 | struct mem_cgroup *last_visited, | ||
1159 | struct mem_cgroup *new_position, | ||
1160 | int sequence) | ||
1161 | { | ||
1162 | if (last_visited) | ||
1163 | css_put(&last_visited->css); | ||
1164 | /* | ||
1165 | * We store the sequence count from the time @last_visited was | ||
1166 | * loaded successfully instead of rereading it here so that we | ||
1167 | * don't lose destruction events in between. We could have | ||
1168 | * raced with the destruction of @new_position after all. | ||
1169 | */ | ||
1170 | iter->last_visited = new_position; | ||
1171 | smp_wmb(); | ||
1172 | iter->last_dead_count = sequence; | ||
1173 | } | ||
1174 | |||
1151 | /** | 1175 | /** |
1152 | * mem_cgroup_iter - iterate over memory cgroup hierarchy | 1176 | * mem_cgroup_iter - iterate over memory cgroup hierarchy |
1153 | * @root: hierarchy root | 1177 | * @root: hierarchy root |
@@ -1171,7 +1195,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1171 | { | 1195 | { |
1172 | struct mem_cgroup *memcg = NULL; | 1196 | struct mem_cgroup *memcg = NULL; |
1173 | struct mem_cgroup *last_visited = NULL; | 1197 | struct mem_cgroup *last_visited = NULL; |
1174 | unsigned long uninitialized_var(dead_count); | ||
1175 | 1198 | ||
1176 | if (mem_cgroup_disabled()) | 1199 | if (mem_cgroup_disabled()) |
1177 | return NULL; | 1200 | return NULL; |
@@ -1191,6 +1214,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1191 | rcu_read_lock(); | 1214 | rcu_read_lock(); |
1192 | while (!memcg) { | 1215 | while (!memcg) { |
1193 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); | 1216 | struct mem_cgroup_reclaim_iter *uninitialized_var(iter); |
1217 | int uninitialized_var(seq); | ||
1194 | 1218 | ||
1195 | if (reclaim) { | 1219 | if (reclaim) { |
1196 | int nid = zone_to_nid(reclaim->zone); | 1220 | int nid = zone_to_nid(reclaim->zone); |
@@ -1204,37 +1228,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
1204 | goto out_unlock; | 1228 | goto out_unlock; |
1205 | } | 1229 | } |
1206 | 1230 | ||
1207 | /* | 1231 | last_visited = mem_cgroup_iter_load(iter, root, &seq); |
1208 | * If the dead_count mismatches, a destruction | ||
1209 | * has happened or is happening concurrently. | ||
1210 | * If the dead_count matches, a destruction | ||
1211 | * might still happen concurrently, but since | ||
1212 | * we checked under RCU, that destruction | ||
1213 | * won't free the object until we release the | ||
1214 | * RCU reader lock. Thus, the dead_count | ||
1215 | * check verifies the pointer is still valid, | ||
1216 | * css_tryget() verifies the cgroup pointed to | ||
1217 | * is alive. | ||
1218 | */ | ||
1219 | dead_count = atomic_read(&root->dead_count); | ||
1220 | if (dead_count == iter->last_dead_count) { | ||
1221 | smp_rmb(); | ||
1222 | last_visited = iter->last_visited; | ||
1223 | if (last_visited && | ||
1224 | !css_tryget(&last_visited->css)) | ||
1225 | last_visited = NULL; | ||
1226 | } | ||
1227 | } | 1232 | } |
1228 | 1233 | ||
1229 | memcg = __mem_cgroup_iter_next(root, last_visited); | 1234 | memcg = __mem_cgroup_iter_next(root, last_visited); |
1230 | 1235 | ||
1231 | if (reclaim) { | 1236 | if (reclaim) { |
1232 | if (last_visited) | 1237 | mem_cgroup_iter_update(iter, last_visited, memcg, seq); |
1233 | css_put(&last_visited->css); | ||
1234 | |||
1235 | iter->last_visited = memcg; | ||
1236 | smp_wmb(); | ||
1237 | iter->last_dead_count = dead_count; | ||
1238 | 1238 | ||
1239 | if (!memcg) | 1239 | if (!memcg) |
1240 | iter->generation++; | 1240 | iter->generation++; |
@@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | |||
1448 | return ret; | 1448 | return ret; |
1449 | } | 1449 | } |
1450 | 1450 | ||
1451 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | 1451 | bool task_in_mem_cgroup(struct task_struct *task, |
1452 | const struct mem_cgroup *memcg) | ||
1452 | { | 1453 | { |
1453 | int ret; | ||
1454 | struct mem_cgroup *curr = NULL; | 1454 | struct mem_cgroup *curr = NULL; |
1455 | struct task_struct *p; | 1455 | struct task_struct *p; |
1456 | bool ret; | ||
1456 | 1457 | ||
1457 | p = find_lock_task_mm(task); | 1458 | p = find_lock_task_mm(task); |
1458 | if (p) { | 1459 | if (p) { |
@@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | |||
1464 | * killer still needs to detect if they have already been oom | 1465 | * killer still needs to detect if they have already been oom |
1465 | * killed to prevent needlessly killing additional tasks. | 1466 | * killed to prevent needlessly killing additional tasks. |
1466 | */ | 1467 | */ |
1467 | task_lock(task); | 1468 | rcu_read_lock(); |
1468 | curr = mem_cgroup_from_task(task); | 1469 | curr = mem_cgroup_from_task(task); |
1469 | if (curr) | 1470 | if (curr) |
1470 | css_get(&curr->css); | 1471 | css_get(&curr->css); |
1471 | task_unlock(task); | 1472 | rcu_read_unlock(); |
1472 | } | 1473 | } |
1473 | if (!curr) | 1474 | if (!curr) |
1474 | return 0; | 1475 | return false; |
1475 | /* | 1476 | /* |
1476 | * We should check use_hierarchy of "memcg" not "curr". Because checking | 1477 | * We should check use_hierarchy of "memcg" not "curr". Because checking |
1477 | * use_hierarchy of "curr" here make this function true if hierarchy is | 1478 | * use_hierarchy of "curr" here make this function true if hierarchy is |
@@ -3031,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | |||
3031 | if (res_counter_uncharge(&memcg->kmem, size)) | 3032 | if (res_counter_uncharge(&memcg->kmem, size)) |
3032 | return; | 3033 | return; |
3033 | 3034 | ||
3035 | /* | ||
3036 | * Releases a reference taken in kmem_cgroup_css_offline in case | ||
3037 | * this last uncharge is racing with the offlining code or it is | ||
3038 | * outliving the memcg existence. | ||
3039 | * | ||
3040 | * The memory barrier imposed by test&clear is paired with the | ||
3041 | * explicit one in memcg_kmem_mark_dead(). | ||
3042 | */ | ||
3034 | if (memcg_kmem_test_and_clear_dead(memcg)) | 3043 | if (memcg_kmem_test_and_clear_dead(memcg)) |
3035 | mem_cgroup_put(memcg); | 3044 | css_put(&memcg->css); |
3036 | } | 3045 | } |
3037 | 3046 | ||
3038 | void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) | 3047 | void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) |
@@ -3223,7 +3232,7 @@ void memcg_release_cache(struct kmem_cache *s) | |||
3223 | list_del(&s->memcg_params->list); | 3232 | list_del(&s->memcg_params->list); |
3224 | mutex_unlock(&memcg->slab_caches_mutex); | 3233 | mutex_unlock(&memcg->slab_caches_mutex); |
3225 | 3234 | ||
3226 | mem_cgroup_put(memcg); | 3235 | css_put(&memcg->css); |
3227 | out: | 3236 | out: |
3228 | kfree(s->memcg_params); | 3237 | kfree(s->memcg_params); |
3229 | } | 3238 | } |
@@ -3383,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
3383 | 3392 | ||
3384 | mutex_lock(&memcg_cache_mutex); | 3393 | mutex_lock(&memcg_cache_mutex); |
3385 | new_cachep = cachep->memcg_params->memcg_caches[idx]; | 3394 | new_cachep = cachep->memcg_params->memcg_caches[idx]; |
3386 | if (new_cachep) | 3395 | if (new_cachep) { |
3396 | css_put(&memcg->css); | ||
3387 | goto out; | 3397 | goto out; |
3398 | } | ||
3388 | 3399 | ||
3389 | new_cachep = kmem_cache_dup(memcg, cachep); | 3400 | new_cachep = kmem_cache_dup(memcg, cachep); |
3390 | if (new_cachep == NULL) { | 3401 | if (new_cachep == NULL) { |
3391 | new_cachep = cachep; | 3402 | new_cachep = cachep; |
3403 | css_put(&memcg->css); | ||
3392 | goto out; | 3404 | goto out; |
3393 | } | 3405 | } |
3394 | 3406 | ||
3395 | mem_cgroup_get(memcg); | ||
3396 | atomic_set(&new_cachep->memcg_params->nr_pages , 0); | 3407 | atomic_set(&new_cachep->memcg_params->nr_pages , 0); |
3397 | 3408 | ||
3398 | cachep->memcg_params->memcg_caches[idx] = new_cachep; | 3409 | cachep->memcg_params->memcg_caches[idx] = new_cachep; |
@@ -3480,8 +3491,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) | |||
3480 | 3491 | ||
3481 | cw = container_of(w, struct create_work, work); | 3492 | cw = container_of(w, struct create_work, work); |
3482 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | 3493 | memcg_create_kmem_cache(cw->memcg, cw->cachep); |
3483 | /* Drop the reference gotten when we enqueued. */ | ||
3484 | css_put(&cw->memcg->css); | ||
3485 | kfree(cw); | 3494 | kfree(cw); |
3486 | } | 3495 | } |
3487 | 3496 | ||
@@ -3618,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3618 | int ret; | 3627 | int ret; |
3619 | 3628 | ||
3620 | *_memcg = NULL; | 3629 | *_memcg = NULL; |
3630 | |||
3631 | /* | ||
3632 | * Disabling accounting is only relevant for some specific memcg | ||
3633 | * internal allocations. Therefore we would initially not have such | ||
3634 | * check here, since direct calls to the page allocator that are marked | ||
3635 | * with GFP_KMEMCG only happen outside memcg core. We are mostly | ||
3636 | * concerned with cache allocations, and by having this test at | ||
3637 | * memcg_kmem_get_cache, we are already able to relay the allocation to | ||
3638 | * the root cache and bypass the memcg cache altogether. | ||
3639 | * | ||
3640 | * There is one exception, though: the SLUB allocator does not create | ||
3641 | * large order caches, but rather service large kmallocs directly from | ||
3642 | * the page allocator. Therefore, the following sequence when backed by | ||
3643 | * the SLUB allocator: | ||
3644 | * | ||
3645 | * memcg_stop_kmem_account(); | ||
3646 | * kmalloc(<large_number>) | ||
3647 | * memcg_resume_kmem_account(); | ||
3648 | * | ||
3649 | * would effectively ignore the fact that we should skip accounting, | ||
3650 | * since it will drive us directly to this function without passing | ||
3651 | * through the cache selector memcg_kmem_get_cache. Such large | ||
3652 | * allocations are extremely rare but can happen, for instance, for the | ||
3653 | * cache arrays. We bring this test here. | ||
3654 | */ | ||
3655 | if (!current->mm || current->memcg_kmem_skip_account) | ||
3656 | return true; | ||
3657 | |||
3621 | memcg = try_get_mem_cgroup_from_mm(current->mm); | 3658 | memcg = try_get_mem_cgroup_from_mm(current->mm); |
3622 | 3659 | ||
3623 | /* | 3660 | /* |
@@ -4171,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, | |||
4171 | unlock_page_cgroup(pc); | 4208 | unlock_page_cgroup(pc); |
4172 | /* | 4209 | /* |
4173 | * even after unlock, we have memcg->res.usage here and this memcg | 4210 | * even after unlock, we have memcg->res.usage here and this memcg |
4174 | * will never be freed. | 4211 | * will never be freed, so it's safe to call css_get(). |
4175 | */ | 4212 | */ |
4176 | memcg_check_events(memcg, page); | 4213 | memcg_check_events(memcg, page); |
4177 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { | 4214 | if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { |
4178 | mem_cgroup_swap_statistics(memcg, true); | 4215 | mem_cgroup_swap_statistics(memcg, true); |
4179 | mem_cgroup_get(memcg); | 4216 | css_get(&memcg->css); |
4180 | } | 4217 | } |
4181 | /* | 4218 | /* |
4182 | * Migration does not charge the res_counter for the | 4219 | * Migration does not charge the res_counter for the |
@@ -4288,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
4288 | 4325 | ||
4289 | /* | 4326 | /* |
4290 | * record memcg information, if swapout && memcg != NULL, | 4327 | * record memcg information, if swapout && memcg != NULL, |
4291 | * mem_cgroup_get() was called in uncharge(). | 4328 | * css_get() was called in uncharge(). |
4292 | */ | 4329 | */ |
4293 | if (do_swap_account && swapout && memcg) | 4330 | if (do_swap_account && swapout && memcg) |
4294 | swap_cgroup_record(ent, css_id(&memcg->css)); | 4331 | swap_cgroup_record(ent, css_id(&memcg->css)); |
@@ -4319,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
4319 | if (!mem_cgroup_is_root(memcg)) | 4356 | if (!mem_cgroup_is_root(memcg)) |
4320 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 4357 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); |
4321 | mem_cgroup_swap_statistics(memcg, false); | 4358 | mem_cgroup_swap_statistics(memcg, false); |
4322 | mem_cgroup_put(memcg); | 4359 | css_put(&memcg->css); |
4323 | } | 4360 | } |
4324 | rcu_read_unlock(); | 4361 | rcu_read_unlock(); |
4325 | } | 4362 | } |
@@ -4353,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
4353 | * This function is only called from task migration context now. | 4390 | * This function is only called from task migration context now. |
4354 | * It postpones res_counter and refcount handling till the end | 4391 | * It postpones res_counter and refcount handling till the end |
4355 | * of task migration(mem_cgroup_clear_mc()) for performance | 4392 | * of task migration(mem_cgroup_clear_mc()) for performance |
4356 | * improvement. But we cannot postpone mem_cgroup_get(to) | 4393 | * improvement. But we cannot postpone css_get(to) because if |
4357 | * because if the process that has been moved to @to does | 4394 | * the process that has been moved to @to does swap-in, the |
4358 | * swap-in, the refcount of @to might be decreased to 0. | 4395 | * refcount of @to might be decreased to 0. |
4396 | * | ||
4397 | * We are in attach() phase, so the cgroup is guaranteed to be | ||
4398 | * alive, so we can just call css_get(). | ||
4359 | */ | 4399 | */ |
4360 | mem_cgroup_get(to); | 4400 | css_get(&to->css); |
4361 | return 0; | 4401 | return 0; |
4362 | } | 4402 | } |
4363 | return -EINVAL; | 4403 | return -EINVAL; |
@@ -5136,14 +5176,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | |||
5136 | * starts accounting before all call sites are patched | 5176 | * starts accounting before all call sites are patched |
5137 | */ | 5177 | */ |
5138 | memcg_kmem_set_active(memcg); | 5178 | memcg_kmem_set_active(memcg); |
5139 | |||
5140 | /* | ||
5141 | * kmem charges can outlive the cgroup. In the case of slab | ||
5142 | * pages, for instance, a page contain objects from various | ||
5143 | * processes, so it is unfeasible to migrate them away. We | ||
5144 | * need to reference count the memcg because of that. | ||
5145 | */ | ||
5146 | mem_cgroup_get(memcg); | ||
5147 | } else | 5179 | } else |
5148 | ret = res_counter_set_limit(&memcg->kmem, val); | 5180 | ret = res_counter_set_limit(&memcg->kmem, val); |
5149 | out: | 5181 | out: |
@@ -5176,16 +5208,16 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) | |||
5176 | goto out; | 5208 | goto out; |
5177 | 5209 | ||
5178 | /* | 5210 | /* |
5179 | * destroy(), called if we fail, will issue static_key_slow_inc() and | 5211 | * __mem_cgroup_free() will issue static_key_slow_dec() because this |
5180 | * mem_cgroup_put() if kmem is enabled. We have to either call them | 5212 | * memcg is active already. If the later initialization fails then the |
5181 | * unconditionally, or clear the KMEM_ACTIVE flag. I personally find | 5213 | * cgroup core triggers the cleanup so we do not have to do it here. |
5182 | * this more consistent, since it always leads to the same destroy path | ||
5183 | */ | 5214 | */ |
5184 | mem_cgroup_get(memcg); | ||
5185 | static_key_slow_inc(&memcg_kmem_enabled_key); | 5215 | static_key_slow_inc(&memcg_kmem_enabled_key); |
5186 | 5216 | ||
5187 | mutex_lock(&set_limit_mutex); | 5217 | mutex_lock(&set_limit_mutex); |
5218 | memcg_stop_kmem_account(); | ||
5188 | ret = memcg_update_cache_sizes(memcg); | 5219 | ret = memcg_update_cache_sizes(memcg); |
5220 | memcg_resume_kmem_account(); | ||
5189 | mutex_unlock(&set_limit_mutex); | 5221 | mutex_unlock(&set_limit_mutex); |
5190 | out: | 5222 | out: |
5191 | return ret; | 5223 | return ret; |
@@ -5864,23 +5896,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
5864 | return mem_cgroup_sockets_init(memcg, ss); | 5896 | return mem_cgroup_sockets_init(memcg, ss); |
5865 | } | 5897 | } |
5866 | 5898 | ||
5867 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) | 5899 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
5868 | { | 5900 | { |
5869 | mem_cgroup_sockets_destroy(memcg); | 5901 | mem_cgroup_sockets_destroy(memcg); |
5902 | } | ||
5903 | |||
5904 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | ||
5905 | { | ||
5906 | if (!memcg_kmem_is_active(memcg)) | ||
5907 | return; | ||
5908 | |||
5909 | /* | ||
5910 | * kmem charges can outlive the cgroup. In the case of slab | ||
5911 | * pages, for instance, a page contain objects from various | ||
5912 | * processes. As we prevent from taking a reference for every | ||
5913 | * such allocation we have to be careful when doing uncharge | ||
5914 | * (see memcg_uncharge_kmem) and here during offlining. | ||
5915 | * | ||
5916 | * The idea is that that only the _last_ uncharge which sees | ||
5917 | * the dead memcg will drop the last reference. An additional | ||
5918 | * reference is taken here before the group is marked dead | ||
5919 | * which is then paired with css_put during uncharge resp. here. | ||
5920 | * | ||
5921 | * Although this might sound strange as this path is called from | ||
5922 | * css_offline() when the reference might have dropped down to 0 | ||
5923 | * and shouldn't be incremented anymore (css_tryget would fail) | ||
5924 | * we do not have other options because of the kmem allocations | ||
5925 | * lifetime. | ||
5926 | */ | ||
5927 | css_get(&memcg->css); | ||
5870 | 5928 | ||
5871 | memcg_kmem_mark_dead(memcg); | 5929 | memcg_kmem_mark_dead(memcg); |
5872 | 5930 | ||
5873 | if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) | 5931 | if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) |
5874 | return; | 5932 | return; |
5875 | 5933 | ||
5876 | /* | ||
5877 | * Charges already down to 0, undo mem_cgroup_get() done in the charge | ||
5878 | * path here, being careful not to race with memcg_uncharge_kmem: it is | ||
5879 | * possible that the charges went down to 0 between mark_dead and the | ||
5880 | * res_counter read, so in that case, we don't need the put | ||
5881 | */ | ||
5882 | if (memcg_kmem_test_and_clear_dead(memcg)) | 5934 | if (memcg_kmem_test_and_clear_dead(memcg)) |
5883 | mem_cgroup_put(memcg); | 5935 | css_put(&memcg->css); |
5884 | } | 5936 | } |
5885 | #else | 5937 | #else |
5886 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 5938 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
@@ -5888,7 +5940,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
5888 | return 0; | 5940 | return 0; |
5889 | } | 5941 | } |
5890 | 5942 | ||
5891 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) | 5943 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
5944 | { | ||
5945 | } | ||
5946 | |||
5947 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | ||
5892 | { | 5948 | { |
5893 | } | 5949 | } |
5894 | #endif | 5950 | #endif |
@@ -6058,13 +6114,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
6058 | mz->on_tree = false; | 6114 | mz->on_tree = false; |
6059 | mz->memcg = memcg; | 6115 | mz->memcg = memcg; |
6060 | } | 6116 | } |
6061 | memcg->info.nodeinfo[node] = pn; | 6117 | memcg->nodeinfo[node] = pn; |
6062 | return 0; | 6118 | return 0; |
6063 | } | 6119 | } |
6064 | 6120 | ||
6065 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 6121 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
6066 | { | 6122 | { |
6067 | kfree(memcg->info.nodeinfo[node]); | 6123 | kfree(memcg->nodeinfo[node]); |
6068 | } | 6124 | } |
6069 | 6125 | ||
6070 | static struct mem_cgroup *mem_cgroup_alloc(void) | 6126 | static struct mem_cgroup *mem_cgroup_alloc(void) |
@@ -6137,49 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
6137 | vfree(memcg); | 6193 | vfree(memcg); |
6138 | } | 6194 | } |
6139 | 6195 | ||
6140 | |||
6141 | /* | ||
6142 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, | ||
6143 | * but in process context. The work_freeing structure is overlaid | ||
6144 | * on the rcu_freeing structure, which itself is overlaid on memsw. | ||
6145 | */ | ||
6146 | static void free_work(struct work_struct *work) | ||
6147 | { | ||
6148 | struct mem_cgroup *memcg; | ||
6149 | |||
6150 | memcg = container_of(work, struct mem_cgroup, work_freeing); | ||
6151 | __mem_cgroup_free(memcg); | ||
6152 | } | ||
6153 | |||
6154 | static void free_rcu(struct rcu_head *rcu_head) | ||
6155 | { | ||
6156 | struct mem_cgroup *memcg; | ||
6157 | |||
6158 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); | ||
6159 | INIT_WORK(&memcg->work_freeing, free_work); | ||
6160 | schedule_work(&memcg->work_freeing); | ||
6161 | } | ||
6162 | |||
6163 | static void mem_cgroup_get(struct mem_cgroup *memcg) | ||
6164 | { | ||
6165 | atomic_inc(&memcg->refcnt); | ||
6166 | } | ||
6167 | |||
6168 | static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) | ||
6169 | { | ||
6170 | if (atomic_sub_and_test(count, &memcg->refcnt)) { | ||
6171 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | ||
6172 | call_rcu(&memcg->rcu_freeing, free_rcu); | ||
6173 | if (parent) | ||
6174 | mem_cgroup_put(parent); | ||
6175 | } | ||
6176 | } | ||
6177 | |||
6178 | static void mem_cgroup_put(struct mem_cgroup *memcg) | ||
6179 | { | ||
6180 | __mem_cgroup_put(memcg, 1); | ||
6181 | } | ||
6182 | |||
6183 | /* | 6196 | /* |
6184 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 6197 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
6185 | */ | 6198 | */ |
@@ -6239,7 +6252,6 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
6239 | 6252 | ||
6240 | memcg->last_scanned_node = MAX_NUMNODES; | 6253 | memcg->last_scanned_node = MAX_NUMNODES; |
6241 | INIT_LIST_HEAD(&memcg->oom_notify); | 6254 | INIT_LIST_HEAD(&memcg->oom_notify); |
6242 | atomic_set(&memcg->refcnt, 1); | ||
6243 | memcg->move_charge_at_immigrate = 0; | 6255 | memcg->move_charge_at_immigrate = 0; |
6244 | mutex_init(&memcg->thresholds_lock); | 6256 | mutex_init(&memcg->thresholds_lock); |
6245 | spin_lock_init(&memcg->move_lock); | 6257 | spin_lock_init(&memcg->move_lock); |
@@ -6275,12 +6287,9 @@ mem_cgroup_css_online(struct cgroup *cont) | |||
6275 | res_counter_init(&memcg->kmem, &parent->kmem); | 6287 | res_counter_init(&memcg->kmem, &parent->kmem); |
6276 | 6288 | ||
6277 | /* | 6289 | /* |
6278 | * We increment refcnt of the parent to ensure that we can | 6290 | * No need to take a reference to the parent because cgroup |
6279 | * safely access it on res_counter_charge/uncharge. | 6291 | * core guarantees its existence. |
6280 | * This refcnt will be decremented when freeing this | ||
6281 | * mem_cgroup(see mem_cgroup_put). | ||
6282 | */ | 6292 | */ |
6283 | mem_cgroup_get(parent); | ||
6284 | } else { | 6293 | } else { |
6285 | res_counter_init(&memcg->res, NULL); | 6294 | res_counter_init(&memcg->res, NULL); |
6286 | res_counter_init(&memcg->memsw, NULL); | 6295 | res_counter_init(&memcg->memsw, NULL); |
@@ -6296,16 +6305,6 @@ mem_cgroup_css_online(struct cgroup *cont) | |||
6296 | 6305 | ||
6297 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); | 6306 | error = memcg_init_kmem(memcg, &mem_cgroup_subsys); |
6298 | mutex_unlock(&memcg_create_mutex); | 6307 | mutex_unlock(&memcg_create_mutex); |
6299 | if (error) { | ||
6300 | /* | ||
6301 | * We call put now because our (and parent's) refcnts | ||
6302 | * are already in place. mem_cgroup_put() will internally | ||
6303 | * call __mem_cgroup_free, so return directly | ||
6304 | */ | ||
6305 | mem_cgroup_put(memcg); | ||
6306 | if (parent->use_hierarchy) | ||
6307 | mem_cgroup_put(parent); | ||
6308 | } | ||
6309 | return error; | 6308 | return error; |
6310 | } | 6309 | } |
6311 | 6310 | ||
@@ -6317,20 +6316,22 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) | |||
6317 | struct mem_cgroup *parent = memcg; | 6316 | struct mem_cgroup *parent = memcg; |
6318 | 6317 | ||
6319 | while ((parent = parent_mem_cgroup(parent))) | 6318 | while ((parent = parent_mem_cgroup(parent))) |
6320 | atomic_inc(&parent->dead_count); | 6319 | mem_cgroup_iter_invalidate(parent); |
6321 | 6320 | ||
6322 | /* | 6321 | /* |
6323 | * if the root memcg is not hierarchical we have to check it | 6322 | * if the root memcg is not hierarchical we have to check it |
6324 | * explicitely. | 6323 | * explicitely. |
6325 | */ | 6324 | */ |
6326 | if (!root_mem_cgroup->use_hierarchy) | 6325 | if (!root_mem_cgroup->use_hierarchy) |
6327 | atomic_inc(&root_mem_cgroup->dead_count); | 6326 | mem_cgroup_iter_invalidate(root_mem_cgroup); |
6328 | } | 6327 | } |
6329 | 6328 | ||
6330 | static void mem_cgroup_css_offline(struct cgroup *cont) | 6329 | static void mem_cgroup_css_offline(struct cgroup *cont) |
6331 | { | 6330 | { |
6332 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6331 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
6333 | 6332 | ||
6333 | kmem_cgroup_css_offline(memcg); | ||
6334 | |||
6334 | mem_cgroup_invalidate_reclaim_iterators(memcg); | 6335 | mem_cgroup_invalidate_reclaim_iterators(memcg); |
6335 | mem_cgroup_reparent_charges(memcg); | 6336 | mem_cgroup_reparent_charges(memcg); |
6336 | mem_cgroup_destroy_all_caches(memcg); | 6337 | mem_cgroup_destroy_all_caches(memcg); |
@@ -6340,9 +6341,8 @@ static void mem_cgroup_css_free(struct cgroup *cont) | |||
6340 | { | 6341 | { |
6341 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6342 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
6342 | 6343 | ||
6343 | kmem_cgroup_destroy(memcg); | 6344 | memcg_destroy_kmem(memcg); |
6344 | 6345 | __mem_cgroup_free(memcg); | |
6345 | mem_cgroup_put(memcg); | ||
6346 | } | 6346 | } |
6347 | 6347 | ||
6348 | #ifdef CONFIG_MMU | 6348 | #ifdef CONFIG_MMU |
@@ -6651,6 +6651,7 @@ static void __mem_cgroup_clear_mc(void) | |||
6651 | { | 6651 | { |
6652 | struct mem_cgroup *from = mc.from; | 6652 | struct mem_cgroup *from = mc.from; |
6653 | struct mem_cgroup *to = mc.to; | 6653 | struct mem_cgroup *to = mc.to; |
6654 | int i; | ||
6654 | 6655 | ||
6655 | /* we must uncharge all the leftover precharges from mc.to */ | 6656 | /* we must uncharge all the leftover precharges from mc.to */ |
6656 | if (mc.precharge) { | 6657 | if (mc.precharge) { |
@@ -6671,7 +6672,9 @@ static void __mem_cgroup_clear_mc(void) | |||
6671 | if (!mem_cgroup_is_root(mc.from)) | 6672 | if (!mem_cgroup_is_root(mc.from)) |
6672 | res_counter_uncharge(&mc.from->memsw, | 6673 | res_counter_uncharge(&mc.from->memsw, |
6673 | PAGE_SIZE * mc.moved_swap); | 6674 | PAGE_SIZE * mc.moved_swap); |
6674 | __mem_cgroup_put(mc.from, mc.moved_swap); | 6675 | |
6676 | for (i = 0; i < mc.moved_swap; i++) | ||
6677 | css_put(&mc.from->css); | ||
6675 | 6678 | ||
6676 | if (!mem_cgroup_is_root(mc.to)) { | 6679 | if (!mem_cgroup_is_root(mc.to)) { |
6677 | /* | 6680 | /* |
@@ -6681,7 +6684,7 @@ static void __mem_cgroup_clear_mc(void) | |||
6681 | res_counter_uncharge(&mc.to->res, | 6684 | res_counter_uncharge(&mc.to->res, |
6682 | PAGE_SIZE * mc.moved_swap); | 6685 | PAGE_SIZE * mc.moved_swap); |
6683 | } | 6686 | } |
6684 | /* we've already done mem_cgroup_get(mc.to) */ | 6687 | /* we've already done css_get(mc.to) */ |
6685 | mc.moved_swap = 0; | 6688 | mc.moved_swap = 0; |
6686 | } | 6689 | } |
6687 | memcg_oom_recover(from); | 6690 | memcg_oom_recover(from); |
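The memcontrol.c hunks above drop the private mem_cgroup_get()/mem_cgroup_put() counter in favour of plain css reference counting, so the old batched release of mc.moved_swap references becomes one css_put() per moved swap entry. A minimal user-space sketch of that ledger, using a toy counter rather than the real css (not kernel code; the per-entry get is assumed for illustration):

#include <assert.h>
#include <stdio.h>

/* Toy reference counter standing in for a cgroup css; not kernel code. */
struct ref { long count; };

static void ref_get(struct ref *r) { r->count++; }
static void ref_put(struct ref *r) { r->count--; }

int main(void)
{
        struct ref from_css = { .count = 1 };   /* base reference */
        long moved_swap = 0;

        /* While charges are being moved, assume one reference is held on
         * the source group per recorded swap entry. */
        for (int i = 0; i < 4; i++) {
                ref_get(&from_css);
                moved_swap++;
        }

        /* __mem_cgroup_clear_mc(): the old batched put becomes one
         * css_put()-style release per moved entry. */
        for (long i = 0; i < moved_swap; i++)
                ref_put(&from_css);

        assert(from_css.count == 1);            /* gets and puts balance */
        printf("refs back to %ld\n", from_css.count);
        return 0;
}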
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ceb0c7f1932f..2c13aa7a0164 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1410 | 1410 | ||
1411 | /* | 1411 | /* |
1412 | * Isolate the page, so that it doesn't get reallocated if it | 1412 | * Isolate the page, so that it doesn't get reallocated if it |
1413 | * was free. | 1413 | * was free. This flag should be kept set until the source page |
1414 | * is freed and PG_hwpoison on it is set. | ||
1414 | */ | 1415 | */ |
1415 | set_migratetype_isolate(p, true); | 1416 | set_migratetype_isolate(p, true); |
1416 | /* | 1417 | /* |
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1433 | /* Not a free page */ | 1434 | /* Not a free page */ |
1434 | ret = 1; | 1435 | ret = 1; |
1435 | } | 1436 | } |
1436 | unset_migratetype_isolate(p, MIGRATE_MOVABLE); | ||
1437 | unlock_memory_hotplug(); | 1437 | unlock_memory_hotplug(); |
1438 | return ret; | 1438 | return ret; |
1439 | } | 1439 | } |
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1494 | atomic_long_add(1 << compound_trans_order(hpage), | 1494 | atomic_long_add(1 << compound_trans_order(hpage), |
1495 | &num_poisoned_pages); | 1495 | &num_poisoned_pages); |
1496 | } | 1496 | } |
1497 | /* keep elevated page count for bad page */ | ||
1498 | return ret; | 1497 | return ret; |
1499 | } | 1498 | } |
1500 | 1499 | ||
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1559 | atomic_long_inc(&num_poisoned_pages); | 1558 | atomic_long_inc(&num_poisoned_pages); |
1560 | } | 1559 | } |
1561 | } | 1560 | } |
1562 | /* keep elevated page count for bad page */ | 1561 | unset_migratetype_isolate(page, MIGRATE_MOVABLE); |
1563 | return ret; | 1562 | return ret; |
1564 | } | 1563 | } |
1565 | 1564 | ||
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1625 | if (ret > 0) | 1624 | if (ret > 0) |
1626 | ret = -EIO; | 1625 | ret = -EIO; |
1627 | } else { | 1626 | } else { |
1627 | /* | ||
1628 | * After page migration succeeds, the source page can | ||
1629 | * be trapped in pagevec and actual freeing is delayed. | ||
1630 | * Freeing code works differently based on PG_hwpoison, | ||
1631 | * so there's a race. We need to make sure that the | ||
1632 | * source page should be freed back to buddy before | ||
1633 | * setting PG_hwpoison. | ||
1634 | */ | ||
1635 | if (!is_free_buddy_page(page)) | ||
1636 | lru_add_drain_all(); | ||
1637 | if (!is_free_buddy_page(page)) | ||
1638 | drain_all_pages(); | ||
1628 | SetPageHWPoison(page); | 1639 | SetPageHWPoison(page); |
1640 | if (!is_free_buddy_page(page)) | ||
1641 | pr_info("soft offline: %#lx: page leaked\n", | ||
1642 | pfn); | ||
1629 | atomic_long_inc(&num_poisoned_pages); | 1643 | atomic_long_inc(&num_poisoned_pages); |
1630 | } | 1644 | } |
1631 | } else { | 1645 | } else { |
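The soft-offline change above makes sure the migrated source page is flushed out of the per-CPU pagevecs and back into the buddy allocator before PG_hwpoison is set, because the freeing paths behave differently once the poison bit is visible. A rough user-space model of that drain-then-poison ordering, with a toy page state instead of struct page (not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Toy page state: the page is either parked in a per-CPU cache (pagevec /
 * pcp list) or back in the free pool ("buddy").  Not kernel code. */
struct page_state {
        bool in_pcpu_cache;
        bool in_buddy;
        bool poisoned;
};

/* Stand-in for lru_add_drain_all() / drain_all_pages(). */
static void drain_cpu_caches(struct page_state *p)
{
        if (p->in_pcpu_cache) {
                p->in_pcpu_cache = false;
                p->in_buddy = true;
        }
}

/* Mirrors the new ordering: push the page into the free pool first,
 * set the poison marker after, and report a leak if it never made it. */
static void finish_soft_offline(struct page_state *p)
{
        if (!p->in_buddy)
                drain_cpu_caches(p);
        p->poisoned = true;
        if (!p->in_buddy)
                printf("soft offline: page leaked\n");
}

int main(void)
{
        struct page_state page = { .in_pcpu_cache = true };

        finish_soft_offline(&page);
        printf("in_buddy=%d poisoned=%d\n", page.in_buddy, page.poisoned);
        return 0;
}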
diff --git a/mm/memory.c b/mm/memory.c index 95d0cce63583..1ce2e2a734fc 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr); | |||
82 | EXPORT_SYMBOL(mem_map); | 82 | EXPORT_SYMBOL(mem_map); |
83 | #endif | 83 | #endif |
84 | 84 | ||
85 | unsigned long num_physpages; | ||
86 | /* | 85 | /* |
87 | * A number of key systems in x86 including ioremap() rely on the assumption | 86 | * A number of key systems in x86 including ioremap() rely on the assumption |
88 | * that high_memory defines the upper bound on direct map memory, then end | 87 | * that high_memory defines the upper bound on direct map memory, then end |
@@ -92,7 +91,6 @@ unsigned long num_physpages; | |||
92 | */ | 91 | */ |
93 | void * high_memory; | 92 | void * high_memory; |
94 | 93 | ||
95 | EXPORT_SYMBOL(num_physpages); | ||
96 | EXPORT_SYMBOL(high_memory); | 94 | EXPORT_SYMBOL(high_memory); |
97 | 95 | ||
98 | /* | 96 | /* |
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
1101 | spinlock_t *ptl; | 1099 | spinlock_t *ptl; |
1102 | pte_t *start_pte; | 1100 | pte_t *start_pte; |
1103 | pte_t *pte; | 1101 | pte_t *pte; |
1102 | unsigned long range_start = addr; | ||
1104 | 1103 | ||
1105 | again: | 1104 | again: |
1106 | init_rss_vec(rss); | 1105 | init_rss_vec(rss); |
@@ -1151,7 +1150,7 @@ again: | |||
1151 | if (pte_dirty(ptent)) | 1150 | if (pte_dirty(ptent)) |
1152 | set_page_dirty(page); | 1151 | set_page_dirty(page); |
1153 | if (pte_young(ptent) && | 1152 | if (pte_young(ptent) && |
1154 | likely(!VM_SequentialReadHint(vma))) | 1153 | likely(!(vma->vm_flags & VM_SEQ_READ))) |
1155 | mark_page_accessed(page); | 1154 | mark_page_accessed(page); |
1156 | rss[MM_FILEPAGES]--; | 1155 | rss[MM_FILEPAGES]--; |
1157 | } | 1156 | } |
@@ -1206,12 +1205,14 @@ again: | |||
1206 | force_flush = 0; | 1205 | force_flush = 0; |
1207 | 1206 | ||
1208 | #ifdef HAVE_GENERIC_MMU_GATHER | 1207 | #ifdef HAVE_GENERIC_MMU_GATHER |
1209 | tlb->start = addr; | 1208 | tlb->start = range_start; |
1210 | tlb->end = end; | 1209 | tlb->end = addr; |
1211 | #endif | 1210 | #endif |
1212 | tlb_flush_mmu(tlb); | 1211 | tlb_flush_mmu(tlb); |
1213 | if (addr != end) | 1212 | if (addr != end) { |
1213 | range_start = addr; | ||
1214 | goto again; | 1214 | goto again; |
1215 | } | ||
1215 | } | 1216 | } |
1216 | 1217 | ||
1217 | return addr; | 1218 | return addr; |
@@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, | |||
2904 | details->first_index, details->last_index) { | 2905 | details->first_index, details->last_index) { |
2905 | 2906 | ||
2906 | vba = vma->vm_pgoff; | 2907 | vba = vma->vm_pgoff; |
2907 | vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; | 2908 | vea = vba + vma_pages(vma) - 1; |
2908 | /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ | 2909 | /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ |
2909 | zba = details->first_index; | 2910 | zba = details->first_index; |
2910 | if (zba < vba) | 2911 | if (zba < vba) |
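Two of the memory.c hunks (and one in mmap.c further down) replace an open-coded size calculation with the vma_pages() helper, which simply names the same shift. A small stand-alone illustration of the arithmetic, assuming 4 KiB pages and a toy vma type:

#include <stdio.h>

#define PAGE_SHIFT 12           /* 4 KiB pages assumed for the example */

/* Minimal stand-in for struct vm_area_struct's address range. */
struct vma { unsigned long vm_start, vm_end; };

/* Same expression the kernel's vma_pages() helper wraps. */
static unsigned long vma_pages(const struct vma *vma)
{
        return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}

int main(void)
{
        struct vma v = { .vm_start = 0x400000, .vm_end = 0x40a000 };

        printf("%lu pages\n", vma_pages(&v));   /* 10 pages of 4 KiB */
        return 0;
}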
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1ad92b46753e..ca1dd3aa5eee 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) | |||
75 | res->end = start + size - 1; | 75 | res->end = start + size - 1; |
76 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 76 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
77 | if (request_resource(&iomem_resource, res) < 0) { | 77 | if (request_resource(&iomem_resource, res) < 0) { |
78 | printk("System RAM resource %pR cannot be added\n", res); | 78 | pr_debug("System RAM resource %pR cannot be added\n", res); |
79 | kfree(res); | 79 | kfree(res); |
80 | res = NULL; | 80 | res = NULL; |
81 | } | 81 | } |
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page, | |||
101 | atomic_inc(&page->_count); | 101 | atomic_inc(&page->_count); |
102 | } | 102 | } |
103 | 103 | ||
104 | /* reference to __meminit __free_pages_bootmem is valid | 104 | void put_page_bootmem(struct page *page) |
105 | * so use __ref to tell modpost not to generate a warning */ | ||
106 | void __ref put_page_bootmem(struct page *page) | ||
107 | { | 105 | { |
108 | unsigned long type; | 106 | unsigned long type; |
109 | static DEFINE_MUTEX(ppb_lock); | ||
110 | 107 | ||
111 | type = (unsigned long) page->lru.next; | 108 | type = (unsigned long) page->lru.next; |
112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 109 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page) | |||
116 | ClearPagePrivate(page); | 113 | ClearPagePrivate(page); |
117 | set_page_private(page, 0); | 114 | set_page_private(page, 0); |
118 | INIT_LIST_HEAD(&page->lru); | 115 | INIT_LIST_HEAD(&page->lru); |
119 | 116 | free_reserved_page(page); | |
120 | /* | ||
121 | * Please refer to comment for __free_pages_bootmem() | ||
122 | * for why we serialize here. | ||
123 | */ | ||
124 | mutex_lock(&ppb_lock); | ||
125 | __free_pages_bootmem(page, 0); | ||
126 | mutex_unlock(&ppb_lock); | ||
127 | totalram_pages++; | ||
128 | } | 117 | } |
129 | |||
130 | } | 118 | } |
131 | 119 | ||
132 | #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE | 120 | #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE |
@@ -220,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) | |||
220 | pfn = pgdat->node_start_pfn; | 208 | pfn = pgdat->node_start_pfn; |
221 | end_pfn = pgdat_end_pfn(pgdat); | 209 | end_pfn = pgdat_end_pfn(pgdat); |
222 | 210 | ||
223 | /* register_section info */ | 211 | /* register section info */ |
224 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 212 | for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
225 | /* | 213 | /* |
226 | * Some platforms can assign the same pfn to multiple nodes - on | 214 | * Some platforms can assign the same pfn to multiple nodes - on |
227 | * node0 as well as nodeN. To avoid registering a pfn against | 215 | * node0 as well as nodeN. To avoid registering a pfn against |
228 | * multiple nodes we check that this pfn does not already | 216 | * multiple nodes we check that this pfn does not already |
229 | * reside in some other node. | 217 | * reside in some other nodes. |
230 | */ | 218 | */ |
231 | if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) | 219 | if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) |
232 | register_page_bootmem_info_section(pfn); | 220 | register_page_bootmem_info_section(pfn); |
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, | |||
309 | /* can't move pfns which are higher than @z2 */ | 297 | /* can't move pfns which are higher than @z2 */ |
310 | if (end_pfn > zone_end_pfn(z2)) | 298 | if (end_pfn > zone_end_pfn(z2)) |
311 | goto out_fail; | 299 | goto out_fail; |
312 | /* the move out part mast at the left most of @z2 */ | 300 | /* the move out part must be at the left most of @z2 */ |
313 | if (start_pfn > z2->zone_start_pfn) | 301 | if (start_pfn > z2->zone_start_pfn) |
314 | goto out_fail; | 302 | goto out_fail; |
315 | /* must included/overlap */ | 303 | /* must included/overlap */ |
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback); | |||
775 | 763 | ||
776 | void __online_page_set_limits(struct page *page) | 764 | void __online_page_set_limits(struct page *page) |
777 | { | 765 | { |
778 | unsigned long pfn = page_to_pfn(page); | ||
779 | |||
780 | if (pfn >= num_physpages) | ||
781 | num_physpages = pfn + 1; | ||
782 | } | 766 | } |
783 | EXPORT_SYMBOL_GPL(__online_page_set_limits); | 767 | EXPORT_SYMBOL_GPL(__online_page_set_limits); |
784 | 768 | ||
785 | void __online_page_increment_counters(struct page *page) | 769 | void __online_page_increment_counters(struct page *page) |
786 | { | 770 | { |
787 | totalram_pages++; | 771 | adjust_managed_page_count(page, 1); |
788 | |||
789 | #ifdef CONFIG_HIGHMEM | ||
790 | if (PageHighMem(page)) | ||
791 | totalhigh_pages++; | ||
792 | #endif | ||
793 | } | 772 | } |
794 | EXPORT_SYMBOL_GPL(__online_page_increment_counters); | 773 | EXPORT_SYMBOL_GPL(__online_page_increment_counters); |
795 | 774 | ||
796 | void __online_page_free(struct page *page) | 775 | void __online_page_free(struct page *page) |
797 | { | 776 | { |
798 | ClearPageReserved(page); | 777 | __free_reserved_page(page); |
799 | init_page_count(page); | ||
800 | __free_page(page); | ||
801 | } | 778 | } |
802 | EXPORT_SYMBOL_GPL(__online_page_free); | 779 | EXPORT_SYMBOL_GPL(__online_page_free); |
803 | 780 | ||
@@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
918 | 895 | ||
919 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | 896 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) |
920 | { | 897 | { |
898 | unsigned long flags; | ||
921 | unsigned long onlined_pages = 0; | 899 | unsigned long onlined_pages = 0; |
922 | struct zone *zone; | 900 | struct zone *zone; |
923 | int need_zonelists_rebuild = 0; | 901 | int need_zonelists_rebuild = 0; |
@@ -936,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
936 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && | 914 | if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && |
937 | !can_online_high_movable(zone)) { | 915 | !can_online_high_movable(zone)) { |
938 | unlock_memory_hotplug(); | 916 | unlock_memory_hotplug(); |
939 | return -1; | 917 | return -EINVAL; |
940 | } | 918 | } |
941 | 919 | ||
942 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { | 920 | if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { |
943 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { | 921 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { |
944 | unlock_memory_hotplug(); | 922 | unlock_memory_hotplug(); |
945 | return -1; | 923 | return -EINVAL; |
946 | } | 924 | } |
947 | } | 925 | } |
948 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { | 926 | if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { |
949 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { | 927 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { |
950 | unlock_memory_hotplug(); | 928 | unlock_memory_hotplug(); |
951 | return -1; | 929 | return -EINVAL; |
952 | } | 930 | } |
953 | } | 931 | } |
954 | 932 | ||
@@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
994 | return ret; | 972 | return ret; |
995 | } | 973 | } |
996 | 974 | ||
997 | zone->managed_pages += onlined_pages; | ||
998 | zone->present_pages += onlined_pages; | 975 | zone->present_pages += onlined_pages; |
976 | |||
977 | pgdat_resize_lock(zone->zone_pgdat, &flags); | ||
999 | zone->zone_pgdat->node_present_pages += onlined_pages; | 978 | zone->zone_pgdat->node_present_pages += onlined_pages; |
979 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | ||
980 | |||
1000 | if (onlined_pages) { | 981 | if (onlined_pages) { |
1001 | node_states_set_node(zone_to_nid(zone), &arg); | 982 | node_states_set_node(zone_to_nid(zone), &arg); |
1002 | if (need_zonelists_rebuild) | 983 | if (need_zonelists_rebuild) |
@@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1487 | unsigned long pfn, nr_pages, expire; | 1468 | unsigned long pfn, nr_pages, expire; |
1488 | long offlined_pages; | 1469 | long offlined_pages; |
1489 | int ret, drain, retry_max, node; | 1470 | int ret, drain, retry_max, node; |
1471 | unsigned long flags; | ||
1490 | struct zone *zone; | 1472 | struct zone *zone; |
1491 | struct memory_notify arg; | 1473 | struct memory_notify arg; |
1492 | 1474 | ||
@@ -1578,10 +1560,12 @@ repeat: | |||
1578 | /* reset pagetype flags and makes migrate type to be MOVABLE */ | 1560 | /* reset pagetype flags and makes migrate type to be MOVABLE */ |
1579 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1561 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
1580 | /* removal success */ | 1562 | /* removal success */ |
1581 | zone->managed_pages -= offlined_pages; | 1563 | adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); |
1582 | zone->present_pages -= offlined_pages; | 1564 | zone->present_pages -= offlined_pages; |
1565 | |||
1566 | pgdat_resize_lock(zone->zone_pgdat, &flags); | ||
1583 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 1567 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
1584 | totalram_pages -= offlined_pages; | 1568 | pgdat_resize_unlock(zone->zone_pgdat, &flags); |
1585 | 1569 | ||
1586 | init_per_zone_wmark_min(); | 1570 | init_per_zone_wmark_min(); |
1587 | 1571 | ||
@@ -1621,6 +1605,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | |||
1621 | { | 1605 | { |
1622 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1606 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); |
1623 | } | 1607 | } |
1608 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
1624 | 1609 | ||
1625 | /** | 1610 | /** |
1626 | * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) | 1611 | * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) |
@@ -1634,7 +1619,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | |||
1634 | * | 1619 | * |
1635 | * Returns the return value of func. | 1620 | * Returns the return value of func. |
1636 | */ | 1621 | */ |
1637 | static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, | 1622 | int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, |
1638 | void *arg, int (*func)(struct memory_block *, void *)) | 1623 | void *arg, int (*func)(struct memory_block *, void *)) |
1639 | { | 1624 | { |
1640 | struct memory_block *mem = NULL; | 1625 | struct memory_block *mem = NULL; |
@@ -1671,24 +1656,7 @@ static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, | |||
1671 | return 0; | 1656 | return 0; |
1672 | } | 1657 | } |
1673 | 1658 | ||
1674 | /** | 1659 | #ifdef CONFIG_MEMORY_HOTREMOVE |
1675 | * offline_memory_block_cb - callback function for offlining memory block | ||
1676 | * @mem: the memory block to be offlined | ||
1677 | * @arg: buffer to hold error msg | ||
1678 | * | ||
1679 | * Always return 0, and put the error msg in arg if any. | ||
1680 | */ | ||
1681 | static int offline_memory_block_cb(struct memory_block *mem, void *arg) | ||
1682 | { | ||
1683 | int *ret = arg; | ||
1684 | int error = offline_memory_block(mem); | ||
1685 | |||
1686 | if (error != 0 && *ret == 0) | ||
1687 | *ret = error; | ||
1688 | |||
1689 | return 0; | ||
1690 | } | ||
1691 | |||
1692 | static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) | 1660 | static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) |
1693 | { | 1661 | { |
1694 | int ret = !is_memblock_offlined(mem); | 1662 | int ret = !is_memblock_offlined(mem); |
@@ -1814,54 +1782,22 @@ void try_offline_node(int nid) | |||
1814 | } | 1782 | } |
1815 | EXPORT_SYMBOL(try_offline_node); | 1783 | EXPORT_SYMBOL(try_offline_node); |
1816 | 1784 | ||
1817 | int __ref remove_memory(int nid, u64 start, u64 size) | 1785 | void __ref remove_memory(int nid, u64 start, u64 size) |
1818 | { | 1786 | { |
1819 | unsigned long start_pfn, end_pfn; | 1787 | int ret; |
1820 | int ret = 0; | ||
1821 | int retry = 1; | ||
1822 | |||
1823 | start_pfn = PFN_DOWN(start); | ||
1824 | end_pfn = PFN_UP(start + size - 1); | ||
1825 | |||
1826 | /* | ||
1827 | * When CONFIG_MEMCG is on, one memory block may be used by other | ||
1828 | * blocks to store page cgroup when onlining pages. But we don't know | ||
1829 | * in what order pages are onlined. So we iterate twice to offline | ||
1830 | * memory: | ||
1831 | * 1st iterate: offline every non primary memory block. | ||
1832 | * 2nd iterate: offline primary (i.e. first added) memory block. | ||
1833 | */ | ||
1834 | repeat: | ||
1835 | walk_memory_range(start_pfn, end_pfn, &ret, | ||
1836 | offline_memory_block_cb); | ||
1837 | if (ret) { | ||
1838 | if (!retry) | ||
1839 | return ret; | ||
1840 | |||
1841 | retry = 0; | ||
1842 | ret = 0; | ||
1843 | goto repeat; | ||
1844 | } | ||
1845 | 1788 | ||
1846 | lock_memory_hotplug(); | 1789 | lock_memory_hotplug(); |
1847 | 1790 | ||
1848 | /* | 1791 | /* |
1849 | * we have offlined all memory blocks like this: | 1792 | * All memory blocks must be offlined before removing memory. Check |
1850 | * 1. lock memory hotplug | 1793 | * whether all memory blocks in question are offline and trigger a BUG() |
1851 | * 2. offline a memory block | 1794 | * if this is not the case. |
1852 | * 3. unlock memory hotplug | ||
1853 | * | ||
1854 | * repeat step1-3 to offline the memory block. All memory blocks | ||
1855 | * must be offlined before removing memory. But we don't hold the | ||
1856 | * lock in the whole operation. So we should check whether all | ||
1857 | * memory blocks are offlined. | ||
1858 | */ | 1795 | */ |
1859 | 1796 | ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, | |
1860 | ret = walk_memory_range(start_pfn, end_pfn, NULL, | ||
1861 | is_memblock_offlined_cb); | 1797 | is_memblock_offlined_cb); |
1862 | if (ret) { | 1798 | if (ret) { |
1863 | unlock_memory_hotplug(); | 1799 | unlock_memory_hotplug(); |
1864 | return ret; | 1800 | BUG(); |
1865 | } | 1801 | } |
1866 | 1802 | ||
1867 | /* remove memmap entry */ | 1803 | /* remove memmap entry */ |
@@ -1872,17 +1808,6 @@ repeat: | |||
1872 | try_offline_node(nid); | 1808 | try_offline_node(nid); |
1873 | 1809 | ||
1874 | unlock_memory_hotplug(); | 1810 | unlock_memory_hotplug(); |
1875 | |||
1876 | return 0; | ||
1877 | } | 1811 | } |
1878 | #else | ||
1879 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | ||
1880 | { | ||
1881 | return -EINVAL; | ||
1882 | } | ||
1883 | int remove_memory(int nid, u64 start, u64 size) | ||
1884 | { | ||
1885 | return -EINVAL; | ||
1886 | } | ||
1887 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
1888 | EXPORT_SYMBOL_GPL(remove_memory); | 1812 | EXPORT_SYMBOL_GPL(remove_memory); |
1813 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
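With the hunks above, remove_memory() no longer tries to offline anything itself: every memory block in the range must already be offline, and a block found online is treated as a caller bug (BUG()). A user-space sketch of that walk-and-assert contract, with toy memory blocks standing in for struct memory_block:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy memory block: offline must be true before removal is legal. */
struct memblk { int id; bool offline; };

/* Mirrors is_memblock_offlined_cb(): a nonzero return stops the walk. */
static int not_offlined_cb(const struct memblk *m, void *arg)
{
        (void)arg;
        return !m->offline;
}

/* Mirrors the walk_memory_range() contract: stop on first nonzero. */
static int walk_blocks(struct memblk *blks, int n,
                       int (*cb)(const struct memblk *, void *), void *arg)
{
        for (int i = 0; i < n; i++) {
                int ret = cb(&blks[i], arg);
                if (ret)
                        return ret;
        }
        return 0;
}

int main(void)
{
        struct memblk blks[] = { {0, true}, {1, true}, {2, false} };

        /* New remove_memory() behaviour: a still-online block is a caller
         * bug, so the kernel BUG()s; here we just abort(). */
        if (walk_blocks(blks, 3, not_offlined_cb, NULL)) {
                fprintf(stderr, "block still online: caller bug\n");
                abort();
        }
        puts("all blocks offline, safe to remove");
        return 0;
}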
diff --git a/mm/mm_init.c b/mm/mm_init.c index c280a02ea11e..633c08863fd8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -9,6 +9,8 @@ | |||
9 | #include <linux/init.h> | 9 | #include <linux/init.h> |
10 | #include <linux/kobject.h> | 10 | #include <linux/kobject.h> |
11 | #include <linux/export.h> | 11 | #include <linux/export.h> |
12 | #include <linux/memory.h> | ||
13 | #include <linux/notifier.h> | ||
12 | #include "internal.h" | 14 | #include "internal.h" |
13 | 15 | ||
14 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 16 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel); | |||
147 | struct kobject *mm_kobj; | 149 | struct kobject *mm_kobj; |
148 | EXPORT_SYMBOL_GPL(mm_kobj); | 150 | EXPORT_SYMBOL_GPL(mm_kobj); |
149 | 151 | ||
152 | #ifdef CONFIG_SMP | ||
153 | s32 vm_committed_as_batch = 32; | ||
154 | |||
155 | static void __meminit mm_compute_batch(void) | ||
156 | { | ||
157 | u64 memsized_batch; | ||
158 | s32 nr = num_present_cpus(); | ||
159 | s32 batch = max_t(s32, nr*2, 32); | ||
160 | |||
161 | /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ | ||
162 | memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); | ||
163 | |||
164 | vm_committed_as_batch = max_t(s32, memsized_batch, batch); | ||
165 | } | ||
166 | |||
167 | static int __meminit mm_compute_batch_notifier(struct notifier_block *self, | ||
168 | unsigned long action, void *arg) | ||
169 | { | ||
170 | switch (action) { | ||
171 | case MEM_ONLINE: | ||
172 | case MEM_OFFLINE: | ||
173 | mm_compute_batch(); | ||
174 | default: | ||
175 | break; | ||
176 | } | ||
177 | return NOTIFY_OK; | ||
178 | } | ||
179 | |||
180 | static struct notifier_block compute_batch_nb __meminitdata = { | ||
181 | .notifier_call = mm_compute_batch_notifier, | ||
182 | .priority = IPC_CALLBACK_PRI, /* use lowest priority */ | ||
183 | }; | ||
184 | |||
185 | static int __init mm_compute_batch_init(void) | ||
186 | { | ||
187 | mm_compute_batch(); | ||
188 | register_hotmemory_notifier(&compute_batch_nb); | ||
189 | |||
190 | return 0; | ||
191 | } | ||
192 | |||
193 | __initcall(mm_compute_batch_init); | ||
194 | |||
195 | #endif | ||
196 | |||
150 | static int __init mm_sysfs_init(void) | 197 | static int __init mm_sysfs_init(void) |
151 | { | 198 | { |
152 | mm_kobj = kobject_create_and_add("mm", kernel_kobj); | 199 | mm_kobj = kobject_create_and_add("mm", kernel_kobj); |
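The new mm_compute_batch() sizes the vm_committed_as percpu counter batch at roughly 0.4% of (total memory / number of CPUs), with a floor of max(2 * cpus, 32) and a cap of INT_MAX. A stand-alone version of the same formula with example inputs (16 GiB of 4 KiB pages on 8 CPUs, and a small 64 MiB / 2 CPU box), useful for sanity-checking the numbers:

#include <stdint.h>
#include <stdio.h>

/* Same formula as mm_compute_batch(), with the inputs passed in so it
 * can run in user space.  0.4% == 1/256 of (total pages / #cpus). */
static int32_t compute_batch(uint64_t totalram_pages, int32_t nr_cpus)
{
        int32_t floor = nr_cpus * 2 > 32 ? nr_cpus * 2 : 32;
        uint64_t memsized = (totalram_pages / nr_cpus) / 256;

        if (memsized > 0x7fffffff)              /* cap at max int32 */
                memsized = 0x7fffffff;
        return memsized > (uint64_t)floor ? (int32_t)memsized : floor;
}

int main(void)
{
        /* 16 GiB of 4 KiB pages (4194304 pages) on 8 CPUs. */
        printf("batch = %d\n", compute_batch(4194304, 8));  /* 2048 */
        /* 64 MiB (16384 pages) on 2 CPUs stays at the floor. */
        printf("batch = %d\n", compute_batch(16384, 2));    /* 32 */
        return 0;
}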
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | |||
955 | if (is_mergeable_vma(vma, file, vm_flags) && | 955 | if (is_mergeable_vma(vma, file, vm_flags) && |
956 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { | 956 | is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { |
957 | pgoff_t vm_pglen; | 957 | pgoff_t vm_pglen; |
958 | vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; | 958 | vm_pglen = vma_pages(vma); |
959 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) | 959 | if (vma->vm_pgoff + vm_pglen == vm_pgoff) |
960 | return 1; | 960 | return 1; |
961 | } | 961 | } |
@@ -1358,18 +1358,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1358 | 1358 | ||
1359 | if (!(flags & MAP_ANONYMOUS)) { | 1359 | if (!(flags & MAP_ANONYMOUS)) { |
1360 | audit_mmap_fd(fd, flags); | 1360 | audit_mmap_fd(fd, flags); |
1361 | if (unlikely(flags & MAP_HUGETLB)) | ||
1362 | return -EINVAL; | ||
1363 | file = fget(fd); | 1361 | file = fget(fd); |
1364 | if (!file) | 1362 | if (!file) |
1365 | goto out; | 1363 | goto out; |
1366 | if (is_file_hugepages(file)) | 1364 | if (is_file_hugepages(file)) |
1367 | len = ALIGN(len, huge_page_size(hstate_file(file))); | 1365 | len = ALIGN(len, huge_page_size(hstate_file(file))); |
1366 | retval = -EINVAL; | ||
1367 | if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) | ||
1368 | goto out_fput; | ||
1368 | } else if (flags & MAP_HUGETLB) { | 1369 | } else if (flags & MAP_HUGETLB) { |
1369 | struct user_struct *user = NULL; | 1370 | struct user_struct *user = NULL; |
1370 | struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & | 1371 | struct hstate *hs; |
1371 | SHM_HUGE_MASK); | ||
1372 | 1372 | ||
1373 | hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); | ||
1373 | if (!hs) | 1374 | if (!hs) |
1374 | return -EINVAL; | 1375 | return -EINVAL; |
1375 | 1376 | ||
@@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1391 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1392 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1392 | 1393 | ||
1393 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); | 1394 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1395 | out_fput: | ||
1394 | if (file) | 1396 | if (file) |
1395 | fput(file); | 1397 | fput(file); |
1396 | out: | 1398 | out: |
@@ -1876,15 +1878,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1876 | } | 1878 | } |
1877 | #endif | 1879 | #endif |
1878 | 1880 | ||
1879 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | ||
1880 | { | ||
1881 | /* | ||
1882 | * Is this a new hole at the lowest possible address? | ||
1883 | */ | ||
1884 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) | ||
1885 | mm->free_area_cache = addr; | ||
1886 | } | ||
1887 | |||
1888 | /* | 1881 | /* |
1889 | * This mmap-allocator allocates new areas top-down from below the | 1882 | * This mmap-allocator allocates new areas top-down from below the |
1890 | * stack's low limit (the base): | 1883 | * stack's low limit (the base): |
@@ -1941,19 +1934,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1941 | } | 1934 | } |
1942 | #endif | 1935 | #endif |
1943 | 1936 | ||
1944 | void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) | ||
1945 | { | ||
1946 | /* | ||
1947 | * Is this a new hole at the highest possible address? | ||
1948 | */ | ||
1949 | if (addr > mm->free_area_cache) | ||
1950 | mm->free_area_cache = addr; | ||
1951 | |||
1952 | /* dont allow allocations above current base */ | ||
1953 | if (mm->free_area_cache > mm->mmap_base) | ||
1954 | mm->free_area_cache = mm->mmap_base; | ||
1955 | } | ||
1956 | |||
1957 | unsigned long | 1937 | unsigned long |
1958 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | 1938 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, |
1959 | unsigned long pgoff, unsigned long flags) | 1939 | unsigned long pgoff, unsigned long flags) |
@@ -2374,7 +2354,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2374 | { | 2354 | { |
2375 | struct vm_area_struct **insertion_point; | 2355 | struct vm_area_struct **insertion_point; |
2376 | struct vm_area_struct *tail_vma = NULL; | 2356 | struct vm_area_struct *tail_vma = NULL; |
2377 | unsigned long addr; | ||
2378 | 2357 | ||
2379 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); | 2358 | insertion_point = (prev ? &prev->vm_next : &mm->mmap); |
2380 | vma->vm_prev = NULL; | 2359 | vma->vm_prev = NULL; |
@@ -2391,11 +2370,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2391 | } else | 2370 | } else |
2392 | mm->highest_vm_end = prev ? prev->vm_end : 0; | 2371 | mm->highest_vm_end = prev ? prev->vm_end : 0; |
2393 | tail_vma->vm_next = NULL; | 2372 | tail_vma->vm_next = NULL; |
2394 | if (mm->unmap_area == arch_unmap_area) | ||
2395 | addr = prev ? prev->vm_end : mm->mmap_base; | ||
2396 | else | ||
2397 | addr = vma ? vma->vm_start : mm->mmap_base; | ||
2398 | mm->unmap_area(mm, addr); | ||
2399 | mm->mmap_cache = NULL; /* Kill the cache. */ | 2373 | mm->mmap_cache = NULL; /* Kill the cache. */ |
2400 | } | 2374 | } |
2401 | 2375 | ||
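The mmap_pgoff() hunk above moves the MAP_HUGETLB sanity check after fget(): instead of rejecting every fd-based MAP_HUGETLB request up front, the file is resolved first and only a MAP_HUGETLB request on a non-hugetlbfs file fails with -EINVAL through the new out_fput label. A user-space sketch of that decision order, with flag values assumed for illustration and a toy file type:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MAP_ANONYMOUS 0x20      /* flag values assumed for the sketch */
#define MAP_HUGETLB   0x40000

/* Toy file handle: the only property modelled is "is it hugetlbfs?". */
struct file { bool is_hugepages; };

/* Mirrors the reordered checks in mmap_pgoff(): resolve the file first,
 * then reject MAP_HUGETLB on anything that is not a hugetlbfs file. */
static long check_file_flags(struct file *file, unsigned long flags)
{
        if (!(flags & MAP_ANONYMOUS)) {
                if (!file)
                        return -EBADF;          /* stand-in for "goto out" */
                if ((flags & MAP_HUGETLB) && !file->is_hugepages)
                        return -EINVAL;         /* the new out_fput path */
        }
        return 0;
}

int main(void)
{
        struct file regular = { .is_hugepages = false };
        struct file huge    = { .is_hugepages = true };

        printf("%ld\n", check_file_flags(&regular, MAP_HUGETLB));  /* -22 */
        printf("%ld\n", check_file_flags(&huge, MAP_HUGETLB));     /*  0  */
        return 0;
}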
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 6725ff183374..93e6089cb456 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -315,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
315 | 315 | ||
316 | /* | 316 | /* |
317 | * Wait for any running method to finish, of course including | 317 | * Wait for any running method to finish, of course including |
318 | * ->release if it was run by mmu_notifier_relase instead of us. | 318 | * ->release if it was run by mmu_notifier_release instead of us. |
319 | */ | 319 | */ |
320 | synchronize_srcu(&srcu); | 320 | synchronize_srcu(&srcu); |
321 | 321 | ||
diff --git a/mm/mremap.c b/mm/mremap.c index 463a25705ac6..457d34ef3bf2 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
126 | continue; | 126 | continue; |
127 | pte = ptep_get_and_clear(mm, old_addr, old_pte); | 127 | pte = ptep_get_and_clear(mm, old_addr, old_pte); |
128 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); | 128 | pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); |
129 | set_pte_at(mm, new_addr, new_pte, pte); | 129 | set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte)); |
130 | } | 130 | } |
131 | 131 | ||
132 | arch_leave_lazy_mmu_mode(); | 132 | arch_leave_lazy_mmu_mode(); |
@@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
456 | unsigned long charged = 0; | 456 | unsigned long charged = 0; |
457 | bool locked = false; | 457 | bool locked = false; |
458 | 458 | ||
459 | down_write(¤t->mm->mmap_sem); | ||
460 | |||
461 | if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) | 459 | if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) |
462 | goto out; | 460 | return ret; |
461 | |||
462 | if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) | ||
463 | return ret; | ||
463 | 464 | ||
464 | if (addr & ~PAGE_MASK) | 465 | if (addr & ~PAGE_MASK) |
465 | goto out; | 466 | return ret; |
466 | 467 | ||
467 | old_len = PAGE_ALIGN(old_len); | 468 | old_len = PAGE_ALIGN(old_len); |
468 | new_len = PAGE_ALIGN(new_len); | 469 | new_len = PAGE_ALIGN(new_len); |
@@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
473 | * a zero new-len is nonsensical. | 474 | * a zero new-len is nonsensical. |
474 | */ | 475 | */ |
475 | if (!new_len) | 476 | if (!new_len) |
476 | goto out; | 477 | return ret; |
478 | |||
479 | down_write(¤t->mm->mmap_sem); | ||
477 | 480 | ||
478 | if (flags & MREMAP_FIXED) { | 481 | if (flags & MREMAP_FIXED) { |
479 | if (flags & MREMAP_MAYMOVE) | 482 | ret = mremap_to(addr, old_len, new_addr, new_len, |
480 | ret = mremap_to(addr, old_len, new_addr, new_len, | 483 | &locked); |
481 | &locked); | ||
482 | goto out; | 484 | goto out; |
483 | } | 485 | } |
484 | 486 | ||
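The mremap() hunks above hoist the cheap argument checks in front of down_write(&mmap_sem), so obviously bogus calls never take the write lock, and they add an explicit rejection of MREMAP_FIXED without MREMAP_MAYMOVE. A stand-alone sketch of just that validation order (flag and page-size constants assumed for illustration):

#include <errno.h>
#include <stdio.h>

#define MREMAP_MAYMOVE 1        /* flag values assumed for the sketch */
#define MREMAP_FIXED   2
#define PAGE_MASK      (~0xfffUL)   /* 4 KiB pages assumed */

/* Mirrors the reordered checks: reject bad arguments before the point
 * where the real syscall takes down_write(&current->mm->mmap_sem). */
static long mremap_validate(unsigned long addr, unsigned long new_len,
                            unsigned long flags)
{
        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                return -EINVAL;
        if ((flags & MREMAP_FIXED) && !(flags & MREMAP_MAYMOVE))
                return -EINVAL;
        if (addr & ~PAGE_MASK)
                return -EINVAL;
        if (!new_len)
                return -EINVAL;
        return 0;       /* only now would the write lock be taken */
}

int main(void)
{
        printf("%ld\n", mremap_validate(0x1000, 0, MREMAP_MAYMOVE));    /* -22 */
        printf("%ld\n", mremap_validate(0x1001, 4096, 0));              /* -22 */
        printf("%ld\n", mremap_validate(0x1000, 4096, MREMAP_FIXED));   /* -22 */
        printf("%ld\n", mremap_validate(0x1000, 4096, MREMAP_MAYMOVE)); /*  0  */
        return 0;
}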
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bdd3fa2fc73b..61107cf55bb3 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void) | |||
137 | return count; | 137 | return count; |
138 | } | 138 | } |
139 | 139 | ||
140 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | 140 | static int reset_managed_pages_done __initdata; |
141 | |||
142 | static inline void __init reset_node_managed_pages(pg_data_t *pgdat) | ||
141 | { | 143 | { |
142 | struct zone *z; | 144 | struct zone *z; |
143 | 145 | ||
144 | /* | 146 | if (reset_managed_pages_done) |
145 | * In free_area_init_core(), highmem zone's managed_pages is set to | 147 | return; |
146 | * present_pages, and bootmem allocator doesn't allocate from highmem | ||
147 | * zones. So there's no need to recalculate managed_pages because all | ||
148 | * highmem pages will be managed by the buddy system. Here highmem | ||
149 | * zone also includes highmem movable zone. | ||
150 | */ | ||
151 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | 148 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) |
152 | if (!is_highmem(z)) | 149 | z->managed_pages = 0; |
153 | z->managed_pages = 0; | 150 | } |
151 | |||
152 | void __init reset_all_zones_managed_pages(void) | ||
153 | { | ||
154 | struct pglist_data *pgdat; | ||
155 | |||
156 | for_each_online_pgdat(pgdat) | ||
157 | reset_node_managed_pages(pgdat); | ||
158 | reset_managed_pages_done = 1; | ||
154 | } | 159 | } |
155 | 160 | ||
156 | /** | 161 | /** |
@@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | |||
160 | */ | 165 | */ |
161 | unsigned long __init free_all_bootmem(void) | 166 | unsigned long __init free_all_bootmem(void) |
162 | { | 167 | { |
163 | struct pglist_data *pgdat; | 168 | unsigned long pages; |
164 | 169 | ||
165 | for_each_online_pgdat(pgdat) | 170 | reset_all_zones_managed_pages(); |
166 | reset_node_lowmem_managed_pages(pgdat); | ||
167 | 171 | ||
168 | /* | 172 | /* |
169 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 173 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
170 | * because in some case like Node0 doesn't have RAM installed | 174 | * because in some case like Node0 doesn't have RAM installed |
171 | * low ram will be on Node1 | 175 | * low ram will be on Node1 |
172 | */ | 176 | */ |
173 | return free_low_memory_core_early(); | 177 | pages = free_low_memory_core_early(); |
178 | totalram_pages += pages; | ||
179 | |||
180 | return pages; | ||
174 | } | 181 | } |
175 | 182 | ||
176 | /** | 183 | /** |
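reset_all_zones_managed_pages() can now be reached from more than one caller, so it is guarded by a run-once flag (reset_managed_pages_done) to keep later calls from wiping managed_pages counts that have already been adjusted. A tiny user-space model of that guard:

#include <stdio.h>

static int reset_done;          /* mirrors reset_managed_pages_done */

/* Zero the per-zone managed page counts exactly once; later calls are
 * no-ops so accounting done in the meantime is not wiped out again. */
static void reset_all_managed(long *managed, int nzones)
{
        if (reset_done)
                return;
        for (int i = 0; i < nzones; i++)
                managed[i] = 0;
        reset_done = 1;
}

int main(void)
{
        long managed[3] = { 100, 200, 300 };

        reset_all_managed(managed, 3);   /* first call really resets */
        managed[0] = 42;                 /* subsequent accounting */
        reset_all_managed(managed, 3);   /* no-op: guard already set */
        printf("%ld\n", managed[0]);     /* prints 42, not 0 */
        return 0;
}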
diff --git a/mm/nommu.c b/mm/nommu.c index 298884dcd6e7..ecd1f158548e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -56,7 +56,6 @@ | |||
56 | void *high_memory; | 56 | void *high_memory; |
57 | struct page *mem_map; | 57 | struct page *mem_map; |
58 | unsigned long max_mapnr; | 58 | unsigned long max_mapnr; |
59 | unsigned long num_physpages; | ||
60 | unsigned long highest_memmap_pfn; | 59 | unsigned long highest_memmap_pfn; |
61 | struct percpu_counter vm_committed_as; | 60 | struct percpu_counter vm_committed_as; |
62 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 61 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void) | |||
85 | EXPORT_SYMBOL_GPL(vm_memory_committed); | 84 | EXPORT_SYMBOL_GPL(vm_memory_committed); |
86 | 85 | ||
87 | EXPORT_SYMBOL(mem_map); | 86 | EXPORT_SYMBOL(mem_map); |
88 | EXPORT_SYMBOL(num_physpages); | ||
89 | 87 | ||
90 | /* list of mapped, potentially shareable regions */ | 88 | /* list of mapped, potentially shareable regions */ |
91 | static struct kmem_cache *vm_region_jar; | 89 | static struct kmem_cache *vm_region_jar; |
@@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn); | |||
282 | 280 | ||
283 | long vread(char *buf, char *addr, unsigned long count) | 281 | long vread(char *buf, char *addr, unsigned long count) |
284 | { | 282 | { |
283 | /* Don't allow overflow */ | ||
284 | if ((unsigned long) buf + count < count) | ||
285 | count = -(unsigned long) buf; | ||
286 | |||
285 | memcpy(buf, addr, count); | 287 | memcpy(buf, addr, count); |
286 | return count; | 288 | return count; |
287 | } | 289 | } |
@@ -1869,10 +1871,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, | |||
1869 | return -ENOMEM; | 1871 | return -ENOMEM; |
1870 | } | 1872 | } |
1871 | 1873 | ||
1872 | void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | ||
1873 | { | ||
1874 | } | ||
1875 | |||
1876 | void unmap_mapping_range(struct address_space *mapping, | 1874 | void unmap_mapping_range(struct address_space *mapping, |
1877 | loff_t const holebegin, loff_t const holelen, | 1875 | loff_t const holebegin, loff_t const holelen, |
1878 | int even_cows) | 1876 | int even_cows) |
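The nommu vread() hunk adds an overflow clamp: if buf + count wraps around the top of the address space, count is shrunk to the bytes that actually remain above buf (-(unsigned long)buf). A small stand-alone demonstration of the arithmetic:

#include <stdio.h>

/* Same clamp vread() now applies before its memcpy(): if buf + count
 * wraps past the top of the address space, shrink count to the bytes
 * that remain between buf and the end of the address space. */
static unsigned long clamp_count(unsigned long buf, unsigned long count)
{
        if (buf + count < count)        /* unsigned wraparound detected */
                count = -buf;           /* bytes left up to the top */
        return count;
}

int main(void)
{
        unsigned long top_buf = ~0UL - 15;   /* 16 bytes below the top */

        printf("%lu\n", clamp_count(top_buf, 4096));   /* clamped to 16 */
        printf("%lu\n", clamp_count(0x1000, 4096));    /* unchanged: 4096 */
        return 0;
}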
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3edb624fccf..b100255dedda 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -61,10 +61,14 @@ | |||
61 | #include <linux/hugetlb.h> | 61 | #include <linux/hugetlb.h> |
62 | #include <linux/sched/rt.h> | 62 | #include <linux/sched/rt.h> |
63 | 63 | ||
64 | #include <asm/sections.h> | ||
64 | #include <asm/tlbflush.h> | 65 | #include <asm/tlbflush.h> |
65 | #include <asm/div64.h> | 66 | #include <asm/div64.h> |
66 | #include "internal.h" | 67 | #include "internal.h" |
67 | 68 | ||
69 | /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ | ||
70 | static DEFINE_MUTEX(pcp_batch_high_lock); | ||
71 | |||
68 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID | 72 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID |
69 | DEFINE_PER_CPU(int, numa_node); | 73 | DEFINE_PER_CPU(int, numa_node); |
70 | EXPORT_PER_CPU_SYMBOL(numa_node); | 74 | EXPORT_PER_CPU_SYMBOL(numa_node); |
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
100 | }; | 104 | }; |
101 | EXPORT_SYMBOL(node_states); | 105 | EXPORT_SYMBOL(node_states); |
102 | 106 | ||
107 | /* Protect totalram_pages and zone->managed_pages */ | ||
108 | static DEFINE_SPINLOCK(managed_page_count_lock); | ||
109 | |||
103 | unsigned long totalram_pages __read_mostly; | 110 | unsigned long totalram_pages __read_mostly; |
104 | unsigned long totalreserve_pages __read_mostly; | 111 | unsigned long totalreserve_pages __read_mostly; |
105 | /* | 112 | /* |
@@ -197,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
197 | }; | 204 | }; |
198 | 205 | ||
199 | int min_free_kbytes = 1024; | 206 | int min_free_kbytes = 1024; |
207 | int user_min_free_kbytes; | ||
200 | 208 | ||
201 | static unsigned long __meminitdata nr_kernel_pages; | 209 | static unsigned long __meminitdata nr_kernel_pages; |
202 | static unsigned long __meminitdata nr_all_pages; | 210 | static unsigned long __meminitdata nr_all_pages; |
@@ -739,14 +747,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
739 | local_irq_restore(flags); | 747 | local_irq_restore(flags); |
740 | } | 748 | } |
741 | 749 | ||
742 | /* | 750 | void __init __free_pages_bootmem(struct page *page, unsigned int order) |
743 | * Read access to zone->managed_pages is safe because it's unsigned long, | ||
744 | * but we still need to serialize writers. Currently all callers of | ||
745 | * __free_pages_bootmem() except put_page_bootmem() should only be used | ||
746 | * at boot time. So for shorter boot time, we shift the burden to | ||
747 | * put_page_bootmem() to serialize writers. | ||
748 | */ | ||
749 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | ||
750 | { | 751 | { |
751 | unsigned int nr_pages = 1 << order; | 752 | unsigned int nr_pages = 1 << order; |
752 | unsigned int loop; | 753 | unsigned int loop; |
@@ -781,11 +782,7 @@ void __init init_cma_reserved_pageblock(struct page *page) | |||
781 | set_page_refcounted(page); | 782 | set_page_refcounted(page); |
782 | set_pageblock_migratetype(page, MIGRATE_CMA); | 783 | set_pageblock_migratetype(page, MIGRATE_CMA); |
783 | __free_pages(page, pageblock_order); | 784 | __free_pages(page, pageblock_order); |
784 | totalram_pages += pageblock_nr_pages; | 785 | adjust_managed_page_count(page, pageblock_nr_pages); |
785 | #ifdef CONFIG_HIGHMEM | ||
786 | if (PageHighMem(page)) | ||
787 | totalhigh_pages += pageblock_nr_pages; | ||
788 | #endif | ||
789 | } | 786 | } |
790 | #endif | 787 | #endif |
791 | 788 | ||
@@ -1050,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | |||
1050 | * MIGRATE_CMA areas. | 1047 | * MIGRATE_CMA areas. |
1051 | */ | 1048 | */ |
1052 | if (!is_migrate_cma(migratetype) && | 1049 | if (!is_migrate_cma(migratetype) && |
1053 | (unlikely(current_order >= pageblock_order / 2) || | 1050 | (current_order >= pageblock_order / 2 || |
1054 | start_migratetype == MIGRATE_RECLAIMABLE || | 1051 | start_migratetype == MIGRATE_RECLAIMABLE || |
1055 | page_group_by_mobility_disabled)) { | 1052 | page_group_by_mobility_disabled)) { |
1056 | int pages; | 1053 | int pages; |
@@ -1179,10 +1176,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1179 | { | 1176 | { |
1180 | unsigned long flags; | 1177 | unsigned long flags; |
1181 | int to_drain; | 1178 | int to_drain; |
1179 | unsigned long batch; | ||
1182 | 1180 | ||
1183 | local_irq_save(flags); | 1181 | local_irq_save(flags); |
1184 | if (pcp->count >= pcp->batch) | 1182 | batch = ACCESS_ONCE(pcp->batch); |
1185 | to_drain = pcp->batch; | 1183 | if (pcp->count >= batch) |
1184 | to_drain = batch; | ||
1186 | else | 1185 | else |
1187 | to_drain = pcp->count; | 1186 | to_drain = pcp->count; |
1188 | if (to_drain > 0) { | 1187 | if (to_drain > 0) { |
@@ -1350,8 +1349,9 @@ void free_hot_cold_page(struct page *page, int cold) | |||
1350 | list_add(&page->lru, &pcp->lists[migratetype]); | 1349 | list_add(&page->lru, &pcp->lists[migratetype]); |
1351 | pcp->count++; | 1350 | pcp->count++; |
1352 | if (pcp->count >= pcp->high) { | 1351 | if (pcp->count >= pcp->high) { |
1353 | free_pcppages_bulk(zone, pcp->batch, pcp); | 1352 | unsigned long batch = ACCESS_ONCE(pcp->batch); |
1354 | pcp->count -= pcp->batch; | 1353 | free_pcppages_bulk(zone, batch, pcp); |
1354 | pcp->count -= batch; | ||
1355 | } | 1355 | } |
1356 | 1356 | ||
1357 | out: | 1357 | out: |
@@ -2839,7 +2839,7 @@ EXPORT_SYMBOL(free_pages_exact); | |||
2839 | * nr_free_zone_pages() counts the number of counts pages which are beyond the | 2839 | * nr_free_zone_pages() counts the number of counts pages which are beyond the |
2840 | * high watermark within all zones at or below a given zone index. For each | 2840 | * high watermark within all zones at or below a given zone index. For each |
2841 | * zone, the number of pages is calculated as: | 2841 | * zone, the number of pages is calculated as: |
2842 | * present_pages - high_pages | 2842 | * managed_pages - high_pages |
2843 | */ | 2843 | */ |
2844 | static unsigned long nr_free_zone_pages(int offset) | 2844 | static unsigned long nr_free_zone_pages(int offset) |
2845 | { | 2845 | { |
@@ -2906,9 +2906,13 @@ EXPORT_SYMBOL(si_meminfo); | |||
2906 | #ifdef CONFIG_NUMA | 2906 | #ifdef CONFIG_NUMA |
2907 | void si_meminfo_node(struct sysinfo *val, int nid) | 2907 | void si_meminfo_node(struct sysinfo *val, int nid) |
2908 | { | 2908 | { |
2909 | int zone_type; /* needs to be signed */ | ||
2910 | unsigned long managed_pages = 0; | ||
2909 | pg_data_t *pgdat = NODE_DATA(nid); | 2911 | pg_data_t *pgdat = NODE_DATA(nid); |
2910 | 2912 | ||
2911 | val->totalram = pgdat->node_present_pages; | 2913 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
2914 | managed_pages += pgdat->node_zones[zone_type].managed_pages; | ||
2915 | val->totalram = managed_pages; | ||
2912 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 2916 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
2913 | #ifdef CONFIG_HIGHMEM | 2917 | #ifdef CONFIG_HIGHMEM |
2914 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; | 2918 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; |
@@ -3150,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | |||
3150 | * Add all populated zones of a node to the zonelist. | 3154 | * Add all populated zones of a node to the zonelist. |
3151 | */ | 3155 | */ |
3152 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | 3156 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, |
3153 | int nr_zones, enum zone_type zone_type) | 3157 | int nr_zones) |
3154 | { | 3158 | { |
3155 | struct zone *zone; | 3159 | struct zone *zone; |
3156 | 3160 | enum zone_type zone_type = MAX_NR_ZONES; | |
3157 | BUG_ON(zone_type >= MAX_NR_ZONES); | ||
3158 | zone_type++; | ||
3159 | 3161 | ||
3160 | do { | 3162 | do { |
3161 | zone_type--; | 3163 | zone_type--; |
@@ -3165,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | |||
3165 | &zonelist->_zonerefs[nr_zones++]); | 3167 | &zonelist->_zonerefs[nr_zones++]); |
3166 | check_highest_zone(zone_type); | 3168 | check_highest_zone(zone_type); |
3167 | } | 3169 | } |
3168 | |||
3169 | } while (zone_type); | 3170 | } while (zone_type); |
3171 | |||
3170 | return nr_zones; | 3172 | return nr_zones; |
3171 | } | 3173 | } |
3172 | 3174 | ||
@@ -3250,18 +3252,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
3250 | static DEFINE_MUTEX(zl_order_mutex); | 3252 | static DEFINE_MUTEX(zl_order_mutex); |
3251 | 3253 | ||
3252 | mutex_lock(&zl_order_mutex); | 3254 | mutex_lock(&zl_order_mutex); |
3253 | if (write) | 3255 | if (write) { |
3254 | strcpy(saved_string, (char*)table->data); | 3256 | if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { |
3257 | ret = -EINVAL; | ||
3258 | goto out; | ||
3259 | } | ||
3260 | strcpy(saved_string, (char *)table->data); | ||
3261 | } | ||
3255 | ret = proc_dostring(table, write, buffer, length, ppos); | 3262 | ret = proc_dostring(table, write, buffer, length, ppos); |
3256 | if (ret) | 3263 | if (ret) |
3257 | goto out; | 3264 | goto out; |
3258 | if (write) { | 3265 | if (write) { |
3259 | int oldval = user_zonelist_order; | 3266 | int oldval = user_zonelist_order; |
3260 | if (__parse_numa_zonelist_order((char*)table->data)) { | 3267 | |
3268 | ret = __parse_numa_zonelist_order((char *)table->data); | ||
3269 | if (ret) { | ||
3261 | /* | 3270 | /* |
3262 | * bogus value. restore saved string | 3271 | * bogus value. restore saved string |
3263 | */ | 3272 | */ |
3264 | strncpy((char*)table->data, saved_string, | 3273 | strncpy((char *)table->data, saved_string, |
3265 | NUMA_ZONELIST_ORDER_LEN); | 3274 | NUMA_ZONELIST_ORDER_LEN); |
3266 | user_zonelist_order = oldval; | 3275 | user_zonelist_order = oldval; |
3267 | } else if (oldval != user_zonelist_order) { | 3276 | } else if (oldval != user_zonelist_order) { |
@@ -3353,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
3353 | zonelist = &pgdat->node_zonelists[0]; | 3362 | zonelist = &pgdat->node_zonelists[0]; |
3354 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) | 3363 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) |
3355 | ; | 3364 | ; |
3356 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3365 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); |
3357 | MAX_NR_ZONES - 1); | ||
3358 | zonelist->_zonerefs[j].zone = NULL; | 3366 | zonelist->_zonerefs[j].zone = NULL; |
3359 | zonelist->_zonerefs[j].zone_idx = 0; | 3367 | zonelist->_zonerefs[j].zone_idx = 0; |
3360 | } | 3368 | } |
@@ -3368,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) | |||
3368 | struct zonelist *zonelist; | 3376 | struct zonelist *zonelist; |
3369 | 3377 | ||
3370 | zonelist = &pgdat->node_zonelists[1]; | 3378 | zonelist = &pgdat->node_zonelists[1]; |
3371 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 3379 | j = build_zonelists_node(pgdat, zonelist, 0); |
3372 | zonelist->_zonerefs[j].zone = NULL; | 3380 | zonelist->_zonerefs[j].zone = NULL; |
3373 | zonelist->_zonerefs[j].zone_idx = 0; | 3381 | zonelist->_zonerefs[j].zone_idx = 0; |
3374 | } | 3382 | } |
@@ -3425,8 +3433,8 @@ static int default_zonelist_order(void) | |||
3425 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 3433 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
3426 | if (populated_zone(z)) { | 3434 | if (populated_zone(z)) { |
3427 | if (zone_type < ZONE_NORMAL) | 3435 | if (zone_type < ZONE_NORMAL) |
3428 | low_kmem_size += z->present_pages; | 3436 | low_kmem_size += z->managed_pages; |
3429 | total_size += z->present_pages; | 3437 | total_size += z->managed_pages; |
3430 | } else if (zone_type == ZONE_NORMAL) { | 3438 | } else if (zone_type == ZONE_NORMAL) { |
3431 | /* | 3439 | /* |
3432 | * If any node has only lowmem, then node order | 3440 | * If any node has only lowmem, then node order |
@@ -3576,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
3576 | local_node = pgdat->node_id; | 3584 | local_node = pgdat->node_id; |
3577 | 3585 | ||
3578 | zonelist = &pgdat->node_zonelists[0]; | 3586 | zonelist = &pgdat->node_zonelists[0]; |
3579 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 3587 | j = build_zonelists_node(pgdat, zonelist, 0); |
3580 | 3588 | ||
3581 | /* | 3589 | /* |
3582 | * Now we build the zonelist so that it contains the zones | 3590 | * Now we build the zonelist so that it contains the zones |
@@ -3589,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat) | |||
3589 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 3597 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
3590 | if (!node_online(node)) | 3598 | if (!node_online(node)) |
3591 | continue; | 3599 | continue; |
3592 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3600 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); |
3593 | MAX_NR_ZONES - 1); | ||
3594 | } | 3601 | } |
3595 | for (node = 0; node < local_node; node++) { | 3602 | for (node = 0; node < local_node; node++) { |
3596 | if (!node_online(node)) | 3603 | if (!node_online(node)) |
3597 | continue; | 3604 | continue; |
3598 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3605 | j = build_zonelists_node(NODE_DATA(node), zonelist, j); |
3599 | MAX_NR_ZONES - 1); | ||
3600 | } | 3606 | } |
3601 | 3607 | ||
3602 | zonelist->_zonerefs[j].zone = NULL; | 3608 | zonelist->_zonerefs[j].zone = NULL; |
@@ -3705,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | |||
3705 | mminit_verify_zonelist(); | 3711 | mminit_verify_zonelist(); |
3706 | cpuset_init_current_mems_allowed(); | 3712 | cpuset_init_current_mems_allowed(); |
3707 | } else { | 3713 | } else { |
3708 | /* we have to stop all cpus to guarantee there is no user | ||
3709 | of zonelist */ | ||
3710 | #ifdef CONFIG_MEMORY_HOTPLUG | 3714 | #ifdef CONFIG_MEMORY_HOTPLUG |
3711 | if (zone) | 3715 | if (zone) |
3712 | setup_zone_pageset(zone); | 3716 | setup_zone_pageset(zone); |
3713 | #endif | 3717 | #endif |
3718 | /* we have to stop all cpus to guarantee there is no user | ||
3719 | of zonelist */ | ||
3714 | stop_machine(__build_all_zonelists, pgdat, NULL); | 3720 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3715 | /* cpuset refresh routine should be here */ | 3721 | /* cpuset refresh routine should be here */ |
3716 | } | 3722 | } |
@@ -4032,7 +4038,40 @@ static int __meminit zone_batchsize(struct zone *zone) | |||
4032 | #endif | 4038 | #endif |
4033 | } | 4039 | } |
4034 | 4040 | ||
4035 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 4041 | /* |
4042 | * pcp->high and pcp->batch values are related and dependent on one another: | ||
4043 | * ->batch must never be higher then ->high. | ||
4044 | * The following function updates them in a safe manner without read side | ||
4045 | * locking. | ||
4046 | * | ||
4047 | * Any new users of pcp->batch and pcp->high should ensure they can cope with | ||
4048 | * those fields changing asynchronously (acording the the above rule). | ||
4049 | * | ||
4050 | * mutex_is_locked(&pcp_batch_high_lock) required when calling this function | ||
4051 | * outside of boot time (or some other assurance that no concurrent updaters | ||
4052 | * exist). | ||
4053 | */ | ||
4054 | static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, | ||
4055 | unsigned long batch) | ||
4056 | { | ||
4057 | /* start with a fail safe value for batch */ | ||
4058 | pcp->batch = 1; | ||
4059 | smp_wmb(); | ||
4060 | |||
4061 | /* Update high, then batch, in order */ | ||
4062 | pcp->high = high; | ||
4063 | smp_wmb(); | ||
4064 | |||
4065 | pcp->batch = batch; | ||
4066 | } | ||
4067 | |||
4068 | /* a companion to pageset_set_high() */ | ||
4069 | static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) | ||
4070 | { | ||
4071 | pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); | ||
4072 | } | ||
4073 | |||
4074 | static void pageset_init(struct per_cpu_pageset *p) | ||
4036 | { | 4075 | { |
4037 | struct per_cpu_pages *pcp; | 4076 | struct per_cpu_pages *pcp; |
4038 | int migratetype; | 4077 | int migratetype; |
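The pageset_update() helper added in the hunk above publishes a new (high, batch) pair without read-side locking: batch is first dropped to a fail-safe 1, then high is written, then the real batch, with smp_wmb() between the stores; readers such as free_hot_cold_page() and drain_zone_pages() take a single ACCESS_ONCE() snapshot of batch per use. A rough user-space model of that protocol, using C11 atomics in place of smp_wmb()/ACCESS_ONCE() (an approximation, not the kernel primitives):

#include <stdatomic.h>
#include <stdio.h>

/* User-space model of the pcp fields; C11 release stores and a single
 * acquire load per use approximate smp_wmb() and ACCESS_ONCE(). */
struct pcp { _Atomic unsigned long high, batch; };

static void pageset_update(struct pcp *p, unsigned long high,
                           unsigned long batch)
{
        /* fail-safe value first: a racing reader sees at worst batch == 1 */
        atomic_store_explicit(&p->batch, 1, memory_order_release);
        atomic_store_explicit(&p->high, high, memory_order_release);
        atomic_store_explicit(&p->batch, batch, memory_order_release);
}

/* Reader side, shaped like drain_zone_pages(): snapshot batch once and
 * never re-read it while deciding how much to free. */
static unsigned long to_drain(struct pcp *p, unsigned long count)
{
        unsigned long batch = atomic_load_explicit(&p->batch,
                                                   memory_order_acquire);
        return count >= batch ? batch : count;
}

int main(void)
{
        struct pcp p;

        pageset_update(&p, 6 * 31, 31);   /* boot-style high/batch pair */
        printf("drain %lu pages\n", to_drain(&p, 500));   /* 31 */

        pageset_update(&p, 1000, 250);    /* e.g. percpu_pagelist_fraction */
        printf("drain %lu pages\n", to_drain(&p, 500));   /* 250 */
        return 0;
}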
@@ -4041,45 +4080,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
4041 | 4080 | ||
4042 | pcp = &p->pcp; | 4081 | pcp = &p->pcp; |
4043 | pcp->count = 0; | 4082 | pcp->count = 0; |
4044 | pcp->high = 6 * batch; | ||
4045 | pcp->batch = max(1UL, 1 * batch); | ||
4046 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) | 4083 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
4047 | INIT_LIST_HEAD(&pcp->lists[migratetype]); | 4084 | INIT_LIST_HEAD(&pcp->lists[migratetype]); |
4048 | } | 4085 | } |
4049 | 4086 | ||
4087 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | ||
4088 | { | ||
4089 | pageset_init(p); | ||
4090 | pageset_set_batch(p, batch); | ||
4091 | } | ||
4092 | |||
4050 | /* | 4093 | /* |
4051 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | 4094 | * pageset_set_high() sets the high water mark for hot per_cpu_pagelist |
4052 | * to the value high for the pageset p. | 4095 | * to the value high for the pageset p. |
4053 | */ | 4096 | */ |
4054 | 4097 | static void pageset_set_high(struct per_cpu_pageset *p, | |
4055 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | ||
4056 | unsigned long high) | 4098 | unsigned long high) |
4057 | { | 4099 | { |
4058 | struct per_cpu_pages *pcp; | 4100 | unsigned long batch = max(1UL, high / 4); |
4101 | if ((high / 4) > (PAGE_SHIFT * 8)) | ||
4102 | batch = PAGE_SHIFT * 8; | ||
4059 | 4103 | ||
4060 | pcp = &p->pcp; | 4104 | pageset_update(&p->pcp, high, batch); |
4061 | pcp->high = high; | ||
4062 | pcp->batch = max(1UL, high/4); | ||
4063 | if ((high/4) > (PAGE_SHIFT * 8)) | ||
4064 | pcp->batch = PAGE_SHIFT * 8; | ||
4065 | } | 4105 | } |
4066 | 4106 | ||
4067 | static void __meminit setup_zone_pageset(struct zone *zone) | 4107 | static void __meminit pageset_set_high_and_batch(struct zone *zone, |
4108 | struct per_cpu_pageset *pcp) | ||
4068 | { | 4109 | { |
4069 | int cpu; | 4110 | if (percpu_pagelist_fraction) |
4070 | 4111 | pageset_set_high(pcp, | |
4071 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 4112 | (zone->managed_pages / |
4113 | percpu_pagelist_fraction)); | ||
4114 | else | ||
4115 | pageset_set_batch(pcp, zone_batchsize(zone)); | ||
4116 | } | ||
4072 | 4117 | ||
4073 | for_each_possible_cpu(cpu) { | 4118 | static void __meminit zone_pageset_init(struct zone *zone, int cpu) |
4074 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | 4119 | { |
4120 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
4075 | 4121 | ||
4076 | setup_pageset(pcp, zone_batchsize(zone)); | 4122 | pageset_init(pcp); |
4123 | pageset_set_high_and_batch(zone, pcp); | ||
4124 | } | ||
4077 | 4125 | ||
4078 | if (percpu_pagelist_fraction) | 4126 | static void __meminit setup_zone_pageset(struct zone *zone) |
4079 | setup_pagelist_highmark(pcp, | 4127 | { |
4080 | (zone->managed_pages / | 4128 | int cpu; |
4081 | percpu_pagelist_fraction)); | 4129 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
4082 | } | 4130 | for_each_possible_cpu(cpu) |
4131 | zone_pageset_init(zone, cpu); | ||
4083 | } | 4132 | } |
4084 | 4133 | ||
4085 | /* | 4134 | /* |
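The per-cpu pageset rework above depends on readers tolerating pcp->high and pcp->batch changing underneath them: pageset_update() first drops batch to the fail-safe value 1, then publishes the new high, then the new batch, with smp_wmb() between the stores. A minimal consumer sketch under those rules (the function name drain_if_full is invented; the same load-once pattern is used by the page-free fast path in mm/page_alloc.c):

        /* Sketch: read the limits once so a concurrent pageset_update() cannot
         * change them between the check and the free; either the old or the
         * new value is acceptable per the comment above pageset_update().
         * Must run with interrupts disabled, like the real pcp users. */
        static void drain_if_full(struct zone *zone, struct per_cpu_pages *pcp)
        {
                if (pcp->count >= ACCESS_ONCE(pcp->high)) {
                        int batch = ACCESS_ONCE(pcp->batch);

                        free_pcppages_bulk(zone, batch, pcp);
                        pcp->count -= batch;
                }
        }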
@@ -4368,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, | |||
4368 | */ | 4417 | */ |
4369 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4418 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4370 | unsigned long zone_type, | 4419 | unsigned long zone_type, |
4420 | unsigned long node_start_pfn, | ||
4421 | unsigned long node_end_pfn, | ||
4371 | unsigned long *ignored) | 4422 | unsigned long *ignored) |
4372 | { | 4423 | { |
4373 | unsigned long node_start_pfn, node_end_pfn; | ||
4374 | unsigned long zone_start_pfn, zone_end_pfn; | 4424 | unsigned long zone_start_pfn, zone_end_pfn; |
4375 | 4425 | ||
4376 | /* Get the start and end of the node and zone */ | 4426 | /* Get the start and end of the zone */ |
4377 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
4378 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 4427 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
4379 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 4428 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
4380 | adjust_zone_range_for_zone_movable(nid, zone_type, | 4429 | adjust_zone_range_for_zone_movable(nid, zone_type, |
@@ -4429,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, | |||
4429 | /* Return the number of page frames in holes in a zone on a node */ | 4478 | /* Return the number of page frames in holes in a zone on a node */ |
4430 | static unsigned long __meminit zone_absent_pages_in_node(int nid, | 4479 | static unsigned long __meminit zone_absent_pages_in_node(int nid, |
4431 | unsigned long zone_type, | 4480 | unsigned long zone_type, |
4481 | unsigned long node_start_pfn, | ||
4482 | unsigned long node_end_pfn, | ||
4432 | unsigned long *ignored) | 4483 | unsigned long *ignored) |
4433 | { | 4484 | { |
4434 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; | 4485 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
4435 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; | 4486 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
4436 | unsigned long node_start_pfn, node_end_pfn; | ||
4437 | unsigned long zone_start_pfn, zone_end_pfn; | 4487 | unsigned long zone_start_pfn, zone_end_pfn; |
4438 | 4488 | ||
4439 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | ||
4440 | zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); | 4489 | zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); |
4441 | zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); | 4490 | zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); |
4442 | 4491 | ||
@@ -4449,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
4449 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4498 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4450 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4499 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4451 | unsigned long zone_type, | 4500 | unsigned long zone_type, |
4501 | unsigned long node_start_pfn, | ||
4502 | unsigned long node_end_pfn, | ||
4452 | unsigned long *zones_size) | 4503 | unsigned long *zones_size) |
4453 | { | 4504 | { |
4454 | return zones_size[zone_type]; | 4505 | return zones_size[zone_type]; |
@@ -4456,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
4456 | 4507 | ||
4457 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | 4508 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
4458 | unsigned long zone_type, | 4509 | unsigned long zone_type, |
4510 | unsigned long node_start_pfn, | ||
4511 | unsigned long node_end_pfn, | ||
4459 | unsigned long *zholes_size) | 4512 | unsigned long *zholes_size) |
4460 | { | 4513 | { |
4461 | if (!zholes_size) | 4514 | if (!zholes_size) |
@@ -4467,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
4467 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4520 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4468 | 4521 | ||
4469 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 4522 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
4470 | unsigned long *zones_size, unsigned long *zholes_size) | 4523 | unsigned long node_start_pfn, |
4524 | unsigned long node_end_pfn, | ||
4525 | unsigned long *zones_size, | ||
4526 | unsigned long *zholes_size) | ||
4471 | { | 4527 | { |
4472 | unsigned long realtotalpages, totalpages = 0; | 4528 | unsigned long realtotalpages, totalpages = 0; |
4473 | enum zone_type i; | 4529 | enum zone_type i; |
4474 | 4530 | ||
4475 | for (i = 0; i < MAX_NR_ZONES; i++) | 4531 | for (i = 0; i < MAX_NR_ZONES; i++) |
4476 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | 4532 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, |
4477 | zones_size); | 4533 | node_start_pfn, |
4534 | node_end_pfn, | ||
4535 | zones_size); | ||
4478 | pgdat->node_spanned_pages = totalpages; | 4536 | pgdat->node_spanned_pages = totalpages; |
4479 | 4537 | ||
4480 | realtotalpages = totalpages; | 4538 | realtotalpages = totalpages; |
4481 | for (i = 0; i < MAX_NR_ZONES; i++) | 4539 | for (i = 0; i < MAX_NR_ZONES; i++) |
4482 | realtotalpages -= | 4540 | realtotalpages -= |
4483 | zone_absent_pages_in_node(pgdat->node_id, i, | 4541 | zone_absent_pages_in_node(pgdat->node_id, i, |
4484 | zholes_size); | 4542 | node_start_pfn, node_end_pfn, |
4543 | zholes_size); | ||
4485 | pgdat->node_present_pages = realtotalpages; | 4544 | pgdat->node_present_pages = realtotalpages; |
4486 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | 4545 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, |
4487 | realtotalpages); | 4546 | realtotalpages); |
@@ -4590,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | |||
4590 | * NOTE: pgdat should get zeroed by caller. | 4649 | * NOTE: pgdat should get zeroed by caller. |
4591 | */ | 4650 | */ |
4592 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4651 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4652 | unsigned long node_start_pfn, unsigned long node_end_pfn, | ||
4593 | unsigned long *zones_size, unsigned long *zholes_size) | 4653 | unsigned long *zones_size, unsigned long *zholes_size) |
4594 | { | 4654 | { |
4595 | enum zone_type j; | 4655 | enum zone_type j; |
@@ -4611,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4611 | struct zone *zone = pgdat->node_zones + j; | 4671 | struct zone *zone = pgdat->node_zones + j; |
4612 | unsigned long size, realsize, freesize, memmap_pages; | 4672 | unsigned long size, realsize, freesize, memmap_pages; |
4613 | 4673 | ||
4614 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4674 | size = zone_spanned_pages_in_node(nid, j, node_start_pfn, |
4675 | node_end_pfn, zones_size); | ||
4615 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, | 4676 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, |
4677 | node_start_pfn, | ||
4678 | node_end_pfn, | ||
4616 | zholes_size); | 4679 | zholes_size); |
4617 | 4680 | ||
4618 | /* | 4681 | /* |
@@ -4726,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4726 | unsigned long node_start_pfn, unsigned long *zholes_size) | 4789 | unsigned long node_start_pfn, unsigned long *zholes_size) |
4727 | { | 4790 | { |
4728 | pg_data_t *pgdat = NODE_DATA(nid); | 4791 | pg_data_t *pgdat = NODE_DATA(nid); |
4792 | unsigned long start_pfn = 0; | ||
4793 | unsigned long end_pfn = 0; | ||
4729 | 4794 | ||
4730 | /* pg_data_t should be reset to zero when it's allocated */ | 4795 | /* pg_data_t should be reset to zero when it's allocated */ |
4731 | WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); | 4796 | WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); |
@@ -4733,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4733 | pgdat->node_id = nid; | 4798 | pgdat->node_id = nid; |
4734 | pgdat->node_start_pfn = node_start_pfn; | 4799 | pgdat->node_start_pfn = node_start_pfn; |
4735 | init_zone_allows_reclaim(nid); | 4800 | init_zone_allows_reclaim(nid); |
4736 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4801 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4802 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | ||
4803 | #endif | ||
4804 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | ||
4805 | zones_size, zholes_size); | ||
4737 | 4806 | ||
4738 | alloc_node_mem_map(pgdat); | 4807 | alloc_node_mem_map(pgdat); |
4739 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 4808 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
@@ -4742,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4742 | (unsigned long)pgdat->node_mem_map); | 4811 | (unsigned long)pgdat->node_mem_map); |
4743 | #endif | 4812 | #endif |
4744 | 4813 | ||
4745 | free_area_init_core(pgdat, zones_size, zholes_size); | 4814 | free_area_init_core(pgdat, start_pfn, end_pfn, |
4815 | zones_size, zholes_size); | ||
4746 | } | 4816 | } |
4747 | 4817 | ||
4748 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4818 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
@@ -5150,35 +5220,101 @@ early_param("movablecore", cmdline_parse_movablecore); | |||
5150 | 5220 | ||
5151 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 5221 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
5152 | 5222 | ||
5153 | unsigned long free_reserved_area(unsigned long start, unsigned long end, | 5223 | void adjust_managed_page_count(struct page *page, long count) |
5154 | int poison, char *s) | ||
5155 | { | 5224 | { |
5156 | unsigned long pages, pos; | 5225 | spin_lock(&managed_page_count_lock); |
5226 | page_zone(page)->managed_pages += count; | ||
5227 | totalram_pages += count; | ||
5228 | #ifdef CONFIG_HIGHMEM | ||
5229 | if (PageHighMem(page)) | ||
5230 | totalhigh_pages += count; | ||
5231 | #endif | ||
5232 | spin_unlock(&managed_page_count_lock); | ||
5233 | } | ||
5234 | EXPORT_SYMBOL(adjust_managed_page_count); | ||
5157 | 5235 | ||
5158 | pos = start = PAGE_ALIGN(start); | 5236 | unsigned long free_reserved_area(void *start, void *end, int poison, char *s) |
5159 | end &= PAGE_MASK; | 5237 | { |
5160 | for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { | 5238 | void *pos; |
5161 | if (poison) | 5239 | unsigned long pages = 0; |
5162 | memset((void *)pos, poison, PAGE_SIZE); | 5240 | |
5163 | free_reserved_page(virt_to_page((void *)pos)); | 5241 | start = (void *)PAGE_ALIGN((unsigned long)start); |
5242 | end = (void *)((unsigned long)end & PAGE_MASK); | ||
5243 | for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { | ||
5244 | if ((unsigned int)poison <= 0xFF) | ||
5245 | memset(pos, poison, PAGE_SIZE); | ||
5246 | free_reserved_page(virt_to_page(pos)); | ||
5164 | } | 5247 | } |
5165 | 5248 | ||
5166 | if (pages && s) | 5249 | if (pages && s) |
5167 | pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", | 5250 | pr_info("Freeing %s memory: %ldK (%p - %p)\n", |
5168 | s, pages << (PAGE_SHIFT - 10), start, end); | 5251 | s, pages << (PAGE_SHIFT - 10), start, end); |
5169 | 5252 | ||
5170 | return pages; | 5253 | return pages; |
5171 | } | 5254 | } |
5255 | EXPORT_SYMBOL(free_reserved_area); | ||
5172 | 5256 | ||
5173 | #ifdef CONFIG_HIGHMEM | 5257 | #ifdef CONFIG_HIGHMEM |
5174 | void free_highmem_page(struct page *page) | 5258 | void free_highmem_page(struct page *page) |
5175 | { | 5259 | { |
5176 | __free_reserved_page(page); | 5260 | __free_reserved_page(page); |
5177 | totalram_pages++; | 5261 | totalram_pages++; |
5262 | page_zone(page)->managed_pages++; | ||
5178 | totalhigh_pages++; | 5263 | totalhigh_pages++; |
5179 | } | 5264 | } |
5180 | #endif | 5265 | #endif |
5181 | 5266 | ||
5267 | |||
5268 | void __init mem_init_print_info(const char *str) | ||
5269 | { | ||
5270 | unsigned long physpages, codesize, datasize, rosize, bss_size; | ||
5271 | unsigned long init_code_size, init_data_size; | ||
5272 | |||
5273 | physpages = get_num_physpages(); | ||
5274 | codesize = _etext - _stext; | ||
5275 | datasize = _edata - _sdata; | ||
5276 | rosize = __end_rodata - __start_rodata; | ||
5277 | bss_size = __bss_stop - __bss_start; | ||
5278 | init_data_size = __init_end - __init_begin; | ||
5279 | init_code_size = _einittext - _sinittext; | ||
5280 | |||
5281 | /* | ||
5282 | * Detect special cases and adjust section sizes accordingly: | ||
5283 | * 1) .init.* may be embedded into .data sections | ||
5284 | * 2) .init.text.* may be out of [__init_begin, __init_end], | ||
5285 | * please refer to arch/tile/kernel/vmlinux.lds.S. | ||
5286 | * 3) .rodata.* may be embedded into .text or .data sections. | ||
5287 | */ | ||
5288 | #define adj_init_size(start, end, size, pos, adj) \ | ||
5289 | if (start <= pos && pos < end && size > adj) \ | ||
5290 | size -= adj; | ||
5291 | |||
5292 | adj_init_size(__init_begin, __init_end, init_data_size, | ||
5293 | _sinittext, init_code_size); | ||
5294 | adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); | ||
5295 | adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); | ||
5296 | adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); | ||
5297 | adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); | ||
5298 | |||
5299 | #undef adj_init_size | ||
5300 | |||
5301 | printk("Memory: %luK/%luK available " | ||
5302 | "(%luK kernel code, %luK rwdata, %luK rodata, " | ||
5303 | "%luK init, %luK bss, %luK reserved" | ||
5304 | #ifdef CONFIG_HIGHMEM | ||
5305 | ", %luK highmem" | ||
5306 | #endif | ||
5307 | "%s%s)\n", | ||
5308 | nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), | ||
5309 | codesize >> 10, datasize >> 10, rosize >> 10, | ||
5310 | (init_data_size + init_code_size) >> 10, bss_size >> 10, | ||
5311 | (physpages - totalram_pages) << (PAGE_SHIFT-10), | ||
5312 | #ifdef CONFIG_HIGHMEM | ||
5313 | totalhigh_pages << (PAGE_SHIFT-10), | ||
5314 | #endif | ||
5315 | str ? ", " : "", str ? str : ""); | ||
5316 | } | ||
5317 | |||
5182 | /** | 5318 | /** |
5183 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 5319 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
5184 | * @new_dma_reserve: The number of pages to mark reserved | 5320 | * @new_dma_reserve: The number of pages to mark reserved |
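free_reserved_area() above now takes void * bounds and only poisons when the value fits in a byte, so callers can pass -1 to skip the memset. A hedged caller sketch, modelled on the generic free_initmem_default() helper (example_free_initmem is an invented name, not any particular architecture's code):

        extern char __init_begin[], __init_end[];       /* linker symbols */

        static void example_free_initmem(void)
        {
                /* Poison .init memory with POISON_FREE_INITMEM and hand the
                 * pages back to the buddy allocator; passing -1 as the poison
                 * value would skip the memset entirely. */
                free_reserved_area(&__init_begin, &__init_end,
                                   POISON_FREE_INITMEM, "unused kernel");
        }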
@@ -5454,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) | |||
5454 | int __meminit init_per_zone_wmark_min(void) | 5590 | int __meminit init_per_zone_wmark_min(void) |
5455 | { | 5591 | { |
5456 | unsigned long lowmem_kbytes; | 5592 | unsigned long lowmem_kbytes; |
5593 | int new_min_free_kbytes; | ||
5457 | 5594 | ||
5458 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | 5595 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); |
5459 | 5596 | new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | |
5460 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | 5597 | |
5461 | if (min_free_kbytes < 128) | 5598 | if (new_min_free_kbytes > user_min_free_kbytes) { |
5462 | min_free_kbytes = 128; | 5599 | min_free_kbytes = new_min_free_kbytes; |
5463 | if (min_free_kbytes > 65536) | 5600 | if (min_free_kbytes < 128) |
5464 | min_free_kbytes = 65536; | 5601 | min_free_kbytes = 128; |
5602 | if (min_free_kbytes > 65536) | ||
5603 | min_free_kbytes = 65536; | ||
5604 | } else { | ||
5605 | pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", | ||
5606 | new_min_free_kbytes, user_min_free_kbytes); | ||
5607 | } | ||
5465 | setup_per_zone_wmarks(); | 5608 | setup_per_zone_wmarks(); |
5466 | refresh_zone_stat_thresholds(); | 5609 | refresh_zone_stat_thresholds(); |
5467 | setup_per_zone_lowmem_reserve(); | 5610 | setup_per_zone_lowmem_reserve(); |
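To put numbers on init_per_zone_wmark_min(): assuming roughly 4 GB of free lowmem buffer pages (an illustrative figure), lowmem_kbytes is about 4194304 and

        new_min_free_kbytes = int_sqrt(4194304 * 16)
                            = int_sqrt(67108864)
                            = 8192          /* 8 MB, inside the [128, 65536] clamp */

and the computed value is applied only when it exceeds a user_min_free_kbytes previously written through the sysctl; otherwise the user's setting wins and the pr_warn above fires.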
@@ -5479,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | |||
5479 | void __user *buffer, size_t *length, loff_t *ppos) | 5622 | void __user *buffer, size_t *length, loff_t *ppos) |
5480 | { | 5623 | { |
5481 | proc_dointvec(table, write, buffer, length, ppos); | 5624 | proc_dointvec(table, write, buffer, length, ppos); |
5482 | if (write) | 5625 | if (write) { |
5626 | user_min_free_kbytes = min_free_kbytes; | ||
5483 | setup_per_zone_wmarks(); | 5627 | setup_per_zone_wmarks(); |
5628 | } | ||
5484 | return 0; | 5629 | return 0; |
5485 | } | 5630 | } |
5486 | 5631 | ||
@@ -5540,7 +5685,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
5540 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | 5685 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist |
5541 | * can have before it gets flushed back to the buddy allocator. | 5686 | * can have before it gets flushed back to the buddy allocator. |
5542 | */ | 5687 | */ |
5543 | |||
5544 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 5688 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
5545 | void __user *buffer, size_t *length, loff_t *ppos) | 5689 | void __user *buffer, size_t *length, loff_t *ppos) |
5546 | { | 5690 | { |
@@ -5551,14 +5695,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
5551 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5695 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5552 | if (!write || (ret < 0)) | 5696 | if (!write || (ret < 0)) |
5553 | return ret; | 5697 | return ret; |
5698 | |||
5699 | mutex_lock(&pcp_batch_high_lock); | ||
5554 | for_each_populated_zone(zone) { | 5700 | for_each_populated_zone(zone) { |
5555 | for_each_possible_cpu(cpu) { | 5701 | unsigned long high; |
5556 | unsigned long high; | 5702 | high = zone->managed_pages / percpu_pagelist_fraction; |
5557 | high = zone->managed_pages / percpu_pagelist_fraction; | 5703 | for_each_possible_cpu(cpu) |
5558 | setup_pagelist_highmark( | 5704 | pageset_set_high(per_cpu_ptr(zone->pageset, cpu), |
5559 | per_cpu_ptr(zone->pageset, cpu), high); | 5705 | high); |
5560 | } | ||
5561 | } | 5706 | } |
5707 | mutex_unlock(&pcp_batch_high_lock); | ||
5562 | return 0; | 5708 | return 0; |
5563 | } | 5709 | } |
5564 | 5710 | ||
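A worked example tying the sysctl handler above to pageset_set_high(), with illustrative numbers: percpu_pagelist_fraction = 8 and a zone of 262144 managed 4 KB pages (1 GB) give

        high  = 262144 / 8                    = 32768 pages per CPU
        batch = min(high / 4, PAGE_SHIFT * 8)
              = min(8192, 96)                 = 96    /* assuming PAGE_SHIFT == 12 */

so each CPU's hot list may hold up to 32768 pages and is trimmed 96 pages at a time.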
@@ -6047,32 +6193,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) | |||
6047 | #endif | 6193 | #endif |
6048 | 6194 | ||
6049 | #ifdef CONFIG_MEMORY_HOTPLUG | 6195 | #ifdef CONFIG_MEMORY_HOTPLUG |
6050 | static int __meminit __zone_pcp_update(void *data) | 6196 | /* |
6051 | { | 6197 | * The zone indicated has a new number of managed_pages; batch sizes and percpu |
6052 | struct zone *zone = data; | 6198 | * page high values need to be recalculated. |
6053 | int cpu; | 6199 | */ |
6054 | unsigned long batch = zone_batchsize(zone), flags; | ||
6055 | |||
6056 | for_each_possible_cpu(cpu) { | ||
6057 | struct per_cpu_pageset *pset; | ||
6058 | struct per_cpu_pages *pcp; | ||
6059 | |||
6060 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
6061 | pcp = &pset->pcp; | ||
6062 | |||
6063 | local_irq_save(flags); | ||
6064 | if (pcp->count > 0) | ||
6065 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
6066 | drain_zonestat(zone, pset); | ||
6067 | setup_pageset(pset, batch); | ||
6068 | local_irq_restore(flags); | ||
6069 | } | ||
6070 | return 0; | ||
6071 | } | ||
6072 | |||
6073 | void __meminit zone_pcp_update(struct zone *zone) | 6200 | void __meminit zone_pcp_update(struct zone *zone) |
6074 | { | 6201 | { |
6075 | stop_machine(__zone_pcp_update, zone, NULL); | 6202 | unsigned cpu; |
6203 | mutex_lock(&pcp_batch_high_lock); | ||
6204 | for_each_possible_cpu(cpu) | ||
6205 | pageset_set_high_and_batch(zone, | ||
6206 | per_cpu_ptr(zone->pageset, cpu)); | ||
6207 | mutex_unlock(&pcp_batch_high_lock); | ||
6076 | } | 6208 | } |
6077 | #endif | 6209 | #endif |
6078 | 6210 | ||
@@ -6142,6 +6274,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
6142 | list_del(&page->lru); | 6274 | list_del(&page->lru); |
6143 | rmv_page_order(page); | 6275 | rmv_page_order(page); |
6144 | zone->free_area[order].nr_free--; | 6276 | zone->free_area[order].nr_free--; |
6277 | #ifdef CONFIG_HIGHMEM | ||
6278 | if (PageHighMem(page)) | ||
6279 | totalhigh_pages -= 1 << order; | ||
6280 | #endif | ||
6145 | for (i = 0; i < (1 << order); i++) | 6281 | for (i = 0; i < (1 << order); i++) |
6146 | SetPageReserved((page+i)); | 6282 | SetPageReserved((page+i)); |
6147 | pfn += (1 << order); | 6283 | pfn += (1 << order); |
diff --git a/mm/page_io.c b/mm/page_io.c index a8a3ef45fed7..ba05b64e5d8d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/frontswap.h> | 22 | #include <linux/frontswap.h> |
23 | #include <linux/aio.h> | 23 | #include <linux/aio.h> |
24 | #include <linux/blkdev.h> | ||
24 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
25 | 26 | ||
26 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 27 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
@@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
80 | imajor(bio->bi_bdev->bd_inode), | 81 | imajor(bio->bi_bdev->bd_inode), |
81 | iminor(bio->bi_bdev->bd_inode), | 82 | iminor(bio->bi_bdev->bd_inode), |
82 | (unsigned long long)bio->bi_sector); | 83 | (unsigned long long)bio->bi_sector); |
83 | } else { | 84 | goto out; |
84 | SetPageUptodate(page); | ||
85 | } | 85 | } |
86 | |||
87 | SetPageUptodate(page); | ||
88 | |||
89 | /* | ||
90 | * There is no guarantee that the page is in swap cache - the software | ||
91 | * suspend code (at least) uses end_swap_bio_read() against a non- | ||
92 | * swapcache page. So we must check PG_swapcache before proceeding with | ||
93 | * this optimization. | ||
94 | */ | ||
95 | if (likely(PageSwapCache(page))) { | ||
96 | struct swap_info_struct *sis; | ||
97 | |||
98 | sis = page_swap_info(page); | ||
99 | if (sis->flags & SWP_BLKDEV) { | ||
100 | /* | ||
101 | * The swap subsystem performs lazy swap slot freeing, | ||
102 | * expecting that the page will be swapped out again. | ||
103 | * So we can avoid an unnecessary write if the page | ||
104 | * isn't redirtied. | ||
105 | * This is good for real swap storage because we can | ||
106 | * reduce unnecessary I/O and enhance wear-leveling | ||
107 | * if an SSD is used as the swap device. | ||
108 | * But if an in-memory swap device (e.g. zram) is used, | ||
109 | * this causes a duplicated copy between uncompressed | ||
110 | * data in VM-owned memory and compressed data in | ||
111 | * zram-owned memory. So let's free zram-owned memory | ||
112 | * and make the VM-owned decompressed page *dirty*, | ||
113 | * so the page should be swapped out somewhere again if | ||
114 | * we again wish to reclaim it. | ||
115 | */ | ||
116 | struct gendisk *disk = sis->bdev->bd_disk; | ||
117 | if (disk->fops->swap_slot_free_notify) { | ||
118 | swp_entry_t entry; | ||
119 | unsigned long offset; | ||
120 | |||
121 | entry.val = page_private(page); | ||
122 | offset = swp_offset(entry); | ||
123 | |||
124 | SetPageDirty(page); | ||
125 | disk->fops->swap_slot_free_notify(sis->bdev, | ||
126 | offset); | ||
127 | } | ||
128 | } | ||
129 | } | ||
130 | |||
131 | out: | ||
86 | unlock_page(page); | 132 | unlock_page(page); |
87 | bio_put(bio); | 133 | bio_put(bio); |
88 | } | 134 | } |
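The end_swap_bio_read() change above only fires when the backing disk implements the swap_slot_free_notify hook in its block_device_operations. A hedged sketch of the driver side (the ramswap_* names and types are invented; zram is the in-tree user of this hook):

        #include <linux/blkdev.h>
        #include <linux/module.h>

        struct ramswap_dev;                             /* hypothetical per-device state */
        void ramswap_free_slot(struct ramswap_dev *dev, unsigned long index);

        /* Drop the in-memory compressed copy as soon as the VM tells us the
         * swap slot has been read back and freed. */
        static void ramswap_slot_free_notify(struct block_device *bdev,
                                             unsigned long index)
        {
                struct ramswap_dev *dev = bdev->bd_disk->private_data;

                ramswap_free_slot(dev, index);
        }

        static const struct block_device_operations ramswap_fops = {
                .owner                  = THIS_MODULE,
                .swap_slot_free_notify  = ramswap_slot_free_notify,
        };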
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0c8323fe6c8f..e1a6e4fab016 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | |||
124 | 124 | ||
125 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | 125 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT |
126 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 126 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
127 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) | 127 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
128 | pgtable_t pgtable) | ||
128 | { | 129 | { |
129 | assert_spin_locked(&mm->page_table_lock); | 130 | assert_spin_locked(&mm->page_table_lock); |
130 | 131 | ||
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) | |||
141 | #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW | 142 | #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW |
142 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 143 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
143 | /* no "address" argument so destroys page coloring of some arch */ | 144 | /* no "address" argument so destroys page coloring of some arch */ |
144 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) | 145 | pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) |
145 | { | 146 | { |
146 | pgtable_t pgtable; | 147 | pgtable_t pgtable; |
147 | 148 | ||
@@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
720 | * mapping is already gone, the unmap path will have | 720 | * mapping is already gone, the unmap path will have |
721 | * set PG_referenced or activated the page. | 721 | * set PG_referenced or activated the page. |
722 | */ | 722 | */ |
723 | if (likely(!VM_SequentialReadHint(vma))) | 723 | if (likely(!(vma->vm_flags & VM_SEQ_READ))) |
724 | referenced++; | 724 | referenced++; |
725 | } | 725 | } |
726 | pte_unmap_unlock(pte, ptl); | 726 | pte_unmap_unlock(pte, ptl); |
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page, | |||
1093 | else | 1093 | else |
1094 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1094 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1095 | __page_set_anon_rmap(page, vma, address, 1); | 1095 | __page_set_anon_rmap(page, vma, address, 1); |
1096 | if (!mlocked_vma_newpage(vma, page)) | 1096 | if (!mlocked_vma_newpage(vma, page)) { |
1097 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 1097 | SetPageActive(page); |
1098 | else | 1098 | lru_cache_add(page); |
1099 | } else | ||
1099 | add_page_to_unevictable_list(page); | 1100 | add_page_to_unevictable_list(page); |
1100 | } | 1101 | } |
1101 | 1102 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 118dfa4952f4..a87990cf9f94 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1936,6 +1936,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
1936 | 1936 | ||
1937 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); | 1937 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); |
1938 | if (inode) { | 1938 | if (inode) { |
1939 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
1940 | error = generic_acl_init(inode, dir); | ||
1941 | if (error) { | ||
1942 | iput(inode); | ||
1943 | return error; | ||
1944 | } | ||
1945 | #endif | ||
1939 | error = security_inode_init_security(inode, dir, | 1946 | error = security_inode_init_security(inode, dir, |
1940 | &dentry->d_name, | 1947 | &dentry->d_name, |
1941 | shmem_initxattrs, NULL); | 1948 | shmem_initxattrs, NULL); |
@@ -1945,15 +1952,8 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
1945 | return error; | 1952 | return error; |
1946 | } | 1953 | } |
1947 | } | 1954 | } |
1948 | #ifdef CONFIG_TMPFS_POSIX_ACL | 1955 | |
1949 | error = generic_acl_init(inode, dir); | ||
1950 | if (error) { | ||
1951 | iput(inode); | ||
1952 | return error; | ||
1953 | } | ||
1954 | #else | ||
1955 | error = 0; | 1956 | error = 0; |
1956 | #endif | ||
1957 | dir->i_size += BOGO_DIRENT_SIZE; | 1957 | dir->i_size += BOGO_DIRENT_SIZE; |
1958 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 1958 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
1959 | d_instantiate(dentry, inode); | 1959 | d_instantiate(dentry, inode); |
@@ -565,7 +565,7 @@ static void init_node_lock_keys(int q) | |||
565 | if (slab_state < UP) | 565 | if (slab_state < UP) |
566 | return; | 566 | return; |
567 | 567 | ||
568 | for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { | 568 | for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { |
569 | struct kmem_cache_node *n; | 569 | struct kmem_cache_node *n; |
570 | struct kmem_cache *cache = kmalloc_caches[i]; | 570 | struct kmem_cache *cache = kmalloc_caches[i]; |
571 | 571 | ||
@@ -1180,6 +1180,12 @@ static int init_cache_node_node(int node) | |||
1180 | return 0; | 1180 | return 0; |
1181 | } | 1181 | } |
1182 | 1182 | ||
1183 | static inline int slabs_tofree(struct kmem_cache *cachep, | ||
1184 | struct kmem_cache_node *n) | ||
1185 | { | ||
1186 | return (n->free_objects + cachep->num - 1) / cachep->num; | ||
1187 | } | ||
1188 | |||
1183 | static void __cpuinit cpuup_canceled(long cpu) | 1189 | static void __cpuinit cpuup_canceled(long cpu) |
1184 | { | 1190 | { |
1185 | struct kmem_cache *cachep; | 1191 | struct kmem_cache *cachep; |
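The new slabs_tofree() helper converts a node's free-object count into whole slabs, so drain_freelist() below is asked for a slab count rather than an object count. With illustrative numbers, cachep->num = 8 objects per slab and n->free_objects = 20:

        slabs_tofree = (20 + 8 - 1) / 8 = 3     /* i.e. DIV_ROUND_UP(20, 8) */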
@@ -1241,7 +1247,7 @@ free_array_cache: | |||
1241 | n = cachep->node[node]; | 1247 | n = cachep->node[node]; |
1242 | if (!n) | 1248 | if (!n) |
1243 | continue; | 1249 | continue; |
1244 | drain_freelist(cachep, n, n->free_objects); | 1250 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
1245 | } | 1251 | } |
1246 | } | 1252 | } |
1247 | 1253 | ||
@@ -1408,7 +1414,7 @@ static int __meminit drain_cache_node_node(int node) | |||
1408 | if (!n) | 1414 | if (!n) |
1409 | continue; | 1415 | continue; |
1410 | 1416 | ||
1411 | drain_freelist(cachep, n, n->free_objects); | 1417 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
1412 | 1418 | ||
1413 | if (!list_empty(&n->slabs_full) || | 1419 | if (!list_empty(&n->slabs_full) || |
1414 | !list_empty(&n->slabs_partial)) { | 1420 | !list_empty(&n->slabs_partial)) { |
@@ -2532,7 +2538,7 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2532 | if (!n) | 2538 | if (!n) |
2533 | continue; | 2539 | continue; |
2534 | 2540 | ||
2535 | drain_freelist(cachep, n, n->free_objects); | 2541 | drain_freelist(cachep, n, slabs_tofree(cachep, n)); |
2536 | 2542 | ||
2537 | ret += !list_empty(&n->slabs_full) || | 2543 | ret += !list_empty(&n->slabs_full) || |
2538 | !list_empty(&n->slabs_partial); | 2544 | !list_empty(&n->slabs_partial); |
@@ -3338,18 +3344,6 @@ done: | |||
3338 | return obj; | 3344 | return obj; |
3339 | } | 3345 | } |
3340 | 3346 | ||
3341 | /** | ||
3342 | * kmem_cache_alloc_node - Allocate an object on the specified node | ||
3343 | * @cachep: The cache to allocate from. | ||
3344 | * @flags: See kmalloc(). | ||
3345 | * @nodeid: node number of the target node. | ||
3346 | * @caller: return address of caller, used for debug information | ||
3347 | * | ||
3348 | * Identical to kmem_cache_alloc but it will allocate memory on the given | ||
3349 | * node, which can improve the performance for cpu bound structures. | ||
3350 | * | ||
3351 | * Fallback to other node is possible if __GFP_THISNODE is not set. | ||
3352 | */ | ||
3353 | static __always_inline void * | 3347 | static __always_inline void * |
3354 | slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | 3348 | slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, |
3355 | unsigned long caller) | 3349 | unsigned long caller) |
@@ -3643,6 +3637,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); | |||
3643 | #endif | 3637 | #endif |
3644 | 3638 | ||
3645 | #ifdef CONFIG_NUMA | 3639 | #ifdef CONFIG_NUMA |
3640 | /** | ||
3641 | * kmem_cache_alloc_node - Allocate an object on the specified node | ||
3642 | * @cachep: The cache to allocate from. | ||
3643 | * @flags: See kmalloc(). | ||
3644 | * @nodeid: node number of the target node. | ||
3645 | * | ||
3646 | * Identical to kmem_cache_alloc but it will allocate memory on the given | ||
3647 | * node, which can improve the performance for cpu bound structures. | ||
3648 | * | ||
3649 | * Fallback to other node is possible if __GFP_THISNODE is not set. | ||
3650 | */ | ||
3646 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3651 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
3647 | { | 3652 | { |
3648 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); | 3653 | void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); |
@@ -4431,20 +4436,10 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4431 | return 0; | 4436 | return 0; |
4432 | } | 4437 | } |
4433 | 4438 | ||
4434 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | ||
4435 | { | ||
4436 | return seq_list_next(p, &slab_caches, pos); | ||
4437 | } | ||
4438 | |||
4439 | static void s_stop(struct seq_file *m, void *p) | ||
4440 | { | ||
4441 | mutex_unlock(&slab_mutex); | ||
4442 | } | ||
4443 | |||
4444 | static const struct seq_operations slabstats_op = { | 4439 | static const struct seq_operations slabstats_op = { |
4445 | .start = leaks_start, | 4440 | .start = leaks_start, |
4446 | .next = s_next, | 4441 | .next = slab_next, |
4447 | .stop = s_stop, | 4442 | .stop = slab_stop, |
4448 | .show = leaks_show, | 4443 | .show = leaks_show, |
4449 | }; | 4444 | }; |
4450 | 4445 | ||
@@ -271,3 +271,6 @@ struct kmem_cache_node { | |||
271 | #endif | 271 | #endif |
272 | 272 | ||
273 | }; | 273 | }; |
274 | |||
275 | void *slab_next(struct seq_file *m, void *p, loff_t *pos); | ||
276 | void slab_stop(struct seq_file *m, void *p); | ||
diff --git a/mm/slab_common.c b/mm/slab_common.c index 2d414508e9ec..538bade6df7d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -497,6 +497,13 @@ void __init create_kmalloc_caches(unsigned long flags) | |||
497 | 497 | ||
498 | 498 | ||
499 | #ifdef CONFIG_SLABINFO | 499 | #ifdef CONFIG_SLABINFO |
500 | |||
501 | #ifdef CONFIG_SLAB | ||
502 | #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) | ||
503 | #else | ||
504 | #define SLABINFO_RIGHTS S_IRUSR | ||
505 | #endif | ||
506 | |||
500 | void print_slabinfo_header(struct seq_file *m) | 507 | void print_slabinfo_header(struct seq_file *m) |
501 | { | 508 | { |
502 | /* | 509 | /* |
@@ -531,12 +538,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
531 | return seq_list_start(&slab_caches, *pos); | 538 | return seq_list_start(&slab_caches, *pos); |
532 | } | 539 | } |
533 | 540 | ||
534 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | 541 | void *slab_next(struct seq_file *m, void *p, loff_t *pos) |
535 | { | 542 | { |
536 | return seq_list_next(p, &slab_caches, pos); | 543 | return seq_list_next(p, &slab_caches, pos); |
537 | } | 544 | } |
538 | 545 | ||
539 | static void s_stop(struct seq_file *m, void *p) | 546 | void slab_stop(struct seq_file *m, void *p) |
540 | { | 547 | { |
541 | mutex_unlock(&slab_mutex); | 548 | mutex_unlock(&slab_mutex); |
542 | } | 549 | } |
@@ -613,8 +620,8 @@ static int s_show(struct seq_file *m, void *p) | |||
613 | */ | 620 | */ |
614 | static const struct seq_operations slabinfo_op = { | 621 | static const struct seq_operations slabinfo_op = { |
615 | .start = s_start, | 622 | .start = s_start, |
616 | .next = s_next, | 623 | .next = slab_next, |
617 | .stop = s_stop, | 624 | .stop = slab_stop, |
618 | .show = s_show, | 625 | .show = s_show, |
619 | }; | 626 | }; |
620 | 627 | ||
@@ -633,7 +640,8 @@ static const struct file_operations proc_slabinfo_operations = { | |||
633 | 640 | ||
634 | static int __init slab_proc_init(void) | 641 | static int __init slab_proc_init(void) |
635 | { | 642 | { |
636 | proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); | 643 | proc_create("slabinfo", SLABINFO_RIGHTS, NULL, |
644 | &proc_slabinfo_operations); | ||
637 | return 0; | 645 | return 0; |
638 | } | 646 | } |
639 | module_init(slab_proc_init); | 647 | module_init(slab_proc_init); |
@@ -122,7 +122,7 @@ static inline void clear_slob_page_free(struct page *sp) | |||
122 | } | 122 | } |
123 | 123 | ||
124 | #define SLOB_UNIT sizeof(slob_t) | 124 | #define SLOB_UNIT sizeof(slob_t) |
125 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) | 125 | #define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT) |
126 | 126 | ||
127 | /* | 127 | /* |
128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | 128 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which |
@@ -554,7 +554,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
554 | flags, node); | 554 | flags, node); |
555 | } | 555 | } |
556 | 556 | ||
557 | if (c->ctor) | 557 | if (b && c->ctor) |
558 | c->ctor(b); | 558 | c->ctor(b); |
559 | 559 | ||
560 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); | 560 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); |
@@ -123,6 +123,15 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
123 | #endif | 123 | #endif |
124 | } | 124 | } |
125 | 125 | ||
126 | static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) | ||
127 | { | ||
128 | #ifdef CONFIG_SLUB_CPU_PARTIAL | ||
129 | return !kmem_cache_debug(s); | ||
130 | #else | ||
131 | return false; | ||
132 | #endif | ||
133 | } | ||
134 | |||
126 | /* | 135 | /* |
127 | * Issues still to be resolved: | 136 | * Issues still to be resolved: |
128 | * | 137 | * |
@@ -1573,7 +1582,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1573 | put_cpu_partial(s, page, 0); | 1582 | put_cpu_partial(s, page, 0); |
1574 | stat(s, CPU_PARTIAL_NODE); | 1583 | stat(s, CPU_PARTIAL_NODE); |
1575 | } | 1584 | } |
1576 | if (kmem_cache_debug(s) || available > s->cpu_partial / 2) | 1585 | if (!kmem_cache_has_cpu_partial(s) |
1586 | || available > s->cpu_partial / 2) | ||
1577 | break; | 1587 | break; |
1578 | 1588 | ||
1579 | } | 1589 | } |
@@ -1884,6 +1894,7 @@ redo: | |||
1884 | static void unfreeze_partials(struct kmem_cache *s, | 1894 | static void unfreeze_partials(struct kmem_cache *s, |
1885 | struct kmem_cache_cpu *c) | 1895 | struct kmem_cache_cpu *c) |
1886 | { | 1896 | { |
1897 | #ifdef CONFIG_SLUB_CPU_PARTIAL | ||
1887 | struct kmem_cache_node *n = NULL, *n2 = NULL; | 1898 | struct kmem_cache_node *n = NULL, *n2 = NULL; |
1888 | struct page *page, *discard_page = NULL; | 1899 | struct page *page, *discard_page = NULL; |
1889 | 1900 | ||
@@ -1938,6 +1949,7 @@ static void unfreeze_partials(struct kmem_cache *s, | |||
1938 | discard_slab(s, page); | 1949 | discard_slab(s, page); |
1939 | stat(s, FREE_SLAB); | 1950 | stat(s, FREE_SLAB); |
1940 | } | 1951 | } |
1952 | #endif | ||
1941 | } | 1953 | } |
1942 | 1954 | ||
1943 | /* | 1955 | /* |
@@ -1951,10 +1963,14 @@ static void unfreeze_partials(struct kmem_cache *s, | |||
1951 | */ | 1963 | */ |
1952 | static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | 1964 | static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) |
1953 | { | 1965 | { |
1966 | #ifdef CONFIG_SLUB_CPU_PARTIAL | ||
1954 | struct page *oldpage; | 1967 | struct page *oldpage; |
1955 | int pages; | 1968 | int pages; |
1956 | int pobjects; | 1969 | int pobjects; |
1957 | 1970 | ||
1971 | if (!s->cpu_partial) | ||
1972 | return; | ||
1973 | |||
1958 | do { | 1974 | do { |
1959 | pages = 0; | 1975 | pages = 0; |
1960 | pobjects = 0; | 1976 | pobjects = 0; |
@@ -1987,6 +2003,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
1987 | page->next = oldpage; | 2003 | page->next = oldpage; |
1988 | 2004 | ||
1989 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); | 2005 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); |
2006 | #endif | ||
1990 | } | 2007 | } |
1991 | 2008 | ||
1992 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 2009 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
@@ -2358,7 +2375,7 @@ redo: | |||
2358 | 2375 | ||
2359 | object = c->freelist; | 2376 | object = c->freelist; |
2360 | page = c->page; | 2377 | page = c->page; |
2361 | if (unlikely(!object || !node_match(page, node))) | 2378 | if (unlikely(!object || !page || !node_match(page, node))) |
2362 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2379 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2363 | 2380 | ||
2364 | else { | 2381 | else { |
@@ -2495,7 +2512,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2495 | new.inuse--; | 2512 | new.inuse--; |
2496 | if ((!new.inuse || !prior) && !was_frozen) { | 2513 | if ((!new.inuse || !prior) && !was_frozen) { |
2497 | 2514 | ||
2498 | if (!kmem_cache_debug(s) && !prior) | 2515 | if (kmem_cache_has_cpu_partial(s) && !prior) |
2499 | 2516 | ||
2500 | /* | 2517 | /* |
2501 | * Slab was on no list before and will be partially empty | 2518 | * Slab was on no list before and will be partially empty |
@@ -2550,8 +2567,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2550 | * Objects left in the slab. If it was not on the partial list before | 2567 | * Objects left in the slab. If it was not on the partial list before |
2551 | * then add it. | 2568 | * then add it. |
2552 | */ | 2569 | */ |
2553 | if (kmem_cache_debug(s) && unlikely(!prior)) { | 2570 | if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { |
2554 | remove_full(s, page); | 2571 | if (kmem_cache_debug(s)) |
2572 | remove_full(s, page); | ||
2555 | add_partial(n, page, DEACTIVATE_TO_TAIL); | 2573 | add_partial(n, page, DEACTIVATE_TO_TAIL); |
2556 | stat(s, FREE_ADD_PARTIAL); | 2574 | stat(s, FREE_ADD_PARTIAL); |
2557 | } | 2575 | } |
@@ -3059,7 +3077,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | |||
3059 | * per node list when we run out of per cpu objects. We only fetch 50% | 3077 | * per node list when we run out of per cpu objects. We only fetch 50% |
3060 | * to keep some capacity around for frees. | 3078 | * to keep some capacity around for frees. |
3061 | */ | 3079 | */ |
3062 | if (kmem_cache_debug(s)) | 3080 | if (!kmem_cache_has_cpu_partial(s)) |
3063 | s->cpu_partial = 0; | 3081 | s->cpu_partial = 0; |
3064 | else if (s->size >= PAGE_SIZE) | 3082 | else if (s->size >= PAGE_SIZE) |
3065 | s->cpu_partial = 2; | 3083 | s->cpu_partial = 2; |
@@ -4456,7 +4474,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, | |||
4456 | err = strict_strtoul(buf, 10, &objects); | 4474 | err = strict_strtoul(buf, 10, &objects); |
4457 | if (err) | 4475 | if (err) |
4458 | return err; | 4476 | return err; |
4459 | if (objects && kmem_cache_debug(s)) | 4477 | if (objects && !kmem_cache_has_cpu_partial(s)) |
4460 | return -EINVAL; | 4478 | return -EINVAL; |
4461 | 4479 | ||
4462 | s->cpu_partial = objects; | 4480 | s->cpu_partial = objects; |
@@ -5269,7 +5287,6 @@ __initcall(slab_sysfs_init); | |||
5269 | #ifdef CONFIG_SLABINFO | 5287 | #ifdef CONFIG_SLABINFO |
5270 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) | 5288 | void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) |
5271 | { | 5289 | { |
5272 | unsigned long nr_partials = 0; | ||
5273 | unsigned long nr_slabs = 0; | 5290 | unsigned long nr_slabs = 0; |
5274 | unsigned long nr_objs = 0; | 5291 | unsigned long nr_objs = 0; |
5275 | unsigned long nr_free = 0; | 5292 | unsigned long nr_free = 0; |
@@ -5281,9 +5298,8 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) | |||
5281 | if (!n) | 5298 | if (!n) |
5282 | continue; | 5299 | continue; |
5283 | 5300 | ||
5284 | nr_partials += n->nr_partial; | 5301 | nr_slabs += node_nr_slabs(n); |
5285 | nr_slabs += atomic_long_read(&n->nr_slabs); | 5302 | nr_objs += node_nr_objs(n); |
5286 | nr_objs += atomic_long_read(&n->total_objects); | ||
5287 | nr_free += count_partial(n, count_free); | 5303 | nr_free += count_partial(n, count_free); |
5288 | } | 5304 | } |
5289 | 5305 | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 1c91f0d3f6ab..308d50331bc3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -79,7 +79,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) | |||
79 | { | 79 | { |
80 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 80 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
81 | struct mem_section *section; | 81 | struct mem_section *section; |
82 | int ret = 0; | ||
83 | 82 | ||
84 | if (mem_section[root]) | 83 | if (mem_section[root]) |
85 | return -EEXIST; | 84 | return -EEXIST; |
@@ -90,7 +89,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) | |||
90 | 89 | ||
91 | mem_section[root] = section; | 90 | mem_section[root] = section; |
92 | 91 | ||
93 | return ret; | 92 | return 0; |
94 | } | 93 | } |
95 | #else /* !SPARSEMEM_EXTREME */ | 94 | #else /* !SPARSEMEM_EXTREME */ |
96 | static inline int sparse_index_init(unsigned long section_nr, int nid) | 95 | static inline int sparse_index_init(unsigned long section_nr, int nid) |
@@ -481,6 +480,9 @@ void __init sparse_init(void) | |||
481 | struct page **map_map; | 480 | struct page **map_map; |
482 | #endif | 481 | #endif |
483 | 482 | ||
483 | /* see include/linux/mmzone.h 'struct mem_section' definition */ | ||
484 | BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); | ||
485 | |||
484 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ | 486 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ |
485 | set_pageblock_order(); | 487 | set_pageblock_order(); |
486 | 488 | ||
@@ -751,6 +753,7 @@ out: | |||
751 | return ret; | 753 | return ret; |
752 | } | 754 | } |
753 | 755 | ||
756 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
754 | #ifdef CONFIG_MEMORY_FAILURE | 757 | #ifdef CONFIG_MEMORY_FAILURE |
755 | static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | 758 | static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) |
756 | { | 759 | { |
@@ -772,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
772 | } | 775 | } |
773 | #endif | 776 | #endif |
774 | 777 | ||
775 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
776 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) | 778 | static void free_section_usemap(struct page *memmap, unsigned long *usemap) |
777 | { | 779 | { |
778 | struct page *usemap_page; | 780 | struct page *usemap_page; |
@@ -34,10 +34,13 @@ | |||
34 | 34 | ||
35 | #include "internal.h" | 35 | #include "internal.h" |
36 | 36 | ||
37 | #define CREATE_TRACE_POINTS | ||
38 | #include <trace/events/pagemap.h> | ||
39 | |||
37 | /* How many pages do we try to swap or page in/out together? */ | 40 | /* How many pages do we try to swap or page in/out together? */ |
38 | int page_cluster; | 41 | int page_cluster; |
39 | 42 | ||
40 | static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); | 43 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
41 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 44 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
42 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | 45 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); |
43 | 46 | ||
@@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, | |||
384 | SetPageActive(page); | 387 | SetPageActive(page); |
385 | lru += LRU_ACTIVE; | 388 | lru += LRU_ACTIVE; |
386 | add_page_to_lru_list(page, lruvec, lru); | 389 | add_page_to_lru_list(page, lruvec, lru); |
390 | trace_mm_lru_activate(page, page_to_pfn(page)); | ||
387 | 391 | ||
388 | __count_vm_event(PGACTIVATE); | 392 | __count_vm_event(PGACTIVATE); |
389 | update_page_reclaim_stat(lruvec, file, 1); | 393 | update_page_reclaim_stat(lruvec, file, 1); |
@@ -428,6 +432,33 @@ void activate_page(struct page *page) | |||
428 | } | 432 | } |
429 | #endif | 433 | #endif |
430 | 434 | ||
435 | static void __lru_cache_activate_page(struct page *page) | ||
436 | { | ||
437 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); | ||
438 | int i; | ||
439 | |||
440 | /* | ||
441 | * Search backwards on the optimistic assumption that the page being | ||
442 | * activated has just been added to this pagevec. Note that only | ||
443 | * the local pagevec is examined as a !PageLRU page could be in the | ||
444 | * process of being released, reclaimed, migrated or on a remote | ||
445 | * pagevec that is currently being drained. Furthermore, marking | ||
446 | * a remote pagevec's page PageActive potentially hits a race where | ||
447 | * a page is marked PageActive just after it is added to the inactive | ||
448 | * list causing accounting errors and BUG_ON checks to trigger. | ||
449 | */ | ||
450 | for (i = pagevec_count(pvec) - 1; i >= 0; i--) { | ||
451 | struct page *pagevec_page = pvec->pages[i]; | ||
452 | |||
453 | if (pagevec_page == page) { | ||
454 | SetPageActive(page); | ||
455 | break; | ||
456 | } | ||
457 | } | ||
458 | |||
459 | put_cpu_var(lru_add_pvec); | ||
460 | } | ||
461 | |||
431 | /* | 462 | /* |
432 | * Mark a page as having seen activity. | 463 | * Mark a page as having seen activity. |
433 | * | 464 | * |
@@ -438,8 +469,18 @@ void activate_page(struct page *page) | |||
438 | void mark_page_accessed(struct page *page) | 469 | void mark_page_accessed(struct page *page) |
439 | { | 470 | { |
440 | if (!PageActive(page) && !PageUnevictable(page) && | 471 | if (!PageActive(page) && !PageUnevictable(page) && |
441 | PageReferenced(page) && PageLRU(page)) { | 472 | PageReferenced(page)) { |
442 | activate_page(page); | 473 | |
474 | /* | ||
475 | * If the page is on the LRU, queue it for activation via | ||
476 | * activate_page_pvecs. Otherwise, assume the page is on a | ||
477 | * pagevec, mark it active and it'll be moved to the active | ||
478 | * LRU on the next drain. | ||
479 | */ | ||
480 | if (PageLRU(page)) | ||
481 | activate_page(page); | ||
482 | else | ||
483 | __lru_cache_activate_page(page); | ||
443 | ClearPageReferenced(page); | 484 | ClearPageReferenced(page); |
444 | } else if (!PageReferenced(page)) { | 485 | } else if (!PageReferenced(page)) { |
445 | SetPageReferenced(page); | 486 | SetPageReferenced(page); |
@@ -448,42 +489,37 @@ void mark_page_accessed(struct page *page) | |||
448 | EXPORT_SYMBOL(mark_page_accessed); | 489 | EXPORT_SYMBOL(mark_page_accessed); |
449 | 490 | ||
450 | /* | 491 | /* |
451 | * Order of operations is important: flush the pagevec when it's already | 492 | * Queue the page for addition to the LRU via pagevec. The decision on whether |
452 | * full, not when adding the last page, to make sure that last page is | 493 | * to add the page to the [in]active [file|anon] list is deferred until the |
453 | * not added to the LRU directly when passed to this function. Because | 494 | * pagevec is drained. This gives a chance for the caller of __lru_cache_add() |
454 | * mark_page_accessed() (called after this when writing) only activates | 495 | * to have the page added to the active list using mark_page_accessed(). |
455 | * pages that are on the LRU, linear writes in subpage chunks would see | ||
456 | * every PAGEVEC_SIZE page activated, which is unexpected. | ||
457 | */ | 496 | */ |
458 | void __lru_cache_add(struct page *page, enum lru_list lru) | 497 | void __lru_cache_add(struct page *page) |
459 | { | 498 | { |
460 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; | 499 | struct pagevec *pvec = &get_cpu_var(lru_add_pvec); |
461 | 500 | ||
462 | page_cache_get(page); | 501 | page_cache_get(page); |
463 | if (!pagevec_space(pvec)) | 502 | if (!pagevec_space(pvec)) |
464 | __pagevec_lru_add(pvec, lru); | 503 | __pagevec_lru_add(pvec); |
465 | pagevec_add(pvec, page); | 504 | pagevec_add(pvec, page); |
466 | put_cpu_var(lru_add_pvecs); | 505 | put_cpu_var(lru_add_pvec); |
467 | } | 506 | } |
468 | EXPORT_SYMBOL(__lru_cache_add); | 507 | EXPORT_SYMBOL(__lru_cache_add); |
469 | 508 | ||
470 | /** | 509 | /** |
471 | * lru_cache_add_lru - add a page to a page list | 510 | * lru_cache_add - add a page to a page list |
472 | * @page: the page to be added to the LRU. | 511 | * @page: the page to be added to the LRU. |
473 | * @lru: the LRU list to which the page is added. | ||
474 | */ | 512 | */ |
475 | void lru_cache_add_lru(struct page *page, enum lru_list lru) | 513 | void lru_cache_add(struct page *page) |
476 | { | 514 | { |
477 | if (PageActive(page)) { | 515 | if (PageActive(page)) { |
478 | VM_BUG_ON(PageUnevictable(page)); | 516 | VM_BUG_ON(PageUnevictable(page)); |
479 | ClearPageActive(page); | ||
480 | } else if (PageUnevictable(page)) { | 517 | } else if (PageUnevictable(page)) { |
481 | VM_BUG_ON(PageActive(page)); | 518 | VM_BUG_ON(PageActive(page)); |
482 | ClearPageUnevictable(page); | ||
483 | } | 519 | } |
484 | 520 | ||
485 | VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); | 521 | VM_BUG_ON(PageLRU(page)); |
486 | __lru_cache_add(page, lru); | 522 | __lru_cache_add(page); |
487 | } | 523 | } |
488 | 524 | ||
489 | /** | 525 | /** |
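With the lru argument gone from __lru_cache_add()/lru_cache_add(), callers that want a page to start life on the active list now set PG_active themselves before queueing it, as the mm/rmap.c hunk earlier in this diff does:

        SetPageActive(page);    /* replaces lru_cache_add_lru(page, LRU_ACTIVE_ANON) */
        lru_cache_add(page);    /* queued on the per-cpu lru_add_pvec until drained */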
@@ -583,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, | |||
583 | */ | 619 | */ |
584 | void lru_add_drain_cpu(int cpu) | 620 | void lru_add_drain_cpu(int cpu) |
585 | { | 621 | { |
586 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); | 622 | struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); |
587 | struct pagevec *pvec; | ||
588 | int lru; | ||
589 | 623 | ||
590 | for_each_lru(lru) { | 624 | if (pagevec_count(pvec)) |
591 | pvec = &pvecs[lru - LRU_BASE]; | 625 | __pagevec_lru_add(pvec); |
592 | if (pagevec_count(pvec)) | ||
593 | __pagevec_lru_add(pvec, lru); | ||
594 | } | ||
595 | 626 | ||
596 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | 627 | pvec = &per_cpu(lru_rotate_pvecs, cpu); |
597 | if (pagevec_count(pvec)) { | 628 | if (pagevec_count(pvec)) { |
@@ -708,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold) | |||
708 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 739 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
709 | } | 740 | } |
710 | 741 | ||
742 | /* Clear Active bit in case of parallel mark_page_accessed */ | ||
743 | ClearPageActive(page); | ||
744 | |||
711 | list_add(&page->lru, &pages_to_free); | 745 | list_add(&page->lru, &pages_to_free); |
712 | } | 746 | } |
713 | if (zone) | 747 | if (zone) |
@@ -795,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
795 | static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, | 829 | static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, |
796 | void *arg) | 830 | void *arg) |
797 | { | 831 | { |
798 | enum lru_list lru = (enum lru_list)arg; | 832 | int file = page_is_file_cache(page); |
799 | int file = is_file_lru(lru); | 833 | int active = PageActive(page); |
800 | int active = is_active_lru(lru); | 834 | enum lru_list lru = page_lru(page); |
801 | 835 | ||
802 | VM_BUG_ON(PageActive(page)); | ||
803 | VM_BUG_ON(PageUnevictable(page)); | 836 | VM_BUG_ON(PageUnevictable(page)); |
804 | VM_BUG_ON(PageLRU(page)); | 837 | VM_BUG_ON(PageLRU(page)); |
805 | 838 | ||
806 | SetPageLRU(page); | 839 | SetPageLRU(page); |
807 | if (active) | ||
808 | SetPageActive(page); | ||
809 | add_page_to_lru_list(page, lruvec, lru); | 840 | add_page_to_lru_list(page, lruvec, lru); |
810 | update_page_reclaim_stat(lruvec, file, active); | 841 | update_page_reclaim_stat(lruvec, file, active); |
842 | trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); | ||
811 | } | 843 | } |
812 | 844 | ||
813 | /* | 845 | /* |
814 | * Add the passed pages to the LRU, then drop the caller's refcount | 846 | * Add the passed pages to the LRU, then drop the caller's refcount |
815 | * on them. Reinitialises the caller's pagevec. | 847 | * on them. Reinitialises the caller's pagevec. |
816 | */ | 848 | */ |
817 | void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) | 849 | void __pagevec_lru_add(struct pagevec *pvec) |
818 | { | 850 | { |
819 | VM_BUG_ON(is_unevictable_lru(lru)); | 851 | pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); |
820 | |||
821 | pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); | ||
822 | } | 852 | } |
823 | EXPORT_SYMBOL(__pagevec_lru_add); | 853 | EXPORT_SYMBOL(__pagevec_lru_add); |
824 | 854 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 746af55b8455..36af6eeaa67e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, | |||
212 | si->cluster_nr = SWAPFILE_CLUSTER - 1; | 212 | si->cluster_nr = SWAPFILE_CLUSTER - 1; |
213 | goto checks; | 213 | goto checks; |
214 | } | 214 | } |
215 | if (si->flags & SWP_DISCARDABLE) { | 215 | if (si->flags & SWP_PAGE_DISCARD) { |
216 | /* | 216 | /* |
217 | * Start range check on racing allocations, in case | 217 | * Start range check on racing allocations, in case |
218 | * they overlap the cluster we eventually decide on | 218 | * they overlap the cluster we eventually decide on |
@@ -322,7 +322,7 @@ checks: | |||
322 | 322 | ||
323 | if (si->lowest_alloc) { | 323 | if (si->lowest_alloc) { |
324 | /* | 324 | /* |
325 | * Only set when SWP_DISCARDABLE, and there's a scan | 325 | * Only set when SWP_PAGE_DISCARD, and there's a scan |
326 | * for a free cluster in progress or just completed. | 326 | * for a free cluster in progress or just completed. |
327 | */ | 327 | */ |
328 | if (found_free_cluster) { | 328 | if (found_free_cluster) { |
@@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, | |||
2016 | return nr_extents; | 2016 | return nr_extents; |
2017 | } | 2017 | } |
2018 | 2018 | ||
2019 | /* | ||
2020 | * Helper to sys_swapon determining if a given swap | ||
2021 | * backing device queue supports DISCARD operations. | ||
2022 | */ | ||
2023 | static bool swap_discardable(struct swap_info_struct *si) | ||
2024 | { | ||
2025 | struct request_queue *q = bdev_get_queue(si->bdev); | ||
2026 | |||
2027 | if (!q || !blk_queue_discard(q)) | ||
2028 | return false; | ||
2029 | |||
2030 | return true; | ||
2031 | } | ||
2032 | |||
2019 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 2033 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
2020 | { | 2034 | { |
2021 | struct swap_info_struct *p; | 2035 | struct swap_info_struct *p; |
@@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2123 | p->flags |= SWP_SOLIDSTATE; | 2137 | p->flags |= SWP_SOLIDSTATE; |
2124 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); | 2138 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); |
2125 | } | 2139 | } |
2126 | if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) | 2140 | |
2127 | p->flags |= SWP_DISCARDABLE; | 2141 | if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { |
2142 | /* | ||
2143 | * When discard is enabled for swap with no particular | ||
2144 | * policy flagged, we set all swap discard flags here in | ||
2145 | * order to sustain backward compatibility with older | ||
2146 | * swapon(8) releases. | ||
2147 | */ | ||
2148 | p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | | ||
2149 | SWP_PAGE_DISCARD); | ||
2150 | |||
2151 | /* | ||
2152 | * By flagging sys_swapon, a sysadmin can tell us to | ||
2153 | * either do single-time area discards only, or to just | ||
2154 | * perform discards for released swap page-clusters. | ||
2155 | * Now it's time to adjust the p->flags accordingly. | ||
2156 | */ | ||
2157 | if (swap_flags & SWAP_FLAG_DISCARD_ONCE) | ||
2158 | p->flags &= ~SWP_PAGE_DISCARD; | ||
2159 | else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) | ||
2160 | p->flags &= ~SWP_AREA_DISCARD; | ||
2161 | |||
2162 | /* issue a swapon-time discard if it's still required */ | ||
2163 | if (p->flags & SWP_AREA_DISCARD) { | ||
2164 | int err = discard_swap(p); | ||
2165 | if (unlikely(err)) | ||
2166 | printk(KERN_ERR | ||
2167 | "swapon: discard_swap(%p): %d\n", | ||
2168 | p, err); | ||
2169 | } | ||
2170 | } | ||
2128 | } | 2171 | } |
2129 | 2172 | ||
2130 | mutex_lock(&swapon_mutex); | 2173 | mutex_lock(&swapon_mutex); |
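Taken together, a plain SWAP_FLAG_DISCARD keeps both discard behaviours for compatibility with older swapon(8), while the _ONCE and _PAGES variants narrow the policy to one of them. A minimal sketch of that selection, reusing the flag names from the hunk above (the standalone helper itself is illustrative only, not part of the patch):

    /* Illustrative only: maps swapon(2) flags to the SWP_* discard policy. */
    static unsigned long swap_discard_policy(unsigned long swap_flags)
    {
            unsigned long flags = SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD;

            if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
                    flags &= ~SWP_PAGE_DISCARD;     /* keep only the swapon-time area discard */
            else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
                    flags &= ~SWP_AREA_DISCARD;     /* keep only per page-cluster discards */

            return flags;
    }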
@@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2135 | enable_swap_info(p, prio, swap_map, frontswap_map); | 2178 | enable_swap_info(p, prio, swap_map, frontswap_map); |
2136 | 2179 | ||
2137 | printk(KERN_INFO "Adding %uk swap on %s. " | 2180 | printk(KERN_INFO "Adding %uk swap on %s. " |
2138 | "Priority:%d extents:%d across:%lluk %s%s%s\n", | 2181 | "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", |
2139 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, | 2182 | p->pages<<(PAGE_SHIFT-10), name->name, p->prio, |
2140 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2183 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2141 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2184 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2142 | (p->flags & SWP_DISCARDABLE) ? "D" : "", | 2185 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
2186 | (p->flags & SWP_AREA_DISCARD) ? "s" : "", | ||
2187 | (p->flags & SWP_PAGE_DISCARD) ? "c" : "", | ||
2143 | (frontswap_map) ? "FS" : ""); | 2188 | (frontswap_map) ? "FS" : ""); |
2144 | 2189 | ||
2145 | mutex_unlock(&swapon_mutex); | 2190 | mutex_unlock(&swapon_mutex); |
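With the two added specifiers, a fully discard-enabled SSD swap partition would be announced roughly as below; size, device and priority are made-up values, only the trailing flag letters (SS, D, s, c, FS) follow the format string above:

    Adding 4194300k swap on /dev/sdb1. Priority:-1 extents:1 across:4194300k SSDsc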
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
295 | { | 295 | { |
296 | mm->mmap_base = TASK_UNMAPPED_BASE; | 296 | mm->mmap_base = TASK_UNMAPPED_BASE; |
297 | mm->get_unmapped_area = arch_get_unmapped_area; | 297 | mm->get_unmapped_area = arch_get_unmapped_area; |
298 | mm->unmap_area = arch_unmap_area; | ||
299 | } | 298 | } |
300 | #endif | 299 | #endif |
301 | 300 | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d365724feb05..13a54953a273 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) | |||
292 | va = rb_entry(n, struct vmap_area, rb_node); | 292 | va = rb_entry(n, struct vmap_area, rb_node); |
293 | if (addr < va->va_start) | 293 | if (addr < va->va_start) |
294 | n = n->rb_left; | 294 | n = n->rb_left; |
295 | else if (addr > va->va_start) | 295 | else if (addr >= va->va_end) |
296 | n = n->rb_right; | 296 | n = n->rb_right; |
297 | else | 297 | else |
298 | return va; | 298 | return va; |
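The new comparison turns the lookup from an exact match on va_start into an interval test, so any address inside a mapped area now finds its vmap_area. Expressed as a plain predicate (illustrative, not part of the patch):

    /* An address hits the area when va_start <= addr < va_end. */
    static bool addr_in_vmap_area(unsigned long addr, const struct vmap_area *va)
    {
            return addr >= va->va_start && addr < va->va_end;
    }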
@@ -388,12 +388,12 @@ nocache: | |||
388 | addr = ALIGN(first->va_end, align); | 388 | addr = ALIGN(first->va_end, align); |
389 | if (addr < vstart) | 389 | if (addr < vstart) |
390 | goto nocache; | 390 | goto nocache; |
391 | if (addr + size - 1 < addr) | 391 | if (addr + size < addr) |
392 | goto overflow; | 392 | goto overflow; |
393 | 393 | ||
394 | } else { | 394 | } else { |
395 | addr = ALIGN(vstart, align); | 395 | addr = ALIGN(vstart, align); |
396 | if (addr + size - 1 < addr) | 396 | if (addr + size < addr) |
397 | goto overflow; | 397 | goto overflow; |
398 | 398 | ||
399 | n = vmap_area_root.rb_node; | 399 | n = vmap_area_root.rb_node; |
@@ -420,7 +420,7 @@ nocache: | |||
420 | if (addr + cached_hole_size < first->va_start) | 420 | if (addr + cached_hole_size < first->va_start) |
421 | cached_hole_size = first->va_start - addr; | 421 | cached_hole_size = first->va_start - addr; |
422 | addr = ALIGN(first->va_end, align); | 422 | addr = ALIGN(first->va_end, align); |
423 | if (addr + size - 1 < addr) | 423 | if (addr + size < addr) |
424 | goto overflow; | 424 | goto overflow; |
425 | 425 | ||
426 | if (list_is_last(&first->list, &vmap_area_list)) | 426 | if (list_is_last(&first->list, &vmap_area_list)) |
@@ -754,7 +754,6 @@ struct vmap_block { | |||
754 | struct vmap_area *va; | 754 | struct vmap_area *va; |
755 | struct vmap_block_queue *vbq; | 755 | struct vmap_block_queue *vbq; |
756 | unsigned long free, dirty; | 756 | unsigned long free, dirty; |
757 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | ||
758 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 757 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); |
759 | struct list_head free_list; | 758 | struct list_head free_list; |
760 | struct rcu_head rcu_head; | 759 | struct rcu_head rcu_head; |
@@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
820 | vb->va = va; | 819 | vb->va = va; |
821 | vb->free = VMAP_BBMAP_BITS; | 820 | vb->free = VMAP_BBMAP_BITS; |
822 | vb->dirty = 0; | 821 | vb->dirty = 0; |
823 | bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); | ||
824 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | 822 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); |
825 | INIT_LIST_HEAD(&vb->free_list); | 823 | INIT_LIST_HEAD(&vb->free_list); |
826 | 824 | ||
@@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu) | |||
873 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { | 871 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { |
874 | vb->free = 0; /* prevent further allocs after releasing lock */ | 872 | vb->free = 0; /* prevent further allocs after releasing lock */ |
875 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ | 873 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ |
876 | bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); | ||
877 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); | 874 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); |
878 | spin_lock(&vbq->lock); | 875 | spin_lock(&vbq->lock); |
879 | list_del_rcu(&vb->free_list); | 876 | list_del_rcu(&vb->free_list); |
@@ -891,11 +888,6 @@ static void purge_fragmented_blocks(int cpu) | |||
891 | } | 888 | } |
892 | } | 889 | } |
893 | 890 | ||
894 | static void purge_fragmented_blocks_thiscpu(void) | ||
895 | { | ||
896 | purge_fragmented_blocks(smp_processor_id()); | ||
897 | } | ||
898 | |||
899 | static void purge_fragmented_blocks_allcpus(void) | 891 | static void purge_fragmented_blocks_allcpus(void) |
900 | { | 892 | { |
901 | int cpu; | 893 | int cpu; |
@@ -910,7 +902,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
910 | struct vmap_block *vb; | 902 | struct vmap_block *vb; |
911 | unsigned long addr = 0; | 903 | unsigned long addr = 0; |
912 | unsigned int order; | 904 | unsigned int order; |
913 | int purge = 0; | ||
914 | 905 | ||
915 | BUG_ON(size & ~PAGE_MASK); | 906 | BUG_ON(size & ~PAGE_MASK); |
916 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 907 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
@@ -934,17 +925,7 @@ again: | |||
934 | if (vb->free < 1UL << order) | 925 | if (vb->free < 1UL << order) |
935 | goto next; | 926 | goto next; |
936 | 927 | ||
937 | i = bitmap_find_free_region(vb->alloc_map, | 928 | i = VMAP_BBMAP_BITS - vb->free; |
938 | VMAP_BBMAP_BITS, order); | ||
939 | |||
940 | if (i < 0) { | ||
941 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { | ||
942 | /* fragmented and no outstanding allocations */ | ||
943 | BUG_ON(vb->dirty != VMAP_BBMAP_BITS); | ||
944 | purge = 1; | ||
945 | } | ||
946 | goto next; | ||
947 | } | ||
948 | addr = vb->va->va_start + (i << PAGE_SHIFT); | 929 | addr = vb->va->va_start + (i << PAGE_SHIFT); |
949 | BUG_ON(addr_to_vb_idx(addr) != | 930 | BUG_ON(addr_to_vb_idx(addr) != |
950 | addr_to_vb_idx(vb->va->va_start)); | 931 | addr_to_vb_idx(vb->va->va_start)); |
@@ -960,9 +941,6 @@ next: | |||
960 | spin_unlock(&vb->lock); | 941 | spin_unlock(&vb->lock); |
961 | } | 942 | } |
962 | 943 | ||
963 | if (purge) | ||
964 | purge_fragmented_blocks_thiscpu(); | ||
965 | |||
966 | put_cpu_var(vmap_block_queue); | 944 | put_cpu_var(vmap_block_queue); |
967 | rcu_read_unlock(); | 945 | rcu_read_unlock(); |
968 | 946 | ||
@@ -1311,22 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1311 | spin_unlock(&vmap_area_lock); | 1289 | spin_unlock(&vmap_area_lock); |
1312 | } | 1290 | } |
1313 | 1291 | ||
1314 | static void clear_vm_unlist(struct vm_struct *vm) | 1292 | static void clear_vm_uninitialized_flag(struct vm_struct *vm) |
1315 | { | 1293 | { |
1316 | /* | 1294 | /* |
1317 | * Before removing VM_UNLIST, | 1295 | * Before removing VM_UNINITIALIZED, |
1318 | * we should make sure that vm has proper values. | 1296 | * we should make sure that vm has proper values. |
1319 | * Pair with smp_rmb() in show_numa_info(). | 1297 | * Pair with smp_rmb() in show_numa_info(). |
1320 | */ | 1298 | */ |
1321 | smp_wmb(); | 1299 | smp_wmb(); |
1322 | vm->flags &= ~VM_UNLIST; | 1300 | vm->flags &= ~VM_UNINITIALIZED; |
1323 | } | ||
1324 | |||
1325 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | ||
1326 | unsigned long flags, const void *caller) | ||
1327 | { | ||
1328 | setup_vmalloc_vm(vm, va, flags, caller); | ||
1329 | clear_vm_unlist(vm); | ||
1330 | } | 1301 | } |
1331 | 1302 | ||
1332 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1303 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
@@ -1337,16 +1308,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1337 | struct vm_struct *area; | 1308 | struct vm_struct *area; |
1338 | 1309 | ||
1339 | BUG_ON(in_interrupt()); | 1310 | BUG_ON(in_interrupt()); |
1340 | if (flags & VM_IOREMAP) { | 1311 | if (flags & VM_IOREMAP) |
1341 | int bit = fls(size); | 1312 | align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); |
1342 | |||
1343 | if (bit > IOREMAP_MAX_ORDER) | ||
1344 | bit = IOREMAP_MAX_ORDER; | ||
1345 | else if (bit < PAGE_SHIFT) | ||
1346 | bit = PAGE_SHIFT; | ||
1347 | |||
1348 | align = 1ul << bit; | ||
1349 | } | ||
1350 | 1313 | ||
1351 | size = PAGE_ALIGN(size); | 1314 | size = PAGE_ALIGN(size); |
1352 | if (unlikely(!size)) | 1315 | if (unlikely(!size)) |
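clamp(val, lo, hi) is simply min(max(val, lo), hi), so the one-liner bounds fls(size) exactly as the removed if/else chain did. As a worked example under common assumptions (4K pages, so PAGE_SHIFT == 12, and the generic IOREMAP_MAX_ORDER of 7 + PAGE_SHIFT == 19): a 12KB VM_IOREMAP request gives fls(12288) == 14, clamp(14, 12, 19) == 14, hence align == 1ul << 14, i.e. 16KB. A free-standing sketch of the same bounding:

    /* What clamp() does, written out (illustrative helper, not kernel code). */
    static int clamp_int(int val, int lo, int hi)
    {
            if (val < lo)
                    return lo;
            if (val > hi)
                    return hi;
            return val;
    }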
@@ -1367,16 +1330,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1367 | return NULL; | 1330 | return NULL; |
1368 | } | 1331 | } |
1369 | 1332 | ||
1370 | /* | 1333 | setup_vmalloc_vm(area, va, flags, caller); |
1371 | * When this function is called from __vmalloc_node_range, | ||
1372 | * we add VM_UNLIST flag to avoid accessing uninitialized | ||
1373 | * members of vm_struct such as pages and nr_pages fields. | ||
1374 | * They will be set later. | ||
1375 | */ | ||
1376 | if (flags & VM_UNLIST) | ||
1377 | setup_vmalloc_vm(area, va, flags, caller); | ||
1378 | else | ||
1379 | insert_vmalloc_vm(area, va, flags, caller); | ||
1380 | 1334 | ||
1381 | return area; | 1335 | return area; |
1382 | } | 1336 | } |
@@ -1476,10 +1430,9 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1476 | if (!addr) | 1430 | if (!addr) |
1477 | return; | 1431 | return; |
1478 | 1432 | ||
1479 | if ((PAGE_SIZE-1) & (unsigned long)addr) { | 1433 | if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", |
1480 | WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); | 1434 | addr)) |
1481 | return; | 1435 | return; |
1482 | } | ||
1483 | 1436 | ||
1484 | area = remove_vm_area(addr); | 1437 | area = remove_vm_area(addr); |
1485 | if (unlikely(!area)) { | 1438 | if (unlikely(!area)) { |
@@ -1524,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1524 | * conventions for vfree() arch-dependent would be a really bad idea) | 1477 | * conventions for vfree() arch-dependent would be a really bad idea) |
1525 | * | 1478 | * |
1526 | * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) | 1479 | * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) |
1527 | * | ||
1528 | */ | 1480 | */ |
1529 | void vfree(const void *addr) | 1481 | void vfree(const void *addr) |
1530 | { | 1482 | { |
@@ -1536,8 +1488,8 @@ void vfree(const void *addr) | |||
1536 | return; | 1488 | return; |
1537 | if (unlikely(in_interrupt())) { | 1489 | if (unlikely(in_interrupt())) { |
1538 | struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); | 1490 | struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); |
1539 | llist_add((struct llist_node *)addr, &p->list); | 1491 | if (llist_add((struct llist_node *)addr, &p->list)) |
1540 | schedule_work(&p->wq); | 1492 | schedule_work(&p->wq); |
1541 | } else | 1493 | } else |
1542 | __vunmap(addr, 1); | 1494 | __vunmap(addr, 1); |
1543 | } | 1495 | } |
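llist_add() returns true only when the list was empty beforehand, so the worker is now scheduled once per batch of interrupt-context frees and then drains everything queued in the meantime, instead of being kicked for every single vfree(). The same idiom in isolation (struct and names below are illustrative):

    struct vfree_batch {
            struct llist_head list;
            struct work_struct wq;      /* worker walks the llist and frees each entry */
    };

    static void queue_deferred_free(struct vfree_batch *p, void *addr)
    {
            /* true only for the first node added to an empty list */
            if (llist_add((struct llist_node *)addr, &p->list))
                    schedule_work(&p->wq);
    }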
@@ -1682,21 +1634,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1682 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1634 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1683 | goto fail; | 1635 | goto fail; |
1684 | 1636 | ||
1685 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, | 1637 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, |
1686 | start, end, node, gfp_mask, caller); | 1638 | start, end, node, gfp_mask, caller); |
1687 | if (!area) | 1639 | if (!area) |
1688 | goto fail; | 1640 | goto fail; |
1689 | 1641 | ||
1690 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1642 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
1691 | if (!addr) | 1643 | if (!addr) |
1692 | return NULL; | 1644 | goto fail; |
1693 | 1645 | ||
1694 | /* | 1646 | /* |
1695 | * In this function, newly allocated vm_struct has VM_UNLIST flag. | 1647 | * In this function, newly allocated vm_struct has VM_UNINITIALIZED |
1696 | * It means that vm_struct is not fully initialized. | 1648 | * flag. It means that vm_struct is not fully initialized. |
1697 | * Now, it is fully initialized, so remove this flag here. | 1649 | * Now, it is fully initialized, so remove this flag here. |
1698 | */ | 1650 | */ |
1699 | clear_vm_unlist(area); | 1651 | clear_vm_uninitialized_flag(area); |
1700 | 1652 | ||
1701 | /* | 1653 | /* |
1702 | * A ref_count = 3 is needed because the vm_struct and vmap_area | 1654 | * A ref_count = 3 is needed because the vm_struct and vmap_area |
@@ -2148,42 +2100,43 @@ finished: | |||
2148 | } | 2100 | } |
2149 | 2101 | ||
2150 | /** | 2102 | /** |
2151 | * remap_vmalloc_range - map vmalloc pages to userspace | 2103 | * remap_vmalloc_range_partial - map vmalloc pages to userspace |
2152 | * @vma: vma to cover (map full range of vma) | 2104 | * @vma: vma to cover |
2153 | * @addr: vmalloc memory | 2105 | * @uaddr: target user address to start at |
2154 | * @pgoff: number of pages into addr before first page to map | 2106 | * @kaddr: virtual address of vmalloc kernel memory |
2107 | * @size: size of map area | ||
2155 | * | 2108 | * |
2156 | * Returns: 0 for success, -Exxx on failure | 2109 | * Returns: 0 for success, -Exxx on failure |
2157 | * | 2110 | * |
2158 | * This function checks that addr is a valid vmalloc'ed area, and | 2111 | * This function checks that @kaddr is a valid vmalloc'ed area, |
2159 | * that it is big enough to cover the vma. Will return failure if | 2112 | * and that it is big enough to cover the range starting at |
2160 | * that criterion isn't met. | 2113 | * @uaddr in @vma. Will return failure if that criterion isn't |
2114 | * met. | ||
2161 | * | 2115 | * |
2162 | * Similar to remap_pfn_range() (see mm/memory.c) | 2116 | * Similar to remap_pfn_range() (see mm/memory.c) |
2163 | */ | 2117 | */ |
2164 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | 2118 | int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, |
2165 | unsigned long pgoff) | 2119 | void *kaddr, unsigned long size) |
2166 | { | 2120 | { |
2167 | struct vm_struct *area; | 2121 | struct vm_struct *area; |
2168 | unsigned long uaddr = vma->vm_start; | ||
2169 | unsigned long usize = vma->vm_end - vma->vm_start; | ||
2170 | 2122 | ||
2171 | if ((PAGE_SIZE-1) & (unsigned long)addr) | 2123 | size = PAGE_ALIGN(size); |
2124 | |||
2125 | if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) | ||
2172 | return -EINVAL; | 2126 | return -EINVAL; |
2173 | 2127 | ||
2174 | area = find_vm_area(addr); | 2128 | area = find_vm_area(kaddr); |
2175 | if (!area) | 2129 | if (!area) |
2176 | return -EINVAL; | 2130 | return -EINVAL; |
2177 | 2131 | ||
2178 | if (!(area->flags & VM_USERMAP)) | 2132 | if (!(area->flags & VM_USERMAP)) |
2179 | return -EINVAL; | 2133 | return -EINVAL; |
2180 | 2134 | ||
2181 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | 2135 | if (kaddr + size > area->addr + area->size) |
2182 | return -EINVAL; | 2136 | return -EINVAL; |
2183 | 2137 | ||
2184 | addr += pgoff << PAGE_SHIFT; | ||
2185 | do { | 2138 | do { |
2186 | struct page *page = vmalloc_to_page(addr); | 2139 | struct page *page = vmalloc_to_page(kaddr); |
2187 | int ret; | 2140 | int ret; |
2188 | 2141 | ||
2189 | ret = vm_insert_page(vma, uaddr, page); | 2142 | ret = vm_insert_page(vma, uaddr, page); |
@@ -2191,14 +2144,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
2191 | return ret; | 2144 | return ret; |
2192 | 2145 | ||
2193 | uaddr += PAGE_SIZE; | 2146 | uaddr += PAGE_SIZE; |
2194 | addr += PAGE_SIZE; | 2147 | kaddr += PAGE_SIZE; |
2195 | usize -= PAGE_SIZE; | 2148 | size -= PAGE_SIZE; |
2196 | } while (usize > 0); | 2149 | } while (size > 0); |
2197 | 2150 | ||
2198 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; | 2151 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
2199 | 2152 | ||
2200 | return 0; | 2153 | return 0; |
2201 | } | 2154 | } |
2155 | EXPORT_SYMBOL(remap_vmalloc_range_partial); | ||
2156 | |||
2157 | /** | ||
2158 | * remap_vmalloc_range - map vmalloc pages to userspace | ||
2159 | * @vma: vma to cover (map full range of vma) | ||
2160 | * @addr: vmalloc memory | ||
2161 | * @pgoff: number of pages into addr before first page to map | ||
2162 | * | ||
2163 | * Returns: 0 for success, -Exxx on failure | ||
2164 | * | ||
2165 | * This function checks that addr is a valid vmalloc'ed area, and | ||
2166 | * that it is big enough to cover the vma. Will return failure if | ||
2167 | * that criterion isn't met. | ||
2168 | * | ||
2169 | * Similar to remap_pfn_range() (see mm/memory.c) | ||
2170 | */ | ||
2171 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | ||
2172 | unsigned long pgoff) | ||
2173 | { | ||
2174 | return remap_vmalloc_range_partial(vma, vma->vm_start, | ||
2175 | addr + (pgoff << PAGE_SHIFT), | ||
2176 | vma->vm_end - vma->vm_start); | ||
2177 | } | ||
2202 | EXPORT_SYMBOL(remap_vmalloc_range); | 2178 | EXPORT_SYMBOL(remap_vmalloc_range); |
2203 | 2179 | ||
2204 | /* | 2180 | /* |
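remap_vmalloc_range() is now a thin wrapper that hands vma->vm_start and the vma length to remap_vmalloc_range_partial(), so existing callers keep working unchanged. A typical use is a driver mmap handler exporting a vmalloc_user() buffer (handler and buffer names below are hypothetical):

    /* Hypothetical driver: `mydev_buf` was allocated with vmalloc_user(),
     * which sets VM_USERMAP as required by the check above. */
    static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
    {
            return remap_vmalloc_range(vma, mydev_buf, vma->vm_pgoff);
    }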
@@ -2512,8 +2488,8 @@ found: | |||
2512 | 2488 | ||
2513 | /* insert all vm's */ | 2489 | /* insert all vm's */ |
2514 | for (area = 0; area < nr_vms; area++) | 2490 | for (area = 0; area < nr_vms; area++) |
2515 | insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, | 2491 | setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, |
2516 | pcpu_get_vm_areas); | 2492 | pcpu_get_vm_areas); |
2517 | 2493 | ||
2518 | kfree(vas); | 2494 | kfree(vas); |
2519 | return vms; | 2495 | return vms; |
@@ -2592,11 +2568,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
2592 | if (!counters) | 2568 | if (!counters) |
2593 | return; | 2569 | return; |
2594 | 2570 | ||
2595 | /* Pair with smp_wmb() in clear_vm_unlist() */ | ||
2596 | smp_rmb(); | ||
2597 | if (v->flags & VM_UNLIST) | ||
2598 | return; | ||
2599 | |||
2600 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | 2571 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); |
2601 | 2572 | ||
2602 | for (nr = 0; nr < v->nr_pages; nr++) | 2573 | for (nr = 0; nr < v->nr_pages; nr++) |
@@ -2625,6 +2596,11 @@ static int s_show(struct seq_file *m, void *p) | |||
2625 | 2596 | ||
2626 | v = va->vm; | 2597 | v = va->vm; |
2627 | 2598 | ||
2599 | /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ | ||
2600 | smp_rmb(); | ||
2601 | if (v->flags & VM_UNINITIALIZED) | ||
2602 | return 0; | ||
2603 | |||
2628 | seq_printf(m, "0x%pK-0x%pK %7ld", | 2604 | seq_printf(m, "0x%pK-0x%pK %7ld", |
2629 | v->addr, v->addr + v->size, v->size); | 2605 | v->addr, v->addr + v->size, v->size); |
2630 | 2606 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index fa6a85378ee4..2cff0d491c6d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
546 | void putback_lru_page(struct page *page) | 546 | void putback_lru_page(struct page *page) |
547 | { | 547 | { |
548 | int lru; | 548 | int lru; |
549 | int active = !!TestClearPageActive(page); | ||
550 | int was_unevictable = PageUnevictable(page); | 549 | int was_unevictable = PageUnevictable(page); |
551 | 550 | ||
552 | VM_BUG_ON(PageLRU(page)); | 551 | VM_BUG_ON(PageLRU(page)); |
@@ -561,8 +560,8 @@ redo: | |||
561 | * unevictable page on [in]active list. | 560 | * unevictable page on [in]active list. |
562 | * We know how to handle that. | 561 | * We know how to handle that. |
563 | */ | 562 | */ |
564 | lru = active + page_lru_base_type(page); | 563 | lru = page_lru_base_type(page); |
565 | lru_cache_add_lru(page, lru); | 564 | lru_cache_add(page); |
566 | } else { | 565 | } else { |
567 | /* | 566 | /* |
568 | * Put unevictable pages directly on zone's unevictable | 567 | * Put unevictable pages directly on zone's unevictable |
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page, | |||
669 | return PAGEREF_RECLAIM; | 668 | return PAGEREF_RECLAIM; |
670 | } | 669 | } |
671 | 670 | ||
671 | /* Check if a page is dirty or under writeback */ | ||
672 | static void page_check_dirty_writeback(struct page *page, | ||
673 | bool *dirty, bool *writeback) | ||
674 | { | ||
675 | struct address_space *mapping; | ||
676 | |||
677 | /* | ||
678 | * Anonymous pages are not handled by flushers and must be written | ||
679 | * from reclaim context. Do not stall reclaim based on them | ||
680 | */ | ||
681 | if (!page_is_file_cache(page)) { | ||
682 | *dirty = false; | ||
683 | *writeback = false; | ||
684 | return; | ||
685 | } | ||
686 | |||
687 | /* By default assume that the page flags are accurate */ | ||
688 | *dirty = PageDirty(page); | ||
689 | *writeback = PageWriteback(page); | ||
690 | |||
691 | /* Verify dirty/writeback state if the filesystem supports it */ | ||
692 | if (!page_has_private(page)) | ||
693 | return; | ||
694 | |||
695 | mapping = page_mapping(page); | ||
696 | if (mapping && mapping->a_ops->is_dirty_writeback) | ||
697 | mapping->a_ops->is_dirty_writeback(page, dirty, writeback); | ||
698 | } | ||
699 | |||
672 | /* | 700 | /* |
673 | * shrink_page_list() returns the number of reclaimed pages | 701 | * shrink_page_list() returns the number of reclaimed pages |
674 | */ | 702 | */ |
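For pages carrying private (buffer) data the page flags may lag behind the state of the underlying buffers, which is why the address_space operation is consulted when a filesystem provides one. A buffer-head based hook could look roughly like this (illustrative, not any particular filesystem's implementation; locking omitted):

    static void myfs_is_dirty_writeback(struct page *page, bool *dirty,
                                        bool *writeback)
    {
            struct buffer_head *head = page_buffers(page), *bh = head;

            *dirty = false;
            *writeback = false;
            do {
                    if (buffer_locked(bh))
                            *writeback = true;      /* buffer currently under IO */
                    if (buffer_dirty(bh))
                            *dirty = true;
                    bh = bh->b_this_page;
            } while (bh != head);
    }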
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
677 | struct scan_control *sc, | 705 | struct scan_control *sc, |
678 | enum ttu_flags ttu_flags, | 706 | enum ttu_flags ttu_flags, |
679 | unsigned long *ret_nr_dirty, | 707 | unsigned long *ret_nr_dirty, |
708 | unsigned long *ret_nr_unqueued_dirty, | ||
709 | unsigned long *ret_nr_congested, | ||
680 | unsigned long *ret_nr_writeback, | 710 | unsigned long *ret_nr_writeback, |
711 | unsigned long *ret_nr_immediate, | ||
681 | bool force_reclaim) | 712 | bool force_reclaim) |
682 | { | 713 | { |
683 | LIST_HEAD(ret_pages); | 714 | LIST_HEAD(ret_pages); |
684 | LIST_HEAD(free_pages); | 715 | LIST_HEAD(free_pages); |
685 | int pgactivate = 0; | 716 | int pgactivate = 0; |
717 | unsigned long nr_unqueued_dirty = 0; | ||
686 | unsigned long nr_dirty = 0; | 718 | unsigned long nr_dirty = 0; |
687 | unsigned long nr_congested = 0; | 719 | unsigned long nr_congested = 0; |
688 | unsigned long nr_reclaimed = 0; | 720 | unsigned long nr_reclaimed = 0; |
689 | unsigned long nr_writeback = 0; | 721 | unsigned long nr_writeback = 0; |
722 | unsigned long nr_immediate = 0; | ||
690 | 723 | ||
691 | cond_resched(); | 724 | cond_resched(); |
692 | 725 | ||
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
696 | struct page *page; | 729 | struct page *page; |
697 | int may_enter_fs; | 730 | int may_enter_fs; |
698 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | 731 | enum page_references references = PAGEREF_RECLAIM_CLEAN; |
732 | bool dirty, writeback; | ||
699 | 733 | ||
700 | cond_resched(); | 734 | cond_resched(); |
701 | 735 | ||
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
723 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || | 757 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || |
724 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 758 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
725 | 759 | ||
760 | /* | ||
761 | * The number of dirty pages determines if a zone is marked | ||
762 | * reclaim_congested which affects wait_iff_congested. kswapd | ||
763 | * will stall and start writing pages if the tail of the LRU | ||
764 | * is all dirty unqueued pages. | ||
765 | */ | ||
766 | page_check_dirty_writeback(page, &dirty, &writeback); | ||
767 | if (dirty || writeback) | ||
768 | nr_dirty++; | ||
769 | |||
770 | if (dirty && !writeback) | ||
771 | nr_unqueued_dirty++; | ||
772 | |||
773 | /* | ||
774 | * Treat this page as congested if the underlying BDI is or if | ||
775 | * pages are cycling through the LRU so quickly that the | ||
776 | * pages marked for immediate reclaim are making it to the | ||
777 | * end of the LRU a second time. | ||
778 | */ | ||
779 | mapping = page_mapping(page); | ||
780 | if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || | ||
781 | (writeback && PageReclaim(page))) | ||
782 | nr_congested++; | ||
783 | |||
784 | /* | ||
785 | * If a page at the tail of the LRU is under writeback, there | ||
786 | * are three cases to consider. | ||
787 | * | ||
788 | * 1) If reclaim is encountering an excessive number of pages | ||
789 | * under writeback and this page is both under writeback and | ||
790 | * PageReclaim then it indicates that pages are being queued | ||
791 | * for IO but are being recycled through the LRU before the | ||
792 | * IO can complete. Waiting on the page itself risks an | ||
793 | * indefinite stall if it is impossible to writeback the | ||
794 | * page due to IO error or disconnected storage so instead | ||
795 | * note that the LRU is being scanned too quickly and the | ||
796 | * caller can stall after page list has been processed. | ||
797 | * | ||
798 | * 2) Global reclaim encounters a page, memcg encounters a | ||
799 | * page that is not marked for immediate reclaim or | ||
800 | * the caller does not have __GFP_IO. In this case mark | ||
801 | * the page for immediate reclaim and continue scanning. | ||
802 | * | ||
803 | * __GFP_IO is checked because a loop driver thread might | ||
804 | * enter reclaim, and deadlock if it waits on a page for | ||
805 | * which it is needed to do the write (loop masks off | ||
806 | * __GFP_IO|__GFP_FS for this reason); but more thought | ||
807 | * would probably show more reasons. | ||
808 | * | ||
809 | * Don't require __GFP_FS, since we're not going into the | ||
810 | * FS, just waiting on its writeback completion. Worryingly, | ||
811 | * ext4 gfs2 and xfs allocate pages with | ||
812 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing | ||
813 | * may_enter_fs here is liable to OOM on them. | ||
814 | * | ||
815 | * 3) memcg encounters a page that is not already marked | ||
816 | * PageReclaim. memcg does not have any dirty pages | ||
817 | * throttling so we could easily OOM just because too many | ||
818 | * pages are in writeback and there is nothing else to | ||
819 | * reclaim. Wait for the writeback to complete. | ||
820 | */ | ||
726 | if (PageWriteback(page)) { | 821 | if (PageWriteback(page)) { |
727 | /* | 822 | /* Case 1 above */ |
728 | * memcg doesn't have any dirty pages throttling so we | 823 | if (current_is_kswapd() && |
729 | * could easily OOM just because too many pages are in | 824 | PageReclaim(page) && |
730 | * writeback and there is nothing else to reclaim. | 825 | zone_is_reclaim_writeback(zone)) { |
731 | * | 826 | nr_immediate++; |
732 | * Check __GFP_IO, certainly because a loop driver | 827 | goto keep_locked; |
733 | * thread might enter reclaim, and deadlock if it waits | 828 | |
734 | * on a page for which it is needed to do the write | 829 | /* Case 2 above */ |
735 | * (loop masks off __GFP_IO|__GFP_FS for this reason); | 830 | } else if (global_reclaim(sc) || |
736 | * but more thought would probably show more reasons. | ||
737 | * | ||
738 | * Don't require __GFP_FS, since we're not going into | ||
739 | * the FS, just waiting on its writeback completion. | ||
740 | * Worryingly, ext4 gfs2 and xfs allocate pages with | ||
741 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so | ||
742 | * testing may_enter_fs here is liable to OOM on them. | ||
743 | */ | ||
744 | if (global_reclaim(sc) || | ||
745 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { | 831 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { |
746 | /* | 832 | /* |
747 | * This is slightly racy - end_page_writeback() | 833 | * This is slightly racy - end_page_writeback() |
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
756 | */ | 842 | */ |
757 | SetPageReclaim(page); | 843 | SetPageReclaim(page); |
758 | nr_writeback++; | 844 | nr_writeback++; |
845 | |||
759 | goto keep_locked; | 846 | goto keep_locked; |
847 | |||
848 | /* Case 3 above */ | ||
849 | } else { | ||
850 | wait_on_page_writeback(page); | ||
760 | } | 851 | } |
761 | wait_on_page_writeback(page); | ||
762 | } | 852 | } |
763 | 853 | ||
764 | if (!force_reclaim) | 854 | if (!force_reclaim) |
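Condensed, the handling of a PageWriteback page reduces to the decision below; this is just the three branches from the hunk gathered in one place, not new logic:

    if (current_is_kswapd() && PageReclaim(page) &&
        zone_is_reclaim_writeback(zone)) {
            nr_immediate++;                 /* case 1: pages recycling faster than IO completes */
            goto keep_locked;
    } else if (global_reclaim(sc) || !PageReclaim(page) ||
               !(sc->gfp_mask & __GFP_IO)) {
            SetPageReclaim(page);           /* case 2: tag for immediate reclaim, keep scanning */
            nr_writeback++;
            goto keep_locked;
    } else {
            wait_on_page_writeback(page);   /* case 3: memcg reclaim with nothing else to do */
    }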
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
784 | if (!add_to_swap(page, page_list)) | 874 | if (!add_to_swap(page, page_list)) |
785 | goto activate_locked; | 875 | goto activate_locked; |
786 | may_enter_fs = 1; | 876 | may_enter_fs = 1; |
787 | } | ||
788 | 877 | ||
789 | mapping = page_mapping(page); | 878 | /* Adding to swap updated mapping */ |
879 | mapping = page_mapping(page); | ||
880 | } | ||
790 | 881 | ||
791 | /* | 882 | /* |
792 | * The page is mapped into the page tables of one or more | 883 | * The page is mapped into the page tables of one or more |
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
806 | } | 897 | } |
807 | 898 | ||
808 | if (PageDirty(page)) { | 899 | if (PageDirty(page)) { |
809 | nr_dirty++; | ||
810 | |||
811 | /* | 900 | /* |
812 | * Only kswapd can writeback filesystem pages to | 901 | * Only kswapd can writeback filesystem pages to |
813 | * avoid risk of stack overflow but do not writeback | 902 | * avoid risk of stack overflow but only writeback |
814 | * unless under significant pressure. | 903 | * if many dirty pages have been encountered. |
815 | */ | 904 | */ |
816 | if (page_is_file_cache(page) && | 905 | if (page_is_file_cache(page) && |
817 | (!current_is_kswapd() || | 906 | (!current_is_kswapd() || |
818 | sc->priority >= DEF_PRIORITY - 2)) { | 907 | !zone_is_reclaim_dirty(zone))) { |
819 | /* | 908 | /* |
820 | * Immediately reclaim when written back. | 909 | * Immediately reclaim when written back. |
821 | * Similar in principle to deactivate_page() | 910 | * Similar in principle to deactivate_page() |
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
838 | /* Page is dirty, try to write it out here */ | 927 | /* Page is dirty, try to write it out here */ |
839 | switch (pageout(page, mapping, sc)) { | 928 | switch (pageout(page, mapping, sc)) { |
840 | case PAGE_KEEP: | 929 | case PAGE_KEEP: |
841 | nr_congested++; | ||
842 | goto keep_locked; | 930 | goto keep_locked; |
843 | case PAGE_ACTIVATE: | 931 | case PAGE_ACTIVATE: |
844 | goto activate_locked; | 932 | goto activate_locked; |
@@ -946,22 +1034,16 @@ keep: | |||
946 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 1034 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
947 | } | 1035 | } |
948 | 1036 | ||
949 | /* | ||
950 | * Tag a zone as congested if all the dirty pages encountered were | ||
951 | * backed by a congested BDI. In this case, reclaimers should just | ||
952 | * back off and wait for congestion to clear because further reclaim | ||
953 | * will encounter the same problem | ||
954 | */ | ||
955 | if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) | ||
956 | zone_set_flag(zone, ZONE_CONGESTED); | ||
957 | |||
958 | free_hot_cold_page_list(&free_pages, 1); | 1037 | free_hot_cold_page_list(&free_pages, 1); |
959 | 1038 | ||
960 | list_splice(&ret_pages, page_list); | 1039 | list_splice(&ret_pages, page_list); |
961 | count_vm_events(PGACTIVATE, pgactivate); | 1040 | count_vm_events(PGACTIVATE, pgactivate); |
962 | mem_cgroup_uncharge_end(); | 1041 | mem_cgroup_uncharge_end(); |
963 | *ret_nr_dirty += nr_dirty; | 1042 | *ret_nr_dirty += nr_dirty; |
1043 | *ret_nr_congested += nr_congested; | ||
1044 | *ret_nr_unqueued_dirty += nr_unqueued_dirty; | ||
964 | *ret_nr_writeback += nr_writeback; | 1045 | *ret_nr_writeback += nr_writeback; |
1046 | *ret_nr_immediate += nr_immediate; | ||
965 | return nr_reclaimed; | 1047 | return nr_reclaimed; |
966 | } | 1048 | } |
967 | 1049 | ||
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
973 | .priority = DEF_PRIORITY, | 1055 | .priority = DEF_PRIORITY, |
974 | .may_unmap = 1, | 1056 | .may_unmap = 1, |
975 | }; | 1057 | }; |
976 | unsigned long ret, dummy1, dummy2; | 1058 | unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; |
977 | struct page *page, *next; | 1059 | struct page *page, *next; |
978 | LIST_HEAD(clean_pages); | 1060 | LIST_HEAD(clean_pages); |
979 | 1061 | ||
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
985 | } | 1067 | } |
986 | 1068 | ||
987 | ret = shrink_page_list(&clean_pages, zone, &sc, | 1069 | ret = shrink_page_list(&clean_pages, zone, &sc, |
988 | TTU_UNMAP|TTU_IGNORE_ACCESS, | 1070 | TTU_UNMAP|TTU_IGNORE_ACCESS, |
989 | &dummy1, &dummy2, true); | 1071 | &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); |
990 | list_splice(&clean_pages, page_list); | 1072 | list_splice(&clean_pages, page_list); |
991 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); | 1073 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); |
992 | return ret; | 1074 | return ret; |
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1281 | unsigned long nr_reclaimed = 0; | 1363 | unsigned long nr_reclaimed = 0; |
1282 | unsigned long nr_taken; | 1364 | unsigned long nr_taken; |
1283 | unsigned long nr_dirty = 0; | 1365 | unsigned long nr_dirty = 0; |
1366 | unsigned long nr_congested = 0; | ||
1367 | unsigned long nr_unqueued_dirty = 0; | ||
1284 | unsigned long nr_writeback = 0; | 1368 | unsigned long nr_writeback = 0; |
1369 | unsigned long nr_immediate = 0; | ||
1285 | isolate_mode_t isolate_mode = 0; | 1370 | isolate_mode_t isolate_mode = 0; |
1286 | int file = is_file_lru(lru); | 1371 | int file = is_file_lru(lru); |
1287 | struct zone *zone = lruvec_zone(lruvec); | 1372 | struct zone *zone = lruvec_zone(lruvec); |
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1323 | return 0; | 1408 | return 0; |
1324 | 1409 | ||
1325 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, | 1410 | nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, |
1326 | &nr_dirty, &nr_writeback, false); | 1411 | &nr_dirty, &nr_unqueued_dirty, &nr_congested, |
1412 | &nr_writeback, &nr_immediate, | ||
1413 | false); | ||
1327 | 1414 | ||
1328 | spin_lock_irq(&zone->lru_lock); | 1415 | spin_lock_irq(&zone->lru_lock); |
1329 | 1416 | ||
@@ -1356,21 +1443,51 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1356 | * as there is no guarantee the dirtying process is throttled in the | 1443 | * as there is no guarantee the dirtying process is throttled in the |
1357 | * same way balance_dirty_pages() manages. | 1444 | * same way balance_dirty_pages() manages. |
1358 | * | 1445 | * |
1359 | * This scales the number of dirty pages that must be under writeback | 1446 | * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number |
1360 | * before throttling depending on priority. It is a simple backoff | 1447 | * of pages under pages flagged for immediate reclaim and stall if any |
1361 | * function that has the most effect in the range DEF_PRIORITY to | 1448 | * are encountered in the nr_immediate check below. |
1362 | * DEF_PRIORITY-2 which is the priority reclaim is considered to be | 1449 | */ |
1363 | * in trouble and reclaim is considered to be in trouble. | 1450 | if (nr_writeback && nr_writeback == nr_taken) |
1364 | * | 1451 | zone_set_flag(zone, ZONE_WRITEBACK); |
1365 | * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle | 1452 | |
1366 | * DEF_PRIORITY-1 50% must be PageWriteback | 1453 | /* |
1367 | * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble | 1454 | * memcg will stall in page writeback so only consider forcibly |
1368 | * ... | 1455 | * stalling for global reclaim |
1369 | * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any | ||
1370 | * isolated page is PageWriteback | ||
1371 | */ | 1456 | */ |
1372 | if (nr_writeback && nr_writeback >= | 1457 | if (global_reclaim(sc)) { |
1373 | (nr_taken >> (DEF_PRIORITY - sc->priority))) | 1458 | /* |
1459 | * Tag a zone as congested if all the dirty pages scanned were | ||
1460 | * backed by a congested BDI and wait_iff_congested will stall. | ||
1461 | */ | ||
1462 | if (nr_dirty && nr_dirty == nr_congested) | ||
1463 | zone_set_flag(zone, ZONE_CONGESTED); | ||
1464 | |||
1465 | /* | ||
1466 | * If dirty pages are scanned that are not queued for IO, it | ||
1467 | * implies that flushers are not keeping up. In this case, flag | ||
1468 | * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing | ||
1469 | * pages from reclaim context. It will forcibly stall in the | ||
1470 | * next check. | ||
1471 | */ | ||
1472 | if (nr_unqueued_dirty == nr_taken) | ||
1473 | zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); | ||
1474 | |||
1475 | /* | ||
1476 | * In addition, if kswapd scans pages marked for | ||
1477 | * immediate reclaim and under writeback (nr_immediate), it | ||
1478 | * implies that pages are cycling through the LRU faster than | ||
1479 | * they are written so also forcibly stall. | ||
1480 | */ | ||
1481 | if (nr_unqueued_dirty == nr_taken || nr_immediate) | ||
1482 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
1483 | } | ||
1484 | |||
1485 | /* | ||
1486 | * Stall direct reclaim for IO completions if underlying BDIs or zone | ||
1487 | * is congested. Allow kswapd to continue until it starts encountering | ||
1488 | * unqueued dirty pages or cycling through the LRU too quickly. | ||
1489 | */ | ||
1490 | if (!sc->hibernation_mode && !current_is_kswapd()) | ||
1374 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); | 1491 | wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); |
1375 | 1492 | ||
1376 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, | 1493 | trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, |
@@ -1822,17 +1939,25 @@ out: | |||
1822 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | 1939 | static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) |
1823 | { | 1940 | { |
1824 | unsigned long nr[NR_LRU_LISTS]; | 1941 | unsigned long nr[NR_LRU_LISTS]; |
1942 | unsigned long targets[NR_LRU_LISTS]; | ||
1825 | unsigned long nr_to_scan; | 1943 | unsigned long nr_to_scan; |
1826 | enum lru_list lru; | 1944 | enum lru_list lru; |
1827 | unsigned long nr_reclaimed = 0; | 1945 | unsigned long nr_reclaimed = 0; |
1828 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1946 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1829 | struct blk_plug plug; | 1947 | struct blk_plug plug; |
1948 | bool scan_adjusted = false; | ||
1830 | 1949 | ||
1831 | get_scan_count(lruvec, sc, nr); | 1950 | get_scan_count(lruvec, sc, nr); |
1832 | 1951 | ||
1952 | /* Record the original scan target for proportional adjustments later */ | ||
1953 | memcpy(targets, nr, sizeof(nr)); | ||
1954 | |||
1833 | blk_start_plug(&plug); | 1955 | blk_start_plug(&plug); |
1834 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1956 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
1835 | nr[LRU_INACTIVE_FILE]) { | 1957 | nr[LRU_INACTIVE_FILE]) { |
1958 | unsigned long nr_anon, nr_file, percentage; | ||
1959 | unsigned long nr_scanned; | ||
1960 | |||
1836 | for_each_evictable_lru(lru) { | 1961 | for_each_evictable_lru(lru) { |
1837 | if (nr[lru]) { | 1962 | if (nr[lru]) { |
1838 | nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); | 1963 | nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); |
@@ -1842,17 +1967,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) | |||
1842 | lruvec, sc); | 1967 | lruvec, sc); |
1843 | } | 1968 | } |
1844 | } | 1969 | } |
1970 | |||
1971 | if (nr_reclaimed < nr_to_reclaim || scan_adjusted) | ||
1972 | continue; | ||
1973 | |||
1845 | /* | 1974 | /* |
1846 | * On large memory systems, scan >> priority can become | 1975 | * For global direct reclaim, reclaim only the number of pages |
1847 | * really large. This is fine for the starting priority; | 1976 | * requested. Less care is taken to scan proportionally as it |
1848 | * we want to put equal scanning pressure on each zone. | 1977 | * is more important to minimise direct reclaim stall latency |
1849 | * However, if the VM has a harder time of freeing pages, | 1978 | * than it is to properly age the LRU lists. |
1850 | * with multiple processes reclaiming pages, the total | ||
1851 | * freeing target can get unreasonably large. | ||
1852 | */ | 1979 | */ |
1853 | if (nr_reclaimed >= nr_to_reclaim && | 1980 | if (global_reclaim(sc) && !current_is_kswapd()) |
1854 | sc->priority < DEF_PRIORITY) | ||
1855 | break; | 1981 | break; |
1982 | |||
1983 | /* | ||
1984 | * For kswapd and memcg, reclaim at least the number of pages | ||
1985 | * requested. Ensure that the anon and file LRUs shrink | ||
1986 | * proportionally what was requested by get_scan_count(). We | ||
1987 | * stop reclaiming one LRU and reduce the amount scanning | ||
1988 | * proportional to the original scan target. | ||
1989 | */ | ||
1990 | nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; | ||
1991 | nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; | ||
1992 | |||
1993 | if (nr_file > nr_anon) { | ||
1994 | unsigned long scan_target = targets[LRU_INACTIVE_ANON] + | ||
1995 | targets[LRU_ACTIVE_ANON] + 1; | ||
1996 | lru = LRU_BASE; | ||
1997 | percentage = nr_anon * 100 / scan_target; | ||
1998 | } else { | ||
1999 | unsigned long scan_target = targets[LRU_INACTIVE_FILE] + | ||
2000 | targets[LRU_ACTIVE_FILE] + 1; | ||
2001 | lru = LRU_FILE; | ||
2002 | percentage = nr_file * 100 / scan_target; | ||
2003 | } | ||
2004 | |||
2005 | /* Stop scanning the smaller of the LRU */ | ||
2006 | nr[lru] = 0; | ||
2007 | nr[lru + LRU_ACTIVE] = 0; | ||
2008 | |||
2009 | /* | ||
2010 | * Recalculate the other LRU scan count based on its original | ||
2011 | * scan target and the percentage scanning already complete | ||
2012 | */ | ||
2013 | lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; | ||
2014 | nr_scanned = targets[lru] - nr[lru]; | ||
2015 | nr[lru] = targets[lru] * (100 - percentage) / 100; | ||
2016 | nr[lru] -= min(nr[lru], nr_scanned); | ||
2017 | |||
2018 | lru += LRU_ACTIVE; | ||
2019 | nr_scanned = targets[lru] - nr[lru]; | ||
2020 | nr[lru] = targets[lru] * (100 - percentage) / 100; | ||
2021 | nr[lru] -= min(nr[lru], nr_scanned); | ||
2022 | |||
2023 | scan_adjusted = true; | ||
1856 | } | 2024 | } |
1857 | blk_finish_plug(&plug); | 2025 | blk_finish_plug(&plug); |
1858 | sc->nr_reclaimed += nr_reclaimed; | 2026 | sc->nr_reclaimed += nr_reclaimed; |
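A worked example of the adjustment, with assumed targets: suppose get_scan_count() asked for 100+100 anon and 400+400 file pages, and the reclaim goal is met while 40+40 anon and 340+340 file pages are still pending. nr_file (680) exceeds nr_anon (80), so anon scanning stops and percentage = 80 * 100 / 201 = 39, meaning about 61% of the anon target was completed. Each file list is then cut to 400 * 61 / 100 = 244 pages minus the 60 already scanned, leaving 184, so the file LRUs continue to be aged in the same 61% proportion instead of being scanned to exhaustion.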
@@ -2179,8 +2347,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2179 | aborted_reclaim = shrink_zones(zonelist, sc); | 2347 | aborted_reclaim = shrink_zones(zonelist, sc); |
2180 | 2348 | ||
2181 | /* | 2349 | /* |
2182 | * Don't shrink slabs when reclaiming memory from | 2350 | * Don't shrink slabs when reclaiming memory from over limit |
2183 | * over limit cgroups | 2351 | * cgroups but do shrink slab at least once when aborting |
2352 | * reclaim for compaction to avoid unevenly scanning file/anon | ||
2353 | * LRU pages over slab pages. | ||
2184 | */ | 2354 | */ |
2185 | if (global_reclaim(sc)) { | 2355 | if (global_reclaim(sc)) { |
2186 | unsigned long lru_pages = 0; | 2356 | unsigned long lru_pages = 0; |
@@ -2222,18 +2392,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2222 | WB_REASON_TRY_TO_FREE_PAGES); | 2392 | WB_REASON_TRY_TO_FREE_PAGES); |
2223 | sc->may_writepage = 1; | 2393 | sc->may_writepage = 1; |
2224 | } | 2394 | } |
2225 | 2395 | } while (--sc->priority >= 0 && !aborted_reclaim); | |
2226 | /* Take a nap, wait for some writeback to complete */ | ||
2227 | if (!sc->hibernation_mode && sc->nr_scanned && | ||
2228 | sc->priority < DEF_PRIORITY - 2) { | ||
2229 | struct zone *preferred_zone; | ||
2230 | |||
2231 | first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), | ||
2232 | &cpuset_current_mems_allowed, | ||
2233 | &preferred_zone); | ||
2234 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); | ||
2235 | } | ||
2236 | } while (--sc->priority >= 0); | ||
2237 | 2396 | ||
2238 | out: | 2397 | out: |
2239 | delayacct_freepages_end(); | 2398 | delayacct_freepages_end(); |
@@ -2601,6 +2760,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2601 | } | 2760 | } |
2602 | 2761 | ||
2603 | /* | 2762 | /* |
2763 | * kswapd shrinks the zone by the number of pages required to reach | ||
2764 | * the high watermark. | ||
2765 | * | ||
2766 | * Returns true if kswapd scanned at least the requested number of pages to | ||
2767 | * reclaim or if the lack of progress was due to pages under writeback. | ||
2768 | * This is used to determine if the scanning priority needs to be raised. | ||
2769 | */ | ||
2770 | static bool kswapd_shrink_zone(struct zone *zone, | ||
2771 | int classzone_idx, | ||
2772 | struct scan_control *sc, | ||
2773 | unsigned long lru_pages, | ||
2774 | unsigned long *nr_attempted) | ||
2775 | { | ||
2776 | unsigned long nr_slab; | ||
2777 | int testorder = sc->order; | ||
2778 | unsigned long balance_gap; | ||
2779 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2780 | struct shrink_control shrink = { | ||
2781 | .gfp_mask = sc->gfp_mask, | ||
2782 | }; | ||
2783 | bool lowmem_pressure; | ||
2784 | |||
2785 | /* Reclaim above the high watermark. */ | ||
2786 | sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); | ||
2787 | |||
2788 | /* | ||
2789 | * Kswapd reclaims only single pages with compaction enabled. Trying | ||
2790 | * too hard to reclaim until contiguous free pages have become | ||
2791 | * available can hurt performance by evicting too much useful data | ||
2792 | * from memory. Do not reclaim more than needed for compaction. | ||
2793 | */ | ||
2794 | if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && | ||
2795 | compaction_suitable(zone, sc->order) != | ||
2796 | COMPACT_SKIPPED) | ||
2797 | testorder = 0; | ||
2798 | |||
2799 | /* | ||
2800 | * We put equal pressure on every zone, unless one zone has way too | ||
2801 | * many pages free already. The "too many pages" is defined as the | ||
2802 | * high wmark plus a "gap" where the gap is either the low | ||
2803 | * watermark or 1% of the zone, whichever is smaller. | ||
2804 | */ | ||
2805 | balance_gap = min(low_wmark_pages(zone), | ||
2806 | (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | ||
2807 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2808 | |||
2809 | /* | ||
2810 | * If there is no low memory pressure or the zone is balanced then no | ||
2811 | * reclaim is necessary | ||
2812 | */ | ||
2813 | lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); | ||
2814 | if (!lowmem_pressure && zone_balanced(zone, testorder, | ||
2815 | balance_gap, classzone_idx)) | ||
2816 | return true; | ||
2817 | |||
2818 | shrink_zone(zone, sc); | ||
2819 | |||
2820 | reclaim_state->reclaimed_slab = 0; | ||
2821 | nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); | ||
2822 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2823 | |||
2824 | /* Account for the number of pages attempted to reclaim */ | ||
2825 | *nr_attempted += sc->nr_to_reclaim; | ||
2826 | |||
2827 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2828 | zone->all_unreclaimable = 1; | ||
2829 | |||
2830 | zone_clear_flag(zone, ZONE_WRITEBACK); | ||
2831 | |||
2832 | /* | ||
2833 | * If a zone reaches its high watermark, consider it to be no longer | ||
2834 | * congested. It's possible there are dirty pages backed by congested | ||
2835 | * BDIs but as pressure is relieved, speculatively avoid congestion | ||
2836 | * waits. | ||
2837 | */ | ||
2838 | if (!zone->all_unreclaimable && | ||
2839 | zone_balanced(zone, testorder, 0, classzone_idx)) { | ||
2840 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2841 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | ||
2842 | } | ||
2843 | |||
2844 | return sc->nr_scanned >= sc->nr_to_reclaim; | ||
2845 | } | ||
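To put numbers on the balance gap: with a KSWAPD_ZONE_BALANCE_GAP_RATIO of 100 (its usual definition), a zone with, for example, 2,000,000 managed pages and a low watermark of 8,000 pages gets balance_gap = min(8000, (2000000 + 99) / 100) = min(8000, 20000) = 8000 pages, so kswapd_shrink_zone() treats the zone as balanced only once it clears the high watermark plus those 8,000 pages; on a much smaller zone the 1%-of-zone term is the one that wins.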
2846 | |||
2847 | /* | ||
2604 | * For kswapd, balance_pgdat() will work across all this node's zones until | 2848 | * For kswapd, balance_pgdat() will work across all this node's zones until |
2605 | * they are all at high_wmark_pages(zone). | 2849 | * they are all at high_wmark_pages(zone). |
2606 | * | 2850 | * |
@@ -2624,35 +2868,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2624 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | 2868 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order, |
2625 | int *classzone_idx) | 2869 | int *classzone_idx) |
2626 | { | 2870 | { |
2627 | bool pgdat_is_balanced = false; | ||
2628 | int i; | 2871 | int i; |
2629 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2872 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
2630 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2631 | unsigned long nr_soft_reclaimed; | 2873 | unsigned long nr_soft_reclaimed; |
2632 | unsigned long nr_soft_scanned; | 2874 | unsigned long nr_soft_scanned; |
2633 | struct scan_control sc = { | 2875 | struct scan_control sc = { |
2634 | .gfp_mask = GFP_KERNEL, | 2876 | .gfp_mask = GFP_KERNEL, |
2877 | .priority = DEF_PRIORITY, | ||
2635 | .may_unmap = 1, | 2878 | .may_unmap = 1, |
2636 | .may_swap = 1, | 2879 | .may_swap = 1, |
2637 | /* | 2880 | .may_writepage = !laptop_mode, |
2638 | * kswapd doesn't want to be bailed out while reclaim. because | ||
2639 | * we want to put equal scanning pressure on each zone. | ||
2640 | */ | ||
2641 | .nr_to_reclaim = ULONG_MAX, | ||
2642 | .order = order, | 2881 | .order = order, |
2643 | .target_mem_cgroup = NULL, | 2882 | .target_mem_cgroup = NULL, |
2644 | }; | 2883 | }; |
2645 | struct shrink_control shrink = { | ||
2646 | .gfp_mask = sc.gfp_mask, | ||
2647 | }; | ||
2648 | loop_again: | ||
2649 | sc.priority = DEF_PRIORITY; | ||
2650 | sc.nr_reclaimed = 0; | ||
2651 | sc.may_writepage = !laptop_mode; | ||
2652 | count_vm_event(PAGEOUTRUN); | 2884 | count_vm_event(PAGEOUTRUN); |
2653 | 2885 | ||
2654 | do { | 2886 | do { |
2655 | unsigned long lru_pages = 0; | 2887 | unsigned long lru_pages = 0; |
2888 | unsigned long nr_attempted = 0; | ||
2889 | bool raise_priority = true; | ||
2890 | bool pgdat_needs_compaction = (order > 0); | ||
2891 | |||
2892 | sc.nr_reclaimed = 0; | ||
2656 | 2893 | ||
2657 | /* | 2894 | /* |
2658 | * Scan in the highmem->dma direction for the highest | 2895 | * Scan in the highmem->dma direction for the highest |
@@ -2689,23 +2926,46 @@ loop_again: | |||
2689 | end_zone = i; | 2926 | end_zone = i; |
2690 | break; | 2927 | break; |
2691 | } else { | 2928 | } else { |
2692 | /* If balanced, clear the congested flag */ | 2929 | /* |
2930 | * If balanced, clear the dirty and congested | ||
2931 | * flags | ||
2932 | */ | ||
2693 | zone_clear_flag(zone, ZONE_CONGESTED); | 2933 | zone_clear_flag(zone, ZONE_CONGESTED); |
2934 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | ||
2694 | } | 2935 | } |
2695 | } | 2936 | } |
2696 | 2937 | ||
2697 | if (i < 0) { | 2938 | if (i < 0) |
2698 | pgdat_is_balanced = true; | ||
2699 | goto out; | 2939 | goto out; |
2700 | } | ||
2701 | 2940 | ||
2702 | for (i = 0; i <= end_zone; i++) { | 2941 | for (i = 0; i <= end_zone; i++) { |
2703 | struct zone *zone = pgdat->node_zones + i; | 2942 | struct zone *zone = pgdat->node_zones + i; |
2704 | 2943 | ||
2944 | if (!populated_zone(zone)) | ||
2945 | continue; | ||
2946 | |||
2705 | lru_pages += zone_reclaimable_pages(zone); | 2947 | lru_pages += zone_reclaimable_pages(zone); |
2948 | |||
2949 | /* | ||
2950 | * If any zone is currently balanced then kswapd will | ||
2951 | * not call compaction as it is expected that the | ||
2952 | * necessary pages are already available. | ||
2953 | */ | ||
2954 | if (pgdat_needs_compaction && | ||
2955 | zone_watermark_ok(zone, order, | ||
2956 | low_wmark_pages(zone), | ||
2957 | *classzone_idx, 0)) | ||
2958 | pgdat_needs_compaction = false; | ||
2706 | } | 2959 | } |
2707 | 2960 | ||
2708 | /* | 2961 | /* |
2962 | * If we're getting trouble reclaiming, start doing writepage | ||
2963 | * even in laptop mode. | ||
2964 | */ | ||
2965 | if (sc.priority < DEF_PRIORITY - 2) | ||
2966 | sc.may_writepage = 1; | ||
2967 | |||
2968 | /* | ||
2709 | * Now scan the zone in the dma->highmem direction, stopping | 2969 | * Now scan the zone in the dma->highmem direction, stopping |
2710 | * at the last zone which needs scanning. | 2970 | * at the last zone which needs scanning. |
2711 | * | 2971 | * |
@@ -2716,8 +2976,6 @@ loop_again: | |||
2716 | */ | 2976 | */ |
2717 | for (i = 0; i <= end_zone; i++) { | 2977 | for (i = 0; i <= end_zone; i++) { |
2718 | struct zone *zone = pgdat->node_zones + i; | 2978 | struct zone *zone = pgdat->node_zones + i; |
2719 | int nr_slab, testorder; | ||
2720 | unsigned long balance_gap; | ||
2721 | 2979 | ||
2722 | if (!populated_zone(zone)) | 2980 | if (!populated_zone(zone)) |
2723 | continue; | 2981 | continue; |
@@ -2738,65 +2996,14 @@ loop_again: | |||
2738 | sc.nr_reclaimed += nr_soft_reclaimed; | 2996 | sc.nr_reclaimed += nr_soft_reclaimed; |
2739 | 2997 | ||
2740 | /* | 2998 | /* |
2741 | * We put equal pressure on every zone, unless | 2999 | * There should be no need to raise the scanning |
2742 | * one zone has way too many pages free | 3000 | * priority if enough pages are already being scanned |
2743 | * already. The "too many pages" is defined | 3001 | * that the high watermark would be met at 100% | ||
2744 | * as the high wmark plus a "gap" where the | 3002 | * efficiency. |
2745 | * gap is either the low watermark or 1% | ||
2746 | * of the zone, whichever is smaller. | ||
2747 | */ | 3003 | */ |
2748 | balance_gap = min(low_wmark_pages(zone), | 3004 | if (kswapd_shrink_zone(zone, end_zone, &sc, |
2749 | (zone->managed_pages + | 3005 | lru_pages, &nr_attempted)) |
2750 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 3006 | raise_priority = false; |
2751 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | ||
2752 | /* | ||
2753 | * Kswapd reclaims only single pages with compaction | ||
2754 | * enabled. Trying too hard to reclaim until contiguous | ||
2755 | * free pages have become available can hurt performance | ||
2756 | * by evicting too much useful data from memory. | ||
2757 | * Do not reclaim more than needed for compaction. | ||
2758 | */ | ||
2759 | testorder = order; | ||
2760 | if (IS_ENABLED(CONFIG_COMPACTION) && order && | ||
2761 | compaction_suitable(zone, order) != | ||
2762 | COMPACT_SKIPPED) | ||
2763 | testorder = 0; | ||
2764 | |||
2765 | if ((buffer_heads_over_limit && is_highmem_idx(i)) || | ||
2766 | !zone_balanced(zone, testorder, | ||
2767 | balance_gap, end_zone)) { | ||
2768 | shrink_zone(zone, &sc); | ||
2769 | |||
2770 | reclaim_state->reclaimed_slab = 0; | ||
2771 | nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); | ||
2772 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | ||
2773 | |||
2774 | if (nr_slab == 0 && !zone_reclaimable(zone)) | ||
2775 | zone->all_unreclaimable = 1; | ||
2776 | } | ||
2777 | |||
2778 | /* | ||
2779 | * If we're getting trouble reclaiming, start doing | ||
2780 | * writepage even in laptop mode. | ||
2781 | */ | ||
2782 | if (sc.priority < DEF_PRIORITY - 2) | ||
2783 | sc.may_writepage = 1; | ||
2784 | |||
2785 | if (zone->all_unreclaimable) { | ||
2786 | if (end_zone && end_zone == i) | ||
2787 | end_zone--; | ||
2788 | continue; | ||
2789 | } | ||
2790 | |||
2791 | if (zone_balanced(zone, testorder, 0, end_zone)) | ||
2792 | /* | ||
2793 | * If a zone reaches its high watermark, | ||
2794 | * consider it to be no longer congested. It's | ||
2795 | * possible there are dirty pages backed by | ||
2796 | * congested BDIs but as pressure is relieved, | ||
2797 | * speculatively avoid congestion waits | ||
2798 | */ | ||
2799 | zone_clear_flag(zone, ZONE_CONGESTED); | ||
2800 | } | 3007 | } |
2801 | 3008 | ||
2802 | /* | 3009 | /* |
@@ -2808,74 +3015,38 @@ loop_again: | |||
2808 | pfmemalloc_watermark_ok(pgdat)) | 3015 | pfmemalloc_watermark_ok(pgdat)) |
2809 | wake_up(&pgdat->pfmemalloc_wait); | 3016 | wake_up(&pgdat->pfmemalloc_wait); |
2810 | 3017 | ||
2811 | if (pgdat_balanced(pgdat, order, *classzone_idx)) { | ||
2812 | pgdat_is_balanced = true; | ||
2813 | break; /* kswapd: all done */ | ||
2814 | } | ||
2815 | |||
2816 | /* | 3018 | /* |
2817 | * We do this so kswapd doesn't build up large priorities for | 3019 | * Fragmentation may mean that the system cannot be rebalanced |
2818 | * example when it is freeing in parallel with allocators. It | 3020 | * for high-order allocations in all zones. If twice the |
2819 | * matches the direct reclaim path behaviour in terms of impact | 3021 | * allocation size has been reclaimed and the zones are still |
2820 | * on zone->*_priority. | 3022 | * not balanced then recheck the watermarks at order-0 to |
3023 | * prevent kswapd reclaiming excessively. Assume that a | ||
3024 | * process that requested a high-order allocation can direct reclaim/compact. | ||
2821 | */ | 3025 | */ |
2822 | if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) | 3026 | if (order && sc.nr_reclaimed >= 2UL << order) |
2823 | break; | 3027 | order = sc.order = 0; |
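A concrete reading of the new check above (an illustrative example, not taken from the patch): for an order-3 request, 2UL << order is 16 pages, i.e. twice the 8-page allocation size. Once kswapd has reclaimed at least that much in a pass and the node is still not balanced, it resets to order-0 watermark checks and leaves the high-order request to direct reclaim/compaction instead of looping against fragmentation.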
2824 | } while (--sc.priority >= 0); | ||
2825 | |||
2826 | out: | ||
2827 | if (!pgdat_is_balanced) { | ||
2828 | cond_resched(); | ||
2829 | 3028 | ||
2830 | try_to_freeze(); | 3029 | /* Check if kswapd should be suspending */ |
3030 | if (try_to_freeze() || kthread_should_stop()) | ||
3031 | break; | ||
2831 | 3032 | ||
2832 | /* | 3033 | /* |
2833 | * Fragmentation may mean that the system cannot be | 3034 | * Compact if necessary and kswapd is reclaiming at least the |
2834 | * rebalanced for high-order allocations in all zones. | 3035 | * high watermark number of pages as requested | ||
2835 | * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, | ||
2836 | * it means the zones have been fully scanned and are still | ||
2837 | * not balanced. For high-order allocations, there is | ||
2838 | * little point trying all over again as kswapd may | ||
2839 | * infinite loop. | ||
2840 | * | ||
2841 | * Instead, recheck all watermarks at order-0 as they | ||
2842 | * are the most important. If watermarks are ok, kswapd will go | ||
2843 | * back to sleep. High-order users can still perform direct | ||
2844 | * reclaim if they wish. | ||
2845 | */ | 3036 | */ |
2846 | if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) | 3037 | if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) |
2847 | order = sc.order = 0; | ||
2848 | |||
2849 | goto loop_again; | ||
2850 | } | ||
2851 | |||
2852 | /* | ||
2853 | * If kswapd was reclaiming at a higher order, it has the option of | ||
2854 | * sleeping without all zones being balanced. Before it does, it must | ||
2855 | * ensure that the watermarks for order-0 on *all* zones are met and | ||
2856 | * that the congestion flags are cleared. The congestion flag must | ||
2857 | * be cleared as kswapd is the only mechanism that clears the flag | ||
2858 | * and it is potentially going to sleep here. | ||
2859 | */ | ||
2860 | if (order) { | ||
2861 | int zones_need_compaction = 1; | ||
2862 | |||
2863 | for (i = 0; i <= end_zone; i++) { | ||
2864 | struct zone *zone = pgdat->node_zones + i; | ||
2865 | |||
2866 | if (!populated_zone(zone)) | ||
2867 | continue; | ||
2868 | |||
2869 | /* Check if the memory needs to be defragmented. */ | ||
2870 | if (zone_watermark_ok(zone, order, | ||
2871 | low_wmark_pages(zone), *classzone_idx, 0)) | ||
2872 | zones_need_compaction = 0; | ||
2873 | } | ||
2874 | |||
2875 | if (zones_need_compaction) | ||
2876 | compact_pgdat(pgdat, order); | 3038 | compact_pgdat(pgdat, order); |
2877 | } | ||
2878 | 3039 | ||
3040 | /* | ||
3041 | * Raise priority if scanning rate is too low or there was no | ||
3042 | * progress in reclaiming pages | ||
3043 | */ | ||
3044 | if (raise_priority || !sc.nr_reclaimed) | ||
3045 | sc.priority--; | ||
3046 | } while (sc.priority >= 1 && | ||
3047 | !pgdat_balanced(pgdat, order, *classzone_idx)); | ||
3048 | |||
3049 | out: | ||
2879 | /* | 3050 | /* |
2880 | * Return the order we were reclaiming at so prepare_kswapd_sleep() | 3051 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2881 | * makes a decision on the order we were last reclaiming at. However, | 3052 | * makes a decision on the order we were last reclaiming at. However, |
diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 000000000000..9bb4710e3589 --- /dev/null +++ b/mm/zbud.c | |||
@@ -0,0 +1,527 @@ | |||
1 | /* | ||
2 | * zbud.c | ||
3 | * | ||
4 | * Copyright (C) 2013, Seth Jennings, IBM | ||
5 | * | ||
6 | * Concepts based on zcache internal zbud allocator by Dan Magenheimer. | ||
7 | * | ||
8 | * zbud is a special purpose allocator for storing compressed pages. Contrary | ||
9 | * to what its name may suggest, zbud is not a buddy allocator, but rather an | ||
10 | * allocator that "buddies" two compressed pages together in a single memory | ||
11 | * page. | ||
12 | * | ||
13 | * While this design limits storage density, it has simple and deterministic | ||
14 | * reclaim properties that make it preferable to a higher density approach when | ||
15 | * reclaim will be used. | ||
16 | * | ||
17 | * zbud works by storing compressed pages, or "zpages", together in pairs in a | ||
18 | * single memory page called a "zbud page". The first buddy is "left | ||
19 | * justifed" at the beginning of the zbud page, and the last buddy is "right | ||
20 | * justified" at the end of the zbud page. The benefit is that if either | ||
21 | * buddy is freed, the freed buddy space, coalesced with whatever slack space | ||
22 | * that existed between the buddies, results in the largest possible free region | ||
23 | * within the zbud page. | ||
24 | * | ||
25 | * zbud also provides an attractive lower bound on density. The ratio of zpages | ||
26 | * to zbud pages can not be less than 1. This ensures that zbud can never "do | ||
27 | * harm" by using more pages to store zpages than the uncompressed zpages would | ||
28 | * have used on their own. | ||
29 | * | ||
30 | * zbud pages are divided into "chunks". The size of the chunks is fixed at | ||
31 | * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages | ||
32 | * into chunks allows organizing unbuddied zbud pages into a manageable number | ||
33 | * of unbuddied lists according to the number of free chunks available in the | ||
34 | * zbud page. | ||
35 | * | ||
36 | * The zbud API differs from that of conventional allocators in that the | ||
37 | * allocation function, zbud_alloc(), returns an opaque handle to the user, | ||
38 | * not a dereferenceable pointer. The user must map the handle using | ||
39 | * zbud_map() in order to get a usable pointer by which to access the | ||
40 | * allocation data and unmap the handle with zbud_unmap() when operations | ||
41 | * on the allocation data are complete. | ||
42 | */ | ||
43 | |||
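To make the handle-based API described above concrete, here is a minimal, illustrative round trip through the functions added in this file. It is a sketch rather than anything from the patch: the evict callback is a hypothetical no-op stand-in (a real user, such as zswap later in this diff, must free the handle and return 0 on successful eviction), and it assumes the includes already present in this file.

static int example_evict(struct zbud_pool *pool, unsigned long handle)
{
	/* hypothetical stand-in: refuse to evict anything */
	return -EINVAL;
}

static struct zbud_ops example_ops = {
	.evict = example_evict,
};

/* Illustrative only: allocate space for "len" bytes, copy data in, then
 * free it again.  Handles must be mapped before the data can be touched. */
static int example_zbud_roundtrip(const void *data, int len)
{
	struct zbud_pool *pool;
	unsigned long handle;
	void *buf;
	int ret;

	pool = zbud_create_pool(GFP_KERNEL, &example_ops);
	if (!pool)
		return -ENOMEM;

	ret = zbud_alloc(pool, len, GFP_KERNEL, &handle);
	if (ret)
		goto out_destroy;

	buf = zbud_map(pool, handle);	/* opaque handle -> usable pointer */
	memcpy(buf, data, len);
	zbud_unmap(pool, handle);	/* unmap once done touching the data */

	zbud_free(pool, handle);
out_destroy:
	zbud_destroy_pool(pool);
	return ret;
}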
44 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
45 | |||
46 | #include <linux/atomic.h> | ||
47 | #include <linux/list.h> | ||
48 | #include <linux/mm.h> | ||
49 | #include <linux/module.h> | ||
50 | #include <linux/preempt.h> | ||
51 | #include <linux/slab.h> | ||
52 | #include <linux/spinlock.h> | ||
53 | #include <linux/zbud.h> | ||
54 | |||
55 | /***************** | ||
56 | * Structures | ||
57 | *****************/ | ||
58 | /* | ||
59 | * NCHUNKS_ORDER determines the internal allocation granularity, effectively | ||
60 | * adjusting internal fragmentation. It also determines the number of | ||
61 | * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the | ||
62 | * allocation granularity will be in chunks of size PAGE_SIZE/64, and there | ||
63 | * will be 64 freelists per pool. | ||
64 | */ | ||
65 | #define NCHUNKS_ORDER 6 | ||
66 | |||
67 | #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) | ||
68 | #define CHUNK_SIZE (1 << CHUNK_SHIFT) | ||
69 | #define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) | ||
70 | #define ZHDR_SIZE_ALIGNED CHUNK_SIZE | ||
71 | |||
72 | /** | ||
73 | * struct zbud_pool - stores metadata for each zbud pool | ||
74 | * @lock: protects all pool fields and first|last_chunk fields of any | ||
75 | * zbud page in the pool | ||
76 | * @unbuddied: array of lists tracking zbud pages that only contain one buddy; | ||
77 | * the lists each zbud page is added to depends on the size of | ||
78 | * its free region. | ||
79 | * @buddied: list tracking the zbud pages that contain two buddies; | ||
80 | * these zbud pages are full | ||
81 | * @lru: list tracking the zbud pages in LRU order by most recently | ||
82 | * added buddy. | ||
83 | * @pages_nr: number of zbud pages in the pool. | ||
84 | * @ops: pointer to a structure of user defined operations specified at | ||
85 | * pool creation time. | ||
86 | * | ||
87 | * This structure is allocated at pool creation time and maintains metadata | ||
88 | * pertaining to a particular zbud pool. | ||
89 | */ | ||
90 | struct zbud_pool { | ||
91 | spinlock_t lock; | ||
92 | struct list_head unbuddied[NCHUNKS]; | ||
93 | struct list_head buddied; | ||
94 | struct list_head lru; | ||
95 | u64 pages_nr; | ||
96 | struct zbud_ops *ops; | ||
97 | }; | ||
98 | |||
99 | /* | ||
100 | * struct zbud_header - zbud page metadata occupying the first chunk of each | ||
101 | * zbud page. | ||
102 | * @buddy: links the zbud page into the unbuddied/buddied lists in the pool | ||
103 | * @lru: links the zbud page into the lru list in the pool | ||
104 | * @first_chunks: the size of the first buddy in chunks, 0 if free | ||
105 | * @last_chunks: the size of the last buddy in chunks, 0 if free | ||
106 | */ | ||
107 | struct zbud_header { | ||
108 | struct list_head buddy; | ||
109 | struct list_head lru; | ||
110 | unsigned int first_chunks; | ||
111 | unsigned int last_chunks; | ||
112 | bool under_reclaim; | ||
113 | }; | ||
114 | |||
115 | /***************** | ||
116 | * Helpers | ||
117 | *****************/ | ||
118 | /* Just to make the code easier to read */ | ||
119 | enum buddy { | ||
120 | FIRST, | ||
121 | LAST | ||
122 | }; | ||
123 | |||
124 | /* Converts an allocation size in bytes to size in zbud chunks */ | ||
125 | static int size_to_chunks(int size) | ||
126 | { | ||
127 | return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; | ||
128 | } | ||
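As a worked example of the chunk arithmetic above (assuming the common 4 KiB PAGE_SIZE, i.e. PAGE_SHIFT == 12): CHUNK_SHIFT is 12 - 6 = 6, so CHUNK_SIZE is 64 bytes and NCHUNKS is 64, with the first chunk reserved for the header. A 400-byte compressed page then needs size_to_chunks(400) = (400 + 63) >> 6 = 7 chunks, and a zbud page holding only that buddy reports num_free_chunks() = 64 - 7 - 0 - 1 = 56.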
129 | |||
130 | #define for_each_unbuddied_list(_iter, _begin) \ | ||
131 | for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) | ||
132 | |||
133 | /* Initializes the zbud header of a newly allocated zbud page */ | ||
134 | static struct zbud_header *init_zbud_page(struct page *page) | ||
135 | { | ||
136 | struct zbud_header *zhdr = page_address(page); | ||
137 | zhdr->first_chunks = 0; | ||
138 | zhdr->last_chunks = 0; | ||
139 | INIT_LIST_HEAD(&zhdr->buddy); | ||
140 | INIT_LIST_HEAD(&zhdr->lru); | ||
141 | zhdr->under_reclaim = 0; | ||
142 | return zhdr; | ||
143 | } | ||
144 | |||
145 | /* Resets the struct page fields and frees the page */ | ||
146 | static void free_zbud_page(struct zbud_header *zhdr) | ||
147 | { | ||
148 | __free_page(virt_to_page(zhdr)); | ||
149 | } | ||
150 | |||
151 | /* | ||
152 | * Encodes the handle of a particular buddy within a zbud page | ||
153 | * Pool lock should be held as this function accesses first|last_chunks | ||
154 | */ | ||
155 | static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) | ||
156 | { | ||
157 | unsigned long handle; | ||
158 | |||
159 | /* | ||
160 | * For now, the encoded handle is actually just the pointer to the data | ||
161 | * but this might not always be the case. A little information hiding. | ||
162 | * Add CHUNK_SIZE to the handle if it is the first allocation to jump | ||
163 | * over the zbud header in the first chunk. | ||
164 | */ | ||
165 | handle = (unsigned long)zhdr; | ||
166 | if (bud == FIRST) | ||
167 | /* skip over zbud header */ | ||
168 | handle += ZHDR_SIZE_ALIGNED; | ||
169 | else /* bud == LAST */ | ||
170 | handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); | ||
171 | return handle; | ||
172 | } | ||
173 | |||
174 | /* Returns the zbud page where a given handle is stored */ | ||
175 | static struct zbud_header *handle_to_zbud_header(unsigned long handle) | ||
176 | { | ||
177 | return (struct zbud_header *)(handle & PAGE_MASK); | ||
178 | } | ||
179 | |||
180 | /* Returns the number of free chunks in a zbud page */ | ||
181 | static int num_free_chunks(struct zbud_header *zhdr) | ||
182 | { | ||
183 | /* | ||
184 | * Rather than branch for different situations, just use the fact that | ||
185 | * free buddies have a length of zero to simplify everything. -1 at the | ||
186 | * end for the zbud header. | ||
187 | */ | ||
188 | return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; | ||
189 | } | ||
190 | |||
191 | /***************** | ||
192 | * API Functions | ||
193 | *****************/ | ||
194 | /** | ||
195 | * zbud_create_pool() - create a new zbud pool | ||
196 | * @gfp: gfp flags when allocating the zbud pool structure | ||
197 | * @ops: user-defined operations for the zbud pool | ||
198 | * | ||
199 | * Return: pointer to the new zbud pool or NULL if the metadata allocation | ||
200 | * failed. | ||
201 | */ | ||
202 | struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) | ||
203 | { | ||
204 | struct zbud_pool *pool; | ||
205 | int i; | ||
206 | |||
207 | pool = kmalloc(sizeof(struct zbud_pool), gfp); | ||
208 | if (!pool) | ||
209 | return NULL; | ||
210 | spin_lock_init(&pool->lock); | ||
211 | for_each_unbuddied_list(i, 0) | ||
212 | INIT_LIST_HEAD(&pool->unbuddied[i]); | ||
213 | INIT_LIST_HEAD(&pool->buddied); | ||
214 | INIT_LIST_HEAD(&pool->lru); | ||
215 | pool->pages_nr = 0; | ||
216 | pool->ops = ops; | ||
217 | return pool; | ||
218 | } | ||
219 | |||
220 | /** | ||
221 | * zbud_destroy_pool() - destroys an existing zbud pool | ||
222 | * @pool: the zbud pool to be destroyed | ||
223 | * | ||
224 | * The pool should be emptied before this function is called. | ||
225 | */ | ||
226 | void zbud_destroy_pool(struct zbud_pool *pool) | ||
227 | { | ||
228 | kfree(pool); | ||
229 | } | ||
230 | |||
231 | /** | ||
232 | * zbud_alloc() - allocates a region of a given size | ||
233 | * @pool: zbud pool from which to allocate | ||
234 | * @size: size in bytes of the desired allocation | ||
235 | * @gfp: gfp flags used if the pool needs to grow | ||
236 | * @handle: handle of the new allocation | ||
237 | * | ||
238 | * This function will attempt to find a free region in the pool large enough to | ||
239 | * satisfy the allocation request. A search of the unbuddied lists is | ||
240 | * performed first. If no suitable free region is found, then a new page is | ||
241 | * allocated and added to the pool to satisfy the request. | ||
242 | * | ||
243 | * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used | ||
244 | * as zbud pool pages. | ||
245 | * | ||
246 | * Return: 0 on success and handle is set, otherwise -EINVAL if the size or | ||
247 | * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate | ||
248 | * a new page. | ||
249 | */ | ||
250 | int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, | ||
251 | unsigned long *handle) | ||
252 | { | ||
253 | int chunks, i, freechunks; | ||
254 | struct zbud_header *zhdr = NULL; | ||
255 | enum buddy bud; | ||
256 | struct page *page; | ||
257 | |||
258 | if (size <= 0 || gfp & __GFP_HIGHMEM) | ||
259 | return -EINVAL; | ||
260 | if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED) | ||
261 | return -ENOSPC; | ||
262 | chunks = size_to_chunks(size); | ||
263 | spin_lock(&pool->lock); | ||
264 | |||
265 | /* First, try to find an unbuddied zbud page. */ | ||
266 | zhdr = NULL; | ||
267 | for_each_unbuddied_list(i, chunks) { | ||
268 | if (!list_empty(&pool->unbuddied[i])) { | ||
269 | zhdr = list_first_entry(&pool->unbuddied[i], | ||
270 | struct zbud_header, buddy); | ||
271 | list_del(&zhdr->buddy); | ||
272 | if (zhdr->first_chunks == 0) | ||
273 | bud = FIRST; | ||
274 | else | ||
275 | bud = LAST; | ||
276 | goto found; | ||
277 | } | ||
278 | } | ||
279 | |||
280 | /* Couldn't find unbuddied zbud page, create new one */ | ||
281 | spin_unlock(&pool->lock); | ||
282 | page = alloc_page(gfp); | ||
283 | if (!page) | ||
284 | return -ENOMEM; | ||
285 | spin_lock(&pool->lock); | ||
286 | pool->pages_nr++; | ||
287 | zhdr = init_zbud_page(page); | ||
288 | bud = FIRST; | ||
289 | |||
290 | found: | ||
291 | if (bud == FIRST) | ||
292 | zhdr->first_chunks = chunks; | ||
293 | else | ||
294 | zhdr->last_chunks = chunks; | ||
295 | |||
296 | if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { | ||
297 | /* Add to unbuddied list */ | ||
298 | freechunks = num_free_chunks(zhdr); | ||
299 | list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); | ||
300 | } else { | ||
301 | /* Add to buddied list */ | ||
302 | list_add(&zhdr->buddy, &pool->buddied); | ||
303 | } | ||
304 | |||
305 | /* Add/move zbud page to beginning of LRU */ | ||
306 | if (!list_empty(&zhdr->lru)) | ||
307 | list_del(&zhdr->lru); | ||
308 | list_add(&zhdr->lru, &pool->lru); | ||
309 | |||
310 | *handle = encode_handle(zhdr, bud); | ||
311 | spin_unlock(&pool->lock); | ||
312 | |||
313 | return 0; | ||
314 | } | ||
315 | |||
316 | /** | ||
317 | * zbud_free() - frees the allocation associated with the given handle | ||
318 | * @pool: pool in which the allocation resided | ||
319 | * @handle: handle associated with the allocation returned by zbud_alloc() | ||
320 | * | ||
321 | * In the case that the zbud page in which the allocation resides is under | ||
322 | * reclaim, as indicated by the under_reclaim flag in the zbud header, | ||
323 | * this function only sets the first|last_chunks to 0. The page is actually | ||
324 | * freed once both buddies are evicted (see zbud_reclaim_page() below). | ||
325 | */ | ||
326 | void zbud_free(struct zbud_pool *pool, unsigned long handle) | ||
327 | { | ||
328 | struct zbud_header *zhdr; | ||
329 | int freechunks; | ||
330 | |||
331 | spin_lock(&pool->lock); | ||
332 | zhdr = handle_to_zbud_header(handle); | ||
333 | |||
334 | /* If first buddy, handle will be page aligned */ | ||
335 | if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) | ||
336 | zhdr->last_chunks = 0; | ||
337 | else | ||
338 | zhdr->first_chunks = 0; | ||
339 | |||
340 | if (zhdr->under_reclaim) { | ||
341 | /* zbud page is under reclaim, reclaim will free */ | ||
342 | spin_unlock(&pool->lock); | ||
343 | return; | ||
344 | } | ||
345 | |||
346 | /* Remove from existing buddy list */ | ||
347 | list_del(&zhdr->buddy); | ||
348 | |||
349 | if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { | ||
350 | /* zbud page is empty, free */ | ||
351 | list_del(&zhdr->lru); | ||
352 | free_zbud_page(zhdr); | ||
353 | pool->pages_nr--; | ||
354 | } else { | ||
355 | /* Add to unbuddied list */ | ||
356 | freechunks = num_free_chunks(zhdr); | ||
357 | list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); | ||
358 | } | ||
359 | |||
360 | spin_unlock(&pool->lock); | ||
361 | } | ||
362 | |||
363 | #define list_tail_entry(ptr, type, member) \ | ||
364 | list_entry((ptr)->prev, type, member) | ||
365 | |||
366 | /** | ||
367 | * zbud_reclaim_page() - evicts allocations from a pool page and frees it | ||
368 | * @pool: pool from which a page will attempt to be evicted | ||
369 | * @retries: number of pages on the LRU list for which eviction will | ||
370 | * be attempted before failing | ||
371 | * | ||
372 | * zbud reclaim is different from normal system reclaim in that the reclaim is | ||
373 | * done from the bottom, up. This is because only the bottom layer, zbud, has | ||
374 | * information on how the allocations are organized within each zbud page. This | ||
375 | * has the potential to create interesting locking situations between zbud and | ||
376 | * the user, however. | ||
377 | * | ||
378 | * To avoid these, this is how zbud_reclaim_page() should be called: | ||
379 | * | ||
380 | * The user detects a page should be reclaimed and calls zbud_reclaim_page(). | ||
381 | * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call | ||
382 | * the user-defined eviction handler with the pool and handle as arguments. | ||
383 | * | ||
384 | * If the handle can not be evicted, the eviction handler should return | ||
385 | * non-zero. zbud_reclaim_page() will add the zbud page back to the | ||
386 | * appropriate list and try the next zbud page on the LRU up to | ||
387 | * a user defined number of retries. | ||
388 | * | ||
389 | * If the handle is successfully evicted, the eviction handler should | ||
390 | * return 0 _and_ should have called zbud_free() on the handle. zbud_free() | ||
391 | * contains logic to delay freeing the page if the page is under reclaim, | ||
392 | * as indicated by the under_reclaim flag in the zbud header. | ||
393 | * | ||
394 | * If all buddies in the zbud page are successfully evicted, then the | ||
395 | * zbud page can be freed. | ||
396 | * | ||
397 | * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are | ||
398 | * no pages to evict or an eviction handler is not registered, -EAGAIN if | ||
399 | * the retry limit was hit. | ||
400 | */ | ||
401 | int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) | ||
402 | { | ||
403 | int i, ret, freechunks; | ||
404 | struct zbud_header *zhdr; | ||
405 | unsigned long first_handle = 0, last_handle = 0; | ||
406 | |||
407 | spin_lock(&pool->lock); | ||
408 | if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || | ||
409 | retries == 0) { | ||
410 | spin_unlock(&pool->lock); | ||
411 | return -EINVAL; | ||
412 | } | ||
413 | for (i = 0; i < retries; i++) { | ||
414 | zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); | ||
415 | list_del(&zhdr->lru); | ||
416 | list_del(&zhdr->buddy); | ||
417 | /* Protect zbud page against free */ | ||
418 | zhdr->under_reclaim = true; | ||
419 | /* | ||
420 | * We need to encode the handles before unlocking, since we can | ||
421 | * race with free that will set (first|last)_chunks to 0 | ||
422 | */ | ||
423 | first_handle = 0; | ||
424 | last_handle = 0; | ||
425 | if (zhdr->first_chunks) | ||
426 | first_handle = encode_handle(zhdr, FIRST); | ||
427 | if (zhdr->last_chunks) | ||
428 | last_handle = encode_handle(zhdr, LAST); | ||
429 | spin_unlock(&pool->lock); | ||
430 | |||
431 | /* Issue the eviction callback(s) */ | ||
432 | if (first_handle) { | ||
433 | ret = pool->ops->evict(pool, first_handle); | ||
434 | if (ret) | ||
435 | goto next; | ||
436 | } | ||
437 | if (last_handle) { | ||
438 | ret = pool->ops->evict(pool, last_handle); | ||
439 | if (ret) | ||
440 | goto next; | ||
441 | } | ||
442 | next: | ||
443 | spin_lock(&pool->lock); | ||
444 | zhdr->under_reclaim = false; | ||
445 | if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { | ||
446 | /* | ||
447 | * Both buddies are now free, free the zbud page and | ||
448 | * return success. | ||
449 | */ | ||
450 | free_zbud_page(zhdr); | ||
451 | pool->pages_nr--; | ||
452 | spin_unlock(&pool->lock); | ||
453 | return 0; | ||
454 | } else if (zhdr->first_chunks == 0 || | ||
455 | zhdr->last_chunks == 0) { | ||
456 | /* add to unbuddied list */ | ||
457 | freechunks = num_free_chunks(zhdr); | ||
458 | list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); | ||
459 | } else { | ||
460 | /* add to buddied list */ | ||
461 | list_add(&zhdr->buddy, &pool->buddied); | ||
462 | } | ||
463 | |||
464 | /* add to beginning of LRU */ | ||
465 | list_add(&zhdr->lru, &pool->lru); | ||
466 | } | ||
467 | spin_unlock(&pool->lock); | ||
468 | return -EAGAIN; | ||
469 | } | ||
470 | |||
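The calling convention spelled out above is easiest to see from the user's side. The sketch below is illustrative only; zswap_writeback_entry() in mm/zswap.c later in this diff is the real in-tree handler, and example_find_and_drop_user() is a hypothetical stub standing in for whatever per-handle bookkeeping a zbud user keeps.

/* Hypothetical: a real user would look up its record for this handle and
 * detach it, returning false if the data is still needed. */
static bool example_find_and_drop_user(struct zbud_pool *pool,
				       unsigned long handle)
{
	return true;
}

/* Illustrative eviction handler obeying the protocol described above:
 * return 0 only after zbud_free() has been called on the handle, and
 * non-zero if the allocation cannot be evicted right now. */
static int example_evict_handler(struct zbud_pool *pool, unsigned long handle)
{
	if (!example_find_and_drop_user(pool, handle))
		return -EAGAIN;		/* keep it; reclaim tries the next LRU page */

	zbud_free(pool, handle);	/* required before reporting success */
	return 0;
}

/* Caller side: try to release one pool page, scanning at most eight
 * LRU pages before giving up. */
static int example_shrink_pool(struct zbud_pool *pool)
{
	return zbud_reclaim_page(pool, 8);
}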
471 | /** | ||
472 | * zbud_map() - maps the allocation associated with the given handle | ||
473 | * @pool: pool in which the allocation resides | ||
474 | * @handle: handle associated with the allocation to be mapped | ||
475 | * | ||
476 | * While trivial for zbud, the mapping functions for other allocators | ||
477 | * implementing this allocation API could have more complex information encoded | ||
478 | * in the handle and could create temporary mappings to make the data | ||
479 | * accessible to the user. | ||
480 | * | ||
481 | * Returns: a pointer to the mapped allocation | ||
482 | */ | ||
483 | void *zbud_map(struct zbud_pool *pool, unsigned long handle) | ||
484 | { | ||
485 | return (void *)(handle); | ||
486 | } | ||
487 | |||
488 | /** | ||
489 | * zbud_unmap() - unmaps the allocation associated with the given handle | ||
490 | * @pool: pool in which the allocation resides | ||
491 | * @handle: handle associated with the allocation to be unmapped | ||
492 | */ | ||
493 | void zbud_unmap(struct zbud_pool *pool, unsigned long handle) | ||
494 | { | ||
495 | } | ||
496 | |||
497 | /** | ||
498 | * zbud_get_pool_size() - gets the zbud pool size in pages | ||
499 | * @pool: pool whose size is being queried | ||
500 | * | ||
501 | * Returns: size in pages of the given pool. The pool lock need not be | ||
502 | * taken to access pages_nr. | ||
503 | */ | ||
504 | u64 zbud_get_pool_size(struct zbud_pool *pool) | ||
505 | { | ||
506 | return pool->pages_nr; | ||
507 | } | ||
508 | |||
509 | static int __init init_zbud(void) | ||
510 | { | ||
511 | /* Make sure the zbud header will fit in one chunk */ | ||
512 | BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); | ||
513 | pr_info("loaded\n"); | ||
514 | return 0; | ||
515 | } | ||
516 | |||
517 | static void __exit exit_zbud(void) | ||
518 | { | ||
519 | pr_info("unloaded\n"); | ||
520 | } | ||
521 | |||
522 | module_init(init_zbud); | ||
523 | module_exit(exit_zbud); | ||
524 | |||
525 | MODULE_LICENSE("GPL"); | ||
526 | MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); | ||
527 | MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); | ||
diff --git a/mm/zswap.c b/mm/zswap.c new file mode 100644 index 000000000000..deda2b671e12 --- /dev/null +++ b/mm/zswap.c | |||
@@ -0,0 +1,943 @@ | |||
1 | /* | ||
2 | * zswap.c - zswap driver file | ||
3 | * | ||
4 | * zswap is a backend for frontswap that takes pages that are in the process | ||
5 | * of being swapped out and attempts to compress and store them in a | ||
6 | * RAM-based memory pool. This can result in a significant I/O reduction on | ||
7 | * the swap device and, in the case where decompressing from RAM is faster | ||
8 | * than reading from the swap device, can also improve workload performance. | ||
9 | * | ||
10 | * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> | ||
11 | * | ||
12 | * This program is free software; you can redistribute it and/or | ||
13 | * modify it under the terms of the GNU General Public License | ||
14 | * as published by the Free Software Foundation; either version 2 | ||
15 | * of the License, or (at your option) any later version. | ||
16 | * | ||
17 | * This program is distributed in the hope that it will be useful, | ||
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
20 | * GNU General Public License for more details. | ||
21 | */ | ||
22 | |||
23 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
24 | |||
25 | #include <linux/module.h> | ||
26 | #include <linux/cpu.h> | ||
27 | #include <linux/highmem.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/atomic.h> | ||
32 | #include <linux/frontswap.h> | ||
33 | #include <linux/rbtree.h> | ||
34 | #include <linux/swap.h> | ||
35 | #include <linux/crypto.h> | ||
36 | #include <linux/mempool.h> | ||
37 | #include <linux/zbud.h> | ||
38 | |||
39 | #include <linux/mm_types.h> | ||
40 | #include <linux/page-flags.h> | ||
41 | #include <linux/swapops.h> | ||
42 | #include <linux/writeback.h> | ||
43 | #include <linux/pagemap.h> | ||
44 | |||
45 | /********************************* | ||
46 | * statistics | ||
47 | **********************************/ | ||
48 | /* Number of memory pages used by the compressed pool */ | ||
49 | static u64 zswap_pool_pages; | ||
50 | /* The number of compressed pages currently stored in zswap */ | ||
51 | static atomic_t zswap_stored_pages = ATOMIC_INIT(0); | ||
52 | |||
53 | /* | ||
54 | * The statistics below are not protected from concurrent access for | ||
55 | * performance reasons, so they may not be 100% accurate. However, | ||
56 | * they do provide useful information on roughly how many times a | ||
57 | * certain event is occurring. | ||
58 | */ | ||
59 | |||
60 | /* Pool limit was hit (see zswap_max_pool_percent) */ | ||
61 | static u64 zswap_pool_limit_hit; | ||
62 | /* Pages written back when pool limit was reached */ | ||
63 | static u64 zswap_written_back_pages; | ||
64 | /* Store failed due to a reclaim failure after pool limit was reached */ | ||
65 | static u64 zswap_reject_reclaim_fail; | ||
66 | /* Compressed page was too big for the allocator to (optimally) store */ | ||
67 | static u64 zswap_reject_compress_poor; | ||
68 | /* Store failed because underlying allocator could not get memory */ | ||
69 | static u64 zswap_reject_alloc_fail; | ||
70 | /* Store failed because the entry metadata could not be allocated (rare) */ | ||
71 | static u64 zswap_reject_kmemcache_fail; | ||
72 | /* Duplicate store was encountered (rare) */ | ||
73 | static u64 zswap_duplicate_entry; | ||
74 | |||
75 | /********************************* | ||
76 | * tunables | ||
77 | **********************************/ | ||
78 | /* Enable/disable zswap (disabled by default, fixed at boot for now) */ | ||
79 | static bool zswap_enabled __read_mostly; | ||
80 | module_param_named(enabled, zswap_enabled, bool, 0); | ||
81 | |||
82 | /* Compressor to be used by zswap (fixed at boot for now) */ | ||
83 | #define ZSWAP_COMPRESSOR_DEFAULT "lzo" | ||
84 | static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; | ||
85 | module_param_named(compressor, zswap_compressor, charp, 0); | ||
86 | |||
87 | /* The maximum percentage of memory that the compressed pool can occupy */ | ||
88 | static unsigned int zswap_max_pool_percent = 20; | ||
89 | module_param_named(max_pool_percent, | ||
90 | zswap_max_pool_percent, uint, 0644); | ||
91 | |||
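For example, with zswap built in, these become kernel command-line parameters under the zswap. prefix, e.g. booting with zswap.enabled=1 zswap.compressor=lzo zswap.max_pool_percent=20 (the last two simply make the defaults explicit). Because max_pool_percent is registered with mode 0644, it can also be adjusted at runtime through /sys/module/zswap/parameters/max_pool_percent, while enabled and compressor are fixed at boot as the comments above note.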
92 | /********************************* | ||
93 | * compression functions | ||
94 | **********************************/ | ||
95 | /* per-cpu compression transforms */ | ||
96 | static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; | ||
97 | |||
98 | enum comp_op { | ||
99 | ZSWAP_COMPOP_COMPRESS, | ||
100 | ZSWAP_COMPOP_DECOMPRESS | ||
101 | }; | ||
102 | |||
103 | static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, | ||
104 | u8 *dst, unsigned int *dlen) | ||
105 | { | ||
106 | struct crypto_comp *tfm; | ||
107 | int ret; | ||
108 | |||
109 | tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); | ||
110 | switch (op) { | ||
111 | case ZSWAP_COMPOP_COMPRESS: | ||
112 | ret = crypto_comp_compress(tfm, src, slen, dst, dlen); | ||
113 | break; | ||
114 | case ZSWAP_COMPOP_DECOMPRESS: | ||
115 | ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); | ||
116 | break; | ||
117 | default: | ||
118 | ret = -EINVAL; | ||
119 | } | ||
120 | |||
121 | put_cpu(); | ||
122 | return ret; | ||
123 | } | ||
124 | |||
125 | static int __init zswap_comp_init(void) | ||
126 | { | ||
127 | if (!crypto_has_comp(zswap_compressor, 0, 0)) { | ||
128 | pr_info("%s compressor not available\n", zswap_compressor); | ||
129 | /* fall back to default compressor */ | ||
130 | zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; | ||
131 | if (!crypto_has_comp(zswap_compressor, 0, 0)) | ||
132 | /* can't even load the default compressor */ | ||
133 | return -ENODEV; | ||
134 | } | ||
135 | pr_info("using %s compressor\n", zswap_compressor); | ||
136 | |||
137 | /* alloc percpu transforms */ | ||
138 | zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); | ||
139 | if (!zswap_comp_pcpu_tfms) | ||
140 | return -ENOMEM; | ||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | static void zswap_comp_exit(void) | ||
145 | { | ||
146 | /* free percpu transforms */ | ||
147 | if (zswap_comp_pcpu_tfms) | ||
148 | free_percpu(zswap_comp_pcpu_tfms); | ||
149 | } | ||
150 | |||
151 | /********************************* | ||
152 | * data structures | ||
153 | **********************************/ | ||
154 | /* | ||
155 | * struct zswap_entry | ||
156 | * | ||
157 | * This structure contains the metadata for tracking a single compressed | ||
158 | * page within zswap. | ||
159 | * | ||
160 | * rbnode - links the entry into red-black tree for the appropriate swap type | ||
161 | * refcount - the number of outstanding references to the entry. This is needed | ||
162 | * to protect against premature freeing of the entry by | ||
163 | * concurrent calls to load, invalidate, and writeback. The lock | ||
164 | * for the zswap_tree structure that contains the entry must | ||
165 | * be held while changing the refcount. Since the lock must | ||
166 | * be held, there is no reason to also make refcount atomic. | ||
167 | * offset - the swap offset for the entry. Index into the red-black tree. | ||
168 | * handle - zbud allocation handle that stores the compressed page data | ||
169 | * length - the length in bytes of the compressed page data. Needed during | ||
170 | * decompression | ||
171 | */ | ||
172 | struct zswap_entry { | ||
173 | struct rb_node rbnode; | ||
174 | pgoff_t offset; | ||
175 | int refcount; | ||
176 | unsigned int length; | ||
177 | unsigned long handle; | ||
178 | }; | ||
179 | |||
180 | struct zswap_header { | ||
181 | swp_entry_t swpentry; | ||
182 | }; | ||
183 | |||
184 | /* | ||
185 | * The tree lock in the zswap_tree struct protects a few things: | ||
186 | * - the rbtree | ||
187 | * - the refcount field of each entry in the tree | ||
188 | */ | ||
189 | struct zswap_tree { | ||
190 | struct rb_root rbroot; | ||
191 | spinlock_t lock; | ||
192 | struct zbud_pool *pool; | ||
193 | }; | ||
194 | |||
195 | static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; | ||
196 | |||
197 | /********************************* | ||
198 | * zswap entry functions | ||
199 | **********************************/ | ||
200 | static struct kmem_cache *zswap_entry_cache; | ||
201 | |||
202 | static int zswap_entry_cache_create(void) | ||
203 | { | ||
204 | zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); | ||
205 | return (zswap_entry_cache == NULL); | ||
206 | } | ||
207 | |||
208 | static void zswap_entry_cache_destory(void) | ||
209 | { | ||
210 | kmem_cache_destroy(zswap_entry_cache); | ||
211 | } | ||
212 | |||
213 | static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) | ||
214 | { | ||
215 | struct zswap_entry *entry; | ||
216 | entry = kmem_cache_alloc(zswap_entry_cache, gfp); | ||
217 | if (!entry) | ||
218 | return NULL; | ||
219 | entry->refcount = 1; | ||
220 | return entry; | ||
221 | } | ||
222 | |||
223 | static void zswap_entry_cache_free(struct zswap_entry *entry) | ||
224 | { | ||
225 | kmem_cache_free(zswap_entry_cache, entry); | ||
226 | } | ||
227 | |||
228 | /* caller must hold the tree lock */ | ||
229 | static void zswap_entry_get(struct zswap_entry *entry) | ||
230 | { | ||
231 | entry->refcount++; | ||
232 | } | ||
233 | |||
234 | /* caller must hold the tree lock */ | ||
235 | static int zswap_entry_put(struct zswap_entry *entry) | ||
236 | { | ||
237 | entry->refcount--; | ||
238 | return entry->refcount; | ||
239 | } | ||
240 | |||
241 | /********************************* | ||
242 | * rbtree functions | ||
243 | **********************************/ | ||
244 | static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) | ||
245 | { | ||
246 | struct rb_node *node = root->rb_node; | ||
247 | struct zswap_entry *entry; | ||
248 | |||
249 | while (node) { | ||
250 | entry = rb_entry(node, struct zswap_entry, rbnode); | ||
251 | if (entry->offset > offset) | ||
252 | node = node->rb_left; | ||
253 | else if (entry->offset < offset) | ||
254 | node = node->rb_right; | ||
255 | else | ||
256 | return entry; | ||
257 | } | ||
258 | return NULL; | ||
259 | } | ||
260 | |||
261 | /* | ||
262 | * In the case that an entry with the same offset is found, a pointer to | ||
263 | * the existing entry is stored in dupentry and the function returns -EEXIST | ||
264 | */ | ||
265 | static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, | ||
266 | struct zswap_entry **dupentry) | ||
267 | { | ||
268 | struct rb_node **link = &root->rb_node, *parent = NULL; | ||
269 | struct zswap_entry *myentry; | ||
270 | |||
271 | while (*link) { | ||
272 | parent = *link; | ||
273 | myentry = rb_entry(parent, struct zswap_entry, rbnode); | ||
274 | if (myentry->offset > entry->offset) | ||
275 | link = &(*link)->rb_left; | ||
276 | else if (myentry->offset < entry->offset) | ||
277 | link = &(*link)->rb_right; | ||
278 | else { | ||
279 | *dupentry = myentry; | ||
280 | return -EEXIST; | ||
281 | } | ||
282 | } | ||
283 | rb_link_node(&entry->rbnode, parent, link); | ||
284 | rb_insert_color(&entry->rbnode, root); | ||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | /********************************* | ||
289 | * per-cpu code | ||
290 | **********************************/ | ||
291 | static DEFINE_PER_CPU(u8 *, zswap_dstmem); | ||
292 | |||
293 | static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) | ||
294 | { | ||
295 | struct crypto_comp *tfm; | ||
296 | u8 *dst; | ||
297 | |||
298 | switch (action) { | ||
299 | case CPU_UP_PREPARE: | ||
300 | tfm = crypto_alloc_comp(zswap_compressor, 0, 0); | ||
301 | if (IS_ERR(tfm)) { | ||
302 | pr_err("can't allocate compressor transform\n"); | ||
303 | return NOTIFY_BAD; | ||
304 | } | ||
305 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; | ||
306 | dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); | ||
307 | if (!dst) { | ||
308 | pr_err("can't allocate compressor buffer\n"); | ||
309 | crypto_free_comp(tfm); | ||
310 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; | ||
311 | return NOTIFY_BAD; | ||
312 | } | ||
313 | per_cpu(zswap_dstmem, cpu) = dst; | ||
314 | break; | ||
315 | case CPU_DEAD: | ||
316 | case CPU_UP_CANCELED: | ||
317 | tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); | ||
318 | if (tfm) { | ||
319 | crypto_free_comp(tfm); | ||
320 | *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; | ||
321 | } | ||
322 | dst = per_cpu(zswap_dstmem, cpu); | ||
323 | kfree(dst); | ||
324 | per_cpu(zswap_dstmem, cpu) = NULL; | ||
325 | break; | ||
326 | default: | ||
327 | break; | ||
328 | } | ||
329 | return NOTIFY_OK; | ||
330 | } | ||
331 | |||
332 | static int zswap_cpu_notifier(struct notifier_block *nb, | ||
333 | unsigned long action, void *pcpu) | ||
334 | { | ||
335 | unsigned long cpu = (unsigned long)pcpu; | ||
336 | return __zswap_cpu_notifier(action, cpu); | ||
337 | } | ||
338 | |||
339 | static struct notifier_block zswap_cpu_notifier_block = { | ||
340 | .notifier_call = zswap_cpu_notifier | ||
341 | }; | ||
342 | |||
343 | static int zswap_cpu_init(void) | ||
344 | { | ||
345 | unsigned long cpu; | ||
346 | |||
347 | get_online_cpus(); | ||
348 | for_each_online_cpu(cpu) | ||
349 | if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) | ||
350 | goto cleanup; | ||
351 | register_cpu_notifier(&zswap_cpu_notifier_block); | ||
352 | put_online_cpus(); | ||
353 | return 0; | ||
354 | |||
355 | cleanup: | ||
356 | for_each_online_cpu(cpu) | ||
357 | __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); | ||
358 | put_online_cpus(); | ||
359 | return -ENOMEM; | ||
360 | } | ||
361 | |||
362 | /********************************* | ||
363 | * helpers | ||
364 | **********************************/ | ||
365 | static bool zswap_is_full(void) | ||
366 | { | ||
367 | return (totalram_pages * zswap_max_pool_percent / 100 < | ||
368 | zswap_pool_pages); | ||
369 | } | ||
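As a worked example: on a machine with 4 GiB of 4 KiB pages (totalram_pages = 1048576) and the default max_pool_percent of 20, the cap works out to 1048576 * 20 / 100 = 209715 pages (roughly 819 MiB), so zswap_is_full() starts reporting full once the zbud pool grows past that.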
370 | |||
371 | /* | ||
372 | * Carries out the common pattern of freeing an entry's zbud allocation, | ||
373 | * freeing the entry itself, and decrementing the number of stored pages. | ||
374 | */ | ||
375 | static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) | ||
376 | { | ||
377 | zbud_free(tree->pool, entry->handle); | ||
378 | zswap_entry_cache_free(entry); | ||
379 | atomic_dec(&zswap_stored_pages); | ||
380 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | ||
381 | } | ||
382 | |||
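The refcount rules documented on struct zswap_entry above combine with zswap_free_entry() in a pattern that every call site below (writeback, load, invalidate) repeats. As a condensed, illustrative sketch only (this helper does not exist in the patch):

/* Illustrative only: drop one reference under the tree lock and, if it
 * was the last one, free the entry.  Callers that also own the rbtree
 * link must additionally rb_erase() the entry before the final put. */
static void example_entry_release(struct zswap_tree *tree,
				  struct zswap_entry *entry)
{
	int refcount;

	spin_lock(&tree->lock);
	refcount = zswap_entry_put(entry);
	spin_unlock(&tree->lock);

	if (refcount == 0)
		zswap_free_entry(tree, entry);
}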
383 | /********************************* | ||
384 | * writeback code | ||
385 | **********************************/ | ||
386 | /* return enum for zswap_get_swap_cache_page */ | ||
387 | enum zswap_get_swap_ret { | ||
388 | ZSWAP_SWAPCACHE_NEW, | ||
389 | ZSWAP_SWAPCACHE_EXIST, | ||
390 | ZSWAP_SWAPCACHE_NOMEM | ||
391 | }; | ||
392 | |||
393 | /* | ||
394 | * zswap_get_swap_cache_page | ||
395 | * | ||
396 | * This is an adaptation of read_swap_cache_async() | ||
397 | * | ||
398 | * This function tries to find a page with the given swap entry | ||
399 | * in the swapper_space address space (the swap cache). If the page | ||
400 | * is found, it is returned in retpage. Otherwise, a page is allocated, | ||
401 | * added to the swap cache, and returned in retpage. | ||
402 | * | ||
403 | * On success, the swap cache page is returned in retpage. | ||
404 | * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache | ||
405 | * (page is not locked), ZSWAP_SWAPCACHE_NEW if a new page was allocated and | ||
406 | * must be populated by the caller (page is locked), or ZSWAP_SWAPCACHE_NOMEM on error | ||
407 | */ | ||
408 | static int zswap_get_swap_cache_page(swp_entry_t entry, | ||
409 | struct page **retpage) | ||
410 | { | ||
411 | struct page *found_page, *new_page = NULL; | ||
412 | struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; | ||
413 | int err; | ||
414 | |||
415 | *retpage = NULL; | ||
416 | do { | ||
417 | /* | ||
418 | * First check the swap cache. Since this is normally | ||
419 | * called after lookup_swap_cache() failed, re-calling | ||
420 | * that would confuse statistics. | ||
421 | */ | ||
422 | found_page = find_get_page(swapper_space, entry.val); | ||
423 | if (found_page) | ||
424 | break; | ||
425 | |||
426 | /* | ||
427 | * Get a new page to read into from swap. | ||
428 | */ | ||
429 | if (!new_page) { | ||
430 | new_page = alloc_page(GFP_KERNEL); | ||
431 | if (!new_page) | ||
432 | break; /* Out of memory */ | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * call radix_tree_preload() while we can wait. | ||
437 | */ | ||
438 | err = radix_tree_preload(GFP_KERNEL); | ||
439 | if (err) | ||
440 | break; | ||
441 | |||
442 | /* | ||
443 | * Swap entry may have been freed since our caller observed it. | ||
444 | */ | ||
445 | err = swapcache_prepare(entry); | ||
446 | if (err == -EEXIST) { /* seems racy */ | ||
447 | radix_tree_preload_end(); | ||
448 | continue; | ||
449 | } | ||
450 | if (err) { /* swp entry is obsolete ? */ | ||
451 | radix_tree_preload_end(); | ||
452 | break; | ||
453 | } | ||
454 | |||
455 | /* May fail (-ENOMEM) if radix-tree node allocation failed. */ | ||
456 | __set_page_locked(new_page); | ||
457 | SetPageSwapBacked(new_page); | ||
458 | err = __add_to_swap_cache(new_page, entry); | ||
459 | if (likely(!err)) { | ||
460 | radix_tree_preload_end(); | ||
461 | lru_cache_add_anon(new_page); | ||
462 | *retpage = new_page; | ||
463 | return ZSWAP_SWAPCACHE_NEW; | ||
464 | } | ||
465 | radix_tree_preload_end(); | ||
466 | ClearPageSwapBacked(new_page); | ||
467 | __clear_page_locked(new_page); | ||
468 | /* | ||
469 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
470 | * clear SWAP_HAS_CACHE flag. | ||
471 | */ | ||
472 | swapcache_free(entry, NULL); | ||
473 | } while (err != -ENOMEM); | ||
474 | |||
475 | if (new_page) | ||
476 | page_cache_release(new_page); | ||
477 | if (!found_page) | ||
478 | return ZSWAP_SWAPCACHE_NOMEM; | ||
479 | *retpage = found_page; | ||
480 | return ZSWAP_SWAPCACHE_EXIST; | ||
481 | } | ||
482 | |||
483 | /* | ||
484 | * Attempts to free an entry by adding a page to the swap cache, | ||
485 | * decompressing the entry data into the page, and issuing a | ||
486 | * bio write to write the page back to the swap device. | ||
487 | * | ||
488 | * This can be thought of as a "resumed writeback" of the page | ||
489 | * to the swap device. We are basically resuming the same swap | ||
490 | * writeback path that was intercepted with the frontswap_store() | ||
491 | * in the first place. After the page has been decompressed into | ||
492 | * the swap cache, the compressed version stored by zswap can be | ||
493 | * freed. | ||
494 | */ | ||
495 | static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) | ||
496 | { | ||
497 | struct zswap_header *zhdr; | ||
498 | swp_entry_t swpentry; | ||
499 | struct zswap_tree *tree; | ||
500 | pgoff_t offset; | ||
501 | struct zswap_entry *entry; | ||
502 | struct page *page; | ||
503 | u8 *src, *dst; | ||
504 | unsigned int dlen; | ||
505 | int ret, refcount; | ||
506 | struct writeback_control wbc = { | ||
507 | .sync_mode = WB_SYNC_NONE, | ||
508 | }; | ||
509 | |||
510 | /* extract swpentry from data */ | ||
511 | zhdr = zbud_map(pool, handle); | ||
512 | swpentry = zhdr->swpentry; /* here */ | ||
513 | zbud_unmap(pool, handle); | ||
514 | tree = zswap_trees[swp_type(swpentry)]; | ||
515 | offset = swp_offset(swpentry); | ||
516 | BUG_ON(pool != tree->pool); | ||
517 | |||
518 | /* find and ref zswap entry */ | ||
519 | spin_lock(&tree->lock); | ||
520 | entry = zswap_rb_search(&tree->rbroot, offset); | ||
521 | if (!entry) { | ||
522 | /* entry was invalidated */ | ||
523 | spin_unlock(&tree->lock); | ||
524 | return 0; | ||
525 | } | ||
526 | zswap_entry_get(entry); | ||
527 | spin_unlock(&tree->lock); | ||
528 | BUG_ON(offset != entry->offset); | ||
529 | |||
530 | /* try to allocate swap cache page */ | ||
531 | switch (zswap_get_swap_cache_page(swpentry, &page)) { | ||
532 | case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ | ||
533 | ret = -ENOMEM; | ||
534 | goto fail; | ||
535 | |||
536 | case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ | ||
537 | /* page is already in the swap cache, ignore for now */ | ||
538 | page_cache_release(page); | ||
539 | ret = -EEXIST; | ||
540 | goto fail; | ||
541 | |||
542 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ | ||
543 | /* decompress */ | ||
544 | dlen = PAGE_SIZE; | ||
545 | src = (u8 *)zbud_map(tree->pool, entry->handle) + | ||
546 | sizeof(struct zswap_header); | ||
547 | dst = kmap_atomic(page); | ||
548 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, | ||
549 | entry->length, dst, &dlen); | ||
550 | kunmap_atomic(dst); | ||
551 | zbud_unmap(tree->pool, entry->handle); | ||
552 | BUG_ON(ret); | ||
553 | BUG_ON(dlen != PAGE_SIZE); | ||
554 | |||
555 | /* page is up to date */ | ||
556 | SetPageUptodate(page); | ||
557 | } | ||
558 | |||
559 | /* start writeback */ | ||
560 | __swap_writepage(page, &wbc, end_swap_bio_write); | ||
561 | page_cache_release(page); | ||
562 | zswap_written_back_pages++; | ||
563 | |||
564 | spin_lock(&tree->lock); | ||
565 | |||
566 | /* drop local reference */ | ||
567 | zswap_entry_put(entry); | ||
568 | /* drop the initial reference from entry creation */ | ||
569 | refcount = zswap_entry_put(entry); | ||
570 | |||
571 | /* | ||
572 | * There are three possible values for refcount here: | ||
573 | * (1) refcount is 1, load is in progress, unlink from rbtree, | ||
574 | * load will free | ||
575 | * (2) refcount is 0, (normal case) entry is valid, | ||
576 | * remove from rbtree and free entry | ||
577 | * (3) refcount is -1, invalidate happened during writeback, | ||
578 | * free entry | ||
579 | */ | ||
580 | if (refcount >= 0) { | ||
581 | /* no invalidate yet, remove from rbtree */ | ||
582 | rb_erase(&entry->rbnode, &tree->rbroot); | ||
583 | } | ||
584 | spin_unlock(&tree->lock); | ||
585 | if (refcount <= 0) { | ||
586 | /* free the entry */ | ||
587 | zswap_free_entry(tree, entry); | ||
588 | return 0; | ||
589 | } | ||
590 | return -EAGAIN; | ||
591 | |||
592 | fail: | ||
593 | spin_lock(&tree->lock); | ||
594 | zswap_entry_put(entry); | ||
595 | spin_unlock(&tree->lock); | ||
596 | return ret; | ||
597 | } | ||
598 | |||
599 | /********************************* | ||
600 | * frontswap hooks | ||
601 | **********************************/ | ||
602 | /* attempts to compress and store a single page */ | ||
603 | static int zswap_frontswap_store(unsigned type, pgoff_t offset, | ||
604 | struct page *page) | ||
605 | { | ||
606 | struct zswap_tree *tree = zswap_trees[type]; | ||
607 | struct zswap_entry *entry, *dupentry; | ||
608 | int ret; | ||
609 | unsigned int dlen = PAGE_SIZE, len; | ||
610 | unsigned long handle; | ||
611 | char *buf; | ||
612 | u8 *src, *dst; | ||
613 | struct zswap_header *zhdr; | ||
614 | |||
615 | if (!tree) { | ||
616 | ret = -ENODEV; | ||
617 | goto reject; | ||
618 | } | ||
619 | |||
620 | /* reclaim space if needed */ | ||
621 | if (zswap_is_full()) { | ||
622 | zswap_pool_limit_hit++; | ||
623 | if (zbud_reclaim_page(tree->pool, 8)) { | ||
624 | zswap_reject_reclaim_fail++; | ||
625 | ret = -ENOMEM; | ||
626 | goto reject; | ||
627 | } | ||
628 | } | ||
629 | |||
630 | /* allocate entry */ | ||
631 | entry = zswap_entry_cache_alloc(GFP_KERNEL); | ||
632 | if (!entry) { | ||
633 | zswap_reject_kmemcache_fail++; | ||
634 | ret = -ENOMEM; | ||
635 | goto reject; | ||
636 | } | ||
637 | |||
638 | /* compress */ | ||
639 | dst = get_cpu_var(zswap_dstmem); | ||
640 | src = kmap_atomic(page); | ||
641 | ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); | ||
642 | kunmap_atomic(src); | ||
643 | if (ret) { | ||
644 | ret = -EINVAL; | ||
645 | goto freepage; | ||
646 | } | ||
647 | |||
648 | /* store */ | ||
649 | len = dlen + sizeof(struct zswap_header); | ||
650 | ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, | ||
651 | &handle); | ||
652 | if (ret == -ENOSPC) { | ||
653 | zswap_reject_compress_poor++; | ||
654 | goto freepage; | ||
655 | } | ||
656 | if (ret) { | ||
657 | zswap_reject_alloc_fail++; | ||
658 | goto freepage; | ||
659 | } | ||
660 | zhdr = zbud_map(tree->pool, handle); | ||
661 | zhdr->swpentry = swp_entry(type, offset); | ||
662 | buf = (u8 *)(zhdr + 1); | ||
663 | memcpy(buf, dst, dlen); | ||
664 | zbud_unmap(tree->pool, handle); | ||
665 | put_cpu_var(zswap_dstmem); | ||
666 | |||
667 | /* populate entry */ | ||
668 | entry->offset = offset; | ||
669 | entry->handle = handle; | ||
670 | entry->length = dlen; | ||
671 | |||
672 | /* map */ | ||
673 | spin_lock(&tree->lock); | ||
674 | do { | ||
675 | ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); | ||
676 | if (ret == -EEXIST) { | ||
677 | zswap_duplicate_entry++; | ||
678 | /* remove from rbtree */ | ||
679 | rb_erase(&dupentry->rbnode, &tree->rbroot); | ||
680 | if (!zswap_entry_put(dupentry)) { | ||
681 | /* free */ | ||
682 | zswap_free_entry(tree, dupentry); | ||
683 | } | ||
684 | } | ||
685 | } while (ret == -EEXIST); | ||
686 | spin_unlock(&tree->lock); | ||
687 | |||
688 | /* update stats */ | ||
689 | atomic_inc(&zswap_stored_pages); | ||
690 | zswap_pool_pages = zbud_get_pool_size(tree->pool); | ||
691 | |||
692 | return 0; | ||
693 | |||
694 | freepage: | ||
695 | put_cpu_var(zswap_dstmem); | ||
696 | zswap_entry_cache_free(entry); | ||
697 | reject: | ||
698 | return ret; | ||
699 | } | ||
700 | |||
701 | /* | ||
702 | * returns 0 if the page was successfully decompressed | ||
703 | * returns -1 if the entry was not found or on error | ||
704 | */ | ||
705 | static int zswap_frontswap_load(unsigned type, pgoff_t offset, | ||
706 | struct page *page) | ||
707 | { | ||
708 | struct zswap_tree *tree = zswap_trees[type]; | ||
709 | struct zswap_entry *entry; | ||
710 | u8 *src, *dst; | ||
711 | unsigned int dlen; | ||
712 | int refcount, ret; | ||
713 | |||
714 | /* find */ | ||
715 | spin_lock(&tree->lock); | ||
716 | entry = zswap_rb_search(&tree->rbroot, offset); | ||
717 | if (!entry) { | ||
718 | /* entry was written back */ | ||
719 | spin_unlock(&tree->lock); | ||
720 | return -1; | ||
721 | } | ||
722 | zswap_entry_get(entry); | ||
723 | spin_unlock(&tree->lock); | ||
724 | |||
725 | /* decompress */ | ||
726 | dlen = PAGE_SIZE; | ||
727 | src = (u8 *)zbud_map(tree->pool, entry->handle) + | ||
728 | sizeof(struct zswap_header); | ||
729 | dst = kmap_atomic(page); | ||
730 | ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, | ||
731 | dst, &dlen); | ||
732 | kunmap_atomic(dst); | ||
733 | zbud_unmap(tree->pool, entry->handle); | ||
734 | BUG_ON(ret); | ||
735 | |||
736 | spin_lock(&tree->lock); | ||
737 | refcount = zswap_entry_put(entry); | ||
738 | if (likely(refcount)) { | ||
739 | spin_unlock(&tree->lock); | ||
740 | return 0; | ||
741 | } | ||
742 | spin_unlock(&tree->lock); | ||
743 | |||
744 | /* | ||
745 | * We don't have to unlink from the rbtree because | ||
746 | * zswap_writeback_entry() or zswap_frontswap_invalidate_page() | ||
747 | * has already done this for us if we are the last reference. | ||
748 | */ | ||
749 | /* free */ | ||
750 | |||
751 | zswap_free_entry(tree, entry); | ||
752 | |||
753 | return 0; | ||
754 | } | ||
755 | |||
756 | /* frees an entry in zswap */ | ||
757 | static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
758 | { | ||
759 | struct zswap_tree *tree = zswap_trees[type]; | ||
760 | struct zswap_entry *entry; | ||
761 | int refcount; | ||
762 | |||
763 | /* find */ | ||
764 | spin_lock(&tree->lock); | ||
765 | entry = zswap_rb_search(&tree->rbroot, offset); | ||
766 | if (!entry) { | ||
767 | /* entry was written back */ | ||
768 | spin_unlock(&tree->lock); | ||
769 | return; | ||
770 | } | ||
771 | |||
772 | /* remove from rbtree */ | ||
773 | rb_erase(&entry->rbnode, &tree->rbroot); | ||
774 | |||
775 | /* drop the initial reference from entry creation */ | ||
776 | refcount = zswap_entry_put(entry); | ||
777 | |||
778 | spin_unlock(&tree->lock); | ||
779 | |||
780 | if (refcount) { | ||
781 | /* writeback in progress, writeback will free */ | ||
782 | return; | ||
783 | } | ||
784 | |||
785 | /* free */ | ||
786 | zswap_free_entry(tree, entry); | ||
787 | } | ||
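
Editor's note: zswap_frontswap_load() and zswap_frontswap_invalidate_page() above share one refcount protocol: the count only changes under the tree lock, and whichever caller drops it to zero performs the free. A user-space sketch of that discipline, with a pthread mutex standing in for the spinlock (names are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
        int refcount;
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* Both helpers expect tree_lock to be held, as in the kernel code. */
static void entry_get(struct entry *e)
{
        e->refcount++;
}

/* Returns the new count; 0 tells the caller it must free the entry. */
static int entry_put(struct entry *e)
{
        return --e->refcount;
}

int main(void)
{
        struct entry *e = calloc(1, sizeof(*e));

        if (!e)
                return 1;
        e->refcount = 1;                /* initial reference from the store path */

        /* "load": pin the entry, drop the lock, use it, then unpin */
        pthread_mutex_lock(&tree_lock);
        entry_get(e);
        pthread_mutex_unlock(&tree_lock);

        /* ... decompression would happen here, outside the lock ... */

        pthread_mutex_lock(&tree_lock);
        if (entry_put(e) == 0) {
                pthread_mutex_unlock(&tree_lock);
                free(e);                /* we were the last reference */
                return 0;
        }
        pthread_mutex_unlock(&tree_lock);

        /* "invalidate": drop the initial reference */
        pthread_mutex_lock(&tree_lock);
        if (entry_put(e) == 0) {
                pthread_mutex_unlock(&tree_lock);
                free(e);
                return 0;
        }
        pthread_mutex_unlock(&tree_lock);
        return 0;
}

The entry is only freed once both the lookup reference and the initial store reference are gone, which is why the load path can safely release the lock while decompressing.
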
788 | |||
789 | /* frees all zswap entries for the given swap type */ | ||
790 | static void zswap_frontswap_invalidate_area(unsigned type) | ||
791 | { | ||
792 | struct zswap_tree *tree = zswap_trees[type]; | ||
793 | struct rb_node *node; | ||
794 | struct zswap_entry *entry; | ||
795 | |||
796 | if (!tree) | ||
797 | return; | ||
798 | |||
799 | /* walk the tree and free everything */ | ||
800 | spin_lock(&tree->lock); | ||
801 | /* | ||
802 | * TODO: Even though this code should not be executed because | ||
803 | * the try_to_unuse() in swapoff should have emptied the tree, | ||
804 | * it is very wasteful to rebalance the tree after every | ||
805 | * removal when we are freeing the whole tree. | ||
806 | * | ||
807 | * If post-order traversal code is ever added to the rbtree | ||
808 | * implementation, it should be used here. | ||
809 | */ | ||
810 | while ((node = rb_first(&tree->rbroot))) { | ||
811 | entry = rb_entry(node, struct zswap_entry, rbnode); | ||
812 | rb_erase(&entry->rbnode, &tree->rbroot); | ||
813 | zbud_free(tree->pool, entry->handle); | ||
814 | zswap_entry_cache_free(entry); | ||
815 | atomic_dec(&zswap_stored_pages); | ||
816 | } | ||
817 | tree->rbroot = RB_ROOT; | ||
818 | spin_unlock(&tree->lock); | ||
819 | } | ||
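
Editor's note: on the TODO in zswap_frontswap_invalidate_area(): repeatedly taking rb_first() and erasing forces a rebalance per node, which is wasted work when the whole tree is being thrown away. Later kernels grew a post-order iterator for this (rbtree_postorder_for_each_entry_safe()); the idea can be shown with a plain binary tree in user space (the helpers below are illustrative, not the rbtree API):

#include <stdio.h>
#include <stdlib.h>

/*
 * Post-order teardown: children are freed before their parent, so no
 * rebalancing or re-linking is ever needed.  The plain binary tree
 * stands in for the kernel rbtree.
 */
struct node {
        unsigned long offset;
        struct node *left, *right;
};

static struct node *node_new(unsigned long offset)
{
        struct node *n = calloc(1, sizeof(*n));

        if (n)
                n->offset = offset;
        return n;
}

static void free_postorder(struct node *n)
{
        if (!n)
                return;
        free_postorder(n->left);
        free_postorder(n->right);
        printf("freeing offset %lu\n", n->offset);
        free(n);
}

int main(void)
{
        struct node *root = node_new(2);

        if (!root)
                return 1;
        root->left = node_new(1);
        root->right = node_new(3);
        free_postorder(root);           /* one pass, no per-node rebalance */
        return 0;
}
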
820 | |||
821 | static struct zbud_ops zswap_zbud_ops = { | ||
822 | .evict = zswap_writeback_entry | ||
823 | }; | ||
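
Editor's note: zswap_zbud_ops above is how zbud calls back into zswap. The pool is created with this ops table, and when zbud_reclaim_page() runs in the store path, the allocator invokes .evict so the owner can write the chosen allocation back to the real swap device. A user-space sketch of that ops-table pattern (struct and function names here are made up, not the zbud interface):

#include <stdio.h>

struct pool;

/* Callback table handed to the allocator at pool-creation time. */
struct pool_ops {
        int (*evict)(struct pool *pool, unsigned long handle);
};

struct pool {
        const struct pool_ops *ops;
};

/* Allocator side: under memory pressure, ask the owner to evict. */
static int pool_reclaim_one(struct pool *pool, unsigned long handle)
{
        return pool->ops->evict(pool, handle);
}

/* Owner side: what a writeback-style evict hook could look like. */
static int my_evict(struct pool *pool, unsigned long handle)
{
        (void)pool;
        printf("writing back handle %lu\n", handle);
        return 0;
}

int main(void)
{
        static const struct pool_ops ops = { .evict = my_evict };
        struct pool p = { .ops = &ops };

        return pool_reclaim_one(&p, 1);
}
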
824 | |||
825 | static void zswap_frontswap_init(unsigned type) | ||
826 | { | ||
827 | struct zswap_tree *tree; | ||
828 | |||
829 | tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); | ||
830 | if (!tree) | ||
831 | goto err; | ||
832 | tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); | ||
833 | if (!tree->pool) | ||
834 | goto freetree; | ||
835 | tree->rbroot = RB_ROOT; | ||
836 | spin_lock_init(&tree->lock); | ||
837 | zswap_trees[type] = tree; | ||
838 | return; | ||
839 | |||
840 | freetree: | ||
841 | kfree(tree); | ||
842 | err: | ||
843 | pr_err("alloc failed, zswap disabled for swap type %d\n", type); | ||
844 | } | ||
845 | |||
846 | static struct frontswap_ops zswap_frontswap_ops = { | ||
847 | .store = zswap_frontswap_store, | ||
848 | .load = zswap_frontswap_load, | ||
849 | .invalidate_page = zswap_frontswap_invalidate_page, | ||
850 | .invalidate_area = zswap_frontswap_invalidate_area, | ||
851 | .init = zswap_frontswap_init | ||
852 | }; | ||
853 | |||
854 | /********************************* | ||
855 | * debugfs functions | ||
856 | **********************************/ | ||
857 | #ifdef CONFIG_DEBUG_FS | ||
858 | #include <linux/debugfs.h> | ||
859 | |||
860 | static struct dentry *zswap_debugfs_root; | ||
861 | |||
862 | static int __init zswap_debugfs_init(void) | ||
863 | { | ||
864 | if (!debugfs_initialized()) | ||
865 | return -ENODEV; | ||
866 | |||
867 | zswap_debugfs_root = debugfs_create_dir("zswap", NULL); | ||
868 | if (!zswap_debugfs_root) | ||
869 | return -ENOMEM; | ||
870 | |||
871 | debugfs_create_u64("pool_limit_hit", S_IRUGO, | ||
872 | zswap_debugfs_root, &zswap_pool_limit_hit); | ||
873 | debugfs_create_u64("reject_reclaim_fail", S_IRUGO, | ||
874 | zswap_debugfs_root, &zswap_reject_reclaim_fail); | ||
875 | debugfs_create_u64("reject_alloc_fail", S_IRUGO, | ||
876 | zswap_debugfs_root, &zswap_reject_alloc_fail); | ||
877 | debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, | ||
878 | zswap_debugfs_root, &zswap_reject_kmemcache_fail); | ||
879 | debugfs_create_u64("reject_compress_poor", S_IRUGO, | ||
880 | zswap_debugfs_root, &zswap_reject_compress_poor); | ||
881 | debugfs_create_u64("written_back_pages", S_IRUGO, | ||
882 | zswap_debugfs_root, &zswap_written_back_pages); | ||
883 | debugfs_create_u64("duplicate_entry", S_IRUGO, | ||
884 | zswap_debugfs_root, &zswap_duplicate_entry); | ||
885 | debugfs_create_u64("pool_pages", S_IRUGO, | ||
886 | zswap_debugfs_root, &zswap_pool_pages); | ||
887 | debugfs_create_atomic_t("stored_pages", S_IRUGO, | ||
888 | zswap_debugfs_root, &zswap_stored_pages); | ||
889 | |||
890 | return 0; | ||
891 | } | ||
892 | |||
893 | static void __exit zswap_debugfs_exit(void) | ||
894 | { | ||
895 | debugfs_remove_recursive(zswap_debugfs_root); | ||
896 | } | ||
897 | #else | ||
898 | static int __init zswap_debugfs_init(void) | ||
899 | { | ||
900 | return 0; | ||
901 | } | ||
902 | |||
903 | static void __exit zswap_debugfs_exit(void) { } | ||
904 | #endif | ||
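
Editor's note: each counter registered above becomes a plain file under the "zswap" debugfs directory. Assuming debugfs is mounted at its usual /sys/kernel/debug location and the kernel was built with CONFIG_DEBUG_FS, the values can be read from user space like any other file; a small reader for one of them:

#include <stdio.h>

/*
 * Reads one of the zswap debugfs counters created above.  Assumes
 * debugfs is mounted at /sys/kernel/debug; run with read access to
 * that directory (typically root).
 */
int main(void)
{
        unsigned long long pages;
        FILE *f = fopen("/sys/kernel/debug/zswap/pool_pages", "r");

        if (!f) {
                perror("open pool_pages");
                return 1;
        }
        if (fscanf(f, "%llu", &pages) == 1)
                printf("zswap pool size: %llu pages\n", pages);
        fclose(f);
        return 0;
}
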
905 | |||
906 | /********************************* | ||
907 | * module init and exit | ||
908 | **********************************/ | ||
909 | static int __init init_zswap(void) | ||
910 | { | ||
911 | if (!zswap_enabled) | ||
912 | return 0; | ||
913 | |||
914 | pr_info("loading zswap\n"); | ||
915 | if (zswap_entry_cache_create()) { | ||
916 | pr_err("entry cache creation failed\n"); | ||
917 | goto error; | ||
918 | } | ||
919 | if (zswap_comp_init()) { | ||
920 | pr_err("compressor initialization failed\n"); | ||
921 | goto compfail; | ||
922 | } | ||
923 | if (zswap_cpu_init()) { | ||
924 | pr_err("per-cpu initialization failed\n"); | ||
925 | goto pcpufail; | ||
926 | } | ||
927 | frontswap_register_ops(&zswap_frontswap_ops); | ||
928 | if (zswap_debugfs_init()) | ||
929 | pr_warn("debugfs initialization failed\n"); | ||
930 | return 0; | ||
931 | pcpufail: | ||
932 | zswap_comp_exit(); | ||
933 | compfail: | ||
934 | zswap_entry_cache_destory(); | ||
935 | error: | ||
936 | return -ENOMEM; | ||
937 | } | ||
938 | /* must be late so crypto has time to come up */ | ||
939 | late_initcall(init_zswap); | ||
940 | |||
941 | MODULE_LICENSE("GPL"); | ||
942 | MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); | ||
943 | MODULE_DESCRIPTION("Compressed cache for swap pages"); | ||