author    Alexander Graf <agraf@suse.de>	2013-08-28 18:41:59 -0400
committer Alexander Graf <agraf@suse.de>	2013-08-28 18:41:59 -0400
commit    bf550fc93d9855872a95e69e4002256110d89858
tree      10876bb4304bffe54c4160a132e7b8de6577ac4e /mm
parent    7e48c101e0c53e6095c5f4f5e63d14df50aae8fc
parent    cc2df20c7c4ce594c3e17e9cc260c330646012c8

Merge remote-tracking branch 'origin/next' into kvm-ppc-next

Conflicts:
	mm/Kconfig

CMA DMA split and ZSWAP introduction were conflicting, fix up manually.
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             42
-rw-r--r--  mm/Makefile             2
-rw-r--r--  mm/backing-dev.c        5
-rw-r--r--  mm/bootmem.c           39
-rw-r--r--  mm/filemap.c            6
-rw-r--r--  mm/huge_memory.c       30
-rw-r--r--  mm/hugetlb.c            4
-rw-r--r--  mm/internal.h           5
-rw-r--r--  mm/memblock.c           2
-rw-r--r--  mm/memcontrol.c       363
-rw-r--r--  mm/memory-failure.c    22
-rw-r--r--  mm/memory.c            15
-rw-r--r--  mm/memory_hotplug.c   139
-rw-r--r--  mm/mm_init.c           47
-rw-r--r--  mm/mmap.c              40
-rw-r--r--  mm/mmu_notifier.c       2
-rw-r--r--  mm/mremap.c            20
-rw-r--r--  mm/nobootmem.c         35
-rw-r--r--  mm/nommu.c             10
-rw-r--r--  mm/page_alloc.c       384
-rw-r--r--  mm/page_io.c           50
-rw-r--r--  mm/pgtable-generic.c    5
-rw-r--r--  mm/rmap.c               9
-rw-r--r--  mm/shmem.c             16
-rw-r--r--  mm/slab.c              51
-rw-r--r--  mm/slab.h               3
-rw-r--r--  mm/slab_common.c       18
-rw-r--r--  mm/slob.c               4
-rw-r--r--  mm/slub.c              38
-rw-r--r--  mm/sparse.c             8
-rw-r--r--  mm/swap.c             106
-rw-r--r--  mm/swapfile.c          55
-rw-r--r--  mm/util.c               1
-rw-r--r--  mm/vmalloc.c          164
-rw-r--r--  mm/vmscan.c           605
-rw-r--r--  mm/zbud.c             527
-rw-r--r--  mm/zswap.c            943
37 files changed, 2862 insertions, 953 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 81bcb4bd422d..6cdd27043303 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -501,3 +501,45 @@ config CMA_DEBUG
 	  messages for every CMA call as well as various messages while
 	  processing calls such as dma_alloc_from_contiguous().
 	  This option does not affect warning and error messages.
+
+config ZBUD
+	tristate
+	default n
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It is designed to store up to two compressed pages per physical
+	  page.  While this design limits storage density, it has simple and
+	  deterministic reclaim properties that make it preferable to a higher
+	  density approach when reclaim will be used.
+
+config ZSWAP
+	bool "Compressed cache for swap pages (EXPERIMENTAL)"
+	depends on FRONTSWAP && CRYPTO=y
+	select CRYPTO_LZO
+	select ZBUD
+	default n
+	help
+	  A lightweight compressed cache for swap pages.  It takes
+	  pages that are in the process of being swapped out and attempts to
+	  compress them into a dynamically allocated RAM-based memory pool.
+	  This can result in a significant I/O reduction on swap device and,
+	  in the case where decompressing from RAM is faster that swap device
+	  reads, can also improve workload performance.
+
+	  This is marked experimental because it is a new feature (as of
+	  v3.11) that interacts heavily with memory reclaim.  While these
+	  interactions don't cause any known issues on simple memory setups,
+	  they have not be fully explored on the large set of potential
+	  configurations and workloads that exist.
+
+config MEM_SOFT_DIRTY
+	bool "Track memory changes"
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	select PROC_PAGE_MONITOR
+	help
+	  This option enables memory changes tracking by introducing a
+	  soft-dirty bit on pte-s. This bit it set when someone writes
+	  into a page just as regular dirty bit, but unlike the latter
+	  it can be cleared by hands.
+
+	  See Documentation/vm/soft-dirty.txt for more details.
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..f00803386a67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
+obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
@@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZBUD)	+= zbud.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..d014ee5fcbbd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy);
 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
			   unsigned int cap)
 {
-	char tmp[32];
 	int err;
 
 	bdi->name = name;
@@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
 	if (err)
 		return err;
 
-	sprintf(tmp, "%.28s%s", name, "-%d");
-	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+			   atomic_long_inc_return(&bdi_seq));
 	if (err) {
 		bdi_destroy(bdi);
 		return err;
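The hunk above drops the intermediate tmp buffer and instead hands a complete format string ("%.28s-%ld") plus its arguments straight to bdi_register(). Below is a minimal user-space sketch of the same format-forwarding idea; register_name() is a hypothetical stand-in for bdi_register(), not kernel API.

#include <stdarg.h>
#include <stdio.h>

/* Hypothetical printf-style consumer standing in for bdi_register(). */
static int register_name(const char *fmt, ...)
{
	char buf[64];
	va_list ap;

	va_start(ap, fmt);
	vsnprintf(buf, sizeof(buf), fmt, ap);
	va_end(ap);
	return puts(buf) < 0 ? -1 : 0;
}

int main(void)
{
	long seq = 42;
	char tmp[32];

	/* Old pattern: pre-build "loop-%d" and pass it along as a format string. */
	sprintf(tmp, "%.28s%s", "loop", "-%d");
	register_name(tmp, (int)seq);		/* prints "loop-42" */

	/* New pattern: one call, one format, all arguments forwarded directly;
	 * "%.28s" keeps the name bounded and "%ld" matches the long counter. */
	register_name("%.28s-%ld", "loop", seq);
	return 0;
}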
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	return count;
 }
 
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	/*
-	 * In free_area_init_core(), highmem zone's managed_pages is set to
-	 * present_pages, and bootmem allocator doesn't allocate from highmem
-	 * zones. So there's no need to recalculate managed_pages because all
-	 * highmem pages will be managed by the buddy system. Here highmem
-	 * zone also includes highmem movable zone.
-	 */
+	if (reset_managed_pages_done)
+		return;
+
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
-		if (!is_highmem(z))
-			z->managed_pages = 0;
+		z->managed_pages = 0;
 }
 
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+void __init reset_all_zones_managed_pages(void)
 {
-	register_page_bootmem_info_node(pgdat);
-	reset_node_lowmem_managed_pages(pgdat);
-	return free_all_bootmem_core(pgdat->bdata);
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		reset_node_managed_pages(pgdat);
+	reset_managed_pages_done = 1;
 }
 
 /**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
 {
 	unsigned long total_pages = 0;
 	bootmem_data_t *bdata;
-	struct pglist_data *pgdat;
 
-	for_each_online_pgdat(pgdat)
-		reset_node_lowmem_managed_pages(pgdat);
+	reset_all_zones_managed_pages();
 
 	list_for_each_entry(bdata, &bdata_list, list)
		total_pages += free_all_bootmem_core(bdata);
 
+	totalram_pages += total_pages;
+
 	return total_pages;
 }
 
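The new reset_managed_pages_done flag makes the managed_pages reset idempotent, so it is safe to reach it from more than one initialization path. A toy user-space model of that run-once reset follows; zone_t, zones[] and reset_done are illustrative stand-ins, not kernel structures.

#include <stdio.h>

typedef struct { unsigned long managed_pages; } zone_t;

#define NZONES 4
static zone_t zones[NZONES] = { {10}, {20}, {30}, {40} };

static int reset_done;	/* plays the role of reset_managed_pages_done */

static void reset_all_managed_pages(void)
{
	if (reset_done)			/* later calls become no-ops */
		return;
	for (int i = 0; i < NZONES; i++)
		zones[i].managed_pages = 0;
	reset_done = 1;
}

int main(void)
{
	reset_all_managed_pages();	/* zeroes every zone once */
	zones[0].managed_pages = 5;	/* re-counted by the "buddy" side */
	reset_all_managed_pages();	/* must not wipe the new count */
	printf("%lu\n", zones[0].managed_pages);	/* prints 5 */
	return 0;
}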
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe721aa8..4b51ac1acae7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (!ra->ra_pages)
 		return;
 
-	if (VM_SequentialReadHint(vma)) {
+	if (vma->vm_flags & VM_SEQ_READ) {
 		page_cache_sync_readahead(mapping, ra, file, offset,
					  ra->ra_pages);
 		return;
@@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (ra->mmap_miss > 0)
 		ra->mmap_miss--;
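For reference, the wrappers being removed here are understood to have been plain flag tests in include/linux/mm.h along the following lines (reconstructed from memory, not shown in this diff), which is why open-coding them is a behaviour-neutral cleanup:

#define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
#define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)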
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..243e710c6039 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pmd_t entry;
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
-		pgtable_trans_huge_deposit(mm, pgtable);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	mm->nr_ptes++;
 	return true;
 }
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
+	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		 * young bit, instead of the current set_pmd_at.
 		 */
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+					  pmd, _pmd,  1))
+			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+		/*
+		 * For architectures like ppc64 we look at deposited pgtable
+		 * when calling pmdp_get_and_clear. So do the
+		 * pgtable_trans_huge_withdraw after finishing pmdp related
+		 * operations.
+		 */
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
@@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 	if (ret == 1) {
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
-		set_pmd_at(mm, new_addr, new_pmd, pmd);
+		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 		spin_unlock(&mm->page_table_lock);
 	}
 out:
@@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page,
 	pmd = page_check_address_pmd(page, mm, address,
				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
 	if (pmd) {
-		pgtable = pgtable_trans_huge_withdraw(mm);
+		pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 		pmd_populate(mm, &_pmd, pgtable);
 
 		haddr = address;
@@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(&mm->page_table_lock);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
 	*hpage = NULL;
@@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
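All of these hunks enforce one ordering rule: deposit the preallocated page table before publishing the huge pmd with set_pmd_at(), and withdraw it only after pmdp_get_and_clear()/TLB teardown, because ppc64 consults the deposited table from its pmdp helpers. A toy model of that contract is sketched below; pmd_slot, deposit() and withdraw() are illustrative stand-ins, not the kernel interfaces.

#include <assert.h>
#include <stddef.h>

struct pmd_slot {
	int huge_mapped;	/* stands in for the installed huge pmd */
	void *deposited;	/* stands in for the per-pmd deposited page table */
};

static void deposit(struct pmd_slot *pmd, void *pgtable)
{
	pmd->deposited = pgtable;
}

static void *withdraw(struct pmd_slot *pmd)
{
	void *p = pmd->deposited;

	pmd->deposited = NULL;
	return p;
}

static void map_huge(struct pmd_slot *pmd, void *pgtable)
{
	deposit(pmd, pgtable);		/* new order: deposit first ... */
	pmd->huge_mapped = 1;		/* ... then publish the mapping */
}

static void *unmap_huge(struct pmd_slot *pmd)
{
	/* an arch like ppc64 may still look at pmd->deposited right here */
	assert(pmd->deposited != NULL);
	pmd->huge_mapped = 0;		/* pmdp_get_and_clear() analogue */
	return withdraw(pmd);		/* withdraw only afterwards */
}

int main(void)
{
	struct pmd_slot pmd = { 0, NULL };
	int pgtable;

	map_huge(&pmd, &pgtable);
	return unmap_huge(&pmd) == &pgtable ? 0 : 1;
}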
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index aed085ad11a8..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
 
 	hstate = hstate_vma(vma);
 
-	return 1UL << (hstate->order + PAGE_SHIFT);
+	return 1UL << huge_page_shift(hstate);
 }
 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
 
@@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
 		 * side-effects, like CommitLimit going negative.
 		 */
 		if (h->order > (MAX_ORDER - 1))
-			totalram_pages += 1 << h->order;
+			adjust_managed_page_count(page, 1 << h->order);
 	}
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 8562de0a5197..4390ac6c106e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page)
 	set_page_count(page, 1);
 }
 
-static inline void __put_page(struct page *page)
-{
-	atomic_dec(&page->_count);
-}
-
 static inline void __get_page_tail_foll(struct page *page,
					bool get_page_head)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index c5fad932fa51..a847bfe6f3ba 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %MAX_NUMNODES for all nodes
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @out_nid: ptr to int for nid of the range, can be %NULL
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..d12ca6f3c293 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
-struct mem_cgroup_lru_info {
-	struct mem_cgroup_per_node *nodeinfo[0];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -267,28 +263,10 @@ struct mem_cgroup {
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
 
-	union {
-		/*
-		 * the counter to account for mem+swap usage.
-		 */
-		struct res_counter memsw;
-
-		/*
-		 * rcu_freeing is used only when freeing struct mem_cgroup,
-		 * so put it into a union to avoid wasting more memory.
-		 * It must be disjoint from the css field.  It could be
-		 * in a union with the res field, but res plays a much
-		 * larger part in mem_cgroup life than memsw, and might
-		 * be of interest, even at time of free, when debugging.
-		 * So share rcu_head with the less interesting memsw.
-		 */
-		struct rcu_head rcu_freeing;
-		/*
-		 * We also need some space for a worker in deferred freeing.
-		 * By the time we call it, rcu_freeing is no longer in use.
-		 */
-		struct work_struct work_freeing;
-	};
+	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
 
 	/*
	 * the counter to account for kernel memory usage.
@@ -303,8 +281,6 @@ struct mem_cgroup {
 	bool		oom_lock;
 	atomic_t	under_oom;
 
-	atomic_t	refcnt;
-
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
@@ -366,14 +342,8 @@ struct mem_cgroup {
 	atomic_t	numainfo_updating;
 #endif
 
-	/*
-	 * Per cgroup active and inactive list, similar to the
-	 * per zone LRU lists.
-	 *
-	 * WARNING: This has to be the last element of the struct. Don't
-	 * add new fields after this point.
-	 */
-	struct mem_cgroup_lru_info info;
+	struct mem_cgroup_per_node *nodeinfo[0];
+	/* WARNING: nodeinfo must be the last member here */
 };
 
 static size_t memcg_size(void)
@@ -416,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
 
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
+	/*
+	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
+	 * will call css_put() if it sees the memcg is dead.
+	 */
+	smp_wmb();
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 }
@@ -508,9 +483,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static void mem_cgroup_get(struct mem_cgroup *memcg);
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 {
@@ -561,15 +533,15 @@ void sock_update_memcg(struct sock *sk)
 		 */
 		if (sk->sk_cgrp) {
 			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
-			mem_cgroup_get(sk->sk_cgrp->memcg);
+			css_get(&sk->sk_cgrp->memcg->css);
 			return;
 		}
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
 		cg_proto = sk->sk_prot->proto_cgroup(memcg);
-		if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
-			mem_cgroup_get(memcg);
+		if (!mem_cgroup_is_root(memcg) &&
+		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
@@ -583,7 +555,7 @@ void sock_release_memcg(struct sock *sk)
 		struct mem_cgroup *memcg;
 		WARN_ON(!sk->sk_cgrp->memcg);
 		memcg = sk->sk_cgrp->memcg;
-		mem_cgroup_put(memcg);
+		css_put(&sk->sk_cgrp->memcg->css);
 	}
 }
 
@@ -683,7 +655,7 @@ static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
-	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
+	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -1148,6 +1120,58 @@ skip_node:
 	return NULL;
 }
 
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+{
+	/*
+	 * When a group in the hierarchy below root is destroyed, the
+	 * hierarchy iterator can no longer be trusted since it might
+	 * have pointed to the destroyed group.  Invalidate it.
+	 */
+	atomic_inc(&root->dead_count);
+}
+
+static struct mem_cgroup *
+mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
+		     struct mem_cgroup *root,
+		     int *sequence)
+{
+	struct mem_cgroup *position = NULL;
+	/*
+	 * A cgroup destruction happens in two stages: offlining and
+	 * release.  They are separated by a RCU grace period.
+	 *
+	 * If the iterator is valid, we may still race with an
+	 * offlining.  The RCU lock ensures the object won't be
+	 * released, tryget will fail if we lost the race.
+	 */
+	*sequence = atomic_read(&root->dead_count);
+	if (iter->last_dead_count == *sequence) {
+		smp_rmb();
+		position = iter->last_visited;
+		if (position && !css_tryget(&position->css))
+			position = NULL;
+	}
+	return position;
+}
+
+static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
+				   struct mem_cgroup *last_visited,
+				   struct mem_cgroup *new_position,
+				   int sequence)
+{
+	if (last_visited)
+		css_put(&last_visited->css);
+	/*
+	 * We store the sequence count from the time @last_visited was
+	 * loaded successfully instead of rereading it here so that we
+	 * don't lose destruction events in between.  We could have
+	 * raced with the destruction of @new_position after all.
+	 */
+	iter->last_visited = new_position;
+	smp_wmb();
+	iter->last_dead_count = sequence;
+}
+
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1171,7 +1195,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
-	unsigned long uninitialized_var(dead_count);
 
 	if (mem_cgroup_disabled())
		return NULL;
@@ -1191,6 +1214,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	rcu_read_lock();
 	while (!memcg) {
 		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+		int uninitialized_var(seq);
 
 		if (reclaim) {
			int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1228,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				goto out_unlock;
			}
 
-			/*
-			 * If the dead_count mismatches, a destruction
-			 * has happened or is happening concurrently.
-			 * If the dead_count matches, a destruction
-			 * might still happen concurrently, but since
-			 * we checked under RCU, that destruction
-			 * won't free the object until we release the
-			 * RCU reader lock.  Thus, the dead_count
-			 * check verifies the pointer is still valid,
-			 * css_tryget() verifies the cgroup pointed to
-			 * is alive.
-			 */
-			dead_count = atomic_read(&root->dead_count);
-			if (dead_count == iter->last_dead_count) {
-				smp_rmb();
-				last_visited = iter->last_visited;
-				if (last_visited &&
-				    !css_tryget(&last_visited->css))
-					last_visited = NULL;
-			}
+			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
 		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
-			if (last_visited)
-				css_put(&last_visited->css);
-
-			iter->last_visited = memcg;
-			smp_wmb();
-			iter->last_dead_count = dead_count;
+			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
 
			if (!memcg)
				iter->generation++;
@@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 	return ret;
 }
 
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task,
+			const struct mem_cgroup *memcg)
 {
-	int ret;
 	struct mem_cgroup *curr = NULL;
 	struct task_struct *p;
+	bool ret;
 
 	p = find_lock_task_mm(task);
 	if (p) {
@@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
-		task_lock(task);
+		rcu_read_lock();
		curr = mem_cgroup_from_task(task);
		if (curr)
			css_get(&curr->css);
-		task_unlock(task);
+		rcu_read_unlock();
 	}
 	if (!curr)
-		return 0;
+		return false;
 	/*
	 * We should check use_hierarchy of "memcg" not "curr". Because checking
	 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -3031,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
 	if (res_counter_uncharge(&memcg->kmem, size))
		return;
 
+	/*
+	 * Releases a reference taken in kmem_cgroup_css_offline in case
+	 * this last uncharge is racing with the offlining code or it is
+	 * outliving the memcg existence.
+	 *
+	 * The memory barrier imposed by test&clear is paired with the
+	 * explicit one in memcg_kmem_mark_dead().
+	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
@@ -3223,7 +3232,7 @@ void memcg_release_cache(struct kmem_cache *s)
 	list_del(&s->memcg_params->list);
 	mutex_unlock(&memcg->slab_caches_mutex);
 
-	mem_cgroup_put(memcg);
+	css_put(&memcg->css);
 out:
 	kfree(s->memcg_params);
 }
@@ -3383,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	mutex_lock(&memcg_cache_mutex);
 	new_cachep = cachep->memcg_params->memcg_caches[idx];
-	if (new_cachep)
+	if (new_cachep) {
+		css_put(&memcg->css);
		goto out;
+	}
 
 	new_cachep = kmem_cache_dup(memcg, cachep);
 	if (new_cachep == NULL) {
		new_cachep = cachep;
+		css_put(&memcg->css);
		goto out;
 	}
 
-	mem_cgroup_get(memcg);
 	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
 
 	cachep->memcg_params->memcg_caches[idx] = new_cachep;
@@ -3480,8 +3491,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
 
 	cw = container_of(w, struct create_work, work);
 	memcg_create_kmem_cache(cw->memcg, cw->cachep);
-	/* Drop the reference gotten when we enqueued. */
-	css_put(&cw->memcg->css);
 	kfree(cw);
 }
 
@@ -3618,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 	int ret;
 
 	*_memcg = NULL;
+
+	/*
+	 * Disabling accounting is only relevant for some specific memcg
+	 * internal allocations. Therefore we would initially not have such
+	 * check here, since direct calls to the page allocator that are marked
+	 * with GFP_KMEMCG only happen outside memcg core. We are mostly
+	 * concerned with cache allocations, and by having this test at
+	 * memcg_kmem_get_cache, we are already able to relay the allocation to
+	 * the root cache and bypass the memcg cache altogether.
+	 *
+	 * There is one exception, though: the SLUB allocator does not create
+	 * large order caches, but rather service large kmallocs directly from
+	 * the page allocator. Therefore, the following sequence when backed by
+	 * the SLUB allocator:
+	 *
+	 * memcg_stop_kmem_account();
+	 * kmalloc(<large_number>)
+	 * memcg_resume_kmem_account();
+	 *
+	 * would effectively ignore the fact that we should skip accounting,
+	 * since it will drive us directly to this function without passing
+	 * through the cache selector memcg_kmem_get_cache. Such large
+	 * allocations are extremely rare but can happen, for instance, for the
+	 * cache arrays. We bring this test here.
+	 */
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return true;
+
 	memcg = try_get_mem_cgroup_from_mm(current->mm);
 
 	/*
@@ -4171,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 	unlock_page_cgroup(pc);
 	/*
	 * even after unlock, we have memcg->res.usage here and this memcg
-	 * will never be freed.
+	 * will never be freed, so it's safe to call css_get().
	 */
 	memcg_check_events(memcg, page);
 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
		mem_cgroup_swap_statistics(memcg, true);
-		mem_cgroup_get(memcg);
+		css_get(&memcg->css);
 	}
 	/*
	 * Migration does not charge the res_counter for the
@@ -4288,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	/*
	 * record memcg information,  if swapout && memcg != NULL,
-	 * mem_cgroup_get() was called in uncharge().
+	 * css_get() was called in uncharge().
	 */
 	if (do_swap_account && swapout && memcg)
		swap_cgroup_record(ent, css_id(&memcg->css));
@@ -4319,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
		if (!mem_cgroup_is_root(memcg))
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_swap_statistics(memcg, false);
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 	}
 	rcu_read_unlock();
 }
@@ -4353,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
		 * This function is only called from task migration context now.
		 * It postpones res_counter and refcount handling till the end
		 * of task migration(mem_cgroup_clear_mc()) for performance
-		 * improvement. But we cannot postpone mem_cgroup_get(to)
-		 * because if the process that has been moved to @to does
-		 * swap-in, the refcount of @to might be decreased to 0.
+		 * improvement. But we cannot postpone css_get(to)  because if
+		 * the process that has been moved to @to does swap-in, the
+		 * refcount of @to might be decreased to 0.
+		 *
+		 * We are in attach() phase, so the cgroup is guaranteed to be
+		 * alive, so we can just call css_get().
		 */
-		mem_cgroup_get(to);
+		css_get(&to->css);
		return 0;
 	}
 	return -EINVAL;
@@ -5136,14 +5176,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
		 * starts accounting before all call sites are patched
		 */
		memcg_kmem_set_active(memcg);
-
-		/*
-		 * kmem charges can outlive the cgroup. In the case of slab
-		 * pages, for instance, a page contain objects from various
-		 * processes, so it is unfeasible to migrate them away. We
-		 * need to reference count the memcg because of that.
-		 */
-		mem_cgroup_get(memcg);
 	} else
		ret = res_counter_set_limit(&memcg->kmem, val);
 out:
@@ -5176,16 +5208,16 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
		goto out;
 
 	/*
-	 * destroy(), called if we fail, will issue static_key_slow_inc() and
-	 * mem_cgroup_put() if kmem is enabled. We have to either call them
-	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
-	 * this more consistent, since it always leads to the same destroy path
+	 * __mem_cgroup_free() will issue static_key_slow_dec() because this
+	 * memcg is active already. If the later initialization fails then the
+	 * cgroup core triggers the cleanup so we do not have to do it here.
	 */
-	mem_cgroup_get(memcg);
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 
 	mutex_lock(&set_limit_mutex);
+	memcg_stop_kmem_account();
 	ret = memcg_update_cache_sizes(memcg);
+	memcg_resume_kmem_account();
 	mutex_unlock(&set_limit_mutex);
 out:
 	return ret;
@@ -5864,23 +5896,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
 	mem_cgroup_sockets_destroy(memcg);
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
+{
+	if (!memcg_kmem_is_active(memcg))
+		return;
+
+	/*
+	 * kmem charges can outlive the cgroup. In the case of slab
+	 * pages, for instance, a page contain objects from various
+	 * processes. As we prevent from taking a reference for every
+	 * such allocation we have to be careful when doing uncharge
+	 * (see memcg_uncharge_kmem) and here during offlining.
+	 *
+	 * The idea is that that only the _last_ uncharge which sees
+	 * the dead memcg will drop the last reference. An additional
+	 * reference is taken here before the group is marked dead
+	 * which is then paired with css_put during uncharge resp. here.
+	 *
+	 * Although this might sound strange as this path is called from
+	 * css_offline() when the referencemight have dropped down to 0
+	 * and shouldn't be incremented anymore (css_tryget would fail)
+	 * we do not have other options because of the kmem allocations
+	 * lifetime.
+	 */
+	css_get(&memcg->css);
 
 	memcg_kmem_mark_dead(memcg);
 
 	if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
		return;
 
-	/*
-	 * Charges already down to 0, undo mem_cgroup_get() done in the charge
-	 * path here, being careful not to race with memcg_uncharge_kmem: it is
-	 * possible that the charges went down to 0 between mark_dead and the
-	 * res_counter read, so in that case, we don't need the put
-	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 #else
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5888,7 +5940,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return 0;
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+{
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
 {
 }
 #endif
@@ -6058,13 +6114,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
		mz->on_tree = false;
		mz->memcg = memcg;
 	}
-	memcg->info.nodeinfo[node] = pn;
+	memcg->nodeinfo[node] = pn;
 	return 0;
 }
 
 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
-	kfree(memcg->info.nodeinfo[node]);
+	kfree(memcg->nodeinfo[node]);
 }
 
 static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -6137,49 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	vfree(memcg);
 }
 
-
-/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context.  The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
- */
-static void free_work(struct work_struct *work)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	__mem_cgroup_free(memcg);
-}
-
-static void free_rcu(struct rcu_head *rcu_head)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, free_work);
-	schedule_work(&memcg->work_freeing);
-}
-
-static void mem_cgroup_get(struct mem_cgroup *memcg)
-{
-	atomic_inc(&memcg->refcnt);
-}
-
-static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
-{
-	if (atomic_sub_and_test(count, &memcg->refcnt)) {
-		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-		call_rcu(&memcg->rcu_freeing, free_rcu);
-		if (parent)
-			mem_cgroup_put(parent);
-	}
-}
-
-static void mem_cgroup_put(struct mem_cgroup *memcg)
-{
-	__mem_cgroup_put(memcg, 1);
-}
-
 /*
  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
  */
@@ -6239,7 +6252,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
-	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
@@ -6275,12 +6287,9 @@ mem_cgroup_css_online(struct cgroup *cont)
		res_counter_init(&memcg->kmem, &parent->kmem);
 
		/*
-		 * We increment refcnt of the parent to ensure that we can
-		 * safely access it on res_counter_charge/uncharge.
-		 * This refcnt will be decremented when freeing this
-		 * mem_cgroup(see mem_cgroup_put).
+		 * No need to take a reference to the parent because cgroup
+		 * core guarantees its existence.
		 */
-		mem_cgroup_get(parent);
 	} else {
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
@@ -6296,16 +6305,6 @@ mem_cgroup_css_online(struct cgroup *cont)
 
 	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	mutex_unlock(&memcg_create_mutex);
-	if (error) {
-		/*
-		 * We call put now because our (and parent's) refcnts
-		 * are already in place. mem_cgroup_put() will internally
-		 * call __mem_cgroup_free, so return directly
-		 */
-		mem_cgroup_put(memcg);
-		if (parent->use_hierarchy)
-			mem_cgroup_put(parent);
-	}
 	return error;
 }
 
@@ -6317,20 +6316,22 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
 	struct mem_cgroup *parent = memcg;
 
 	while ((parent = parent_mem_cgroup(parent)))
-		atomic_inc(&parent->dead_count);
+		mem_cgroup_iter_invalidate(parent);
 
 	/*
	 * if the root memcg is not hierarchical we have to check it
	 * explicitely.
	 */
 	if (!root_mem_cgroup->use_hierarchy)
-		atomic_inc(&root_mem_cgroup->dead_count);
+		mem_cgroup_iter_invalidate(root_mem_cgroup);
 }
 
 static void mem_cgroup_css_offline(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
+	kmem_cgroup_css_offline(memcg);
+
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 	mem_cgroup_reparent_charges(memcg);
 	mem_cgroup_destroy_all_caches(memcg);
@@ -6340,9 +6341,8 @@ static void mem_cgroup_css_free(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	kmem_cgroup_destroy(memcg);
-
-	mem_cgroup_put(memcg);
+	memcg_destroy_kmem(memcg);
+	__mem_cgroup_free(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -6651,6 +6651,7 @@ static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
+	int i;
 
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
@@ -6671,7 +6672,9 @@ static void __mem_cgroup_clear_mc(void)
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
					     PAGE_SIZE * mc.moved_swap);
-		__mem_cgroup_put(mc.from, mc.moved_swap);
+
+		for (i = 0; i < mc.moved_swap; i++)
+			css_put(&mc.from->css);
 
		if (!mem_cgroup_is_root(mc.to)) {
			/*
@@ -6681,7 +6684,7 @@ static void __mem_cgroup_clear_mc(void)
			res_counter_uncharge(&mc.to->res,
					     PAGE_SIZE * mc.moved_swap);
		}
-		/* we've already done mem_cgroup_get(mc.to) */
+		/* we've already done css_get(mc.to) */
		mc.moved_swap = 0;
 	}
 	memcg_oom_recover(from);
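The mem_cgroup_iter_load()/mem_cgroup_iter_update() pair added above caches the last visited group and only trusts the cached pointer when the hierarchy's dead_count is unchanged and a css_tryget() still succeeds; the rest of the file then retires the private refcnt in favour of css_get()/css_put(). A stripped-down, user-space model of that validation scheme follows; the names and the use of C11 seq_cst atomics (in place of the kernel's css primitives and smp_rmb()/smp_wmb() pairing) are illustrative assumptions, not the kernel API.

#include <stdatomic.h>
#include <stddef.h>

struct group {
	atomic_int refs;		/* css reference count analogue */
};

struct iter_cache {
	struct group *last_visited;
	int last_dead_count;
};

static atomic_int dead_count;		/* bumped whenever any group is destroyed */

static int tryget(struct group *g)	/* css_tryget() analogue */
{
	int r = atomic_load(&g->refs);

	while (r > 0)
		if (atomic_compare_exchange_weak(&g->refs, &r, r + 1))
			return 1;
	return 0;			/* group already dying: refuse the reference */
}

static struct group *iter_load(struct iter_cache *it, int *seq)
{
	struct group *pos = NULL;

	*seq = atomic_load(&dead_count);
	if (it->last_dead_count == *seq) {	/* nothing destroyed since we cached it */
		pos = it->last_visited;
		if (pos && !tryget(pos))
			pos = NULL;		/* cached group died under us */
	}
	return pos;
}

static void iter_update(struct iter_cache *it, struct group *new_pos, int seq)
{
	it->last_visited = new_pos;
	it->last_dead_count = seq;	/* the value read at load time, deliberately */
}

int main(void)
{
	static struct group g;
	struct iter_cache it = { &g, 0 };
	int seq;

	atomic_store(&g.refs, 1);
	struct group *p = iter_load(&it, &seq);		/* valid: returns &g */
	iter_update(&it, p, seq);

	atomic_fetch_add(&dead_count, 1);		/* a destruction elsewhere ... */
	return iter_load(&it, &seq) == NULL ? 0 : 1;	/* ... invalidates the cache */
}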
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..2c13aa7a0164 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 
 	/*
	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free.
+	 * was free. This flag should be kept set until the source page
+	 * is freed and PG_hwpoison on it is set.
	 */
 	set_migratetype_isolate(p, true);
 	/*
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
		/* Not a free page */
		ret = 1;
 	}
-	unset_migratetype_isolate(p, MIGRATE_MOVABLE);
 	unlock_memory_hotplug();
 	return ret;
 }
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
			atomic_long_add(1 << compound_trans_order(hpage),
					&num_poisoned_pages);
 	}
-	/* keep elevated page count for bad page */
 	return ret;
 }
 
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags)
			atomic_long_inc(&num_poisoned_pages);
		}
 	}
-	/* keep elevated page count for bad page */
+	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
 	return ret;
 }
 
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags)
			if (ret > 0)
				ret = -EIO;
		} else {
+			/*
+			 * After page migration succeeds, the source page can
+			 * be trapped in pagevec and actual freeing is delayed.
+			 * Freeing code works differently based on PG_hwpoison,
+			 * so there's a race. We need to make sure that the
+			 * source page should be freed back to buddy before
+			 * setting PG_hwpoison.
+			 */
+			if (!is_free_buddy_page(page))
+				lru_add_drain_all();
+			if (!is_free_buddy_page(page))
+				drain_all_pages();
			SetPageHWPoison(page);
+			if (!is_free_buddy_page(page))
+				pr_info("soft offline: %#lx: page leaked\n",
+					pfn);
			atomic_long_inc(&num_poisoned_pages);
		}
 	} else {
diff --git a/mm/memory.c b/mm/memory.c
index 95d0cce63583..1ce2e2a734fc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
 EXPORT_SYMBOL(mem_map);
 #endif
 
-unsigned long num_physpages;
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
  * that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
  */
 void * high_memory;
 
-EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 
 /*
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
+	unsigned long range_start = addr;
 
 again:
 	init_rss_vec(rss);
@@ -1151,7 +1150,7 @@ again:
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
-				    likely(!VM_SequentialReadHint(vma)))
+				    likely(!(vma->vm_flags & VM_SEQ_READ)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
@@ -1206,12 +1205,14 @@ again:
		force_flush = 0;
 
 #ifdef HAVE_GENERIC_MMU_GATHER
-		tlb->start = addr;
-		tlb->end = end;
+		tlb->start = range_start;
+		tlb->end = addr;
 #endif
		tlb_flush_mmu(tlb);
-		if (addr != end)
+		if (addr != end) {
+			range_start = addr;
			goto again;
+		}
 	}
 
 	return addr;
@@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
			details->first_index, details->last_index) {
 
		vba = vma->vm_pgoff;
-		vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
+		vea = vba + vma_pages(vma) - 1;
		/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
		zba = details->first_index;
		if (zba < vba)
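The range_start bookkeeping added to zap_pte_range() makes each intermediate TLB flush cover exactly the span unmapped since the previous flush, rather than the current position through the final end. A small self-contained model of that loop structure follows; flush() and the page-at-a-time "unmap" step are stand-ins, not the mmu_gather API.

#include <assert.h>

static unsigned long flushed_from, flushed_to;

static void flush(unsigned long start, unsigned long end)
{
	flushed_from = start;
	flushed_to = end;
}

static void zap_range(unsigned long addr, unsigned long end, unsigned long chunk)
{
	unsigned long range_start = addr;	/* what the patch adds */

again:
	while (addr < end && addr - range_start < chunk)
		addr++;				/* "unmap" one unit */

	flush(range_start, addr);		/* was effectively flush(addr, end) */

	if (addr != end) {
		range_start = addr;		/* restart bookkeeping for next chunk */
		goto again;
	}
}

int main(void)
{
	zap_range(0, 10, 4);
	/* the final flush covers only the last chunk, [8, 10) */
	assert(flushed_from == 8 && flushed_to == 10);
	return 0;
}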
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1ad92b46753e..ca1dd3aa5eee 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
 	res->end = start + size - 1;
 	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 	if (request_resource(&iomem_resource, res) < 0) {
-		printk("System RAM resource %pR cannot be added\n", res);
+		pr_debug("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
 	}
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info,  struct page *page,
 	atomic_inc(&page->_count);
 }
 
-/* reference to __meminit __free_pages_bootmem is valid
- * so use __ref to tell modpost not to generate a warning */
-void __ref put_page_bootmem(struct page *page)
+void put_page_bootmem(struct page *page)
 {
 	unsigned long type;
-	static DEFINE_MUTEX(ppb_lock);
 
 	type = (unsigned long) page->lru.next;
 	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page)
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
-
-		/*
-		 * Please refer to comment for __free_pages_bootmem()
-		 * for why we serialize here.
-		 */
-		mutex_lock(&ppb_lock);
-		__free_pages_bootmem(page, 0);
-		mutex_unlock(&ppb_lock);
-		totalram_pages++;
+		free_reserved_page(page);
 	}
-
 }
 
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
@@ -220,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 	pfn = pgdat->node_start_pfn;
 	end_pfn = pgdat_end_pfn(pgdat);
 
-	/* register_section info */
+	/* register section info */
 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
-		 * reside in some other node.
+		 * reside in some other nodes.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
 	/* can't move pfns which are higher than @z2 */
 	if (end_pfn > zone_end_pfn(z2))
		goto out_fail;
-	/* the move out part mast at the left most of @z2 */
+	/* the move out part must be at the left most of @z2 */
 	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
 	/* must included/overlap */
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
 
 void __online_page_set_limits(struct page *page)
 {
-	unsigned long pfn = page_to_pfn(page);
-
-	if (pfn >= num_physpages)
-		num_physpages = pfn + 1;
 }
 EXPORT_SYMBOL_GPL(__online_page_set_limits);
 
 void __online_page_increment_counters(struct page *page)
 {
-	totalram_pages++;
-
-#ifdef CONFIG_HIGHMEM
-	if (PageHighMem(page))
-		totalhigh_pages++;
-#endif
+	adjust_managed_page_count(page, 1);
 }
 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
 
 void __online_page_free(struct page *page)
 {
-	ClearPageReserved(page);
-	init_page_count(page);
-	__free_page(page);
+	__free_reserved_page(page);
 }
 EXPORT_SYMBOL_GPL(__online_page_free);
 
@@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
 
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
920{ 897{
898 unsigned long flags;
921 unsigned long onlined_pages = 0; 899 unsigned long onlined_pages = 0;
922 struct zone *zone; 900 struct zone *zone;
923 int need_zonelists_rebuild = 0; 901 int need_zonelists_rebuild = 0;
@@ -936,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
936 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 914 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
937 !can_online_high_movable(zone)) { 915 !can_online_high_movable(zone)) {
938 unlock_memory_hotplug(); 916 unlock_memory_hotplug();
939 return -1; 917 return -EINVAL;
940 } 918 }
941 919
942 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 920 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
943 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 921 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
944 unlock_memory_hotplug(); 922 unlock_memory_hotplug();
945 return -1; 923 return -EINVAL;
946 } 924 }
947 } 925 }
948 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 926 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
949 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 927 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
950 unlock_memory_hotplug(); 928 unlock_memory_hotplug();
951 return -1; 929 return -EINVAL;
952 } 930 }
953 } 931 }
954 932
@@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
994 return ret; 972 return ret;
995 } 973 }
996 974
997 zone->managed_pages += onlined_pages;
998 zone->present_pages += onlined_pages; 975 zone->present_pages += onlined_pages;
976
977 pgdat_resize_lock(zone->zone_pgdat, &flags);
999 zone->zone_pgdat->node_present_pages += onlined_pages; 978 zone->zone_pgdat->node_present_pages += onlined_pages;
979 pgdat_resize_unlock(zone->zone_pgdat, &flags);
980
1000 if (onlined_pages) { 981 if (onlined_pages) {
1001 node_states_set_node(zone_to_nid(zone), &arg); 982 node_states_set_node(zone_to_nid(zone), &arg);
1002 if (need_zonelists_rebuild) 983 if (need_zonelists_rebuild)
@@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1487 unsigned long pfn, nr_pages, expire; 1468 unsigned long pfn, nr_pages, expire;
1488 long offlined_pages; 1469 long offlined_pages;
1489 int ret, drain, retry_max, node; 1470 int ret, drain, retry_max, node;
1471 unsigned long flags;
1490 struct zone *zone; 1472 struct zone *zone;
1491 struct memory_notify arg; 1473 struct memory_notify arg;
1492 1474
@@ -1578,10 +1560,12 @@ repeat:
1578 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1560 /* reset pagetype flags and makes migrate type to be MOVABLE */
1579 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1561 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1580 /* removal success */ 1562 /* removal success */
1581 zone->managed_pages -= offlined_pages; 1563 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1582 zone->present_pages -= offlined_pages; 1564 zone->present_pages -= offlined_pages;
1565
1566 pgdat_resize_lock(zone->zone_pgdat, &flags);
1583 zone->zone_pgdat->node_present_pages -= offlined_pages; 1567 zone->zone_pgdat->node_present_pages -= offlined_pages;
1584 totalram_pages -= offlined_pages; 1568 pgdat_resize_unlock(zone->zone_pgdat, &flags);
1585 1569
1586 init_per_zone_wmark_min(); 1570 init_per_zone_wmark_min();
1587 1571
@@ -1621,6 +1605,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1621{ 1605{
1622 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1606 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1623} 1607}
1608#endif /* CONFIG_MEMORY_HOTREMOVE */
1624 1609
1625/** 1610/**
1626 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1611 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
@@ -1634,7 +1619,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1634 * 1619 *
1635 * Returns the return value of func. 1620 * Returns the return value of func.
1636 */ 1621 */
1637static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 1622int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1638 void *arg, int (*func)(struct memory_block *, void *)) 1623 void *arg, int (*func)(struct memory_block *, void *))
1639{ 1624{
1640 struct memory_block *mem = NULL; 1625 struct memory_block *mem = NULL;
@@ -1671,24 +1656,7 @@ static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1671 return 0; 1656 return 0;
1672} 1657}
1673 1658
1674/** 1659#ifdef CONFIG_MEMORY_HOTREMOVE
1675 * offline_memory_block_cb - callback function for offlining memory block
1676 * @mem: the memory block to be offlined
1677 * @arg: buffer to hold error msg
1678 *
1679 * Always return 0, and put the error msg in arg if any.
1680 */
1681static int offline_memory_block_cb(struct memory_block *mem, void *arg)
1682{
1683 int *ret = arg;
1684 int error = offline_memory_block(mem);
1685
1686 if (error != 0 && *ret == 0)
1687 *ret = error;
1688
1689 return 0;
1690}
1691
1692static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) 1660static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1693{ 1661{
1694 int ret = !is_memblock_offlined(mem); 1662 int ret = !is_memblock_offlined(mem);
@@ -1814,54 +1782,22 @@ void try_offline_node(int nid)
1814} 1782}
1815EXPORT_SYMBOL(try_offline_node); 1783EXPORT_SYMBOL(try_offline_node);
1816 1784
1817int __ref remove_memory(int nid, u64 start, u64 size) 1785void __ref remove_memory(int nid, u64 start, u64 size)
1818{ 1786{
1819 unsigned long start_pfn, end_pfn; 1787 int ret;
1820 int ret = 0;
1821 int retry = 1;
1822
1823 start_pfn = PFN_DOWN(start);
1824 end_pfn = PFN_UP(start + size - 1);
1825
1826 /*
1827 * When CONFIG_MEMCG is on, one memory block may be used by other
1828 * blocks to store page cgroup when onlining pages. But we don't know
1829 * in what order pages are onlined. So we iterate twice to offline
1830 * memory:
1831 * 1st iterate: offline every non primary memory block.
1832 * 2nd iterate: offline primary (i.e. first added) memory block.
1833 */
1834repeat:
1835 walk_memory_range(start_pfn, end_pfn, &ret,
1836 offline_memory_block_cb);
1837 if (ret) {
1838 if (!retry)
1839 return ret;
1840
1841 retry = 0;
1842 ret = 0;
1843 goto repeat;
1844 }
1845 1788
1846 lock_memory_hotplug(); 1789 lock_memory_hotplug();
1847 1790
1848 /* 1791 /*
1849 * we have offlined all memory blocks like this: 1792 * All memory blocks must be offlined before removing memory. Check
1850 * 1. lock memory hotplug 1793 * whether all memory blocks in question are offline and trigger a BUG()
1851 * 2. offline a memory block 1794 * if this is not the case.
1852 * 3. unlock memory hotplug
1853 *
1854 * repeat step1-3 to offline the memory block. All memory blocks
1855 * must be offlined before removing memory. But we don't hold the
1856 * lock in the whole operation. So we should check whether all
1857 * memory blocks are offlined.
1858 */ 1795 */
1859 1796 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1860 ret = walk_memory_range(start_pfn, end_pfn, NULL,
1861 is_memblock_offlined_cb); 1797 is_memblock_offlined_cb);
1862 if (ret) { 1798 if (ret) {
1863 unlock_memory_hotplug(); 1799 unlock_memory_hotplug();
1864 return ret; 1800 BUG();
1865 } 1801 }
1866 1802
1867 /* remove memmap entry */ 1803 /* remove memmap entry */
@@ -1872,17 +1808,6 @@ repeat:
1872 try_offline_node(nid); 1808 try_offline_node(nid);
1873 1809
1874 unlock_memory_hotplug(); 1810 unlock_memory_hotplug();
1875
1876 return 0;
1877} 1811}
1878#else
1879int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1880{
1881 return -EINVAL;
1882}
1883int remove_memory(int nid, u64 start, u64 size)
1884{
1885 return -EINVAL;
1886}
1887#endif /* CONFIG_MEMORY_HOTREMOVE */
1888EXPORT_SYMBOL_GPL(remove_memory); 1812EXPORT_SYMBOL_GPL(remove_memory);
1813#endif /* CONFIG_MEMORY_HOTREMOVE */
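The memory_hotplug.c changes above replace the raw -1 error returns with -EINVAL, move the totalram/highmem bookkeeping into adjust_managed_page_count(), and wrap the node_present_pages updates in pgdat_resize_lock()/pgdat_resize_unlock(). A minimal userspace sketch of that last pattern follows; the pthread mutex stands in for the pgdat resize lock (which in the kernel also saves and restores IRQ flags), and all names are illustrative.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t resize_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long node_present_pages;

static void online_pages_demo(unsigned long onlined)
{
        pthread_mutex_lock(&resize_lock);       /* pgdat_resize_lock() */
        node_present_pages += onlined;
        pthread_mutex_unlock(&resize_lock);     /* pgdat_resize_unlock() */
}

static void offline_pages_demo(unsigned long offlined)
{
        pthread_mutex_lock(&resize_lock);
        node_present_pages -= offlined;
        pthread_mutex_unlock(&resize_lock);
}

int main(void)
{
        online_pages_demo(512);
        offline_pages_demo(128);
        printf("node_present_pages = %lu\n", node_present_pages);
        return 0;
}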
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/memory.h>
13#include <linux/notifier.h>
12#include "internal.h" 14#include "internal.h"
13 15
14#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
147struct kobject *mm_kobj; 149struct kobject *mm_kobj;
148EXPORT_SYMBOL_GPL(mm_kobj); 150EXPORT_SYMBOL_GPL(mm_kobj);
149 151
152#ifdef CONFIG_SMP
153s32 vm_committed_as_batch = 32;
154
155static void __meminit mm_compute_batch(void)
156{
157 u64 memsized_batch;
158 s32 nr = num_present_cpus();
159 s32 batch = max_t(s32, nr*2, 32);
160
161 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
162 memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
163
164 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
165}
166
167static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
168 unsigned long action, void *arg)
169{
170 switch (action) {
171 case MEM_ONLINE:
172 case MEM_OFFLINE:
173 mm_compute_batch();
174 default:
175 break;
176 }
177 return NOTIFY_OK;
178}
179
180static struct notifier_block compute_batch_nb __meminitdata = {
181 .notifier_call = mm_compute_batch_notifier,
182 .priority = IPC_CALLBACK_PRI, /* use lowest priority */
183};
184
185static int __init mm_compute_batch_init(void)
186{
187 mm_compute_batch();
188 register_hotmemory_notifier(&compute_batch_nb);
189
190 return 0;
191}
192
193__initcall(mm_compute_batch_init);
194
195#endif
196
150static int __init mm_sysfs_init(void) 197static int __init mm_sysfs_init(void)
151{ 198{
152 mm_kobj = kobject_create_and_add("mm", kernel_kobj); 199 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
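The mm_compute_batch() code added above sizes the per-CPU batch of the vm_committed_as counter from the machine: at least 32, at least twice the number of present CPUs, and roughly 0.4% of the pages per CPU, capped at INT32_MAX, recomputed on every memory online/offline event via the hotmemory notifier. A self-contained sketch of the same arithmetic, with made-up example inputs:

#include <stdint.h>
#include <stdio.h>

static int32_t compute_batch_demo(uint64_t totalram_pages, int32_t ncpus)
{
        int32_t batch = ncpus * 2 > 32 ? ncpus * 2 : 32;
        uint64_t memsized = (totalram_pages / ncpus) / 256;  /* ~0.4% */

        if (memsized > INT32_MAX)
                memsized = INT32_MAX;

        return (int32_t)memsized > batch ? (int32_t)memsized : batch;
}

int main(void)
{
        /* 4 GiB of 4 KiB pages on an 8-CPU machine -> batch of 512 */
        printf("vm_committed_as_batch = %d\n", compute_batch_demo(1048576, 8));
        return 0;
}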
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e1842fad..fbad7b091090 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
955 if (is_mergeable_vma(vma, file, vm_flags) && 955 if (is_mergeable_vma(vma, file, vm_flags) &&
956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
957 pgoff_t vm_pglen; 957 pgoff_t vm_pglen;
958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 958 vm_pglen = vma_pages(vma);
959 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 959 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
960 return 1; 960 return 1;
961 } 961 }
@@ -1358,18 +1358,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1358 1358
1359 if (!(flags & MAP_ANONYMOUS)) { 1359 if (!(flags & MAP_ANONYMOUS)) {
1360 audit_mmap_fd(fd, flags); 1360 audit_mmap_fd(fd, flags);
1361 if (unlikely(flags & MAP_HUGETLB))
1362 return -EINVAL;
1363 file = fget(fd); 1361 file = fget(fd);
1364 if (!file) 1362 if (!file)
1365 goto out; 1363 goto out;
1366 if (is_file_hugepages(file)) 1364 if (is_file_hugepages(file))
1367 len = ALIGN(len, huge_page_size(hstate_file(file))); 1365 len = ALIGN(len, huge_page_size(hstate_file(file)));
1366 retval = -EINVAL;
1367 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1368 goto out_fput;
1368 } else if (flags & MAP_HUGETLB) { 1369 } else if (flags & MAP_HUGETLB) {
1369 struct user_struct *user = NULL; 1370 struct user_struct *user = NULL;
1370 struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & 1371 struct hstate *hs;
1371 SHM_HUGE_MASK);
1372 1372
1373 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
1373 if (!hs) 1374 if (!hs)
1374 return -EINVAL; 1375 return -EINVAL;
1375 1376
@@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1391 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1392 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1392 1393
1393 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1394 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1395out_fput:
1394 if (file) 1396 if (file)
1395 fput(file); 1397 fput(file);
1396out: 1398out:
@@ -1876,15 +1878,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1876} 1878}
1877#endif 1879#endif
1878 1880
1879void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1880{
1881 /*
1882 * Is this a new hole at the lowest possible address?
1883 */
1884 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1885 mm->free_area_cache = addr;
1886}
1887
1888/* 1881/*
1889 * This mmap-allocator allocates new areas top-down from below the 1882 * This mmap-allocator allocates new areas top-down from below the
1890 * stack's low limit (the base): 1883 * stack's low limit (the base):
@@ -1941,19 +1934,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1941} 1934}
1942#endif 1935#endif
1943 1936
1944void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1945{
1946 /*
1947 * Is this a new hole at the highest possible address?
1948 */
1949 if (addr > mm->free_area_cache)
1950 mm->free_area_cache = addr;
1951
1952 /* dont allow allocations above current base */
1953 if (mm->free_area_cache > mm->mmap_base)
1954 mm->free_area_cache = mm->mmap_base;
1955}
1956
1957unsigned long 1937unsigned long
1958get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1938get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1959 unsigned long pgoff, unsigned long flags) 1939 unsigned long pgoff, unsigned long flags)
@@ -2374,7 +2354,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2374{ 2354{
2375 struct vm_area_struct **insertion_point; 2355 struct vm_area_struct **insertion_point;
2376 struct vm_area_struct *tail_vma = NULL; 2356 struct vm_area_struct *tail_vma = NULL;
2377 unsigned long addr;
2378 2357
2379 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2358 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2380 vma->vm_prev = NULL; 2359 vma->vm_prev = NULL;
@@ -2391,11 +2370,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2391 } else 2370 } else
2392 mm->highest_vm_end = prev ? prev->vm_end : 0; 2371 mm->highest_vm_end = prev ? prev->vm_end : 0;
2393 tail_vma->vm_next = NULL; 2372 tail_vma->vm_next = NULL;
2394 if (mm->unmap_area == arch_unmap_area)
2395 addr = prev ? prev->vm_end : mm->mmap_base;
2396 else
2397 addr = vma ? vma->vm_start : mm->mmap_base;
2398 mm->unmap_area(mm, addr);
2399 mm->mmap_cache = NULL; /* Kill the cache. */ 2373 mm->mmap_cache = NULL; /* Kill the cache. */
2400} 2374}
2401 2375
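The mmap.c hunks above drop the old free_area_cache hooks and rework the MAP_HUGETLB handling in mmap_pgoff(): a file-backed mapping with MAP_HUGETLB is no longer rejected outright, only when the descriptor does not refer to a hugetlbfs file, and the fd is released through the new out_fput label. A userspace sketch of just that decision; file_is_hugetlb() and the flag values are illustrative stand-ins, not is_file_hugepages() or the real MAP_* bits.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_MAP_ANONYMOUS 0x1
#define DEMO_MAP_HUGETLB   0x2

/* pretend fd 42 happens to be a hugetlbfs file */
static bool file_is_hugetlb(int fd) { return fd == 42; }

static long check_hugetlb_flags(unsigned long flags, int fd)
{
        if (!(flags & DEMO_MAP_ANONYMOUS) &&
            (flags & DEMO_MAP_HUGETLB) && !file_is_hugetlb(fd))
                return -EINVAL;  /* MAP_HUGETLB needs a hugetlbfs file */
        return 0;
}

int main(void)
{
        printf("%ld\n", check_hugetlb_flags(DEMO_MAP_HUGETLB, 7));   /* -22 */
        printf("%ld\n", check_hugetlb_flags(DEMO_MAP_HUGETLB, 42));  /* 0   */
        return 0;
}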
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 6725ff183374..93e6089cb456 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -315,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
315 315
316 /* 316 /*
317 * Wait for any running method to finish, of course including 317 * Wait for any running method to finish, of course including
318 * ->release if it was run by mmu_notifier_relase instead of us. 318 * ->release if it was run by mmu_notifier_release instead of us.
319 */ 319 */
320 synchronize_srcu(&srcu); 320 synchronize_srcu(&srcu);
321 321
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..457d34ef3bf2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
126 continue; 126 continue;
127 pte = ptep_get_and_clear(mm, old_addr, old_pte); 127 pte = ptep_get_and_clear(mm, old_addr, old_pte);
128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
129 set_pte_at(mm, new_addr, new_pte, pte); 129 set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
130 } 130 }
131 131
132 arch_leave_lazy_mmu_mode(); 132 arch_leave_lazy_mmu_mode();
@@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
456 unsigned long charged = 0; 456 unsigned long charged = 0;
457 bool locked = false; 457 bool locked = false;
458 458
459 down_write(&current->mm->mmap_sem);
460
461 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 459 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
462 goto out; 460 return ret;
461
462 if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
463 return ret;
463 464
464 if (addr & ~PAGE_MASK) 465 if (addr & ~PAGE_MASK)
465 goto out; 466 return ret;
466 467
467 old_len = PAGE_ALIGN(old_len); 468 old_len = PAGE_ALIGN(old_len);
468 new_len = PAGE_ALIGN(new_len); 469 new_len = PAGE_ALIGN(new_len);
@@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
473 * a zero new-len is nonsensical. 474 * a zero new-len is nonsensical.
474 */ 475 */
475 if (!new_len) 476 if (!new_len)
476 goto out; 477 return ret;
478
479 down_write(&current->mm->mmap_sem);
477 480
478 if (flags & MREMAP_FIXED) { 481 if (flags & MREMAP_FIXED) {
479 if (flags & MREMAP_MAYMOVE) 482 ret = mremap_to(addr, old_len, new_addr, new_len,
480 ret = mremap_to(addr, old_len, new_addr, new_len, 483 &locked);
481 &locked);
482 goto out; 484 goto out;
483 } 485 }
484 486
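The mremap.c hunk above hoists the cheap argument checks in sys_mremap() ahead of down_write(&current->mm->mmap_sem), so bad flags, an unaligned address or a zero length return before the lock is ever taken, and MREMAP_FIXED without MREMAP_MAYMOVE is now rejected explicitly. A small userspace model of that ordering; the mutex stands in for mmap_sem, and the page mask and return values are illustrative.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

#define DEMO_PAGE_MASK (~0xFFFUL)

static pthread_mutex_t mmap_sem = PTHREAD_MUTEX_INITIALIZER;

static long do_mremap_demo(unsigned long addr, unsigned long new_len)
{
        /* validate first: no lock held on the early-error paths */
        if (addr & ~DEMO_PAGE_MASK)
                return -EINVAL;
        if (!new_len)
                return -EINVAL;

        pthread_mutex_lock(&mmap_sem);
        /* ... the actual remap work happens here in the kernel ... */
        pthread_mutex_unlock(&mmap_sem);
        return 0;
}

int main(void)
{
        printf("%ld\n", do_mremap_demo(0x1001, 8192)); /* -22, lock untouched */
        printf("%ld\n", do_mremap_demo(0x1000, 8192)); /* 0 */
        return 0;
}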
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2fc73b..61107cf55bb3 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) 140static int reset_managed_pages_done __initdata;
141
142static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
141{ 143{
142 struct zone *z; 144 struct zone *z;
143 145
144 /* 146 if (reset_managed_pages_done)
145 * In free_area_init_core(), highmem zone's managed_pages is set to 147 return;
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 148 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z)) 149 z->managed_pages = 0;
153 z->managed_pages = 0; 150}
151
152void __init reset_all_zones_managed_pages(void)
153{
154 struct pglist_data *pgdat;
155
156 for_each_online_pgdat(pgdat)
157 reset_node_managed_pages(pgdat);
158 reset_managed_pages_done = 1;
154} 159}
155 160
156/** 161/**
@@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
160 */ 165 */
161unsigned long __init free_all_bootmem(void) 166unsigned long __init free_all_bootmem(void)
162{ 167{
163 struct pglist_data *pgdat; 168 unsigned long pages;
164 169
165 for_each_online_pgdat(pgdat) 170 reset_all_zones_managed_pages();
166 reset_node_lowmem_managed_pages(pgdat);
167 171
168 /* 172 /*
169 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 173 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
170 * because in some case like Node0 doesn't have RAM installed 174 * because in some case like Node0 doesn't have RAM installed
171 * low ram will be on Node1 175 * low ram will be on Node1
172 */ 176 */
173 return free_low_memory_core_early(); 177 pages = free_low_memory_core_early();
178 totalram_pages += pages;
179
180 return pages;
174} 181}
175 182
176/** 183/**
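The nobootmem.c hunk above splits the managed-page reset into reset_all_zones_managed_pages(), guards it so the zeroing happens only once, and makes free_all_bootmem() add the pages it frees to totalram_pages. A toy model of the run-once guard (simplified: in the kernel the flag is checked per node and set after the whole pgdat walk):

#include <stdio.h>

static int reset_done;   /* models reset_managed_pages_done */

static void reset_all_zones_managed_pages_demo(void)
{
        if (reset_done)
                return;                 /* later calls are no-ops */
        puts("zeroing managed_pages in every zone");
        reset_done = 1;
}

int main(void)
{
        reset_all_zones_managed_pages_demo();  /* does the work */
        reset_all_zones_managed_pages_demo();  /* skipped */
        return 0;
}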
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884dcd6e7..ecd1f158548e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -56,7 +56,6 @@
56void *high_memory; 56void *high_memory;
57struct page *mem_map; 57struct page *mem_map;
58unsigned long max_mapnr; 58unsigned long max_mapnr;
59unsigned long num_physpages;
60unsigned long highest_memmap_pfn; 59unsigned long highest_memmap_pfn;
61struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
62int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void)
85EXPORT_SYMBOL_GPL(vm_memory_committed); 84EXPORT_SYMBOL_GPL(vm_memory_committed);
86 85
87EXPORT_SYMBOL(mem_map); 86EXPORT_SYMBOL(mem_map);
88EXPORT_SYMBOL(num_physpages);
89 87
90/* list of mapped, potentially shareable regions */ 88/* list of mapped, potentially shareable regions */
91static struct kmem_cache *vm_region_jar; 89static struct kmem_cache *vm_region_jar;
@@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
282 280
283long vread(char *buf, char *addr, unsigned long count) 281long vread(char *buf, char *addr, unsigned long count)
284{ 282{
283 /* Don't allow overflow */
284 if ((unsigned long) buf + count < count)
285 count = -(unsigned long) buf;
286
285 memcpy(buf, addr, count); 287 memcpy(buf, addr, count);
286 return count; 288 return count;
287} 289}
@@ -1869,10 +1871,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1869 return -ENOMEM; 1871 return -ENOMEM;
1870} 1872}
1871 1873
1872void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1873{
1874}
1875
1876void unmap_mapping_range(struct address_space *mapping, 1874void unmap_mapping_range(struct address_space *mapping,
1877 loff_t const holebegin, loff_t const holelen, 1875 loff_t const holebegin, loff_t const holelen,
1878 int even_cows) 1876 int even_cows)
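The nommu vread() hunk above clamps the copy length against address-space wraparound: if buf + count overflows, count is reduced to -(unsigned long)buf so the copy ends exactly at the top of the address space. The arithmetic in isolation, with example inputs:

#include <stdio.h>

static unsigned long clamp_count(unsigned long buf, unsigned long count)
{
        if (buf + count < count)   /* unsigned addition wrapped around */
                count = -buf;      /* bytes left before the wrap point */
        return count;
}

int main(void)
{
        /* only 101 bytes remain between buf and the top of the address space */
        unsigned long buf = ~0UL - 100;

        printf("clamped count = %lu\n", clamp_count(buf, 4096)); /* 101 */
        return 0;
}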
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..b100255dedda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,10 +61,14 @@
61#include <linux/hugetlb.h> 61#include <linux/hugetlb.h>
62#include <linux/sched/rt.h> 62#include <linux/sched/rt.h>
63 63
64#include <asm/sections.h>
64#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
65#include <asm/div64.h> 66#include <asm/div64.h>
66#include "internal.h" 67#include "internal.h"
67 68
69/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
70static DEFINE_MUTEX(pcp_batch_high_lock);
71
68#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 72#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
69DEFINE_PER_CPU(int, numa_node); 73DEFINE_PER_CPU(int, numa_node);
70EXPORT_PER_CPU_SYMBOL(numa_node); 74EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
100}; 104};
101EXPORT_SYMBOL(node_states); 105EXPORT_SYMBOL(node_states);
102 106
107/* Protect totalram_pages and zone->managed_pages */
108static DEFINE_SPINLOCK(managed_page_count_lock);
109
103unsigned long totalram_pages __read_mostly; 110unsigned long totalram_pages __read_mostly;
104unsigned long totalreserve_pages __read_mostly; 111unsigned long totalreserve_pages __read_mostly;
105/* 112/*
@@ -197,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
197}; 204};
198 205
199int min_free_kbytes = 1024; 206int min_free_kbytes = 1024;
207int user_min_free_kbytes;
200 208
201static unsigned long __meminitdata nr_kernel_pages; 209static unsigned long __meminitdata nr_kernel_pages;
202static unsigned long __meminitdata nr_all_pages; 210static unsigned long __meminitdata nr_all_pages;
@@ -739,14 +747,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
739 local_irq_restore(flags); 747 local_irq_restore(flags);
740} 748}
741 749
742/* 750void __init __free_pages_bootmem(struct page *page, unsigned int order)
743 * Read access to zone->managed_pages is safe because it's unsigned long,
744 * but we still need to serialize writers. Currently all callers of
745 * __free_pages_bootmem() except put_page_bootmem() should only be used
746 * at boot time. So for shorter boot time, we shift the burden to
747 * put_page_bootmem() to serialize writers.
748 */
749void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
750{ 751{
751 unsigned int nr_pages = 1 << order; 752 unsigned int nr_pages = 1 << order;
752 unsigned int loop; 753 unsigned int loop;
@@ -781,11 +782,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
781 set_page_refcounted(page); 782 set_page_refcounted(page);
782 set_pageblock_migratetype(page, MIGRATE_CMA); 783 set_pageblock_migratetype(page, MIGRATE_CMA);
783 __free_pages(page, pageblock_order); 784 __free_pages(page, pageblock_order);
784 totalram_pages += pageblock_nr_pages; 785 adjust_managed_page_count(page, pageblock_nr_pages);
785#ifdef CONFIG_HIGHMEM
786 if (PageHighMem(page))
787 totalhigh_pages += pageblock_nr_pages;
788#endif
789} 786}
790#endif 787#endif
791 788
@@ -1050,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1050 * MIGRATE_CMA areas. 1047 * MIGRATE_CMA areas.
1051 */ 1048 */
1052 if (!is_migrate_cma(migratetype) && 1049 if (!is_migrate_cma(migratetype) &&
1053 (unlikely(current_order >= pageblock_order / 2) || 1050 (current_order >= pageblock_order / 2 ||
1054 start_migratetype == MIGRATE_RECLAIMABLE || 1051 start_migratetype == MIGRATE_RECLAIMABLE ||
1055 page_group_by_mobility_disabled)) { 1052 page_group_by_mobility_disabled)) {
1056 int pages; 1053 int pages;
@@ -1179,10 +1176,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1179{ 1176{
1180 unsigned long flags; 1177 unsigned long flags;
1181 int to_drain; 1178 int to_drain;
1179 unsigned long batch;
1182 1180
1183 local_irq_save(flags); 1181 local_irq_save(flags);
1184 if (pcp->count >= pcp->batch) 1182 batch = ACCESS_ONCE(pcp->batch);
1185 to_drain = pcp->batch; 1183 if (pcp->count >= batch)
1184 to_drain = batch;
1186 else 1185 else
1187 to_drain = pcp->count; 1186 to_drain = pcp->count;
1188 if (to_drain > 0) { 1187 if (to_drain > 0) {
@@ -1350,8 +1349,9 @@ void free_hot_cold_page(struct page *page, int cold)
1350 list_add(&page->lru, &pcp->lists[migratetype]); 1349 list_add(&page->lru, &pcp->lists[migratetype]);
1351 pcp->count++; 1350 pcp->count++;
1352 if (pcp->count >= pcp->high) { 1351 if (pcp->count >= pcp->high) {
1353 free_pcppages_bulk(zone, pcp->batch, pcp); 1352 unsigned long batch = ACCESS_ONCE(pcp->batch);
1354 pcp->count -= pcp->batch; 1353 free_pcppages_bulk(zone, batch, pcp);
1354 pcp->count -= batch;
1355 } 1355 }
1356 1356
1357out: 1357out:
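The two hunks above stop re-reading pcp->batch inside the free paths: because pageset_update() may change it concurrently, the value is loaded once through ACCESS_ONCE() into a local and that snapshot is used for both the comparison and the bulk free. A userspace sketch of the idiom, with a volatile cast standing in for ACCESS_ONCE() and illustrative names:

#include <stdio.h>

#define READ_ONCE_UL(x) (*(volatile unsigned long *)&(x))

static unsigned long pcp_batch = 31;   /* may be rewritten by another thread */

static unsigned long pages_to_drain(unsigned long count)
{
        unsigned long batch = READ_ONCE_UL(pcp_batch);  /* read exactly once */

        return count >= batch ? batch : count;          /* reuse the snapshot */
}

int main(void)
{
        printf("draining %lu pages\n", pages_to_drain(100));
        return 0;
}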
@@ -2839,7 +2839,7 @@ EXPORT_SYMBOL(free_pages_exact);
2839 * nr_free_zone_pages() counts the number of counts pages which are beyond the 2839 * nr_free_zone_pages() counts the number of counts pages which are beyond the
2840 * high watermark within all zones at or below a given zone index. For each 2840 * high watermark within all zones at or below a given zone index. For each
2841 * zone, the number of pages is calculated as: 2841 * zone, the number of pages is calculated as:
2842 * present_pages - high_pages 2842 * managed_pages - high_pages
2843 */ 2843 */
2844static unsigned long nr_free_zone_pages(int offset) 2844static unsigned long nr_free_zone_pages(int offset)
2845{ 2845{
@@ -2906,9 +2906,13 @@ EXPORT_SYMBOL(si_meminfo);
2906#ifdef CONFIG_NUMA 2906#ifdef CONFIG_NUMA
2907void si_meminfo_node(struct sysinfo *val, int nid) 2907void si_meminfo_node(struct sysinfo *val, int nid)
2908{ 2908{
2909 int zone_type; /* needs to be signed */
2910 unsigned long managed_pages = 0;
2909 pg_data_t *pgdat = NODE_DATA(nid); 2911 pg_data_t *pgdat = NODE_DATA(nid);
2910 2912
2911 val->totalram = pgdat->node_present_pages; 2913 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
2914 managed_pages += pgdat->node_zones[zone_type].managed_pages;
2915 val->totalram = managed_pages;
2912 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2916 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2913#ifdef CONFIG_HIGHMEM 2917#ifdef CONFIG_HIGHMEM
2914 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 2918 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3150,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3150 * Add all populated zones of a node to the zonelist. 3154 * Add all populated zones of a node to the zonelist.
3151 */ 3155 */
3152static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3156static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3153 int nr_zones, enum zone_type zone_type) 3157 int nr_zones)
3154{ 3158{
3155 struct zone *zone; 3159 struct zone *zone;
3156 3160 enum zone_type zone_type = MAX_NR_ZONES;
3157 BUG_ON(zone_type >= MAX_NR_ZONES);
3158 zone_type++;
3159 3161
3160 do { 3162 do {
3161 zone_type--; 3163 zone_type--;
@@ -3165,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3165 &zonelist->_zonerefs[nr_zones++]); 3167 &zonelist->_zonerefs[nr_zones++]);
3166 check_highest_zone(zone_type); 3168 check_highest_zone(zone_type);
3167 } 3169 }
3168
3169 } while (zone_type); 3170 } while (zone_type);
3171
3170 return nr_zones; 3172 return nr_zones;
3171} 3173}
3172 3174
@@ -3250,18 +3252,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3250 static DEFINE_MUTEX(zl_order_mutex); 3252 static DEFINE_MUTEX(zl_order_mutex);
3251 3253
3252 mutex_lock(&zl_order_mutex); 3254 mutex_lock(&zl_order_mutex);
3253 if (write) 3255 if (write) {
3254 strcpy(saved_string, (char*)table->data); 3256 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3257 ret = -EINVAL;
3258 goto out;
3259 }
3260 strcpy(saved_string, (char *)table->data);
3261 }
3255 ret = proc_dostring(table, write, buffer, length, ppos); 3262 ret = proc_dostring(table, write, buffer, length, ppos);
3256 if (ret) 3263 if (ret)
3257 goto out; 3264 goto out;
3258 if (write) { 3265 if (write) {
3259 int oldval = user_zonelist_order; 3266 int oldval = user_zonelist_order;
3260 if (__parse_numa_zonelist_order((char*)table->data)) { 3267
3268 ret = __parse_numa_zonelist_order((char *)table->data);
3269 if (ret) {
3261 /* 3270 /*
3262 * bogus value. restore saved string 3271 * bogus value. restore saved string
3263 */ 3272 */
3264 strncpy((char*)table->data, saved_string, 3273 strncpy((char *)table->data, saved_string,
3265 NUMA_ZONELIST_ORDER_LEN); 3274 NUMA_ZONELIST_ORDER_LEN);
3266 user_zonelist_order = oldval; 3275 user_zonelist_order = oldval;
3267 } else if (oldval != user_zonelist_order) { 3276 } else if (oldval != user_zonelist_order) {
@@ -3353,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3353 zonelist = &pgdat->node_zonelists[0]; 3362 zonelist = &pgdat->node_zonelists[0];
3354 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3363 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3355 ; 3364 ;
3356 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3365 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3357 MAX_NR_ZONES - 1);
3358 zonelist->_zonerefs[j].zone = NULL; 3366 zonelist->_zonerefs[j].zone = NULL;
3359 zonelist->_zonerefs[j].zone_idx = 0; 3367 zonelist->_zonerefs[j].zone_idx = 0;
3360} 3368}
@@ -3368,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
3368 struct zonelist *zonelist; 3376 struct zonelist *zonelist;
3369 3377
3370 zonelist = &pgdat->node_zonelists[1]; 3378 zonelist = &pgdat->node_zonelists[1];
3371 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3379 j = build_zonelists_node(pgdat, zonelist, 0);
3372 zonelist->_zonerefs[j].zone = NULL; 3380 zonelist->_zonerefs[j].zone = NULL;
3373 zonelist->_zonerefs[j].zone_idx = 0; 3381 zonelist->_zonerefs[j].zone_idx = 0;
3374} 3382}
@@ -3425,8 +3433,8 @@ static int default_zonelist_order(void)
3425 z = &NODE_DATA(nid)->node_zones[zone_type]; 3433 z = &NODE_DATA(nid)->node_zones[zone_type];
3426 if (populated_zone(z)) { 3434 if (populated_zone(z)) {
3427 if (zone_type < ZONE_NORMAL) 3435 if (zone_type < ZONE_NORMAL)
3428 low_kmem_size += z->present_pages; 3436 low_kmem_size += z->managed_pages;
3429 total_size += z->present_pages; 3437 total_size += z->managed_pages;
3430 } else if (zone_type == ZONE_NORMAL) { 3438 } else if (zone_type == ZONE_NORMAL) {
3431 /* 3439 /*
3432 * If any node has only lowmem, then node order 3440 * If any node has only lowmem, then node order
@@ -3576,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat)
3576 local_node = pgdat->node_id; 3584 local_node = pgdat->node_id;
3577 3585
3578 zonelist = &pgdat->node_zonelists[0]; 3586 zonelist = &pgdat->node_zonelists[0];
3579 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3587 j = build_zonelists_node(pgdat, zonelist, 0);
3580 3588
3581 /* 3589 /*
3582 * Now we build the zonelist so that it contains the zones 3590 * Now we build the zonelist so that it contains the zones
@@ -3589,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat)
3589 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3597 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3590 if (!node_online(node)) 3598 if (!node_online(node))
3591 continue; 3599 continue;
3592 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3600 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3593 MAX_NR_ZONES - 1);
3594 } 3601 }
3595 for (node = 0; node < local_node; node++) { 3602 for (node = 0; node < local_node; node++) {
3596 if (!node_online(node)) 3603 if (!node_online(node))
3597 continue; 3604 continue;
3598 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3605 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3599 MAX_NR_ZONES - 1);
3600 } 3606 }
3601 3607
3602 zonelist->_zonerefs[j].zone = NULL; 3608 zonelist->_zonerefs[j].zone = NULL;
@@ -3705,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3705 mminit_verify_zonelist(); 3711 mminit_verify_zonelist();
3706 cpuset_init_current_mems_allowed(); 3712 cpuset_init_current_mems_allowed();
3707 } else { 3713 } else {
3708 /* we have to stop all cpus to guarantee there is no user
3709 of zonelist */
3710#ifdef CONFIG_MEMORY_HOTPLUG 3714#ifdef CONFIG_MEMORY_HOTPLUG
3711 if (zone) 3715 if (zone)
3712 setup_zone_pageset(zone); 3716 setup_zone_pageset(zone);
3713#endif 3717#endif
3718 /* we have to stop all cpus to guarantee there is no user
3719 of zonelist */
3714 stop_machine(__build_all_zonelists, pgdat, NULL); 3720 stop_machine(__build_all_zonelists, pgdat, NULL);
3715 /* cpuset refresh routine should be here */ 3721 /* cpuset refresh routine should be here */
3716 } 3722 }
@@ -4032,7 +4038,40 @@ static int __meminit zone_batchsize(struct zone *zone)
4032#endif 4038#endif
4033} 4039}
4034 4040
4035static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4041/*
4042 * pcp->high and pcp->batch values are related and dependent on one another:
4043 * ->batch must never be higher then ->high.
4044 * The following function updates them in a safe manner without read side
4045 * locking.
4046 *
4047 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4048 * those fields changing asynchronously (acording the the above rule).
4049 *
4050 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4051 * outside of boot time (or some other assurance that no concurrent updaters
4052 * exist).
4053 */
4054static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4055 unsigned long batch)
4056{
4057 /* start with a fail safe value for batch */
4058 pcp->batch = 1;
4059 smp_wmb();
4060
4061 /* Update high, then batch, in order */
4062 pcp->high = high;
4063 smp_wmb();
4064
4065 pcp->batch = batch;
4066}
4067
4068/* a companion to pageset_set_high() */
4069static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4070{
4071 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4072}
4073
4074static void pageset_init(struct per_cpu_pageset *p)
4036{ 4075{
4037 struct per_cpu_pages *pcp; 4076 struct per_cpu_pages *pcp;
4038 int migratetype; 4077 int migratetype;
@@ -4041,45 +4080,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4041 4080
4042 pcp = &p->pcp; 4081 pcp = &p->pcp;
4043 pcp->count = 0; 4082 pcp->count = 0;
4044 pcp->high = 6 * batch;
4045 pcp->batch = max(1UL, 1 * batch);
4046 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4083 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4047 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4084 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4048} 4085}
4049 4086
4087static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4088{
4089 pageset_init(p);
4090 pageset_set_batch(p, batch);
4091}
4092
4050/* 4093/*
4051 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 4094 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4052 * to the value high for the pageset p. 4095 * to the value high for the pageset p.
4053 */ 4096 */
4054 4097static void pageset_set_high(struct per_cpu_pageset *p,
4055static void setup_pagelist_highmark(struct per_cpu_pageset *p,
4056 unsigned long high) 4098 unsigned long high)
4057{ 4099{
4058 struct per_cpu_pages *pcp; 4100 unsigned long batch = max(1UL, high / 4);
4101 if ((high / 4) > (PAGE_SHIFT * 8))
4102 batch = PAGE_SHIFT * 8;
4059 4103
4060 pcp = &p->pcp; 4104 pageset_update(&p->pcp, high, batch);
4061 pcp->high = high;
4062 pcp->batch = max(1UL, high/4);
4063 if ((high/4) > (PAGE_SHIFT * 8))
4064 pcp->batch = PAGE_SHIFT * 8;
4065} 4105}
4066 4106
4067static void __meminit setup_zone_pageset(struct zone *zone) 4107static void __meminit pageset_set_high_and_batch(struct zone *zone,
4108 struct per_cpu_pageset *pcp)
4068{ 4109{
4069 int cpu; 4110 if (percpu_pagelist_fraction)
4070 4111 pageset_set_high(pcp,
4071 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4112 (zone->managed_pages /
4113 percpu_pagelist_fraction));
4114 else
4115 pageset_set_batch(pcp, zone_batchsize(zone));
4116}
4072 4117
4073 for_each_possible_cpu(cpu) { 4118static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4074 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4119{
4120 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4075 4121
4076 setup_pageset(pcp, zone_batchsize(zone)); 4122 pageset_init(pcp);
4123 pageset_set_high_and_batch(zone, pcp);
4124}
4077 4125
4078 if (percpu_pagelist_fraction) 4126static void __meminit setup_zone_pageset(struct zone *zone)
4079 setup_pagelist_highmark(pcp, 4127{
4080 (zone->managed_pages / 4128 int cpu;
4081 percpu_pagelist_fraction)); 4129 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4082 } 4130 for_each_possible_cpu(cpu)
4131 zone_pageset_init(zone, cpu);
4083} 4132}
4084 4133
4085/* 4134/*
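The pageset_update() helper introduced above changes ->high and ->batch while other CPUs keep using them: ->batch is first dropped to the fail-safe value 1, then the new high is published, then the new batch, with smp_wmb() between the stores so a reader can never observe a batch larger than the current high. A rough userspace analogue using C11 fences; struct pcp and the atomics here are illustrative, not the kernel's types or barriers.

#include <stdatomic.h>
#include <stdio.h>

struct pcp {
        _Atomic unsigned long high;
        _Atomic unsigned long batch;
};

static void pageset_update_demo(struct pcp *p, unsigned long high,
                                unsigned long batch)
{
        /* fail-safe batch so batch <= high holds throughout the update */
        atomic_store_explicit(&p->batch, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);          /* smp_wmb() */

        atomic_store_explicit(&p->high, high, memory_order_relaxed);
        atomic_thread_fence(memory_order_release);          /* smp_wmb() */

        atomic_store_explicit(&p->batch, batch, memory_order_relaxed);
}

int main(void)
{
        struct pcp p = { 6, 1 };

        pageset_update_demo(&p, 6 * 31, 31);
        printf("high=%lu batch=%lu\n",
               (unsigned long)atomic_load(&p.high),
               (unsigned long)atomic_load(&p.batch));
        return 0;
}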
@@ -4368,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
4368 */ 4417 */
4369static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4418static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4370 unsigned long zone_type, 4419 unsigned long zone_type,
4420 unsigned long node_start_pfn,
4421 unsigned long node_end_pfn,
4371 unsigned long *ignored) 4422 unsigned long *ignored)
4372{ 4423{
4373 unsigned long node_start_pfn, node_end_pfn;
4374 unsigned long zone_start_pfn, zone_end_pfn; 4424 unsigned long zone_start_pfn, zone_end_pfn;
4375 4425
4376 /* Get the start and end of the node and zone */ 4426 /* Get the start and end of the zone */
4377 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4378 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4427 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4379 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4428 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4380 adjust_zone_range_for_zone_movable(nid, zone_type, 4429 adjust_zone_range_for_zone_movable(nid, zone_type,
@@ -4429,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4429/* Return the number of page frames in holes in a zone on a node */ 4478/* Return the number of page frames in holes in a zone on a node */
4430static unsigned long __meminit zone_absent_pages_in_node(int nid, 4479static unsigned long __meminit zone_absent_pages_in_node(int nid,
4431 unsigned long zone_type, 4480 unsigned long zone_type,
4481 unsigned long node_start_pfn,
4482 unsigned long node_end_pfn,
4432 unsigned long *ignored) 4483 unsigned long *ignored)
4433{ 4484{
4434 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4485 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4435 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4486 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4436 unsigned long node_start_pfn, node_end_pfn;
4437 unsigned long zone_start_pfn, zone_end_pfn; 4487 unsigned long zone_start_pfn, zone_end_pfn;
4438 4488
4439 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4440 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4489 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4441 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4490 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4442 4491
@@ -4449,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4449#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4498#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4450static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4499static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4451 unsigned long zone_type, 4500 unsigned long zone_type,
4501 unsigned long node_start_pfn,
4502 unsigned long node_end_pfn,
4452 unsigned long *zones_size) 4503 unsigned long *zones_size)
4453{ 4504{
4454 return zones_size[zone_type]; 4505 return zones_size[zone_type];
@@ -4456,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4456 4507
4457static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4508static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4458 unsigned long zone_type, 4509 unsigned long zone_type,
4510 unsigned long node_start_pfn,
4511 unsigned long node_end_pfn,
4459 unsigned long *zholes_size) 4512 unsigned long *zholes_size)
4460{ 4513{
4461 if (!zholes_size) 4514 if (!zholes_size)
@@ -4467,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4467#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4520#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4468 4521
4469static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4522static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4470 unsigned long *zones_size, unsigned long *zholes_size) 4523 unsigned long node_start_pfn,
4524 unsigned long node_end_pfn,
4525 unsigned long *zones_size,
4526 unsigned long *zholes_size)
4471{ 4527{
4472 unsigned long realtotalpages, totalpages = 0; 4528 unsigned long realtotalpages, totalpages = 0;
4473 enum zone_type i; 4529 enum zone_type i;
4474 4530
4475 for (i = 0; i < MAX_NR_ZONES; i++) 4531 for (i = 0; i < MAX_NR_ZONES; i++)
4476 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4532 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4477 zones_size); 4533 node_start_pfn,
4534 node_end_pfn,
4535 zones_size);
4478 pgdat->node_spanned_pages = totalpages; 4536 pgdat->node_spanned_pages = totalpages;
4479 4537
4480 realtotalpages = totalpages; 4538 realtotalpages = totalpages;
4481 for (i = 0; i < MAX_NR_ZONES; i++) 4539 for (i = 0; i < MAX_NR_ZONES; i++)
4482 realtotalpages -= 4540 realtotalpages -=
4483 zone_absent_pages_in_node(pgdat->node_id, i, 4541 zone_absent_pages_in_node(pgdat->node_id, i,
4484 zholes_size); 4542 node_start_pfn, node_end_pfn,
4543 zholes_size);
4485 pgdat->node_present_pages = realtotalpages; 4544 pgdat->node_present_pages = realtotalpages;
4486 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4545 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4487 realtotalpages); 4546 realtotalpages);
@@ -4590,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4590 * NOTE: pgdat should get zeroed by caller. 4649 * NOTE: pgdat should get zeroed by caller.
4591 */ 4650 */
4592static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4651static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4652 unsigned long node_start_pfn, unsigned long node_end_pfn,
4593 unsigned long *zones_size, unsigned long *zholes_size) 4653 unsigned long *zones_size, unsigned long *zholes_size)
4594{ 4654{
4595 enum zone_type j; 4655 enum zone_type j;
@@ -4611,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4611 struct zone *zone = pgdat->node_zones + j; 4671 struct zone *zone = pgdat->node_zones + j;
4612 unsigned long size, realsize, freesize, memmap_pages; 4672 unsigned long size, realsize, freesize, memmap_pages;
4613 4673
4614 size = zone_spanned_pages_in_node(nid, j, zones_size); 4674 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4675 node_end_pfn, zones_size);
4615 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4676 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4677 node_start_pfn,
4678 node_end_pfn,
4616 zholes_size); 4679 zholes_size);
4617 4680
4618 /* 4681 /*
@@ -4726,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4726 unsigned long node_start_pfn, unsigned long *zholes_size) 4789 unsigned long node_start_pfn, unsigned long *zholes_size)
4727{ 4790{
4728 pg_data_t *pgdat = NODE_DATA(nid); 4791 pg_data_t *pgdat = NODE_DATA(nid);
4792 unsigned long start_pfn = 0;
4793 unsigned long end_pfn = 0;
4729 4794
4730 /* pg_data_t should be reset to zero when it's allocated */ 4795 /* pg_data_t should be reset to zero when it's allocated */
4731 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4796 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
@@ -4733,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4733 pgdat->node_id = nid; 4798 pgdat->node_id = nid;
4734 pgdat->node_start_pfn = node_start_pfn; 4799 pgdat->node_start_pfn = node_start_pfn;
4735 init_zone_allows_reclaim(nid); 4800 init_zone_allows_reclaim(nid);
4736 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4801#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4802 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4803#endif
4804 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4805 zones_size, zholes_size);
4737 4806
4738 alloc_node_mem_map(pgdat); 4807 alloc_node_mem_map(pgdat);
4739#ifdef CONFIG_FLAT_NODE_MEM_MAP 4808#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -4742,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4742 (unsigned long)pgdat->node_mem_map); 4811 (unsigned long)pgdat->node_mem_map);
4743#endif 4812#endif
4744 4813
4745 free_area_init_core(pgdat, zones_size, zholes_size); 4814 free_area_init_core(pgdat, start_pfn, end_pfn,
4815 zones_size, zholes_size);
4746} 4816}
4747 4817
4748#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4818#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5150,35 +5220,101 @@ early_param("movablecore", cmdline_parse_movablecore);
5150 5220
5151#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5221#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5152 5222
5153unsigned long free_reserved_area(unsigned long start, unsigned long end, 5223void adjust_managed_page_count(struct page *page, long count)
5154 int poison, char *s)
5155{ 5224{
5156 unsigned long pages, pos; 5225 spin_lock(&managed_page_count_lock);
5226 page_zone(page)->managed_pages += count;
5227 totalram_pages += count;
5228#ifdef CONFIG_HIGHMEM
5229 if (PageHighMem(page))
5230 totalhigh_pages += count;
5231#endif
5232 spin_unlock(&managed_page_count_lock);
5233}
5234EXPORT_SYMBOL(adjust_managed_page_count);
5157 5235
5158 pos = start = PAGE_ALIGN(start); 5236unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5159 end &= PAGE_MASK; 5237{
5160 for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { 5238 void *pos;
5161 if (poison) 5239 unsigned long pages = 0;
5162 memset((void *)pos, poison, PAGE_SIZE); 5240
5163 free_reserved_page(virt_to_page((void *)pos)); 5241 start = (void *)PAGE_ALIGN((unsigned long)start);
5242 end = (void *)((unsigned long)end & PAGE_MASK);
5243 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5244 if ((unsigned int)poison <= 0xFF)
5245 memset(pos, poison, PAGE_SIZE);
5246 free_reserved_page(virt_to_page(pos));
5164 } 5247 }
5165 5248
5166 if (pages && s) 5249 if (pages && s)
5167 pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", 5250 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5168 s, pages << (PAGE_SHIFT - 10), start, end); 5251 s, pages << (PAGE_SHIFT - 10), start, end);
5169 5252
5170 return pages; 5253 return pages;
5171} 5254}
5255EXPORT_SYMBOL(free_reserved_area);
5172 5256
5173#ifdef CONFIG_HIGHMEM 5257#ifdef CONFIG_HIGHMEM
5174void free_highmem_page(struct page *page) 5258void free_highmem_page(struct page *page)
5175{ 5259{
5176 __free_reserved_page(page); 5260 __free_reserved_page(page);
5177 totalram_pages++; 5261 totalram_pages++;
5262 page_zone(page)->managed_pages++;
5178 totalhigh_pages++; 5263 totalhigh_pages++;
5179} 5264}
5180#endif 5265#endif
5181 5266
5267
5268void __init mem_init_print_info(const char *str)
5269{
5270 unsigned long physpages, codesize, datasize, rosize, bss_size;
5271 unsigned long init_code_size, init_data_size;
5272
5273 physpages = get_num_physpages();
5274 codesize = _etext - _stext;
5275 datasize = _edata - _sdata;
5276 rosize = __end_rodata - __start_rodata;
5277 bss_size = __bss_stop - __bss_start;
5278 init_data_size = __init_end - __init_begin;
5279 init_code_size = _einittext - _sinittext;
5280
5281 /*
5282 * Detect special cases and adjust section sizes accordingly:
5283 * 1) .init.* may be embedded into .data sections
5284 * 2) .init.text.* may be out of [__init_begin, __init_end],
5285 * please refer to arch/tile/kernel/vmlinux.lds.S.
5286 * 3) .rodata.* may be embedded into .text or .data sections.
5287 */
5288#define adj_init_size(start, end, size, pos, adj) \
5289 if (start <= pos && pos < end && size > adj) \
5290 size -= adj;
5291
5292 adj_init_size(__init_begin, __init_end, init_data_size,
5293 _sinittext, init_code_size);
5294 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5295 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5296 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5297 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5298
5299#undef adj_init_size
5300
5301 printk("Memory: %luK/%luK available "
5302 "(%luK kernel code, %luK rwdata, %luK rodata, "
5303 "%luK init, %luK bss, %luK reserved"
5304#ifdef CONFIG_HIGHMEM
5305 ", %luK highmem"
5306#endif
5307 "%s%s)\n",
5308 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5309 codesize >> 10, datasize >> 10, rosize >> 10,
5310 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5311 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5312#ifdef CONFIG_HIGHMEM
5313 totalhigh_pages << (PAGE_SHIFT-10),
5314#endif
5315 str ? ", " : "", str ? str : "");
5316}
5317
5182/** 5318/**
5183 * set_dma_reserve - set the specified number of pages reserved in the first zone 5319 * set_dma_reserve - set the specified number of pages reserved in the first zone
5184 * @new_dma_reserve: The number of pages to mark reserved 5320 * @new_dma_reserve: The number of pages to mark reserved
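Among the hunks above, free_reserved_area() now takes void * bounds and only poisons the freed range when the poison value fits in a single byte, so an out-of-range value such as -1 skips the memset; adjust_managed_page_count() centralises the managed_pages/totalram_pages accounting under managed_page_count_lock. A small userspace sketch of the poison test; the page size, buffer and names are illustrative.

#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_SIZE 4096

static unsigned long free_area_demo(unsigned char *start, unsigned char *end,
                                    int poison)
{
        unsigned long pages = 0;
        unsigned char *pos;

        for (pos = start; pos < end; pos += DEMO_PAGE_SIZE, pages++) {
                if ((unsigned int)poison <= 0xFF)  /* fits in one byte? */
                        memset(pos, poison, DEMO_PAGE_SIZE);
        }
        return pages;
}

int main(void)
{
        static unsigned char buf[4 * DEMO_PAGE_SIZE];
        unsigned long n;

        n = free_area_demo(buf, buf + sizeof(buf), 0xCC);  /* poisoned */
        printf("%lu pages, first byte 0x%02x\n", n, buf[0]);

        n = free_area_demo(buf, buf + sizeof(buf), -1);    /* memset skipped */
        printf("%lu pages, first byte 0x%02x\n", n, buf[0]);
        return 0;
}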
@@ -5454,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void)
5454int __meminit init_per_zone_wmark_min(void) 5590int __meminit init_per_zone_wmark_min(void)
5455{ 5591{
5456 unsigned long lowmem_kbytes; 5592 unsigned long lowmem_kbytes;
5593 int new_min_free_kbytes;
5457 5594
5458 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5595 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5459 5596 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5460 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5597
5461 if (min_free_kbytes < 128) 5598 if (new_min_free_kbytes > user_min_free_kbytes) {
5462 min_free_kbytes = 128; 5599 min_free_kbytes = new_min_free_kbytes;
5463 if (min_free_kbytes > 65536) 5600 if (min_free_kbytes < 128)
5464 min_free_kbytes = 65536; 5601 min_free_kbytes = 128;
5602 if (min_free_kbytes > 65536)
5603 min_free_kbytes = 65536;
5604 } else {
5605 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5606 new_min_free_kbytes, user_min_free_kbytes);
5607 }
5465 setup_per_zone_wmarks(); 5608 setup_per_zone_wmarks();
5466 refresh_zone_stat_thresholds(); 5609 refresh_zone_stat_thresholds();
5467 setup_per_zone_lowmem_reserve(); 5610 setup_per_zone_lowmem_reserve();
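The init_per_zone_wmark_min() hunk above makes the automatically computed watermark respect a user-supplied value: the new estimate (int_sqrt(lowmem_kbytes * 16), clamped to 128..65536 kB) is applied only when it exceeds user_min_free_kbytes, which the sysctl handler now records; otherwise a warning is printed and the user's setting stays. The same policy in a standalone sketch; the integer square root helper and the inputs are illustrative.

#include <stdio.h>

static int min_free_kbytes_demo = 1024;
static int user_min_free_kbytes_demo;   /* written by the sysctl handler */

static unsigned long int_sqrt_demo(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

static void recompute_min_free(unsigned long lowmem_kbytes)
{
        int new_val = (int)int_sqrt_demo(lowmem_kbytes * 16);

        if (new_val > user_min_free_kbytes_demo) {
                if (new_val < 128)
                        new_val = 128;
                if (new_val > 65536)
                        new_val = 65536;
                min_free_kbytes_demo = new_val;
        } else {
                printf("keeping user value %d, ignoring %d\n",
                       user_min_free_kbytes_demo, new_val);
        }
}

int main(void)
{
        recompute_min_free(4UL * 1024 * 1024);  /* ~4 GiB of lowmem -> 8192 */
        printf("min_free_kbytes = %d\n", min_free_kbytes_demo);
        return 0;
}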
@@ -5479,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5479 void __user *buffer, size_t *length, loff_t *ppos) 5622 void __user *buffer, size_t *length, loff_t *ppos)
5480{ 5623{
5481 proc_dointvec(table, write, buffer, length, ppos); 5624 proc_dointvec(table, write, buffer, length, ppos);
5482 if (write) 5625 if (write) {
5626 user_min_free_kbytes = min_free_kbytes;
5483 setup_per_zone_wmarks(); 5627 setup_per_zone_wmarks();
5628 }
5484 return 0; 5629 return 0;
5485} 5630}
5486 5631
@@ -5540,7 +5685,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5540 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5685 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
5541 * can have before it gets flushed back to buddy allocator. 5686 * can have before it gets flushed back to buddy allocator.
5542 */ 5687 */
5543
5544int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5688int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5545 void __user *buffer, size_t *length, loff_t *ppos) 5689 void __user *buffer, size_t *length, loff_t *ppos)
5546{ 5690{
@@ -5551,14 +5695,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5551 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5695 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5552 if (!write || (ret < 0)) 5696 if (!write || (ret < 0))
5553 return ret; 5697 return ret;
5698
5699 mutex_lock(&pcp_batch_high_lock);
5554 for_each_populated_zone(zone) { 5700 for_each_populated_zone(zone) {
5555 for_each_possible_cpu(cpu) { 5701 unsigned long high;
5556 unsigned long high; 5702 high = zone->managed_pages / percpu_pagelist_fraction;
5557 high = zone->managed_pages / percpu_pagelist_fraction; 5703 for_each_possible_cpu(cpu)
5558 setup_pagelist_highmark( 5704 pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
5559 per_cpu_ptr(zone->pageset, cpu), high); 5705 high);
5560 }
5561 } 5706 }
5707 mutex_unlock(&pcp_batch_high_lock);
5562 return 0; 5708 return 0;
5563} 5709}
5564 5710
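For reference, the per-cpu pagelist "high" mark computed inside the loop above is simply managed_pages divided by the sysctl fraction; a tiny illustration with a hypothetical zone (plain C, not kernel code):

/* A 1 GiB zone (262144 4K pages) with percpu_pagelist_fraction = 8 yields a
 * high mark of 32768 pages per CPU list. */
static unsigned long pcp_high_for(unsigned long managed_pages, int fraction)
{
	return managed_pages / fraction;
}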
@@ -6047,32 +6193,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
6047#endif 6193#endif
6048 6194
6049#ifdef CONFIG_MEMORY_HOTPLUG 6195#ifdef CONFIG_MEMORY_HOTPLUG
6050static int __meminit __zone_pcp_update(void *data) 6196/*
6051{ 6197 * The zone indicated has a new number of managed_pages; batch sizes and percpu
6052 struct zone *zone = data; 6198 * page high values need to be recalculated.
6053 int cpu; 6199 */
6054 unsigned long batch = zone_batchsize(zone), flags;
6055
6056 for_each_possible_cpu(cpu) {
6057 struct per_cpu_pageset *pset;
6058 struct per_cpu_pages *pcp;
6059
6060 pset = per_cpu_ptr(zone->pageset, cpu);
6061 pcp = &pset->pcp;
6062
6063 local_irq_save(flags);
6064 if (pcp->count > 0)
6065 free_pcppages_bulk(zone, pcp->count, pcp);
6066 drain_zonestat(zone, pset);
6067 setup_pageset(pset, batch);
6068 local_irq_restore(flags);
6069 }
6070 return 0;
6071}
6072
6073void __meminit zone_pcp_update(struct zone *zone) 6200void __meminit zone_pcp_update(struct zone *zone)
6074{ 6201{
6075 stop_machine(__zone_pcp_update, zone, NULL); 6202 unsigned cpu;
6203 mutex_lock(&pcp_batch_high_lock);
6204 for_each_possible_cpu(cpu)
6205 pageset_set_high_and_batch(zone,
6206 per_cpu_ptr(zone->pageset, cpu));
6207 mutex_unlock(&pcp_batch_high_lock);
6076} 6208}
6077#endif 6209#endif
6078 6210
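The rewrite above drops stop_machine() in favour of recomputing each CPU's pageset under pcp_batch_high_lock, the same mutex the percpu_pagelist_fraction sysctl handler now takes. A generic, hedged pthread sketch of that "recompute per-CPU values under one lock" pattern, using stand-in types rather than the kernel's:

#include <pthread.h>

struct pcp { unsigned long high, batch; };

static pthread_mutex_t pcp_lock = PTHREAD_MUTEX_INITIALIZER;

/* Update every CPU's cached values atomically with respect to other writers
 * (e.g. a sysctl handler) that take the same lock. */
static void update_all(struct pcp *sets, int ncpus,
		       unsigned long high, unsigned long batch)
{
	pthread_mutex_lock(&pcp_lock);
	for (int c = 0; c < ncpus; c++) {
		sets[c].high = high;
		sets[c].batch = batch;
	}
	pthread_mutex_unlock(&pcp_lock);
}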
@@ -6142,6 +6274,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6142 list_del(&page->lru); 6274 list_del(&page->lru);
6143 rmv_page_order(page); 6275 rmv_page_order(page);
6144 zone->free_area[order].nr_free--; 6276 zone->free_area[order].nr_free--;
6277#ifdef CONFIG_HIGHMEM
6278 if (PageHighMem(page))
6279 totalhigh_pages -= 1 << order;
6280#endif
6145 for (i = 0; i < (1 << order); i++) 6281 for (i = 0; i < (1 << order); i++)
6146 SetPageReserved((page+i)); 6282 SetPageReserved((page+i));
6147 pfn += (1 << order); 6283 pfn += (1 << order);
diff --git a/mm/page_io.c b/mm/page_io.c
index a8a3ef45fed7..ba05b64e5d8d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h> 22#include <linux/frontswap.h>
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include <linux/blkdev.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25 26
26static struct bio *get_swap_bio(gfp_t gfp_flags, 27static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err)
80 imajor(bio->bi_bdev->bd_inode), 81 imajor(bio->bi_bdev->bd_inode),
81 iminor(bio->bi_bdev->bd_inode), 82 iminor(bio->bi_bdev->bd_inode),
82 (unsigned long long)bio->bi_sector); 83 (unsigned long long)bio->bi_sector);
83 } else { 84 goto out;
84 SetPageUptodate(page);
85 } 85 }
86
87 SetPageUptodate(page);
88
89 /*
90 * There is no guarantee that the page is in swap cache - the software
91 * suspend code (at least) uses end_swap_bio_read() against a non-
92 * swapcache page. So we must check PG_swapcache before proceeding with
93 * this optimization.
94 */
95 if (likely(PageSwapCache(page))) {
96 struct swap_info_struct *sis;
97
98 sis = page_swap_info(page);
99 if (sis->flags & SWP_BLKDEV) {
100 /*
101 * The swap subsystem performs lazy swap slot freeing,
102 * expecting that the page will be swapped out again.
103 * So we can avoid an unnecessary write if the page
104 * isn't redirtied.
105 * This is good for real swap storage because we can
106 * reduce unnecessary I/O and enhance wear-leveling
107 * if an SSD is used as the swap device.
108 * But if an in-memory swap device (e.g. zram) is used,
109 * this causes a duplicated copy between uncompressed
110 * data in VM-owned memory and compressed data in
111 * zram-owned memory. So let's free zram-owned memory
112 * and make the VM-owned decompressed page *dirty*,
113 * so the page should be swapped out somewhere again if
114 * we again wish to reclaim it.
115 */
116 struct gendisk *disk = sis->bdev->bd_disk;
117 if (disk->fops->swap_slot_free_notify) {
118 swp_entry_t entry;
119 unsigned long offset;
120
121 entry.val = page_private(page);
122 offset = swp_offset(entry);
123
124 SetPageDirty(page);
125 disk->fops->swap_slot_free_notify(sis->bdev,
126 offset);
127 }
128 }
129 }
130
131out:
86 unlock_page(page); 132 unlock_page(page);
87 bio_put(bio); 133 bio_put(bio);
88} 134}
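The comment block above boils down to one condition; here is a hedged stand-alone predicate, with stand-in types rather than the kernel's structures, that captures when the early slot free plus redirty is worth doing:

#include <stdbool.h>

struct swap_area {
	bool on_blkdev;			/* swap area flagged SWP_BLKDEV */
	bool has_slot_free_notify;	/* device provides swap_slot_free_notify() */
};

/* Only free the backing slot early (and redirty the page) when the page is
 * still in the swap cache and the backing device, e.g. zram, can reclaim its
 * compressed copy immediately. */
static bool should_free_slot_early(bool page_in_swapcache,
				   const struct swap_area *sis)
{
	return page_in_swapcache && sis->on_blkdev && sis->has_slot_free_notify;
}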
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323fe6c8f..e1a6e4fab016 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
124 124
125#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 125#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
126#ifdef CONFIG_TRANSPARENT_HUGEPAGE 126#ifdef CONFIG_TRANSPARENT_HUGEPAGE
127void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) 127void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
128 pgtable_t pgtable)
128{ 129{
129 assert_spin_locked(&mm->page_table_lock); 130 assert_spin_locked(&mm->page_table_lock);
130 131
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
141#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW 142#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
142#ifdef CONFIG_TRANSPARENT_HUGEPAGE 143#ifdef CONFIG_TRANSPARENT_HUGEPAGE
143/* no "address" argument so destroys page coloring of some arch */ 144/* no "address" argument so destroys page coloring of some arch */
144pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) 145pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
145{ 146{
146 pgtable_t pgtable; 147 pgtable_t pgtable;
147 148
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d6..cd356df4f71a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
720 * mapping is already gone, the unmap path will have 720 * mapping is already gone, the unmap path will have
721 * set PG_referenced or activated the page. 721 * set PG_referenced or activated the page.
722 */ 722 */
723 if (likely(!VM_SequentialReadHint(vma))) 723 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
724 referenced++; 724 referenced++;
725 } 725 }
726 pte_unmap_unlock(pte, ptl); 726 pte_unmap_unlock(pte, ptl);
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page,
1093 else 1093 else
1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1095 __page_set_anon_rmap(page, vma, address, 1); 1095 __page_set_anon_rmap(page, vma, address, 1);
1096 if (!mlocked_vma_newpage(vma, page)) 1096 if (!mlocked_vma_newpage(vma, page)) {
1097 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1097 SetPageActive(page);
1098 else 1098 lru_cache_add(page);
1099 } else
1099 add_page_to_unevictable_list(page); 1100 add_page_to_unevictable_list(page);
1100} 1101}
1101 1102
diff --git a/mm/shmem.c b/mm/shmem.c
index 118dfa4952f4..a87990cf9f94 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1936,6 +1936,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1936 1936
1937 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1937 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1938 if (inode) { 1938 if (inode) {
1939#ifdef CONFIG_TMPFS_POSIX_ACL
1940 error = generic_acl_init(inode, dir);
1941 if (error) {
1942 iput(inode);
1943 return error;
1944 }
1945#endif
1939 error = security_inode_init_security(inode, dir, 1946 error = security_inode_init_security(inode, dir,
1940 &dentry->d_name, 1947 &dentry->d_name,
1941 shmem_initxattrs, NULL); 1948 shmem_initxattrs, NULL);
@@ -1945,15 +1952,8 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1945 return error; 1952 return error;
1946 } 1953 }
1947 } 1954 }
1948#ifdef CONFIG_TMPFS_POSIX_ACL 1955
1949 error = generic_acl_init(inode, dir);
1950 if (error) {
1951 iput(inode);
1952 return error;
1953 }
1954#else
1955 error = 0; 1956 error = 0;
1956#endif
1957 dir->i_size += BOGO_DIRENT_SIZE; 1957 dir->i_size += BOGO_DIRENT_SIZE;
1958 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1958 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1959 d_instantiate(dentry, inode); 1959 d_instantiate(dentry, inode);
diff --git a/mm/slab.c b/mm/slab.c
index 8ccd296c6d9c..35cb0c861508 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -565,7 +565,7 @@ static void init_node_lock_keys(int q)
565 if (slab_state < UP) 565 if (slab_state < UP)
566 return; 566 return;
567 567
568 for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { 568 for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
569 struct kmem_cache_node *n; 569 struct kmem_cache_node *n;
570 struct kmem_cache *cache = kmalloc_caches[i]; 570 struct kmem_cache *cache = kmalloc_caches[i];
571 571
@@ -1180,6 +1180,12 @@ static int init_cache_node_node(int node)
1180 return 0; 1180 return 0;
1181} 1181}
1182 1182
1183static inline int slabs_tofree(struct kmem_cache *cachep,
1184 struct kmem_cache_node *n)
1185{
1186 return (n->free_objects + cachep->num - 1) / cachep->num;
1187}
1188
1183static void __cpuinit cpuup_canceled(long cpu) 1189static void __cpuinit cpuup_canceled(long cpu)
1184{ 1190{
1185 struct kmem_cache *cachep; 1191 struct kmem_cache *cachep;
@@ -1241,7 +1247,7 @@ free_array_cache:
1241 n = cachep->node[node]; 1247 n = cachep->node[node];
1242 if (!n) 1248 if (!n)
1243 continue; 1249 continue;
1244 drain_freelist(cachep, n, n->free_objects); 1250 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1245 } 1251 }
1246} 1252}
1247 1253
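The slabs_tofree() helper introduced above is a ceiling division; a minimal illustration in plain C, without kernel types:

/* How many whole slabs cover n free objects when each slab holds per_slab
 * objects, e.g. 10 free objects at 4 per slab -> drain up to 3 slabs. */
static int slabs_for(int free_objects, int per_slab)
{
	return (free_objects + per_slab - 1) / per_slab;
}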
@@ -1408,7 +1414,7 @@ static int __meminit drain_cache_node_node(int node)
1408 if (!n) 1414 if (!n)
1409 continue; 1415 continue;
1410 1416
1411 drain_freelist(cachep, n, n->free_objects); 1417 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1412 1418
1413 if (!list_empty(&n->slabs_full) || 1419 if (!list_empty(&n->slabs_full) ||
1414 !list_empty(&n->slabs_partial)) { 1420 !list_empty(&n->slabs_partial)) {
@@ -2532,7 +2538,7 @@ static int __cache_shrink(struct kmem_cache *cachep)
2532 if (!n) 2538 if (!n)
2533 continue; 2539 continue;
2534 2540
2535 drain_freelist(cachep, n, n->free_objects); 2541 drain_freelist(cachep, n, slabs_tofree(cachep, n));
2536 2542
2537 ret += !list_empty(&n->slabs_full) || 2543 ret += !list_empty(&n->slabs_full) ||
2538 !list_empty(&n->slabs_partial); 2544 !list_empty(&n->slabs_partial);
@@ -3338,18 +3344,6 @@ done:
3338 return obj; 3344 return obj;
3339} 3345}
3340 3346
3341/**
3342 * kmem_cache_alloc_node - Allocate an object on the specified node
3343 * @cachep: The cache to allocate from.
3344 * @flags: See kmalloc().
3345 * @nodeid: node number of the target node.
3346 * @caller: return address of caller, used for debug information
3347 *
3348 * Identical to kmem_cache_alloc but it will allocate memory on the given
3349 * node, which can improve the performance for cpu bound structures.
3350 *
3351 * Fallback to other node is possible if __GFP_THISNODE is not set.
3352 */
3353static __always_inline void * 3347static __always_inline void *
3354slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3348slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3355 unsigned long caller) 3349 unsigned long caller)
@@ -3643,6 +3637,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
3643#endif 3637#endif
3644 3638
3645#ifdef CONFIG_NUMA 3639#ifdef CONFIG_NUMA
3640/**
3641 * kmem_cache_alloc_node - Allocate an object on the specified node
3642 * @cachep: The cache to allocate from.
3643 * @flags: See kmalloc().
3644 * @nodeid: node number of the target node.
3645 *
3646 * Identical to kmem_cache_alloc but it will allocate memory on the given
3647 * node, which can improve the performance for cpu bound structures.
3648 *
3649 * Fallback to other node is possible if __GFP_THISNODE is not set.
3650 */
3646void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3651void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3647{ 3652{
3648 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3653 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
@@ -4431,20 +4436,10 @@ static int leaks_show(struct seq_file *m, void *p)
4431 return 0; 4436 return 0;
4432} 4437}
4433 4438
4434static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4435{
4436 return seq_list_next(p, &slab_caches, pos);
4437}
4438
4439static void s_stop(struct seq_file *m, void *p)
4440{
4441 mutex_unlock(&slab_mutex);
4442}
4443
4444static const struct seq_operations slabstats_op = { 4439static const struct seq_operations slabstats_op = {
4445 .start = leaks_start, 4440 .start = leaks_start,
4446 .next = s_next, 4441 .next = slab_next,
4447 .stop = s_stop, 4442 .stop = slab_stop,
4448 .show = leaks_show, 4443 .show = leaks_show,
4449}; 4444};
4450 4445
diff --git a/mm/slab.h b/mm/slab.h
index f96b49e4704e..620ceeddbe1a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -271,3 +271,6 @@ struct kmem_cache_node {
271#endif 271#endif
272 272
273}; 273};
274
275void *slab_next(struct seq_file *m, void *p, loff_t *pos);
276void slab_stop(struct seq_file *m, void *p);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 2d414508e9ec..538bade6df7d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -497,6 +497,13 @@ void __init create_kmalloc_caches(unsigned long flags)
497 497
498 498
499#ifdef CONFIG_SLABINFO 499#ifdef CONFIG_SLABINFO
500
501#ifdef CONFIG_SLAB
502#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
503#else
504#define SLABINFO_RIGHTS S_IRUSR
505#endif
506
500void print_slabinfo_header(struct seq_file *m) 507void print_slabinfo_header(struct seq_file *m)
501{ 508{
502 /* 509 /*
@@ -531,12 +538,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
531 return seq_list_start(&slab_caches, *pos); 538 return seq_list_start(&slab_caches, *pos);
532} 539}
533 540
534static void *s_next(struct seq_file *m, void *p, loff_t *pos) 541void *slab_next(struct seq_file *m, void *p, loff_t *pos)
535{ 542{
536 return seq_list_next(p, &slab_caches, pos); 543 return seq_list_next(p, &slab_caches, pos);
537} 544}
538 545
539static void s_stop(struct seq_file *m, void *p) 546void slab_stop(struct seq_file *m, void *p)
540{ 547{
541 mutex_unlock(&slab_mutex); 548 mutex_unlock(&slab_mutex);
542} 549}
@@ -613,8 +620,8 @@ static int s_show(struct seq_file *m, void *p)
613 */ 620 */
614static const struct seq_operations slabinfo_op = { 621static const struct seq_operations slabinfo_op = {
615 .start = s_start, 622 .start = s_start,
616 .next = s_next, 623 .next = slab_next,
617 .stop = s_stop, 624 .stop = slab_stop,
618 .show = s_show, 625 .show = s_show,
619}; 626};
620 627
@@ -633,7 +640,8 @@ static const struct file_operations proc_slabinfo_operations = {
633 640
634static int __init slab_proc_init(void) 641static int __init slab_proc_init(void)
635{ 642{
636 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 643 proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
644 &proc_slabinfo_operations);
637 return 0; 645 return 0;
638} 646}
639module_init(slab_proc_init); 647module_init(slab_proc_init);
diff --git a/mm/slob.c b/mm/slob.c
index eeed4a05a2ef..91bd3f2dd2f0 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -122,7 +122,7 @@ static inline void clear_slob_page_free(struct page *sp)
122} 122}
123 123
124#define SLOB_UNIT sizeof(slob_t) 124#define SLOB_UNIT sizeof(slob_t)
125#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 125#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
126 126
127/* 127/*
128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -554,7 +554,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
554 flags, node); 554 flags, node);
555 } 555 }
556 556
557 if (c->ctor) 557 if (b && c->ctor)
558 c->ctor(b); 558 c->ctor(b);
559 559
560 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); 560 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 57707f01bcfb..3b482c863002 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -123,6 +123,15 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
123#endif 123#endif
124} 124}
125 125
126static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
127{
128#ifdef CONFIG_SLUB_CPU_PARTIAL
129 return !kmem_cache_debug(s);
130#else
131 return false;
132#endif
133}
134
126/* 135/*
127 * Issues still to be resolved: 136 * Issues still to be resolved:
128 * 137 *
@@ -1573,7 +1582,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1573 put_cpu_partial(s, page, 0); 1582 put_cpu_partial(s, page, 0);
1574 stat(s, CPU_PARTIAL_NODE); 1583 stat(s, CPU_PARTIAL_NODE);
1575 } 1584 }
1576 if (kmem_cache_debug(s) || available > s->cpu_partial / 2) 1585 if (!kmem_cache_has_cpu_partial(s)
1586 || available > s->cpu_partial / 2)
1577 break; 1587 break;
1578 1588
1579 } 1589 }
@@ -1884,6 +1894,7 @@ redo:
1884static void unfreeze_partials(struct kmem_cache *s, 1894static void unfreeze_partials(struct kmem_cache *s,
1885 struct kmem_cache_cpu *c) 1895 struct kmem_cache_cpu *c)
1886{ 1896{
1897#ifdef CONFIG_SLUB_CPU_PARTIAL
1887 struct kmem_cache_node *n = NULL, *n2 = NULL; 1898 struct kmem_cache_node *n = NULL, *n2 = NULL;
1888 struct page *page, *discard_page = NULL; 1899 struct page *page, *discard_page = NULL;
1889 1900
@@ -1938,6 +1949,7 @@ static void unfreeze_partials(struct kmem_cache *s,
1938 discard_slab(s, page); 1949 discard_slab(s, page);
1939 stat(s, FREE_SLAB); 1950 stat(s, FREE_SLAB);
1940 } 1951 }
1952#endif
1941} 1953}
1942 1954
1943/* 1955/*
@@ -1951,10 +1963,14 @@ static void unfreeze_partials(struct kmem_cache *s,
1951 */ 1963 */
1952static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1964static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1953{ 1965{
1966#ifdef CONFIG_SLUB_CPU_PARTIAL
1954 struct page *oldpage; 1967 struct page *oldpage;
1955 int pages; 1968 int pages;
1956 int pobjects; 1969 int pobjects;
1957 1970
1971 if (!s->cpu_partial)
1972 return;
1973
1958 do { 1974 do {
1959 pages = 0; 1975 pages = 0;
1960 pobjects = 0; 1976 pobjects = 0;
@@ -1987,6 +2003,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1987 page->next = oldpage; 2003 page->next = oldpage;
1988 2004
1989 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); 2005 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
2006#endif
1990} 2007}
1991 2008
1992static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 2009static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2358,7 +2375,7 @@ redo:
2358 2375
2359 object = c->freelist; 2376 object = c->freelist;
2360 page = c->page; 2377 page = c->page;
2361 if (unlikely(!object || !node_match(page, node))) 2378 if (unlikely(!object || !page || !node_match(page, node)))
2362 object = __slab_alloc(s, gfpflags, node, addr, c); 2379 object = __slab_alloc(s, gfpflags, node, addr, c);
2363 2380
2364 else { 2381 else {
@@ -2495,7 +2512,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2495 new.inuse--; 2512 new.inuse--;
2496 if ((!new.inuse || !prior) && !was_frozen) { 2513 if ((!new.inuse || !prior) && !was_frozen) {
2497 2514
2498 if (!kmem_cache_debug(s) && !prior) 2515 if (kmem_cache_has_cpu_partial(s) && !prior)
2499 2516
2500 /* 2517 /*
2501 * Slab was on no list before and will be partially empty 2518 * Slab was on no list before and will be partially empty
@@ -2550,8 +2567,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2550 * Objects left in the slab. If it was not on the partial list before 2567 * Objects left in the slab. If it was not on the partial list before
2551 * then add it. 2568 * then add it.
2552 */ 2569 */
2553 if (kmem_cache_debug(s) && unlikely(!prior)) { 2570 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2554 remove_full(s, page); 2571 if (kmem_cache_debug(s))
2572 remove_full(s, page);
2555 add_partial(n, page, DEACTIVATE_TO_TAIL); 2573 add_partial(n, page, DEACTIVATE_TO_TAIL);
2556 stat(s, FREE_ADD_PARTIAL); 2574 stat(s, FREE_ADD_PARTIAL);
2557 } 2575 }
@@ -3059,7 +3077,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3059 * per node list when we run out of per cpu objects. We only fetch 50% 3077 * per node list when we run out of per cpu objects. We only fetch 50%
3060 * to keep some capacity around for frees. 3078 * to keep some capacity around for frees.
3061 */ 3079 */
3062 if (kmem_cache_debug(s)) 3080 if (!kmem_cache_has_cpu_partial(s))
3063 s->cpu_partial = 0; 3081 s->cpu_partial = 0;
3064 else if (s->size >= PAGE_SIZE) 3082 else if (s->size >= PAGE_SIZE)
3065 s->cpu_partial = 2; 3083 s->cpu_partial = 2;
@@ -4456,7 +4474,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4456 err = strict_strtoul(buf, 10, &objects); 4474 err = strict_strtoul(buf, 10, &objects);
4457 if (err) 4475 if (err)
4458 return err; 4476 return err;
4459 if (objects && kmem_cache_debug(s)) 4477 if (objects && !kmem_cache_has_cpu_partial(s))
4460 return -EINVAL; 4478 return -EINVAL;
4461 4479
4462 s->cpu_partial = objects; 4480 s->cpu_partial = objects;
@@ -5269,7 +5287,6 @@ __initcall(slab_sysfs_init);
5269#ifdef CONFIG_SLABINFO 5287#ifdef CONFIG_SLABINFO
5270void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5288void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5271{ 5289{
5272 unsigned long nr_partials = 0;
5273 unsigned long nr_slabs = 0; 5290 unsigned long nr_slabs = 0;
5274 unsigned long nr_objs = 0; 5291 unsigned long nr_objs = 0;
5275 unsigned long nr_free = 0; 5292 unsigned long nr_free = 0;
@@ -5281,9 +5298,8 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5281 if (!n) 5298 if (!n)
5282 continue; 5299 continue;
5283 5300
5284 nr_partials += n->nr_partial; 5301 nr_slabs += node_nr_slabs(n);
5285 nr_slabs += atomic_long_read(&n->nr_slabs); 5302 nr_objs += node_nr_objs(n);
5286 nr_objs += atomic_long_read(&n->total_objects);
5287 nr_free += count_partial(n, count_free); 5303 nr_free += count_partial(n, count_free);
5288 } 5304 }
5289 5305
diff --git a/mm/sparse.c b/mm/sparse.c
index 1c91f0d3f6ab..308d50331bc3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -79,7 +79,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
79{ 79{
80 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
81 struct mem_section *section; 81 struct mem_section *section;
82 int ret = 0;
83 82
84 if (mem_section[root]) 83 if (mem_section[root])
85 return -EEXIST; 84 return -EEXIST;
@@ -90,7 +89,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 89
91 mem_section[root] = section; 90 mem_section[root] = section;
92 91
93 return ret; 92 return 0;
94} 93}
95#else /* !SPARSEMEM_EXTREME */ 94#else /* !SPARSEMEM_EXTREME */
96static inline int sparse_index_init(unsigned long section_nr, int nid) 95static inline int sparse_index_init(unsigned long section_nr, int nid)
@@ -481,6 +480,9 @@ void __init sparse_init(void)
481 struct page **map_map; 480 struct page **map_map;
482#endif 481#endif
483 482
483 /* see include/linux/mmzone.h 'struct mem_section' definition */
484 BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
485
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ 486 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order(); 487 set_pageblock_order();
486 488
@@ -751,6 +753,7 @@ out:
751 return ret; 753 return ret;
752} 754}
753 755
756#ifdef CONFIG_MEMORY_HOTREMOVE
754#ifdef CONFIG_MEMORY_FAILURE 757#ifdef CONFIG_MEMORY_FAILURE
755static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) 758static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
756{ 759{
@@ -772,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
772} 775}
773#endif 776#endif
774 777
775#ifdef CONFIG_MEMORY_HOTREMOVE
776static void free_section_usemap(struct page *memmap, unsigned long *usemap) 778static void free_section_usemap(struct page *memmap, unsigned long *usemap)
777{ 779{
778 struct page *usemap_page; 780 struct page *usemap_page;
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71d6841..4a1d0d2c52fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,10 +34,13 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37#define CREATE_TRACE_POINTS
38#include <trace/events/pagemap.h>
39
37/* How many pages do we try to swap or page in/out together? */ 40/* How many pages do we try to swap or page in/out together? */
38int page_cluster; 41int page_cluster;
39 42
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
43 46
@@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
384 SetPageActive(page); 387 SetPageActive(page);
385 lru += LRU_ACTIVE; 388 lru += LRU_ACTIVE;
386 add_page_to_lru_list(page, lruvec, lru); 389 add_page_to_lru_list(page, lruvec, lru);
390 trace_mm_lru_activate(page, page_to_pfn(page));
387 391
388 __count_vm_event(PGACTIVATE); 392 __count_vm_event(PGACTIVATE);
389 update_page_reclaim_stat(lruvec, file, 1); 393 update_page_reclaim_stat(lruvec, file, 1);
@@ -428,6 +432,33 @@ void activate_page(struct page *page)
428} 432}
429#endif 433#endif
430 434
435static void __lru_cache_activate_page(struct page *page)
436{
437 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
438 int i;
439
440 /*
441 * Search backwards on the optimistic assumption that the page being
442 * activated has just been added to this pagevec. Note that only
443 * the local pagevec is examined as a !PageLRU page could be in the
444 * process of being released, reclaimed, migrated or on a remote
445 * pagevec that is currently being drained. Furthermore, marking
446 * a remote pagevec's page PageActive potentially hits a race where
447 * a page is marked PageActive just after it is added to the inactive
448 * list causing accounting errors and BUG_ON checks to trigger.
449 */
450 for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
451 struct page *pagevec_page = pvec->pages[i];
452
453 if (pagevec_page == page) {
454 SetPageActive(page);
455 break;
456 }
457 }
458
459 put_cpu_var(lru_add_pvec);
460}
461
431/* 462/*
432 * Mark a page as having seen activity. 463 * Mark a page as having seen activity.
433 * 464 *
@@ -438,8 +469,18 @@ void activate_page(struct page *page)
438void mark_page_accessed(struct page *page) 469void mark_page_accessed(struct page *page)
439{ 470{
440 if (!PageActive(page) && !PageUnevictable(page) && 471 if (!PageActive(page) && !PageUnevictable(page) &&
441 PageReferenced(page) && PageLRU(page)) { 472 PageReferenced(page)) {
442 activate_page(page); 473
474 /*
475 * If the page is on the LRU, queue it for activation via
476 * activate_page_pvecs. Otherwise, assume the page is on a
477 * pagevec, mark it active and it'll be moved to the active
478 * LRU on the next drain.
479 */
480 if (PageLRU(page))
481 activate_page(page);
482 else
483 __lru_cache_activate_page(page);
443 ClearPageReferenced(page); 484 ClearPageReferenced(page);
444 } else if (!PageReferenced(page)) { 485 } else if (!PageReferenced(page)) {
445 SetPageReferenced(page); 486 SetPageReferenced(page);
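A hedged sketch of the "search the CPU-local batch backwards" idea that __lru_cache_activate_page() relies on, using plain C stand-ins: a fixed-size buffer instead of a pagevec, with the batch size chosen arbitrarily for the sketch.

#include <stdbool.h>
#include <stddef.h>

#define BATCH_SIZE 14	/* arbitrary for the sketch */

struct batch {
	void *pages[BATCH_SIZE];
	size_t count;
};

/* A just-added page is most likely at the tail, so scan from the end; if it
 * is found, mark it active so it lands on the active LRU when the batch is
 * drained.  Returns false when the caller must fall back (page not local). */
static bool activate_if_batched(struct batch *b, void *page, bool *active)
{
	for (size_t i = b->count; i-- > 0; ) {
		if (b->pages[i] == page) {
			*active = true;
			return true;
		}
	}
	return false;
}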
@@ -448,42 +489,37 @@ void mark_page_accessed(struct page *page)
448EXPORT_SYMBOL(mark_page_accessed); 489EXPORT_SYMBOL(mark_page_accessed);
449 490
450/* 491/*
451 * Order of operations is important: flush the pagevec when it's already 492 * Queue the page for addition to the LRU via pagevec. The decision on whether
452 * full, not when adding the last page, to make sure that last page is 493 * to add the page to the [in]active [file|anon] list is deferred until the
453 * not added to the LRU directly when passed to this function. Because 494 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
454 * mark_page_accessed() (called after this when writing) only activates 495 * to have the page added to the active list using mark_page_accessed().
455 * pages that are on the LRU, linear writes in subpage chunks would see
456 * every PAGEVEC_SIZE page activated, which is unexpected.
457 */ 496 */
458void __lru_cache_add(struct page *page, enum lru_list lru) 497void __lru_cache_add(struct page *page)
459{ 498{
460 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 499 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
461 500
462 page_cache_get(page); 501 page_cache_get(page);
463 if (!pagevec_space(pvec)) 502 if (!pagevec_space(pvec))
464 __pagevec_lru_add(pvec, lru); 503 __pagevec_lru_add(pvec);
465 pagevec_add(pvec, page); 504 pagevec_add(pvec, page);
466 put_cpu_var(lru_add_pvecs); 505 put_cpu_var(lru_add_pvec);
467} 506}
468EXPORT_SYMBOL(__lru_cache_add); 507EXPORT_SYMBOL(__lru_cache_add);
469 508
470/** 509/**
471 * lru_cache_add_lru - add a page to a page list 510 * lru_cache_add - add a page to a page list
472 * @page: the page to be added to the LRU. 511 * @page: the page to be added to the LRU.
473 * @lru: the LRU list to which the page is added.
474 */ 512 */
475void lru_cache_add_lru(struct page *page, enum lru_list lru) 513void lru_cache_add(struct page *page)
476{ 514{
477 if (PageActive(page)) { 515 if (PageActive(page)) {
478 VM_BUG_ON(PageUnevictable(page)); 516 VM_BUG_ON(PageUnevictable(page));
479 ClearPageActive(page);
480 } else if (PageUnevictable(page)) { 517 } else if (PageUnevictable(page)) {
481 VM_BUG_ON(PageActive(page)); 518 VM_BUG_ON(PageActive(page));
482 ClearPageUnevictable(page);
483 } 519 }
484 520
485 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); 521 VM_BUG_ON(PageLRU(page));
486 __lru_cache_add(page, lru); 522 __lru_cache_add(page);
487} 523}
488 524
489/** 525/**
@@ -583,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
583 */ 619 */
584void lru_add_drain_cpu(int cpu) 620void lru_add_drain_cpu(int cpu)
585{ 621{
586 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 622 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
587 struct pagevec *pvec;
588 int lru;
589 623
590 for_each_lru(lru) { 624 if (pagevec_count(pvec))
591 pvec = &pvecs[lru - LRU_BASE]; 625 __pagevec_lru_add(pvec);
592 if (pagevec_count(pvec))
593 __pagevec_lru_add(pvec, lru);
594 }
595 626
596 pvec = &per_cpu(lru_rotate_pvecs, cpu); 627 pvec = &per_cpu(lru_rotate_pvecs, cpu);
597 if (pagevec_count(pvec)) { 628 if (pagevec_count(pvec)) {
@@ -708,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold)
708 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 739 del_page_from_lru_list(page, lruvec, page_off_lru(page));
709 } 740 }
710 741
742 /* Clear Active bit in case of parallel mark_page_accessed */
743 ClearPageActive(page);
744
711 list_add(&page->lru, &pages_to_free); 745 list_add(&page->lru, &pages_to_free);
712 } 746 }
713 if (zone) 747 if (zone)
@@ -795,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
795static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, 829static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
796 void *arg) 830 void *arg)
797{ 831{
798 enum lru_list lru = (enum lru_list)arg; 832 int file = page_is_file_cache(page);
799 int file = is_file_lru(lru); 833 int active = PageActive(page);
800 int active = is_active_lru(lru); 834 enum lru_list lru = page_lru(page);
801 835
802 VM_BUG_ON(PageActive(page));
803 VM_BUG_ON(PageUnevictable(page)); 836 VM_BUG_ON(PageUnevictable(page));
804 VM_BUG_ON(PageLRU(page)); 837 VM_BUG_ON(PageLRU(page));
805 838
806 SetPageLRU(page); 839 SetPageLRU(page);
807 if (active)
808 SetPageActive(page);
809 add_page_to_lru_list(page, lruvec, lru); 840 add_page_to_lru_list(page, lruvec, lru);
810 update_page_reclaim_stat(lruvec, file, active); 841 update_page_reclaim_stat(lruvec, file, active);
842 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
811} 843}
812 844
813/* 845/*
814 * Add the passed pages to the LRU, then drop the caller's refcount 846 * Add the passed pages to the LRU, then drop the caller's refcount
815 * on them. Reinitialises the caller's pagevec. 847 * on them. Reinitialises the caller's pagevec.
816 */ 848 */
817void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 849void __pagevec_lru_add(struct pagevec *pvec)
818{ 850{
819 VM_BUG_ON(is_unevictable_lru(lru)); 851 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
820
821 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
822} 852}
823EXPORT_SYMBOL(__pagevec_lru_add); 853EXPORT_SYMBOL(__pagevec_lru_add);
824 854
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746af55b8455..36af6eeaa67e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 212 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 213 goto checks;
214 } 214 }
215 if (si->flags & SWP_DISCARDABLE) { 215 if (si->flags & SWP_PAGE_DISCARD) {
216 /* 216 /*
217 * Start range check on racing allocations, in case 217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on 218 * they overlap the cluster we eventually decide on
@@ -322,7 +322,7 @@ checks:
322 322
323 if (si->lowest_alloc) { 323 if (si->lowest_alloc) {
324 /* 324 /*
325 * Only set when SWP_DISCARDABLE, and there's a scan 325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed. 326 * for a free cluster in progress or just completed.
327 */ 327 */
328 if (found_free_cluster) { 328 if (found_free_cluster) {
@@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2016 return nr_extents; 2016 return nr_extents;
2017} 2017}
2018 2018
2019/*
2020 * Helper to sys_swapon determining if a given swap
2021 * backing device queue supports DISCARD operations.
2022 */
2023static bool swap_discardable(struct swap_info_struct *si)
2024{
2025 struct request_queue *q = bdev_get_queue(si->bdev);
2026
2027 if (!q || !blk_queue_discard(q))
2028 return false;
2029
2030 return true;
2031}
2032
2019SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 2033SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2020{ 2034{
2021 struct swap_info_struct *p; 2035 struct swap_info_struct *p;
@@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2123 p->flags |= SWP_SOLIDSTATE; 2137 p->flags |= SWP_SOLIDSTATE;
2124 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2138 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2125 } 2139 }
2126 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) 2140
2127 p->flags |= SWP_DISCARDABLE; 2141 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2142 /*
2143 * When discard is enabled for swap with no particular
2144 * policy flagged, we set all swap discard flags here in
2145 * order to sustain backward compatibility with older
2146 * swapon(8) releases.
2147 */
2148 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2149 SWP_PAGE_DISCARD);
2150
2151 /*
2152 * By flagging sys_swapon, a sysadmin can tell us to
2153 * either do single-time area discards only, or to just
2154 * perform discards for released swap page-clusters.
2155 * Now it's time to adjust the p->flags accordingly.
2156 */
2157 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2158 p->flags &= ~SWP_PAGE_DISCARD;
2159 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2160 p->flags &= ~SWP_AREA_DISCARD;
2161
2162 /* issue a swapon-time discard if it's still required */
2163 if (p->flags & SWP_AREA_DISCARD) {
2164 int err = discard_swap(p);
2165 if (unlikely(err))
2166 printk(KERN_ERR
2167 "swapon: discard_swap(%p): %d\n",
2168 p, err);
2169 }
2170 }
2128 } 2171 }
2129 2172
2130 mutex_lock(&swapon_mutex); 2173 mutex_lock(&swapon_mutex);
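A user-space sketch of the flag policy added above, using stand-in constants (the real SWAP_FLAG_* and SWP_* values differ): plain SWAP_FLAG_DISCARD enables both discard behaviours for backward compatibility, and the _ONCE/_PAGES modifiers then strip one of them.

#include <stdio.h>

#define DISCARD        0x1
#define DISCARD_ONCE   0x2	/* area (swapon-time) discard only */
#define DISCARD_PAGES  0x4	/* per page-cluster discard only */

#define AREA_DISCARD   0x10
#define PAGE_DISCARD   0x20

static unsigned int discard_policy(unsigned int swap_flags)
{
	unsigned int p;

	if (!(swap_flags & DISCARD))
		return 0;
	p = AREA_DISCARD | PAGE_DISCARD;	/* default: both behaviours */
	if (swap_flags & DISCARD_ONCE)
		p &= ~PAGE_DISCARD;
	else if (swap_flags & DISCARD_PAGES)
		p &= ~AREA_DISCARD;
	return p;
}

int main(void)
{
	/* prints "30 10 20": both, area-only, page-cluster-only */
	printf("%x %x %x\n",
	       discard_policy(DISCARD),
	       discard_policy(DISCARD | DISCARD_ONCE),
	       discard_policy(DISCARD | DISCARD_PAGES));
	return 0;
}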
@@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2135 enable_swap_info(p, prio, swap_map, frontswap_map); 2178 enable_swap_info(p, prio, swap_map, frontswap_map);
2136 2179
2137 printk(KERN_INFO "Adding %uk swap on %s. " 2180 printk(KERN_INFO "Adding %uk swap on %s. "
2138 "Priority:%d extents:%d across:%lluk %s%s%s\n", 2181 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2139 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2182 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2140 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2183 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2141 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2184 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2142 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2185 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2186 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2187 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2143 (frontswap_map) ? "FS" : ""); 2188 (frontswap_map) ? "FS" : "");
2144 2189
2145 mutex_unlock(&swapon_mutex); 2190 mutex_unlock(&swapon_mutex);
diff --git a/mm/util.c b/mm/util.c
index ab1424dbe2e6..7441c41d00f6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
295{ 295{
296 mm->mmap_base = TASK_UNMAPPED_BASE; 296 mm->mmap_base = TASK_UNMAPPED_BASE;
297 mm->get_unmapped_area = arch_get_unmapped_area; 297 mm->get_unmapped_area = arch_get_unmapped_area;
298 mm->unmap_area = arch_unmap_area;
299} 298}
300#endif 299#endif
301 300
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d365724feb05..13a54953a273 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
292 va = rb_entry(n, struct vmap_area, rb_node); 292 va = rb_entry(n, struct vmap_area, rb_node);
293 if (addr < va->va_start) 293 if (addr < va->va_start)
294 n = n->rb_left; 294 n = n->rb_left;
295 else if (addr > va->va_start) 295 else if (addr >= va->va_end)
296 n = n->rb_right; 296 n = n->rb_right;
297 else 297 else
298 return va; 298 return va;
@@ -388,12 +388,12 @@ nocache:
388 addr = ALIGN(first->va_end, align); 388 addr = ALIGN(first->va_end, align);
389 if (addr < vstart) 389 if (addr < vstart)
390 goto nocache; 390 goto nocache;
391 if (addr + size - 1 < addr) 391 if (addr + size < addr)
392 goto overflow; 392 goto overflow;
393 393
394 } else { 394 } else {
395 addr = ALIGN(vstart, align); 395 addr = ALIGN(vstart, align);
396 if (addr + size - 1 < addr) 396 if (addr + size < addr)
397 goto overflow; 397 goto overflow;
398 398
399 n = vmap_area_root.rb_node; 399 n = vmap_area_root.rb_node;
@@ -420,7 +420,7 @@ nocache:
420 if (addr + cached_hole_size < first->va_start) 420 if (addr + cached_hole_size < first->va_start)
421 cached_hole_size = first->va_start - addr; 421 cached_hole_size = first->va_start - addr;
422 addr = ALIGN(first->va_end, align); 422 addr = ALIGN(first->va_end, align);
423 if (addr + size - 1 < addr) 423 if (addr + size < addr)
424 goto overflow; 424 goto overflow;
425 425
426 if (list_is_last(&first->list, &vmap_area_list)) 426 if (list_is_last(&first->list, &vmap_area_list))
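The hunk above also changes the address-range overflow guards from "addr + size - 1 < addr" to "addr + size < addr"; the new form is the standard unsigned wrap-around test and, unlike the old one, also flags a range whose end wraps to exactly zero. A one-liner sketch:

#include <stdbool.h>

/* True when addr + size wraps past the top of the address space (unsigned
 * arithmetic wraps modulo 2^N, so the sum comes out smaller than addr). */
static bool range_wraps(unsigned long addr, unsigned long size)
{
	return addr + size < addr;
}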
@@ -754,7 +754,6 @@ struct vmap_block {
754 struct vmap_area *va; 754 struct vmap_area *va;
755 struct vmap_block_queue *vbq; 755 struct vmap_block_queue *vbq;
756 unsigned long free, dirty; 756 unsigned long free, dirty;
757 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
758 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 757 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
759 struct list_head free_list; 758 struct list_head free_list;
760 struct rcu_head rcu_head; 759 struct rcu_head rcu_head;
@@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
820 vb->va = va; 819 vb->va = va;
821 vb->free = VMAP_BBMAP_BITS; 820 vb->free = VMAP_BBMAP_BITS;
822 vb->dirty = 0; 821 vb->dirty = 0;
823 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
824 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 822 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
825 INIT_LIST_HEAD(&vb->free_list); 823 INIT_LIST_HEAD(&vb->free_list);
826 824
@@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu)
873 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 871 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
874 vb->free = 0; /* prevent further allocs after releasing lock */ 872 vb->free = 0; /* prevent further allocs after releasing lock */
875 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 873 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
876 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
877 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 874 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
878 spin_lock(&vbq->lock); 875 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list); 876 list_del_rcu(&vb->free_list);
@@ -891,11 +888,6 @@ static void purge_fragmented_blocks(int cpu)
891 } 888 }
892} 889}
893 890
894static void purge_fragmented_blocks_thiscpu(void)
895{
896 purge_fragmented_blocks(smp_processor_id());
897}
898
899static void purge_fragmented_blocks_allcpus(void) 891static void purge_fragmented_blocks_allcpus(void)
900{ 892{
901 int cpu; 893 int cpu;
@@ -910,7 +902,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
910 struct vmap_block *vb; 902 struct vmap_block *vb;
911 unsigned long addr = 0; 903 unsigned long addr = 0;
912 unsigned int order; 904 unsigned int order;
913 int purge = 0;
914 905
915 BUG_ON(size & ~PAGE_MASK); 906 BUG_ON(size & ~PAGE_MASK);
916 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 907 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -934,17 +925,7 @@ again:
934 if (vb->free < 1UL << order) 925 if (vb->free < 1UL << order)
935 goto next; 926 goto next;
936 927
937 i = bitmap_find_free_region(vb->alloc_map, 928 i = VMAP_BBMAP_BITS - vb->free;
938 VMAP_BBMAP_BITS, order);
939
940 if (i < 0) {
941 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
942 /* fragmented and no outstanding allocations */
943 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
944 purge = 1;
945 }
946 goto next;
947 }
948 addr = vb->va->va_start + (i << PAGE_SHIFT); 929 addr = vb->va->va_start + (i << PAGE_SHIFT);
949 BUG_ON(addr_to_vb_idx(addr) != 930 BUG_ON(addr_to_vb_idx(addr) !=
950 addr_to_vb_idx(vb->va->va_start)); 931 addr_to_vb_idx(vb->va->va_start));
@@ -960,9 +941,6 @@ next:
960 spin_unlock(&vb->lock); 941 spin_unlock(&vb->lock);
961 } 942 }
962 943
963 if (purge)
964 purge_fragmented_blocks_thiscpu();
965
966 put_cpu_var(vmap_block_queue); 944 put_cpu_var(vmap_block_queue);
967 rcu_read_unlock(); 945 rcu_read_unlock();
968 946
@@ -1311,22 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1311 spin_unlock(&vmap_area_lock); 1289 spin_unlock(&vmap_area_lock);
1312} 1290}
1313 1291
1314static void clear_vm_unlist(struct vm_struct *vm) 1292static void clear_vm_uninitialized_flag(struct vm_struct *vm)
1315{ 1293{
1316 /* 1294 /*
1317 * Before removing VM_UNLIST, 1295 * Before removing VM_UNINITIALIZED,
1318 * we should make sure that vm has proper values. 1296 * we should make sure that vm has proper values.
1319 * Pair with smp_rmb() in show_numa_info(). 1297 * Pair with smp_rmb() in show_numa_info().
1320 */ 1298 */
1321 smp_wmb(); 1299 smp_wmb();
1322 vm->flags &= ~VM_UNLIST; 1300 vm->flags &= ~VM_UNINITIALIZED;
1323}
1324
1325static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1326 unsigned long flags, const void *caller)
1327{
1328 setup_vmalloc_vm(vm, va, flags, caller);
1329 clear_vm_unlist(vm);
1330} 1301}
1331 1302
1332static struct vm_struct *__get_vm_area_node(unsigned long size, 1303static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1337,16 +1308,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1337 struct vm_struct *area; 1308 struct vm_struct *area;
1338 1309
1339 BUG_ON(in_interrupt()); 1310 BUG_ON(in_interrupt());
1340 if (flags & VM_IOREMAP) { 1311 if (flags & VM_IOREMAP)
1341 int bit = fls(size); 1312 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
1342
1343 if (bit > IOREMAP_MAX_ORDER)
1344 bit = IOREMAP_MAX_ORDER;
1345 else if (bit < PAGE_SHIFT)
1346 bit = PAGE_SHIFT;
1347
1348 align = 1ul << bit;
1349 }
1350 1313
1351 size = PAGE_ALIGN(size); 1314 size = PAGE_ALIGN(size);
1352 if (unlikely(!size)) 1315 if (unlikely(!size))
@@ -1367,16 +1330,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1367 return NULL; 1330 return NULL;
1368 } 1331 }
1369 1332
1370 /* 1333 setup_vmalloc_vm(area, va, flags, caller);
1371 * When this function is called from __vmalloc_node_range,
1372 * we add VM_UNLIST flag to avoid accessing uninitialized
1373 * members of vm_struct such as pages and nr_pages fields.
1374 * They will be set later.
1375 */
1376 if (flags & VM_UNLIST)
1377 setup_vmalloc_vm(area, va, flags, caller);
1378 else
1379 insert_vmalloc_vm(area, va, flags, caller);
1380 1334
1381 return area; 1335 return area;
1382} 1336}
@@ -1476,10 +1430,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1476 if (!addr) 1430 if (!addr)
1477 return; 1431 return;
1478 1432
1479 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1433 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
1480 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1434 addr))
1481 return; 1435 return;
1482 }
1483 1436
1484 area = remove_vm_area(addr); 1437 area = remove_vm_area(addr);
1485 if (unlikely(!area)) { 1438 if (unlikely(!area)) {
@@ -1524,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
1524 * conventions for vfree() arch-dependent would be a really bad idea) 1477 *
1525 * 1478 *
1526 * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) 1479 * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
1527 *
1528 */ 1480 */
1529void vfree(const void *addr) 1481void vfree(const void *addr)
1530{ 1482{
@@ -1536,8 +1488,8 @@ void vfree(const void *addr)
1536 return; 1488 return;
1537 if (unlikely(in_interrupt())) { 1489 if (unlikely(in_interrupt())) {
1538 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); 1490 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
1539 llist_add((struct llist_node *)addr, &p->list); 1491 if (llist_add((struct llist_node *)addr, &p->list))
1540 schedule_work(&p->wq); 1492 schedule_work(&p->wq);
1541 } else 1493 } else
1542 __vunmap(addr, 1); 1494 __vunmap(addr, 1);
1543} 1495}
@@ -1682,21 +1634,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1682 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1634 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1683 goto fail; 1635 goto fail;
1684 1636
1685 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, 1637 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
1686 start, end, node, gfp_mask, caller); 1638 start, end, node, gfp_mask, caller);
1687 if (!area) 1639 if (!area)
1688 goto fail; 1640 goto fail;
1689 1641
1690 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1642 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1691 if (!addr) 1643 if (!addr)
1692 return NULL; 1644 goto fail;
1693 1645
1694 /* 1646 /*
1695 * In this function, newly allocated vm_struct has VM_UNLIST flag. 1647 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
1696 * It means that vm_struct is not fully initialized. 1648 * flag. It means that vm_struct is not fully initialized.
1697 * Now, it is fully initialized, so remove this flag here. 1649 * Now, it is fully initialized, so remove this flag here.
1698 */ 1650 */
1699 clear_vm_unlist(area); 1651 clear_vm_uninitialized_flag(area);
1700 1652
1701 /* 1653 /*
1702 * A ref_count = 3 is needed because the vm_struct and vmap_area 1654 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2148,42 +2100,43 @@ finished:
2148} 2100}
2149 2101
2150/** 2102/**
2151 * remap_vmalloc_range - map vmalloc pages to userspace 2103 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2152 * @vma: vma to cover (map full range of vma) 2104 * @vma: vma to cover
2153 * @addr: vmalloc memory 2105 * @uaddr: target user address to start at
2154 * @pgoff: number of pages into addr before first page to map 2106 * @kaddr: virtual address of vmalloc kernel memory
2107 * @size: size of map area
2155 * 2108 *
2156 * Returns: 0 for success, -Exxx on failure 2109 * Returns: 0 for success, -Exxx on failure
2157 * 2110 *
2158 * This function checks that addr is a valid vmalloc'ed area, and 2111 * This function checks that @kaddr is a valid vmalloc'ed area,
2159 * that it is big enough to cover the vma. Will return failure if 2112 * and that it is big enough to cover the range starting at
2160 * that criteria isn't met. 2113 * @uaddr in @vma. Will return failure if that criteria isn't
2114 * met.
2161 * 2115 *
2162 * Similar to remap_pfn_range() (see mm/memory.c) 2116 * Similar to remap_pfn_range() (see mm/memory.c)
2163 */ 2117 */
2164int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2118int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2165 unsigned long pgoff) 2119 void *kaddr, unsigned long size)
2166{ 2120{
2167 struct vm_struct *area; 2121 struct vm_struct *area;
2168 unsigned long uaddr = vma->vm_start;
2169 unsigned long usize = vma->vm_end - vma->vm_start;
2170 2122
2171 if ((PAGE_SIZE-1) & (unsigned long)addr) 2123 size = PAGE_ALIGN(size);
2124
2125 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2172 return -EINVAL; 2126 return -EINVAL;
2173 2127
2174 area = find_vm_area(addr); 2128 area = find_vm_area(kaddr);
2175 if (!area) 2129 if (!area)
2176 return -EINVAL; 2130 return -EINVAL;
2177 2131
2178 if (!(area->flags & VM_USERMAP)) 2132 if (!(area->flags & VM_USERMAP))
2179 return -EINVAL; 2133 return -EINVAL;
2180 2134
2181 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 2135 if (kaddr + size > area->addr + area->size)
2182 return -EINVAL; 2136 return -EINVAL;
2183 2137
2184 addr += pgoff << PAGE_SHIFT;
2185 do { 2138 do {
2186 struct page *page = vmalloc_to_page(addr); 2139 struct page *page = vmalloc_to_page(kaddr);
2187 int ret; 2140 int ret;
2188 2141
2189 ret = vm_insert_page(vma, uaddr, page); 2142 ret = vm_insert_page(vma, uaddr, page);
@@ -2191,14 +2144,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2191 return ret; 2144 return ret;
2192 2145
2193 uaddr += PAGE_SIZE; 2146 uaddr += PAGE_SIZE;
2194 addr += PAGE_SIZE; 2147 kaddr += PAGE_SIZE;
2195 usize -= PAGE_SIZE; 2148 size -= PAGE_SIZE;
2196 } while (usize > 0); 2149 } while (size > 0);
2197 2150
2198 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 2151 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2199 2152
2200 return 0; 2153 return 0;
2201} 2154}
2155EXPORT_SYMBOL(remap_vmalloc_range_partial);
2156
2157/**
2158 * remap_vmalloc_range - map vmalloc pages to userspace
2159 * @vma: vma to cover (map full range of vma)
2160 * @addr: vmalloc memory
2161 * @pgoff: number of pages into addr before first page to map
2162 *
2163 * Returns: 0 for success, -Exxx on failure
2164 *
2165 * This function checks that addr is a valid vmalloc'ed area, and
2166 * that it is big enough to cover the vma. Will return failure if
2167 * that criteria isn't met.
2168 *
2169 * Similar to remap_pfn_range() (see mm/memory.c)
2170 */
2171int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2172 unsigned long pgoff)
2173{
2174 return remap_vmalloc_range_partial(vma, vma->vm_start,
2175 addr + (pgoff << PAGE_SHIFT),
2176 vma->vm_end - vma->vm_start);
2177}
2202EXPORT_SYMBOL(remap_vmalloc_range); 2178EXPORT_SYMBOL(remap_vmalloc_range);
2203 2179
2204/* 2180/*
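A hedged kernel-context fragment (not part of this patch) showing the typical caller the unchanged remap_vmalloc_range() interface serves: a driver mmap handler exposing a vmalloc_user() buffer, which after this change funnels through remap_vmalloc_range_partial() covering the whole VMA. drv_buf and drv_mmap are hypothetical names.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Assumed to have been allocated with vmalloc_user() at probe time, so the
 * area carries VM_USERMAP and is pre-zeroed for userspace exposure. */
static void *drv_buf;

static int drv_mmap(struct file *file, struct vm_area_struct *vma)
{
	return remap_vmalloc_range(vma, drv_buf, vma->vm_pgoff);
}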
@@ -2512,8 +2488,8 @@ found:
2512 2488
2513 /* insert all vm's */ 2489 /* insert all vm's */
2514 for (area = 0; area < nr_vms; area++) 2490 for (area = 0; area < nr_vms; area++)
2515 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2491 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2516 pcpu_get_vm_areas); 2492 pcpu_get_vm_areas);
2517 2493
2518 kfree(vas); 2494 kfree(vas);
2519 return vms; 2495 return vms;
@@ -2592,11 +2568,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2592 if (!counters) 2568 if (!counters)
2593 return; 2569 return;
2594 2570
2595 /* Pair with smp_wmb() in clear_vm_unlist() */
2596 smp_rmb();
2597 if (v->flags & VM_UNLIST)
2598 return;
2599
2600 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2571 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2601 2572
2602 for (nr = 0; nr < v->nr_pages; nr++) 2573 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2625,6 +2596,11 @@ static int s_show(struct seq_file *m, void *p)
2625 2596
2626 v = va->vm; 2597 v = va->vm;
2627 2598
2599 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2600 smp_rmb();
2601 if (v->flags & VM_UNINITIALIZED)
2602 return 0;
2603
2628 seq_printf(m, "0x%pK-0x%pK %7ld", 2604 seq_printf(m, "0x%pK-0x%pK %7ld",
2629 v->addr, v->addr + v->size, v->size); 2605 v->addr, v->addr + v->size, v->size);
2630 2606
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..2cff0d491c6d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
546void putback_lru_page(struct page *page) 546void putback_lru_page(struct page *page)
547{ 547{
548 int lru; 548 int lru;
549 int active = !!TestClearPageActive(page);
550 int was_unevictable = PageUnevictable(page); 549 int was_unevictable = PageUnevictable(page);
551 550
552 VM_BUG_ON(PageLRU(page)); 551 VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
561 * unevictable page on [in]active list. 560 * unevictable page on [in]active list.
562 * We know how to handle that. 561 * We know how to handle that.
563 */ 562 */
564 lru = active + page_lru_base_type(page); 563 lru = page_lru_base_type(page);
565 lru_cache_add_lru(page, lru); 564 lru_cache_add(page);
566 } else { 565 } else {
567 /* 566 /*
568 * Put unevictable pages directly on zone's unevictable 567 * Put unevictable pages directly on zone's unevictable
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page,
669 return PAGEREF_RECLAIM; 668 return PAGEREF_RECLAIM;
670} 669}
671 670
671/* Check if a page is dirty or under writeback */
672static void page_check_dirty_writeback(struct page *page,
673 bool *dirty, bool *writeback)
674{
675 struct address_space *mapping;
676
677 /*
678 * Anonymous pages are not handled by flushers and must be written
679 * from reclaim context. Do not stall reclaim based on them
680 */
681 if (!page_is_file_cache(page)) {
682 *dirty = false;
683 *writeback = false;
684 return;
685 }
686
687 /* By default assume that the page flags are accurate */
688 *dirty = PageDirty(page);
689 *writeback = PageWriteback(page);
690
691 /* Verify dirty/writeback state if the filesystem supports it */
692 if (!page_has_private(page))
693 return;
694
695 mapping = page_mapping(page);
696 if (mapping && mapping->a_ops->is_dirty_writeback)
697 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
698}
699
672/* 700/*
673 * shrink_page_list() returns the number of reclaimed pages 701 * shrink_page_list() returns the number of reclaimed pages
674 */ 702 */
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list,
677 struct scan_control *sc, 705 struct scan_control *sc,
678 enum ttu_flags ttu_flags, 706 enum ttu_flags ttu_flags,
679 unsigned long *ret_nr_dirty, 707 unsigned long *ret_nr_dirty,
708 unsigned long *ret_nr_unqueued_dirty,
709 unsigned long *ret_nr_congested,
680 unsigned long *ret_nr_writeback, 710 unsigned long *ret_nr_writeback,
711 unsigned long *ret_nr_immediate,
681 bool force_reclaim) 712 bool force_reclaim)
682{ 713{
683 LIST_HEAD(ret_pages); 714 LIST_HEAD(ret_pages);
684 LIST_HEAD(free_pages); 715 LIST_HEAD(free_pages);
685 int pgactivate = 0; 716 int pgactivate = 0;
717 unsigned long nr_unqueued_dirty = 0;
686 unsigned long nr_dirty = 0; 718 unsigned long nr_dirty = 0;
687 unsigned long nr_congested = 0; 719 unsigned long nr_congested = 0;
688 unsigned long nr_reclaimed = 0; 720 unsigned long nr_reclaimed = 0;
689 unsigned long nr_writeback = 0; 721 unsigned long nr_writeback = 0;
722 unsigned long nr_immediate = 0;
690 723
691 cond_resched(); 724 cond_resched();
692 725
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
696 struct page *page; 729 struct page *page;
697 int may_enter_fs; 730 int may_enter_fs;
698 enum page_references references = PAGEREF_RECLAIM_CLEAN; 731 enum page_references references = PAGEREF_RECLAIM_CLEAN;
732 bool dirty, writeback;
699 733
700 cond_resched(); 734 cond_resched();
701 735
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list,
723 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 757 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
724 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 758 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
725 759
760 /*
761 * The number of dirty pages determines if a zone is marked
762 * reclaim_congested which affects wait_iff_congested. kswapd
763 * will stall and start writing pages if the tail of the LRU
764 * is all dirty unqueued pages.
765 */
766 page_check_dirty_writeback(page, &dirty, &writeback);
767 if (dirty || writeback)
768 nr_dirty++;
769
770 if (dirty && !writeback)
771 nr_unqueued_dirty++;
772
773 /*
774 * Treat this page as congested if the underlying BDI is or if
775 * pages are cycling through the LRU so quickly that the
776 * pages marked for immediate reclaim are making it to the
777 * end of the LRU a second time.
778 */
779 mapping = page_mapping(page);
780 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
781 (writeback && PageReclaim(page)))
782 nr_congested++;
783
784 /*
785 * If a page at the tail of the LRU is under writeback, there
786 * are three cases to consider.
787 *
788 * 1) If reclaim is encountering an excessive number of pages
789 * under writeback and this page is both under writeback and
790 * PageReclaim then it indicates that pages are being queued
791 * for IO but are being recycled through the LRU before the
792 * IO can complete. Waiting on the page itself risks an
793 * indefinite stall if it is impossible to writeback the
794 * page due to IO error or disconnected storage so instead
795 * note that the LRU is being scanned too quickly and the
796 * caller can stall after page list has been processed.
797 *
798 * 2) Global reclaim encounters a page, memcg encounters a
799 * page that is not marked for immediate reclaim or
800 * the caller does not have __GFP_IO. In this case mark
801 * the page for immediate reclaim and continue scanning.
802 *
803 * __GFP_IO is checked because a loop driver thread might
804 * enter reclaim, and deadlock if it waits on a page for
805 * which it is needed to do the write (loop masks off
806 * __GFP_IO|__GFP_FS for this reason); but more thought
807 * would probably show more reasons.
808 *
809 * Don't require __GFP_FS, since we're not going into the
810 * FS, just waiting on its writeback completion. Worryingly,
811 * ext4 gfs2 and xfs allocate pages with
812 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
813 * may_enter_fs here is liable to OOM on them.
814 *
815 * 3) memcg encounters a page that is not already marked
816 * PageReclaim. memcg does not have any dirty pages
817 * throttling so we could easily OOM just because too many
818 * pages are in writeback and there is nothing else to
819 * reclaim. Wait for the writeback to complete.
820 */
726 if (PageWriteback(page)) { 821 if (PageWriteback(page)) {
727 /* 822 /* Case 1 above */
728 * memcg doesn't have any dirty pages throttling so we 823 if (current_is_kswapd() &&
729 * could easily OOM just because too many pages are in 824 PageReclaim(page) &&
730 * writeback and there is nothing else to reclaim. 825 zone_is_reclaim_writeback(zone)) {
731 * 826 nr_immediate++;
732 * Check __GFP_IO, certainly because a loop driver 827 goto keep_locked;
733 * thread might enter reclaim, and deadlock if it waits 828
734 * on a page for which it is needed to do the write 829 /* Case 2 above */
735 * (loop masks off __GFP_IO|__GFP_FS for this reason); 830 } else if (global_reclaim(sc) ||
736 * but more thought would probably show more reasons.
737 *
738 * Don't require __GFP_FS, since we're not going into
739 * the FS, just waiting on its writeback completion.
740 * Worryingly, ext4 gfs2 and xfs allocate pages with
741 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
742 * testing may_enter_fs here is liable to OOM on them.
743 */
744 if (global_reclaim(sc) ||
745 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { 831 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
746 /* 832 /*
747 * This is slightly racy - end_page_writeback() 833 * This is slightly racy - end_page_writeback()
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
756 */ 842 */
757 SetPageReclaim(page); 843 SetPageReclaim(page);
758 nr_writeback++; 844 nr_writeback++;
845
759 goto keep_locked; 846 goto keep_locked;
847
848 /* Case 3 above */
849 } else {
850 wait_on_page_writeback(page);
760 } 851 }
761 wait_on_page_writeback(page);
762 } 852 }
763 853
764 if (!force_reclaim) 854 if (!force_reclaim)
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
784 if (!add_to_swap(page, page_list)) 874 if (!add_to_swap(page, page_list))
785 goto activate_locked; 875 goto activate_locked;
786 may_enter_fs = 1; 876 may_enter_fs = 1;
787 }
788 877
789 mapping = page_mapping(page); 878 /* Adding to swap updated mapping */
879 mapping = page_mapping(page);
880 }
790 881
791 /* 882 /*
792 * The page is mapped into the page tables of one or more 883 * The page is mapped into the page tables of one or more
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 } 897 }
807 898
808 if (PageDirty(page)) { 899 if (PageDirty(page)) {
809 nr_dirty++;
810
811 /* 900 /*
812 * Only kswapd can writeback filesystem pages to 901 * Only kswapd can writeback filesystem pages to
813 * avoid risk of stack overflow but do not writeback 902 * avoid risk of stack overflow but only writeback
814 * unless under significant pressure. 903 * if many dirty pages have been encountered.
815 */ 904 */
816 if (page_is_file_cache(page) && 905 if (page_is_file_cache(page) &&
817 (!current_is_kswapd() || 906 (!current_is_kswapd() ||
818 sc->priority >= DEF_PRIORITY - 2)) { 907 !zone_is_reclaim_dirty(zone))) {
819 /* 908 /*
820 * Immediately reclaim when written back. 909 * Immediately reclaim when written back.
821 * Similar in principle to deactivate_page() 910 * Similar in principle to deactivate_page()
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
838 /* Page is dirty, try to write it out here */ 927 /* Page is dirty, try to write it out here */
839 switch (pageout(page, mapping, sc)) { 928 switch (pageout(page, mapping, sc)) {
840 case PAGE_KEEP: 929 case PAGE_KEEP:
841 nr_congested++;
842 goto keep_locked; 930 goto keep_locked;
843 case PAGE_ACTIVATE: 931 case PAGE_ACTIVATE:
844 goto activate_locked; 932 goto activate_locked;
@@ -946,22 +1034,16 @@ keep:
946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1034 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
947 } 1035 }
948 1036
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
958 free_hot_cold_page_list(&free_pages, 1); 1037 free_hot_cold_page_list(&free_pages, 1);
959 1038
960 list_splice(&ret_pages, page_list); 1039 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1040 count_vm_events(PGACTIVATE, pgactivate);
962 mem_cgroup_uncharge_end(); 1041 mem_cgroup_uncharge_end();
963 *ret_nr_dirty += nr_dirty; 1042 *ret_nr_dirty += nr_dirty;
1043 *ret_nr_congested += nr_congested;
1044 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
964 *ret_nr_writeback += nr_writeback; 1045 *ret_nr_writeback += nr_writeback;
1046 *ret_nr_immediate += nr_immediate;
965 return nr_reclaimed; 1047 return nr_reclaimed;
966} 1048}
967 1049
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
973 .priority = DEF_PRIORITY, 1055 .priority = DEF_PRIORITY,
974 .may_unmap = 1, 1056 .may_unmap = 1,
975 }; 1057 };
976 unsigned long ret, dummy1, dummy2; 1058 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
977 struct page *page, *next; 1059 struct page *page, *next;
978 LIST_HEAD(clean_pages); 1060 LIST_HEAD(clean_pages);
979 1061
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
985 } 1067 }
986 1068
987 ret = shrink_page_list(&clean_pages, zone, &sc, 1069 ret = shrink_page_list(&clean_pages, zone, &sc,
988 TTU_UNMAP|TTU_IGNORE_ACCESS, 1070 TTU_UNMAP|TTU_IGNORE_ACCESS,
989 &dummy1, &dummy2, true); 1071 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
990 list_splice(&clean_pages, page_list); 1072 list_splice(&clean_pages, page_list);
991 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1073 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
992 return ret; 1074 return ret;
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1281 unsigned long nr_reclaimed = 0; 1363 unsigned long nr_reclaimed = 0;
1282 unsigned long nr_taken; 1364 unsigned long nr_taken;
1283 unsigned long nr_dirty = 0; 1365 unsigned long nr_dirty = 0;
1366 unsigned long nr_congested = 0;
1367 unsigned long nr_unqueued_dirty = 0;
1284 unsigned long nr_writeback = 0; 1368 unsigned long nr_writeback = 0;
1369 unsigned long nr_immediate = 0;
1285 isolate_mode_t isolate_mode = 0; 1370 isolate_mode_t isolate_mode = 0;
1286 int file = is_file_lru(lru); 1371 int file = is_file_lru(lru);
1287 struct zone *zone = lruvec_zone(lruvec); 1372 struct zone *zone = lruvec_zone(lruvec);
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1323 return 0; 1408 return 0;
1324 1409
1325 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1410 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1326 &nr_dirty, &nr_writeback, false); 1411 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1412 &nr_writeback, &nr_immediate,
1413 false);
1327 1414
1328 spin_lock_irq(&zone->lru_lock); 1415 spin_lock_irq(&zone->lru_lock);
1329 1416
@@ -1356,21 +1443,51 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1356 * as there is no guarantee the dirtying process is throttled in the 1443 * as there is no guarantee the dirtying process is throttled in the
1357 * same way balance_dirty_pages() manages. 1444 * same way balance_dirty_pages() manages.
1358 * 1445 *
1359 * This scales the number of dirty pages that must be under writeback 1446 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1360 * before throttling depending on priority. It is a simple backoff 1447 * of pages under writeback flagged for immediate reclaim and stall if any
1361 * function that has the most effect in the range DEF_PRIORITY to 1448 * are encountered in the nr_immediate check below.
1362 * DEF_PRIORITY-2 which is the priority reclaim is considered to be 1449 */
1363 * in trouble and reclaim is considered to be in trouble. 1450 if (nr_writeback && nr_writeback == nr_taken)
1364 * 1451 zone_set_flag(zone, ZONE_WRITEBACK);
1365 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle 1452
1366 * DEF_PRIORITY-1 50% must be PageWriteback 1453 /*
1367 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble 1454 * memcg will stall in page writeback so only consider forcibly
1368 * ... 1455 * stalling for global reclaim
1369 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1370 * isolated page is PageWriteback
1371 */ 1456 */
1372 if (nr_writeback && nr_writeback >= 1457 if (global_reclaim(sc)) {
1373 (nr_taken >> (DEF_PRIORITY - sc->priority))) 1458 /*
1459 * Tag a zone as congested if all the dirty pages scanned were
1460 * backed by a congested BDI and wait_iff_congested will stall.
1461 */
1462 if (nr_dirty && nr_dirty == nr_congested)
1463 zone_set_flag(zone, ZONE_CONGESTED);
1464
1465 /*
1466 * If dirty pages are scanned that are not queued for IO, it
1467 * implies that flushers are not keeping up. In this case, flag
1468 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1469 * pages from reclaim context. It will forcibly stall in the
1470 * next check.
1471 */
1472 if (nr_unqueued_dirty == nr_taken)
1473 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1474
1475 /*
1476 * In addition, if kswapd scans pages marked for
1477 * immediate reclaim and under writeback (nr_immediate), it
1478 * implies that pages are cycling through the LRU faster than
1479 * they are written so also forcibly stall.
1480 */
1481 if (nr_unqueued_dirty == nr_taken || nr_immediate)
1482 congestion_wait(BLK_RW_ASYNC, HZ/10);
1483 }
1484
1485 /*
1486 * Stall direct reclaim for IO completions if underlying BDIs or zone
1487 * is congested. Allow kswapd to continue until it starts encountering
1488 * unqueued dirty pages or cycling through the LRU too quickly.
1489 */
1490 if (!sc->hibernation_mode && !current_is_kswapd())
1374 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1491 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1375 1492
1376 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1493 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1822,17 +1939,25 @@ out:
1822static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 1939static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1823{ 1940{
1824 unsigned long nr[NR_LRU_LISTS]; 1941 unsigned long nr[NR_LRU_LISTS];
1942 unsigned long targets[NR_LRU_LISTS];
1825 unsigned long nr_to_scan; 1943 unsigned long nr_to_scan;
1826 enum lru_list lru; 1944 enum lru_list lru;
1827 unsigned long nr_reclaimed = 0; 1945 unsigned long nr_reclaimed = 0;
1828 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1946 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1829 struct blk_plug plug; 1947 struct blk_plug plug;
1948 bool scan_adjusted = false;
1830 1949
1831 get_scan_count(lruvec, sc, nr); 1950 get_scan_count(lruvec, sc, nr);
1832 1951
1952 /* Record the original scan target for proportional adjustments later */
1953 memcpy(targets, nr, sizeof(nr));
1954
1833 blk_start_plug(&plug); 1955 blk_start_plug(&plug);
1834 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1956 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1835 nr[LRU_INACTIVE_FILE]) { 1957 nr[LRU_INACTIVE_FILE]) {
1958 unsigned long nr_anon, nr_file, percentage;
1959 unsigned long nr_scanned;
1960
1836 for_each_evictable_lru(lru) { 1961 for_each_evictable_lru(lru) {
1837 if (nr[lru]) { 1962 if (nr[lru]) {
1838 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 1963 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
@@ -1842,17 +1967,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1842 lruvec, sc); 1967 lruvec, sc);
1843 } 1968 }
1844 } 1969 }
1970
1971 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
1972 continue;
1973
1845 /* 1974 /*
1846 * On large memory systems, scan >> priority can become 1975 * For global direct reclaim, reclaim only the number of pages
1847 * really large. This is fine for the starting priority; 1976 * requested. Less care is taken to scan proportionally as it
1848 * we want to put equal scanning pressure on each zone. 1977 * is more important to minimise direct reclaim stall latency
1849 * However, if the VM has a harder time of freeing pages, 1978 * than it is to properly age the LRU lists.
1850 * with multiple processes reclaiming pages, the total
1851 * freeing target can get unreasonably large.
1852 */ 1979 */
1853 if (nr_reclaimed >= nr_to_reclaim && 1980 if (global_reclaim(sc) && !current_is_kswapd())
1854 sc->priority < DEF_PRIORITY)
1855 break; 1981 break;
1982
1983 /*
1984 * For kswapd and memcg, reclaim at least the number of pages
1985 * requested. Ensure that the anon and file LRUs shrink
1986 * proportionally what was requested by get_scan_count(). We
1987 * stop reclaiming one LRU and reduce the amount scanning
1988 * proportional to the original scan target.
1989 */
1990 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
1991 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
1992
1993 if (nr_file > nr_anon) {
1994 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
1995 targets[LRU_ACTIVE_ANON] + 1;
1996 lru = LRU_BASE;
1997 percentage = nr_anon * 100 / scan_target;
1998 } else {
1999 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2000 targets[LRU_ACTIVE_FILE] + 1;
2001 lru = LRU_FILE;
2002 percentage = nr_file * 100 / scan_target;
2003 }
2004
2005 /* Stop scanning the smaller of the LRU */
2006 nr[lru] = 0;
2007 nr[lru + LRU_ACTIVE] = 0;
2008
2009 /*
2010 * Recalculate the other LRU scan count based on its original
2011 * scan target and the percentage scanning already complete
2012 */
2013 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2014 nr_scanned = targets[lru] - nr[lru];
2015 nr[lru] = targets[lru] * (100 - percentage) / 100;
2016 nr[lru] -= min(nr[lru], nr_scanned);
2017
2018 lru += LRU_ACTIVE;
2019 nr_scanned = targets[lru] - nr[lru];
2020 nr[lru] = targets[lru] * (100 - percentage) / 100;
2021 nr[lru] -= min(nr[lru], nr_scanned);
2022
2023 scan_adjusted = true;
1856 } 2024 }
1857 blk_finish_plug(&plug); 2025 blk_finish_plug(&plug);
1858 sc->nr_reclaimed += nr_reclaimed; 2026 sc->nr_reclaimed += nr_reclaimed;
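The proportional rescan above can be illustrated with a small standalone sketch (not kernel code; the four-entry array standing in for NR_LRU_LISTS, with inactive/active anon at 0/1 and inactive/active file at 2/3, and the starting numbers are invented for the example):

#include <stdio.h>

int main(void)
{
	/* invented figures: anon targets 100/100, file targets 400/400,
	 * 32 pages already scanned from each list */
	unsigned long targets[4] = { 100, 100, 400, 400 };
	unsigned long nr[4]      = {  68,  68, 368, 368 };
	unsigned long nr_anon = nr[0] + nr[1], nr_file = nr[2] + nr[3];
	unsigned long percentage, scan_target;
	int smaller, other, i;

	if (nr_file > nr_anon) {			/* anon is the smaller side */
		scan_target = targets[0] + targets[1] + 1;
		percentage = nr_anon * 100 / scan_target;
		smaller = 0;
	} else {
		scan_target = targets[2] + targets[3] + 1;
		percentage = nr_file * 100 / scan_target;
		smaller = 2;
	}

	nr[smaller] = nr[smaller + 1] = 0;		/* stop scanning the smaller side */

	other = (smaller == 0) ? 2 : 0;			/* rescale the other side */
	for (i = other; i <= other + 1; i++) {
		unsigned long nr_scanned = targets[i] - nr[i];

		nr[i] = targets[i] * (100 - percentage) / 100;
		nr[i] -= (nr[i] < nr_scanned) ? nr[i] : nr_scanned;
	}

	/* prints "0 0 100 100": the file lists are cut back in proportion
	 * to how much of the anon target has already been scanned */
	printf("%lu %lu %lu %lu\n", nr[0], nr[1], nr[2], nr[3]);
	return 0;
}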
@@ -2179,8 +2347,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2179 aborted_reclaim = shrink_zones(zonelist, sc); 2347 aborted_reclaim = shrink_zones(zonelist, sc);
2180 2348
2181 /* 2349 /*
2182 * Don't shrink slabs when reclaiming memory from 2350 * Don't shrink slabs when reclaiming memory from over limit
2183 * over limit cgroups 2351 * cgroups but do shrink slab at least once when aborting
2352 * reclaim for compaction to avoid unevenly scanning file/anon
2353 * LRU pages over slab pages.
2184 */ 2354 */
2185 if (global_reclaim(sc)) { 2355 if (global_reclaim(sc)) {
2186 unsigned long lru_pages = 0; 2356 unsigned long lru_pages = 0;
@@ -2222,18 +2392,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2222 WB_REASON_TRY_TO_FREE_PAGES); 2392 WB_REASON_TRY_TO_FREE_PAGES);
2223 sc->may_writepage = 1; 2393 sc->may_writepage = 1;
2224 } 2394 }
2225 2395 } while (--sc->priority >= 0 && !aborted_reclaim);
2226 /* Take a nap, wait for some writeback to complete */
2227 if (!sc->hibernation_mode && sc->nr_scanned &&
2228 sc->priority < DEF_PRIORITY - 2) {
2229 struct zone *preferred_zone;
2230
2231 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2232 &cpuset_current_mems_allowed,
2233 &preferred_zone);
2234 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2235 }
2236 } while (--sc->priority >= 0);
2237 2396
2238out: 2397out:
2239 delayacct_freepages_end(); 2398 delayacct_freepages_end();
@@ -2601,6 +2760,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2601} 2760}
2602 2761
2603/* 2762/*
2763 * kswapd shrinks the zone by the number of pages required to reach
2764 * the high watermark.
2765 *
2766 * Returns true if kswapd scanned at least the requested number of pages to
2767 * reclaim or if the lack of progress was due to pages under writeback.
2768 * This is used to determine if the scanning priority needs to be raised.
2769 */
2770static bool kswapd_shrink_zone(struct zone *zone,
2771 int classzone_idx,
2772 struct scan_control *sc,
2773 unsigned long lru_pages,
2774 unsigned long *nr_attempted)
2775{
2776 unsigned long nr_slab;
2777 int testorder = sc->order;
2778 unsigned long balance_gap;
2779 struct reclaim_state *reclaim_state = current->reclaim_state;
2780 struct shrink_control shrink = {
2781 .gfp_mask = sc->gfp_mask,
2782 };
2783 bool lowmem_pressure;
2784
2785 /* Reclaim above the high watermark. */
2786 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2787
2788 /*
2789 * Kswapd reclaims only single pages with compaction enabled. Trying
2790 * too hard to reclaim until contiguous free pages have become
2791 * available can hurt performance by evicting too much useful data
2792 * from memory. Do not reclaim more than needed for compaction.
2793 */
2794 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2795 compaction_suitable(zone, sc->order) !=
2796 COMPACT_SKIPPED)
2797 testorder = 0;
2798
2799 /*
2800 * We put equal pressure on every zone, unless one zone has way too
2801 * many pages free already. The "too many pages" is defined as the
2802 * high wmark plus a "gap" where the gap is either the low
2803 * watermark or 1% of the zone, whichever is smaller.
2804 */
2805 balance_gap = min(low_wmark_pages(zone),
2806 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2807 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2808
2809 /*
2810 * If there is no low memory pressure or the zone is balanced then no
2811 * reclaim is necessary
2812 */
2813 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2814 if (!lowmem_pressure && zone_balanced(zone, testorder,
2815 balance_gap, classzone_idx))
2816 return true;
2817
2818 shrink_zone(zone, sc);
2819
2820 reclaim_state->reclaimed_slab = 0;
2821 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2822 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2823
2824 /* Account for the number of pages attempted to reclaim */
2825 *nr_attempted += sc->nr_to_reclaim;
2826
2827 if (nr_slab == 0 && !zone_reclaimable(zone))
2828 zone->all_unreclaimable = 1;
2829
2830 zone_clear_flag(zone, ZONE_WRITEBACK);
2831
2832 /*
2833 * If a zone reaches its high watermark, consider it to be no longer
2834 * congested. It's possible there are dirty pages backed by congested
2835 * BDIs but as pressure is relieved, speculatively avoid congestion
2836 * waits.
2837 */
2838 if (!zone->all_unreclaimable &&
2839 zone_balanced(zone, testorder, 0, classzone_idx)) {
2840 zone_clear_flag(zone, ZONE_CONGESTED);
2841 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2842 }
2843
2844 return sc->nr_scanned >= sc->nr_to_reclaim;
2845}
2846
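As a worked example of the balance_gap arithmetic above (assuming KSWAPD_ZONE_BALANCE_GAP_RATIO keeps its usual definition of 100): for a zone with managed_pages = 1,048,576 (4 GiB of 4 KiB pages) and a low watermark of 12,288 pages, the one-percent term is (1048576 + 99) / 100 = 10,486, so balance_gap = min(12288, 10486) = 10,486 pages, and the zone only counts as balanced here once it clears the high watermark plus that gap at the effective order.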
2847/*
2604 * For kswapd, balance_pgdat() will work across all this node's zones until 2848 * For kswapd, balance_pgdat() will work across all this node's zones until
2605 * they are all at high_wmark_pages(zone). 2849 * they are all at high_wmark_pages(zone).
2606 * 2850 *
@@ -2624,35 +2868,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2624static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2868static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2625 int *classzone_idx) 2869 int *classzone_idx)
2626{ 2870{
2627 bool pgdat_is_balanced = false;
2628 int i; 2871 int i;
2629 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2872 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2630 struct reclaim_state *reclaim_state = current->reclaim_state;
2631 unsigned long nr_soft_reclaimed; 2873 unsigned long nr_soft_reclaimed;
2632 unsigned long nr_soft_scanned; 2874 unsigned long nr_soft_scanned;
2633 struct scan_control sc = { 2875 struct scan_control sc = {
2634 .gfp_mask = GFP_KERNEL, 2876 .gfp_mask = GFP_KERNEL,
2877 .priority = DEF_PRIORITY,
2635 .may_unmap = 1, 2878 .may_unmap = 1,
2636 .may_swap = 1, 2879 .may_swap = 1,
2637 /* 2880 .may_writepage = !laptop_mode,
2638 * kswapd doesn't want to be bailed out while reclaim. because
2639 * we want to put equal scanning pressure on each zone.
2640 */
2641 .nr_to_reclaim = ULONG_MAX,
2642 .order = order, 2881 .order = order,
2643 .target_mem_cgroup = NULL, 2882 .target_mem_cgroup = NULL,
2644 }; 2883 };
2645 struct shrink_control shrink = {
2646 .gfp_mask = sc.gfp_mask,
2647 };
2648loop_again:
2649 sc.priority = DEF_PRIORITY;
2650 sc.nr_reclaimed = 0;
2651 sc.may_writepage = !laptop_mode;
2652 count_vm_event(PAGEOUTRUN); 2884 count_vm_event(PAGEOUTRUN);
2653 2885
2654 do { 2886 do {
2655 unsigned long lru_pages = 0; 2887 unsigned long lru_pages = 0;
2888 unsigned long nr_attempted = 0;
2889 bool raise_priority = true;
2890 bool pgdat_needs_compaction = (order > 0);
2891
2892 sc.nr_reclaimed = 0;
2656 2893
2657 /* 2894 /*
2658 * Scan in the highmem->dma direction for the highest 2895 * Scan in the highmem->dma direction for the highest
@@ -2689,23 +2926,46 @@ loop_again:
2689 end_zone = i; 2926 end_zone = i;
2690 break; 2927 break;
2691 } else { 2928 } else {
2692 /* If balanced, clear the congested flag */ 2929 /*
2930 * If balanced, clear the dirty and congested
2931 * flags
2932 */
2693 zone_clear_flag(zone, ZONE_CONGESTED); 2933 zone_clear_flag(zone, ZONE_CONGESTED);
2934 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2694 } 2935 }
2695 } 2936 }
2696 2937
2697 if (i < 0) { 2938 if (i < 0)
2698 pgdat_is_balanced = true;
2699 goto out; 2939 goto out;
2700 }
2701 2940
2702 for (i = 0; i <= end_zone; i++) { 2941 for (i = 0; i <= end_zone; i++) {
2703 struct zone *zone = pgdat->node_zones + i; 2942 struct zone *zone = pgdat->node_zones + i;
2704 2943
2944 if (!populated_zone(zone))
2945 continue;
2946
2705 lru_pages += zone_reclaimable_pages(zone); 2947 lru_pages += zone_reclaimable_pages(zone);
2948
2949 /*
2950 * If any zone is currently balanced then kswapd will
2951 * not call compaction as it is expected that the
2952 * necessary pages are already available.
2953 */
2954 if (pgdat_needs_compaction &&
2955 zone_watermark_ok(zone, order,
2956 low_wmark_pages(zone),
2957 *classzone_idx, 0))
2958 pgdat_needs_compaction = false;
2706 } 2959 }
2707 2960
2708 /* 2961 /*
2962 * If we're getting trouble reclaiming, start doing writepage
2963 * even in laptop mode.
2964 */
2965 if (sc.priority < DEF_PRIORITY - 2)
2966 sc.may_writepage = 1;
2967
2968 /*
2709 * Now scan the zone in the dma->highmem direction, stopping 2969 * Now scan the zone in the dma->highmem direction, stopping
2710 * at the last zone which needs scanning. 2970 * at the last zone which needs scanning.
2711 * 2971 *
@@ -2716,8 +2976,6 @@ loop_again:
2716 */ 2976 */
2717 for (i = 0; i <= end_zone; i++) { 2977 for (i = 0; i <= end_zone; i++) {
2718 struct zone *zone = pgdat->node_zones + i; 2978 struct zone *zone = pgdat->node_zones + i;
2719 int nr_slab, testorder;
2720 unsigned long balance_gap;
2721 2979
2722 if (!populated_zone(zone)) 2980 if (!populated_zone(zone))
2723 continue; 2981 continue;
@@ -2738,65 +2996,14 @@ loop_again:
2738 sc.nr_reclaimed += nr_soft_reclaimed; 2996 sc.nr_reclaimed += nr_soft_reclaimed;
2739 2997
2740 /* 2998 /*
2741 * We put equal pressure on every zone, unless 2999 * There should be no need to raise the scanning
2742 * one zone has way too many pages free 3000 * priority if enough pages are already being scanned
2743 * already. The "too many pages" is defined 3001 * that the high watermark would be met at 100%
2744 * as the high wmark plus a "gap" where the 3002 * efficiency.
2745 * gap is either the low watermark or 1%
2746 * of the zone, whichever is smaller.
2747 */ 3003 */
2748 balance_gap = min(low_wmark_pages(zone), 3004 if (kswapd_shrink_zone(zone, end_zone, &sc,
2749 (zone->managed_pages + 3005 lru_pages, &nr_attempted))
2750 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 3006 raise_priority = false;
2751 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2752 /*
2753 * Kswapd reclaims only single pages with compaction
2754 * enabled. Trying too hard to reclaim until contiguous
2755 * free pages have become available can hurt performance
2756 * by evicting too much useful data from memory.
2757 * Do not reclaim more than needed for compaction.
2758 */
2759 testorder = order;
2760 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2761 compaction_suitable(zone, order) !=
2762 COMPACT_SKIPPED)
2763 testorder = 0;
2764
2765 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2766 !zone_balanced(zone, testorder,
2767 balance_gap, end_zone)) {
2768 shrink_zone(zone, &sc);
2769
2770 reclaim_state->reclaimed_slab = 0;
2771 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2772 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2773
2774 if (nr_slab == 0 && !zone_reclaimable(zone))
2775 zone->all_unreclaimable = 1;
2776 }
2777
2778 /*
2779 * If we're getting trouble reclaiming, start doing
2780 * writepage even in laptop mode.
2781 */
2782 if (sc.priority < DEF_PRIORITY - 2)
2783 sc.may_writepage = 1;
2784
2785 if (zone->all_unreclaimable) {
2786 if (end_zone && end_zone == i)
2787 end_zone--;
2788 continue;
2789 }
2790
2791 if (zone_balanced(zone, testorder, 0, end_zone))
2792 /*
2793 * If a zone reaches its high watermark,
2794 * consider it to be no longer congested. It's
2795 * possible there are dirty pages backed by
2796 * congested BDIs but as pressure is relieved,
2797 * speculatively avoid congestion waits
2798 */
2799 zone_clear_flag(zone, ZONE_CONGESTED);
2800 } 3007 }
2801 3008
2802 /* 3009 /*
@@ -2808,74 +3015,38 @@ loop_again:
2808 pfmemalloc_watermark_ok(pgdat)) 3015 pfmemalloc_watermark_ok(pgdat))
2809 wake_up(&pgdat->pfmemalloc_wait); 3016 wake_up(&pgdat->pfmemalloc_wait);
2810 3017
2811 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2812 pgdat_is_balanced = true;
2813 break; /* kswapd: all done */
2814 }
2815
2816 /* 3018 /*
2817 * We do this so kswapd doesn't build up large priorities for 3019 * Fragmentation may mean that the system cannot be rebalanced
2818 * example when it is freeing in parallel with allocators. It 3020 * for high-order allocations in all zones. If twice the
2819 * matches the direct reclaim path behaviour in terms of impact 3021 * allocation size has been reclaimed and the zones are still
2820 * on zone->*_priority. 3022 * not balanced then recheck the watermarks at order-0 to
3023 * prevent kswapd reclaiming excessively. Assume that a
3024 * process requesting a high-order allocation can direct reclaim/compact.
2821 */ 3025 */
2822 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 3026 if (order && sc.nr_reclaimed >= 2UL << order)
2823 break; 3027 order = sc.order = 0;
2824 } while (--sc.priority >= 0);
2825
2826out:
2827 if (!pgdat_is_balanced) {
2828 cond_resched();
2829 3028
2830 try_to_freeze(); 3029 /* Check if kswapd should be suspending */
3030 if (try_to_freeze() || kthread_should_stop())
3031 break;
2831 3032
2832 /* 3033 /*
2833 * Fragmentation may mean that the system cannot be 3034 * Compact if necessary and kswapd is reclaiming at least the
2834 * rebalanced for high-order allocations in all zones. 3035 * high watermark number of pages as requested
2835 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
2836 * it means the zones have been fully scanned and are still
2837 * not balanced. For high-order allocations, there is
2838 * little point trying all over again as kswapd may
2839 * infinite loop.
2840 *
2841 * Instead, recheck all watermarks at order-0 as they
2842 * are the most important. If watermarks are ok, kswapd will go
2843 * back to sleep. High-order users can still perform direct
2844 * reclaim if they wish.
2845 */ 3036 */
2846 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) 3037 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
2847 order = sc.order = 0;
2848
2849 goto loop_again;
2850 }
2851
2852 /*
2853 * If kswapd was reclaiming at a higher order, it has the option of
2854 * sleeping without all zones being balanced. Before it does, it must
2855 * ensure that the watermarks for order-0 on *all* zones are met and
2856 * that the congestion flags are cleared. The congestion flag must
2857 * be cleared as kswapd is the only mechanism that clears the flag
2858 * and it is potentially going to sleep here.
2859 */
2860 if (order) {
2861 int zones_need_compaction = 1;
2862
2863 for (i = 0; i <= end_zone; i++) {
2864 struct zone *zone = pgdat->node_zones + i;
2865
2866 if (!populated_zone(zone))
2867 continue;
2868
2869 /* Check if the memory needs to be defragmented. */
2870 if (zone_watermark_ok(zone, order,
2871 low_wmark_pages(zone), *classzone_idx, 0))
2872 zones_need_compaction = 0;
2873 }
2874
2875 if (zones_need_compaction)
2876 compact_pgdat(pgdat, order); 3038 compact_pgdat(pgdat, order);
2877 }
2878 3039
3040 /*
3041 * Raise priority if scanning rate is too low or there was no
3042 * progress in reclaiming pages
3043 */
3044 if (raise_priority || !sc.nr_reclaimed)
3045 sc.priority--;
3046 } while (sc.priority >= 1 &&
3047 !pgdat_balanced(pgdat, order, *classzone_idx));
3048
3049out:
2879 /* 3050 /*
2880 * Return the order we were reclaiming at so prepare_kswapd_sleep() 3051 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2881 * makes a decision on the order we were last reclaiming at. However, 3052 * makes a decision on the order we were last reclaiming at. However,
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 000000000000..9bb4710e3589
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,527 @@
1/*
2 * zbud.c
3 *
4 * Copyright (C) 2013, Seth Jennings, IBM
5 *
6 * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
7 *
8 * zbud is a special purpose allocator for storing compressed pages. Contrary
9 * to what its name may suggest, zbud is not a buddy allocator, but rather an
10 * allocator that "buddies" two compressed pages together in a single memory
11 * page.
12 *
13 * While this design limits storage density, it has simple and deterministic
14 * reclaim properties that make it preferable to a higher density approach when
15 * reclaim will be used.
16 *
17 * zbud works by storing compressed pages, or "zpages", together in pairs in a
18 * single memory page called a "zbud page". The first buddy is "left
19 * justified" at the beginning of the zbud page, and the last buddy is "right
20 * justified" at the end of the zbud page. The benefit is that if either
21 * buddy is freed, the freed buddy space, coalesced with whatever slack space
22 * that existed between the buddies, results in the largest possible free region
23 * within the zbud page.
24 *
25 * zbud also provides an attractive lower bound on density. The ratio of zpages
26 * to zbud pages can not be less than 1. This ensures that zbud can never "do
27 * harm" by using more pages to store zpages than the uncompressed zpages would
28 * have used on their own.
29 *
30 * zbud pages are divided into "chunks". The size of the chunks is fixed at
31 * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
32 * into chunks allows organizing unbuddied zbud pages into a manageable number
33 * of unbuddied lists according to the number of free chunks available in the
34 * zbud page.
35 *
36 * The zbud API differs from that of conventional allocators in that the
37 * allocation function, zbud_alloc(), returns an opaque handle to the user,
38 * not a dereferenceable pointer. The user must map the handle using
39 * zbud_map() in order to get a usable pointer by which to access the
40 * allocation data and unmap the handle with zbud_unmap() when operations
41 * on the allocation data are complete.
42 */
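A minimal usage sketch of the handle-based API described above (illustrative only: it assumes <linux/zbud.h> plus the usual kernel headers, trims error handling, and passes a NULL zbud_ops, which simply makes zbud_reclaim_page() return -EINVAL because no eviction handler is registered):

static int zbud_usage_example(void)
{
	struct zbud_pool *pool;
	unsigned long handle;
	char *buf;
	int ret;

	pool = zbud_create_pool(GFP_KERNEL, NULL);
	if (!pool)
		return -ENOMEM;

	ret = zbud_alloc(pool, 100, GFP_KERNEL, &handle);	/* a 100-byte zpage */
	if (!ret) {
		buf = zbud_map(pool, handle);	/* opaque handle -> usable pointer */
		memset(buf, 0, 100);		/* ... store compressed data ... */
		zbud_unmap(pool, handle);
		zbud_free(pool, handle);
	}

	zbud_destroy_pool(pool);
	return ret;
}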
43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
46#include <linux/atomic.h>
47#include <linux/list.h>
48#include <linux/mm.h>
49#include <linux/module.h>
50#include <linux/preempt.h>
51#include <linux/slab.h>
52#include <linux/spinlock.h>
53#include <linux/zbud.h>
54
55/*****************
56 * Structures
57*****************/
58/*
59 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
60 * adjusting internal fragmentation. It also determines the number of
61 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
62 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
63 * will be 64 freelists per pool.
64 */
65#define NCHUNKS_ORDER 6
66
67#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
68#define CHUNK_SIZE (1 << CHUNK_SHIFT)
69#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
70#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
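For example, with 4 KiB pages (PAGE_SHIFT = 12) these defaults give CHUNK_SHIFT = 6, CHUNK_SIZE = 64 bytes and NCHUNKS = 64 chunks per zbud page, one of which is taken by the header; size_to_chunks() below would round a 100-byte allocation up to (100 + 63) >> 6 = 2 chunks.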
71
72/**
73 * struct zbud_pool - stores metadata for each zbud pool
74 * @lock: protects all pool fields and first|last_chunk fields of any
75 * zbud page in the pool
76 * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
77 * the list each zbud page is added to depends on the size of
78 * its free region.
79 * @buddied: list tracking the zbud pages that contain two buddies;
80 * these zbud pages are full
81 * @lru: list tracking the zbud pages in LRU order by most recently
82 * added buddy.
83 * @pages_nr: number of zbud pages in the pool.
84 * @ops: pointer to a structure of user defined operations specified at
85 * pool creation time.
86 *
87 * This structure is allocated at pool creation time and maintains metadata
88 * pertaining to a particular zbud pool.
89 */
90struct zbud_pool {
91 spinlock_t lock;
92 struct list_head unbuddied[NCHUNKS];
93 struct list_head buddied;
94 struct list_head lru;
95 u64 pages_nr;
96 struct zbud_ops *ops;
97};
98
99/*
100 * struct zbud_header - zbud page metadata occupying the first chunk of each
101 * zbud page.
102 * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
103 * @lru: links the zbud page into the lru list in the pool
104 * @first_chunks: the size of the first buddy in chunks, 0 if free
105 * @last_chunks: the size of the last buddy in chunks, 0 if free
106 */
107struct zbud_header {
108 struct list_head buddy;
109 struct list_head lru;
110 unsigned int first_chunks;
111 unsigned int last_chunks;
112 bool under_reclaim;
113};
114
115/*****************
116 * Helpers
117*****************/
118/* Just to make the code easier to read */
119enum buddy {
120 FIRST,
121 LAST
122};
123
124/* Converts an allocation size in bytes to size in zbud chunks */
125static int size_to_chunks(int size)
126{
127 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
128}
129
130#define for_each_unbuddied_list(_iter, _begin) \
131 for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
132
133/* Initializes the zbud header of a newly allocated zbud page */
134static struct zbud_header *init_zbud_page(struct page *page)
135{
136 struct zbud_header *zhdr = page_address(page);
137 zhdr->first_chunks = 0;
138 zhdr->last_chunks = 0;
139 INIT_LIST_HEAD(&zhdr->buddy);
140 INIT_LIST_HEAD(&zhdr->lru);
141 zhdr->under_reclaim = 0;
142 return zhdr;
143}
144
145/* Resets the struct page fields and frees the page */
146static void free_zbud_page(struct zbud_header *zhdr)
147{
148 __free_page(virt_to_page(zhdr));
149}
150
151/*
152 * Encodes the handle of a particular buddy within a zbud page
153 * Pool lock should be held as this function accesses first|last_chunks
154 */
155static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
156{
157 unsigned long handle;
158
159 /*
160 * For now, the encoded handle is actually just the pointer to the data
161 * but this might not always be the case. A little information hiding.
162 * Add CHUNK_SIZE to the handle if it is the first allocation to jump
163 * over the zbud header in the first chunk.
164 */
165 handle = (unsigned long)zhdr;
166 if (bud == FIRST)
167 /* skip over zbud header */
168 handle += ZHDR_SIZE_ALIGNED;
169 else /* bud == LAST */
170 handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
171 return handle;
172}
173
174/* Returns the zbud page where a given handle is stored */
175static struct zbud_header *handle_to_zbud_header(unsigned long handle)
176{
177 return (struct zbud_header *)(handle & PAGE_MASK);
178}
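To make the encoding concrete (reusing the 4 KiB page / 64-byte chunk figures above and a hypothetical zbud page at a page-aligned kernel address): a FIRST handle is the page address plus ZHDR_SIZE_ALIGNED (64 bytes), while a LAST buddy of three chunks gets the page address plus 4096 - 192; in either case handle_to_zbud_header() recovers the header simply by masking the handle with PAGE_MASK.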
179
180/* Returns the number of free chunks in a zbud page */
181static int num_free_chunks(struct zbud_header *zhdr)
182{
183 /*
184 * Rather than branch for different situations, just use the fact that
185 * free buddies have a length of zero to simplify everything. -1 at the
186 * end for the zbud header.
187 */
188 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
189}
190
191/*****************
192 * API Functions
193*****************/
194/**
195 * zbud_create_pool() - create a new zbud pool
196 * @gfp: gfp flags when allocating the zbud pool structure
197 * @ops: user-defined operations for the zbud pool
198 *
199 * Return: pointer to the new zbud pool or NULL if the metadata allocation
200 * failed.
201 */
202struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
203{
204 struct zbud_pool *pool;
205 int i;
206
207 pool = kmalloc(sizeof(struct zbud_pool), gfp);
208 if (!pool)
209 return NULL;
210 spin_lock_init(&pool->lock);
211 for_each_unbuddied_list(i, 0)
212 INIT_LIST_HEAD(&pool->unbuddied[i]);
213 INIT_LIST_HEAD(&pool->buddied);
214 INIT_LIST_HEAD(&pool->lru);
215 pool->pages_nr = 0;
216 pool->ops = ops;
217 return pool;
218}
219
220/**
221 * zbud_destroy_pool() - destroys an existing zbud pool
222 * @pool: the zbud pool to be destroyed
223 *
224 * The pool should be emptied before this function is called.
225 */
226void zbud_destroy_pool(struct zbud_pool *pool)
227{
228 kfree(pool);
229}
230
231/**
232 * zbud_alloc() - allocates a region of a given size
233 * @pool: zbud pool from which to allocate
234 * @size: size in bytes of the desired allocation
235 * @gfp: gfp flags used if the pool needs to grow
236 * @handle: handle of the new allocation
237 *
238 * This function will attempt to find a free region in the pool large enough to
239 * satisfy the allocation request. A search of the unbuddied lists is
240 * performed first. If no suitable free region is found, then a new page is
241 * allocated and added to the pool to satisfy the request.
242 *
243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
244 * as zbud pool pages.
245 *
246 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page.
249 */
250int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
251 unsigned long *handle)
252{
253 int chunks, i, freechunks;
254 struct zbud_header *zhdr = NULL;
255 enum buddy bud;
256 struct page *page;
257
258 if (size <= 0 || gfp & __GFP_HIGHMEM)
259 return -EINVAL;
260 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
261 return -ENOSPC;
262 chunks = size_to_chunks(size);
263 spin_lock(&pool->lock);
264
265 /* First, try to find an unbuddied zbud page. */
266 zhdr = NULL;
267 for_each_unbuddied_list(i, chunks) {
268 if (!list_empty(&pool->unbuddied[i])) {
269 zhdr = list_first_entry(&pool->unbuddied[i],
270 struct zbud_header, buddy);
271 list_del(&zhdr->buddy);
272 if (zhdr->first_chunks == 0)
273 bud = FIRST;
274 else
275 bud = LAST;
276 goto found;
277 }
278 }
279
280 /* Couldn't find unbuddied zbud page, create new one */
281 spin_unlock(&pool->lock);
282 page = alloc_page(gfp);
283 if (!page)
284 return -ENOMEM;
285 spin_lock(&pool->lock);
286 pool->pages_nr++;
287 zhdr = init_zbud_page(page);
288 bud = FIRST;
289
290found:
291 if (bud == FIRST)
292 zhdr->first_chunks = chunks;
293 else
294 zhdr->last_chunks = chunks;
295
296 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
297 /* Add to unbuddied list */
298 freechunks = num_free_chunks(zhdr);
299 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
300 } else {
301 /* Add to buddied list */
302 list_add(&zhdr->buddy, &pool->buddied);
303 }
304
305 /* Add/move zbud page to beginning of LRU */
306 if (!list_empty(&zhdr->lru))
307 list_del(&zhdr->lru);
308 list_add(&zhdr->lru, &pool->lru);
309
310 *handle = encode_handle(zhdr, bud);
311 spin_unlock(&pool->lock);
312
313 return 0;
314}
315
316/**
317 * zbud_free() - frees the allocation associated with the given handle
318 * @pool: pool in which the allocation resided
319 * @handle: handle associated with the allocation returned by zbud_alloc()
320 *
321 * In the case that the zbud page in which the allocation resides is under
322 * reclaim, as indicated by the under_reclaim flag being set, this function
323 * only sets the first|last_chunks to 0. The page is actually freed
324 * once both buddies are evicted (see zbud_reclaim_page() below).
325 */
326void zbud_free(struct zbud_pool *pool, unsigned long handle)
327{
328 struct zbud_header *zhdr;
329 int freechunks;
330
331 spin_lock(&pool->lock);
332 zhdr = handle_to_zbud_header(handle);
333
334 /* If first buddy, handle will be page aligned */
335 if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
336 zhdr->last_chunks = 0;
337 else
338 zhdr->first_chunks = 0;
339
340 if (zhdr->under_reclaim) {
341 /* zbud page is under reclaim, reclaim will free */
342 spin_unlock(&pool->lock);
343 return;
344 }
345
346 /* Remove from existing buddy list */
347 list_del(&zhdr->buddy);
348
349 if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
350 /* zbud page is empty, free */
351 list_del(&zhdr->lru);
352 free_zbud_page(zhdr);
353 pool->pages_nr--;
354 } else {
355 /* Add to unbuddied list */
356 freechunks = num_free_chunks(zhdr);
357 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
358 }
359
360 spin_unlock(&pool->lock);
361}
362
363#define list_tail_entry(ptr, type, member) \
364 list_entry((ptr)->prev, type, member)
365
366/**
367 * zbud_reclaim_page() - evicts allocations from a pool page and frees it
368 * @pool: pool from which a page will attempt to be evicted
369 * @retries: number of pages on the LRU list for which eviction will
370 * be attempted before failing
371 *
372 * zbud reclaim is different from normal system reclaim in that the reclaim is
373 * done from the bottom, up. This is because only the bottom layer, zbud, has
374 * information on how the allocations are organized within each zbud page. This
375 * has the potential to create interesting locking situations between zbud and
376 * the user, however.
377 *
378 * To avoid these, this is how zbud_reclaim_page() should be called:
379
380 * The user detects a page should be reclaimed and calls zbud_reclaim_page().
381 * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
382 * the user-defined eviction handler with the pool and handle as arguments.
383 *
384 * If the handle can not be evicted, the eviction handler should return
385 * non-zero. zbud_reclaim_page() will add the zbud page back to the
386 * appropriate list and try the next zbud page on the LRU up to
387 * a user defined number of retries.
388 *
389 * If the handle is successfully evicted, the eviction handler should
390 * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
391 * contains logic to delay freeing the page if the page is under reclaim,
392 * as indicated by the setting of the PG_reclaim flag on the underlying page.
393 *
394 * If all buddies in the zbud page are successfully evicted, then the
395 * zbud page can be freed.
396 *
397 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
398 * no pages to evict or an eviction handler is not registered, -EAGAIN if
399 * the retry limit was hit.
400 */
401int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
402{
403 int i, ret, freechunks;
404 struct zbud_header *zhdr;
405 unsigned long first_handle = 0, last_handle = 0;
406
407 spin_lock(&pool->lock);
408 if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
409 retries == 0) {
410 spin_unlock(&pool->lock);
411 return -EINVAL;
412 }
413 for (i = 0; i < retries; i++) {
414 zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
415 list_del(&zhdr->lru);
416 list_del(&zhdr->buddy);
417 /* Protect zbud page against free */
418 zhdr->under_reclaim = true;
419 /*
420 * We need to encode the handles before unlocking, since we can
421 * race with free that will set (first|last)_chunks to 0
422 */
423 first_handle = 0;
424 last_handle = 0;
425 if (zhdr->first_chunks)
426 first_handle = encode_handle(zhdr, FIRST);
427 if (zhdr->last_chunks)
428 last_handle = encode_handle(zhdr, LAST);
429 spin_unlock(&pool->lock);
430
431 /* Issue the eviction callback(s) */
432 if (first_handle) {
433 ret = pool->ops->evict(pool, first_handle);
434 if (ret)
435 goto next;
436 }
437 if (last_handle) {
438 ret = pool->ops->evict(pool, last_handle);
439 if (ret)
440 goto next;
441 }
442next:
443 spin_lock(&pool->lock);
444 zhdr->under_reclaim = false;
445 if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
446 /*
447 * Both buddies are now free, free the zbud page and
448 * return success.
449 */
450 free_zbud_page(zhdr);
451 pool->pages_nr--;
452 spin_unlock(&pool->lock);
453 return 0;
454 } else if (zhdr->first_chunks == 0 ||
455 zhdr->last_chunks == 0) {
456 /* add to unbuddied list */
457 freechunks = num_free_chunks(zhdr);
458 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
459 } else {
460 /* add to buddied list */
461 list_add(&zhdr->buddy, &pool->buddied);
462 }
463
464 /* add to beginning of LRU */
465 list_add(&zhdr->lru, &pool->lru);
466 }
467 spin_unlock(&pool->lock);
468 return -EAGAIN;
469}
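A sketch of an eviction handler that follows the protocol described above (illustrative; write_back_zpage() is a hypothetical placeholder for whatever the pool user does with the data, for example zswap writing it out to the backing swap device):

/* hypothetical helper, not part of zbud */
static int write_back_zpage(void *data);

static int example_evict(struct zbud_pool *pool, unsigned long handle)
{
	void *data = zbud_map(pool, handle);
	int err = write_back_zpage(data);

	zbud_unmap(pool, handle);
	if (err)
		return err;		/* refuse eviction; the page is re-listed */

	zbud_free(pool, handle);	/* the success path must free the handle */
	return 0;
}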
470
471/**
472 * zbud_map() - maps the allocation associated with the given handle
473 * @pool: pool in which the allocation resides
474 * @handle: handle associated with the allocation to be mapped
475 *
476 * While trivial for zbud, the mapping functions for other allocators
477 * implementing this allocation API could have more complex information encoded
478 * in the handle and could create temporary mappings to make the data
479 * accessible to the user.
480 *
481 * Returns: a pointer to the mapped allocation
482 */
483void *zbud_map(struct zbud_pool *pool, unsigned long handle)
484{
485 return (void *)(handle);
486}
487
488/**
489 * zbud_unmap() - unmaps the allocation associated with the given handle
490 * @pool: pool in which the allocation resides
491 * @handle: handle associated with the allocation to be unmapped
492 */
493void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
494{
495}
496
497/**
498 * zbud_get_pool_size() - gets the zbud pool size in pages
499 * @pool: pool whose size is being queried
500 *
501 * Returns: size in pages of the given pool. The pool lock need not be
502 * taken to access pages_nr.
503 */
504u64 zbud_get_pool_size(struct zbud_pool *pool)
505{
506 return pool->pages_nr;
507}
508
509static int __init init_zbud(void)
510{
511 /* Make sure the zbud header will fit in one chunk */
512 BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
513 pr_info("loaded\n");
514 return 0;
515}
516
517static void __exit exit_zbud(void)
518{
519 pr_info("unloaded\n");
520}
521
522module_init(init_zbud);
523module_exit(exit_zbud);
524
525MODULE_LICENSE("GPL");
526MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
527MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zswap.c b/mm/zswap.c
new file mode 100644
index 000000000000..deda2b671e12
--- /dev/null
+++ b/mm/zswap.c
@@ -0,0 +1,943 @@
1/*
2 * zswap.c - zswap driver file
3 *
4 * zswap is a backend for frontswap that takes pages that are in the process
5 * of being swapped out and attempts to compress and store them in a
6 * RAM-based memory pool. This can result in a significant I/O reduction on
7 * the swap device and, in the case where decompressing from RAM is faster
8 * than reading from the swap device, can also improve workload performance.
9 *
10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21*/
22
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/highmem.h>
28#include <linux/slab.h>
29#include <linux/spinlock.h>
30#include <linux/types.h>
31#include <linux/atomic.h>
32#include <linux/frontswap.h>
33#include <linux/rbtree.h>
34#include <linux/swap.h>
35#include <linux/crypto.h>
36#include <linux/mempool.h>
37#include <linux/zbud.h>
38
39#include <linux/mm_types.h>
40#include <linux/page-flags.h>
41#include <linux/swapops.h>
42#include <linux/writeback.h>
43#include <linux/pagemap.h>
44
45/*********************************
46* statistics
47**********************************/
48/* Number of memory pages used by the compressed pool */
49static u64 zswap_pool_pages;
50/* The number of compressed pages currently stored in zswap */
51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52
53/*
54 * The statistics below are not protected from concurrent access for
55 * performance reasons, so they may not be 100% accurate. However,
56 * they do provide useful information on roughly how many times a
57 * certain event is occurring.
58*/
59
60/* Pool limit was hit (see zswap_max_pool_percent) */
61static u64 zswap_pool_limit_hit;
62/* Pages written back when pool limit was reached */
63static u64 zswap_written_back_pages;
64/* Store failed due to a reclaim failure after pool limit was reached */
65static u64 zswap_reject_reclaim_fail;
66/* Compressed page was too big for the allocator to (optimally) store */
67static u64 zswap_reject_compress_poor;
68/* Store failed because underlying allocator could not get memory */
69static u64 zswap_reject_alloc_fail;
70/* Store failed because the entry metadata could not be allocated (rare) */
71static u64 zswap_reject_kmemcache_fail;
72/* Duplicate store was encountered (rare) */
73static u64 zswap_duplicate_entry;
74
75/*********************************
76* tunables
77**********************************/
78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
79static bool zswap_enabled __read_mostly;
80module_param_named(enabled, zswap_enabled, bool, 0);
81
82/* Compressor to be used by zswap (fixed at boot for now) */
83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85module_param_named(compressor, zswap_compressor, charp, 0);
86
87/* The maximum percentage of memory that the compressed pool can occupy */
88static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644);
91
92/*********************************
93* compression functions
94**********************************/
95/* per-cpu compression transforms */
96static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
97
98enum comp_op {
99 ZSWAP_COMPOP_COMPRESS,
100 ZSWAP_COMPOP_DECOMPRESS
101};
102
103static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
104 u8 *dst, unsigned int *dlen)
105{
106 struct crypto_comp *tfm;
107 int ret;
108
109 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
110 switch (op) {
111 case ZSWAP_COMPOP_COMPRESS:
112 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
113 break;
114 case ZSWAP_COMPOP_DECOMPRESS:
115 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
116 break;
117 default:
118 ret = -EINVAL;
119 }
120
121 put_cpu();
122 return ret;
123}
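/*
 * Illustrative call sequence (editor's note): this sketch mirrors how the
 * store path later in this file drives the helper above, pairing it with
 * the per-cpu zswap_dstmem buffer set up in the per-cpu section.  That
 * buffer is allocated as 2 * PAGE_SIZE, presumably to leave headroom for
 * data that expands rather than compresses.
 *
 *   unsigned int dlen = PAGE_SIZE;
 *   u8 *dst = get_cpu_var(zswap_dstmem);
 *   u8 *src = kmap_atomic(page);
 *   int ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE,
 *                           dst, &dlen);
 *   kunmap_atomic(src);
 *   ...
 *   put_cpu_var(zswap_dstmem);
 */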
124
125static int __init zswap_comp_init(void)
126{
127 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
128 pr_info("%s compressor not available\n", zswap_compressor);
129 /* fall back to default compressor */
130 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
131 if (!crypto_has_comp(zswap_compressor, 0, 0))
132 /* can't even load the default compressor */
133 return -ENODEV;
134 }
135 pr_info("using %s compressor\n", zswap_compressor);
136
137 /* alloc percpu transforms */
138 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
139 if (!zswap_comp_pcpu_tfms)
140 return -ENOMEM;
141 return 0;
142}
143
144static void zswap_comp_exit(void)
145{
146 /* free percpu transforms */
147 if (zswap_comp_pcpu_tfms)
148 free_percpu(zswap_comp_pcpu_tfms);
149}
150
151/*********************************
152* data structures
153**********************************/
154/*
155 * struct zswap_entry
156 *
157 * This structure contains the metadata for tracking a single compressed
158 * page within zswap.
159 *
160 * rbnode - links the entry into the red-black tree for the appropriate swap type
161 * refcount - the number of outstanding references to the entry. This is needed
162 * to protect against premature freeing of the entry by concurrent
163 * calls to load, invalidate, and writeback. The lock
164 * for the zswap_tree structure that contains the entry must
165 * be held while changing the refcount. Since the lock must
166 * be held, there is no reason to also make refcount atomic.
167 * offset - the swap offset for the entry. Index into the red-black tree.
168 * handle - zbud allocation handle that stores the compressed page data
169 * length - the length in bytes of the compressed page data. Needed during
170 * decompression
171 */
172struct zswap_entry {
173 struct rb_node rbnode;
174 pgoff_t offset;
175 int refcount;
176 unsigned int length;
177 unsigned long handle;
178};
179
180struct zswap_header {
181 swp_entry_t swpentry;
182};
183
184/*
185 * The tree lock in the zswap_tree struct protects a few things:
186 * - the rbtree
187 * - the refcount field of each entry in the tree
188 */
189struct zswap_tree {
190 struct rb_root rbroot;
191 spinlock_t lock;
192 struct zbud_pool *pool;
193};
194
195static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
196
197/*********************************
198* zswap entry functions
199**********************************/
200static struct kmem_cache *zswap_entry_cache;
201
202static int zswap_entry_cache_create(void)
203{
204 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
205 return (zswap_entry_cache == NULL);
206}
207
208static void zswap_entry_cache_destroy(void)
209{
210 kmem_cache_destroy(zswap_entry_cache);
211}
212
213static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
214{
215 struct zswap_entry *entry;
216 entry = kmem_cache_alloc(zswap_entry_cache, gfp);
217 if (!entry)
218 return NULL;
219 entry->refcount = 1;
220 return entry;
221}
222
223static void zswap_entry_cache_free(struct zswap_entry *entry)
224{
225 kmem_cache_free(zswap_entry_cache, entry);
226}
227
228/* caller must hold the tree lock */
229static void zswap_entry_get(struct zswap_entry *entry)
230{
231 entry->refcount++;
232}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
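/*
 * Illustrative pattern (editor's note): the functions below take a
 * temporary reference like this, with every refcount change made under
 * tree->lock as required by the struct zswap_entry documentation above;
 * whoever drops the last reference frees the entry.
 *
 *   spin_lock(&tree->lock);
 *   entry = zswap_rb_search(&tree->rbroot, offset);
 *   if (entry)
 *           zswap_entry_get(entry);
 *   spin_unlock(&tree->lock);
 *
 *   ... use entry->handle and entry->length without the lock held ...
 *
 *   spin_lock(&tree->lock);
 *   refcount = zswap_entry_put(entry);
 *   spin_unlock(&tree->lock);
 *   if (refcount == 0)
 *           zswap_free_entry(tree, entry);
 */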
240
241/*********************************
242* rbtree functions
243**********************************/
244static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
245{
246 struct rb_node *node = root->rb_node;
247 struct zswap_entry *entry;
248
249 while (node) {
250 entry = rb_entry(node, struct zswap_entry, rbnode);
251 if (entry->offset > offset)
252 node = node->rb_left;
253 else if (entry->offset < offset)
254 node = node->rb_right;
255 else
256 return entry;
257 }
258 return NULL;
259}
260
261/*
262 * In the case that an entry with the same offset is found, a pointer to
263 * the existing entry is stored in dupentry and the function returns -EEXIST
264 */
265static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
266 struct zswap_entry **dupentry)
267{
268 struct rb_node **link = &root->rb_node, *parent = NULL;
269 struct zswap_entry *myentry;
270
271 while (*link) {
272 parent = *link;
273 myentry = rb_entry(parent, struct zswap_entry, rbnode);
274 if (myentry->offset > entry->offset)
275 link = &(*link)->rb_left;
276 else if (myentry->offset < entry->offset)
277 link = &(*link)->rb_right;
278 else {
279 *dupentry = myentry;
280 return -EEXIST;
281 }
282 }
283 rb_link_node(&entry->rbnode, parent, link);
284 rb_insert_color(&entry->rbnode, root);
285 return 0;
286}
287
288/*********************************
289* per-cpu code
290**********************************/
291static DEFINE_PER_CPU(u8 *, zswap_dstmem);
292
293static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
294{
295 struct crypto_comp *tfm;
296 u8 *dst;
297
298 switch (action) {
299 case CPU_UP_PREPARE:
300 tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
301 if (IS_ERR(tfm)) {
302 pr_err("can't allocate compressor transform\n");
303 return NOTIFY_BAD;
304 }
305 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
306 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
307 if (!dst) {
308 pr_err("can't allocate compressor buffer\n");
309 crypto_free_comp(tfm);
310 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
311 return NOTIFY_BAD;
312 }
313 per_cpu(zswap_dstmem, cpu) = dst;
314 break;
315 case CPU_DEAD:
316 case CPU_UP_CANCELED:
317 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
318 if (tfm) {
319 crypto_free_comp(tfm);
320 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
321 }
322 dst = per_cpu(zswap_dstmem, cpu);
323 kfree(dst);
324 per_cpu(zswap_dstmem, cpu) = NULL;
325 break;
326 default:
327 break;
328 }
329 return NOTIFY_OK;
330}
331
332static int zswap_cpu_notifier(struct notifier_block *nb,
333 unsigned long action, void *pcpu)
334{
335 unsigned long cpu = (unsigned long)pcpu;
336 return __zswap_cpu_notifier(action, cpu);
337}
338
339static struct notifier_block zswap_cpu_notifier_block = {
340 .notifier_call = zswap_cpu_notifier
341};
342
343static int zswap_cpu_init(void)
344{
345 unsigned long cpu;
346
347 get_online_cpus();
348 for_each_online_cpu(cpu)
349 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
350 goto cleanup;
351 register_cpu_notifier(&zswap_cpu_notifier_block);
352 put_online_cpus();
353 return 0;
354
355cleanup:
356 for_each_online_cpu(cpu)
357 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
358 put_online_cpus();
359 return -ENOMEM;
360}
361
362/*********************************
363* helpers
364**********************************/
365static bool zswap_is_full(void)
366{
367 return (totalram_pages * zswap_max_pool_percent / 100 <
368 zswap_pool_pages);
369}
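/*
 * Worked example of the limit check above (editor's note; the numbers
 * are illustrative assumptions): with 1048576 total RAM pages (4GB of
 * RAM and 4KB pages) and the default zswap_max_pool_percent of 20, the
 * pool counts as full once zswap_pool_pages exceeds
 * 1048576 * 20 / 100 = 209715 pages, roughly 800MB of pool memory.
 */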
370
371/*
372 * Carries out the common pattern of freeing an entry's zbud allocation,
373 * freeing the entry itself, and decrementing the number of stored pages.
374 */
375static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
376{
377 zbud_free(tree->pool, entry->handle);
378 zswap_entry_cache_free(entry);
379 atomic_dec(&zswap_stored_pages);
380 zswap_pool_pages = zbud_get_pool_size(tree->pool);
381}
382
383/*********************************
384* writeback code
385**********************************/
386/* return enum for zswap_get_swap_cache_page */
387enum zswap_get_swap_ret {
388 ZSWAP_SWAPCACHE_NEW,
389 ZSWAP_SWAPCACHE_EXIST,
390 ZSWAP_SWAPCACHE_NOMEM
391};
392
393/*
394 * zswap_get_swap_cache_page
395 *
396 * This is an adaptation of read_swap_cache_async()
397 *
398 * This function tries to find a page with the given swap entry
399 * in the swapper_space address space (the swap cache). If the page
400 * is found, it is returned in retpage. Otherwise, a page is allocated,
401 * added to the swap cache, and returned in retpage.
402 *
403 * On success, the swap cache page is returned in retpage.
404 * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache; the page is not locked
405 * Returns ZSWAP_SWAPCACHE_NEW if a new page was added to the swap cache and needs to be populated; the page is locked
406 * Returns ZSWAP_SWAPCACHE_NOMEM if the page could not be allocated or added to the swap cache
407 */
408static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage)
410{
411 struct page *found_page, *new_page = NULL;
412 struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
413 int err;
414
415 *retpage = NULL;
416 do {
417 /*
418 * First check the swap cache. Since this is normally
419 * called after lookup_swap_cache() failed, re-calling
420 * that would confuse statistics.
421 */
422 found_page = find_get_page(swapper_space, entry.val);
423 if (found_page)
424 break;
425
426 /*
427 * Get a new page to read into from swap.
428 */
429 if (!new_page) {
430 new_page = alloc_page(GFP_KERNEL);
431 if (!new_page)
432 break; /* Out of memory */
433 }
434
435 /*
436 * call radix_tree_preload() while we can wait.
437 */
438 err = radix_tree_preload(GFP_KERNEL);
439 if (err)
440 break;
441
442 /*
443 * Swap entry may have been freed since our caller observed it.
444 */
445 err = swapcache_prepare(entry);
446 if (err == -EEXIST) { /* seems racy */
447 radix_tree_preload_end();
448 continue;
449 }
450 if (err) { /* swp entry is obsolete ? */
451 radix_tree_preload_end();
452 break;
453 }
454
455 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
456 __set_page_locked(new_page);
457 SetPageSwapBacked(new_page);
458 err = __add_to_swap_cache(new_page, entry);
459 if (likely(!err)) {
460 radix_tree_preload_end();
461 lru_cache_add_anon(new_page);
462 *retpage = new_page;
463 return ZSWAP_SWAPCACHE_NEW;
464 }
465 radix_tree_preload_end();
466 ClearPageSwapBacked(new_page);
467 __clear_page_locked(new_page);
468 /*
469 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
470 * clear SWAP_HAS_CACHE flag.
471 */
472 swapcache_free(entry, NULL);
473 } while (err != -ENOMEM);
474
475 if (new_page)
476 page_cache_release(new_page);
477 if (!found_page)
478 return ZSWAP_SWAPCACHE_NOMEM;
479 *retpage = found_page;
480 return ZSWAP_SWAPCACHE_EXIST;
481}
482
483/*
484 * Attempts to free an entry by adding a page to the swap cache,
485 * decompressing the entry data into the page, and issuing a
486 * bio write to write the page back to the swap device.
487 *
488 * This can be thought of as a "resumed writeback" of the page
489 * to the swap device. We are basically resuming the same swap
490 * writeback path that was intercepted by frontswap_store()
491 * in the first place. After the page has been decompressed into
492 * the swap cache, the compressed version stored by zswap can be
493 * freed.
494 */
495static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
496{
497 struct zswap_header *zhdr;
498 swp_entry_t swpentry;
499 struct zswap_tree *tree;
500 pgoff_t offset;
501 struct zswap_entry *entry;
502 struct page *page;
503 u8 *src, *dst;
504 unsigned int dlen;
505 int ret, refcount;
506 struct writeback_control wbc = {
507 .sync_mode = WB_SYNC_NONE,
508 };
509
510 /* extract swpentry from data */
511 zhdr = zbud_map(pool, handle);
512 swpentry = zhdr->swpentry; /* here */
513 zbud_unmap(pool, handle);
514 tree = zswap_trees[swp_type(swpentry)];
515 offset = swp_offset(swpentry);
516 BUG_ON(pool != tree->pool);
517
518 /* find and ref zswap entry */
519 spin_lock(&tree->lock);
520 entry = zswap_rb_search(&tree->rbroot, offset);
521 if (!entry) {
522 /* entry was invalidated */
523 spin_unlock(&tree->lock);
524 return 0;
525 }
526 zswap_entry_get(entry);
527 spin_unlock(&tree->lock);
528 BUG_ON(offset != entry->offset);
529
530 /* try to allocate swap cache page */
531 switch (zswap_get_swap_cache_page(swpentry, &page)) {
532 case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
533 ret = -ENOMEM;
534 goto fail;
535
536 case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
537 /* page is already in the swap cache, ignore for now */
538 page_cache_release(page);
539 ret = -EEXIST;
540 goto fail;
541
542 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
543 /* decompress */
544 dlen = PAGE_SIZE;
545 src = (u8 *)zbud_map(tree->pool, entry->handle) +
546 sizeof(struct zswap_header);
547 dst = kmap_atomic(page);
548 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
549 entry->length, dst, &dlen);
550 kunmap_atomic(dst);
551 zbud_unmap(tree->pool, entry->handle);
552 BUG_ON(ret);
553 BUG_ON(dlen != PAGE_SIZE);
554
555 /* page is up to date */
556 SetPageUptodate(page);
557 }
558
559 /* start writeback */
560 __swap_writepage(page, &wbc, end_swap_bio_write);
561 page_cache_release(page);
562 zswap_written_back_pages++;
563
564 spin_lock(&tree->lock);
565
566 /* drop local reference */
567 zswap_entry_put(entry);
568 /* drop the initial reference from entry creation */
569 refcount = zswap_entry_put(entry);
570
571 /*
572 * There are three possible values for refcount here:
573 * (1) refcount is 1, load is in progress, unlink from rbtree,
574 * load will free
575 * (2) refcount is 0, (normal case) entry is valid,
576 * remove from rbtree and free entry
577 * (3) refcount is -1, invalidate happened during writeback,
578 * free entry
579 */
580 if (refcount >= 0) {
581 /* no invalidate yet, remove from rbtree */
582 rb_erase(&entry->rbnode, &tree->rbroot);
583 }
584 spin_unlock(&tree->lock);
585 if (refcount <= 0) {
586 /* free the entry */
587 zswap_free_entry(tree, entry);
588 return 0;
589 }
590 return -EAGAIN;
591
592fail:
593 spin_lock(&tree->lock);
594 zswap_entry_put(entry);
595 spin_unlock(&tree->lock);
596 return ret;
597}
598
599/*********************************
600* frontswap hooks
601**********************************/
602/* attempts to compress and store a single page */
603static int zswap_frontswap_store(unsigned type, pgoff_t offset,
604 struct page *page)
605{
606 struct zswap_tree *tree = zswap_trees[type];
607 struct zswap_entry *entry, *dupentry;
608 int ret;
609 unsigned int dlen = PAGE_SIZE, len;
610 unsigned long handle;
611 char *buf;
612 u8 *src, *dst;
613 struct zswap_header *zhdr;
614
615 if (!tree) {
616 ret = -ENODEV;
617 goto reject;
618 }
619
620 /* reclaim space if needed */
621 if (zswap_is_full()) {
622 zswap_pool_limit_hit++;
623 if (zbud_reclaim_page(tree->pool, 8)) {
624 zswap_reject_reclaim_fail++;
625 ret = -ENOMEM;
626 goto reject;
627 }
628 }
629
630 /* allocate entry */
631 entry = zswap_entry_cache_alloc(GFP_KERNEL);
632 if (!entry) {
633 zswap_reject_kmemcache_fail++;
634 ret = -ENOMEM;
635 goto reject;
636 }
637
638 /* compress */
639 dst = get_cpu_var(zswap_dstmem);
640 src = kmap_atomic(page);
641 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
642 kunmap_atomic(src);
643 if (ret) {
644 ret = -EINVAL;
645 goto freepage;
646 }
647
648 /* store */
649 len = dlen + sizeof(struct zswap_header);
650 ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
651 &handle);
652 if (ret == -ENOSPC) {
653 zswap_reject_compress_poor++;
654 goto freepage;
655 }
656 if (ret) {
657 zswap_reject_alloc_fail++;
658 goto freepage;
659 }
660 zhdr = zbud_map(tree->pool, handle);
661 zhdr->swpentry = swp_entry(type, offset);
662 buf = (u8 *)(zhdr + 1);
663 memcpy(buf, dst, dlen);
664 zbud_unmap(tree->pool, handle);
665 put_cpu_var(zswap_dstmem);
666
667 /* populate entry */
668 entry->offset = offset;
669 entry->handle = handle;
670 entry->length = dlen;
671
672 /* map */
673 spin_lock(&tree->lock);
674 do {
675 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
676 if (ret == -EEXIST) {
677 zswap_duplicate_entry++;
678 /* remove from rbtree */
679 rb_erase(&dupentry->rbnode, &tree->rbroot);
680 if (!zswap_entry_put(dupentry)) {
681 /* free */
682 zswap_free_entry(tree, dupentry);
683 }
684 }
685 } while (ret == -EEXIST);
686 spin_unlock(&tree->lock);
687
688 /* update stats */
689 atomic_inc(&zswap_stored_pages);
690 zswap_pool_pages = zbud_get_pool_size(tree->pool);
691
692 return 0;
693
694freepage:
695 put_cpu_var(zswap_dstmem);
696 zswap_entry_cache_free(entry);
697reject:
698 return ret;
699}
700
701/*
702 * returns 0 if the page was successfully decompressed,
703 * or -1 if the entry was not found or on error
704*/
705static int zswap_frontswap_load(unsigned type, pgoff_t offset,
706 struct page *page)
707{
708 struct zswap_tree *tree = zswap_trees[type];
709 struct zswap_entry *entry;
710 u8 *src, *dst;
711 unsigned int dlen;
712 int refcount, ret;
713
714 /* find */
715 spin_lock(&tree->lock);
716 entry = zswap_rb_search(&tree->rbroot, offset);
717 if (!entry) {
718 /* entry was written back */
719 spin_unlock(&tree->lock);
720 return -1;
721 }
722 zswap_entry_get(entry);
723 spin_unlock(&tree->lock);
724
725 /* decompress */
726 dlen = PAGE_SIZE;
727 src = (u8 *)zbud_map(tree->pool, entry->handle) +
728 sizeof(struct zswap_header);
729 dst = kmap_atomic(page);
730 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
731 dst, &dlen);
732 kunmap_atomic(dst);
733 zbud_unmap(tree->pool, entry->handle);
734 BUG_ON(ret);
735
736 spin_lock(&tree->lock);
737 refcount = zswap_entry_put(entry);
738 if (likely(refcount)) {
739 spin_unlock(&tree->lock);
740 return 0;
741 }
742 spin_unlock(&tree->lock);
743
744 /*
745 * We don't have to unlink from the rbtree because
746 * zswap_writeback_entry() or zswap_frontswap_invalidate_page()
747 * has already done this for us if we are the last reference.
748 */
749 /* free */
750
751 zswap_free_entry(tree, entry);
752
753 return 0;
754}
755
756/* frees an entry in zswap */
757static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{
759 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry;
761 int refcount;
762
763 /* find */
764 spin_lock(&tree->lock);
765 entry = zswap_rb_search(&tree->rbroot, offset);
766 if (!entry) {
767 /* entry was written back */
768 spin_unlock(&tree->lock);
769 return;
770 }
771
772 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot);
774
775 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry);
777
778 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787}
788
789/* frees all zswap entries for the given swap type */
790static void zswap_frontswap_invalidate_area(unsigned type)
791{
792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node;
794 struct zswap_entry *entry;
795
796 if (!tree)
797 return;
798
799 /* walk the tree and free everything */
800 spin_lock(&tree->lock);
801 /*
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages);
816 }
817 tree->rbroot = RB_ROOT;
818 spin_unlock(&tree->lock);
819}
820
821static struct zbud_ops zswap_zbud_ops = {
822 .evict = zswap_writeback_entry
823};
824
825static void zswap_frontswap_init(unsigned type)
826{
827 struct zswap_tree *tree;
828
829 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
830 if (!tree)
831 goto err;
832 tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
833 if (!tree->pool)
834 goto freetree;
835 tree->rbroot = RB_ROOT;
836 spin_lock_init(&tree->lock);
837 zswap_trees[type] = tree;
838 return;
839
840freetree:
841 kfree(tree);
842err:
843 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
844}
845
846static struct frontswap_ops zswap_frontswap_ops = {
847 .store = zswap_frontswap_store,
848 .load = zswap_frontswap_load,
849 .invalidate_page = zswap_frontswap_invalidate_page,
850 .invalidate_area = zswap_frontswap_invalidate_area,
851 .init = zswap_frontswap_init
852};
853
854/*********************************
855* debugfs functions
856**********************************/
857#ifdef CONFIG_DEBUG_FS
858#include <linux/debugfs.h>
859
860static struct dentry *zswap_debugfs_root;
861
862static int __init zswap_debugfs_init(void)
863{
864 if (!debugfs_initialized())
865 return -ENODEV;
866
867 zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
868 if (!zswap_debugfs_root)
869 return -ENOMEM;
870
871 debugfs_create_u64("pool_limit_hit", S_IRUGO,
872 zswap_debugfs_root, &zswap_pool_limit_hit);
873 debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
874 zswap_debugfs_root, &zswap_reject_reclaim_fail);
875 debugfs_create_u64("reject_alloc_fail", S_IRUGO,
876 zswap_debugfs_root, &zswap_reject_alloc_fail);
877 debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
878 zswap_debugfs_root, &zswap_reject_kmemcache_fail);
879 debugfs_create_u64("reject_compress_poor", S_IRUGO,
880 zswap_debugfs_root, &zswap_reject_compress_poor);
881 debugfs_create_u64("written_back_pages", S_IRUGO,
882 zswap_debugfs_root, &zswap_written_back_pages);
883 debugfs_create_u64("duplicate_entry", S_IRUGO,
884 zswap_debugfs_root, &zswap_duplicate_entry);
885 debugfs_create_u64("pool_pages", S_IRUGO,
886 zswap_debugfs_root, &zswap_pool_pages);
887 debugfs_create_atomic_t("stored_pages", S_IRUGO,
888 zswap_debugfs_root, &zswap_stored_pages);
889
890 return 0;
891}
892
893static void __exit zswap_debugfs_exit(void)
894{
895 debugfs_remove_recursive(zswap_debugfs_root);
896}
897#else
898static int __init zswap_debugfs_init(void)
899{
900 return 0;
901}
902
903static void __exit zswap_debugfs_exit(void) { }
904#endif
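/*
 * Illustrative usage (editor's note): with CONFIG_DEBUG_FS enabled and
 * debugfs mounted in its conventional location (an assumption about the
 * system setup, not something this file controls), the counters created
 * above can be read from userspace:
 *
 *   cat /sys/kernel/debug/zswap/pool_pages
 *   cat /sys/kernel/debug/zswap/stored_pages
 */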
905
906/*********************************
907* module init and exit
908**********************************/
909static int __init init_zswap(void)
910{
911 if (!zswap_enabled)
912 return 0;
913
914 pr_info("loading zswap\n");
915 if (zswap_entry_cache_create()) {
916 pr_err("entry cache creation failed\n");
917 goto error;
918 }
919 if (zswap_comp_init()) {
920 pr_err("compressor initialization failed\n");
921 goto compfail;
922 }
923 if (zswap_cpu_init()) {
924 pr_err("per-cpu initialization failed\n");
925 goto pcpufail;
926 }
927 frontswap_register_ops(&zswap_frontswap_ops);
928 if (zswap_debugfs_init())
929 pr_warn("debugfs initialization failed\n");
930 return 0;
931pcpufail:
932 zswap_comp_exit();
933compfail:
934 zswap_entry_cache_destroy();
935error:
936 return -ENOMEM;
937}
938/* must be late so crypto has time to come up */
939late_initcall(init_zswap);
940
941MODULE_LICENSE("GPL");
942MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
943MODULE_DESCRIPTION("Compressed cache for swap pages");