path: root/mm
author    Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
committer Sage Weil <sage@inktank.com>    2013-08-15 14:11:45 -0400
commit    ee3e542fec6e69bc9fb668698889a37d93950ddf (patch)
tree      e74ee766a4764769ef1d3d45d266b4dea64101d3 /mm
parent    fe2a801b50c0bb8039d627e5ae1fec249d10ff39 (diff)
parent    f1d6e17f540af37bb1891480143669ba7636c4cf (diff)
Merge remote-tracking branch 'linus/master' into testing
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |  44
-rw-r--r--  mm/Makefile           |   2
-rw-r--r--  mm/backing-dev.c      |   7
-rw-r--r--  mm/bootmem.c          |  39
-rw-r--r--  mm/filemap.c          |   6
-rw-r--r--  mm/fremap.c           |  11
-rw-r--r--  mm/huge_memory.c      |  34
-rw-r--r--  mm/hugetlb.c          | 240
-rw-r--r--  mm/internal.h         |   5
-rw-r--r--  mm/memblock.c         |   2
-rw-r--r--  mm/memcontrol.c       | 370
-rw-r--r--  mm/memory-failure.c   |  22
-rw-r--r--  mm/memory.c           |  38
-rw-r--r--  mm/memory_hotplug.c   | 139
-rw-r--r--  mm/mempolicy.c        |   6
-rw-r--r--  mm/mm_init.c          |  47
-rw-r--r--  mm/mmap.c             |  42
-rw-r--r--  mm/mmu_notifier.c     |   2
-rw-r--r--  mm/mremap.c           |  20
-rw-r--r--  mm/nobootmem.c        |  35
-rw-r--r--  mm/nommu.c            |  10
-rw-r--r--  mm/page-writeback.c   |   4
-rw-r--r--  mm/page_alloc.c       | 384
-rw-r--r--  mm/page_io.c          |  50
-rw-r--r--  mm/pgtable-generic.c  |   5
-rw-r--r--  mm/readahead.c        |   2
-rw-r--r--  mm/rmap.c             |  23
-rw-r--r--  mm/shmem.c            |  46
-rw-r--r--  mm/slab.c             |  61
-rw-r--r--  mm/slab.h             |   3
-rw-r--r--  mm/slab_common.c      |  18
-rw-r--r--  mm/slob.c             |   4
-rw-r--r--  mm/slub.c             |  39
-rw-r--r--  mm/sparse.c           |   8
-rw-r--r--  mm/swap.c             | 135
-rw-r--r--  mm/swapfile.c         |  74
-rw-r--r--  mm/truncate.c         | 117
-rw-r--r--  mm/util.c             |   1
-rw-r--r--  mm/vmalloc.c          | 164
-rw-r--r--  mm/vmpressure.c       |  28
-rw-r--r--  mm/vmscan.c           | 605
-rw-r--r--  mm/vmstat.c           |   6
-rw-r--r--  mm/zbud.c             | 527
-rw-r--r--  mm/zswap.c            | 943
44 files changed, 3303 insertions, 1065 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e742d06285b7..8028dcc6615c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -173,7 +173,7 @@ config HAVE_BOOTMEM_INFO_NODE
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
 	depends on SPARSEMEM || X86_64_ACPI_NUMA
-	depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
+	depends on ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
 
 config MEMORY_HOTPLUG_SPARSE
@@ -477,3 +477,45 @@ config FRONTSWAP
 	  and swap data is stored as normal on the matching swap device.
 
 	  If unsure, say Y to enable frontswap.
+
+config ZBUD
+	tristate
+	default n
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It is designed to store up to two compressed pages per physical
+	  page. While this design limits storage density, it has simple and
+	  deterministic reclaim properties that make it preferable to a higher
+	  density approach when reclaim will be used.
+
+config ZSWAP
+	bool "Compressed cache for swap pages (EXPERIMENTAL)"
+	depends on FRONTSWAP && CRYPTO=y
+	select CRYPTO_LZO
+	select ZBUD
+	default n
+	help
+	  A lightweight compressed cache for swap pages. It takes
+	  pages that are in the process of being swapped out and attempts to
+	  compress them into a dynamically allocated RAM-based memory pool.
+	  This can result in a significant I/O reduction on swap device and,
+	  in the case where decompressing from RAM is faster that swap device
+	  reads, can also improve workload performance.
+
+	  This is marked experimental because it is a new feature (as of
+	  v3.11) that interacts heavily with memory reclaim. While these
+	  interactions don't cause any known issues on simple memory setups,
+	  they have not be fully explored on the large set of potential
+	  configurations and workloads that exist.
+
+config MEM_SOFT_DIRTY
+	bool "Track memory changes"
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	select PROC_PAGE_MONITOR
+	help
+	  This option enables memory changes tracking by introducing a
+	  soft-dirty bit on pte-s. This bit it set when someone writes
+	  into a page just as regular dirty bit, but unlike the latter
+	  it can be cleared by hands.
+
+	  See Documentation/vm/soft-dirty.txt for more details.
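The MEM_SOFT_DIRTY help text above defers to Documentation/vm/soft-dirty.txt for the userspace side. As a rough, hedged illustration (not part of this commit; the helper names below are hypothetical), that document describes writing "4" to /proc/<pid>/clear_refs to clear the soft-dirty bits and reading bit 55 of each 64-bit /proc/<pid>/pagemap entry to see which pages were written afterwards:

	/* Sketch only: the soft-dirty userspace interface as described in
	 * Documentation/vm/soft-dirty.txt (not taken from this commit). */
	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	static void clear_soft_dirty(void)
	{
		int fd = open("/proc/self/clear_refs", O_WRONLY);

		if (fd >= 0) {
			write(fd, "4", 1);	/* "4" clears the soft-dirty bits */
			close(fd);
		}
	}

	static int page_was_written(unsigned long vaddr)
	{
		uint64_t ent = 0;
		long psize = sysconf(_SC_PAGESIZE);
		int fd = open("/proc/self/pagemap", O_RDONLY);

		if (fd < 0)
			return -1;
		/* one 64-bit entry per page; soft-dirty is bit 55 */
		if (pread(fd, &ent, sizeof(ent),
			  (vaddr / psize) * sizeof(ent)) != sizeof(ent))
			ent = 0;
		close(fd);
		return (int)((ent >> 55) & 1);
	}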
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..f00803386a67 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 obj-$(CONFIG_BOUNCE) += bounce.o
 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
 obj-$(CONFIG_FRONTSWAP) += frontswap.o
+obj-$(CONFIG_ZSWAP) += zswap.o
 obj-$(CONFIG_HAS_DMA) += dmapool.o
 obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
@@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZBUD) += zbud.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 502517492258..e04454cdb33f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -232,8 +232,6 @@ static ssize_t stable_pages_required_show(struct device *dev,
 			bdi_cap_stable_pages_required(bdi) ? 1 : 0);
 }
 
-#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
-
 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
@@ -515,7 +513,6 @@ EXPORT_SYMBOL(bdi_destroy);
 int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
 			   unsigned int cap)
 {
-	char tmp[32];
 	int err;
 
 	bdi->name = name;
@@ -524,8 +521,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
 	if (err)
 		return err;
 
-	sprintf(tmp, "%.28s%s", name, "-%d");
-	err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+	err = bdi_register(bdi, NULL, "%.28s-%ld", name,
+			   atomic_long_inc_return(&bdi_seq));
 	if (err) {
 		bdi_destroy(bdi);
 		return err;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2b0bcb019ec2..6ab7744e692e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	return count;
 }
 
-static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+static int reset_managed_pages_done __initdata;
+
+static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
 {
 	struct zone *z;
 
-	/*
-	 * In free_area_init_core(), highmem zone's managed_pages is set to
-	 * present_pages, and bootmem allocator doesn't allocate from highmem
-	 * zones. So there's no need to recalculate managed_pages because all
-	 * highmem pages will be managed by the buddy system. Here highmem
-	 * zone also includes highmem movable zone.
-	 */
+	if (reset_managed_pages_done)
+		return;
+
 	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
-		if (!is_highmem(z))
-			z->managed_pages = 0;
+		z->managed_pages = 0;
 }
 
-/**
- * free_all_bootmem_node - release a node's free pages to the buddy allocator
- * @pgdat: node to be released
- *
- * Returns the number of pages actually released.
- */
-unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
+void __init reset_all_zones_managed_pages(void)
 {
-	register_page_bootmem_info_node(pgdat);
-	reset_node_lowmem_managed_pages(pgdat);
-	return free_all_bootmem_core(pgdat->bdata);
+	struct pglist_data *pgdat;
+
+	for_each_online_pgdat(pgdat)
+		reset_node_managed_pages(pgdat);
+	reset_managed_pages_done = 1;
 }
 
 /**
@@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void)
 {
 	unsigned long total_pages = 0;
 	bootmem_data_t *bdata;
-	struct pglist_data *pgdat;
 
-	for_each_online_pgdat(pgdat)
-		reset_node_lowmem_managed_pages(pgdat);
+	reset_all_zones_managed_pages();
 
 	list_for_each_entry(bdata, &bdata_list, list)
 		total_pages += free_all_bootmem_core(bdata);
 
+	totalram_pages += total_pages;
+
 	return total_pages;
 }
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe721aa8..4b51ac1acae7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (!ra->ra_pages)
 		return;
 
-	if (VM_SequentialReadHint(vma)) {
+	if (vma->vm_flags & VM_SEQ_READ) {
 		page_cache_sync_readahead(mapping, ra, file, offset,
 					  ra->ra_pages);
 		return;
@@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
 	struct address_space *mapping = file->f_mapping;
 
 	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
+	if (vma->vm_flags & VM_RAND_READ)
 		return;
 	if (ra->mmap_miss > 0)
 		ra->mmap_miss--;
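For context (not part of this commit), the VM_RAND_READ and VM_SEQ_READ flags tested above are the per-VMA readahead hints that userspace normally sets through madvise(); a minimal sketch:

	/* Sketch only: how the readahead hints consulted in
	 * do_sync_mmap_readahead()/do_async_mmap_readahead() are usually set. */
	#include <stddef.h>
	#include <sys/mman.h>

	static void hint_random(void *addr, size_t len)
	{
		madvise(addr, len, MADV_RANDOM);	/* sets VM_RAND_READ */
	}

	static void hint_sequential(void *addr, size_t len)
	{
		madvise(addr, len, MADV_SEQUENTIAL);	/* sets VM_SEQ_READ */
	}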
diff --git a/mm/fremap.c b/mm/fremap.c
index 87da3590c61e..5bff08147768 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -57,17 +57,22 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long addr, unsigned long pgoff, pgprot_t prot)
 {
 	int err = -ENOMEM;
-	pte_t *pte;
+	pte_t *pte, ptfile;
 	spinlock_t *ptl;
 
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
 		goto out;
 
-	if (!pte_none(*pte))
+	ptfile = pgoff_to_pte(pgoff);
+
+	if (!pte_none(*pte)) {
+		if (pte_present(*pte) && pte_soft_dirty(*pte))
+			pte_file_mksoft_dirty(ptfile);
 		zap_pte(mm, vma, addr, pte);
+	}
 
-	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
+	set_pte_at(mm, addr, pte, ptfile);
 	/*
 	 * We don't need to run update_mmu_cache() here because the "file pte"
 	 * being installed by install_file_pte() is not a real pte - it's a
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 362c329b83fe..a92012a71702 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		pmd_t entry;
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
+		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
-		pgtable_trans_huge_deposit(mm, pgtable);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 		mm->nr_ptes++;
 		spin_unlock(&mm->page_table_lock);
@@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 	entry = mk_pmd(zero_page, vma->vm_page_prot);
 	entry = pmd_wrprotect(entry);
 	entry = pmd_mkhuge(entry);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, haddr, pmd, entry);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	mm->nr_ptes++;
 	return true;
 }
@@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
+	pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 		 * young bit, instead of the current set_pmd_at.
 		 */
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
-		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
+		if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
+					  pmd, _pmd, 1))
+			update_mmu_cache_pmd(vma, addr, pmd);
 	}
 	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
@@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		struct page *page;
 		pgtable_t pgtable;
 		pmd_t orig_pmd;
-		pgtable = pgtable_trans_huge_withdraw(tlb->mm);
+		/*
+		 * For architectures like ppc64 we look at deposited pgtable
+		 * when calling pmdp_get_and_clear. So do the
+		 * pgtable_trans_huge_withdraw after finishing pmdp related
+		 * operations.
+		 */
 		orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
 		tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+		pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
 		if (is_huge_zero_pmd(orig_pmd)) {
 			tlb->mm->nr_ptes--;
 			spin_unlock(&tlb->mm->page_table_lock);
@@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 	if (ret == 1) {
 		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));
-		set_pmd_at(mm, new_addr, new_pmd, pmd);
+		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 		spin_unlock(&mm->page_table_lock);
 	}
 out:
@@ -1612,7 +1620,9 @@ static void __split_huge_page_refcount(struct page *page,
 				      ((1L << PG_referenced) |
 				       (1L << PG_swapbacked) |
 				       (1L << PG_mlocked) |
-				       (1L << PG_uptodate)));
+				       (1L << PG_uptodate) |
+				       (1L << PG_active) |
+				       (1L << PG_unevictable)));
 		page_tail->flags |= (1L << PG_dirty);
 
 		/* clear PageTail before overwriting first_page */
@@ -1691,7 +1701,7 @@ static int __split_huge_page_map(struct page *page,
 	pmd = page_check_address_pmd(page, mm, address,
 				     PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
 	if (pmd) {
-		pgtable = pgtable_trans_huge_withdraw(mm);
+		pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 		pmd_populate(mm, &_pmd, pgtable);
 
 		haddr = address;
@@ -2359,9 +2369,9 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(&mm->page_table_lock);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
+	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
-	pgtable_trans_huge_deposit(mm, pgtable);
 	spin_unlock(&mm->page_table_lock);
 
 	*hpage = NULL;
@@ -2667,7 +2677,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
 	pmdp_clear_flush(vma, haddr, pmd);
 	/* leave pmd empty until pte is filled */
 
-	pgtable = pgtable_trans_huge_withdraw(mm);
+	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
 	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e2bfbf73a551..83aff0a4d093 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
 
 	hstate = hstate_vma(vma);
 
-	return 1UL << (hstate->order + PAGE_SHIFT);
+	return 1UL << huge_page_shift(hstate);
 }
 EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
 
@@ -690,6 +690,23 @@ int PageHuge(struct page *page)
 }
 EXPORT_SYMBOL_GPL(PageHuge);
 
+pgoff_t __basepage_index(struct page *page)
+{
+	struct page *page_head = compound_head(page);
+	pgoff_t index = page_index(page_head);
+	unsigned long compound_idx;
+
+	if (!PageHuge(page_head))
+		return page_index(page);
+
+	if (compound_order(page_head) >= MAX_ORDER)
+		compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
+	else
+		compound_idx = page - page_head;
+
+	return (index << compound_order(page_head)) + compound_idx;
+}
+
 static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 {
 	struct page *page;
@@ -1246,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void)
 		 * side-effects, like CommitLimit going negative.
 		 */
 		if (h->order > (MAX_ORDER - 1))
-			totalram_pages += 1 << h->order;
+			adjust_managed_page_count(page, 1 << h->order);
 	}
 }
 
@@ -2931,15 +2948,6 @@ out_mutex:
 	return ret;
 }
 
-/* Can be overriden by architectures */
-__attribute__((weak)) struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
-		pud_t *pud, int write)
-{
-	BUG();
-	return NULL;
-}
-
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			 struct page **pages, struct vm_area_struct **vmas,
 			 unsigned long *position, unsigned long *nr_pages,
@@ -3169,6 +3177,216 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+				struct vm_area_struct *vma,
+				unsigned long addr, pgoff_t idx)
+{
+	unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+				svma->vm_start;
+	unsigned long sbase = saddr & PUD_MASK;
+	unsigned long s_end = sbase + PUD_SIZE;
+
+	/* Allow segments to share if only one is marked locked */
+	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
+	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+
+	/*
+	 * match the virtual addresses, permission and the alignment of the
+	 * page table page.
+	 */
+	if (pmd_index(addr) != pmd_index(saddr) ||
+	    vm_flags != svm_flags ||
+	    sbase < svma->vm_start || svma->vm_end < s_end)
+		return 0;
+
+	return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long base = addr & PUD_MASK;
+	unsigned long end = base + PUD_SIZE;
+
+	/*
+	 * check on proper vm_flags and page table alignment
+	 */
+	if (vma->vm_flags & VM_MAYSHARE &&
+	    vma->vm_start <= base && end <= vma->vm_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
+ * and returns the corresponding pte. While this is not necessary for the
+ * !shared pmd case because we can allocate the pmd later as well, it makes the
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_mutex section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
+ */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+	struct vm_area_struct *vma = find_vma(mm, addr);
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+			vma->vm_pgoff;
+	struct vm_area_struct *svma;
+	unsigned long saddr;
+	pte_t *spte = NULL;
+	pte_t *pte;
+
+	if (!vma_shareable(vma, addr))
+		return (pte_t *)pmd_alloc(mm, pud, addr);
+
+	mutex_lock(&mapping->i_mmap_mutex);
+	vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
+		if (svma == vma)
+			continue;
+
+		saddr = page_table_shareable(svma, vma, addr, idx);
+		if (saddr) {
+			spte = huge_pte_offset(svma->vm_mm, saddr);
+			if (spte) {
+				get_page(virt_to_page(spte));
+				break;
+			}
+		}
+	}
+
+	if (!spte)
+		goto out;
+
+	spin_lock(&mm->page_table_lock);
+	if (pud_none(*pud))
+		pud_populate(mm, pud,
+				(pmd_t *)((unsigned long)spte & PAGE_MASK));
+	else
+		put_page(virt_to_page(spte));
+	spin_unlock(&mm->page_table_lock);
+out:
+	pte = (pte_t *)pmd_alloc(mm, pud, addr);
+	mutex_unlock(&mapping->i_mmap_mutex);
+	return pte;
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping. If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *	    0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+	pgd_t *pgd = pgd_offset(mm, *addr);
+	pud_t *pud = pud_offset(pgd, *addr);
+
+	BUG_ON(page_count(virt_to_page(ptep)) == 0);
+	if (page_count(virt_to_page(ptep)) == 1)
+		return 0;
+
+	pud_clear(pud);
+	put_page(virt_to_page(ptep));
+	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+	return 1;
+}
+#define want_pmd_share() (1)
+#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+	return NULL;
+}
+#define want_pmd_share() (0)
+#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+pte_t *huge_pte_alloc(struct mm_struct *mm,
+			unsigned long addr, unsigned long sz)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pte_t *pte = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	pud = pud_alloc(mm, pgd, addr);
+	if (pud) {
+		if (sz == PUD_SIZE) {
+			pte = (pte_t *)pud;
+		} else {
+			BUG_ON(sz != PMD_SIZE);
+			if (want_pmd_share() && pud_none(*pud))
+				pte = huge_pmd_share(mm, addr, pud);
+			else
+				pte = (pte_t *)pmd_alloc(mm, pud, addr);
+		}
+	}
+	BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
+
+	return pte;
+}
+
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd = NULL;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_present(*pgd)) {
+		pud = pud_offset(pgd, addr);
+		if (pud_present(*pud)) {
+			if (pud_huge(*pud))
+				return (pte_t *)pud;
+			pmd = pmd_offset(pud, addr);
+		}
+	}
+	return (pte_t *) pmd;
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+		pmd_t *pmd, int write)
+{
+	struct page *page;
+
+	page = pte_page(*(pte_t *)pmd);
+	if (page)
+		page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
+	return page;
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+		pud_t *pud, int write)
+{
+	struct page *page;
+
+	page = pte_page(*(pte_t *)pud);
+	if (page)
+		page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+	return page;
+}
+
+#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
+/* Can be overriden by architectures */
+__attribute__((weak)) struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+		pud_t *pud, int write)
+{
+	BUG();
+	return NULL;
+}
+
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+
 #ifdef CONFIG_MEMORY_FAILURE
 
 /* Should be called in hugetlb_lock */
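As a worked illustration of the __basepage_index() arithmetic added above (hypothetical numbers, not from the commit): for a 2 MB huge page, compound_order(page_head) is 9, so a head page at page-cache index 3 maps the base page 5 pages into the compound page to index (3 << 9) + 5 = 1541. A self-contained sketch of the same calculation:

	/* Sketch of the index arithmetic in __basepage_index(), with
	 * hypothetical values for an order-9 (2 MB) huge page. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long head_index = 3;	/* page_index(page_head) */
		unsigned long order = 9;	/* compound_order(page_head) */
		unsigned long compound_idx = 5;	/* page - page_head */

		printf("%lu\n", (head_index << order) + compound_idx); /* 1541 */
		return 0;
	}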
diff --git a/mm/internal.h b/mm/internal.h
index 8562de0a5197..4390ac6c106e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page)
 	set_page_count(page, 1);
 }
 
-static inline void __put_page(struct page *page)
-{
-	atomic_dec(&page->_count);
-}
-
 static inline void __get_page_tail_foll(struct page *page,
 					bool get_page_head)
 {
diff --git a/mm/memblock.c b/mm/memblock.c
index c5fad932fa51..a847bfe6f3ba 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 /**
  * __next_free_mem_range - next function for for_each_free_mem_range()
  * @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %MAX_NUMNODES for all nodes
  * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
  * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
  * @out_nid: ptr to int for nid of the range, can be %NULL
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 194721839cf5..c5792a5d87ce 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
 	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
 };
 
-struct mem_cgroup_lru_info {
-	struct mem_cgroup_per_node *nodeinfo[0];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -267,28 +263,10 @@ struct mem_cgroup {
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
 
-	union {
-		/*
-		 * the counter to account for mem+swap usage.
-		 */
-		struct res_counter memsw;
-
-		/*
-		 * rcu_freeing is used only when freeing struct mem_cgroup,
-		 * so put it into a union to avoid wasting more memory.
-		 * It must be disjoint from the css field. It could be
-		 * in a union with the res field, but res plays a much
-		 * larger part in mem_cgroup life than memsw, and might
-		 * be of interest, even at time of free, when debugging.
-		 * So share rcu_head with the less interesting memsw.
-		 */
-		struct rcu_head rcu_freeing;
-		/*
-		 * We also need some space for a worker in deferred freeing.
-		 * By the time we call it, rcu_freeing is no longer in use.
-		 */
-		struct work_struct work_freeing;
-	};
+	/*
+	 * the counter to account for mem+swap usage.
+	 */
+	struct res_counter memsw;
 
 	/*
 	 * the counter to account for kernel memory usage.
@@ -303,8 +281,6 @@ struct mem_cgroup {
 	bool oom_lock;
 	atomic_t under_oom;
 
-	atomic_t refcnt;
-
 	int swappiness;
 	/* OOM-Killer disable */
 	int oom_kill_disable;
@@ -366,14 +342,8 @@ struct mem_cgroup {
 	atomic_t numainfo_updating;
 #endif
 
-	/*
-	 * Per cgroup active and inactive list, similar to the
-	 * per zone LRU lists.
-	 *
-	 * WARNING: This has to be the last element of the struct. Don't
-	 * add new fields after this point.
-	 */
-	struct mem_cgroup_lru_info info;
+	struct mem_cgroup_per_node *nodeinfo[0];
+	/* WARNING: nodeinfo must be the last member here */
 };
 
 static size_t memcg_size(void)
@@ -416,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
 
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
+	/*
+	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
+	 * will call css_put() if it sees the memcg is dead.
+	 */
+	smp_wmb();
 	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
 		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 }
@@ -508,9 +483,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static void mem_cgroup_get(struct mem_cgroup *memcg);
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 {
@@ -561,15 +533,15 @@ void sock_update_memcg(struct sock *sk)
 	 */
 	if (sk->sk_cgrp) {
 		BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
-		mem_cgroup_get(sk->sk_cgrp->memcg);
+		css_get(&sk->sk_cgrp->memcg->css);
 		return;
 	}
 
 	rcu_read_lock();
 	memcg = mem_cgroup_from_task(current);
 	cg_proto = sk->sk_prot->proto_cgroup(memcg);
-	if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
-		mem_cgroup_get(memcg);
+	if (!mem_cgroup_is_root(memcg) &&
+	    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
 		sk->sk_cgrp = cg_proto;
 	}
 	rcu_read_unlock();
@@ -583,7 +555,7 @@ void sock_release_memcg(struct sock *sk)
 		struct mem_cgroup *memcg;
 		WARN_ON(!sk->sk_cgrp->memcg);
 		memcg = sk->sk_cgrp->memcg;
-		mem_cgroup_put(memcg);
+		css_put(&sk->sk_cgrp->memcg->css);
 	}
 }
 
@@ -683,7 +655,7 @@ static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
 {
 	VM_BUG_ON((unsigned)nid >= nr_node_ids);
-	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
+	return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -1148,6 +1120,58 @@ skip_node:
 	return NULL;
 }
 
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+{
+	/*
+	 * When a group in the hierarchy below root is destroyed, the
+	 * hierarchy iterator can no longer be trusted since it might
+	 * have pointed to the destroyed group. Invalidate it.
+	 */
+	atomic_inc(&root->dead_count);
+}
+
+static struct mem_cgroup *
+mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
+		     struct mem_cgroup *root,
+		     int *sequence)
+{
+	struct mem_cgroup *position = NULL;
+	/*
+	 * A cgroup destruction happens in two stages: offlining and
+	 * release. They are separated by a RCU grace period.
+	 *
+	 * If the iterator is valid, we may still race with an
+	 * offlining. The RCU lock ensures the object won't be
+	 * released, tryget will fail if we lost the race.
+	 */
+	*sequence = atomic_read(&root->dead_count);
+	if (iter->last_dead_count == *sequence) {
+		smp_rmb();
+		position = iter->last_visited;
+		if (position && !css_tryget(&position->css))
+			position = NULL;
+	}
+	return position;
+}
+
+static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
+				   struct mem_cgroup *last_visited,
+				   struct mem_cgroup *new_position,
+				   int sequence)
+{
+	if (last_visited)
+		css_put(&last_visited->css);
+	/*
+	 * We store the sequence count from the time @last_visited was
+	 * loaded successfully instead of rereading it here so that we
+	 * don't lose destruction events in between. We could have
+	 * raced with the destruction of @new_position after all.
+	 */
+	iter->last_visited = new_position;
+	smp_wmb();
+	iter->last_dead_count = sequence;
+}
+
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1171,7 +1195,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 {
 	struct mem_cgroup *memcg = NULL;
 	struct mem_cgroup *last_visited = NULL;
-	unsigned long uninitialized_var(dead_count);
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1191,6 +1214,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 	rcu_read_lock();
 	while (!memcg) {
 		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+		int uninitialized_var(seq);
 
 		if (reclaim) {
 			int nid = zone_to_nid(reclaim->zone);
@@ -1204,37 +1228,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
 				goto out_unlock;
 			}
 
-			/*
-			 * If the dead_count mismatches, a destruction
-			 * has happened or is happening concurrently.
-			 * If the dead_count matches, a destruction
-			 * might still happen concurrently, but since
-			 * we checked under RCU, that destruction
-			 * won't free the object until we release the
-			 * RCU reader lock. Thus, the dead_count
-			 * check verifies the pointer is still valid,
-			 * css_tryget() verifies the cgroup pointed to
-			 * is alive.
-			 */
-			dead_count = atomic_read(&root->dead_count);
-			if (dead_count == iter->last_dead_count) {
-				smp_rmb();
-				last_visited = iter->last_visited;
-				if (last_visited &&
-					!css_tryget(&last_visited->css))
-					last_visited = NULL;
-			}
+			last_visited = mem_cgroup_iter_load(iter, root, &seq);
 		}
 
 		memcg = __mem_cgroup_iter_next(root, last_visited);
 
 		if (reclaim) {
-			if (last_visited)
-				css_put(&last_visited->css);
-
-			iter->last_visited = memcg;
-			smp_wmb();
-			iter->last_dead_count = dead_count;
+			mem_cgroup_iter_update(iter, last_visited, memcg, seq);
 
 			if (!memcg)
 				iter->generation++;
@@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 	return ret;
 }
 
-int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
+bool task_in_mem_cgroup(struct task_struct *task,
+			const struct mem_cgroup *memcg)
 {
-	int ret;
 	struct mem_cgroup *curr = NULL;
 	struct task_struct *p;
+	bool ret;
 
 	p = find_lock_task_mm(task);
 	if (p) {
@@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 		 * killer still needs to detect if they have already been oom
 		 * killed to prevent needlessly killing additional tasks.
 		 */
-		task_lock(task);
+		rcu_read_lock();
 		curr = mem_cgroup_from_task(task);
 		if (curr)
 			css_get(&curr->css);
-		task_unlock(task);
+		rcu_read_unlock();
 	}
 	if (!curr)
-		return 0;
+		return false;
 	/*
 	 * We should check use_hierarchy of "memcg" not "curr". Because checking
 	 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -2521,7 +2522,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
 	spin_unlock(&memcg->pcp_counter_lock);
 }
 
-static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
+static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
@@ -3031,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
 	if (res_counter_uncharge(&memcg->kmem, size))
 		return;
 
+	/*
+	 * Releases a reference taken in kmem_cgroup_css_offline in case
+	 * this last uncharge is racing with the offlining code or it is
+	 * outliving the memcg existence.
+	 *
+	 * The memory barrier imposed by test&clear is paired with the
+	 * explicit one in memcg_kmem_mark_dead().
+	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 
 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
@@ -3186,11 +3195,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
 	if (!s->memcg_params)
 		return -ENOMEM;
 
-	INIT_WORK(&s->memcg_params->destroy,
-			kmem_cache_destroy_work_func);
 	if (memcg) {
 		s->memcg_params->memcg = memcg;
 		s->memcg_params->root_cache = root_cache;
+		INIT_WORK(&s->memcg_params->destroy,
+				kmem_cache_destroy_work_func);
 	} else
 		s->memcg_params->is_root_cache = true;
 
@@ -3223,7 +3232,7 @@ void memcg_release_cache(struct kmem_cache *s)
 	list_del(&s->memcg_params->list);
 	mutex_unlock(&memcg->slab_caches_mutex);
 
-	mem_cgroup_put(memcg);
+	css_put(&memcg->css);
 out:
 	kfree(s->memcg_params);
 }
@@ -3383,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
 
 	mutex_lock(&memcg_cache_mutex);
 	new_cachep = cachep->memcg_params->memcg_caches[idx];
-	if (new_cachep)
+	if (new_cachep) {
+		css_put(&memcg->css);
 		goto out;
+	}
 
 	new_cachep = kmem_cache_dup(memcg, cachep);
 	if (new_cachep == NULL) {
 		new_cachep = cachep;
+		css_put(&memcg->css);
 		goto out;
 	}
 
-	mem_cgroup_get(memcg);
 	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
 
 	cachep->memcg_params->memcg_caches[idx] = new_cachep;
@@ -3480,8 +3491,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
 
 	cw = container_of(w, struct create_work, work);
 	memcg_create_kmem_cache(cw->memcg, cw->cachep);
-	/* Drop the reference gotten when we enqueued. */
-	css_put(&cw->memcg->css);
 	kfree(cw);
 }
 
@@ -3618,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
 	int ret;
 
 	*_memcg = NULL;
+
+	/*
+	 * Disabling accounting is only relevant for some specific memcg
+	 * internal allocations. Therefore we would initially not have such
+	 * check here, since direct calls to the page allocator that are marked
+	 * with GFP_KMEMCG only happen outside memcg core. We are mostly
+	 * concerned with cache allocations, and by having this test at
+	 * memcg_kmem_get_cache, we are already able to relay the allocation to
+	 * the root cache and bypass the memcg cache altogether.
+	 *
+	 * There is one exception, though: the SLUB allocator does not create
+	 * large order caches, but rather service large kmallocs directly from
+	 * the page allocator. Therefore, the following sequence when backed by
+	 * the SLUB allocator:
+	 *
+	 *	memcg_stop_kmem_account();
+	 *	kmalloc(<large_number>)
+	 *	memcg_resume_kmem_account();
+	 *
+	 * would effectively ignore the fact that we should skip accounting,
+	 * since it will drive us directly to this function without passing
+	 * through the cache selector memcg_kmem_get_cache. Such large
+	 * allocations are extremely rare but can happen, for instance, for the
+	 * cache arrays. We bring this test here.
+	 */
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return true;
+
 	memcg = try_get_mem_cgroup_from_mm(current->mm);
 
 	/*
@@ -4171,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
 	unlock_page_cgroup(pc);
 	/*
 	 * even after unlock, we have memcg->res.usage here and this memcg
-	 * will never be freed.
+	 * will never be freed, so it's safe to call css_get().
 	 */
 	memcg_check_events(memcg, page);
 	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
 		mem_cgroup_swap_statistics(memcg, true);
-		mem_cgroup_get(memcg);
+		css_get(&memcg->css);
 	}
 	/*
 	 * Migration does not charge the res_counter for the
@@ -4288,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	/*
 	 * record memcg information, if swapout && memcg != NULL,
-	 * mem_cgroup_get() was called in uncharge().
+	 * css_get() was called in uncharge().
 	 */
 	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
@@ -4319,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 		if (!mem_cgroup_is_root(memcg))
 			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 	}
 	rcu_read_unlock();
 }
@@ -4353,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 * This function is only called from task migration context now.
 		 * It postpones res_counter and refcount handling till the end
 		 * of task migration(mem_cgroup_clear_mc()) for performance
-		 * improvement. But we cannot postpone mem_cgroup_get(to)
-		 * because if the process that has been moved to @to does
-		 * swap-in, the refcount of @to might be decreased to 0.
+		 * improvement. But we cannot postpone css_get(to) because if
+		 * the process that has been moved to @to does swap-in, the
+		 * refcount of @to might be decreased to 0.
+		 *
+		 * We are in attach() phase, so the cgroup is guaranteed to be
+		 * alive, so we can just call css_get().
 		 */
-		mem_cgroup_get(to);
+		css_get(&to->css);
 		return 0;
 	}
 	return -EINVAL;
@@ -5136,14 +5176,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
 		 * starts accounting before all call sites are patched
 		 */
 		memcg_kmem_set_active(memcg);
-
-		/*
-		 * kmem charges can outlive the cgroup. In the case of slab
-		 * pages, for instance, a page contain objects from various
-		 * processes, so it is unfeasible to migrate them away. We
-		 * need to reference count the memcg because of that.
-		 */
-		mem_cgroup_get(memcg);
 	} else
 		ret = res_counter_set_limit(&memcg->kmem, val);
 out:
@@ -5176,16 +5208,16 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 		goto out;
 
 	/*
-	 * destroy(), called if we fail, will issue static_key_slow_inc() and
-	 * mem_cgroup_put() if kmem is enabled. We have to either call them
-	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
-	 * this more consistent, since it always leads to the same destroy path
+	 * __mem_cgroup_free() will issue static_key_slow_dec() because this
+	 * memcg is active already. If the later initialization fails then the
+	 * cgroup core triggers the cleanup so we do not have to do it here.
 	 */
-	mem_cgroup_get(memcg);
 	static_key_slow_inc(&memcg_kmem_enabled_key);
 
 	mutex_lock(&set_limit_mutex);
+	memcg_stop_kmem_account();
 	ret = memcg_update_cache_sizes(memcg);
+	memcg_resume_kmem_account();
 	mutex_unlock(&set_limit_mutex);
 out:
 	return ret;
@@ -5864,23 +5896,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5864 return mem_cgroup_sockets_init(memcg, ss); 5896 return mem_cgroup_sockets_init(memcg, ss);
5865} 5897}
5866 5898
5867static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5899static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5868{ 5900{
5869 mem_cgroup_sockets_destroy(memcg); 5901 mem_cgroup_sockets_destroy(memcg);
5902}
5903
5904static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5905{
5906 if (!memcg_kmem_is_active(memcg))
5907 return;
5908
5909 /*
5910 * kmem charges can outlive the cgroup. In the case of slab
5911 * pages, for instance, a page contain objects from various
5912 * processes. As we prevent from taking a reference for every
5913 * such allocation we have to be careful when doing uncharge
5914 * (see memcg_uncharge_kmem) and here during offlining.
5915 *
5916 * The idea is that that only the _last_ uncharge which sees
5917 * the dead memcg will drop the last reference. An additional
5918 * reference is taken here before the group is marked dead
5919 * which is then paired with css_put during uncharge resp. here.
5920 *
5921 * Although this might sound strange as this path is called from
5922 * css_offline() when the referencemight have dropped down to 0
5923 * and shouldn't be incremented anymore (css_tryget would fail)
5924 * we do not have other options because of the kmem allocations
5925 * lifetime.
5926 */
5927 css_get(&memcg->css);
5870 5928
5871 memcg_kmem_mark_dead(memcg); 5929 memcg_kmem_mark_dead(memcg);
5872 5930
5873 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 5931 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5874 return; 5932 return;
5875 5933
5876 /*
5877 * Charges already down to 0, undo mem_cgroup_get() done in the charge
5878 * path here, being careful not to race with memcg_uncharge_kmem: it is
5879 * possible that the charges went down to 0 between mark_dead and the
5880 * res_counter read, so in that case, we don't need the put
5881 */
5882 if (memcg_kmem_test_and_clear_dead(memcg)) 5934 if (memcg_kmem_test_and_clear_dead(memcg))
5883 mem_cgroup_put(memcg); 5935 css_put(&memcg->css);
5884} 5936}
5885#else 5937#else
5886static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5938static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5888,7 +5940,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5888 return 0; 5940 return 0;
5889} 5941}
5890 5942
5891static void kmem_cgroup_destroy(struct mem_cgroup *memcg) 5943static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5944{
5945}
5946
5947static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5892{ 5948{
5893} 5949}
5894#endif 5950#endif
@@ -6058,13 +6114,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6058 mz->on_tree = false; 6114 mz->on_tree = false;
6059 mz->memcg = memcg; 6115 mz->memcg = memcg;
6060 } 6116 }
6061 memcg->info.nodeinfo[node] = pn; 6117 memcg->nodeinfo[node] = pn;
6062 return 0; 6118 return 0;
6063} 6119}
6064 6120
6065static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6121static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6066{ 6122{
6067 kfree(memcg->info.nodeinfo[node]); 6123 kfree(memcg->nodeinfo[node]);
6068} 6124}
6069 6125
6070static struct mem_cgroup *mem_cgroup_alloc(void) 6126static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -6137,49 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
6137 vfree(memcg); 6193 vfree(memcg);
6138} 6194}
6139 6195
6140
6141/*
6142 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
6143 * but in process context. The work_freeing structure is overlaid
6144 * on the rcu_freeing structure, which itself is overlaid on memsw.
6145 */
6146static void free_work(struct work_struct *work)
6147{
6148 struct mem_cgroup *memcg;
6149
6150 memcg = container_of(work, struct mem_cgroup, work_freeing);
6151 __mem_cgroup_free(memcg);
6152}
6153
6154static void free_rcu(struct rcu_head *rcu_head)
6155{
6156 struct mem_cgroup *memcg;
6157
6158 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
6159 INIT_WORK(&memcg->work_freeing, free_work);
6160 schedule_work(&memcg->work_freeing);
6161}
6162
6163static void mem_cgroup_get(struct mem_cgroup *memcg)
6164{
6165 atomic_inc(&memcg->refcnt);
6166}
6167
6168static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
6169{
6170 if (atomic_sub_and_test(count, &memcg->refcnt)) {
6171 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
6172 call_rcu(&memcg->rcu_freeing, free_rcu);
6173 if (parent)
6174 mem_cgroup_put(parent);
6175 }
6176}
6177
6178static void mem_cgroup_put(struct mem_cgroup *memcg)
6179{
6180 __mem_cgroup_put(memcg, 1);
6181}
6182
6183/* 6196/*
6184 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 6197 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
6185 */ 6198 */
@@ -6239,7 +6252,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
6239 6252
6240 memcg->last_scanned_node = MAX_NUMNODES; 6253 memcg->last_scanned_node = MAX_NUMNODES;
6241 INIT_LIST_HEAD(&memcg->oom_notify); 6254 INIT_LIST_HEAD(&memcg->oom_notify);
6242 atomic_set(&memcg->refcnt, 1);
6243 memcg->move_charge_at_immigrate = 0; 6255 memcg->move_charge_at_immigrate = 0;
6244 mutex_init(&memcg->thresholds_lock); 6256 mutex_init(&memcg->thresholds_lock);
6245 spin_lock_init(&memcg->move_lock); 6257 spin_lock_init(&memcg->move_lock);
@@ -6275,12 +6287,9 @@ mem_cgroup_css_online(struct cgroup *cont)
6275 res_counter_init(&memcg->kmem, &parent->kmem); 6287 res_counter_init(&memcg->kmem, &parent->kmem);
6276 6288
6277 /* 6289 /*
6278 * We increment refcnt of the parent to ensure that we can 6290 * No need to take a reference to the parent because cgroup
6279 * safely access it on res_counter_charge/uncharge. 6291 * core guarantees its existence.
6280 * This refcnt will be decremented when freeing this
6281 * mem_cgroup(see mem_cgroup_put).
6282 */ 6292 */
6283 mem_cgroup_get(parent);
6284 } else { 6293 } else {
6285 res_counter_init(&memcg->res, NULL); 6294 res_counter_init(&memcg->res, NULL);
6286 res_counter_init(&memcg->memsw, NULL); 6295 res_counter_init(&memcg->memsw, NULL);
@@ -6296,16 +6305,6 @@ mem_cgroup_css_online(struct cgroup *cont)
6296 6305
6297 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 6306 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6298 mutex_unlock(&memcg_create_mutex); 6307 mutex_unlock(&memcg_create_mutex);
6299 if (error) {
6300 /*
6301 * We call put now because our (and parent's) refcnts
6302 * are already in place. mem_cgroup_put() will internally
6303 * call __mem_cgroup_free, so return directly
6304 */
6305 mem_cgroup_put(memcg);
6306 if (parent->use_hierarchy)
6307 mem_cgroup_put(parent);
6308 }
6309 return error; 6308 return error;
6310} 6309}
6311 6310
@@ -6317,32 +6316,34 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6317 struct mem_cgroup *parent = memcg; 6316 struct mem_cgroup *parent = memcg;
6318 6317
6319 while ((parent = parent_mem_cgroup(parent))) 6318 while ((parent = parent_mem_cgroup(parent)))
6320 atomic_inc(&parent->dead_count); 6319 mem_cgroup_iter_invalidate(parent);
6321 6320
6322 /* 6321 /*
6323 * if the root memcg is not hierarchical we have to check it 6322 * if the root memcg is not hierarchical we have to check it
6324 * explicitly. 6323 * explicitly.
6325 */ 6324 */
6326 if (!root_mem_cgroup->use_hierarchy) 6325 if (!root_mem_cgroup->use_hierarchy)
6327 atomic_inc(&root_mem_cgroup->dead_count); 6326 mem_cgroup_iter_invalidate(root_mem_cgroup);
6328} 6327}
6329 6328
6330static void mem_cgroup_css_offline(struct cgroup *cont) 6329static void mem_cgroup_css_offline(struct cgroup *cont)
6331{ 6330{
6332 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6331 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6333 6332
6333 kmem_cgroup_css_offline(memcg);
6334
6334 mem_cgroup_invalidate_reclaim_iterators(memcg); 6335 mem_cgroup_invalidate_reclaim_iterators(memcg);
6335 mem_cgroup_reparent_charges(memcg); 6336 mem_cgroup_reparent_charges(memcg);
6336 mem_cgroup_destroy_all_caches(memcg); 6337 mem_cgroup_destroy_all_caches(memcg);
6338 vmpressure_cleanup(&memcg->vmpressure);
6337} 6339}
6338 6340
6339static void mem_cgroup_css_free(struct cgroup *cont) 6341static void mem_cgroup_css_free(struct cgroup *cont)
6340{ 6342{
6341 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 6343 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
6342 6344
6343 kmem_cgroup_destroy(memcg); 6345 memcg_destroy_kmem(memcg);
6344 6346 __mem_cgroup_free(memcg);
6345 mem_cgroup_put(memcg);
6346} 6347}
6347 6348
6348#ifdef CONFIG_MMU 6349#ifdef CONFIG_MMU
@@ -6651,6 +6652,7 @@ static void __mem_cgroup_clear_mc(void)
6651{ 6652{
6652 struct mem_cgroup *from = mc.from; 6653 struct mem_cgroup *from = mc.from;
6653 struct mem_cgroup *to = mc.to; 6654 struct mem_cgroup *to = mc.to;
6655 int i;
6654 6656
6655 /* we must uncharge all the leftover precharges from mc.to */ 6657 /* we must uncharge all the leftover precharges from mc.to */
6656 if (mc.precharge) { 6658 if (mc.precharge) {
@@ -6671,7 +6673,9 @@ static void __mem_cgroup_clear_mc(void)
6671 if (!mem_cgroup_is_root(mc.from)) 6673 if (!mem_cgroup_is_root(mc.from))
6672 res_counter_uncharge(&mc.from->memsw, 6674 res_counter_uncharge(&mc.from->memsw,
6673 PAGE_SIZE * mc.moved_swap); 6675 PAGE_SIZE * mc.moved_swap);
6674 __mem_cgroup_put(mc.from, mc.moved_swap); 6676
6677 for (i = 0; i < mc.moved_swap; i++)
6678 css_put(&mc.from->css);
6675 6679
6676 if (!mem_cgroup_is_root(mc.to)) { 6680 if (!mem_cgroup_is_root(mc.to)) {
6677 /* 6681 /*
@@ -6681,7 +6685,7 @@ static void __mem_cgroup_clear_mc(void)
6681 res_counter_uncharge(&mc.to->res, 6685 res_counter_uncharge(&mc.to->res,
6682 PAGE_SIZE * mc.moved_swap); 6686 PAGE_SIZE * mc.moved_swap);
6683 } 6687 }
6684 /* we've already done mem_cgroup_get(mc.to) */ 6688 /* we've already done css_get(mc.to) */
6685 mc.moved_swap = 0; 6689 mc.moved_swap = 0;
6686 } 6690 }
6687 memcg_oom_recover(from); 6691 memcg_oom_recover(from);
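The __mem_cgroup_clear_mc() hunk above swaps one bulk __mem_cgroup_put(mc.from, mc.moved_swap) for mc.moved_swap individual css_put() calls, since the css reference API only drops a single reference at a time. A minimal user-space sketch of that bookkeeping, with a toy ref type standing in for the css (all names here are invented for illustration):

#include <assert.h>
#include <stdio.h>

struct ref { int count; };

static void ref_get(struct ref *r) { r->count++; }

static void ref_put(struct ref *r)
{
        assert(r->count > 0);
        r->count--;
}

int main(void)
{
        struct ref from = { .count = 1 };       /* base reference held elsewhere */
        int moved_swap = 3;                     /* refs taken on the charge path */
        int i;

        for (i = 0; i < moved_swap; i++)        /* one ref per moved swap entry */
                ref_get(&from);

        for (i = 0; i < moved_swap; i++)        /* drop them one by one, as above */
                ref_put(&from);

        printf("remaining refs: %d\n", from.count);     /* prints 1 */
        return 0;
}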
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f1932f..2c13aa7a0164 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1410 1410
1411 /* 1411 /*
1412 * Isolate the page, so that it doesn't get reallocated if it 1412 * Isolate the page, so that it doesn't get reallocated if it
1413 * was free. 1413 * was free. This flag should be kept set until the source page
1414 * is freed and PG_hwpoison on it is set.
1414 */ 1415 */
1415 set_migratetype_isolate(p, true); 1416 set_migratetype_isolate(p, true);
1416 /* 1417 /*
@@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1433 /* Not a free page */ 1434 /* Not a free page */
1434 ret = 1; 1435 ret = 1;
1435 } 1436 }
1436 unset_migratetype_isolate(p, MIGRATE_MOVABLE);
1437 unlock_memory_hotplug(); 1437 unlock_memory_hotplug();
1438 return ret; 1438 return ret;
1439} 1439}
@@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1494 atomic_long_add(1 << compound_trans_order(hpage), 1494 atomic_long_add(1 << compound_trans_order(hpage),
1495 &num_poisoned_pages); 1495 &num_poisoned_pages);
1496 } 1496 }
1497 /* keep elevated page count for bad page */
1498 return ret; 1497 return ret;
1499} 1498}
1500 1499
@@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags)
1559 atomic_long_inc(&num_poisoned_pages); 1558 atomic_long_inc(&num_poisoned_pages);
1560 } 1559 }
1561 } 1560 }
1562 /* keep elevated page count for bad page */ 1561 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1563 return ret; 1562 return ret;
1564} 1563}
1565 1564
@@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags)
1625 if (ret > 0) 1624 if (ret > 0)
1626 ret = -EIO; 1625 ret = -EIO;
1627 } else { 1626 } else {
1627 /*
1628 * After page migration succeeds, the source page can
1629 * be trapped in pagevec and actual freeing is delayed.
1630 * Freeing code works differently based on PG_hwpoison,
1631 * so there's a race. We need to make sure that the
1632 * source page should be freed back to buddy before
1633 * setting PG_hwpoison.
1634 */
1635 if (!is_free_buddy_page(page))
1636 lru_add_drain_all();
1637 if (!is_free_buddy_page(page))
1638 drain_all_pages();
1628 SetPageHWPoison(page); 1639 SetPageHWPoison(page);
1640 if (!is_free_buddy_page(page))
1641 pr_info("soft offline: %#lx: page leaked\n",
1642 pfn);
1629 atomic_long_inc(&num_poisoned_pages); 1643 atomic_long_inc(&num_poisoned_pages);
1630 } 1644 }
1631 } else { 1645 } else {
diff --git a/mm/memory.c b/mm/memory.c
index 61a262b08e53..40268410732a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr);
82EXPORT_SYMBOL(mem_map); 82EXPORT_SYMBOL(mem_map);
83#endif 83#endif
84 84
85unsigned long num_physpages;
86/* 85/*
87 * A number of key systems in x86 including ioremap() rely on the assumption 86 * A number of key systems in x86 including ioremap() rely on the assumption
88 * that high_memory defines the upper bound on direct map memory, then end 87 * that high_memory defines the upper bound on direct map memory, then end
@@ -92,7 +91,6 @@ unsigned long num_physpages;
92 */ 91 */
93void * high_memory; 92void * high_memory;
94 93
95EXPORT_SYMBOL(num_physpages);
96EXPORT_SYMBOL(high_memory); 94EXPORT_SYMBOL(high_memory);
97 95
98/* 96/*
@@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1101 spinlock_t *ptl; 1099 spinlock_t *ptl;
1102 pte_t *start_pte; 1100 pte_t *start_pte;
1103 pte_t *pte; 1101 pte_t *pte;
1102 unsigned long range_start = addr;
1104 1103
1105again: 1104again:
1106 init_rss_vec(rss); 1105 init_rss_vec(rss);
@@ -1142,16 +1141,19 @@ again:
1142 continue; 1141 continue;
1143 if (unlikely(details) && details->nonlinear_vma 1142 if (unlikely(details) && details->nonlinear_vma
1144 && linear_page_index(details->nonlinear_vma, 1143 && linear_page_index(details->nonlinear_vma,
1145 addr) != page->index) 1144 addr) != page->index) {
1146 set_pte_at(mm, addr, pte, 1145 pte_t ptfile = pgoff_to_pte(page->index);
1147 pgoff_to_pte(page->index)); 1146 if (pte_soft_dirty(ptent))
1147 pte_file_mksoft_dirty(ptfile);
1148 set_pte_at(mm, addr, pte, ptfile);
1149 }
1148 if (PageAnon(page)) 1150 if (PageAnon(page))
1149 rss[MM_ANONPAGES]--; 1151 rss[MM_ANONPAGES]--;
1150 else { 1152 else {
1151 if (pte_dirty(ptent)) 1153 if (pte_dirty(ptent))
1152 set_page_dirty(page); 1154 set_page_dirty(page);
1153 if (pte_young(ptent) && 1155 if (pte_young(ptent) &&
1154 likely(!VM_SequentialReadHint(vma))) 1156 likely(!(vma->vm_flags & VM_SEQ_READ)))
1155 mark_page_accessed(page); 1157 mark_page_accessed(page);
1156 rss[MM_FILEPAGES]--; 1158 rss[MM_FILEPAGES]--;
1157 } 1159 }
@@ -1206,12 +1208,14 @@ again:
1206 force_flush = 0; 1208 force_flush = 0;
1207 1209
1208#ifdef HAVE_GENERIC_MMU_GATHER 1210#ifdef HAVE_GENERIC_MMU_GATHER
1209 tlb->start = addr; 1211 tlb->start = range_start;
1210 tlb->end = end; 1212 tlb->end = addr;
1211#endif 1213#endif
1212 tlb_flush_mmu(tlb); 1214 tlb_flush_mmu(tlb);
1213 if (addr != end) 1215 if (addr != end) {
1216 range_start = addr;
1214 goto again; 1217 goto again;
1218 }
1215 } 1219 }
1216 1220
1217 return addr; 1221 return addr;
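The zap_pte_range() hunk above makes the forced TLB flush cover only the span actually unmapped since the last flush: range_start is recorded up front, the flush uses [range_start, addr), and range_start is reset before looping again. A small stand-alone C sketch of that restart-and-track pattern (the page size and batch threshold below are made up):

#include <stdio.h>

static void flush_range(unsigned long start, unsigned long end)
{
        printf("flush [%#lx, %#lx)\n", start, end);
}

int main(void)
{
        unsigned long addr = 0x1000, end = 0x9000;
        unsigned long range_start = addr;       /* start of unflushed work */
        int batch = 0;

        while (addr < end) {
                addr += 0x1000;                 /* handle one page */
                if (++batch == 3 || addr == end) {
                        flush_range(range_start, addr); /* only what we touched */
                        range_start = addr;     /* restart tracking here */
                        batch = 0;
                }
        }
        return 0;
}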
@@ -2904,7 +2908,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
2904 details->first_index, details->last_index) { 2908 details->first_index, details->last_index) {
2905 2909
2906 vba = vma->vm_pgoff; 2910 vba = vma->vm_pgoff;
2907 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2911 vea = vba + vma_pages(vma) - 1;
2908 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ 2912 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
2909 zba = details->first_index; 2913 zba = details->first_index;
2910 if (zba < vba) 2914 if (zba < vba)
@@ -3114,6 +3118,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
3114 exclusive = 1; 3118 exclusive = 1;
3115 } 3119 }
3116 flush_icache_page(vma, page); 3120 flush_icache_page(vma, page);
3121 if (pte_swp_soft_dirty(orig_pte))
3122 pte = pte_mksoft_dirty(pte);
3117 set_pte_at(mm, address, page_table, pte); 3123 set_pte_at(mm, address, page_table, pte);
3118 if (page == swapcache) 3124 if (page == swapcache)
3119 do_page_add_anon_rmap(page, vma, address, exclusive); 3125 do_page_add_anon_rmap(page, vma, address, exclusive);
@@ -3407,6 +3413,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3407 entry = mk_pte(page, vma->vm_page_prot); 3413 entry = mk_pte(page, vma->vm_page_prot);
3408 if (flags & FAULT_FLAG_WRITE) 3414 if (flags & FAULT_FLAG_WRITE)
3409 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 3415 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
3416 else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte))
3417 pte_mksoft_dirty(entry);
3410 if (anon) { 3418 if (anon) {
3411 inc_mm_counter_fast(mm, MM_ANONPAGES); 3419 inc_mm_counter_fast(mm, MM_ANONPAGES);
3412 page_add_new_anon_rmap(page, vma, address); 3420 page_add_new_anon_rmap(page, vma, address);
@@ -4201,7 +4209,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
4201 up_read(&mm->mmap_sem); 4209 up_read(&mm->mmap_sem);
4202} 4210}
4203 4211
4204#ifdef CONFIG_PROVE_LOCKING 4212#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
4205void might_fault(void) 4213void might_fault(void)
4206{ 4214{
4207 /* 4215 /*
@@ -4213,13 +4221,17 @@ void might_fault(void)
4213 if (segment_eq(get_fs(), KERNEL_DS)) 4221 if (segment_eq(get_fs(), KERNEL_DS))
4214 return; 4222 return;
4215 4223
4216 might_sleep();
4217 /* 4224 /*
4218 * it would be nicer only to annotate paths which are not under 4225 * it would be nicer only to annotate paths which are not under
4219 * pagefault_disable, however that requires a larger audit and 4226 * pagefault_disable, however that requires a larger audit and
4220 * providing helpers like get_user_atomic. 4227 * providing helpers like get_user_atomic.
4221 */ 4228 */
4222 if (!in_atomic() && current->mm) 4229 if (in_atomic())
4230 return;
4231
4232 __might_sleep(__FILE__, __LINE__, 0);
4233
4234 if (current->mm)
4223 might_lock_read(&current->mm->mmap_sem); 4235 might_lock_read(&current->mm->mmap_sem);
4224} 4236}
4225EXPORT_SYMBOL(might_fault); 4237EXPORT_SYMBOL(might_fault);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1ad92b46753e..ca1dd3aa5eee 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
75 res->end = start + size - 1; 75 res->end = start + size - 1;
76 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 76 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
77 if (request_resource(&iomem_resource, res) < 0) { 77 if (request_resource(&iomem_resource, res) < 0) {
78 printk("System RAM resource %pR cannot be added\n", res); 78 pr_debug("System RAM resource %pR cannot be added\n", res);
79 kfree(res); 79 kfree(res);
80 res = NULL; 80 res = NULL;
81 } 81 }
@@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page,
101 atomic_inc(&page->_count); 101 atomic_inc(&page->_count);
102} 102}
103 103
104/* reference to __meminit __free_pages_bootmem is valid 104void put_page_bootmem(struct page *page)
105 * so use __ref to tell modpost not to generate a warning */
106void __ref put_page_bootmem(struct page *page)
107{ 105{
108 unsigned long type; 106 unsigned long type;
109 static DEFINE_MUTEX(ppb_lock);
110 107
111 type = (unsigned long) page->lru.next; 108 type = (unsigned long) page->lru.next;
112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 109 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page)
116 ClearPagePrivate(page); 113 ClearPagePrivate(page);
117 set_page_private(page, 0); 114 set_page_private(page, 0);
118 INIT_LIST_HEAD(&page->lru); 115 INIT_LIST_HEAD(&page->lru);
119 116 free_reserved_page(page);
120 /*
121 * Please refer to comment for __free_pages_bootmem()
122 * for why we serialize here.
123 */
124 mutex_lock(&ppb_lock);
125 __free_pages_bootmem(page, 0);
126 mutex_unlock(&ppb_lock);
127 totalram_pages++;
128 } 117 }
129
130} 118}
131 119
132#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE 120#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
@@ -220,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
220 pfn = pgdat->node_start_pfn; 208 pfn = pgdat->node_start_pfn;
221 end_pfn = pgdat_end_pfn(pgdat); 209 end_pfn = pgdat_end_pfn(pgdat);
222 210
223 /* register_section info */ 211 /* register section info */
224 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 212 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
225 /* 213 /*
226 * Some platforms can assign the same pfn to multiple nodes - on 214 * Some platforms can assign the same pfn to multiple nodes - on
227 * node0 as well as nodeN. To avoid registering a pfn against 215 * node0 as well as nodeN. To avoid registering a pfn against
228 * multiple nodes we check that this pfn does not already 216 * multiple nodes we check that this pfn does not already
229 * reside in some other node. 217 * reside in some other nodes.
230 */ 218 */
231 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) 219 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
232 register_page_bootmem_info_section(pfn); 220 register_page_bootmem_info_section(pfn);
@@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
309 /* can't move pfns which are higher than @z2 */ 297 /* can't move pfns which are higher than @z2 */
310 if (end_pfn > zone_end_pfn(z2)) 298 if (end_pfn > zone_end_pfn(z2))
311 goto out_fail; 299 goto out_fail;
312 /* the move out part mast at the left most of @z2 */ 300 /* the move out part must be at the left most of @z2 */
313 if (start_pfn > z2->zone_start_pfn) 301 if (start_pfn > z2->zone_start_pfn)
314 goto out_fail; 302 goto out_fail;
315 /* must included/overlap */ 303 /* must included/overlap */
@@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback);
775 763
776void __online_page_set_limits(struct page *page) 764void __online_page_set_limits(struct page *page)
777{ 765{
778 unsigned long pfn = page_to_pfn(page);
779
780 if (pfn >= num_physpages)
781 num_physpages = pfn + 1;
782} 766}
783EXPORT_SYMBOL_GPL(__online_page_set_limits); 767EXPORT_SYMBOL_GPL(__online_page_set_limits);
784 768
785void __online_page_increment_counters(struct page *page) 769void __online_page_increment_counters(struct page *page)
786{ 770{
787 totalram_pages++; 771 adjust_managed_page_count(page, 1);
788
789#ifdef CONFIG_HIGHMEM
790 if (PageHighMem(page))
791 totalhigh_pages++;
792#endif
793} 772}
794EXPORT_SYMBOL_GPL(__online_page_increment_counters); 773EXPORT_SYMBOL_GPL(__online_page_increment_counters);
795 774
796void __online_page_free(struct page *page) 775void __online_page_free(struct page *page)
797{ 776{
798 ClearPageReserved(page); 777 __free_reserved_page(page);
799 init_page_count(page);
800 __free_page(page);
801} 778}
802EXPORT_SYMBOL_GPL(__online_page_free); 779EXPORT_SYMBOL_GPL(__online_page_free);
803 780
@@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
918 895
919int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 896int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
920{ 897{
898 unsigned long flags;
921 unsigned long onlined_pages = 0; 899 unsigned long onlined_pages = 0;
922 struct zone *zone; 900 struct zone *zone;
923 int need_zonelists_rebuild = 0; 901 int need_zonelists_rebuild = 0;
@@ -936,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
936 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && 914 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
937 !can_online_high_movable(zone)) { 915 !can_online_high_movable(zone)) {
938 unlock_memory_hotplug(); 916 unlock_memory_hotplug();
939 return -1; 917 return -EINVAL;
940 } 918 }
941 919
942 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { 920 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
943 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { 921 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
944 unlock_memory_hotplug(); 922 unlock_memory_hotplug();
945 return -1; 923 return -EINVAL;
946 } 924 }
947 } 925 }
948 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { 926 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
949 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { 927 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
950 unlock_memory_hotplug(); 928 unlock_memory_hotplug();
951 return -1; 929 return -EINVAL;
952 } 930 }
953 } 931 }
954 932
@@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
994 return ret; 972 return ret;
995 } 973 }
996 974
997 zone->managed_pages += onlined_pages;
998 zone->present_pages += onlined_pages; 975 zone->present_pages += onlined_pages;
976
977 pgdat_resize_lock(zone->zone_pgdat, &flags);
999 zone->zone_pgdat->node_present_pages += onlined_pages; 978 zone->zone_pgdat->node_present_pages += onlined_pages;
979 pgdat_resize_unlock(zone->zone_pgdat, &flags);
980
1000 if (onlined_pages) { 981 if (onlined_pages) {
1001 node_states_set_node(zone_to_nid(zone), &arg); 982 node_states_set_node(zone_to_nid(zone), &arg);
1002 if (need_zonelists_rebuild) 983 if (need_zonelists_rebuild)
@@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
1487 unsigned long pfn, nr_pages, expire; 1468 unsigned long pfn, nr_pages, expire;
1488 long offlined_pages; 1469 long offlined_pages;
1489 int ret, drain, retry_max, node; 1470 int ret, drain, retry_max, node;
1471 unsigned long flags;
1490 struct zone *zone; 1472 struct zone *zone;
1491 struct memory_notify arg; 1473 struct memory_notify arg;
1492 1474
@@ -1578,10 +1560,12 @@ repeat:
1578 /* reset pagetype flags and makes migrate type to be MOVABLE */ 1560 /* reset pagetype flags and makes migrate type to be MOVABLE */
1579 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1561 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1580 /* removal success */ 1562 /* removal success */
1581 zone->managed_pages -= offlined_pages; 1563 adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
1582 zone->present_pages -= offlined_pages; 1564 zone->present_pages -= offlined_pages;
1565
1566 pgdat_resize_lock(zone->zone_pgdat, &flags);
1583 zone->zone_pgdat->node_present_pages -= offlined_pages; 1567 zone->zone_pgdat->node_present_pages -= offlined_pages;
1584 totalram_pages -= offlined_pages; 1568 pgdat_resize_unlock(zone->zone_pgdat, &flags);
1585 1569
1586 init_per_zone_wmark_min(); 1570 init_per_zone_wmark_min();
1587 1571
@@ -1621,6 +1605,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1621{ 1605{
1622 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1606 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1623} 1607}
1608#endif /* CONFIG_MEMORY_HOTREMOVE */
1624 1609
1625/** 1610/**
1626 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) 1611 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
@@ -1634,7 +1619,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1634 * 1619 *
1635 * Returns the return value of func. 1620 * Returns the return value of func.
1636 */ 1621 */
1637static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, 1622int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1638 void *arg, int (*func)(struct memory_block *, void *)) 1623 void *arg, int (*func)(struct memory_block *, void *))
1639{ 1624{
1640 struct memory_block *mem = NULL; 1625 struct memory_block *mem = NULL;
@@ -1671,24 +1656,7 @@ static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
1671 return 0; 1656 return 0;
1672} 1657}
1673 1658
1674/** 1659#ifdef CONFIG_MEMORY_HOTREMOVE
1675 * offline_memory_block_cb - callback function for offlining memory block
1676 * @mem: the memory block to be offlined
1677 * @arg: buffer to hold error msg
1678 *
1679 * Always return 0, and put the error msg in arg if any.
1680 */
1681static int offline_memory_block_cb(struct memory_block *mem, void *arg)
1682{
1683 int *ret = arg;
1684 int error = offline_memory_block(mem);
1685
1686 if (error != 0 && *ret == 0)
1687 *ret = error;
1688
1689 return 0;
1690}
1691
1692static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) 1660static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1693{ 1661{
1694 int ret = !is_memblock_offlined(mem); 1662 int ret = !is_memblock_offlined(mem);
@@ -1814,54 +1782,22 @@ void try_offline_node(int nid)
1814} 1782}
1815EXPORT_SYMBOL(try_offline_node); 1783EXPORT_SYMBOL(try_offline_node);
1816 1784
1817int __ref remove_memory(int nid, u64 start, u64 size) 1785void __ref remove_memory(int nid, u64 start, u64 size)
1818{ 1786{
1819 unsigned long start_pfn, end_pfn; 1787 int ret;
1820 int ret = 0;
1821 int retry = 1;
1822
1823 start_pfn = PFN_DOWN(start);
1824 end_pfn = PFN_UP(start + size - 1);
1825
1826 /*
1827 * When CONFIG_MEMCG is on, one memory block may be used by other
1828 * blocks to store page cgroup when onlining pages. But we don't know
1829 * in what order pages are onlined. So we iterate twice to offline
1830 * memory:
1831 * 1st iterate: offline every non primary memory block.
1832 * 2nd iterate: offline primary (i.e. first added) memory block.
1833 */
1834repeat:
1835 walk_memory_range(start_pfn, end_pfn, &ret,
1836 offline_memory_block_cb);
1837 if (ret) {
1838 if (!retry)
1839 return ret;
1840
1841 retry = 0;
1842 ret = 0;
1843 goto repeat;
1844 }
1845 1788
1846 lock_memory_hotplug(); 1789 lock_memory_hotplug();
1847 1790
1848 /* 1791 /*
1849 * we have offlined all memory blocks like this: 1792 * All memory blocks must be offlined before removing memory. Check
1850 * 1. lock memory hotplug 1793 * whether all memory blocks in question are offline and trigger a BUG()
1851 * 2. offline a memory block 1794 * if this is not the case.
1852 * 3. unlock memory hotplug
1853 *
1854 * repeat step1-3 to offline the memory block. All memory blocks
1855 * must be offlined before removing memory. But we don't hold the
1856 * lock in the whole operation. So we should check whether all
1857 * memory blocks are offlined.
1858 */ 1795 */
1859 1796 ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
1860 ret = walk_memory_range(start_pfn, end_pfn, NULL,
1861 is_memblock_offlined_cb); 1797 is_memblock_offlined_cb);
1862 if (ret) { 1798 if (ret) {
1863 unlock_memory_hotplug(); 1799 unlock_memory_hotplug();
1864 return ret; 1800 BUG();
1865 } 1801 }
1866 1802
1867 /* remove memmap entry */ 1803 /* remove memmap entry */
@@ -1872,17 +1808,6 @@ repeat:
1872 try_offline_node(nid); 1808 try_offline_node(nid);
1873 1809
1874 unlock_memory_hotplug(); 1810 unlock_memory_hotplug();
1875
1876 return 0;
1877} 1811}
1878#else
1879int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1880{
1881 return -EINVAL;
1882}
1883int remove_memory(int nid, u64 start, u64 size)
1884{
1885 return -EINVAL;
1886}
1887#endif /* CONFIG_MEMORY_HOTREMOVE */
1888EXPORT_SYMBOL_GPL(remove_memory); 1812EXPORT_SYMBOL_GPL(remove_memory);
1813#endif /* CONFIG_MEMORY_HOTREMOVE */
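remove_memory() now leans on walk_memory_range() with is_memblock_offlined_cb: visit every memory block in the pfn range, let the callback return non-zero for any block that is still online, and stop on the first failure. A rough user-space sketch of that walk-and-check contract follows; the block layout and helper names are invented:

#include <stdio.h>

struct memory_block { int id; int online; };

static int walk_blocks(struct memory_block *blocks, int nr, void *arg,
                       int (*func)(struct memory_block *, void *))
{
        int i, ret;

        for (i = 0; i < nr; i++) {
                ret = func(&blocks[i], arg);
                if (ret)
                        return ret;     /* abort the walk on first failure */
        }
        return 0;
}

static int is_block_offlined_cb(struct memory_block *mem, void *arg)
{
        if (mem->online) {
                fprintf(stderr, "block %d is still online\n", mem->id);
                return 1;               /* caller treats removal as unsafe */
        }
        return 0;
}

int main(void)
{
        struct memory_block blocks[] = { {0, 0}, {1, 0}, {2, 1} };

        if (walk_blocks(blocks, 3, NULL, is_block_offlined_cb))
                fprintf(stderr, "refusing to remove memory\n");
        return 0;
}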
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 74310017296e..4baf12e534d1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -732,7 +732,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
732 if (prev) { 732 if (prev) {
733 vma = prev; 733 vma = prev;
734 next = vma->vm_next; 734 next = vma->vm_next;
735 continue; 735 if (mpol_equal(vma_policy(vma), new_pol))
736 continue;
737 /* vma_merge() joined vma && vma->next, case 8 */
738 goto replace;
736 } 739 }
737 if (vma->vm_start != vmstart) { 740 if (vma->vm_start != vmstart) {
738 err = split_vma(vma->vm_mm, vma, vmstart, 1); 741 err = split_vma(vma->vm_mm, vma, vmstart, 1);
@@ -744,6 +747,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
744 if (err) 747 if (err)
745 goto out; 748 goto out;
746 } 749 }
750 replace:
747 err = vma_replace_policy(vma, new_pol); 751 err = vma_replace_policy(vma, new_pol);
748 if (err) 752 if (err)
749 goto out; 753 goto out;
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c280a02ea11e..633c08863fd8 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -9,6 +9,8 @@
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/memory.h>
13#include <linux/notifier.h>
12#include "internal.h" 14#include "internal.h"
13 15
14#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel);
147struct kobject *mm_kobj; 149struct kobject *mm_kobj;
148EXPORT_SYMBOL_GPL(mm_kobj); 150EXPORT_SYMBOL_GPL(mm_kobj);
149 151
152#ifdef CONFIG_SMP
153s32 vm_committed_as_batch = 32;
154
155static void __meminit mm_compute_batch(void)
156{
157 u64 memsized_batch;
158 s32 nr = num_present_cpus();
159 s32 batch = max_t(s32, nr*2, 32);
160
161 /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
162 memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
163
164 vm_committed_as_batch = max_t(s32, memsized_batch, batch);
165}
166
167static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
168 unsigned long action, void *arg)
169{
170 switch (action) {
171 case MEM_ONLINE:
172 case MEM_OFFLINE:
173 mm_compute_batch();
174 default:
175 break;
176 }
177 return NOTIFY_OK;
178}
179
180static struct notifier_block compute_batch_nb __meminitdata = {
181 .notifier_call = mm_compute_batch_notifier,
182 .priority = IPC_CALLBACK_PRI, /* use lowest priority */
183};
184
185static int __init mm_compute_batch_init(void)
186{
187 mm_compute_batch();
188 register_hotmemory_notifier(&compute_batch_nb);
189
190 return 0;
191}
192
193__initcall(mm_compute_batch_init);
194
195#endif
196
150static int __init mm_sysfs_init(void) 197static int __init mm_sysfs_init(void)
151{ 198{
152 mm_kobj = kobject_create_and_add("mm", kernel_kobj); 199 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
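mm_compute_batch() above sizes vm_committed_as_batch as roughly 0.4% of (total pages / number of CPUs), clamped below by max(2 * CPUs, 32) and above by the largest s32. A quick user-space rerun of that arithmetic, with sample figures that are purely illustrative:

#include <stdint.h>
#include <stdio.h>

static int32_t compute_batch(uint64_t totalram_pages, int32_t nr_cpus)
{
        int32_t floor = nr_cpus * 2 > 32 ? nr_cpus * 2 : 32;
        uint64_t memsized = (totalram_pages / nr_cpus) / 256;   /* ~0.4% per CPU */

        if (memsized > 0x7fffffff)
                memsized = 0x7fffffff;                          /* cap at max s32 */
        return memsized > (uint64_t)floor ? (int32_t)memsized : floor;
}

int main(void)
{
        /* e.g. 16 GiB of 4 KiB pages spread over 8 CPUs */
        printf("vm_committed_as_batch = %d\n",
               compute_batch((16ULL << 30) / 4096, 8));         /* prints 2048 */
        return 0;
}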
diff --git a/mm/mmap.c b/mm/mmap.c
index f681e1842fad..1edbaa3136c3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -865,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end);
865 if (next->anon_vma) 865 if (next->anon_vma)
866 anon_vma_merge(vma, next); 866 anon_vma_merge(vma, next);
867 mm->map_count--; 867 mm->map_count--;
868 vma_set_policy(vma, vma_policy(next)); 868 mpol_put(vma_policy(next));
869 kmem_cache_free(vm_area_cachep, next); 869 kmem_cache_free(vm_area_cachep, next);
870 /* 870 /*
871 * In mprotect's case 6 (see comments on vma_merge), 871 * In mprotect's case 6 (see comments on vma_merge),
@@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
955 if (is_mergeable_vma(vma, file, vm_flags) && 955 if (is_mergeable_vma(vma, file, vm_flags) &&
956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { 956 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
957 pgoff_t vm_pglen; 957 pgoff_t vm_pglen;
958 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 958 vm_pglen = vma_pages(vma);
959 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 959 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
960 return 1; 960 return 1;
961 } 961 }
@@ -1358,18 +1358,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1358 1358
1359 if (!(flags & MAP_ANONYMOUS)) { 1359 if (!(flags & MAP_ANONYMOUS)) {
1360 audit_mmap_fd(fd, flags); 1360 audit_mmap_fd(fd, flags);
1361 if (unlikely(flags & MAP_HUGETLB))
1362 return -EINVAL;
1363 file = fget(fd); 1361 file = fget(fd);
1364 if (!file) 1362 if (!file)
1365 goto out; 1363 goto out;
1366 if (is_file_hugepages(file)) 1364 if (is_file_hugepages(file))
1367 len = ALIGN(len, huge_page_size(hstate_file(file))); 1365 len = ALIGN(len, huge_page_size(hstate_file(file)));
1366 retval = -EINVAL;
1367 if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
1368 goto out_fput;
1368 } else if (flags & MAP_HUGETLB) { 1369 } else if (flags & MAP_HUGETLB) {
1369 struct user_struct *user = NULL; 1370 struct user_struct *user = NULL;
1370 struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & 1371 struct hstate *hs;
1371 SHM_HUGE_MASK);
1372 1372
1373 hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
1373 if (!hs) 1374 if (!hs)
1374 return -EINVAL; 1375 return -EINVAL;
1375 1376
@@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1391 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1392 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1392 1393
1393 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1394 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1395out_fput:
1394 if (file) 1396 if (file)
1395 fput(file); 1397 fput(file);
1396out: 1398out:
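The mmap_pgoff() hunk above moves the MAP_HUGETLB sanity check after fget() and fails through a new out_fput label, so the file reference is always dropped on the error path. A tiny sketch of that acquire / validate / single-exit-cleanup shape, using fopen()/fclose() as stand-ins:

#include <stdio.h>

int main(int argc, char **argv)
{
        FILE *file;
        long len;
        int retval = -1;

        (void)argc;
        file = fopen(argv[0], "r");             /* "fget": take the reference */
        if (!file)
                goto out;

        if (fseek(file, 0, SEEK_END) != 0)      /* validate after acquiring */
                goto out_fput;                  /* invalid: still drop the ref */

        len = ftell(file);
        printf("would map %ld bytes\n", len);
        retval = 0;

out_fput:
        fclose(file);                           /* "fput" on every post-fget exit */
out:
        return retval;
}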
@@ -1876,15 +1878,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1876} 1878}
1877#endif 1879#endif
1878 1880
1879void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1880{
1881 /*
1882 * Is this a new hole at the lowest possible address?
1883 */
1884 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1885 mm->free_area_cache = addr;
1886}
1887
1888/* 1881/*
1889 * This mmap-allocator allocates new areas top-down from below the 1882 * This mmap-allocator allocates new areas top-down from below the
1890 * stack's low limit (the base): 1883 * stack's low limit (the base):
@@ -1941,19 +1934,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1941} 1934}
1942#endif 1935#endif
1943 1936
1944void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
1945{
1946 /*
1947 * Is this a new hole at the highest possible address?
1948 */
1949 if (addr > mm->free_area_cache)
1950 mm->free_area_cache = addr;
1951
1952 /* dont allow allocations above current base */
1953 if (mm->free_area_cache > mm->mmap_base)
1954 mm->free_area_cache = mm->mmap_base;
1955}
1956
1957unsigned long 1937unsigned long
1958get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, 1938get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1959 unsigned long pgoff, unsigned long flags) 1939 unsigned long pgoff, unsigned long flags)
@@ -2374,7 +2354,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2374{ 2354{
2375 struct vm_area_struct **insertion_point; 2355 struct vm_area_struct **insertion_point;
2376 struct vm_area_struct *tail_vma = NULL; 2356 struct vm_area_struct *tail_vma = NULL;
2377 unsigned long addr;
2378 2357
2379 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2358 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
2380 vma->vm_prev = NULL; 2359 vma->vm_prev = NULL;
@@ -2391,11 +2370,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2391 } else 2370 } else
2392 mm->highest_vm_end = prev ? prev->vm_end : 0; 2371 mm->highest_vm_end = prev ? prev->vm_end : 0;
2393 tail_vma->vm_next = NULL; 2372 tail_vma->vm_next = NULL;
2394 if (mm->unmap_area == arch_unmap_area)
2395 addr = prev ? prev->vm_end : mm->mmap_base;
2396 else
2397 addr = vma ? vma->vm_start : mm->mmap_base;
2398 mm->unmap_area(mm, addr);
2399 mm->mmap_cache = NULL; /* Kill the cache. */ 2373 mm->mmap_cache = NULL; /* Kill the cache. */
2400} 2374}
2401 2375
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 6725ff183374..93e6089cb456 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -315,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
315 315
316 /* 316 /*
317 * Wait for any running method to finish, of course including 317 * Wait for any running method to finish, of course including
318 * ->release if it was run by mmu_notifier_relase instead of us. 318 * ->release if it was run by mmu_notifier_release instead of us.
319 */ 319 */
320 synchronize_srcu(&srcu); 320 synchronize_srcu(&srcu);
321 321
diff --git a/mm/mremap.c b/mm/mremap.c
index 463a25705ac6..457d34ef3bf2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
126 continue; 126 continue;
127 pte = ptep_get_and_clear(mm, old_addr, old_pte); 127 pte = ptep_get_and_clear(mm, old_addr, old_pte);
128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 128 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
129 set_pte_at(mm, new_addr, new_pte, pte); 129 set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte));
130 } 130 }
131 131
132 arch_leave_lazy_mmu_mode(); 132 arch_leave_lazy_mmu_mode();
@@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
456 unsigned long charged = 0; 456 unsigned long charged = 0;
457 bool locked = false; 457 bool locked = false;
458 458
459 down_write(&current->mm->mmap_sem);
460
461 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 459 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
462 goto out; 460 return ret;
461
462 if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
463 return ret;
463 464
464 if (addr & ~PAGE_MASK) 465 if (addr & ~PAGE_MASK)
465 goto out; 466 return ret;
466 467
467 old_len = PAGE_ALIGN(old_len); 468 old_len = PAGE_ALIGN(old_len);
468 new_len = PAGE_ALIGN(new_len); 469 new_len = PAGE_ALIGN(new_len);
@@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
473 * a zero new-len is nonsensical. 474 * a zero new-len is nonsensical.
474 */ 475 */
475 if (!new_len) 476 if (!new_len)
476 goto out; 477 return ret;
478
479 down_write(&current->mm->mmap_sem);
477 480
478 if (flags & MREMAP_FIXED) { 481 if (flags & MREMAP_FIXED) {
479 if (flags & MREMAP_MAYMOVE) 482 ret = mremap_to(addr, old_len, new_addr, new_len,
480 ret = mremap_to(addr, old_len, new_addr, new_len, 483 &locked);
481 &locked);
482 goto out; 484 goto out;
483 } 485 }
484 486
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2fc73b..61107cf55bb3 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void)
137 return count; 137 return count;
138} 138}
139 139
140static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) 140static int reset_managed_pages_done __initdata;
141
142static inline void __init reset_node_managed_pages(pg_data_t *pgdat)
141{ 143{
142 struct zone *z; 144 struct zone *z;
143 145
144 /* 146 if (reset_managed_pages_done)
145 * In free_area_init_core(), highmem zone's managed_pages is set to 147 return;
146 * present_pages, and bootmem allocator doesn't allocate from highmem
147 * zones. So there's no need to recalculate managed_pages because all
148 * highmem pages will be managed by the buddy system. Here highmem
149 * zone also includes highmem movable zone.
150 */
151 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) 148 for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
152 if (!is_highmem(z)) 149 z->managed_pages = 0;
153 z->managed_pages = 0; 150}
151
152void __init reset_all_zones_managed_pages(void)
153{
154 struct pglist_data *pgdat;
155
156 for_each_online_pgdat(pgdat)
157 reset_node_managed_pages(pgdat);
158 reset_managed_pages_done = 1;
154} 159}
155 160
156/** 161/**
@@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
160 */ 165 */
161unsigned long __init free_all_bootmem(void) 166unsigned long __init free_all_bootmem(void)
162{ 167{
163 struct pglist_data *pgdat; 168 unsigned long pages;
164 169
165 for_each_online_pgdat(pgdat) 170 reset_all_zones_managed_pages();
166 reset_node_lowmem_managed_pages(pgdat);
167 171
168 /* 172 /*
169 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 173 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
170 * because in some case like Node0 doesn't have RAM installed 174 * because in some case like Node0 doesn't have RAM installed
171 * low ram will be on Node1 175 * low ram will be on Node1
172 */ 176 */
173 return free_low_memory_core_early(); 177 pages = free_low_memory_core_early();
178 totalram_pages += pages;
179
180 return pages;
174} 181}
175 182
176/** 183/**
diff --git a/mm/nommu.c b/mm/nommu.c
index 298884dcd6e7..ecd1f158548e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -56,7 +56,6 @@
56void *high_memory; 56void *high_memory;
57struct page *mem_map; 57struct page *mem_map;
58unsigned long max_mapnr; 58unsigned long max_mapnr;
59unsigned long num_physpages;
60unsigned long highest_memmap_pfn; 59unsigned long highest_memmap_pfn;
61struct percpu_counter vm_committed_as; 60struct percpu_counter vm_committed_as;
62int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 61int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void)
85EXPORT_SYMBOL_GPL(vm_memory_committed); 84EXPORT_SYMBOL_GPL(vm_memory_committed);
86 85
87EXPORT_SYMBOL(mem_map); 86EXPORT_SYMBOL(mem_map);
88EXPORT_SYMBOL(num_physpages);
89 87
90/* list of mapped, potentially shareable regions */ 88/* list of mapped, potentially shareable regions */
91static struct kmem_cache *vm_region_jar; 89static struct kmem_cache *vm_region_jar;
@@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
282 280
283long vread(char *buf, char *addr, unsigned long count) 281long vread(char *buf, char *addr, unsigned long count)
284{ 282{
283 /* Don't allow overflow */
284 if ((unsigned long) buf + count < count)
285 count = -(unsigned long) buf;
286
285 memcpy(buf, addr, count); 287 memcpy(buf, addr, count);
286 return count; 288 return count;
287} 289}
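The guard added to the nommu vread() above clamps count so that buf + count cannot wrap past the top of the address space; -(unsigned long)buf is exactly the number of bytes left between buf and the top. A short illustration of the arithmetic (the sample addresses are made up):

#include <stdio.h>

int main(void)
{
        unsigned long buf = (unsigned long)-4096;       /* 4 KiB below the top */
        unsigned long count = 16384;                    /* would wrap around */

        if (buf + count < count)        /* unsigned wrap-around test */
                count = -buf;           /* bytes remaining up to the top */

        printf("clamped count = %lu\n", count);         /* prints 4096 */
        return 0;
}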
@@ -1869,10 +1871,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1869 return -ENOMEM; 1871 return -ENOMEM;
1870} 1872}
1871 1873
1872void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1873{
1874}
1875
1876void unmap_mapping_range(struct address_space *mapping, 1874void unmap_mapping_range(struct address_space *mapping,
1877 loff_t const holebegin, loff_t const holelen, 1875 loff_t const holebegin, loff_t const holelen,
1878 int even_cows) 1876 int even_cows)
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4514ad7415c3..3f0c895c71fe 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1619,7 +1619,7 @@ void writeback_set_ratelimit(void)
1619 ratelimit_pages = 16; 1619 ratelimit_pages = 16;
1620} 1620}
1621 1621
1622static int __cpuinit 1622static int
1623ratelimit_handler(struct notifier_block *self, unsigned long action, 1623ratelimit_handler(struct notifier_block *self, unsigned long action,
1624 void *hcpu) 1624 void *hcpu)
1625{ 1625{
@@ -1634,7 +1634,7 @@ ratelimit_handler(struct notifier_block *self, unsigned long action,
1634 } 1634 }
1635} 1635}
1636 1636
1637static struct notifier_block __cpuinitdata ratelimit_nb = { 1637static struct notifier_block ratelimit_nb = {
1638 .notifier_call = ratelimit_handler, 1638 .notifier_call = ratelimit_handler,
1639 .next = NULL, 1639 .next = NULL,
1640}; 1640};
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c3edb624fccf..b100255dedda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -61,10 +61,14 @@
61#include <linux/hugetlb.h> 61#include <linux/hugetlb.h>
62#include <linux/sched/rt.h> 62#include <linux/sched/rt.h>
63 63
64#include <asm/sections.h>
64#include <asm/tlbflush.h> 65#include <asm/tlbflush.h>
65#include <asm/div64.h> 66#include <asm/div64.h>
66#include "internal.h" 67#include "internal.h"
67 68
69/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
70static DEFINE_MUTEX(pcp_batch_high_lock);
71
68#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID 72#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
69DEFINE_PER_CPU(int, numa_node); 73DEFINE_PER_CPU(int, numa_node);
70EXPORT_PER_CPU_SYMBOL(numa_node); 74EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
100}; 104};
101EXPORT_SYMBOL(node_states); 105EXPORT_SYMBOL(node_states);
102 106
107/* Protect totalram_pages and zone->managed_pages */
108static DEFINE_SPINLOCK(managed_page_count_lock);
109
103unsigned long totalram_pages __read_mostly; 110unsigned long totalram_pages __read_mostly;
104unsigned long totalreserve_pages __read_mostly; 111unsigned long totalreserve_pages __read_mostly;
105/* 112/*
@@ -197,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
197}; 204};
198 205
199int min_free_kbytes = 1024; 206int min_free_kbytes = 1024;
207int user_min_free_kbytes;
200 208
201static unsigned long __meminitdata nr_kernel_pages; 209static unsigned long __meminitdata nr_kernel_pages;
202static unsigned long __meminitdata nr_all_pages; 210static unsigned long __meminitdata nr_all_pages;
@@ -739,14 +747,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
739 local_irq_restore(flags); 747 local_irq_restore(flags);
740} 748}
741 749
742/* 750void __init __free_pages_bootmem(struct page *page, unsigned int order)
743 * Read access to zone->managed_pages is safe because it's unsigned long,
744 * but we still need to serialize writers. Currently all callers of
745 * __free_pages_bootmem() except put_page_bootmem() should only be used
746 * at boot time. So for shorter boot time, we shift the burden to
747 * put_page_bootmem() to serialize writers.
748 */
749void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
750{ 751{
751 unsigned int nr_pages = 1 << order; 752 unsigned int nr_pages = 1 << order;
752 unsigned int loop; 753 unsigned int loop;
@@ -781,11 +782,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
781 set_page_refcounted(page); 782 set_page_refcounted(page);
782 set_pageblock_migratetype(page, MIGRATE_CMA); 783 set_pageblock_migratetype(page, MIGRATE_CMA);
783 __free_pages(page, pageblock_order); 784 __free_pages(page, pageblock_order);
784 totalram_pages += pageblock_nr_pages; 785 adjust_managed_page_count(page, pageblock_nr_pages);
785#ifdef CONFIG_HIGHMEM
786 if (PageHighMem(page))
787 totalhigh_pages += pageblock_nr_pages;
788#endif
789} 786}
790#endif 787#endif
791 788
@@ -1050,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1050 * MIGRATE_CMA areas. 1047 * MIGRATE_CMA areas.
1051 */ 1048 */
1052 if (!is_migrate_cma(migratetype) && 1049 if (!is_migrate_cma(migratetype) &&
1053 (unlikely(current_order >= pageblock_order / 2) || 1050 (current_order >= pageblock_order / 2 ||
1054 start_migratetype == MIGRATE_RECLAIMABLE || 1051 start_migratetype == MIGRATE_RECLAIMABLE ||
1055 page_group_by_mobility_disabled)) { 1052 page_group_by_mobility_disabled)) {
1056 int pages; 1053 int pages;
@@ -1179,10 +1176,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1179{ 1176{
1180 unsigned long flags; 1177 unsigned long flags;
1181 int to_drain; 1178 int to_drain;
1179 unsigned long batch;
1182 1180
1183 local_irq_save(flags); 1181 local_irq_save(flags);
1184 if (pcp->count >= pcp->batch) 1182 batch = ACCESS_ONCE(pcp->batch);
1185 to_drain = pcp->batch; 1183 if (pcp->count >= batch)
1184 to_drain = batch;
1186 else 1185 else
1187 to_drain = pcp->count; 1186 to_drain = pcp->count;
1188 if (to_drain > 0) { 1187 if (to_drain > 0) {
@@ -1350,8 +1349,9 @@ void free_hot_cold_page(struct page *page, int cold)
1350 list_add(&page->lru, &pcp->lists[migratetype]); 1349 list_add(&page->lru, &pcp->lists[migratetype]);
1351 pcp->count++; 1350 pcp->count++;
1352 if (pcp->count >= pcp->high) { 1351 if (pcp->count >= pcp->high) {
1353 free_pcppages_bulk(zone, pcp->batch, pcp); 1352 unsigned long batch = ACCESS_ONCE(pcp->batch);
1354 pcp->count -= pcp->batch; 1353 free_pcppages_bulk(zone, batch, pcp);
1354 pcp->count -= batch;
1355 } 1355 }
1356 1356
1357out: 1357out:
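Both hunks above snapshot pcp->batch once with ACCESS_ONCE() and reuse that local copy, so the threshold test and the bulk free agree even if another CPU rewrites the field concurrently. A hedged user-space sketch of the snapshot-once idea, with a C11 relaxed atomic load standing in for ACCESS_ONCE() and invented types and names:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long pcp_batch = 31;    /* may be updated asynchronously */

static void free_bulk(unsigned long count, unsigned long to_drain)
{
        printf("freeing %lu of %lu pages\n", to_drain, count);
}

static void drain_pages(unsigned long count)
{
        /* one snapshot, reused for both the test and the free below */
        unsigned long batch = atomic_load_explicit(&pcp_batch,
                                                   memory_order_relaxed);
        unsigned long to_drain = count >= batch ? batch : count;

        if (to_drain > 0)
                free_bulk(count, to_drain);
}

int main(void)
{
        drain_pages(100);
        return 0;
}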
@@ -2839,7 +2839,7 @@ EXPORT_SYMBOL(free_pages_exact);
2839 * nr_free_zone_pages() counts the number of counts pages which are beyond the 2839 * nr_free_zone_pages() counts the number of counts pages which are beyond the
2840 * high watermark within all zones at or below a given zone index. For each 2840 * high watermark within all zones at or below a given zone index. For each
2841 * zone, the number of pages is calculated as: 2841 * zone, the number of pages is calculated as:
2842 * present_pages - high_pages 2842 * managed_pages - high_pages
2843 */ 2843 */
2844static unsigned long nr_free_zone_pages(int offset) 2844static unsigned long nr_free_zone_pages(int offset)
2845{ 2845{
@@ -2906,9 +2906,13 @@ EXPORT_SYMBOL(si_meminfo);
2906#ifdef CONFIG_NUMA 2906#ifdef CONFIG_NUMA
2907void si_meminfo_node(struct sysinfo *val, int nid) 2907void si_meminfo_node(struct sysinfo *val, int nid)
2908{ 2908{
2909 int zone_type; /* needs to be signed */
2910 unsigned long managed_pages = 0;
2909 pg_data_t *pgdat = NODE_DATA(nid); 2911 pg_data_t *pgdat = NODE_DATA(nid);
2910 2912
2911 val->totalram = pgdat->node_present_pages; 2913 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
2914 managed_pages += pgdat->node_zones[zone_type].managed_pages;
2915 val->totalram = managed_pages;
2912 val->freeram = node_page_state(nid, NR_FREE_PAGES); 2916 val->freeram = node_page_state(nid, NR_FREE_PAGES);
2913#ifdef CONFIG_HIGHMEM 2917#ifdef CONFIG_HIGHMEM
2914 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; 2918 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
@@ -3150,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
3150 * Add all populated zones of a node to the zonelist. 3154 * Add all populated zones of a node to the zonelist.
3151 */ 3155 */
3152static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, 3156static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3153 int nr_zones, enum zone_type zone_type) 3157 int nr_zones)
3154{ 3158{
3155 struct zone *zone; 3159 struct zone *zone;
3156 3160 enum zone_type zone_type = MAX_NR_ZONES;
3157 BUG_ON(zone_type >= MAX_NR_ZONES);
3158 zone_type++;
3159 3161
3160 do { 3162 do {
3161 zone_type--; 3163 zone_type--;
@@ -3165,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
3165 &zonelist->_zonerefs[nr_zones++]); 3167 &zonelist->_zonerefs[nr_zones++]);
3166 check_highest_zone(zone_type); 3168 check_highest_zone(zone_type);
3167 } 3169 }
3168
3169 } while (zone_type); 3170 } while (zone_type);
3171
3170 return nr_zones; 3172 return nr_zones;
3171} 3173}
3172 3174
@@ -3250,18 +3252,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3250 static DEFINE_MUTEX(zl_order_mutex); 3252 static DEFINE_MUTEX(zl_order_mutex);
3251 3253
3252 mutex_lock(&zl_order_mutex); 3254 mutex_lock(&zl_order_mutex);
3253 if (write) 3255 if (write) {
3254 strcpy(saved_string, (char*)table->data); 3256 if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
3257 ret = -EINVAL;
3258 goto out;
3259 }
3260 strcpy(saved_string, (char *)table->data);
3261 }
3255 ret = proc_dostring(table, write, buffer, length, ppos); 3262 ret = proc_dostring(table, write, buffer, length, ppos);
3256 if (ret) 3263 if (ret)
3257 goto out; 3264 goto out;
3258 if (write) { 3265 if (write) {
3259 int oldval = user_zonelist_order; 3266 int oldval = user_zonelist_order;
3260 if (__parse_numa_zonelist_order((char*)table->data)) { 3267
3268 ret = __parse_numa_zonelist_order((char *)table->data);
3269 if (ret) {
3261 /* 3270 /*
3262 * bogus value. restore saved string 3271 * bogus value. restore saved string
3263 */ 3272 */
3264 strncpy((char*)table->data, saved_string, 3273 strncpy((char *)table->data, saved_string,
3265 NUMA_ZONELIST_ORDER_LEN); 3274 NUMA_ZONELIST_ORDER_LEN);
3266 user_zonelist_order = oldval; 3275 user_zonelist_order = oldval;
3267 } else if (oldval != user_zonelist_order) { 3276 } else if (oldval != user_zonelist_order) {
@@ -3353,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
3353 zonelist = &pgdat->node_zonelists[0]; 3362 zonelist = &pgdat->node_zonelists[0];
3354 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) 3363 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
3355 ; 3364 ;
3356 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3365 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3357 MAX_NR_ZONES - 1);
3358 zonelist->_zonerefs[j].zone = NULL; 3366 zonelist->_zonerefs[j].zone = NULL;
3359 zonelist->_zonerefs[j].zone_idx = 0; 3367 zonelist->_zonerefs[j].zone_idx = 0;
3360} 3368}
@@ -3368,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
3368 struct zonelist *zonelist; 3376 struct zonelist *zonelist;
3369 3377
3370 zonelist = &pgdat->node_zonelists[1]; 3378 zonelist = &pgdat->node_zonelists[1];
3371 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3379 j = build_zonelists_node(pgdat, zonelist, 0);
3372 zonelist->_zonerefs[j].zone = NULL; 3380 zonelist->_zonerefs[j].zone = NULL;
3373 zonelist->_zonerefs[j].zone_idx = 0; 3381 zonelist->_zonerefs[j].zone_idx = 0;
3374} 3382}
@@ -3425,8 +3433,8 @@ static int default_zonelist_order(void)
3425 z = &NODE_DATA(nid)->node_zones[zone_type]; 3433 z = &NODE_DATA(nid)->node_zones[zone_type];
3426 if (populated_zone(z)) { 3434 if (populated_zone(z)) {
3427 if (zone_type < ZONE_NORMAL) 3435 if (zone_type < ZONE_NORMAL)
3428 low_kmem_size += z->present_pages; 3436 low_kmem_size += z->managed_pages;
3429 total_size += z->present_pages; 3437 total_size += z->managed_pages;
3430 } else if (zone_type == ZONE_NORMAL) { 3438 } else if (zone_type == ZONE_NORMAL) {
3431 /* 3439 /*
3432 * If any node has only lowmem, then node order 3440 * If any node has only lowmem, then node order
@@ -3576,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat)
3576 local_node = pgdat->node_id; 3584 local_node = pgdat->node_id;
3577 3585
3578 zonelist = &pgdat->node_zonelists[0]; 3586 zonelist = &pgdat->node_zonelists[0];
3579 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); 3587 j = build_zonelists_node(pgdat, zonelist, 0);
3580 3588
3581 /* 3589 /*
3582 * Now we build the zonelist so that it contains the zones 3590 * Now we build the zonelist so that it contains the zones
@@ -3589,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat)
3589 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 3597 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
3590 if (!node_online(node)) 3598 if (!node_online(node))
3591 continue; 3599 continue;
3592 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3600 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3593 MAX_NR_ZONES - 1);
3594 } 3601 }
3595 for (node = 0; node < local_node; node++) { 3602 for (node = 0; node < local_node; node++) {
3596 if (!node_online(node)) 3603 if (!node_online(node))
3597 continue; 3604 continue;
3598 j = build_zonelists_node(NODE_DATA(node), zonelist, j, 3605 j = build_zonelists_node(NODE_DATA(node), zonelist, j);
3599 MAX_NR_ZONES - 1);
3600 } 3606 }
3601 3607
3602 zonelist->_zonerefs[j].zone = NULL; 3608 zonelist->_zonerefs[j].zone = NULL;
@@ -3705,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3705 mminit_verify_zonelist(); 3711 mminit_verify_zonelist();
3706 cpuset_init_current_mems_allowed(); 3712 cpuset_init_current_mems_allowed();
3707 } else { 3713 } else {
3708 /* we have to stop all cpus to guarantee there is no user
3709 of zonelist */
3710#ifdef CONFIG_MEMORY_HOTPLUG 3714#ifdef CONFIG_MEMORY_HOTPLUG
3711 if (zone) 3715 if (zone)
3712 setup_zone_pageset(zone); 3716 setup_zone_pageset(zone);
3713#endif 3717#endif
3718 /* we have to stop all cpus to guarantee there is no user
3719 of zonelist */
3714 stop_machine(__build_all_zonelists, pgdat, NULL); 3720 stop_machine(__build_all_zonelists, pgdat, NULL);
3715 /* cpuset refresh routine should be here */ 3721 /* cpuset refresh routine should be here */
3716 } 3722 }
@@ -4032,7 +4038,40 @@ static int __meminit zone_batchsize(struct zone *zone)
4032#endif 4038#endif
4033} 4039}
4034 4040
4035static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 4041/*
4042 * pcp->high and pcp->batch values are related and dependent on one another:
4043 * ->batch must never be higher than ->high.
4044 * The following function updates them in a safe manner without read side
4045 * locking.
4046 *
4047 * Any new users of pcp->batch and pcp->high should ensure they can cope with
4048 * those fields changing asynchronously (according to the above rule).
4049 *
4050 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
4051 * outside of boot time (or some other assurance that no concurrent updaters
4052 * exist).
4053 */
4054static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
4055 unsigned long batch)
4056{
4057 /* start with a fail safe value for batch */
4058 pcp->batch = 1;
4059 smp_wmb();
4060
4061 /* Update high, then batch, in order */
4062 pcp->high = high;
4063 smp_wmb();
4064
4065 pcp->batch = batch;
4066}
4067
4068/* a companion to pageset_set_high() */
4069static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
4070{
4071 pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
4072}
4073
4074static void pageset_init(struct per_cpu_pageset *p)
4036{ 4075{
4037 struct per_cpu_pages *pcp; 4076 struct per_cpu_pages *pcp;
4038 int migratetype; 4077 int migratetype;
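
The update ordering that pageset_update() above relies on can be sketched in plain C11. The pcp_like struct is a made-up stand-in for struct per_cpu_pages and atomic release stores stand in for smp_wmb(), so this is only an illustration of the rule, not kernel code: a reader that observes the new high before the new batch can only ever see the fail-safe batch of 1, so batch never exceeds high.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative analogue of struct per_cpu_pages, not the kernel type. */
struct pcp_like {
        _Atomic unsigned long high;
        _Atomic unsigned long batch;
};

/* Same store order as pageset_update(): fail-safe batch, then high, then batch. */
static void pcp_update(struct pcp_like *p, unsigned long high, unsigned long batch)
{
        atomic_store_explicit(&p->batch, 1, memory_order_release);
        atomic_store_explicit(&p->high, high, memory_order_release);
        atomic_store_explicit(&p->batch, batch, memory_order_release);
}

int main(void)
{
        struct pcp_like p = { .high = 6 * 31, .batch = 31 };

        pcp_update(&p, 1000, 250);
        printf("high=%lu batch=%lu\n", atomic_load(&p.high), atomic_load(&p.batch));
        return 0;
}
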
@@ -4041,45 +4080,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4041 4080
4042 pcp = &p->pcp; 4081 pcp = &p->pcp;
4043 pcp->count = 0; 4082 pcp->count = 0;
4044 pcp->high = 6 * batch;
4045 pcp->batch = max(1UL, 1 * batch);
4046 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) 4083 for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
4047 INIT_LIST_HEAD(&pcp->lists[migratetype]); 4084 INIT_LIST_HEAD(&pcp->lists[migratetype]);
4048} 4085}
4049 4086
4087static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
4088{
4089 pageset_init(p);
4090 pageset_set_batch(p, batch);
4091}
4092
4050/* 4093/*
4051 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 4094 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
4052 * to the value high for the pageset p. 4095 * to the value high for the pageset p.
4053 */ 4096 */
4054 4097static void pageset_set_high(struct per_cpu_pageset *p,
4055static void setup_pagelist_highmark(struct per_cpu_pageset *p,
4056 unsigned long high) 4098 unsigned long high)
4057{ 4099{
4058 struct per_cpu_pages *pcp; 4100 unsigned long batch = max(1UL, high / 4);
4101 if ((high / 4) > (PAGE_SHIFT * 8))
4102 batch = PAGE_SHIFT * 8;
4059 4103
4060 pcp = &p->pcp; 4104 pageset_update(&p->pcp, high, batch);
4061 pcp->high = high;
4062 pcp->batch = max(1UL, high/4);
4063 if ((high/4) > (PAGE_SHIFT * 8))
4064 pcp->batch = PAGE_SHIFT * 8;
4065} 4105}
4066 4106
4067static void __meminit setup_zone_pageset(struct zone *zone) 4107static void __meminit pageset_set_high_and_batch(struct zone *zone,
4108 struct per_cpu_pageset *pcp)
4068{ 4109{
4069 int cpu; 4110 if (percpu_pagelist_fraction)
4070 4111 pageset_set_high(pcp,
4071 zone->pageset = alloc_percpu(struct per_cpu_pageset); 4112 (zone->managed_pages /
4113 percpu_pagelist_fraction));
4114 else
4115 pageset_set_batch(pcp, zone_batchsize(zone));
4116}
4072 4117
4073 for_each_possible_cpu(cpu) { 4118static void __meminit zone_pageset_init(struct zone *zone, int cpu)
4074 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); 4119{
4120 struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
4075 4121
4076 setup_pageset(pcp, zone_batchsize(zone)); 4122 pageset_init(pcp);
4123 pageset_set_high_and_batch(zone, pcp);
4124}
4077 4125
4078 if (percpu_pagelist_fraction) 4126static void __meminit setup_zone_pageset(struct zone *zone)
4079 setup_pagelist_highmark(pcp, 4127{
4080 (zone->managed_pages / 4128 int cpu;
4081 percpu_pagelist_fraction)); 4129 zone->pageset = alloc_percpu(struct per_cpu_pageset);
4082 } 4130 for_each_possible_cpu(cpu)
4131 zone_pageset_init(zone, cpu);
4083} 4132}
4084 4133
4085/* 4134/*
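
The sizing rules above reduce to two small formulas, and the standalone calculator below mirrors them; the PAGE_SHIFT of 12 and the sample inputs are assumptions for illustration, and zone_batchsize() itself is not reproduced. With percpu_pagelist_fraction set, high comes from managed_pages and batch is a quarter of high, clamped; otherwise high is simply six times the zone batch size.

#include <stdio.h>

#define PAGE_SHIFT 12UL /* assumed 4K pages for this sketch */

/* Mirrors pageset_set_high(): batch = high / 4, clamped to [1, PAGE_SHIFT * 8]. */
static void high_and_batch_from_fraction(unsigned long managed_pages,
                                         unsigned long fraction,
                                         unsigned long *high, unsigned long *batch)
{
        *high = managed_pages / fraction;
        *batch = *high / 4;
        if (*batch > PAGE_SHIFT * 8)
                *batch = PAGE_SHIFT * 8;
        if (*batch < 1)
                *batch = 1;
}

/* Mirrors pageset_set_batch(): high is always six times the batch size. */
static void high_and_batch_from_batch(unsigned long zone_batch,
                                      unsigned long *high, unsigned long *batch)
{
        *batch = zone_batch ? zone_batch : 1;
        *high = 6 * *batch;
}

int main(void)
{
        unsigned long high, batch;

        high_and_batch_from_fraction(262144, 8, &high, &batch);
        printf("fraction=8:   high=%lu batch=%lu\n", high, batch);
        high_and_batch_from_batch(31, &high, &batch);
        printf("batchsize=31: high=%lu batch=%lu\n", high, batch);
        return 0;
}
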
@@ -4368,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
4368 */ 4417 */
4369static unsigned long __meminit zone_spanned_pages_in_node(int nid, 4418static unsigned long __meminit zone_spanned_pages_in_node(int nid,
4370 unsigned long zone_type, 4419 unsigned long zone_type,
4420 unsigned long node_start_pfn,
4421 unsigned long node_end_pfn,
4371 unsigned long *ignored) 4422 unsigned long *ignored)
4372{ 4423{
4373 unsigned long node_start_pfn, node_end_pfn;
4374 unsigned long zone_start_pfn, zone_end_pfn; 4424 unsigned long zone_start_pfn, zone_end_pfn;
4375 4425
4376 /* Get the start and end of the node and zone */ 4426 /* Get the start and end of the zone */
4377 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4378 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 4427 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
4379 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 4428 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
4380 adjust_zone_range_for_zone_movable(nid, zone_type, 4429 adjust_zone_range_for_zone_movable(nid, zone_type,
@@ -4429,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
4429/* Return the number of page frames in holes in a zone on a node */ 4478/* Return the number of page frames in holes in a zone on a node */
4430static unsigned long __meminit zone_absent_pages_in_node(int nid, 4479static unsigned long __meminit zone_absent_pages_in_node(int nid,
4431 unsigned long zone_type, 4480 unsigned long zone_type,
4481 unsigned long node_start_pfn,
4482 unsigned long node_end_pfn,
4432 unsigned long *ignored) 4483 unsigned long *ignored)
4433{ 4484{
4434 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; 4485 unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
4435 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; 4486 unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
4436 unsigned long node_start_pfn, node_end_pfn;
4437 unsigned long zone_start_pfn, zone_end_pfn; 4487 unsigned long zone_start_pfn, zone_end_pfn;
4438 4488
4439 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
4440 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); 4489 zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
4441 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); 4490 zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
4442 4491
@@ -4449,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
4449#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4498#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4450static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, 4499static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4451 unsigned long zone_type, 4500 unsigned long zone_type,
4501 unsigned long node_start_pfn,
4502 unsigned long node_end_pfn,
4452 unsigned long *zones_size) 4503 unsigned long *zones_size)
4453{ 4504{
4454 return zones_size[zone_type]; 4505 return zones_size[zone_type];
@@ -4456,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
4456 4507
4457static inline unsigned long __meminit zone_absent_pages_in_node(int nid, 4508static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4458 unsigned long zone_type, 4509 unsigned long zone_type,
4510 unsigned long node_start_pfn,
4511 unsigned long node_end_pfn,
4459 unsigned long *zholes_size) 4512 unsigned long *zholes_size)
4460{ 4513{
4461 if (!zholes_size) 4514 if (!zholes_size)
@@ -4467,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
4467#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 4520#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
4468 4521
4469static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, 4522static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
4470 unsigned long *zones_size, unsigned long *zholes_size) 4523 unsigned long node_start_pfn,
4524 unsigned long node_end_pfn,
4525 unsigned long *zones_size,
4526 unsigned long *zholes_size)
4471{ 4527{
4472 unsigned long realtotalpages, totalpages = 0; 4528 unsigned long realtotalpages, totalpages = 0;
4473 enum zone_type i; 4529 enum zone_type i;
4474 4530
4475 for (i = 0; i < MAX_NR_ZONES; i++) 4531 for (i = 0; i < MAX_NR_ZONES; i++)
4476 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 4532 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
4477 zones_size); 4533 node_start_pfn,
4534 node_end_pfn,
4535 zones_size);
4478 pgdat->node_spanned_pages = totalpages; 4536 pgdat->node_spanned_pages = totalpages;
4479 4537
4480 realtotalpages = totalpages; 4538 realtotalpages = totalpages;
4481 for (i = 0; i < MAX_NR_ZONES; i++) 4539 for (i = 0; i < MAX_NR_ZONES; i++)
4482 realtotalpages -= 4540 realtotalpages -=
4483 zone_absent_pages_in_node(pgdat->node_id, i, 4541 zone_absent_pages_in_node(pgdat->node_id, i,
4484 zholes_size); 4542 node_start_pfn, node_end_pfn,
4543 zholes_size);
4485 pgdat->node_present_pages = realtotalpages; 4544 pgdat->node_present_pages = realtotalpages;
4486 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 4545 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
4487 realtotalpages); 4546 realtotalpages);
@@ -4590,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
4590 * NOTE: pgdat should get zeroed by caller. 4649 * NOTE: pgdat should get zeroed by caller.
4591 */ 4650 */
4592static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4651static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4652 unsigned long node_start_pfn, unsigned long node_end_pfn,
4593 unsigned long *zones_size, unsigned long *zholes_size) 4653 unsigned long *zones_size, unsigned long *zholes_size)
4594{ 4654{
4595 enum zone_type j; 4655 enum zone_type j;
@@ -4611,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4611 struct zone *zone = pgdat->node_zones + j; 4671 struct zone *zone = pgdat->node_zones + j;
4612 unsigned long size, realsize, freesize, memmap_pages; 4672 unsigned long size, realsize, freesize, memmap_pages;
4613 4673
4614 size = zone_spanned_pages_in_node(nid, j, zones_size); 4674 size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
4675 node_end_pfn, zones_size);
4615 realsize = freesize = size - zone_absent_pages_in_node(nid, j, 4676 realsize = freesize = size - zone_absent_pages_in_node(nid, j,
4677 node_start_pfn,
4678 node_end_pfn,
4616 zholes_size); 4679 zholes_size);
4617 4680
4618 /* 4681 /*
@@ -4726,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4726 unsigned long node_start_pfn, unsigned long *zholes_size) 4789 unsigned long node_start_pfn, unsigned long *zholes_size)
4727{ 4790{
4728 pg_data_t *pgdat = NODE_DATA(nid); 4791 pg_data_t *pgdat = NODE_DATA(nid);
4792 unsigned long start_pfn = 0;
4793 unsigned long end_pfn = 0;
4729 4794
4730 /* pg_data_t should be reset to zero when it's allocated */ 4795 /* pg_data_t should be reset to zero when it's allocated */
4731 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); 4796 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
@@ -4733,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4733 pgdat->node_id = nid; 4798 pgdat->node_id = nid;
4734 pgdat->node_start_pfn = node_start_pfn; 4799 pgdat->node_start_pfn = node_start_pfn;
4735 init_zone_allows_reclaim(nid); 4800 init_zone_allows_reclaim(nid);
4736 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4801#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
4802 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
4803#endif
4804 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
4805 zones_size, zholes_size);
4737 4806
4738 alloc_node_mem_map(pgdat); 4807 alloc_node_mem_map(pgdat);
4739#ifdef CONFIG_FLAT_NODE_MEM_MAP 4808#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -4742,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4742 (unsigned long)pgdat->node_mem_map); 4811 (unsigned long)pgdat->node_mem_map);
4743#endif 4812#endif
4744 4813
4745 free_area_init_core(pgdat, zones_size, zholes_size); 4814 free_area_init_core(pgdat, start_pfn, end_pfn,
4815 zones_size, zholes_size);
4746} 4816}
4747 4817
4748#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 4818#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5150,35 +5220,101 @@ early_param("movablecore", cmdline_parse_movablecore);
5150 5220
5151#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ 5221#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5152 5222
5153unsigned long free_reserved_area(unsigned long start, unsigned long end, 5223void adjust_managed_page_count(struct page *page, long count)
5154 int poison, char *s)
5155{ 5224{
5156 unsigned long pages, pos; 5225 spin_lock(&managed_page_count_lock);
5226 page_zone(page)->managed_pages += count;
5227 totalram_pages += count;
5228#ifdef CONFIG_HIGHMEM
5229 if (PageHighMem(page))
5230 totalhigh_pages += count;
5231#endif
5232 spin_unlock(&managed_page_count_lock);
5233}
5234EXPORT_SYMBOL(adjust_managed_page_count);
5157 5235
5158 pos = start = PAGE_ALIGN(start); 5236unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
5159 end &= PAGE_MASK; 5237{
5160 for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { 5238 void *pos;
5161 if (poison) 5239 unsigned long pages = 0;
5162 memset((void *)pos, poison, PAGE_SIZE); 5240
5163 free_reserved_page(virt_to_page((void *)pos)); 5241 start = (void *)PAGE_ALIGN((unsigned long)start);
5242 end = (void *)((unsigned long)end & PAGE_MASK);
5243 for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
5244 if ((unsigned int)poison <= 0xFF)
5245 memset(pos, poison, PAGE_SIZE);
5246 free_reserved_page(virt_to_page(pos));
5164 } 5247 }
5165 5248
5166 if (pages && s) 5249 if (pages && s)
5167 pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", 5250 pr_info("Freeing %s memory: %ldK (%p - %p)\n",
5168 s, pages << (PAGE_SHIFT - 10), start, end); 5251 s, pages << (PAGE_SHIFT - 10), start, end);
5169 5252
5170 return pages; 5253 return pages;
5171} 5254}
5255EXPORT_SYMBOL(free_reserved_area);
5172 5256
5173#ifdef CONFIG_HIGHMEM 5257#ifdef CONFIG_HIGHMEM
5174void free_highmem_page(struct page *page) 5258void free_highmem_page(struct page *page)
5175{ 5259{
5176 __free_reserved_page(page); 5260 __free_reserved_page(page);
5177 totalram_pages++; 5261 totalram_pages++;
5262 page_zone(page)->managed_pages++;
5178 totalhigh_pages++; 5263 totalhigh_pages++;
5179} 5264}
5180#endif 5265#endif
5181 5266
5267
5268void __init mem_init_print_info(const char *str)
5269{
5270 unsigned long physpages, codesize, datasize, rosize, bss_size;
5271 unsigned long init_code_size, init_data_size;
5272
5273 physpages = get_num_physpages();
5274 codesize = _etext - _stext;
5275 datasize = _edata - _sdata;
5276 rosize = __end_rodata - __start_rodata;
5277 bss_size = __bss_stop - __bss_start;
5278 init_data_size = __init_end - __init_begin;
5279 init_code_size = _einittext - _sinittext;
5280
5281 /*
5282 * Detect special cases and adjust section sizes accordingly:
5283 * 1) .init.* may be embedded into .data sections
5284 * 2) .init.text.* may be out of [__init_begin, __init_end],
5285 * please refer to arch/tile/kernel/vmlinux.lds.S.
5286 * 3) .rodata.* may be embedded into .text or .data sections.
5287 */
5288#define adj_init_size(start, end, size, pos, adj) \
5289 if (start <= pos && pos < end && size > adj) \
5290 size -= adj;
5291
5292 adj_init_size(__init_begin, __init_end, init_data_size,
5293 _sinittext, init_code_size);
5294 adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
5295 adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
5296 adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
5297 adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
5298
5299#undef adj_init_size
5300
5301 printk("Memory: %luK/%luK available "
5302 "(%luK kernel code, %luK rwdata, %luK rodata, "
5303 "%luK init, %luK bss, %luK reserved"
5304#ifdef CONFIG_HIGHMEM
5305 ", %luK highmem"
5306#endif
5307 "%s%s)\n",
5308 nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10),
5309 codesize >> 10, datasize >> 10, rosize >> 10,
5310 (init_data_size + init_code_size) >> 10, bss_size >> 10,
5311 (physpages - totalram_pages) << (PAGE_SHIFT-10),
5312#ifdef CONFIG_HIGHMEM
5313 totalhigh_pages << (PAGE_SHIFT-10),
5314#endif
5315 str ? ", " : "", str ? str : "");
5316}
5317
5182/** 5318/**
5183 * set_dma_reserve - set the specified number of pages reserved in the first zone 5319 * set_dma_reserve - set the specified number of pages reserved in the first zone
5184 * @new_dma_reserve: The number of pages to mark reserved 5320 * @new_dma_reserve: The number of pages to mark reserved
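
The pointer arithmetic in the reworked free_reserved_area() above can be exercised as a rough userspace analogue; the page size, the buffer and the helper name are made up, and nothing is actually freed here, the sketch only counts (and optionally poisons) the whole aligned pages inside a byte range in the same way.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL /* assumed page size for this sketch */
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Count, and optionally poison, the whole pages inside [start, end). */
static unsigned long count_reserved_pages(void *start, void *end, int poison)
{
        char *pos, *first, *last;
        unsigned long pages = 0;

        first = (char *)PAGE_ALIGN((uintptr_t)start);
        last = (char *)((uintptr_t)end & ~(PAGE_SIZE - 1));
        for (pos = first; pos < last; pos += PAGE_SIZE, pages++)
                if ((unsigned int)poison <= 0xFF) /* same "is it a byte?" test */
                        memset(pos, poison, PAGE_SIZE);
        return pages;
}

int main(void)
{
        size_t len = 10 * PAGE_SIZE + 123;
        char *buf = malloc(len);

        if (!buf)
                return 1;
        /* poison of -1 means "do not poison", exactly as in the kernel helper */
        printf("%lu whole pages inside the buffer\n",
               count_reserved_pages(buf + 1, buf + len, -1));
        free(buf);
        return 0;
}
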
@@ -5454,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void)
5454int __meminit init_per_zone_wmark_min(void) 5590int __meminit init_per_zone_wmark_min(void)
5455{ 5591{
5456 unsigned long lowmem_kbytes; 5592 unsigned long lowmem_kbytes;
5593 int new_min_free_kbytes;
5457 5594
5458 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 5595 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
5459 5596 new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
5460 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 5597
5461 if (min_free_kbytes < 128) 5598 if (new_min_free_kbytes > user_min_free_kbytes) {
5462 min_free_kbytes = 128; 5599 min_free_kbytes = new_min_free_kbytes;
5463 if (min_free_kbytes > 65536) 5600 if (min_free_kbytes < 128)
5464 min_free_kbytes = 65536; 5601 min_free_kbytes = 128;
5602 if (min_free_kbytes > 65536)
5603 min_free_kbytes = 65536;
5604 } else {
5605 pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
5606 new_min_free_kbytes, user_min_free_kbytes);
5607 }
5465 setup_per_zone_wmarks(); 5608 setup_per_zone_wmarks();
5466 refresh_zone_stat_thresholds(); 5609 refresh_zone_stat_thresholds();
5467 setup_per_zone_lowmem_reserve(); 5610 setup_per_zone_lowmem_reserve();
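
To get a feel for the watermark that the reworked init_per_zone_wmark_min() would pick, a small standalone calculator along these lines is enough; the naive integer square root and the sample lowmem figure are stand-ins for the kernel's int_sqrt() and nr_free_buffer_pages(), and a user-set value simply wins whenever it is at least as large as the computed one.

#include <stdio.h>

/* Naive integer square root, standing in for the kernel's int_sqrt(). */
static unsigned long isqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

/* Mirrors the sizing in init_per_zone_wmark_min(), without the sysctl side. */
static unsigned long pick_min_free_kbytes(unsigned long lowmem_kbytes,
                                          unsigned long user_min_free_kbytes)
{
        unsigned long new_min = isqrt(lowmem_kbytes * 16);

        if (new_min <= user_min_free_kbytes)
                return user_min_free_kbytes; /* user-defined value is preferred */
        if (new_min < 128)
                new_min = 128;
        if (new_min > 65536)
                new_min = 65536;
        return new_min;
}

int main(void)
{
        /* roughly 4GB of lowmem, no user override */
        printf("min_free_kbytes = %lu\n", pick_min_free_kbytes(4UL * 1024 * 1024, 0));
        return 0;
}
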
@@ -5479,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5479 void __user *buffer, size_t *length, loff_t *ppos) 5622 void __user *buffer, size_t *length, loff_t *ppos)
5480{ 5623{
5481 proc_dointvec(table, write, buffer, length, ppos); 5624 proc_dointvec(table, write, buffer, length, ppos);
5482 if (write) 5625 if (write) {
5626 user_min_free_kbytes = min_free_kbytes;
5483 setup_per_zone_wmarks(); 5627 setup_per_zone_wmarks();
5628 }
5484 return 0; 5629 return 0;
5485} 5630}
5486 5631
@@ -5540,7 +5685,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5540 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5685 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
5541 * can have before it gets flushed back to buddy allocator. 5686 * can have before it gets flushed back to buddy allocator.
5542 */ 5687 */
5543
5544int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5688int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5545 void __user *buffer, size_t *length, loff_t *ppos) 5689 void __user *buffer, size_t *length, loff_t *ppos)
5546{ 5690{
@@ -5551,14 +5695,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5551 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5695 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5552 if (!write || (ret < 0)) 5696 if (!write || (ret < 0))
5553 return ret; 5697 return ret;
5698
5699 mutex_lock(&pcp_batch_high_lock);
5554 for_each_populated_zone(zone) { 5700 for_each_populated_zone(zone) {
5555 for_each_possible_cpu(cpu) { 5701 unsigned long high;
5556 unsigned long high; 5702 high = zone->managed_pages / percpu_pagelist_fraction;
5557 high = zone->managed_pages / percpu_pagelist_fraction; 5703 for_each_possible_cpu(cpu)
5558 setup_pagelist_highmark( 5704 pageset_set_high(per_cpu_ptr(zone->pageset, cpu),
5559 per_cpu_ptr(zone->pageset, cpu), high); 5705 high);
5560 }
5561 } 5706 }
5707 mutex_unlock(&pcp_batch_high_lock);
5562 return 0; 5708 return 0;
5563} 5709}
5564 5710
@@ -6047,32 +6193,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
6047#endif 6193#endif
6048 6194
6049#ifdef CONFIG_MEMORY_HOTPLUG 6195#ifdef CONFIG_MEMORY_HOTPLUG
6050static int __meminit __zone_pcp_update(void *data) 6196/*
6051{ 6197 * The zone indicated has a new number of managed_pages; batch sizes and percpu
 6052 struct zone *zone = data; 6198 * page high values need to be recalculated.
6053 int cpu; 6199 */
6054 unsigned long batch = zone_batchsize(zone), flags;
6055
6056 for_each_possible_cpu(cpu) {
6057 struct per_cpu_pageset *pset;
6058 struct per_cpu_pages *pcp;
6059
6060 pset = per_cpu_ptr(zone->pageset, cpu);
6061 pcp = &pset->pcp;
6062
6063 local_irq_save(flags);
6064 if (pcp->count > 0)
6065 free_pcppages_bulk(zone, pcp->count, pcp);
6066 drain_zonestat(zone, pset);
6067 setup_pageset(pset, batch);
6068 local_irq_restore(flags);
6069 }
6070 return 0;
6071}
6072
6073void __meminit zone_pcp_update(struct zone *zone) 6200void __meminit zone_pcp_update(struct zone *zone)
6074{ 6201{
6075 stop_machine(__zone_pcp_update, zone, NULL); 6202 unsigned cpu;
6203 mutex_lock(&pcp_batch_high_lock);
6204 for_each_possible_cpu(cpu)
6205 pageset_set_high_and_batch(zone,
6206 per_cpu_ptr(zone->pageset, cpu));
6207 mutex_unlock(&pcp_batch_high_lock);
6076} 6208}
6077#endif 6209#endif
6078 6210
@@ -6142,6 +6274,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
6142 list_del(&page->lru); 6274 list_del(&page->lru);
6143 rmv_page_order(page); 6275 rmv_page_order(page);
6144 zone->free_area[order].nr_free--; 6276 zone->free_area[order].nr_free--;
6277#ifdef CONFIG_HIGHMEM
6278 if (PageHighMem(page))
6279 totalhigh_pages -= 1 << order;
6280#endif
6145 for (i = 0; i < (1 << order); i++) 6281 for (i = 0; i < (1 << order); i++)
6146 SetPageReserved((page+i)); 6282 SetPageReserved((page+i));
6147 pfn += (1 << order); 6283 pfn += (1 << order);
diff --git a/mm/page_io.c b/mm/page_io.c
index a8a3ef45fed7..ba05b64e5d8d 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h> 22#include <linux/frontswap.h>
23#include <linux/aio.h> 23#include <linux/aio.h>
24#include <linux/blkdev.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25 26
26static struct bio *get_swap_bio(gfp_t gfp_flags, 27static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err)
80 imajor(bio->bi_bdev->bd_inode), 81 imajor(bio->bi_bdev->bd_inode),
81 iminor(bio->bi_bdev->bd_inode), 82 iminor(bio->bi_bdev->bd_inode),
82 (unsigned long long)bio->bi_sector); 83 (unsigned long long)bio->bi_sector);
83 } else { 84 goto out;
84 SetPageUptodate(page);
85 } 85 }
86
87 SetPageUptodate(page);
88
89 /*
90 * There is no guarantee that the page is in swap cache - the software
91 * suspend code (at least) uses end_swap_bio_read() against a non-
92 * swapcache page. So we must check PG_swapcache before proceeding with
93 * this optimization.
94 */
95 if (likely(PageSwapCache(page))) {
96 struct swap_info_struct *sis;
97
98 sis = page_swap_info(page);
99 if (sis->flags & SWP_BLKDEV) {
100 /*
101 * The swap subsystem performs lazy swap slot freeing,
102 * expecting that the page will be swapped out again.
103 * So we can avoid an unnecessary write if the page
104 * isn't redirtied.
105 * This is good for real swap storage because we can
106 * reduce unnecessary I/O and enhance wear-leveling
 107 * if an SSD is used as the swap device.
 108 * But if an in-memory swap device (e.g. zram) is used,
109 * this causes a duplicated copy between uncompressed
110 * data in VM-owned memory and compressed data in
111 * zram-owned memory. So let's free zram-owned memory
112 * and make the VM-owned decompressed page *dirty*,
113 * so the page should be swapped out somewhere again if
114 * we again wish to reclaim it.
115 */
116 struct gendisk *disk = sis->bdev->bd_disk;
117 if (disk->fops->swap_slot_free_notify) {
118 swp_entry_t entry;
119 unsigned long offset;
120
121 entry.val = page_private(page);
122 offset = swp_offset(entry);
123
124 SetPageDirty(page);
125 disk->fops->swap_slot_free_notify(sis->bdev,
126 offset);
127 }
128 }
129 }
130
131out:
86 unlock_page(page); 132 unlock_page(page);
87 bio_put(bio); 133 bio_put(bio);
88} 134}
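
For context, swap_slot_free_notify is an optional hook in struct block_device_operations that an in-memory block driver such as zram can provide, and it is what the code above invokes. A minimal sketch of how such a driver might wire it up follows; the mydrv naming and the slot bookkeeping are hypothetical, only the operations field and its signature are the real interface, and this is a fragment rather than a complete driver.

#include <linux/module.h>
#include <linux/blkdev.h>

/*
 * Hypothetical per-device state; a real driver would keep its compressed
 * slots here and release slot 'index' from the notify hook.
 */
struct mydrv {
        unsigned long slots_freed;
};

/* Called by end_swap_bio_read() once the slot's data has been read back. */
static void mydrv_swap_slot_free_notify(struct block_device *bdev,
                                        unsigned long index)
{
        struct mydrv *drv = bdev->bd_disk->private_data;

        pr_debug("releasing compressed slot %lu\n", index);
        drv->slots_freed++; /* stand-in for actually freeing the slot */
}

static const struct block_device_operations mydrv_fops = {
        .owner                  = THIS_MODULE,
        .swap_slot_free_notify  = mydrv_swap_slot_free_notify,
};
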
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 0c8323fe6c8f..e1a6e4fab016 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
124 124
125#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT 125#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
126#ifdef CONFIG_TRANSPARENT_HUGEPAGE 126#ifdef CONFIG_TRANSPARENT_HUGEPAGE
127void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) 127void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
128 pgtable_t pgtable)
128{ 129{
129 assert_spin_locked(&mm->page_table_lock); 130 assert_spin_locked(&mm->page_table_lock);
130 131
@@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
141#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW 142#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
142#ifdef CONFIG_TRANSPARENT_HUGEPAGE 143#ifdef CONFIG_TRANSPARENT_HUGEPAGE
143/* no "address" argument so destroys page coloring of some arch */ 144/* no "address" argument so destroys page coloring of some arch */
144pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) 145pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
145{ 146{
146 pgtable_t pgtable; 147 pgtable_t pgtable;
147 148
diff --git a/mm/readahead.c b/mm/readahead.c
index daed28dd5830..829a77c62834 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -48,7 +48,7 @@ static void read_cache_pages_invalidate_page(struct address_space *mapping,
48 if (!trylock_page(page)) 48 if (!trylock_page(page))
49 BUG(); 49 BUG();
50 page->mapping = mapping; 50 page->mapping = mapping;
51 do_invalidatepage(page, 0); 51 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
52 page->mapping = NULL; 52 page->mapping = NULL;
53 unlock_page(page); 53 unlock_page(page);
54 } 54 }
diff --git a/mm/rmap.c b/mm/rmap.c
index 6280da86b5d6..b2e29acd7e3d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
720 * mapping is already gone, the unmap path will have 720 * mapping is already gone, the unmap path will have
721 * set PG_referenced or activated the page. 721 * set PG_referenced or activated the page.
722 */ 722 */
723 if (likely(!VM_SequentialReadHint(vma))) 723 if (likely(!(vma->vm_flags & VM_SEQ_READ)))
724 referenced++; 724 referenced++;
725 } 725 }
726 pte_unmap_unlock(pte, ptl); 726 pte_unmap_unlock(pte, ptl);
@@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page,
1093 else 1093 else
1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1095 __page_set_anon_rmap(page, vma, address, 1); 1095 __page_set_anon_rmap(page, vma, address, 1);
1096 if (!mlocked_vma_newpage(vma, page)) 1096 if (!mlocked_vma_newpage(vma, page)) {
1097 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1097 SetPageActive(page);
1098 else 1098 lru_cache_add(page);
1099 } else
1099 add_page_to_unevictable_list(page); 1100 add_page_to_unevictable_list(page);
1100} 1101}
1101 1102
@@ -1235,6 +1236,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1235 swp_entry_to_pte(make_hwpoison_entry(page))); 1236 swp_entry_to_pte(make_hwpoison_entry(page)));
1236 } else if (PageAnon(page)) { 1237 } else if (PageAnon(page)) {
1237 swp_entry_t entry = { .val = page_private(page) }; 1238 swp_entry_t entry = { .val = page_private(page) };
1239 pte_t swp_pte;
1238 1240
1239 if (PageSwapCache(page)) { 1241 if (PageSwapCache(page)) {
1240 /* 1242 /*
@@ -1263,7 +1265,10 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1263 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); 1265 BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION);
1264 entry = make_migration_entry(page, pte_write(pteval)); 1266 entry = make_migration_entry(page, pte_write(pteval));
1265 } 1267 }
1266 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1268 swp_pte = swp_entry_to_pte(entry);
1269 if (pte_soft_dirty(pteval))
1270 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1271 set_pte_at(mm, address, pte, swp_pte);
1267 BUG_ON(pte_file(*pte)); 1272 BUG_ON(pte_file(*pte));
1268 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1273 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1269 (TTU_ACTION(flags) == TTU_MIGRATION)) { 1274 (TTU_ACTION(flags) == TTU_MIGRATION)) {
@@ -1400,8 +1405,12 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1400 pteval = ptep_clear_flush(vma, address, pte); 1405 pteval = ptep_clear_flush(vma, address, pte);
1401 1406
1402 /* If nonlinear, store the file page offset in the pte. */ 1407 /* If nonlinear, store the file page offset in the pte. */
1403 if (page->index != linear_page_index(vma, address)) 1408 if (page->index != linear_page_index(vma, address)) {
1404 set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); 1409 pte_t ptfile = pgoff_to_pte(page->index);
1410 if (pte_soft_dirty(pteval))
1411 pte_file_mksoft_dirty(ptfile);
1412 set_pte_at(mm, address, pte, ptfile);
1413 }
1405 1414
1406 /* Move the dirty bit to the physical page now the pte is gone. */ 1415 /* Move the dirty bit to the physical page now the pte is gone. */
1407 if (pte_dirty(pteval)) 1416 if (pte_dirty(pteval))
diff --git a/mm/shmem.c b/mm/shmem.c
index 5e6a8422658b..8335dbd3fc35 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1798,10 +1798,8 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
1798 } 1798 }
1799 } 1799 }
1800 1800
1801 if (offset >= 0 && offset != file->f_pos) { 1801 if (offset >= 0)
1802 file->f_pos = offset; 1802 offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
1803 file->f_version = 0;
1804 }
1805 mutex_unlock(&inode->i_mutex); 1803 mutex_unlock(&inode->i_mutex);
1806 return offset; 1804 return offset;
1807} 1805}
@@ -1939,6 +1937,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1939 1937
1940 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1938 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1941 if (inode) { 1939 if (inode) {
1940#ifdef CONFIG_TMPFS_POSIX_ACL
1941 error = generic_acl_init(inode, dir);
1942 if (error) {
1943 iput(inode);
1944 return error;
1945 }
1946#endif
1942 error = security_inode_init_security(inode, dir, 1947 error = security_inode_init_security(inode, dir,
1943 &dentry->d_name, 1948 &dentry->d_name,
1944 shmem_initxattrs, NULL); 1949 shmem_initxattrs, NULL);
@@ -1948,6 +1953,33 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1948 return error; 1953 return error;
1949 } 1954 }
1950 } 1955 }
1956
1957 error = 0;
1958 dir->i_size += BOGO_DIRENT_SIZE;
1959 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1960 d_instantiate(dentry, inode);
1961 dget(dentry); /* Extra count - pin the dentry in core */
1962 }
1963 return error;
1964}
1965
1966static int
1967shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
1968{
1969 struct inode *inode;
1970 int error = -ENOSPC;
1971
1972 inode = shmem_get_inode(dir->i_sb, dir, mode, 0, VM_NORESERVE);
1973 if (inode) {
1974 error = security_inode_init_security(inode, dir,
1975 NULL,
1976 shmem_initxattrs, NULL);
1977 if (error) {
1978 if (error != -EOPNOTSUPP) {
1979 iput(inode);
1980 return error;
1981 }
1982 }
1951#ifdef CONFIG_TMPFS_POSIX_ACL 1983#ifdef CONFIG_TMPFS_POSIX_ACL
1952 error = generic_acl_init(inode, dir); 1984 error = generic_acl_init(inode, dir);
1953 if (error) { 1985 if (error) {
@@ -1957,10 +1989,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1957#else 1989#else
1958 error = 0; 1990 error = 0;
1959#endif 1991#endif
1960 dir->i_size += BOGO_DIRENT_SIZE; 1992 d_tmpfile(dentry, inode);
1961 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1962 d_instantiate(dentry, inode);
1963 dget(dentry); /* Extra count - pin the dentry in core */
1964 } 1993 }
1965 return error; 1994 return error;
1966} 1995}
@@ -2723,6 +2752,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
2723 .rmdir = shmem_rmdir, 2752 .rmdir = shmem_rmdir,
2724 .mknod = shmem_mknod, 2753 .mknod = shmem_mknod,
2725 .rename = shmem_rename, 2754 .rename = shmem_rename,
2755 .tmpfile = shmem_tmpfile,
2726#endif 2756#endif
2727#ifdef CONFIG_TMPFS_XATTR 2757#ifdef CONFIG_TMPFS_XATTR
2728 .setxattr = shmem_setxattr, 2758 .setxattr = shmem_setxattr,
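
The new .tmpfile hook is what lets O_TMPFILE work on tmpfs mounts. A quick userspace check might look like the following; /dev/shm is used only because it is a commonly available tmpfs instance, and O_TMPFILE needs a kernel and C library recent enough to define it.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Unnamed file on a tmpfs mount; it vanishes when the fd is closed. */
        int fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);

        if (fd < 0) {
                perror("O_TMPFILE on tmpfs");
                return 1;
        }
        if (write(fd, "scratch data\n", 13) != 13)
                perror("write");
        close(fd);
        return 0;
}
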
diff --git a/mm/slab.c b/mm/slab.c
index 8ccd296c6d9c..2580db062df9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -565,7 +565,7 @@ static void init_node_lock_keys(int q)
565 if (slab_state < UP) 565 if (slab_state < UP)
566 return; 566 return;
567 567
568 for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { 568 for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) {
569 struct kmem_cache_node *n; 569 struct kmem_cache_node *n;
570 struct kmem_cache *cache = kmalloc_caches[i]; 570 struct kmem_cache *cache = kmalloc_caches[i];
571 571
@@ -787,7 +787,7 @@ static void next_reap_node(void)
787 * the CPUs getting into lockstep and contending for the global cache chain 787 * the CPUs getting into lockstep and contending for the global cache chain
788 * lock. 788 * lock.
789 */ 789 */
790static void __cpuinit start_cpu_timer(int cpu) 790static void start_cpu_timer(int cpu)
791{ 791{
792 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); 792 struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
793 793
@@ -1180,7 +1180,13 @@ static int init_cache_node_node(int node)
1180 return 0; 1180 return 0;
1181} 1181}
1182 1182
1183static void __cpuinit cpuup_canceled(long cpu) 1183static inline int slabs_tofree(struct kmem_cache *cachep,
1184 struct kmem_cache_node *n)
1185{
1186 return (n->free_objects + cachep->num - 1) / cachep->num;
1187}
1188
1189static void cpuup_canceled(long cpu)
1184{ 1190{
1185 struct kmem_cache *cachep; 1191 struct kmem_cache *cachep;
1186 struct kmem_cache_node *n = NULL; 1192 struct kmem_cache_node *n = NULL;
@@ -1241,11 +1247,11 @@ free_array_cache:
1241 n = cachep->node[node]; 1247 n = cachep->node[node];
1242 if (!n) 1248 if (!n)
1243 continue; 1249 continue;
1244 drain_freelist(cachep, n, n->free_objects); 1250 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1245 } 1251 }
1246} 1252}
1247 1253
1248static int __cpuinit cpuup_prepare(long cpu) 1254static int cpuup_prepare(long cpu)
1249{ 1255{
1250 struct kmem_cache *cachep; 1256 struct kmem_cache *cachep;
1251 struct kmem_cache_node *n = NULL; 1257 struct kmem_cache_node *n = NULL;
@@ -1328,7 +1334,7 @@ bad:
1328 return -ENOMEM; 1334 return -ENOMEM;
1329} 1335}
1330 1336
1331static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1337static int cpuup_callback(struct notifier_block *nfb,
1332 unsigned long action, void *hcpu) 1338 unsigned long action, void *hcpu)
1333{ 1339{
1334 long cpu = (long)hcpu; 1340 long cpu = (long)hcpu;
@@ -1384,7 +1390,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1384 return notifier_from_errno(err); 1390 return notifier_from_errno(err);
1385} 1391}
1386 1392
1387static struct notifier_block __cpuinitdata cpucache_notifier = { 1393static struct notifier_block cpucache_notifier = {
1388 &cpuup_callback, NULL, 0 1394 &cpuup_callback, NULL, 0
1389}; 1395};
1390 1396
@@ -1408,7 +1414,7 @@ static int __meminit drain_cache_node_node(int node)
1408 if (!n) 1414 if (!n)
1409 continue; 1415 continue;
1410 1416
1411 drain_freelist(cachep, n, n->free_objects); 1417 drain_freelist(cachep, n, slabs_tofree(cachep, n));
1412 1418
1413 if (!list_empty(&n->slabs_full) || 1419 if (!list_empty(&n->slabs_full) ||
1414 !list_empty(&n->slabs_partial)) { 1420 !list_empty(&n->slabs_partial)) {
@@ -2532,7 +2538,7 @@ static int __cache_shrink(struct kmem_cache *cachep)
2532 if (!n) 2538 if (!n)
2533 continue; 2539 continue;
2534 2540
2535 drain_freelist(cachep, n, n->free_objects); 2541 drain_freelist(cachep, n, slabs_tofree(cachep, n));
2536 2542
2537 ret += !list_empty(&n->slabs_full) || 2543 ret += !list_empty(&n->slabs_full) ||
2538 !list_empty(&n->slabs_partial); 2544 !list_empty(&n->slabs_partial);
@@ -3338,18 +3344,6 @@ done:
3338 return obj; 3344 return obj;
3339} 3345}
3340 3346
3341/**
3342 * kmem_cache_alloc_node - Allocate an object on the specified node
3343 * @cachep: The cache to allocate from.
3344 * @flags: See kmalloc().
3345 * @nodeid: node number of the target node.
3346 * @caller: return address of caller, used for debug information
3347 *
3348 * Identical to kmem_cache_alloc but it will allocate memory on the given
3349 * node, which can improve the performance for cpu bound structures.
3350 *
3351 * Fallback to other node is possible if __GFP_THISNODE is not set.
3352 */
3353static __always_inline void * 3347static __always_inline void *
3354slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, 3348slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3355 unsigned long caller) 3349 unsigned long caller)
@@ -3643,6 +3637,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
3643#endif 3637#endif
3644 3638
3645#ifdef CONFIG_NUMA 3639#ifdef CONFIG_NUMA
3640/**
3641 * kmem_cache_alloc_node - Allocate an object on the specified node
3642 * @cachep: The cache to allocate from.
3643 * @flags: See kmalloc().
3644 * @nodeid: node number of the target node.
3645 *
3646 * Identical to kmem_cache_alloc but it will allocate memory on the given
3647 * node, which can improve the performance for cpu bound structures.
3648 *
3649 * Fallback to other node is possible if __GFP_THISNODE is not set.
3650 */
3646void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3651void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3647{ 3652{
3648 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); 3653 void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
@@ -4431,20 +4436,10 @@ static int leaks_show(struct seq_file *m, void *p)
4431 return 0; 4436 return 0;
4432} 4437}
4433 4438
4434static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4435{
4436 return seq_list_next(p, &slab_caches, pos);
4437}
4438
4439static void s_stop(struct seq_file *m, void *p)
4440{
4441 mutex_unlock(&slab_mutex);
4442}
4443
4444static const struct seq_operations slabstats_op = { 4439static const struct seq_operations slabstats_op = {
4445 .start = leaks_start, 4440 .start = leaks_start,
4446 .next = s_next, 4441 .next = slab_next,
4447 .stop = s_stop, 4442 .stop = slab_stop,
4448 .show = leaks_show, 4443 .show = leaks_show,
4449}; 4444};
4450 4445
diff --git a/mm/slab.h b/mm/slab.h
index f96b49e4704e..620ceeddbe1a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -271,3 +271,6 @@ struct kmem_cache_node {
271#endif 271#endif
272 272
273}; 273};
274
275void *slab_next(struct seq_file *m, void *p, loff_t *pos);
276void slab_stop(struct seq_file *m, void *p);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 2d414508e9ec..538bade6df7d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -497,6 +497,13 @@ void __init create_kmalloc_caches(unsigned long flags)
497 497
498 498
499#ifdef CONFIG_SLABINFO 499#ifdef CONFIG_SLABINFO
500
501#ifdef CONFIG_SLAB
502#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
503#else
504#define SLABINFO_RIGHTS S_IRUSR
505#endif
506
500void print_slabinfo_header(struct seq_file *m) 507void print_slabinfo_header(struct seq_file *m)
501{ 508{
502 /* 509 /*
@@ -531,12 +538,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
531 return seq_list_start(&slab_caches, *pos); 538 return seq_list_start(&slab_caches, *pos);
532} 539}
533 540
534static void *s_next(struct seq_file *m, void *p, loff_t *pos) 541void *slab_next(struct seq_file *m, void *p, loff_t *pos)
535{ 542{
536 return seq_list_next(p, &slab_caches, pos); 543 return seq_list_next(p, &slab_caches, pos);
537} 544}
538 545
539static void s_stop(struct seq_file *m, void *p) 546void slab_stop(struct seq_file *m, void *p)
540{ 547{
541 mutex_unlock(&slab_mutex); 548 mutex_unlock(&slab_mutex);
542} 549}
@@ -613,8 +620,8 @@ static int s_show(struct seq_file *m, void *p)
613 */ 620 */
614static const struct seq_operations slabinfo_op = { 621static const struct seq_operations slabinfo_op = {
615 .start = s_start, 622 .start = s_start,
616 .next = s_next, 623 .next = slab_next,
617 .stop = s_stop, 624 .stop = slab_stop,
618 .show = s_show, 625 .show = s_show,
619}; 626};
620 627
@@ -633,7 +640,8 @@ static const struct file_operations proc_slabinfo_operations = {
633 640
634static int __init slab_proc_init(void) 641static int __init slab_proc_init(void)
635{ 642{
636 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); 643 proc_create("slabinfo", SLABINFO_RIGHTS, NULL,
644 &proc_slabinfo_operations);
637 return 0; 645 return 0;
638} 646}
639module_init(slab_proc_init); 647module_init(slab_proc_init);
diff --git a/mm/slob.c b/mm/slob.c
index eeed4a05a2ef..91bd3f2dd2f0 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -122,7 +122,7 @@ static inline void clear_slob_page_free(struct page *sp)
122} 122}
123 123
124#define SLOB_UNIT sizeof(slob_t) 124#define SLOB_UNIT sizeof(slob_t)
125#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 125#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT)
126 126
127/* 127/*
128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 128 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
@@ -554,7 +554,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
554 flags, node); 554 flags, node);
555 } 555 }
556 556
557 if (c->ctor) 557 if (b && c->ctor)
558 c->ctor(b); 558 c->ctor(b);
559 559
560 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); 560 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
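
The SLOB_UNITS() change above is cosmetic, since DIV_ROUND_UP(n, d) expands to the same ((n) + (d) - 1) / (d) arithmetic. A few compile-time checks make that concrete; the 8-byte unit is chosen only as an example and is not the real sizeof(slob_t) on every architecture.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define SLOB_UNIT               8UL /* example unit size only */
#define SLOB_UNITS(size)        DIV_ROUND_UP(size, SLOB_UNIT)

_Static_assert(SLOB_UNITS(1) == 1, "a partial unit rounds up to one");
_Static_assert(SLOB_UNITS(8) == 1, "an exact multiple stays exact");
_Static_assert(SLOB_UNITS(9) == 2, "one byte over needs a second unit");

int main(void)
{
        printf("SLOB_UNITS(100) = %lu\n", SLOB_UNITS(100));
        return 0;
}
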
diff --git a/mm/slub.c b/mm/slub.c
index 57707f01bcfb..e3ba1f2cf60c 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -123,6 +123,15 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
123#endif 123#endif
124} 124}
125 125
126static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
127{
128#ifdef CONFIG_SLUB_CPU_PARTIAL
129 return !kmem_cache_debug(s);
130#else
131 return false;
132#endif
133}
134
126/* 135/*
127 * Issues still to be resolved: 136 * Issues still to be resolved:
128 * 137 *
@@ -1573,7 +1582,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1573 put_cpu_partial(s, page, 0); 1582 put_cpu_partial(s, page, 0);
1574 stat(s, CPU_PARTIAL_NODE); 1583 stat(s, CPU_PARTIAL_NODE);
1575 } 1584 }
1576 if (kmem_cache_debug(s) || available > s->cpu_partial / 2) 1585 if (!kmem_cache_has_cpu_partial(s)
1586 || available > s->cpu_partial / 2)
1577 break; 1587 break;
1578 1588
1579 } 1589 }
@@ -1884,6 +1894,7 @@ redo:
1884static void unfreeze_partials(struct kmem_cache *s, 1894static void unfreeze_partials(struct kmem_cache *s,
1885 struct kmem_cache_cpu *c) 1895 struct kmem_cache_cpu *c)
1886{ 1896{
1897#ifdef CONFIG_SLUB_CPU_PARTIAL
1887 struct kmem_cache_node *n = NULL, *n2 = NULL; 1898 struct kmem_cache_node *n = NULL, *n2 = NULL;
1888 struct page *page, *discard_page = NULL; 1899 struct page *page, *discard_page = NULL;
1889 1900
@@ -1938,6 +1949,7 @@ static void unfreeze_partials(struct kmem_cache *s,
1938 discard_slab(s, page); 1949 discard_slab(s, page);
1939 stat(s, FREE_SLAB); 1950 stat(s, FREE_SLAB);
1940 } 1951 }
1952#endif
1941} 1953}
1942 1954
1943/* 1955/*
@@ -1951,6 +1963,7 @@ static void unfreeze_partials(struct kmem_cache *s,
1951 */ 1963 */
1952static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) 1964static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1953{ 1965{
1966#ifdef CONFIG_SLUB_CPU_PARTIAL
1954 struct page *oldpage; 1967 struct page *oldpage;
1955 int pages; 1968 int pages;
1956 int pobjects; 1969 int pobjects;
@@ -1987,6 +2000,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1987 page->next = oldpage; 2000 page->next = oldpage;
1988 2001
1989 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); 2002 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
2003#endif
1990} 2004}
1991 2005
1992static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 2006static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2358,7 +2372,7 @@ redo:
2358 2372
2359 object = c->freelist; 2373 object = c->freelist;
2360 page = c->page; 2374 page = c->page;
2361 if (unlikely(!object || !node_match(page, node))) 2375 if (unlikely(!object || !page || !node_match(page, node)))
2362 object = __slab_alloc(s, gfpflags, node, addr, c); 2376 object = __slab_alloc(s, gfpflags, node, addr, c);
2363 2377
2364 else { 2378 else {
@@ -2495,7 +2509,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2495 new.inuse--; 2509 new.inuse--;
2496 if ((!new.inuse || !prior) && !was_frozen) { 2510 if ((!new.inuse || !prior) && !was_frozen) {
2497 2511
2498 if (!kmem_cache_debug(s) && !prior) 2512 if (kmem_cache_has_cpu_partial(s) && !prior)
2499 2513
2500 /* 2514 /*
2501 * Slab was on no list before and will be partially empty 2515 * Slab was on no list before and will be partially empty
@@ -2550,8 +2564,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2550 * Objects left in the slab. If it was not on the partial list before 2564 * Objects left in the slab. If it was not on the partial list before
2551 * then add it. 2565 * then add it.
2552 */ 2566 */
2553 if (kmem_cache_debug(s) && unlikely(!prior)) { 2567 if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
2554 remove_full(s, page); 2568 if (kmem_cache_debug(s))
2569 remove_full(s, page);
2555 add_partial(n, page, DEACTIVATE_TO_TAIL); 2570 add_partial(n, page, DEACTIVATE_TO_TAIL);
2556 stat(s, FREE_ADD_PARTIAL); 2571 stat(s, FREE_ADD_PARTIAL);
2557 } 2572 }
@@ -3059,7 +3074,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3059 * per node list when we run out of per cpu objects. We only fetch 50% 3074 * per node list when we run out of per cpu objects. We only fetch 50%
3060 * to keep some capacity around for frees. 3075 * to keep some capacity around for frees.
3061 */ 3076 */
3062 if (kmem_cache_debug(s)) 3077 if (!kmem_cache_has_cpu_partial(s))
3063 s->cpu_partial = 0; 3078 s->cpu_partial = 0;
3064 else if (s->size >= PAGE_SIZE) 3079 else if (s->size >= PAGE_SIZE)
3065 s->cpu_partial = 2; 3080 s->cpu_partial = 2;
@@ -3755,7 +3770,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
3755 * Use the cpu notifier to insure that the cpu slabs are flushed when 3770 * Use the cpu notifier to insure that the cpu slabs are flushed when
3756 * necessary. 3771 * necessary.
3757 */ 3772 */
3758static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, 3773static int slab_cpuup_callback(struct notifier_block *nfb,
3759 unsigned long action, void *hcpu) 3774 unsigned long action, void *hcpu)
3760{ 3775{
3761 long cpu = (long)hcpu; 3776 long cpu = (long)hcpu;
@@ -3781,7 +3796,7 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3781 return NOTIFY_OK; 3796 return NOTIFY_OK;
3782} 3797}
3783 3798
3784static struct notifier_block __cpuinitdata slab_notifier = { 3799static struct notifier_block slab_notifier = {
3785 .notifier_call = slab_cpuup_callback 3800 .notifier_call = slab_cpuup_callback
3786}; 3801};
3787 3802
@@ -4456,7 +4471,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4456 err = strict_strtoul(buf, 10, &objects); 4471 err = strict_strtoul(buf, 10, &objects);
4457 if (err) 4472 if (err)
4458 return err; 4473 return err;
4459 if (objects && kmem_cache_debug(s)) 4474 if (objects && !kmem_cache_has_cpu_partial(s))
4460 return -EINVAL; 4475 return -EINVAL;
4461 4476
4462 s->cpu_partial = objects; 4477 s->cpu_partial = objects;
@@ -5269,7 +5284,6 @@ __initcall(slab_sysfs_init);
5269#ifdef CONFIG_SLABINFO 5284#ifdef CONFIG_SLABINFO
5270void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5285void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5271{ 5286{
5272 unsigned long nr_partials = 0;
5273 unsigned long nr_slabs = 0; 5287 unsigned long nr_slabs = 0;
5274 unsigned long nr_objs = 0; 5288 unsigned long nr_objs = 0;
5275 unsigned long nr_free = 0; 5289 unsigned long nr_free = 0;
@@ -5281,9 +5295,8 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5281 if (!n) 5295 if (!n)
5282 continue; 5296 continue;
5283 5297
5284 nr_partials += n->nr_partial; 5298 nr_slabs += node_nr_slabs(n);
5285 nr_slabs += atomic_long_read(&n->nr_slabs); 5299 nr_objs += node_nr_objs(n);
5286 nr_objs += atomic_long_read(&n->total_objects);
5287 nr_free += count_partial(n, count_free); 5300 nr_free += count_partial(n, count_free);
5288 } 5301 }
5289 5302
diff --git a/mm/sparse.c b/mm/sparse.c
index 1c91f0d3f6ab..308d50331bc3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -79,7 +79,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
79{ 79{
80 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
81 struct mem_section *section; 81 struct mem_section *section;
82 int ret = 0;
83 82
84 if (mem_section[root]) 83 if (mem_section[root])
85 return -EEXIST; 84 return -EEXIST;
@@ -90,7 +89,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 89
91 mem_section[root] = section; 90 mem_section[root] = section;
92 91
93 return ret; 92 return 0;
94} 93}
95#else /* !SPARSEMEM_EXTREME */ 94#else /* !SPARSEMEM_EXTREME */
96static inline int sparse_index_init(unsigned long section_nr, int nid) 95static inline int sparse_index_init(unsigned long section_nr, int nid)
@@ -481,6 +480,9 @@ void __init sparse_init(void)
481 struct page **map_map; 480 struct page **map_map;
482#endif 481#endif
483 482
483 /* see include/linux/mmzone.h 'struct mem_section' definition */
484 BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
485
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ 486 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order(); 487 set_pageblock_order();
486 488
@@ -751,6 +753,7 @@ out:
751 return ret; 753 return ret;
752} 754}
753 755
756#ifdef CONFIG_MEMORY_HOTREMOVE
754#ifdef CONFIG_MEMORY_FAILURE 757#ifdef CONFIG_MEMORY_FAILURE
755static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) 758static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
756{ 759{
@@ -772,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
772} 775}
773#endif 776#endif
774 777
775#ifdef CONFIG_MEMORY_HOTREMOVE
776static void free_section_usemap(struct page *memmap, unsigned long *usemap) 778static void free_section_usemap(struct page *memmap, unsigned long *usemap)
777{ 779{
778 struct page *usemap_page; 780 struct page *usemap_page;
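
The BUILD_BUG_ON() added to sparse_init() above is a compile-time guard on sizeof(struct mem_section). The same power-of-two check can be written in plain C11 as below; section_like is just a stand-in with two pointer-sized fields, so its size happens to be a power of two on common architectures.

#include <stdio.h>

/* Stand-in for struct mem_section; the real layout lives in linux/mmzone.h. */
struct section_like {
        unsigned long section_mem_map;
        unsigned long *pageblock_flags;
};

/* A size is a power of two iff exactly one bit is set. */
#define IS_POWER_OF_2(x) ((x) != 0 && ((x) & ((x) - 1)) == 0)

_Static_assert(IS_POWER_OF_2(sizeof(struct section_like)),
               "SECTION_NR_TO_ROOT()-style index math assumes a power-of-two size");

int main(void)
{
        printf("sizeof(struct section_like) = %zu bytes\n", sizeof(struct section_like));
        return 0;
}
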
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71d6841..62b78a6e224f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,10 +34,13 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37#define CREATE_TRACE_POINTS
38#include <trace/events/pagemap.h>
39
37/* How many pages do we try to swap or page in/out together? */ 40/* How many pages do we try to swap or page in/out together? */
38int page_cluster; 41int page_cluster;
39 42
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
43 46
@@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
384 SetPageActive(page); 387 SetPageActive(page);
385 lru += LRU_ACTIVE; 388 lru += LRU_ACTIVE;
386 add_page_to_lru_list(page, lruvec, lru); 389 add_page_to_lru_list(page, lruvec, lru);
390 trace_mm_lru_activate(page, page_to_pfn(page));
387 391
388 __count_vm_event(PGACTIVATE); 392 __count_vm_event(PGACTIVATE);
389 update_page_reclaim_stat(lruvec, file, 1); 393 update_page_reclaim_stat(lruvec, file, 1);
@@ -428,6 +432,33 @@ void activate_page(struct page *page)
428} 432}
429#endif 433#endif
430 434
435static void __lru_cache_activate_page(struct page *page)
436{
437 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
438 int i;
439
440 /*
441 * Search backwards on the optimistic assumption that the page being
442 * activated has just been added to this pagevec. Note that only
443 * the local pagevec is examined as a !PageLRU page could be in the
444 * process of being released, reclaimed, migrated or on a remote
445 * pagevec that is currently being drained. Furthermore, marking
446 * a remote pagevec's page PageActive potentially hits a race where
447 * a page is marked PageActive just after it is added to the inactive
448 * list causing accounting errors and BUG_ON checks to trigger.
449 */
450 for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
451 struct page *pagevec_page = pvec->pages[i];
452
453 if (pagevec_page == page) {
454 SetPageActive(page);
455 break;
456 }
457 }
458
459 put_cpu_var(lru_add_pvec);
460}
461
431/* 462/*
432 * Mark a page as having seen activity. 463 * Mark a page as having seen activity.
433 * 464 *
@@ -438,8 +469,18 @@ void activate_page(struct page *page)
438void mark_page_accessed(struct page *page) 469void mark_page_accessed(struct page *page)
439{ 470{
440 if (!PageActive(page) && !PageUnevictable(page) && 471 if (!PageActive(page) && !PageUnevictable(page) &&
441 PageReferenced(page) && PageLRU(page)) { 472 PageReferenced(page)) {
442 activate_page(page); 473
474 /*
475 * If the page is on the LRU, queue it for activation via
476 * activate_page_pvecs. Otherwise, assume the page is on a
477 * pagevec, mark it active and it'll be moved to the active
478 * LRU on the next drain.
479 */
480 if (PageLRU(page))
481 activate_page(page);
482 else
483 __lru_cache_activate_page(page);
443 ClearPageReferenced(page); 484 ClearPageReferenced(page);
444 } else if (!PageReferenced(page)) { 485 } else if (!PageReferenced(page)) {
445 SetPageReferenced(page); 486 SetPageReferenced(page);
@@ -448,42 +489,32 @@ void mark_page_accessed(struct page *page)
448EXPORT_SYMBOL(mark_page_accessed); 489EXPORT_SYMBOL(mark_page_accessed);
449 490
450/* 491/*
451 * Order of operations is important: flush the pagevec when it's already 492 * Queue the page for addition to the LRU via pagevec. The decision on whether
452 * full, not when adding the last page, to make sure that last page is 493 * to add the page to the [in]active [file|anon] list is deferred until the
453 * not added to the LRU directly when passed to this function. Because 494 * pagevec is drained. This gives a chance for the caller of __lru_cache_add()
 454 * mark_page_accessed() (called after this when writing) only activates 495 * to have the page added to the active list using mark_page_accessed().
455 * pages that are on the LRU, linear writes in subpage chunks would see
456 * every PAGEVEC_SIZE page activated, which is unexpected.
457 */ 496 */
458void __lru_cache_add(struct page *page, enum lru_list lru) 497void __lru_cache_add(struct page *page)
459{ 498{
460 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 499 struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
461 500
462 page_cache_get(page); 501 page_cache_get(page);
463 if (!pagevec_space(pvec)) 502 if (!pagevec_space(pvec))
464 __pagevec_lru_add(pvec, lru); 503 __pagevec_lru_add(pvec);
465 pagevec_add(pvec, page); 504 pagevec_add(pvec, page);
466 put_cpu_var(lru_add_pvecs); 505 put_cpu_var(lru_add_pvec);
467} 506}
468EXPORT_SYMBOL(__lru_cache_add); 507EXPORT_SYMBOL(__lru_cache_add);
469 508
470/** 509/**
471 * lru_cache_add_lru - add a page to a page list 510 * lru_cache_add - add a page to a page list
472 * @page: the page to be added to the LRU. 511 * @page: the page to be added to the LRU.
473 * @lru: the LRU list to which the page is added.
474 */ 512 */
475void lru_cache_add_lru(struct page *page, enum lru_list lru) 513void lru_cache_add(struct page *page)
476{ 514{
477 if (PageActive(page)) { 515 VM_BUG_ON(PageActive(page) && PageUnevictable(page));
478 VM_BUG_ON(PageUnevictable(page)); 516 VM_BUG_ON(PageLRU(page));
479 ClearPageActive(page); 517 __lru_cache_add(page);
480 } else if (PageUnevictable(page)) {
481 VM_BUG_ON(PageActive(page));
482 ClearPageUnevictable(page);
483 }
484
485 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
486 __lru_cache_add(page, lru);
487} 518}
488 519
489/** 520/**
@@ -503,6 +534,7 @@ void add_page_to_unevictable_list(struct page *page)
503 534
504 spin_lock_irq(&zone->lru_lock); 535 spin_lock_irq(&zone->lru_lock);
505 lruvec = mem_cgroup_page_lruvec(page, zone); 536 lruvec = mem_cgroup_page_lruvec(page, zone);
537 ClearPageActive(page);
506 SetPageUnevictable(page); 538 SetPageUnevictable(page);
507 SetPageLRU(page); 539 SetPageLRU(page);
508 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); 540 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
@@ -583,15 +615,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
583 */ 615 */
584void lru_add_drain_cpu(int cpu) 616void lru_add_drain_cpu(int cpu)
585{ 617{
586 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 618 struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
587 struct pagevec *pvec;
588 int lru;
589 619
590 for_each_lru(lru) { 620 if (pagevec_count(pvec))
591 pvec = &pvecs[lru - LRU_BASE]; 621 __pagevec_lru_add(pvec);
592 if (pagevec_count(pvec))
593 __pagevec_lru_add(pvec, lru);
594 }
595 622
596 pvec = &per_cpu(lru_rotate_pvecs, cpu); 623 pvec = &per_cpu(lru_rotate_pvecs, cpu);
597 if (pagevec_count(pvec)) { 624 if (pagevec_count(pvec)) {
@@ -708,6 +735,9 @@ void release_pages(struct page **pages, int nr, int cold)
708 del_page_from_lru_list(page, lruvec, page_off_lru(page)); 735 del_page_from_lru_list(page, lruvec, page_off_lru(page));
709 } 736 }
710 737
738 /* Clear Active bit in case of parallel mark_page_accessed */
739 ClearPageActive(page);
740
711 list_add(&page->lru, &pages_to_free); 741 list_add(&page->lru, &pages_to_free);
712 } 742 }
713 if (zone) 743 if (zone)
@@ -740,8 +770,6 @@ EXPORT_SYMBOL(__pagevec_release);
740void lru_add_page_tail(struct page *page, struct page *page_tail, 770void lru_add_page_tail(struct page *page, struct page *page_tail,
741 struct lruvec *lruvec, struct list_head *list) 771 struct lruvec *lruvec, struct list_head *list)
742{ 772{
743 int uninitialized_var(active);
744 enum lru_list lru;
745 const int file = 0; 773 const int file = 0;
746 774
747 VM_BUG_ON(!PageHead(page)); 775 VM_BUG_ON(!PageHead(page));
@@ -753,20 +781,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
753 if (!list) 781 if (!list)
754 SetPageLRU(page_tail); 782 SetPageLRU(page_tail);
755 783
756 if (page_evictable(page_tail)) {
757 if (PageActive(page)) {
758 SetPageActive(page_tail);
759 active = 1;
760 lru = LRU_ACTIVE_ANON;
761 } else {
762 active = 0;
763 lru = LRU_INACTIVE_ANON;
764 }
765 } else {
766 SetPageUnevictable(page_tail);
767 lru = LRU_UNEVICTABLE;
768 }
769
770 if (likely(PageLRU(page))) 784 if (likely(PageLRU(page)))
771 list_add_tail(&page_tail->lru, &page->lru); 785 list_add_tail(&page_tail->lru, &page->lru);
772 else if (list) { 786 else if (list) {
@@ -782,43 +796,38 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
782 * Use the standard add function to put page_tail on the list, 796 * Use the standard add function to put page_tail on the list,
783 * but then correct its position so they all end up in order. 797 * but then correct its position so they all end up in order.
784 */ 798 */
785 add_page_to_lru_list(page_tail, lruvec, lru); 799 add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
786 list_head = page_tail->lru.prev; 800 list_head = page_tail->lru.prev;
787 list_move_tail(&page_tail->lru, list_head); 801 list_move_tail(&page_tail->lru, list_head);
788 } 802 }
789 803
790 if (!PageUnevictable(page)) 804 if (!PageUnevictable(page))
791 update_page_reclaim_stat(lruvec, file, active); 805 update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
792} 806}
793#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 807#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
794 808
795static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, 809static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
796 void *arg) 810 void *arg)
797{ 811{
798 enum lru_list lru = (enum lru_list)arg; 812 int file = page_is_file_cache(page);
799 int file = is_file_lru(lru); 813 int active = PageActive(page);
800 int active = is_active_lru(lru); 814 enum lru_list lru = page_lru(page);
801 815
802 VM_BUG_ON(PageActive(page));
803 VM_BUG_ON(PageUnevictable(page));
804 VM_BUG_ON(PageLRU(page)); 816 VM_BUG_ON(PageLRU(page));
805 817
806 SetPageLRU(page); 818 SetPageLRU(page);
807 if (active)
808 SetPageActive(page);
809 add_page_to_lru_list(page, lruvec, lru); 819 add_page_to_lru_list(page, lruvec, lru);
810 update_page_reclaim_stat(lruvec, file, active); 820 update_page_reclaim_stat(lruvec, file, active);
821 trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page));
811} 822}
812 823
813/* 824/*
814 * Add the passed pages to the LRU, then drop the caller's refcount 825 * Add the passed pages to the LRU, then drop the caller's refcount
815 * on them. Reinitialises the caller's pagevec. 826 * on them. Reinitialises the caller's pagevec.
816 */ 827 */
817void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 828void __pagevec_lru_add(struct pagevec *pvec)
818{ 829{
819 VM_BUG_ON(is_unevictable_lru(lru)); 830 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
820
821 pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru);
822} 831}
823EXPORT_SYMBOL(__pagevec_lru_add); 832EXPORT_SYMBOL(__pagevec_lru_add);
824 833
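
The mm/swap.c hunks above collapse the per-LRU lru_add_pvecs array into a single per-CPU lru_add_pvec and defer the choice of target LRU list until the pagevec is drained, so mark_page_accessed() now also has to handle pages that are still sitting in that pagevec. As a rough illustration of the batching and deferred activation, here is a userspace toy model (not kernel code; the PAGEVEC_SIZE value, counters and flags are simplified stand-ins):

#include <stdio.h>
#include <stdbool.h>

#define PAGEVEC_SIZE 14                 /* same batch size as the kernel pagevec */

struct tpage { int id; bool on_lru; bool active; };

static struct tpage *pvec[PAGEVEC_SIZE];        /* toy per-CPU lru_add_pvec */
static int pvec_count;
static int nr_inactive, nr_active;              /* toy LRU list sizes */

/* toy __pagevec_lru_add(): place each batched page on the list it asks for */
static void drain_pvec(void)
{
    for (int i = 0; i < pvec_count; i++) {
        struct tpage *p = pvec[i];

        p->on_lru = true;
        if (p->active)
            nr_active++;        /* activated while it was still in the pagevec */
        else
            nr_inactive++;
    }
    pvec_count = 0;
}

/* toy __lru_cache_add(): batch the page, flush only when the batch is full */
static void lru_cache_add(struct tpage *p)
{
    if (pvec_count == PAGEVEC_SIZE)
        drain_pvec();
    pvec[pvec_count++] = p;
}

/* toy mark_page_accessed(): activate directly when the page is already on the
 * LRU (the activate_page() path), otherwise just flag it so the next drain
 * puts it straight on the active list (the __lru_cache_activate_page() case) */
static void mark_accessed(struct tpage *p)
{
    if (p->active)
        return;
    p->active = true;
    if (p->on_lru) {
        nr_inactive--;
        nr_active++;
    }
}

int main(void)
{
    struct tpage pages[20] = { { 0 } };

    for (int i = 0; i < 20; i++) {
        pages[i].id = i;
        lru_cache_add(&pages[i]);
        if (i % 2 == 0)
            mark_accessed(&pages[i]);   /* touched again before any drain */
    }
    drain_pvec();
    mark_accessed(&pages[1]);           /* touched after reaching the LRU */
    printf("active=%d inactive=%d\n", nr_active, nr_inactive);
    return 0;
}

The only point of the model is the ordering: a page touched while still in the batch ends up active at drain time, which is what the new __lru_cache_activate_page() path provides in the real code.
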
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 746af55b8455..6cf2e60983b7 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 212 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 213 goto checks;
214 } 214 }
215 if (si->flags & SWP_DISCARDABLE) { 215 if (si->flags & SWP_PAGE_DISCARD) {
216 /* 216 /*
217 * Start range check on racing allocations, in case 217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on 218 * they overlap the cluster we eventually decide on
@@ -322,7 +322,7 @@ checks:
322 322
323 if (si->lowest_alloc) { 323 if (si->lowest_alloc) {
324 /* 324 /*
325 * Only set when SWP_DISCARDABLE, and there's a scan 325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed. 326 * for a free cluster in progress or just completed.
327 */ 327 */
328 if (found_free_cluster) { 328 if (found_free_cluster) {
@@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, int free)
866} 866}
867#endif /* CONFIG_HIBERNATION */ 867#endif /* CONFIG_HIBERNATION */
868 868
869static inline int maybe_same_pte(pte_t pte, pte_t swp_pte)
870{
871#ifdef CONFIG_MEM_SOFT_DIRTY
872 /*
873 * When pte keeps soft dirty bit the pte generated
 874 * from swap entry does not have it, still it's the same
 875 * pte from a logical point of view.
876 */
877 pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte);
878 return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty);
879#else
880 return pte_same(pte, swp_pte);
881#endif
882}
883
869/* 884/*
870 * No need to decide whether this PTE shares the swap entry with others, 885 * No need to decide whether this PTE shares the swap entry with others,
871 * just let do_wp_page work it out if a write is requested later - to 886 * just let do_wp_page work it out if a write is requested later - to
@@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
892 } 907 }
893 908
894 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 909 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
895 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 910 if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
896 mem_cgroup_cancel_charge_swapin(memcg); 911 mem_cgroup_cancel_charge_swapin(memcg);
897 ret = 0; 912 ret = 0;
898 goto out; 913 goto out;
@@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
947 * swapoff spends a _lot_ of time in this loop! 962 * swapoff spends a _lot_ of time in this loop!
948 * Test inline before going to call unuse_pte. 963 * Test inline before going to call unuse_pte.
949 */ 964 */
950 if (unlikely(pte_same(*pte, swp_pte))) { 965 if (unlikely(maybe_same_pte(*pte, swp_pte))) {
951 pte_unmap(pte); 966 pte_unmap(pte);
952 ret = unuse_pte(vma, pmd, addr, entry, page); 967 ret = unuse_pte(vma, pmd, addr, entry, page);
953 if (ret) 968 if (ret)
@@ -2016,6 +2031,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2016 return nr_extents; 2031 return nr_extents;
2017} 2032}
2018 2033
2034/*
2035 * Helper to sys_swapon determining if a given swap
2036 * backing device queue supports DISCARD operations.
2037 */
2038static bool swap_discardable(struct swap_info_struct *si)
2039{
2040 struct request_queue *q = bdev_get_queue(si->bdev);
2041
2042 if (!q || !blk_queue_discard(q))
2043 return false;
2044
2045 return true;
2046}
2047
2019SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) 2048SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2020{ 2049{
2021 struct swap_info_struct *p; 2050 struct swap_info_struct *p;
@@ -2123,8 +2152,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2123 p->flags |= SWP_SOLIDSTATE; 2152 p->flags |= SWP_SOLIDSTATE;
2124 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2153 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2125 } 2154 }
2126 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) 2155
2127 p->flags |= SWP_DISCARDABLE; 2156 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2157 /*
2158 * When discard is enabled for swap with no particular
2159 * policy flagged, we set all swap discard flags here in
2160 * order to sustain backward compatibility with older
2161 * swapon(8) releases.
2162 */
2163 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2164 SWP_PAGE_DISCARD);
2165
2166 /*
2167 * By flagging sys_swapon, a sysadmin can tell us to
2168 * either do single-time area discards only, or to just
2169 * perform discards for released swap page-clusters.
2170 * Now it's time to adjust the p->flags accordingly.
2171 */
2172 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2173 p->flags &= ~SWP_PAGE_DISCARD;
2174 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2175 p->flags &= ~SWP_AREA_DISCARD;
2176
2177 /* issue a swapon-time discard if it's still required */
2178 if (p->flags & SWP_AREA_DISCARD) {
2179 int err = discard_swap(p);
2180 if (unlikely(err))
2181 printk(KERN_ERR
2182 "swapon: discard_swap(%p): %d\n",
2183 p, err);
2184 }
2185 }
2128 } 2186 }
2129 2187
2130 mutex_lock(&swapon_mutex); 2188 mutex_lock(&swapon_mutex);
@@ -2135,11 +2193,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2135 enable_swap_info(p, prio, swap_map, frontswap_map); 2193 enable_swap_info(p, prio, swap_map, frontswap_map);
2136 2194
2137 printk(KERN_INFO "Adding %uk swap on %s. " 2195 printk(KERN_INFO "Adding %uk swap on %s. "
2138 "Priority:%d extents:%d across:%lluk %s%s%s\n", 2196 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2139 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2197 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2140 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2198 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2141 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2199 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2142 (p->flags & SWP_DISCARDABLE) ? "D" : "", 2200 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2201 (p->flags & SWP_AREA_DISCARD) ? "s" : "",
2202 (p->flags & SWP_PAGE_DISCARD) ? "c" : "",
2143 (frontswap_map) ? "FS" : ""); 2203 (frontswap_map) ? "FS" : "");
2144 2204
2145 mutex_unlock(&swapon_mutex); 2205 mutex_unlock(&swapon_mutex);
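
The swapon() hunk above derives the discard policy from the user-supplied flags: a bare SWAP_FLAG_DISCARD enables both area and per-page discards for backward compatibility, and the _ONCE/_PAGES modifiers narrow that down. A minimal standalone sketch of just that flag arithmetic (the numeric flag values below are illustrative assumptions; the authoritative definitions live in include/linux/swap.h):

#include <stdio.h>

/* illustrative values only */
#define SWAP_FLAG_DISCARD       0x10000
#define SWAP_FLAG_DISCARD_ONCE  0x20000
#define SWAP_FLAG_DISCARD_PAGES 0x40000

#define SWP_DISCARDABLE  (1 << 0)
#define SWP_AREA_DISCARD (1 << 1)
#define SWP_PAGE_DISCARD (1 << 2)

/* mirrors the policy in the swapon() hunk: a bare DISCARD enables everything
 * for backward compatibility, the modifiers then mask one style back off */
static unsigned int discard_policy(int swap_flags)
{
    unsigned int flags;

    if (!(swap_flags & SWAP_FLAG_DISCARD))
        return 0;

    flags = SWP_DISCARDABLE | SWP_AREA_DISCARD | SWP_PAGE_DISCARD;
    if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
        flags &= ~SWP_PAGE_DISCARD;
    else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
        flags &= ~SWP_AREA_DISCARD;
    return flags;
}

static void show(const char *name, int swap_flags)
{
    unsigned int f = discard_policy(swap_flags);

    printf("%-24s area=%d page=%d\n", name,
           !!(f & SWP_AREA_DISCARD), !!(f & SWP_PAGE_DISCARD));
}

int main(void)
{
    show("DISCARD", SWAP_FLAG_DISCARD);
    show("DISCARD|ONCE", SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE);
    show("DISCARD|PAGES", SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_PAGES);
    show("(no discard)", 0);
    return 0;
}

This lines up with the extended swapon log message at the end of the hunk, where "s" reports area discards and "c" reports per-cluster (page) discards.
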
diff --git a/mm/truncate.c b/mm/truncate.c
index c75b736e54b7..e2e8a8a7eb9d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -26,7 +26,8 @@
26/** 26/**
27 * do_invalidatepage - invalidate part or all of a page 27 * do_invalidatepage - invalidate part or all of a page
28 * @page: the page which is affected 28 * @page: the page which is affected
29 * @offset: the index of the truncation point 29 * @offset: start of the range to invalidate
30 * @length: length of the range to invalidate
30 * 31 *
31 * do_invalidatepage() is called when all or part of the page has become 32 * do_invalidatepage() is called when all or part of the page has become
32 * invalidated by a truncate operation. 33 * invalidated by a truncate operation.
@@ -37,24 +38,18 @@
37 * point. Because the caller is about to free (and possibly reuse) those 38 * point. Because the caller is about to free (and possibly reuse) those
38 * blocks on-disk. 39 * blocks on-disk.
39 */ 40 */
40void do_invalidatepage(struct page *page, unsigned long offset) 41void do_invalidatepage(struct page *page, unsigned int offset,
42 unsigned int length)
41{ 43{
42 void (*invalidatepage)(struct page *, unsigned long); 44 void (*invalidatepage)(struct page *, unsigned int, unsigned int);
45
43 invalidatepage = page->mapping->a_ops->invalidatepage; 46 invalidatepage = page->mapping->a_ops->invalidatepage;
44#ifdef CONFIG_BLOCK 47#ifdef CONFIG_BLOCK
45 if (!invalidatepage) 48 if (!invalidatepage)
46 invalidatepage = block_invalidatepage; 49 invalidatepage = block_invalidatepage;
47#endif 50#endif
48 if (invalidatepage) 51 if (invalidatepage)
49 (*invalidatepage)(page, offset); 52 (*invalidatepage)(page, offset, length);
50}
51
52static inline void truncate_partial_page(struct page *page, unsigned partial)
53{
54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_invalidate_page(page->mapping, page);
56 if (page_has_private(page))
57 do_invalidatepage(page, partial);
58} 53}
59 54
60/* 55/*
@@ -103,7 +98,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
103 return -EIO; 98 return -EIO;
104 99
105 if (page_has_private(page)) 100 if (page_has_private(page))
106 do_invalidatepage(page, 0); 101 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
107 102
108 cancel_dirty_page(page, PAGE_CACHE_SIZE); 103 cancel_dirty_page(page, PAGE_CACHE_SIZE);
109 104
@@ -185,11 +180,11 @@ int invalidate_inode_page(struct page *page)
185 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets 180 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
186 * @mapping: mapping to truncate 181 * @mapping: mapping to truncate
187 * @lstart: offset from which to truncate 182 * @lstart: offset from which to truncate
188 * @lend: offset to which to truncate 183 * @lend: offset to which to truncate (inclusive)
189 * 184 *
190 * Truncate the page cache, removing the pages that are between 185 * Truncate the page cache, removing the pages that are between
191 * specified offsets (and zeroing out partial page 186 * specified offsets (and zeroing out partial pages
192 * (if lstart is not page aligned)). 187 * if lstart or lend + 1 is not page aligned).
193 * 188 *
194 * Truncate takes two passes - the first pass is nonblocking. It will not 189 * Truncate takes two passes - the first pass is nonblocking. It will not
195 * block on page locks and it will not block on writeback. The second pass 190 * block on page locks and it will not block on writeback. The second pass
@@ -200,35 +195,58 @@ int invalidate_inode_page(struct page *page)
200 * We pass down the cache-hot hint to the page freeing code. Even if the 195 * We pass down the cache-hot hint to the page freeing code. Even if the
201 * mapping is large, it is probably the case that the final pages are the most 196 * mapping is large, it is probably the case that the final pages are the most
202 * recently touched, and freeing happens in ascending file offset order. 197 * recently touched, and freeing happens in ascending file offset order.
198 *
 199 * Note that since ->invalidatepage() accepts a range to invalidate,
 200 * truncate_inode_pages_range is able to handle cases where lend + 1 is
 201 * not properly page aligned.
203 */ 202 */
204void truncate_inode_pages_range(struct address_space *mapping, 203void truncate_inode_pages_range(struct address_space *mapping,
205 loff_t lstart, loff_t lend) 204 loff_t lstart, loff_t lend)
206{ 205{
207 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 206 pgoff_t start; /* inclusive */
208 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 207 pgoff_t end; /* exclusive */
209 struct pagevec pvec; 208 unsigned int partial_start; /* inclusive */
210 pgoff_t index; 209 unsigned int partial_end; /* exclusive */
211 pgoff_t end; 210 struct pagevec pvec;
212 int i; 211 pgoff_t index;
212 int i;
213 213
214 cleancache_invalidate_inode(mapping); 214 cleancache_invalidate_inode(mapping);
215 if (mapping->nrpages == 0) 215 if (mapping->nrpages == 0)
216 return; 216 return;
217 217
218 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); 218 /* Offsets within partial pages */
219 end = (lend >> PAGE_CACHE_SHIFT); 219 partial_start = lstart & (PAGE_CACHE_SIZE - 1);
220 partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
221
222 /*
223 * 'start' and 'end' always covers the range of pages to be fully
224 * truncated. Partial pages are covered with 'partial_start' at the
225 * start of the range and 'partial_end' at the end of the range.
226 * Note that 'end' is exclusive while 'lend' is inclusive.
227 */
228 start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
229 if (lend == -1)
230 /*
231 * lend == -1 indicates end-of-file so we have to set 'end'
232 * to the highest possible pgoff_t and since the type is
233 * unsigned we're using -1.
234 */
235 end = -1;
236 else
237 end = (lend + 1) >> PAGE_CACHE_SHIFT;
220 238
221 pagevec_init(&pvec, 0); 239 pagevec_init(&pvec, 0);
222 index = start; 240 index = start;
223 while (index <= end && pagevec_lookup(&pvec, mapping, index, 241 while (index < end && pagevec_lookup(&pvec, mapping, index,
224 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 242 min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
225 mem_cgroup_uncharge_start(); 243 mem_cgroup_uncharge_start();
226 for (i = 0; i < pagevec_count(&pvec); i++) { 244 for (i = 0; i < pagevec_count(&pvec); i++) {
227 struct page *page = pvec.pages[i]; 245 struct page *page = pvec.pages[i];
228 246
229 /* We rely upon deletion not changing page->index */ 247 /* We rely upon deletion not changing page->index */
230 index = page->index; 248 index = page->index;
231 if (index > end) 249 if (index >= end)
232 break; 250 break;
233 251
234 if (!trylock_page(page)) 252 if (!trylock_page(page))
@@ -247,27 +265,56 @@ void truncate_inode_pages_range(struct address_space *mapping,
247 index++; 265 index++;
248 } 266 }
249 267
250 if (partial) { 268 if (partial_start) {
251 struct page *page = find_lock_page(mapping, start - 1); 269 struct page *page = find_lock_page(mapping, start - 1);
252 if (page) { 270 if (page) {
271 unsigned int top = PAGE_CACHE_SIZE;
272 if (start > end) {
273 /* Truncation within a single page */
274 top = partial_end;
275 partial_end = 0;
276 }
253 wait_on_page_writeback(page); 277 wait_on_page_writeback(page);
254 truncate_partial_page(page, partial); 278 zero_user_segment(page, partial_start, top);
279 cleancache_invalidate_page(mapping, page);
280 if (page_has_private(page))
281 do_invalidatepage(page, partial_start,
282 top - partial_start);
255 unlock_page(page); 283 unlock_page(page);
256 page_cache_release(page); 284 page_cache_release(page);
257 } 285 }
258 } 286 }
287 if (partial_end) {
288 struct page *page = find_lock_page(mapping, end);
289 if (page) {
290 wait_on_page_writeback(page);
291 zero_user_segment(page, 0, partial_end);
292 cleancache_invalidate_page(mapping, page);
293 if (page_has_private(page))
294 do_invalidatepage(page, 0,
295 partial_end);
296 unlock_page(page);
297 page_cache_release(page);
298 }
299 }
300 /*
301 * If the truncation happened within a single page no pages
302 * will be released, just zeroed, so we can bail out now.
303 */
304 if (start >= end)
305 return;
259 306
260 index = start; 307 index = start;
261 for ( ; ; ) { 308 for ( ; ; ) {
262 cond_resched(); 309 cond_resched();
263 if (!pagevec_lookup(&pvec, mapping, index, 310 if (!pagevec_lookup(&pvec, mapping, index,
264 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 311 min(end - index, (pgoff_t)PAGEVEC_SIZE))) {
265 if (index == start) 312 if (index == start)
266 break; 313 break;
267 index = start; 314 index = start;
268 continue; 315 continue;
269 } 316 }
270 if (index == start && pvec.pages[0]->index > end) { 317 if (index == start && pvec.pages[0]->index >= end) {
271 pagevec_release(&pvec); 318 pagevec_release(&pvec);
272 break; 319 break;
273 } 320 }
@@ -277,7 +324,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
277 324
278 /* We rely upon deletion not changing page->index */ 325 /* We rely upon deletion not changing page->index */
279 index = page->index; 326 index = page->index;
280 if (index > end) 327 if (index >= end)
281 break; 328 break;
282 329
283 lock_page(page); 330 lock_page(page);
@@ -598,10 +645,8 @@ void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
598 * This rounding is currently just for example: unmap_mapping_range 645 * This rounding is currently just for example: unmap_mapping_range
599 * expands its hole outwards, whereas we want it to contract the hole 646 * expands its hole outwards, whereas we want it to contract the hole
600 * inwards. However, existing callers of truncate_pagecache_range are 647 * inwards. However, existing callers of truncate_pagecache_range are
601 * doing their own page rounding first; and truncate_inode_pages_range 648 * doing their own page rounding first. Note that unmap_mapping_range
602 * currently BUGs if lend is not pagealigned-1 (it handles partial 649 * allows holelen 0 for all, and we allow lend -1 for end of file.
603 * page at start of hole, but not partial page at end of hole). Note
604 * unmap_mapping_range allows holelen 0 for all, and we allow lend -1.
605 */ 650 */
606 651
607 /* 652 /*
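
The truncate_inode_pages_range() rework above replaces the old single 'partial' offset with four values: a whole-page range [start, end) plus partial_start/partial_end byte offsets inside the edge pages, with end exclusive and lend == -1 meaning end-of-file. A small standalone C sketch of only that index arithmetic (4K page size assumed; the values are easy to check by hand):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

/* mirrors the arithmetic in truncate_inode_pages_range(): 'start'..'end' is
 * the range of whole pages to drop (end exclusive), while partial_start and
 * partial_end are byte offsets inside the partially truncated edge pages */
static void split_range(long long lstart, long long lend)
{
    unsigned long long start, end;
    unsigned int partial_start, partial_end;

    partial_start = lstart & (PAGE_CACHE_SIZE - 1);
    partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);

    start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
    if (lend == -1)
        end = -1ULL;    /* "to end of file": highest possible page index */
    else
        end = (lend + 1) >> PAGE_CACHE_SHIFT;

    printf("lstart=%lld lend=%lld -> pages [%llu, %llu), "
           "partial_start=%u partial_end=%u\n",
           lstart, lend, start, end, partial_start, partial_end);
}

int main(void)
{
    split_range(0, -1);           /* whole file */
    split_range(100, 8191);       /* partial first page, aligned end */
    split_range(4096, 10239);     /* aligned start, partial last page */
    split_range(100, 200);        /* hole within a single page */
    return 0;
}

The last call shows the single-page case (start >= end), which the new code handles by zeroing inside one page and bailing out early instead of hitting the old BUG_ON on an unaligned lend.
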
diff --git a/mm/util.c b/mm/util.c
index ab1424dbe2e6..7441c41d00f6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
295{ 295{
296 mm->mmap_base = TASK_UNMAPPED_BASE; 296 mm->mmap_base = TASK_UNMAPPED_BASE;
297 mm->get_unmapped_area = arch_get_unmapped_area; 297 mm->get_unmapped_area = arch_get_unmapped_area;
298 mm->unmap_area = arch_unmap_area;
299} 298}
300#endif 299#endif
301 300
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d365724feb05..13a54953a273 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
292 va = rb_entry(n, struct vmap_area, rb_node); 292 va = rb_entry(n, struct vmap_area, rb_node);
293 if (addr < va->va_start) 293 if (addr < va->va_start)
294 n = n->rb_left; 294 n = n->rb_left;
295 else if (addr > va->va_start) 295 else if (addr >= va->va_end)
296 n = n->rb_right; 296 n = n->rb_right;
297 else 297 else
298 return va; 298 return va;
@@ -388,12 +388,12 @@ nocache:
388 addr = ALIGN(first->va_end, align); 388 addr = ALIGN(first->va_end, align);
389 if (addr < vstart) 389 if (addr < vstart)
390 goto nocache; 390 goto nocache;
391 if (addr + size - 1 < addr) 391 if (addr + size < addr)
392 goto overflow; 392 goto overflow;
393 393
394 } else { 394 } else {
395 addr = ALIGN(vstart, align); 395 addr = ALIGN(vstart, align);
396 if (addr + size - 1 < addr) 396 if (addr + size < addr)
397 goto overflow; 397 goto overflow;
398 398
399 n = vmap_area_root.rb_node; 399 n = vmap_area_root.rb_node;
@@ -420,7 +420,7 @@ nocache:
420 if (addr + cached_hole_size < first->va_start) 420 if (addr + cached_hole_size < first->va_start)
421 cached_hole_size = first->va_start - addr; 421 cached_hole_size = first->va_start - addr;
422 addr = ALIGN(first->va_end, align); 422 addr = ALIGN(first->va_end, align);
423 if (addr + size - 1 < addr) 423 if (addr + size < addr)
424 goto overflow; 424 goto overflow;
425 425
426 if (list_is_last(&first->list, &vmap_area_list)) 426 if (list_is_last(&first->list, &vmap_area_list))
@@ -754,7 +754,6 @@ struct vmap_block {
754 struct vmap_area *va; 754 struct vmap_area *va;
755 struct vmap_block_queue *vbq; 755 struct vmap_block_queue *vbq;
756 unsigned long free, dirty; 756 unsigned long free, dirty;
757 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
758 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 757 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
759 struct list_head free_list; 758 struct list_head free_list;
760 struct rcu_head rcu_head; 759 struct rcu_head rcu_head;
@@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
820 vb->va = va; 819 vb->va = va;
821 vb->free = VMAP_BBMAP_BITS; 820 vb->free = VMAP_BBMAP_BITS;
822 vb->dirty = 0; 821 vb->dirty = 0;
823 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
824 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 822 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
825 INIT_LIST_HEAD(&vb->free_list); 823 INIT_LIST_HEAD(&vb->free_list);
826 824
@@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu)
873 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 871 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
874 vb->free = 0; /* prevent further allocs after releasing lock */ 872 vb->free = 0; /* prevent further allocs after releasing lock */
875 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 873 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
876 bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
877 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 874 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
878 spin_lock(&vbq->lock); 875 spin_lock(&vbq->lock);
879 list_del_rcu(&vb->free_list); 876 list_del_rcu(&vb->free_list);
@@ -891,11 +888,6 @@ static void purge_fragmented_blocks(int cpu)
891 } 888 }
892} 889}
893 890
894static void purge_fragmented_blocks_thiscpu(void)
895{
896 purge_fragmented_blocks(smp_processor_id());
897}
898
899static void purge_fragmented_blocks_allcpus(void) 891static void purge_fragmented_blocks_allcpus(void)
900{ 892{
901 int cpu; 893 int cpu;
@@ -910,7 +902,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
910 struct vmap_block *vb; 902 struct vmap_block *vb;
911 unsigned long addr = 0; 903 unsigned long addr = 0;
912 unsigned int order; 904 unsigned int order;
913 int purge = 0;
914 905
915 BUG_ON(size & ~PAGE_MASK); 906 BUG_ON(size & ~PAGE_MASK);
916 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 907 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -934,17 +925,7 @@ again:
934 if (vb->free < 1UL << order) 925 if (vb->free < 1UL << order)
935 goto next; 926 goto next;
936 927
937 i = bitmap_find_free_region(vb->alloc_map, 928 i = VMAP_BBMAP_BITS - vb->free;
938 VMAP_BBMAP_BITS, order);
939
940 if (i < 0) {
941 if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
942 /* fragmented and no outstanding allocations */
943 BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
944 purge = 1;
945 }
946 goto next;
947 }
948 addr = vb->va->va_start + (i << PAGE_SHIFT); 929 addr = vb->va->va_start + (i << PAGE_SHIFT);
949 BUG_ON(addr_to_vb_idx(addr) != 930 BUG_ON(addr_to_vb_idx(addr) !=
950 addr_to_vb_idx(vb->va->va_start)); 931 addr_to_vb_idx(vb->va->va_start));
@@ -960,9 +941,6 @@ next:
960 spin_unlock(&vb->lock); 941 spin_unlock(&vb->lock);
961 } 942 }
962 943
963 if (purge)
964 purge_fragmented_blocks_thiscpu();
965
966 put_cpu_var(vmap_block_queue); 944 put_cpu_var(vmap_block_queue);
967 rcu_read_unlock(); 945 rcu_read_unlock();
968 946
@@ -1311,22 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1311 spin_unlock(&vmap_area_lock); 1289 spin_unlock(&vmap_area_lock);
1312} 1290}
1313 1291
1314static void clear_vm_unlist(struct vm_struct *vm) 1292static void clear_vm_uninitialized_flag(struct vm_struct *vm)
1315{ 1293{
1316 /* 1294 /*
1317 * Before removing VM_UNLIST, 1295 * Before removing VM_UNINITIALIZED,
1318 * we should make sure that vm has proper values. 1296 * we should make sure that vm has proper values.
1319 * Pair with smp_rmb() in show_numa_info(). 1297 * Pair with smp_rmb() in show_numa_info().
1320 */ 1298 */
1321 smp_wmb(); 1299 smp_wmb();
1322 vm->flags &= ~VM_UNLIST; 1300 vm->flags &= ~VM_UNINITIALIZED;
1323}
1324
1325static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1326 unsigned long flags, const void *caller)
1327{
1328 setup_vmalloc_vm(vm, va, flags, caller);
1329 clear_vm_unlist(vm);
1330} 1301}
1331 1302
1332static struct vm_struct *__get_vm_area_node(unsigned long size, 1303static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1337,16 +1308,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1337 struct vm_struct *area; 1308 struct vm_struct *area;
1338 1309
1339 BUG_ON(in_interrupt()); 1310 BUG_ON(in_interrupt());
1340 if (flags & VM_IOREMAP) { 1311 if (flags & VM_IOREMAP)
1341 int bit = fls(size); 1312 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER);
1342
1343 if (bit > IOREMAP_MAX_ORDER)
1344 bit = IOREMAP_MAX_ORDER;
1345 else if (bit < PAGE_SHIFT)
1346 bit = PAGE_SHIFT;
1347
1348 align = 1ul << bit;
1349 }
1350 1313
1351 size = PAGE_ALIGN(size); 1314 size = PAGE_ALIGN(size);
1352 if (unlikely(!size)) 1315 if (unlikely(!size))
@@ -1367,16 +1330,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1367 return NULL; 1330 return NULL;
1368 } 1331 }
1369 1332
1370 /* 1333 setup_vmalloc_vm(area, va, flags, caller);
1371 * When this function is called from __vmalloc_node_range,
1372 * we add VM_UNLIST flag to avoid accessing uninitialized
1373 * members of vm_struct such as pages and nr_pages fields.
1374 * They will be set later.
1375 */
1376 if (flags & VM_UNLIST)
1377 setup_vmalloc_vm(area, va, flags, caller);
1378 else
1379 insert_vmalloc_vm(area, va, flags, caller);
1380 1334
1381 return area; 1335 return area;
1382} 1336}
@@ -1476,10 +1430,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
1476 if (!addr) 1430 if (!addr)
1477 return; 1431 return;
1478 1432
1479 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1433 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
1480 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1434 addr))
1481 return; 1435 return;
1482 }
1483 1436
1484 area = remove_vm_area(addr); 1437 area = remove_vm_area(addr);
1485 if (unlikely(!area)) { 1438 if (unlikely(!area)) {
@@ -1524,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
1524 * conventions for vfree() arch-dependent would be a really bad idea) 1477 * conventions for vfree() arch-dependent would be a really bad idea)
1525 * 1478 *
1526 * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) 1479 * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
1527 *
1528 */ 1480 */
1529void vfree(const void *addr) 1481void vfree(const void *addr)
1530{ 1482{
@@ -1536,8 +1488,8 @@ void vfree(const void *addr)
1536 return; 1488 return;
1537 if (unlikely(in_interrupt())) { 1489 if (unlikely(in_interrupt())) {
1538 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); 1490 struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
1539 llist_add((struct llist_node *)addr, &p->list); 1491 if (llist_add((struct llist_node *)addr, &p->list))
1540 schedule_work(&p->wq); 1492 schedule_work(&p->wq);
1541 } else 1493 } else
1542 __vunmap(addr, 1); 1494 __vunmap(addr, 1);
1543} 1495}
@@ -1682,21 +1634,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1682 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1634 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1683 goto fail; 1635 goto fail;
1684 1636
1685 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, 1637 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
1686 start, end, node, gfp_mask, caller); 1638 start, end, node, gfp_mask, caller);
1687 if (!area) 1639 if (!area)
1688 goto fail; 1640 goto fail;
1689 1641
1690 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1642 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1691 if (!addr) 1643 if (!addr)
1692 return NULL; 1644 goto fail;
1693 1645
1694 /* 1646 /*
1695 * In this function, newly allocated vm_struct has VM_UNLIST flag. 1647 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
1696 * It means that vm_struct is not fully initialized. 1648 * flag. It means that vm_struct is not fully initialized.
1697 * Now, it is fully initialized, so remove this flag here. 1649 * Now, it is fully initialized, so remove this flag here.
1698 */ 1650 */
1699 clear_vm_unlist(area); 1651 clear_vm_uninitialized_flag(area);
1700 1652
1701 /* 1653 /*
1702 * A ref_count = 3 is needed because the vm_struct and vmap_area 1654 * A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2148,42 +2100,43 @@ finished:
2148} 2100}
2149 2101
2150/** 2102/**
2151 * remap_vmalloc_range - map vmalloc pages to userspace 2103 * remap_vmalloc_range_partial - map vmalloc pages to userspace
2152 * @vma: vma to cover (map full range of vma) 2104 * @vma: vma to cover
2153 * @addr: vmalloc memory 2105 * @uaddr: target user address to start at
2154 * @pgoff: number of pages into addr before first page to map 2106 * @kaddr: virtual address of vmalloc kernel memory
2107 * @size: size of map area
2155 * 2108 *
2156 * Returns: 0 for success, -Exxx on failure 2109 * Returns: 0 for success, -Exxx on failure
2157 * 2110 *
2158 * This function checks that addr is a valid vmalloc'ed area, and 2111 * This function checks that @kaddr is a valid vmalloc'ed area,
2159 * that it is big enough to cover the vma. Will return failure if 2112 * and that it is big enough to cover the range starting at
2160 * that criteria isn't met. 2113 * @uaddr in @vma. Will return failure if that criteria isn't
2114 * met.
2161 * 2115 *
2162 * Similar to remap_pfn_range() (see mm/memory.c) 2116 * Similar to remap_pfn_range() (see mm/memory.c)
2163 */ 2117 */
2164int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, 2118int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
2165 unsigned long pgoff) 2119 void *kaddr, unsigned long size)
2166{ 2120{
2167 struct vm_struct *area; 2121 struct vm_struct *area;
2168 unsigned long uaddr = vma->vm_start;
2169 unsigned long usize = vma->vm_end - vma->vm_start;
2170 2122
2171 if ((PAGE_SIZE-1) & (unsigned long)addr) 2123 size = PAGE_ALIGN(size);
2124
2125 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
2172 return -EINVAL; 2126 return -EINVAL;
2173 2127
2174 area = find_vm_area(addr); 2128 area = find_vm_area(kaddr);
2175 if (!area) 2129 if (!area)
2176 return -EINVAL; 2130 return -EINVAL;
2177 2131
2178 if (!(area->flags & VM_USERMAP)) 2132 if (!(area->flags & VM_USERMAP))
2179 return -EINVAL; 2133 return -EINVAL;
2180 2134
2181 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 2135 if (kaddr + size > area->addr + area->size)
2182 return -EINVAL; 2136 return -EINVAL;
2183 2137
2184 addr += pgoff << PAGE_SHIFT;
2185 do { 2138 do {
2186 struct page *page = vmalloc_to_page(addr); 2139 struct page *page = vmalloc_to_page(kaddr);
2187 int ret; 2140 int ret;
2188 2141
2189 ret = vm_insert_page(vma, uaddr, page); 2142 ret = vm_insert_page(vma, uaddr, page);
@@ -2191,14 +2144,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2191 return ret; 2144 return ret;
2192 2145
2193 uaddr += PAGE_SIZE; 2146 uaddr += PAGE_SIZE;
2194 addr += PAGE_SIZE; 2147 kaddr += PAGE_SIZE;
2195 usize -= PAGE_SIZE; 2148 size -= PAGE_SIZE;
2196 } while (usize > 0); 2149 } while (size > 0);
2197 2150
2198 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; 2151 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
2199 2152
2200 return 0; 2153 return 0;
2201} 2154}
2155EXPORT_SYMBOL(remap_vmalloc_range_partial);
2156
2157/**
2158 * remap_vmalloc_range - map vmalloc pages to userspace
2159 * @vma: vma to cover (map full range of vma)
2160 * @addr: vmalloc memory
2161 * @pgoff: number of pages into addr before first page to map
2162 *
2163 * Returns: 0 for success, -Exxx on failure
2164 *
2165 * This function checks that addr is a valid vmalloc'ed area, and
2166 * that it is big enough to cover the vma. Will return failure if
2167 * that criteria isn't met.
2168 *
2169 * Similar to remap_pfn_range() (see mm/memory.c)
2170 */
2171int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
2172 unsigned long pgoff)
2173{
2174 return remap_vmalloc_range_partial(vma, vma->vm_start,
2175 addr + (pgoff << PAGE_SHIFT),
2176 vma->vm_end - vma->vm_start);
2177}
2202EXPORT_SYMBOL(remap_vmalloc_range); 2178EXPORT_SYMBOL(remap_vmalloc_range);
2203 2179
2204/* 2180/*
@@ -2512,8 +2488,8 @@ found:
2512 2488
2513 /* insert all vm's */ 2489 /* insert all vm's */
2514 for (area = 0; area < nr_vms; area++) 2490 for (area = 0; area < nr_vms; area++)
2515 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, 2491 setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2516 pcpu_get_vm_areas); 2492 pcpu_get_vm_areas);
2517 2493
2518 kfree(vas); 2494 kfree(vas);
2519 return vms; 2495 return vms;
@@ -2592,11 +2568,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2592 if (!counters) 2568 if (!counters)
2593 return; 2569 return;
2594 2570
2595 /* Pair with smp_wmb() in clear_vm_unlist() */
2596 smp_rmb();
2597 if (v->flags & VM_UNLIST)
2598 return;
2599
2600 memset(counters, 0, nr_node_ids * sizeof(unsigned int)); 2571 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
2601 2572
2602 for (nr = 0; nr < v->nr_pages; nr++) 2573 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2625,6 +2596,11 @@ static int s_show(struct seq_file *m, void *p)
2625 2596
2626 v = va->vm; 2597 v = va->vm;
2627 2598
2599 /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
2600 smp_rmb();
2601 if (v->flags & VM_UNINITIALIZED)
2602 return 0;
2603
2628 seq_printf(m, "0x%pK-0x%pK %7ld", 2604 seq_printf(m, "0x%pK-0x%pK %7ld",
2629 v->addr, v->addr + v->size, v->size); 2605 v->addr, v->addr + v->size, v->size);
2630 2606
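
One small cleanup in the vmalloc.c hunks is that __get_vm_area_node() now computes the VM_IOREMAP alignment as 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER) instead of an if/else ladder. A quick standalone check that the two forms agree (IOREMAP_MAX_ORDER is arch-defined, 24 is just an assumed value for the demo, and fls() is reimplemented here rather than taken from the kernel):

#include <stdio.h>
#include <assert.h>

#define PAGE_SHIFT        12
#define IOREMAP_MAX_ORDER 24    /* assumed value for illustration only */

/* fls(): index of the most significant set bit, 1-based, fls(0) == 0 */
static int fls(unsigned long x)
{
    int r = 0;

    while (x) {
        r++;
        x >>= 1;
    }
    return r;
}

/* old __get_vm_area_node() logic: explicit if/else clamping */
static unsigned long align_old(unsigned long size)
{
    int bit = fls(size);

    if (bit > IOREMAP_MAX_ORDER)
        bit = IOREMAP_MAX_ORDER;
    else if (bit < PAGE_SHIFT)
        bit = PAGE_SHIFT;
    return 1UL << bit;
}

/* new logic: align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER),
 * written out here with the clamp() expanded */
static unsigned long align_new(unsigned long size)
{
    int bit = fls(size);

    if (bit < PAGE_SHIFT)
        bit = PAGE_SHIFT;
    if (bit > IOREMAP_MAX_ORDER)
        bit = IOREMAP_MAX_ORDER;
    return 1UL << bit;
}

int main(void)
{
    unsigned long sizes[] = { 1, 4096, 65536, 1UL << 20, 1UL << 30 };

    for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
        assert(align_old(sizes[i]) == align_new(sizes[i]));
        printf("size=%-12lu align=%lu\n", sizes[i], align_new(sizes[i]));
    }
    return 0;
}
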
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 736a6011c2c8..0c1e37d829fa 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -180,12 +180,12 @@ static void vmpressure_work_fn(struct work_struct *work)
180 if (!vmpr->scanned) 180 if (!vmpr->scanned)
181 return; 181 return;
182 182
183 mutex_lock(&vmpr->sr_lock); 183 spin_lock(&vmpr->sr_lock);
184 scanned = vmpr->scanned; 184 scanned = vmpr->scanned;
185 reclaimed = vmpr->reclaimed; 185 reclaimed = vmpr->reclaimed;
186 vmpr->scanned = 0; 186 vmpr->scanned = 0;
187 vmpr->reclaimed = 0; 187 vmpr->reclaimed = 0;
188 mutex_unlock(&vmpr->sr_lock); 188 spin_unlock(&vmpr->sr_lock);
189 189
190 do { 190 do {
191 if (vmpressure_event(vmpr, scanned, reclaimed)) 191 if (vmpressure_event(vmpr, scanned, reclaimed))
@@ -240,13 +240,13 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
240 if (!scanned) 240 if (!scanned)
241 return; 241 return;
242 242
243 mutex_lock(&vmpr->sr_lock); 243 spin_lock(&vmpr->sr_lock);
244 vmpr->scanned += scanned; 244 vmpr->scanned += scanned;
245 vmpr->reclaimed += reclaimed; 245 vmpr->reclaimed += reclaimed;
246 scanned = vmpr->scanned; 246 scanned = vmpr->scanned;
247 mutex_unlock(&vmpr->sr_lock); 247 spin_unlock(&vmpr->sr_lock);
248 248
249 if (scanned < vmpressure_win || work_pending(&vmpr->work)) 249 if (scanned < vmpressure_win)
250 return; 250 return;
251 schedule_work(&vmpr->work); 251 schedule_work(&vmpr->work);
252} 252}
@@ -367,8 +367,24 @@ void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft,
367 */ 367 */
368void vmpressure_init(struct vmpressure *vmpr) 368void vmpressure_init(struct vmpressure *vmpr)
369{ 369{
370 mutex_init(&vmpr->sr_lock); 370 spin_lock_init(&vmpr->sr_lock);
371 mutex_init(&vmpr->events_lock); 371 mutex_init(&vmpr->events_lock);
372 INIT_LIST_HEAD(&vmpr->events); 372 INIT_LIST_HEAD(&vmpr->events);
373 INIT_WORK(&vmpr->work, vmpressure_work_fn); 373 INIT_WORK(&vmpr->work, vmpressure_work_fn);
374} 374}
375
376/**
377 * vmpressure_cleanup() - shuts down vmpressure control structure
378 * @vmpr: Structure to be cleaned up
379 *
380 * This function should be called before the structure in which it is
381 * embedded is cleaned up.
382 */
383void vmpressure_cleanup(struct vmpressure *vmpr)
384{
385 /*
386 * Make sure there is no pending work before eventfd infrastructure
387 * goes away.
388 */
389 flush_work(&vmpr->work);
390}
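
The vmpressure.c changes switch the scanned/reclaimed accumulation to a spinlock (so it can run from the reclaim path without sleeping), drop the work_pending() short-circuit, and add vmpressure_cleanup() to flush the work before the structure goes away. As a loose single-threaded sketch of only the accumulate-then-report window logic (VMPRESSURE_WIN and the percentage formula are simplifications; the kernel derives the window from SWAP_CLUSTER_MAX and maps the ratio onto low/medium/critical levels):

#include <stdio.h>

#define VMPRESSURE_WIN 512      /* assumed window size for illustration */

static unsigned long acc_scanned, acc_reclaimed;

/* stand-in for vmpressure_work_fn(): consume the accumulated counters and
 * report a crude pressure ratio */
static void work_fn(void)
{
    unsigned long scanned = acc_scanned;
    unsigned long reclaimed = acc_reclaimed;

    acc_scanned = 0;
    acc_reclaimed = 0;

    if (!scanned)
        return;
    printf("window: scanned=%lu reclaimed=%lu pressure=%lu%%\n",
           scanned, reclaimed, (scanned - reclaimed) * 100 / scanned);
}

/* stand-in for vmpressure(): accumulate, and once a full window of pages has
 * been scanned hand the numbers off to the worker.  In the kernel this
 * accumulation now happens under a spinlock and the work runs asynchronously. */
static void vmpressure(unsigned long scanned, unsigned long reclaimed)
{
    acc_scanned += scanned;
    acc_reclaimed += reclaimed;

    if (acc_scanned < VMPRESSURE_WIN)
        return;
    work_fn();
}

int main(void)
{
    /* simulate a few reclaim passes of varying efficiency */
    for (int i = 0; i < 20; i++)
        vmpressure(100, i % 3 ? 60 : 10);
    return 0;
}
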
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..2cff0d491c6d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
546void putback_lru_page(struct page *page) 546void putback_lru_page(struct page *page)
547{ 547{
548 int lru; 548 int lru;
549 int active = !!TestClearPageActive(page);
550 int was_unevictable = PageUnevictable(page); 549 int was_unevictable = PageUnevictable(page);
551 550
552 VM_BUG_ON(PageLRU(page)); 551 VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
561 * unevictable page on [in]active list. 560 * unevictable page on [in]active list.
562 * We know how to handle that. 561 * We know how to handle that.
563 */ 562 */
564 lru = active + page_lru_base_type(page); 563 lru = page_lru_base_type(page);
565 lru_cache_add_lru(page, lru); 564 lru_cache_add(page);
566 } else { 565 } else {
567 /* 566 /*
568 * Put unevictable pages directly on zone's unevictable 567 * Put unevictable pages directly on zone's unevictable
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page,
669 return PAGEREF_RECLAIM; 668 return PAGEREF_RECLAIM;
670} 669}
671 670
671/* Check if a page is dirty or under writeback */
672static void page_check_dirty_writeback(struct page *page,
673 bool *dirty, bool *writeback)
674{
675 struct address_space *mapping;
676
677 /*
678 * Anonymous pages are not handled by flushers and must be written
679 * from reclaim context. Do not stall reclaim based on them
680 */
681 if (!page_is_file_cache(page)) {
682 *dirty = false;
683 *writeback = false;
684 return;
685 }
686
687 /* By default assume that the page flags are accurate */
688 *dirty = PageDirty(page);
689 *writeback = PageWriteback(page);
690
691 /* Verify dirty/writeback state if the filesystem supports it */
692 if (!page_has_private(page))
693 return;
694
695 mapping = page_mapping(page);
696 if (mapping && mapping->a_ops->is_dirty_writeback)
697 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
698}
699
672/* 700/*
673 * shrink_page_list() returns the number of reclaimed pages 701 * shrink_page_list() returns the number of reclaimed pages
674 */ 702 */
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list,
677 struct scan_control *sc, 705 struct scan_control *sc,
678 enum ttu_flags ttu_flags, 706 enum ttu_flags ttu_flags,
679 unsigned long *ret_nr_dirty, 707 unsigned long *ret_nr_dirty,
708 unsigned long *ret_nr_unqueued_dirty,
709 unsigned long *ret_nr_congested,
680 unsigned long *ret_nr_writeback, 710 unsigned long *ret_nr_writeback,
711 unsigned long *ret_nr_immediate,
681 bool force_reclaim) 712 bool force_reclaim)
682{ 713{
683 LIST_HEAD(ret_pages); 714 LIST_HEAD(ret_pages);
684 LIST_HEAD(free_pages); 715 LIST_HEAD(free_pages);
685 int pgactivate = 0; 716 int pgactivate = 0;
717 unsigned long nr_unqueued_dirty = 0;
686 unsigned long nr_dirty = 0; 718 unsigned long nr_dirty = 0;
687 unsigned long nr_congested = 0; 719 unsigned long nr_congested = 0;
688 unsigned long nr_reclaimed = 0; 720 unsigned long nr_reclaimed = 0;
689 unsigned long nr_writeback = 0; 721 unsigned long nr_writeback = 0;
722 unsigned long nr_immediate = 0;
690 723
691 cond_resched(); 724 cond_resched();
692 725
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
696 struct page *page; 729 struct page *page;
697 int may_enter_fs; 730 int may_enter_fs;
698 enum page_references references = PAGEREF_RECLAIM_CLEAN; 731 enum page_references references = PAGEREF_RECLAIM_CLEAN;
732 bool dirty, writeback;
699 733
700 cond_resched(); 734 cond_resched();
701 735
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list,
723 may_enter_fs = (sc->gfp_mask & __GFP_FS) || 757 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
724 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 758 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
725 759
760 /*
761 * The number of dirty pages determines if a zone is marked
762 * reclaim_congested which affects wait_iff_congested. kswapd
763 * will stall and start writing pages if the tail of the LRU
764 * is all dirty unqueued pages.
765 */
766 page_check_dirty_writeback(page, &dirty, &writeback);
767 if (dirty || writeback)
768 nr_dirty++;
769
770 if (dirty && !writeback)
771 nr_unqueued_dirty++;
772
773 /*
774 * Treat this page as congested if the underlying BDI is or if
775 * pages are cycling through the LRU so quickly that the
776 * pages marked for immediate reclaim are making it to the
777 * end of the LRU a second time.
778 */
779 mapping = page_mapping(page);
780 if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
781 (writeback && PageReclaim(page)))
782 nr_congested++;
783
784 /*
785 * If a page at the tail of the LRU is under writeback, there
786 * are three cases to consider.
787 *
788 * 1) If reclaim is encountering an excessive number of pages
789 * under writeback and this page is both under writeback and
790 * PageReclaim then it indicates that pages are being queued
791 * for IO but are being recycled through the LRU before the
792 * IO can complete. Waiting on the page itself risks an
793 * indefinite stall if it is impossible to writeback the
794 * page due to IO error or disconnected storage so instead
795 * note that the LRU is being scanned too quickly and the
796 * caller can stall after page list has been processed.
797 *
798 * 2) Global reclaim encounters a page, memcg encounters a
799 * page that is not marked for immediate reclaim or
800 * the caller does not have __GFP_IO. In this case mark
801 * the page for immediate reclaim and continue scanning.
802 *
803 * __GFP_IO is checked because a loop driver thread might
804 * enter reclaim, and deadlock if it waits on a page for
805 * which it is needed to do the write (loop masks off
806 * __GFP_IO|__GFP_FS for this reason); but more thought
807 * would probably show more reasons.
808 *
809 * Don't require __GFP_FS, since we're not going into the
810 * FS, just waiting on its writeback completion. Worryingly,
811 * ext4 gfs2 and xfs allocate pages with
812 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
813 * may_enter_fs here is liable to OOM on them.
814 *
815 * 3) memcg encounters a page that is not already marked
816 * PageReclaim. memcg does not have any dirty pages
817 * throttling so we could easily OOM just because too many
818 * pages are in writeback and there is nothing else to
819 * reclaim. Wait for the writeback to complete.
820 */
726 if (PageWriteback(page)) { 821 if (PageWriteback(page)) {
727 /* 822 /* Case 1 above */
728 * memcg doesn't have any dirty pages throttling so we 823 if (current_is_kswapd() &&
729 * could easily OOM just because too many pages are in 824 PageReclaim(page) &&
730 * writeback and there is nothing else to reclaim. 825 zone_is_reclaim_writeback(zone)) {
731 * 826 nr_immediate++;
732 * Check __GFP_IO, certainly because a loop driver 827 goto keep_locked;
733 * thread might enter reclaim, and deadlock if it waits 828
734 * on a page for which it is needed to do the write 829 /* Case 2 above */
735 * (loop masks off __GFP_IO|__GFP_FS for this reason); 830 } else if (global_reclaim(sc) ||
736 * but more thought would probably show more reasons.
737 *
738 * Don't require __GFP_FS, since we're not going into
739 * the FS, just waiting on its writeback completion.
740 * Worryingly, ext4 gfs2 and xfs allocate pages with
741 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
742 * testing may_enter_fs here is liable to OOM on them.
743 */
744 if (global_reclaim(sc) ||
745 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { 831 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
746 /* 832 /*
747 * This is slightly racy - end_page_writeback() 833 * This is slightly racy - end_page_writeback()
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
756 */ 842 */
757 SetPageReclaim(page); 843 SetPageReclaim(page);
758 nr_writeback++; 844 nr_writeback++;
845
759 goto keep_locked; 846 goto keep_locked;
847
848 /* Case 3 above */
849 } else {
850 wait_on_page_writeback(page);
760 } 851 }
761 wait_on_page_writeback(page);
762 } 852 }
763 853
764 if (!force_reclaim) 854 if (!force_reclaim)
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
784 if (!add_to_swap(page, page_list)) 874 if (!add_to_swap(page, page_list))
785 goto activate_locked; 875 goto activate_locked;
786 may_enter_fs = 1; 876 may_enter_fs = 1;
787 }
788 877
789 mapping = page_mapping(page); 878 /* Adding to swap updated mapping */
879 mapping = page_mapping(page);
880 }
790 881
791 /* 882 /*
792 * The page is mapped into the page tables of one or more 883 * The page is mapped into the page tables of one or more
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 } 897 }
807 898
808 if (PageDirty(page)) { 899 if (PageDirty(page)) {
809 nr_dirty++;
810
811 /* 900 /*
812 * Only kswapd can writeback filesystem pages to 901 * Only kswapd can writeback filesystem pages to
813 * avoid risk of stack overflow but do not writeback 902 * avoid risk of stack overflow but only writeback
814 * unless under significant pressure. 903 * if many dirty pages have been encountered.
815 */ 904 */
816 if (page_is_file_cache(page) && 905 if (page_is_file_cache(page) &&
817 (!current_is_kswapd() || 906 (!current_is_kswapd() ||
818 sc->priority >= DEF_PRIORITY - 2)) { 907 !zone_is_reclaim_dirty(zone))) {
819 /* 908 /*
820 * Immediately reclaim when written back. 909 * Immediately reclaim when written back.
821 * Similar in principle to deactivate_page() 910 * Similar in principle to deactivate_page()
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
838 /* Page is dirty, try to write it out here */ 927 /* Page is dirty, try to write it out here */
839 switch (pageout(page, mapping, sc)) { 928 switch (pageout(page, mapping, sc)) {
840 case PAGE_KEEP: 929 case PAGE_KEEP:
841 nr_congested++;
842 goto keep_locked; 930 goto keep_locked;
843 case PAGE_ACTIVATE: 931 case PAGE_ACTIVATE:
844 goto activate_locked; 932 goto activate_locked;
@@ -946,22 +1034,16 @@ keep:
946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 1034 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
947 } 1035 }
948 1036
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
958 free_hot_cold_page_list(&free_pages, 1); 1037 free_hot_cold_page_list(&free_pages, 1);
959 1038
960 list_splice(&ret_pages, page_list); 1039 list_splice(&ret_pages, page_list);
961 count_vm_events(PGACTIVATE, pgactivate); 1040 count_vm_events(PGACTIVATE, pgactivate);
962 mem_cgroup_uncharge_end(); 1041 mem_cgroup_uncharge_end();
963 *ret_nr_dirty += nr_dirty; 1042 *ret_nr_dirty += nr_dirty;
1043 *ret_nr_congested += nr_congested;
1044 *ret_nr_unqueued_dirty += nr_unqueued_dirty;
964 *ret_nr_writeback += nr_writeback; 1045 *ret_nr_writeback += nr_writeback;
1046 *ret_nr_immediate += nr_immediate;
965 return nr_reclaimed; 1047 return nr_reclaimed;
966} 1048}
967 1049
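
The shrink_page_list() hunks above add per-list accounting: nr_dirty and nr_unqueued_dirty from page_check_dirty_writeback(), nr_congested for pages on a congested BDI or recycled while PageReclaim, nr_writeback for pages newly tagged for immediate reclaim, and nr_immediate for the kswapd "case 1" path. A much-simplified standalone model of that classification (the zone flags, locking and the memcg wait in case 3 are deliberately left out):

#include <stdio.h>
#include <stdbool.h>

struct tpage {
    bool file;              /* file-backed; anon pages are never counted dirty here */
    bool dirty;
    bool writeback;
    bool reclaim;           /* PageReclaim: already tagged for immediate reclaim */
    bool bdi_congested;     /* backing device congested */
};

struct counters {
    unsigned long dirty, unqueued_dirty, congested, writeback, immediate;
};

/* mirrors the new accounting at the top of shrink_page_list(): classify each
 * page and count pages that came back to the LRU tail still under writeback */
static void account(const struct tpage *pages, int n, bool is_kswapd,
                    struct counters *c)
{
    for (int i = 0; i < n; i++) {
        const struct tpage *p = &pages[i];
        bool dirty = p->file && p->dirty;
        bool writeback = p->file && p->writeback;

        if (dirty || writeback)
            c->dirty++;
        if (dirty && !writeback)
            c->unqueued_dirty++;
        if (p->bdi_congested || (writeback && p->reclaim))
            c->congested++;
        if (p->writeback) {
            if (is_kswapd && p->reclaim)
                c->immediate++;     /* case 1: recycled before the IO finished */
            else
                c->writeback++;     /* case 2: tag for reclaim, keep scanning */
        }
    }
}

int main(void)
{
    struct tpage pages[] = {
        { .file = true, .dirty = true },
        { .file = true, .dirty = true, .writeback = true },
        { .file = true, .writeback = true, .reclaim = true },
        { .file = false, .dirty = true },   /* anon: ignored by the dirty accounting */
    };
    struct counters c = { 0 };

    account(pages, 4, true, &c);
    printf("dirty=%lu unqueued=%lu congested=%lu writeback=%lu immediate=%lu\n",
           c.dirty, c.unqueued_dirty, c.congested, c.writeback, c.immediate);
    return 0;
}
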
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
973 .priority = DEF_PRIORITY, 1055 .priority = DEF_PRIORITY,
974 .may_unmap = 1, 1056 .may_unmap = 1,
975 }; 1057 };
976 unsigned long ret, dummy1, dummy2; 1058 unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
977 struct page *page, *next; 1059 struct page *page, *next;
978 LIST_HEAD(clean_pages); 1060 LIST_HEAD(clean_pages);
979 1061
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
985 } 1067 }
986 1068
987 ret = shrink_page_list(&clean_pages, zone, &sc, 1069 ret = shrink_page_list(&clean_pages, zone, &sc,
988 TTU_UNMAP|TTU_IGNORE_ACCESS, 1070 TTU_UNMAP|TTU_IGNORE_ACCESS,
989 &dummy1, &dummy2, true); 1071 &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
990 list_splice(&clean_pages, page_list); 1072 list_splice(&clean_pages, page_list);
991 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); 1073 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
992 return ret; 1074 return ret;
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1281 unsigned long nr_reclaimed = 0; 1363 unsigned long nr_reclaimed = 0;
1282 unsigned long nr_taken; 1364 unsigned long nr_taken;
1283 unsigned long nr_dirty = 0; 1365 unsigned long nr_dirty = 0;
1366 unsigned long nr_congested = 0;
1367 unsigned long nr_unqueued_dirty = 0;
1284 unsigned long nr_writeback = 0; 1368 unsigned long nr_writeback = 0;
1369 unsigned long nr_immediate = 0;
1285 isolate_mode_t isolate_mode = 0; 1370 isolate_mode_t isolate_mode = 0;
1286 int file = is_file_lru(lru); 1371 int file = is_file_lru(lru);
1287 struct zone *zone = lruvec_zone(lruvec); 1372 struct zone *zone = lruvec_zone(lruvec);
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1323 return 0; 1408 return 0;
1324 1409
1325 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, 1410 nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
1326 &nr_dirty, &nr_writeback, false); 1411 &nr_dirty, &nr_unqueued_dirty, &nr_congested,
1412 &nr_writeback, &nr_immediate,
1413 false);
1327 1414
1328 spin_lock_irq(&zone->lru_lock); 1415 spin_lock_irq(&zone->lru_lock);
1329 1416
@@ -1356,21 +1443,51 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1356 * as there is no guarantee the dirtying process is throttled in the 1443 * as there is no guarantee the dirtying process is throttled in the
1357 * same way balance_dirty_pages() manages. 1444 * same way balance_dirty_pages() manages.
1358 * 1445 *
1359 * This scales the number of dirty pages that must be under writeback 1446 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
1360 * before throttling depending on priority. It is a simple backoff 1447 * of pages under writeback flagged for immediate reclaim and stall if any
1361 * function that has the most effect in the range DEF_PRIORITY to 1448 * are encountered in the nr_immediate check below.
1362 * DEF_PRIORITY-2 which is the priority reclaim is considered to be 1449 */
1363 * in trouble and reclaim is considered to be in trouble. 1450 if (nr_writeback && nr_writeback == nr_taken)
1364 * 1451 zone_set_flag(zone, ZONE_WRITEBACK);
1365 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle 1452
1366 * DEF_PRIORITY-1 50% must be PageWriteback 1453 /*
1367 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble 1454 * memcg will stall in page writeback so only consider forcibly
1368 * ... 1455 * stalling for global reclaim
1369 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1370 * isolated page is PageWriteback
1371 */ 1456 */
1372 if (nr_writeback && nr_writeback >= 1457 if (global_reclaim(sc)) {
1373 (nr_taken >> (DEF_PRIORITY - sc->priority))) 1458 /*
1459 * Tag a zone as congested if all the dirty pages scanned were
1460 * backed by a congested BDI and wait_iff_congested will stall.
1461 */
1462 if (nr_dirty && nr_dirty == nr_congested)
1463 zone_set_flag(zone, ZONE_CONGESTED);
1464
1465 /*
1466 * If dirty pages are scanned that are not queued for IO, it
1467 * implies that flushers are not keeping up. In this case, flag
1468 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
1469 * pages from reclaim context. It will forcibly stall in the
1470 * next check.
1471 */
1472 if (nr_unqueued_dirty == nr_taken)
1473 zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
1474
1475 /*
 1476 * In addition, if kswapd scans pages marked for
1477 * immediate reclaim and under writeback (nr_immediate), it
1478 * implies that pages are cycling through the LRU faster than
1479 * they are written so also forcibly stall.
1480 */
1481 if (nr_unqueued_dirty == nr_taken || nr_immediate)
1482 congestion_wait(BLK_RW_ASYNC, HZ/10);
1483 }
1484
1485 /*
1486 * Stall direct reclaim for IO completions if underlying BDIs or zone
1487 * is congested. Allow kswapd to continue until it starts encountering
1488 * unqueued dirty pages or cycling through the LRU too quickly.
1489 */
1490 if (!sc->hibernation_mode && !current_is_kswapd())
1374 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1491 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1375 1492
1376 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1493 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
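
The shrink_inactive_list() hunk above replaces the priority-scaled writeback throttle with flag-based heuristics: ZONE_WRITEBACK when every isolated page was under writeback, ZONE_CONGESTED when every dirty page sat on a congested BDI, ZONE_TAIL_LRU_DIRTY when none of the dirty pages were queued for IO, a congestion_wait() stall for unqueued dirty or immediate-reclaim pages, and wait_iff_congested() only for non-kswapd reclaim. A compact standalone sketch of those decisions (global reclaim assumed; sc->hibernation_mode is omitted):

#include <stdio.h>
#include <stdbool.h>

struct scan_result {
    unsigned long taken, dirty, congested, unqueued_dirty, writeback, immediate;
};

/* mirrors the post-shrink heuristics added to shrink_inactive_list() for the
 * global-reclaim case: which zone flags would be set and who stalls */
static void evaluate(const struct scan_result *r, bool is_kswapd)
{
    bool zone_writeback = r->writeback && r->writeback == r->taken;
    bool zone_congested = r->dirty && r->dirty == r->congested;
    bool zone_tail_dirty = r->unqueued_dirty == r->taken;
    bool stall = r->unqueued_dirty == r->taken || r->immediate;

    printf("WRITEBACK=%d CONGESTED=%d TAIL_LRU_DIRTY=%d "
           "congestion_wait=%d wait_iff_congested=%d\n",
           zone_writeback, zone_congested, zone_tail_dirty,
           stall, !is_kswapd);
}

int main(void)
{
    /* every isolated page was under writeback: flag the zone so kswapd
     * throttles on nr_immediate the next time round */
    struct scan_result all_wb = { .taken = 32, .writeback = 32 };
    /* every dirty page sat on a congested BDI: direct reclaim backs off */
    struct scan_result all_cong = { .taken = 32, .dirty = 8, .congested = 8 };

    evaluate(&all_wb, true);
    evaluate(&all_cong, false);
    return 0;
}
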
@@ -1822,17 +1939,25 @@ out:
1822static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) 1939static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1823{ 1940{
1824 unsigned long nr[NR_LRU_LISTS]; 1941 unsigned long nr[NR_LRU_LISTS];
1942 unsigned long targets[NR_LRU_LISTS];
1825 unsigned long nr_to_scan; 1943 unsigned long nr_to_scan;
1826 enum lru_list lru; 1944 enum lru_list lru;
1827 unsigned long nr_reclaimed = 0; 1945 unsigned long nr_reclaimed = 0;
1828 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1946 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1829 struct blk_plug plug; 1947 struct blk_plug plug;
1948 bool scan_adjusted = false;
1830 1949
1831 get_scan_count(lruvec, sc, nr); 1950 get_scan_count(lruvec, sc, nr);
1832 1951
1952 /* Record the original scan target for proportional adjustments later */
1953 memcpy(targets, nr, sizeof(nr));
1954
1833 blk_start_plug(&plug); 1955 blk_start_plug(&plug);
1834 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1956 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1835 nr[LRU_INACTIVE_FILE]) { 1957 nr[LRU_INACTIVE_FILE]) {
1958 unsigned long nr_anon, nr_file, percentage;
1959 unsigned long nr_scanned;
1960
1836 for_each_evictable_lru(lru) { 1961 for_each_evictable_lru(lru) {
1837 if (nr[lru]) { 1962 if (nr[lru]) {
1838 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); 1963 nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
@@ -1842,17 +1967,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
1842 lruvec, sc); 1967 lruvec, sc);
1843 } 1968 }
1844 } 1969 }
1970
1971 if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
1972 continue;
1973
1845 /* 1974 /*
1846 * On large memory systems, scan >> priority can become 1975 * For global direct reclaim, reclaim only the number of pages
1847 * really large. This is fine for the starting priority; 1976 * requested. Less care is taken to scan proportionally as it
1848 * we want to put equal scanning pressure on each zone. 1977 * is more important to minimise direct reclaim stall latency
1849 * However, if the VM has a harder time of freeing pages, 1978 * than it is to properly age the LRU lists.
1850 * with multiple processes reclaiming pages, the total
1851 * freeing target can get unreasonably large.
1852 */ 1979 */
1853 if (nr_reclaimed >= nr_to_reclaim && 1980 if (global_reclaim(sc) && !current_is_kswapd())
1854 sc->priority < DEF_PRIORITY)
1855 break; 1981 break;
1982
1983 /*
1984 * For kswapd and memcg, reclaim at least the number of pages
1985 * requested. Ensure that the anon and file LRUs shrink
 1986 * in proportion to what was requested by get_scan_count(). We
 1987 * stop reclaiming one LRU and reduce the amount of scanning
 1988 * in proportion to the original scan target.
1989 */
1990 nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
1991 nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
1992
1993 if (nr_file > nr_anon) {
1994 unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
1995 targets[LRU_ACTIVE_ANON] + 1;
1996 lru = LRU_BASE;
1997 percentage = nr_anon * 100 / scan_target;
1998 } else {
1999 unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2000 targets[LRU_ACTIVE_FILE] + 1;
2001 lru = LRU_FILE;
2002 percentage = nr_file * 100 / scan_target;
2003 }
2004
2005 /* Stop scanning the smaller of the LRU */
2006 nr[lru] = 0;
2007 nr[lru + LRU_ACTIVE] = 0;
2008
2009 /*
2010 * Recalculate the other LRU scan count based on its original
2011 * scan target and the percentage scanning already complete
2012 */
2013 lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2014 nr_scanned = targets[lru] - nr[lru];
2015 nr[lru] = targets[lru] * (100 - percentage) / 100;
2016 nr[lru] -= min(nr[lru], nr_scanned);
2017
2018 lru += LRU_ACTIVE;
2019 nr_scanned = targets[lru] - nr[lru];
2020 nr[lru] = targets[lru] * (100 - percentage) / 100;
2021 nr[lru] -= min(nr[lru], nr_scanned);
2022
2023 scan_adjusted = true;
1856 } 2024 }
1857 blk_finish_plug(&plug); 2025 blk_finish_plug(&plug);
1858 sc->nr_reclaimed += nr_reclaimed; 2026 sc->nr_reclaimed += nr_reclaimed;
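
A worked example makes the proportional rescaling above easier to follow. Suppose get_scan_count() asked for 100 anon and 400 file pages, one pass of SWAP_CLUSTER_MAX (32) has been scanned from each list, and nr_to_reclaim has already been met. The stand-alone sketch below uses invented numbers and lumps each LRU pair together, whereas the kernel recalculates the active and inactive lists of the remaining LRU separately:

#include <stdio.h>

int main(void)
{
	unsigned long targets_anon = 100, targets_file = 400;	/* original scan targets */
	unsigned long nr_anon = 68, nr_file = 368;		/* 32 scanned from each */

	/* File LRU is larger, so anon scanning stops and file is scaled down. */
	unsigned long scan_target = targets_anon + 1;
	unsigned long percentage = nr_anon * 100 / scan_target;	/* ~67% of anon left */

	unsigned long nr_scanned = targets_file - nr_file;		/* 32 already scanned */
	unsigned long remaining = targets_file * (100 - percentage) / 100;
	remaining -= (remaining > nr_scanned) ? nr_scanned : remaining;

	printf("file scan target drops from %lu to %lu pages\n", nr_file, remaining);
	return 0;
}
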
@@ -2179,8 +2347,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2179 aborted_reclaim = shrink_zones(zonelist, sc); 2347 aborted_reclaim = shrink_zones(zonelist, sc);
2180 2348
2181 /* 2349 /*
2182 * Don't shrink slabs when reclaiming memory from 2350 * Don't shrink slabs when reclaiming memory from over limit
2183 * over limit cgroups 2351 * cgroups but do shrink slab at least once when aborting
2352 * reclaim for compaction to avoid unevenly scanning file/anon
2353 * LRU pages over slab pages.
2184 */ 2354 */
2185 if (global_reclaim(sc)) { 2355 if (global_reclaim(sc)) {
2186 unsigned long lru_pages = 0; 2356 unsigned long lru_pages = 0;
@@ -2222,18 +2392,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2222 WB_REASON_TRY_TO_FREE_PAGES); 2392 WB_REASON_TRY_TO_FREE_PAGES);
2223 sc->may_writepage = 1; 2393 sc->may_writepage = 1;
2224 } 2394 }
2225 2395 } while (--sc->priority >= 0 && !aborted_reclaim);
2226 /* Take a nap, wait for some writeback to complete */
2227 if (!sc->hibernation_mode && sc->nr_scanned &&
2228 sc->priority < DEF_PRIORITY - 2) {
2229 struct zone *preferred_zone;
2230
2231 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2232 &cpuset_current_mems_allowed,
2233 &preferred_zone);
2234 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2235 }
2236 } while (--sc->priority >= 0);
2237 2396
2238out: 2397out:
2239 delayacct_freepages_end(); 2398 delayacct_freepages_end();
@@ -2601,6 +2760,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2601} 2760}
2602 2761
2603/* 2762/*
2763 * kswapd shrinks the zone by the number of pages required to reach
2764 * the high watermark.
2765 *
2766 * Returns true if kswapd scanned at least the requested number of pages to
2767 * reclaim or if the lack of progress was due to pages under writeback.
2768 * This is used to determine if the scanning priority needs to be raised.
2769 */
2770static bool kswapd_shrink_zone(struct zone *zone,
2771 int classzone_idx,
2772 struct scan_control *sc,
2773 unsigned long lru_pages,
2774 unsigned long *nr_attempted)
2775{
2776 unsigned long nr_slab;
2777 int testorder = sc->order;
2778 unsigned long balance_gap;
2779 struct reclaim_state *reclaim_state = current->reclaim_state;
2780 struct shrink_control shrink = {
2781 .gfp_mask = sc->gfp_mask,
2782 };
2783 bool lowmem_pressure;
2784
2785 /* Reclaim above the high watermark. */
2786 sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
2787
2788 /*
2789 * Kswapd reclaims only single pages with compaction enabled. Trying
2790 * too hard to reclaim until contiguous free pages have become
2791 * available can hurt performance by evicting too much useful data
2792 * from memory. Do not reclaim more than needed for compaction.
2793 */
2794 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2795 compaction_suitable(zone, sc->order) !=
2796 COMPACT_SKIPPED)
2797 testorder = 0;
2798
2799 /*
2800 * We put equal pressure on every zone, unless one zone has way too
2801 * many pages free already. The "too many pages" is defined as the
2802 * high wmark plus a "gap" where the gap is either the low
2803 * watermark or 1% of the zone, whichever is smaller.
2804 */
2805 balance_gap = min(low_wmark_pages(zone),
2806 (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2807 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2808
2809 /*
2810 * If there is no low memory pressure or the zone is balanced then no
2811 * reclaim is necessary
2812 */
2813 lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
2814 if (!lowmem_pressure && zone_balanced(zone, testorder,
2815 balance_gap, classzone_idx))
2816 return true;
2817
2818 shrink_zone(zone, sc);
2819
2820 reclaim_state->reclaimed_slab = 0;
2821 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2822 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2823
2824 /* Account for the number of pages attempted to reclaim */
2825 *nr_attempted += sc->nr_to_reclaim;
2826
2827 if (nr_slab == 0 && !zone_reclaimable(zone))
2828 zone->all_unreclaimable = 1;
2829
2830 zone_clear_flag(zone, ZONE_WRITEBACK);
2831
2832 /*
2833 * If a zone reaches its high watermark, consider it to be no longer
2834 * congested. It's possible there are dirty pages backed by congested
2835 * BDIs but as pressure is relieved, speculatively avoid congestion
2836 * waits.
2837 */
2838 if (!zone->all_unreclaimable &&
2839 zone_balanced(zone, testorder, 0, classzone_idx)) {
2840 zone_clear_flag(zone, ZONE_CONGESTED);
2841 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2842 }
2843
2844 return sc->nr_scanned >= sc->nr_to_reclaim;
2845}
2846
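
The balance gap computed above is small in practice. Assuming KSWAPD_ZONE_BALANCE_GAP_RATIO is 100 (the "1% of the zone" mentioned in the comment) and the invented zone sizes below, the gap is simply the smaller of the low watermark and 1% of the managed pages:

#include <stdio.h>

#define KSWAPD_ZONE_BALANCE_GAP_RATIO 100	/* assumed from the "1%" comment */

int main(void)
{
	unsigned long managed_pages = 1UL << 20;	/* hypothetical: 4GB of 4K pages */
	unsigned long low_wmark = 8192;			/* hypothetical low watermark */

	unsigned long one_percent = (managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
				    KSWAPD_ZONE_BALANCE_GAP_RATIO;
	unsigned long balance_gap = low_wmark < one_percent ? low_wmark : one_percent;

	/* The zone counts as balanced only above high_wmark + balance_gap. */
	printf("balance_gap = %lu pages\n", balance_gap);
	return 0;
}
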
2847/*
2604 * For kswapd, balance_pgdat() will work across all this node's zones until 2848 * For kswapd, balance_pgdat() will work across all this node's zones until
2605 * they are all at high_wmark_pages(zone). 2849 * they are all at high_wmark_pages(zone).
2606 * 2850 *
@@ -2624,35 +2868,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2624static unsigned long balance_pgdat(pg_data_t *pgdat, int order, 2868static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2625 int *classzone_idx) 2869 int *classzone_idx)
2626{ 2870{
2627 bool pgdat_is_balanced = false;
2628 int i; 2871 int i;
2629 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2872 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2630 struct reclaim_state *reclaim_state = current->reclaim_state;
2631 unsigned long nr_soft_reclaimed; 2873 unsigned long nr_soft_reclaimed;
2632 unsigned long nr_soft_scanned; 2874 unsigned long nr_soft_scanned;
2633 struct scan_control sc = { 2875 struct scan_control sc = {
2634 .gfp_mask = GFP_KERNEL, 2876 .gfp_mask = GFP_KERNEL,
2877 .priority = DEF_PRIORITY,
2635 .may_unmap = 1, 2878 .may_unmap = 1,
2636 .may_swap = 1, 2879 .may_swap = 1,
2637 /* 2880 .may_writepage = !laptop_mode,
2638 * kswapd doesn't want to be bailed out while reclaim. because
2639 * we want to put equal scanning pressure on each zone.
2640 */
2641 .nr_to_reclaim = ULONG_MAX,
2642 .order = order, 2881 .order = order,
2643 .target_mem_cgroup = NULL, 2882 .target_mem_cgroup = NULL,
2644 }; 2883 };
2645 struct shrink_control shrink = {
2646 .gfp_mask = sc.gfp_mask,
2647 };
2648loop_again:
2649 sc.priority = DEF_PRIORITY;
2650 sc.nr_reclaimed = 0;
2651 sc.may_writepage = !laptop_mode;
2652 count_vm_event(PAGEOUTRUN); 2884 count_vm_event(PAGEOUTRUN);
2653 2885
2654 do { 2886 do {
2655 unsigned long lru_pages = 0; 2887 unsigned long lru_pages = 0;
2888 unsigned long nr_attempted = 0;
2889 bool raise_priority = true;
2890 bool pgdat_needs_compaction = (order > 0);
2891
2892 sc.nr_reclaimed = 0;
2656 2893
2657 /* 2894 /*
2658 * Scan in the highmem->dma direction for the highest 2895 * Scan in the highmem->dma direction for the highest
@@ -2689,23 +2926,46 @@ loop_again:
2689 end_zone = i; 2926 end_zone = i;
2690 break; 2927 break;
2691 } else { 2928 } else {
2692 /* If balanced, clear the congested flag */ 2929 /*
2930 * If balanced, clear the dirty and congested
2931 * flags
2932 */
2693 zone_clear_flag(zone, ZONE_CONGESTED); 2933 zone_clear_flag(zone, ZONE_CONGESTED);
2934 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
2694 } 2935 }
2695 } 2936 }
2696 2937
2697 if (i < 0) { 2938 if (i < 0)
2698 pgdat_is_balanced = true;
2699 goto out; 2939 goto out;
2700 }
2701 2940
2702 for (i = 0; i <= end_zone; i++) { 2941 for (i = 0; i <= end_zone; i++) {
2703 struct zone *zone = pgdat->node_zones + i; 2942 struct zone *zone = pgdat->node_zones + i;
2704 2943
2944 if (!populated_zone(zone))
2945 continue;
2946
2705 lru_pages += zone_reclaimable_pages(zone); 2947 lru_pages += zone_reclaimable_pages(zone);
2948
2949 /*
2950 * If any zone is currently balanced then kswapd will
2951 * not call compaction as it is expected that the
2952 * necessary pages are already available.
2953 */
2954 if (pgdat_needs_compaction &&
2955 zone_watermark_ok(zone, order,
2956 low_wmark_pages(zone),
2957 *classzone_idx, 0))
2958 pgdat_needs_compaction = false;
2706 } 2959 }
2707 2960
2708 /* 2961 /*
2962 * If we're getting trouble reclaiming, start doing writepage
2963 * even in laptop mode.
2964 */
2965 if (sc.priority < DEF_PRIORITY - 2)
2966 sc.may_writepage = 1;
2967
2968 /*
2709 * Now scan the zone in the dma->highmem direction, stopping 2969 * Now scan the zone in the dma->highmem direction, stopping
2710 * at the last zone which needs scanning. 2970 * at the last zone which needs scanning.
2711 * 2971 *
@@ -2716,8 +2976,6 @@ loop_again:
2716 */ 2976 */
2717 for (i = 0; i <= end_zone; i++) { 2977 for (i = 0; i <= end_zone; i++) {
2718 struct zone *zone = pgdat->node_zones + i; 2978 struct zone *zone = pgdat->node_zones + i;
2719 int nr_slab, testorder;
2720 unsigned long balance_gap;
2721 2979
2722 if (!populated_zone(zone)) 2980 if (!populated_zone(zone))
2723 continue; 2981 continue;
@@ -2738,65 +2996,14 @@ loop_again:
2738 sc.nr_reclaimed += nr_soft_reclaimed; 2996 sc.nr_reclaimed += nr_soft_reclaimed;
2739 2997
2740 /* 2998 /*
2741 * We put equal pressure on every zone, unless 2999 * There should be no need to raise the scanning
2742 * one zone has way too many pages free 3000 * priority if enough pages are already being scanned
2743 * already. The "too many pages" is defined 3001 * that the high watermark would be met at 100%
2744 * as the high wmark plus a "gap" where the 3002 * efficiency.
2745 * gap is either the low watermark or 1%
2746 * of the zone, whichever is smaller.
2747 */ 3003 */
2748 balance_gap = min(low_wmark_pages(zone), 3004 if (kswapd_shrink_zone(zone, end_zone, &sc,
2749 (zone->managed_pages + 3005 lru_pages, &nr_attempted))
2750 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 3006 raise_priority = false;
2751 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2752 /*
2753 * Kswapd reclaims only single pages with compaction
2754 * enabled. Trying too hard to reclaim until contiguous
2755 * free pages have become available can hurt performance
2756 * by evicting too much useful data from memory.
2757 * Do not reclaim more than needed for compaction.
2758 */
2759 testorder = order;
2760 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2761 compaction_suitable(zone, order) !=
2762 COMPACT_SKIPPED)
2763 testorder = 0;
2764
2765 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2766 !zone_balanced(zone, testorder,
2767 balance_gap, end_zone)) {
2768 shrink_zone(zone, &sc);
2769
2770 reclaim_state->reclaimed_slab = 0;
2771 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2772 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2773
2774 if (nr_slab == 0 && !zone_reclaimable(zone))
2775 zone->all_unreclaimable = 1;
2776 }
2777
2778 /*
2779 * If we're getting trouble reclaiming, start doing
2780 * writepage even in laptop mode.
2781 */
2782 if (sc.priority < DEF_PRIORITY - 2)
2783 sc.may_writepage = 1;
2784
2785 if (zone->all_unreclaimable) {
2786 if (end_zone && end_zone == i)
2787 end_zone--;
2788 continue;
2789 }
2790
2791 if (zone_balanced(zone, testorder, 0, end_zone))
2792 /*
2793 * If a zone reaches its high watermark,
2794 * consider it to be no longer congested. It's
2795 * possible there are dirty pages backed by
2796 * congested BDIs but as pressure is relieved,
2797 * speculatively avoid congestion waits
2798 */
2799 zone_clear_flag(zone, ZONE_CONGESTED);
2800 } 3007 }
2801 3008
2802 /* 3009 /*
@@ -2808,74 +3015,38 @@ loop_again:
2808 pfmemalloc_watermark_ok(pgdat)) 3015 pfmemalloc_watermark_ok(pgdat))
2809 wake_up(&pgdat->pfmemalloc_wait); 3016 wake_up(&pgdat->pfmemalloc_wait);
2810 3017
2811 if (pgdat_balanced(pgdat, order, *classzone_idx)) {
2812 pgdat_is_balanced = true;
2813 break; /* kswapd: all done */
2814 }
2815
2816 /* 3018 /*
2817 * We do this so kswapd doesn't build up large priorities for 3019 * Fragmentation may mean that the system cannot be rebalanced
2818 * example when it is freeing in parallel with allocators. It 3020 * for high-order allocations in all zones. If twice the
2819 * matches the direct reclaim path behaviour in terms of impact 3021 * allocation size has been reclaimed and the zones are still
2820 * on zone->*_priority. 3022 * not balanced then recheck the watermarks at order-0 to
3023 * prevent kswapd reclaiming excessively. Assume that a
 3024 * process requesting a high-order allocation can direct reclaim/compact.
2821 */ 3025 */
2822 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 3026 if (order && sc.nr_reclaimed >= 2UL << order)
2823 break; 3027 order = sc.order = 0;
2824 } while (--sc.priority >= 0);
2825
2826out:
2827 if (!pgdat_is_balanced) {
2828 cond_resched();
2829 3028
2830 try_to_freeze(); 3029 /* Check if kswapd should be suspending */
3030 if (try_to_freeze() || kthread_should_stop())
3031 break;
2831 3032
2832 /* 3033 /*
2833 * Fragmentation may mean that the system cannot be 3034 * Compact if necessary and kswapd is reclaiming at least the
2834 * rebalanced for high-order allocations in all zones. 3035 * high watermark number of pages as requsted
2835 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
2836 * it means the zones have been fully scanned and are still
2837 * not balanced. For high-order allocations, there is
2838 * little point trying all over again as kswapd may
2839 * infinite loop.
2840 *
2841 * Instead, recheck all watermarks at order-0 as they
2842 * are the most important. If watermarks are ok, kswapd will go
2843 * back to sleep. High-order users can still perform direct
2844 * reclaim if they wish.
2845 */ 3036 */
2846 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) 3037 if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
2847 order = sc.order = 0;
2848
2849 goto loop_again;
2850 }
2851
2852 /*
2853 * If kswapd was reclaiming at a higher order, it has the option of
2854 * sleeping without all zones being balanced. Before it does, it must
2855 * ensure that the watermarks for order-0 on *all* zones are met and
2856 * that the congestion flags are cleared. The congestion flag must
2857 * be cleared as kswapd is the only mechanism that clears the flag
2858 * and it is potentially going to sleep here.
2859 */
2860 if (order) {
2861 int zones_need_compaction = 1;
2862
2863 for (i = 0; i <= end_zone; i++) {
2864 struct zone *zone = pgdat->node_zones + i;
2865
2866 if (!populated_zone(zone))
2867 continue;
2868
2869 /* Check if the memory needs to be defragmented. */
2870 if (zone_watermark_ok(zone, order,
2871 low_wmark_pages(zone), *classzone_idx, 0))
2872 zones_need_compaction = 0;
2873 }
2874
2875 if (zones_need_compaction)
2876 compact_pgdat(pgdat, order); 3038 compact_pgdat(pgdat, order);
2877 }
2878 3039
3040 /*
3041 * Raise priority if scanning rate is too low or there was no
3042 * progress in reclaiming pages
3043 */
3044 if (raise_priority || !sc.nr_reclaimed)
3045 sc.priority--;
3046 } while (sc.priority >= 1 &&
3047 !pgdat_balanced(pgdat, order, *classzone_idx));
3048
3049out:
2879 /* 3050 /*
2880 * Return the order we were reclaiming at so prepare_kswapd_sleep() 3051 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2881 * makes a decision on the order we were last reclaiming at. However, 3052 * makes a decision on the order we were last reclaiming at. However,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f42745e65780..20c2ef4458fa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1182,7 +1182,7 @@ static void vmstat_update(struct work_struct *w)
1182 round_jiffies_relative(sysctl_stat_interval)); 1182 round_jiffies_relative(sysctl_stat_interval));
1183} 1183}
1184 1184
1185static void __cpuinit start_cpu_timer(int cpu) 1185static void start_cpu_timer(int cpu)
1186{ 1186{
1187 struct delayed_work *work = &per_cpu(vmstat_work, cpu); 1187 struct delayed_work *work = &per_cpu(vmstat_work, cpu);
1188 1188
@@ -1194,7 +1194,7 @@ static void __cpuinit start_cpu_timer(int cpu)
1194 * Use the cpu notifier to insure that the thresholds are recalculated 1194 * Use the cpu notifier to insure that the thresholds are recalculated
1195 * when necessary. 1195 * when necessary.
1196 */ 1196 */
1197static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, 1197static int vmstat_cpuup_callback(struct notifier_block *nfb,
1198 unsigned long action, 1198 unsigned long action,
1199 void *hcpu) 1199 void *hcpu)
1200{ 1200{
@@ -1226,7 +1226,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1226 return NOTIFY_OK; 1226 return NOTIFY_OK;
1227} 1227}
1228 1228
1229static struct notifier_block __cpuinitdata vmstat_notifier = 1229static struct notifier_block vmstat_notifier =
1230 { &vmstat_cpuup_callback, NULL, 0 }; 1230 { &vmstat_cpuup_callback, NULL, 0 };
1231#endif 1231#endif
1232 1232
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 000000000000..ad1e781284fd
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,527 @@
1/*
2 * zbud.c
3 *
4 * Copyright (C) 2013, Seth Jennings, IBM
5 *
6 * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
7 *
 8 * zbud is a special purpose allocator for storing compressed pages. Contrary
9 * to what its name may suggest, zbud is not a buddy allocator, but rather an
10 * allocator that "buddies" two compressed pages together in a single memory
11 * page.
12 *
13 * While this design limits storage density, it has simple and deterministic
14 * reclaim properties that make it preferable to a higher density approach when
15 * reclaim will be used.
16 *
17 * zbud works by storing compressed pages, or "zpages", together in pairs in a
18 * single memory page called a "zbud page". The first buddy is "left
 19 * justified" at the beginning of the zbud page, and the last buddy is "right
20 * justified" at the end of the zbud page. The benefit is that if either
21 * buddy is freed, the freed buddy space, coalesced with whatever slack space
22 * that existed between the buddies, results in the largest possible free region
23 * within the zbud page.
24 *
25 * zbud also provides an attractive lower bound on density. The ratio of zpages
26 * to zbud pages can not be less than 1. This ensures that zbud can never "do
27 * harm" by using more pages to store zpages than the uncompressed zpages would
28 * have used on their own.
29 *
30 * zbud pages are divided into "chunks". The size of the chunks is fixed at
31 * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
32 * into chunks allows organizing unbuddied zbud pages into a manageable number
33 * of unbuddied lists according to the number of free chunks available in the
34 * zbud page.
35 *
36 * The zbud API differs from that of conventional allocators in that the
37 * allocation function, zbud_alloc(), returns an opaque handle to the user,
38 * not a dereferenceable pointer. The user must map the handle using
39 * zbud_map() in order to get a usable pointer by which to access the
40 * allocation data and unmap the handle with zbud_unmap() when operations
41 * on the allocation data are complete.
42 */
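
The call pattern described above looks roughly as follows. This is only a sketch against the declarations in <linux/zbud.h> as used elsewhere in this file; error handling is abbreviated, and a real user would normally pass a struct zbud_ops with an evict callback instead of NULL so that zbud_reclaim_page() can be used:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/string.h>
#include <linux/zbud.h>

static int zbud_usage_sketch(const void *data, int len)
{
	struct zbud_pool *pool;
	unsigned long handle;
	void *addr;
	int ret;

	pool = zbud_create_pool(GFP_KERNEL, NULL);	/* NULL ops: reclaim unused */
	if (!pool)
		return -ENOMEM;

	ret = zbud_alloc(pool, len, GFP_KERNEL, &handle);
	if (ret)
		goto out;

	addr = zbud_map(pool, handle);		/* handle is opaque until mapped */
	memcpy(addr, data, len);
	zbud_unmap(pool, handle);		/* unmap once the copy is done */

	zbud_free(pool, handle);
out:
	zbud_destroy_pool(pool);
	return ret;
}
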
43
44#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
45
46#include <linux/atomic.h>
47#include <linux/list.h>
48#include <linux/mm.h>
49#include <linux/module.h>
50#include <linux/preempt.h>
51#include <linux/slab.h>
52#include <linux/spinlock.h>
53#include <linux/zbud.h>
54
55/*****************
56 * Structures
57*****************/
58/*
59 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
60 * adjusting internal fragmentation. It also determines the number of
61 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
62 * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
63 * will be 64 freelists per pool.
64 */
65#define NCHUNKS_ORDER 6
66
67#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
68#define CHUNK_SIZE (1 << CHUNK_SHIFT)
69#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
70#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
71
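
Assuming a 4KiB PAGE_SIZE (architecture dependent), the defaults above give 64-byte chunks and 64 chunks per page, with the first chunk reserved for the zbud header; a quick stand-alone check of that arithmetic:

#include <stdio.h>

int main(void)
{
	const unsigned long page_shift = 12;			/* assumed 4KiB pages */
	const unsigned long page_size = 1UL << page_shift;
	const unsigned long nchunks_order = 6;			/* NCHUNKS_ORDER default */

	unsigned long chunk_shift = page_shift - nchunks_order;
	unsigned long chunk_size = 1UL << chunk_shift;		/* 64 bytes */
	unsigned long nchunks = page_size >> chunk_shift;	/* 64 chunks, 1 for header */

	printf("chunk size %lu bytes, %lu chunks per page\n", chunk_size, nchunks);
	return 0;
}
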
72/**
73 * struct zbud_pool - stores metadata for each zbud pool
74 * @lock: protects all pool fields and first|last_chunk fields of any
75 * zbud page in the pool
76 * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
77 * the lists each zbud page is added to depends on the size of
78 * its free region.
79 * @buddied: list tracking the zbud pages that contain two buddies;
80 * these zbud pages are full
81 * @lru: list tracking the zbud pages in LRU order by most recently
82 * added buddy.
83 * @pages_nr: number of zbud pages in the pool.
84 * @ops: pointer to a structure of user defined operations specified at
85 * pool creation time.
86 *
87 * This structure is allocated at pool creation time and maintains metadata
88 * pertaining to a particular zbud pool.
89 */
90struct zbud_pool {
91 spinlock_t lock;
92 struct list_head unbuddied[NCHUNKS];
93 struct list_head buddied;
94 struct list_head lru;
95 u64 pages_nr;
96 struct zbud_ops *ops;
97};
98
99/*
100 * struct zbud_header - zbud page metadata occupying the first chunk of each
101 * zbud page.
102 * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
103 * @lru: links the zbud page into the lru list in the pool
104 * @first_chunks: the size of the first buddy in chunks, 0 if free
105 * @last_chunks: the size of the last buddy in chunks, 0 if free
106 */
107struct zbud_header {
108 struct list_head buddy;
109 struct list_head lru;
110 unsigned int first_chunks;
111 unsigned int last_chunks;
112 bool under_reclaim;
113};
114
115/*****************
116 * Helpers
117*****************/
118/* Just to make the code easier to read */
119enum buddy {
120 FIRST,
121 LAST
122};
123
124/* Converts an allocation size in bytes to size in zbud chunks */
125static int size_to_chunks(int size)
126{
127 return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
128}
129
130#define for_each_unbuddied_list(_iter, _begin) \
131 for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
132
133/* Initializes the zbud header of a newly allocated zbud page */
134static struct zbud_header *init_zbud_page(struct page *page)
135{
136 struct zbud_header *zhdr = page_address(page);
137 zhdr->first_chunks = 0;
138 zhdr->last_chunks = 0;
139 INIT_LIST_HEAD(&zhdr->buddy);
140 INIT_LIST_HEAD(&zhdr->lru);
141 zhdr->under_reclaim = 0;
142 return zhdr;
143}
144
145/* Resets the struct page fields and frees the page */
146static void free_zbud_page(struct zbud_header *zhdr)
147{
148 __free_page(virt_to_page(zhdr));
149}
150
151/*
152 * Encodes the handle of a particular buddy within a zbud page
153 * Pool lock should be held as this function accesses first|last_chunks
154 */
155static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
156{
157 unsigned long handle;
158
159 /*
160 * For now, the encoded handle is actually just the pointer to the data
161 * but this might not always be the case. A little information hiding.
162 * Add CHUNK_SIZE to the handle if it is the first allocation to jump
163 * over the zbud header in the first chunk.
164 */
165 handle = (unsigned long)zhdr;
166 if (bud == FIRST)
167 /* skip over zbud header */
168 handle += ZHDR_SIZE_ALIGNED;
169 else /* bud == LAST */
170 handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
171 return handle;
172}
173
174/* Returns the zbud page where a given handle is stored */
175static struct zbud_header *handle_to_zbud_header(unsigned long handle)
176{
177 return (struct zbud_header *)(handle & PAGE_MASK);
178}
179
180/* Returns the number of free chunks in a zbud page */
181static int num_free_chunks(struct zbud_header *zhdr)
182{
183 /*
184 * Rather than branch for different situations, just use the fact that
185 * free buddies have a length of zero to simplify everything. -1 at the
186 * end for the zbud header.
187 */
188 return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
189}
190
191/*****************
192 * API Functions
193*****************/
194/**
195 * zbud_create_pool() - create a new zbud pool
196 * @gfp: gfp flags when allocating the zbud pool structure
197 * @ops: user-defined operations for the zbud pool
198 *
199 * Return: pointer to the new zbud pool or NULL if the metadata allocation
200 * failed.
201 */
202struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
203{
204 struct zbud_pool *pool;
205 int i;
206
207 pool = kmalloc(sizeof(struct zbud_pool), gfp);
208 if (!pool)
209 return NULL;
210 spin_lock_init(&pool->lock);
211 for_each_unbuddied_list(i, 0)
212 INIT_LIST_HEAD(&pool->unbuddied[i]);
213 INIT_LIST_HEAD(&pool->buddied);
214 INIT_LIST_HEAD(&pool->lru);
215 pool->pages_nr = 0;
216 pool->ops = ops;
217 return pool;
218}
219
220/**
221 * zbud_destroy_pool() - destroys an existing zbud pool
222 * @pool: the zbud pool to be destroyed
223 *
224 * The pool should be emptied before this function is called.
225 */
226void zbud_destroy_pool(struct zbud_pool *pool)
227{
228 kfree(pool);
229}
230
231/**
232 * zbud_alloc() - allocates a region of a given size
233 * @pool: zbud pool from which to allocate
234 * @size: size in bytes of the desired allocation
235 * @gfp: gfp flags used if the pool needs to grow
236 * @handle: handle of the new allocation
237 *
238 * This function will attempt to find a free region in the pool large enough to
239 * satisfy the allocation request. A search of the unbuddied lists is
240 * performed first. If no suitable free region is found, then a new page is
241 * allocated and added to the pool to satisfy the request.
242 *
243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
244 * as zbud pool pages.
245 *
 246 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page.
249 */
250int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
251 unsigned long *handle)
252{
253 int chunks, i, freechunks;
254 struct zbud_header *zhdr = NULL;
255 enum buddy bud;
256 struct page *page;
257
258 if (size <= 0 || gfp & __GFP_HIGHMEM)
259 return -EINVAL;
260 if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
261 return -ENOSPC;
262 chunks = size_to_chunks(size);
263 spin_lock(&pool->lock);
264
265 /* First, try to find an unbuddied zbud page. */
266 zhdr = NULL;
267 for_each_unbuddied_list(i, chunks) {
268 if (!list_empty(&pool->unbuddied[i])) {
269 zhdr = list_first_entry(&pool->unbuddied[i],
270 struct zbud_header, buddy);
271 list_del(&zhdr->buddy);
272 if (zhdr->first_chunks == 0)
273 bud = FIRST;
274 else
275 bud = LAST;
276 goto found;
277 }
278 }
279
280 /* Couldn't find unbuddied zbud page, create new one */
281 spin_unlock(&pool->lock);
282 page = alloc_page(gfp);
283 if (!page)
284 return -ENOMEM;
285 spin_lock(&pool->lock);
286 pool->pages_nr++;
287 zhdr = init_zbud_page(page);
288 bud = FIRST;
289
290found:
291 if (bud == FIRST)
292 zhdr->first_chunks = chunks;
293 else
294 zhdr->last_chunks = chunks;
295
296 if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
297 /* Add to unbuddied list */
298 freechunks = num_free_chunks(zhdr);
299 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
300 } else {
301 /* Add to buddied list */
302 list_add(&zhdr->buddy, &pool->buddied);
303 }
304
305 /* Add/move zbud page to beginning of LRU */
306 if (!list_empty(&zhdr->lru))
307 list_del(&zhdr->lru);
308 list_add(&zhdr->lru, &pool->lru);
309
310 *handle = encode_handle(zhdr, bud);
311 spin_unlock(&pool->lock);
312
313 return 0;
314}
315
316/**
317 * zbud_free() - frees the allocation associated with the given handle
318 * @pool: pool in which the allocation resided
319 * @handle: handle associated with the allocation returned by zbud_alloc()
320 *
321 * In the case that the zbud page in which the allocation resides is under
 322 * reclaim, as indicated by the under_reclaim flag being set, this function
323 * only sets the first|last_chunks to 0. The page is actually freed
324 * once both buddies are evicted (see zbud_reclaim_page() below).
325 */
326void zbud_free(struct zbud_pool *pool, unsigned long handle)
327{
328 struct zbud_header *zhdr;
329 int freechunks;
330
331 spin_lock(&pool->lock);
332 zhdr = handle_to_zbud_header(handle);
333
334 /* If first buddy, handle will be page aligned */
335 if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
336 zhdr->last_chunks = 0;
337 else
338 zhdr->first_chunks = 0;
339
340 if (zhdr->under_reclaim) {
341 /* zbud page is under reclaim, reclaim will free */
342 spin_unlock(&pool->lock);
343 return;
344 }
345
346 /* Remove from existing buddy list */
347 list_del(&zhdr->buddy);
348
349 if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
350 /* zbud page is empty, free */
351 list_del(&zhdr->lru);
352 free_zbud_page(zhdr);
353 pool->pages_nr--;
354 } else {
355 /* Add to unbuddied list */
356 freechunks = num_free_chunks(zhdr);
357 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
358 }
359
360 spin_unlock(&pool->lock);
361}
362
363#define list_tail_entry(ptr, type, member) \
364 list_entry((ptr)->prev, type, member)
365
366/**
367 * zbud_reclaim_page() - evicts allocations from a pool page and frees it
368 * @pool: pool from which a page will attempt to be evicted
 369 * @retries: number of pages on the LRU list for which eviction will
370 * be attempted before failing
371 *
372 * zbud reclaim is different from normal system reclaim in that the reclaim is
373 * done from the bottom, up. This is because only the bottom layer, zbud, has
374 * information on how the allocations are organized within each zbud page. This
375 * has the potential to create interesting locking situations between zbud and
376 * the user, however.
377 *
378 * To avoid these, this is how zbud_reclaim_page() should be called:
379
380 * The user detects a page should be reclaimed and calls zbud_reclaim_page().
381 * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
382 * the user-defined eviction handler with the pool and handle as arguments.
383 *
384 * If the handle can not be evicted, the eviction handler should return
385 * non-zero. zbud_reclaim_page() will add the zbud page back to the
386 * appropriate list and try the next zbud page on the LRU up to
387 * a user defined number of retries.
388 *
389 * If the handle is successfully evicted, the eviction handler should
390 * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
391 * contains logic to delay freeing the page if the page is under reclaim,
 392 * as indicated by the under_reclaim flag in the zbud header.
393 *
394 * If all buddies in the zbud page are successfully evicted, then the
395 * zbud page can be freed.
396 *
397 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
398 * no pages to evict or an eviction handler is not registered, -EAGAIN if
399 * the retry limit was hit.
400 */
401int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
402{
403 int i, ret, freechunks;
404 struct zbud_header *zhdr;
405 unsigned long first_handle = 0, last_handle = 0;
406
407 spin_lock(&pool->lock);
408 if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
409 retries == 0) {
410 spin_unlock(&pool->lock);
411 return -EINVAL;
412 }
413 for (i = 0; i < retries; i++) {
414 zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
415 list_del(&zhdr->lru);
416 list_del(&zhdr->buddy);
417 /* Protect zbud page against free */
418 zhdr->under_reclaim = true;
419 /*
 420 * We need to encode the handles before unlocking, since we can
421 * race with free that will set (first|last)_chunks to 0
422 */
423 first_handle = 0;
424 last_handle = 0;
425 if (zhdr->first_chunks)
426 first_handle = encode_handle(zhdr, FIRST);
427 if (zhdr->last_chunks)
428 last_handle = encode_handle(zhdr, LAST);
429 spin_unlock(&pool->lock);
430
431 /* Issue the eviction callback(s) */
432 if (first_handle) {
433 ret = pool->ops->evict(pool, first_handle);
434 if (ret)
435 goto next;
436 }
437 if (last_handle) {
438 ret = pool->ops->evict(pool, last_handle);
439 if (ret)
440 goto next;
441 }
442next:
443 spin_lock(&pool->lock);
444 zhdr->under_reclaim = false;
445 if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
446 /*
447 * Both buddies are now free, free the zbud page and
448 * return success.
449 */
450 free_zbud_page(zhdr);
451 pool->pages_nr--;
452 spin_unlock(&pool->lock);
453 return 0;
454 } else if (zhdr->first_chunks == 0 ||
455 zhdr->last_chunks == 0) {
456 /* add to unbuddied list */
457 freechunks = num_free_chunks(zhdr);
458 list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
459 } else {
460 /* add to buddied list */
461 list_add(&zhdr->buddy, &pool->buddied);
462 }
463
464 /* add to beginning of LRU */
465 list_add(&zhdr->lru, &pool->lru);
466 }
467 spin_unlock(&pool->lock);
468 return -EAGAIN;
469}
470
471/**
472 * zbud_map() - maps the allocation associated with the given handle
473 * @pool: pool in which the allocation resides
474 * @handle: handle associated with the allocation to be mapped
475 *
 476 * While trivial for zbud, the mapping functions for other allocators
477 * implementing this allocation API could have more complex information encoded
478 * in the handle and could create temporary mappings to make the data
479 * accessible to the user.
480 *
481 * Returns: a pointer to the mapped allocation
482 */
483void *zbud_map(struct zbud_pool *pool, unsigned long handle)
484{
485 return (void *)(handle);
486}
487
488/**
 489 * zbud_unmap() - unmaps the allocation associated with the given handle
490 * @pool: pool in which the allocation resides
491 * @handle: handle associated with the allocation to be unmapped
492 */
493void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
494{
495}
496
497/**
498 * zbud_get_pool_size() - gets the zbud pool size in pages
499 * @pool: pool whose size is being queried
500 *
501 * Returns: size in pages of the given pool. The pool lock need not be
502 * taken to access pages_nr.
503 */
504u64 zbud_get_pool_size(struct zbud_pool *pool)
505{
506 return pool->pages_nr;
507}
508
509static int __init init_zbud(void)
510{
511 /* Make sure the zbud header will fit in one chunk */
512 BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
513 pr_info("loaded\n");
514 return 0;
515}
516
517static void __exit exit_zbud(void)
518{
519 pr_info("unloaded\n");
520}
521
522module_init(init_zbud);
523module_exit(exit_zbud);
524
525MODULE_LICENSE("GPL");
526MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
527MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
diff --git a/mm/zswap.c b/mm/zswap.c
new file mode 100644
index 000000000000..deda2b671e12
--- /dev/null
+++ b/mm/zswap.c
@@ -0,0 +1,943 @@
1/*
2 * zswap.c - zswap driver file
3 *
4 * zswap is a backend for frontswap that takes pages that are in the process
5 * of being swapped out and attempts to compress and store them in a
6 * RAM-based memory pool. This can result in a significant I/O reduction on
7 * the swap device and, in the case where decompressing from RAM is faster
8 * than reading from the swap device, can also improve workload performance.
9 *
10 * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21*/
22
23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/highmem.h>
28#include <linux/slab.h>
29#include <linux/spinlock.h>
30#include <linux/types.h>
31#include <linux/atomic.h>
32#include <linux/frontswap.h>
33#include <linux/rbtree.h>
34#include <linux/swap.h>
35#include <linux/crypto.h>
36#include <linux/mempool.h>
37#include <linux/zbud.h>
38
39#include <linux/mm_types.h>
40#include <linux/page-flags.h>
41#include <linux/swapops.h>
42#include <linux/writeback.h>
43#include <linux/pagemap.h>
44
45/*********************************
46* statistics
47**********************************/
48/* Number of memory pages used by the compressed pool */
49static u64 zswap_pool_pages;
50/* The number of compressed pages currently stored in zswap */
51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
52
53/*
54 * The statistics below are not protected from concurrent access for
 55 * performance reasons so they may not be 100% accurate. However,
56 * they do provide useful information on roughly how many times a
57 * certain event is occurring.
58*/
59
60/* Pool limit was hit (see zswap_max_pool_percent) */
61static u64 zswap_pool_limit_hit;
62/* Pages written back when pool limit was reached */
63static u64 zswap_written_back_pages;
64/* Store failed due to a reclaim failure after pool limit was reached */
65static u64 zswap_reject_reclaim_fail;
66/* Compressed page was too big for the allocator to (optimally) store */
67static u64 zswap_reject_compress_poor;
68/* Store failed because underlying allocator could not get memory */
69static u64 zswap_reject_alloc_fail;
70/* Store failed because the entry metadata could not be allocated (rare) */
71static u64 zswap_reject_kmemcache_fail;
72/* Duplicate store was encountered (rare) */
73static u64 zswap_duplicate_entry;
74
75/*********************************
76* tunables
77**********************************/
78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
79static bool zswap_enabled __read_mostly;
80module_param_named(enabled, zswap_enabled, bool, 0);
81
82/* Compressor to be used by zswap (fixed at boot for now) */
83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
85module_param_named(compressor, zswap_compressor, charp, 0);
86
87/* The maximum percentage of memory that the compressed pool can occupy */
88static unsigned int zswap_max_pool_percent = 20;
89module_param_named(max_pool_percent,
90 zswap_max_pool_percent, uint, 0644);
91
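
The three module_param_named() definitions above are the only tunables at this point; with zswap built in they would typically be set on the kernel command line, for example as below (max_pool_percent, with its 0644 permission, can also be changed later through /sys/module/zswap/parameters/):

	zswap.enabled=1 zswap.compressor=lzo zswap.max_pool_percent=20
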
92/*********************************
93* compression functions
94**********************************/
95/* per-cpu compression transforms */
96static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
97
98enum comp_op {
99 ZSWAP_COMPOP_COMPRESS,
100 ZSWAP_COMPOP_DECOMPRESS
101};
102
103static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
104 u8 *dst, unsigned int *dlen)
105{
106 struct crypto_comp *tfm;
107 int ret;
108
109 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
110 switch (op) {
111 case ZSWAP_COMPOP_COMPRESS:
112 ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
113 break;
114 case ZSWAP_COMPOP_DECOMPRESS:
115 ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
116 break;
117 default:
118 ret = -EINVAL;
119 }
120
121 put_cpu();
122 return ret;
123}
124
125static int __init zswap_comp_init(void)
126{
127 if (!crypto_has_comp(zswap_compressor, 0, 0)) {
128 pr_info("%s compressor not available\n", zswap_compressor);
129 /* fall back to default compressor */
130 zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
131 if (!crypto_has_comp(zswap_compressor, 0, 0))
132 /* can't even load the default compressor */
133 return -ENODEV;
134 }
135 pr_info("using %s compressor\n", zswap_compressor);
136
137 /* alloc percpu transforms */
138 zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
139 if (!zswap_comp_pcpu_tfms)
140 return -ENOMEM;
141 return 0;
142}
143
144static void zswap_comp_exit(void)
145{
146 /* free percpu transforms */
147 if (zswap_comp_pcpu_tfms)
148 free_percpu(zswap_comp_pcpu_tfms);
149}
150
151/*********************************
152* data structures
153**********************************/
154/*
155 * struct zswap_entry
156 *
157 * This structure contains the metadata for tracking a single compressed
158 * page within zswap.
159 *
160 * rbnode - links the entry into red-black tree for the appropriate swap type
 161 * refcount - the number of outstanding references to the entry. This is needed
 162 * to protect against premature freeing of the entry by
 163 * concurrent calls to load, invalidate, and writeback. The lock
164 * for the zswap_tree structure that contains the entry must
165 * be held while changing the refcount. Since the lock must
166 * be held, there is no reason to also make refcount atomic.
167 * offset - the swap offset for the entry. Index into the red-black tree.
 168 * handle - zbud allocation handle that stores the compressed page data
169 * length - the length in bytes of the compressed page data. Needed during
170 * decompression
171 */
172struct zswap_entry {
173 struct rb_node rbnode;
174 pgoff_t offset;
175 int refcount;
176 unsigned int length;
177 unsigned long handle;
178};
179
180struct zswap_header {
181 swp_entry_t swpentry;
182};
183
184/*
185 * The tree lock in the zswap_tree struct protects a few things:
186 * - the rbtree
187 * - the refcount field of each entry in the tree
188 */
189struct zswap_tree {
190 struct rb_root rbroot;
191 spinlock_t lock;
192 struct zbud_pool *pool;
193};
194
195static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
196
197/*********************************
198* zswap entry functions
199**********************************/
200static struct kmem_cache *zswap_entry_cache;
201
202static int zswap_entry_cache_create(void)
203{
204 zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
205 return (zswap_entry_cache == NULL);
206}
207
208static void zswap_entry_cache_destory(void)
209{
210 kmem_cache_destroy(zswap_entry_cache);
211}
212
213static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
214{
215 struct zswap_entry *entry;
216 entry = kmem_cache_alloc(zswap_entry_cache, gfp);
217 if (!entry)
218 return NULL;
219 entry->refcount = 1;
220 return entry;
221}
222
223static void zswap_entry_cache_free(struct zswap_entry *entry)
224{
225 kmem_cache_free(zswap_entry_cache, entry);
226}
227
228/* caller must hold the tree lock */
229static void zswap_entry_get(struct zswap_entry *entry)
230{
231 entry->refcount++;
232}
233
234/* caller must hold the tree lock */
235static int zswap_entry_put(struct zswap_entry *entry)
236{
237 entry->refcount--;
238 return entry->refcount;
239}
240
241/*********************************
242* rbtree functions
243**********************************/
244static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
245{
246 struct rb_node *node = root->rb_node;
247 struct zswap_entry *entry;
248
249 while (node) {
250 entry = rb_entry(node, struct zswap_entry, rbnode);
251 if (entry->offset > offset)
252 node = node->rb_left;
253 else if (entry->offset < offset)
254 node = node->rb_right;
255 else
256 return entry;
257 }
258 return NULL;
259}
260
261/*
 262 * In the case that an entry with the same offset is found, a pointer to
263 * the existing entry is stored in dupentry and the function returns -EEXIST
264 */
265static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
266 struct zswap_entry **dupentry)
267{
268 struct rb_node **link = &root->rb_node, *parent = NULL;
269 struct zswap_entry *myentry;
270
271 while (*link) {
272 parent = *link;
273 myentry = rb_entry(parent, struct zswap_entry, rbnode);
274 if (myentry->offset > entry->offset)
275 link = &(*link)->rb_left;
276 else if (myentry->offset < entry->offset)
277 link = &(*link)->rb_right;
278 else {
279 *dupentry = myentry;
280 return -EEXIST;
281 }
282 }
283 rb_link_node(&entry->rbnode, parent, link);
284 rb_insert_color(&entry->rbnode, root);
285 return 0;
286}
287
288/*********************************
289* per-cpu code
290**********************************/
291static DEFINE_PER_CPU(u8 *, zswap_dstmem);
292
293static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
294{
295 struct crypto_comp *tfm;
296 u8 *dst;
297
298 switch (action) {
299 case CPU_UP_PREPARE:
300 tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
301 if (IS_ERR(tfm)) {
302 pr_err("can't allocate compressor transform\n");
303 return NOTIFY_BAD;
304 }
305 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
306 dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL);
307 if (!dst) {
308 pr_err("can't allocate compressor buffer\n");
309 crypto_free_comp(tfm);
310 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
311 return NOTIFY_BAD;
312 }
313 per_cpu(zswap_dstmem, cpu) = dst;
314 break;
315 case CPU_DEAD:
316 case CPU_UP_CANCELED:
317 tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
318 if (tfm) {
319 crypto_free_comp(tfm);
320 *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
321 }
322 dst = per_cpu(zswap_dstmem, cpu);
323 kfree(dst);
324 per_cpu(zswap_dstmem, cpu) = NULL;
325 break;
326 default:
327 break;
328 }
329 return NOTIFY_OK;
330}
331
332static int zswap_cpu_notifier(struct notifier_block *nb,
333 unsigned long action, void *pcpu)
334{
335 unsigned long cpu = (unsigned long)pcpu;
336 return __zswap_cpu_notifier(action, cpu);
337}
338
339static struct notifier_block zswap_cpu_notifier_block = {
340 .notifier_call = zswap_cpu_notifier
341};
342
343static int zswap_cpu_init(void)
344{
345 unsigned long cpu;
346
347 get_online_cpus();
348 for_each_online_cpu(cpu)
349 if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
350 goto cleanup;
351 register_cpu_notifier(&zswap_cpu_notifier_block);
352 put_online_cpus();
353 return 0;
354
355cleanup:
356 for_each_online_cpu(cpu)
357 __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
358 put_online_cpus();
359 return -ENOMEM;
360}
361
362/*********************************
363* helpers
364**********************************/
365static bool zswap_is_full(void)
366{
367 return (totalram_pages * zswap_max_pool_percent / 100 <
368 zswap_pool_pages);
369}
370
371/*
 372 * Carries out the common pattern of freeing an entry's zbud allocation,
373 * freeing the entry itself, and decrementing the number of stored pages.
374 */
375static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
376{
377 zbud_free(tree->pool, entry->handle);
378 zswap_entry_cache_free(entry);
379 atomic_dec(&zswap_stored_pages);
380 zswap_pool_pages = zbud_get_pool_size(tree->pool);
381}
382
383/*********************************
384* writeback code
385**********************************/
386/* return enum for zswap_get_swap_cache_page */
387enum zswap_get_swap_ret {
388 ZSWAP_SWAPCACHE_NEW,
389 ZSWAP_SWAPCACHE_EXIST,
390 ZSWAP_SWAPCACHE_NOMEM
391};
392
393/*
394 * zswap_get_swap_cache_page
395 *
 396 * This is an adaptation of read_swap_cache_async()
397 *
398 * This function tries to find a page with the given swap entry
399 * in the swapper_space address space (the swap cache). If the page
400 * is found, it is returned in retpage. Otherwise, a page is allocated,
401 * added to the swap cache, and returned in retpage.
402 *
 403 * On success, the swap cache page is returned in retpage.
 404 * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache (page not locked)
 405 * Returns ZSWAP_SWAPCACHE_NEW if a new page was allocated and must be populated (page locked)
 406 * Returns ZSWAP_SWAPCACHE_NOMEM on allocation failure
407 */
408static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage)
410{
411 struct page *found_page, *new_page = NULL;
412 struct address_space *swapper_space = &swapper_spaces[swp_type(entry)];
413 int err;
414
415 *retpage = NULL;
416 do {
417 /*
418 * First check the swap cache. Since this is normally
419 * called after lookup_swap_cache() failed, re-calling
420 * that would confuse statistics.
421 */
422 found_page = find_get_page(swapper_space, entry.val);
423 if (found_page)
424 break;
425
426 /*
427 * Get a new page to read into from swap.
428 */
429 if (!new_page) {
430 new_page = alloc_page(GFP_KERNEL);
431 if (!new_page)
432 break; /* Out of memory */
433 }
434
435 /*
436 * call radix_tree_preload() while we can wait.
437 */
438 err = radix_tree_preload(GFP_KERNEL);
439 if (err)
440 break;
441
442 /*
443 * Swap entry may have been freed since our caller observed it.
444 */
445 err = swapcache_prepare(entry);
446 if (err == -EEXIST) { /* seems racy */
447 radix_tree_preload_end();
448 continue;
449 }
450 if (err) { /* swp entry is obsolete ? */
451 radix_tree_preload_end();
452 break;
453 }
454
455 /* May fail (-ENOMEM) if radix-tree node allocation failed. */
456 __set_page_locked(new_page);
457 SetPageSwapBacked(new_page);
458 err = __add_to_swap_cache(new_page, entry);
459 if (likely(!err)) {
460 radix_tree_preload_end();
461 lru_cache_add_anon(new_page);
462 *retpage = new_page;
463 return ZSWAP_SWAPCACHE_NEW;
464 }
465 radix_tree_preload_end();
466 ClearPageSwapBacked(new_page);
467 __clear_page_locked(new_page);
468 /*
469 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
470 * clear SWAP_HAS_CACHE flag.
471 */
472 swapcache_free(entry, NULL);
473 } while (err != -ENOMEM);
474
475 if (new_page)
476 page_cache_release(new_page);
477 if (!found_page)
478 return ZSWAP_SWAPCACHE_NOMEM;
479 *retpage = found_page;
480 return ZSWAP_SWAPCACHE_EXIST;
481}
482
483/*
484 * Attempts to free an entry by adding a page to the swap cache,
485 * decompressing the entry data into the page, and issuing a
486 * bio write to write the page back to the swap device.
487 *
488 * This can be thought of as a "resumed writeback" of the page
489 * to the swap device. We are basically resuming the same swap
490 * writeback path that was intercepted with the frontswap_store()
491 * in the first place. After the page has been decompressed into
492 * the swap cache, the compressed version stored by zswap can be
493 * freed.
494 */
495static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
496{
497 struct zswap_header *zhdr;
498 swp_entry_t swpentry;
499 struct zswap_tree *tree;
500 pgoff_t offset;
501 struct zswap_entry *entry;
502 struct page *page;
503 u8 *src, *dst;
504 unsigned int dlen;
505 int ret, refcount;
506 struct writeback_control wbc = {
507 .sync_mode = WB_SYNC_NONE,
508 };
509
510 /* extract swpentry from data */
511 zhdr = zbud_map(pool, handle);
512 swpentry = zhdr->swpentry; /* here */
513 zbud_unmap(pool, handle);
514 tree = zswap_trees[swp_type(swpentry)];
515 offset = swp_offset(swpentry);
516 BUG_ON(pool != tree->pool);
517
518 /* find and ref zswap entry */
519 spin_lock(&tree->lock);
520 entry = zswap_rb_search(&tree->rbroot, offset);
521 if (!entry) {
522 /* entry was invalidated */
523 spin_unlock(&tree->lock);
524 return 0;
525 }
526 zswap_entry_get(entry);
527 spin_unlock(&tree->lock);
528 BUG_ON(offset != entry->offset);
529
530 /* try to allocate swap cache page */
531 switch (zswap_get_swap_cache_page(swpentry, &page)) {
532 case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
533 ret = -ENOMEM;
534 goto fail;
535
536 case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
537 /* page is already in the swap cache, ignore for now */
538 page_cache_release(page);
539 ret = -EEXIST;
540 goto fail;
541
542 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
543 /* decompress */
544 dlen = PAGE_SIZE;
545 src = (u8 *)zbud_map(tree->pool, entry->handle) +
546 sizeof(struct zswap_header);
547 dst = kmap_atomic(page);
548 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
549 entry->length, dst, &dlen);
550 kunmap_atomic(dst);
551 zbud_unmap(tree->pool, entry->handle);
552 BUG_ON(ret);
553 BUG_ON(dlen != PAGE_SIZE);
554
555 /* page is up to date */
556 SetPageUptodate(page);
557 }
558
559 /* start writeback */
560 __swap_writepage(page, &wbc, end_swap_bio_write);
561 page_cache_release(page);
562 zswap_written_back_pages++;
563
564 spin_lock(&tree->lock);
565
566 /* drop local reference */
567 zswap_entry_put(entry);
568 /* drop the initial reference from entry creation */
569 refcount = zswap_entry_put(entry);
570
571 /*
572 * There are three possible values for refcount here:
573 * (1) refcount is 1, load is in progress, unlink from rbtree,
574 * load will free
575 * (2) refcount is 0, (normal case) entry is valid,
576 * remove from rbtree and free entry
577 * (3) refcount is -1, invalidate happened during writeback,
578 * free entry
579 */
580 if (refcount >= 0) {
581 /* no invalidate yet, remove from rbtree */
582 rb_erase(&entry->rbnode, &tree->rbroot);
583 }
584 spin_unlock(&tree->lock);
585 if (refcount <= 0) {
586 /* free the entry */
587 zswap_free_entry(tree, entry);
588 return 0;
589 }
590 return -EAGAIN;
591
592fail:
593 spin_lock(&tree->lock);
594 zswap_entry_put(entry);
595 spin_unlock(&tree->lock);
596 return ret;
597}
598
599/*********************************
600* frontswap hooks
601**********************************/
 602/* attempts to compress and store a single page */
603static int zswap_frontswap_store(unsigned type, pgoff_t offset,
604 struct page *page)
605{
606 struct zswap_tree *tree = zswap_trees[type];
607 struct zswap_entry *entry, *dupentry;
608 int ret;
609 unsigned int dlen = PAGE_SIZE, len;
610 unsigned long handle;
611 char *buf;
612 u8 *src, *dst;
613 struct zswap_header *zhdr;
614
615 if (!tree) {
616 ret = -ENODEV;
617 goto reject;
618 }
619
620 /* reclaim space if needed */
621 if (zswap_is_full()) {
622 zswap_pool_limit_hit++;
623		if (zbud_reclaim_page(tree->pool, 8)) { /* try up to 8 LRU entries */
624 zswap_reject_reclaim_fail++;
625 ret = -ENOMEM;
626 goto reject;
627 }
628 }
629
630 /* allocate entry */
631 entry = zswap_entry_cache_alloc(GFP_KERNEL);
632 if (!entry) {
633 zswap_reject_kmemcache_fail++;
634 ret = -ENOMEM;
635 goto reject;
636 }
637
638 /* compress */
639 dst = get_cpu_var(zswap_dstmem);
640 src = kmap_atomic(page);
641 ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
642 kunmap_atomic(src);
643 if (ret) {
644 ret = -EINVAL;
645 goto freepage;
646 }
647
648 /* store */
649 len = dlen + sizeof(struct zswap_header);
650 ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN,
651 &handle);
652 if (ret == -ENOSPC) {
653 zswap_reject_compress_poor++;
654 goto freepage;
655 }
656 if (ret) {
657 zswap_reject_alloc_fail++;
658 goto freepage;
659 }
660 zhdr = zbud_map(tree->pool, handle);
661 zhdr->swpentry = swp_entry(type, offset);
662 buf = (u8 *)(zhdr + 1);
663 memcpy(buf, dst, dlen);
664 zbud_unmap(tree->pool, handle);
665 put_cpu_var(zswap_dstmem);
666
667 /* populate entry */
668 entry->offset = offset;
669 entry->handle = handle;
670 entry->length = dlen;
671
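	/*
	 * An entry for this offset may already be in the tree if the page
	 * was loaded back through the swap cache and is now being swapped
	 * out again to the same slot before that slot was freed; the old
	 * compressed copy is stale, so the loop below drops it before
	 * inserting the new entry.
	 */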
672 /* map */
673 spin_lock(&tree->lock);
674 do {
675 ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
676 if (ret == -EEXIST) {
677 zswap_duplicate_entry++;
678 /* remove from rbtree */
679 rb_erase(&dupentry->rbnode, &tree->rbroot);
680 if (!zswap_entry_put(dupentry)) {
681 /* free */
682 zswap_free_entry(tree, dupentry);
683 }
684 }
685 } while (ret == -EEXIST);
686 spin_unlock(&tree->lock);
687
688 /* update stats */
689 atomic_inc(&zswap_stored_pages);
690 zswap_pool_pages = zbud_get_pool_size(tree->pool);
691
692 return 0;
693
694freepage:
695 put_cpu_var(zswap_dstmem);
696 zswap_entry_cache_free(entry);
697reject:
698 return ret;
699}
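/*
 * Layout of one stored object inside the zbud buffer, as assembled above
 * (struct zswap_header is defined earlier in this file; it is assumed
 * here to carry just the swp_entry_t needed for later writeback):
 *
 *	handle --> +--------------------------+
 *	           | struct zswap_header      |  sizeof(struct zswap_header)
 *	           |   .swpentry              |
 *	           +--------------------------+
 *	           | compressed page data     |  entry->length == dlen
 *	           +--------------------------+
 *
 * hence the allocation size len = dlen + sizeof(struct zswap_header),
 * and readers skip the header with
 *
 *	src = (u8 *)zbud_map(pool, handle) + sizeof(struct zswap_header);
 */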
700
701/*
702 * returns 0 if the page was successfully decompressed
703 * returns -1 if the entry was not found or on error
704 */
705static int zswap_frontswap_load(unsigned type, pgoff_t offset,
706 struct page *page)
707{
708 struct zswap_tree *tree = zswap_trees[type];
709 struct zswap_entry *entry;
710 u8 *src, *dst;
711 unsigned int dlen;
712 int refcount, ret;
713
714 /* find */
715 spin_lock(&tree->lock);
716 entry = zswap_rb_search(&tree->rbroot, offset);
717 if (!entry) {
718 /* entry was written back */
719 spin_unlock(&tree->lock);
720 return -1;
721 }
722 zswap_entry_get(entry);
723 spin_unlock(&tree->lock);
724
725 /* decompress */
726 dlen = PAGE_SIZE;
727 src = (u8 *)zbud_map(tree->pool, entry->handle) +
728 sizeof(struct zswap_header);
729 dst = kmap_atomic(page);
730 ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
731 dst, &dlen);
732 kunmap_atomic(dst);
733 zbud_unmap(tree->pool, entry->handle);
734 BUG_ON(ret);
735
736 spin_lock(&tree->lock);
737 refcount = zswap_entry_put(entry);
738 if (likely(refcount)) {
739 spin_unlock(&tree->lock);
740 return 0;
741 }
742 spin_unlock(&tree->lock);
743
744 /*
745 * We don't have to unlink from the rbtree because
746	 * zswap_writeback_entry() or zswap_frontswap_invalidate_page()
747 * has already done this for us if we are the last reference.
748 */
749	/* no references remain; free the entry */
750
751 zswap_free_entry(tree, entry);
752
753 return 0;
754}
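/*
 * Caller context, roughly (a sketch of the swap read path, not verbatim
 * mm/page_io.c): swap_readpage() offers the page to frontswap before it
 * issues any block I/O, and frontswap dispatches to the ->load hook
 * above:
 *
 *	if (frontswap_load(page) == 0) {	ends up in zswap_frontswap_load()
 *		SetPageUptodate(page);
 *		unlock_page(page);
 *		goto out;			no bio is submitted
 *	}
 *	... fall through to reading the swap device ...
 *
 * so returning -1 here simply means "not in zswap, read it from the
 * swap device as usual".
 */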
755
756/* frees an entry in zswap */
757static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
758{
759 struct zswap_tree *tree = zswap_trees[type];
760 struct zswap_entry *entry;
761 int refcount;
762
763 /* find */
764 spin_lock(&tree->lock);
765 entry = zswap_rb_search(&tree->rbroot, offset);
766 if (!entry) {
767 /* entry was written back */
768 spin_unlock(&tree->lock);
769 return;
770 }
771
772 /* remove from rbtree */
773 rb_erase(&entry->rbnode, &tree->rbroot);
774
775 /* drop the initial reference from entry creation */
776 refcount = zswap_entry_put(entry);
777
778 spin_unlock(&tree->lock);
779
780 if (refcount) {
781 /* writeback in progress, writeback will free */
782 return;
783 }
784
785 /* free */
786 zswap_free_entry(tree, entry);
787}
788
789/* frees all zswap entries for the given swap type */
790static void zswap_frontswap_invalidate_area(unsigned type)
791{
792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node;
794 struct zswap_entry *entry;
795
796 if (!tree)
797 return;
798
799 /* walk the tree and free everything */
800 spin_lock(&tree->lock);
801 /*
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages);
816 }
817 tree->rbroot = RB_ROOT;
818 spin_unlock(&tree->lock);
819}
820
821static struct zbud_ops zswap_zbud_ops = {
822 .evict = zswap_writeback_entry
823};
824
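/*
 * ->init is expected to run once per swap device, when the device is
 * activated with swapon; each swap "type" then gets its own tree and
 * lock, which is why every hook above starts by indexing
 * zswap_trees[type].
 */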
825static void zswap_frontswap_init(unsigned type)
826{
827 struct zswap_tree *tree;
828
829 tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
830 if (!tree)
831 goto err;
832 tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops);
833 if (!tree->pool)
834 goto freetree;
835 tree->rbroot = RB_ROOT;
836 spin_lock_init(&tree->lock);
837 zswap_trees[type] = tree;
838 return;
839
840freetree:
841 kfree(tree);
842err:
843 pr_err("alloc failed, zswap disabled for swap type %d\n", type);
844}
845
846static struct frontswap_ops zswap_frontswap_ops = {
847 .store = zswap_frontswap_store,
848 .load = zswap_frontswap_load,
849 .invalidate_page = zswap_frontswap_invalidate_page,
850 .invalidate_area = zswap_frontswap_invalidate_area,
851 .init = zswap_frontswap_init
852};
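/*
 * Dispatch into this table happens in mm/frontswap.c. Roughly (a sketch,
 * not verbatim frontswap code), the store side derives the (type, offset)
 * pair from the page's swap entry before calling the hook:
 *
 *	swp_entry_t entry = { .val = page_private(page) };
 *
 *	ret = frontswap_ops->store(swp_type(entry), swp_offset(entry), page);
 *	if (ret)
 *		... fall back to writing the real swap device ...
 */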
853
854/*********************************
855* debugfs functions
856**********************************/
857#ifdef CONFIG_DEBUG_FS
858#include <linux/debugfs.h>
859
860static struct dentry *zswap_debugfs_root;
861
862static int __init zswap_debugfs_init(void)
863{
864 if (!debugfs_initialized())
865 return -ENODEV;
866
867 zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
868 if (!zswap_debugfs_root)
869 return -ENOMEM;
870
871 debugfs_create_u64("pool_limit_hit", S_IRUGO,
872 zswap_debugfs_root, &zswap_pool_limit_hit);
873 debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
874 zswap_debugfs_root, &zswap_reject_reclaim_fail);
875 debugfs_create_u64("reject_alloc_fail", S_IRUGO,
876 zswap_debugfs_root, &zswap_reject_alloc_fail);
877 debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
878 zswap_debugfs_root, &zswap_reject_kmemcache_fail);
879 debugfs_create_u64("reject_compress_poor", S_IRUGO,
880 zswap_debugfs_root, &zswap_reject_compress_poor);
881 debugfs_create_u64("written_back_pages", S_IRUGO,
882 zswap_debugfs_root, &zswap_written_back_pages);
883 debugfs_create_u64("duplicate_entry", S_IRUGO,
884 zswap_debugfs_root, &zswap_duplicate_entry);
885 debugfs_create_u64("pool_pages", S_IRUGO,
886 zswap_debugfs_root, &zswap_pool_pages);
887 debugfs_create_atomic_t("stored_pages", S_IRUGO,
888 zswap_debugfs_root, &zswap_stored_pages);
889
890 return 0;
891}
892
893static void __exit zswap_debugfs_exit(void)
894{
895 debugfs_remove_recursive(zswap_debugfs_root);
896}
897#else
898static int __init zswap_debugfs_init(void)
899{
900 return 0;
901}
902
903static void __exit zswap_debugfs_exit(void) { }
904#endif
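/*
 * With CONFIG_DEBUG_FS, the counters registered above appear as plain
 * files under debugfs, conventionally mounted at /sys/kernel/debug, e.g.
 * /sys/kernel/debug/zswap/pool_pages. A minimal userspace reader,
 * assuming that mount point:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned long long pages = 0;
 *		FILE *f = fopen("/sys/kernel/debug/zswap/pool_pages", "r");
 *
 *		if (f && fscanf(f, "%llu", &pages) == 1)
 *			printf("zswap pool pages: %llu\n", pages);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 */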
905
906/*********************************
907* module init and exit
908**********************************/
909static int __init init_zswap(void)
910{
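	/*
	 * zswap_enabled is expected to be wired to a module parameter
	 * earlier in this file; if so, the cache stays disabled unless
	 * zswap.enabled=1 is set on the kernel command line.
	 */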
911 if (!zswap_enabled)
912 return 0;
913
914 pr_info("loading zswap\n");
915 if (zswap_entry_cache_create()) {
916 pr_err("entry cache creation failed\n");
917 goto error;
918 }
919 if (zswap_comp_init()) {
920 pr_err("compressor initialization failed\n");
921 goto compfail;
922 }
923 if (zswap_cpu_init()) {
924 pr_err("per-cpu initialization failed\n");
925 goto pcpufail;
926 }
927 frontswap_register_ops(&zswap_frontswap_ops);
928 if (zswap_debugfs_init())
929 pr_warn("debugfs initialization failed\n");
930 return 0;
931pcpufail:
932 zswap_comp_exit();
933compfail:
934 zswap_entry_cache_destory();
935error:
936 return -ENOMEM;
937}
938/* must be late so crypto has time to come up */
939late_initcall(init_zswap);
940
941MODULE_LICENSE("GPL");
942MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
943MODULE_DESCRIPTION("Compressed cache for swap pages");