author     Linus Torvalds <torvalds@linux-foundation.org>   2018-04-06 17:19:26 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-04-06 17:19:26 -0400
commit     3b54765cca23152ec0cc254b75c877c10f6e2870
tree       795785d2a9d7498df9452be138867bd996c4cea5
parent     3fd14cdcc05a682b03743683ce3a726898b20555
parent     97b1255cb27c551d7c3c5c496d787da40772da99

Merge branch 'akpm' (patches from Andrew)

Merge updates from Andrew Morton:

 - a few misc things

 - ocfs2 updates

 - the v9fs maintainers have been missing for a long time. I've taken
   over v9fs patch slinging.

 - most of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (116 commits)
  mm,oom_reaper: check for MMF_OOM_SKIP before complaining
  mm/ksm: fix interaction with THP
  mm/memblock.c: cast constant ULLONG_MAX to phys_addr_t
  headers: untangle kmemleak.h from mm.h
  include/linux/mmdebug.h: make VM_WARN* non-rvals
  mm/page_isolation.c: make start_isolate_page_range() fail if already isolated
  mm: change return type to vm_fault_t
  mm, oom: remove 3% bonus for CAP_SYS_ADMIN processes
  mm, page_alloc: wakeup kcompactd even if kswapd cannot free more memory
  kernel/fork.c: detect early free of a live mm
  mm: make counting of list_lru_one::nr_items lockless
  mm/swap_state.c: make bool enable_vma_readahead and swap_vma_readahead() static
  block_invalidatepage(): only release page if the full page was invalidated
  mm: kernel-doc: add missing parameter descriptions
  mm/swap.c: remove @cold parameter description for release_pages()
  mm/nommu: remove description of alloc_vm_area
  zram: drop max_zpage_size and use zs_huge_class_size()
  zsmalloc: introduce zs_huge_class_size()
  mm: fix races between swapoff and flush dcache
  fs/direct-io.c: minor cleanups in do_blockdev_direct_IO
  ...

-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt | 54
-rw-r--r--  Documentation/trace/postprocess/trace-vmscan-postprocess.pl | 4
-rw-r--r--  arch/arc/mm/cache.c | 2
-rw-r--r--  arch/arm/boot/compressed/misc.c | 9
-rw-r--r--  arch/arm/mm/copypage-v4mc.c | 2
-rw-r--r--  arch/arm/mm/copypage-v6.c | 2
-rw-r--r--  arch/arm/mm/copypage-xscale.c | 2
-rw-r--r--  arch/arm/mm/fault-armv.c | 2
-rw-r--r--  arch/arm/mm/flush.c | 6
-rw-r--r--  arch/mips/boot/compressed/decompress.c | 9
-rw-r--r--  arch/mips/mm/cache.c | 2
-rw-r--r--  arch/nios2/mm/cacheflush.c | 4
-rw-r--r--  arch/parisc/kernel/cache.c | 5
-rw-r--r--  arch/powerpc/include/asm/hugetlb.h | 6
-rw-r--r--  arch/powerpc/mm/hugetlbpage.c | 5
-rw-r--r--  arch/powerpc/mm/mmu_context_iommu.c | 2
-rw-r--r--  arch/powerpc/sysdev/dart_iommu.c | 1
-rw-r--r--  arch/powerpc/sysdev/msi_bitmap.c | 1
-rw-r--r--  arch/s390/kernel/nmi.c | 2
-rw-r--r--  arch/s390/kernel/smp.c | 1
-rw-r--r--  arch/sh/boot/compressed/misc.c | 9
-rw-r--r--  arch/sh/mm/cache-sh4.c | 2
-rw-r--r--  arch/sh/mm/cache-sh7705.c | 2
-rw-r--r--  arch/sparc/kernel/irq_64.c | 1
-rw-r--r--  arch/sparc/kernel/smp_64.c | 8
-rw-r--r--  arch/sparc/mm/init_64.c | 6
-rw-r--r--  arch/sparc/mm/tlb.c | 2
-rw-r--r--  arch/unicore32/mm/flush.c | 2
-rw-r--r--  arch/unicore32/mm/mmu.c | 2
-rw-r--r--  arch/x86/kernel/pci-dma.c | 1
-rw-r--r--  arch/x86/mm/init_64.c | 33
-rw-r--r--  arch/xtensa/mm/cache.c | 2
-rw-r--r--  drivers/base/memory.c | 40
-rw-r--r--  drivers/base/node.c | 24
-rw-r--r--  drivers/block/zram/zram_drv.c | 9
-rw-r--r--  drivers/block/zram/zram_drv.h | 16
-rw-r--r--  drivers/dax/device.c | 10
-rw-r--r--  drivers/iommu/exynos-iommu.c | 1
-rw-r--r--  drivers/iommu/mtk_iommu_v1.c | 1
-rw-r--r--  drivers/net/ethernet/ti/cpsw.c | 1
-rw-r--r--  drivers/net/wireless/realtek/rtlwifi/pci.c | 1
-rw-r--r--  drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c | 1
-rw-r--r--  drivers/staging/rtl8188eu/hal/fw.c | 2
-rw-r--r--  drivers/staging/rtlwifi/pci.c | 1
-rw-r--r--  drivers/virtio/virtio_ring.c | 1
-rw-r--r--  fs/9p/v9fs.c | 7
-rw-r--r--  fs/9p/vfs_inode.c | 26
-rw-r--r--  fs/9p/vfs_super.c | 2
-rw-r--r--  fs/block_dev.c | 6
-rw-r--r--  fs/buffer.c | 2
-rw-r--r--  fs/direct-io.c | 9
-rw-r--r--  fs/hugetlbfs/inode.c | 10
-rw-r--r--  fs/ocfs2/alloc.c | 2
-rw-r--r--  fs/ocfs2/aops.c | 4
-rw-r--r--  fs/ocfs2/aops.h | 2
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 11
-rw-r--r--  fs/ocfs2/dir.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmcommon.h | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 29
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.h | 25
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 3
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 25
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 41
-rw-r--r--  fs/ocfs2/dlmglue.c | 21
-rw-r--r--  fs/ocfs2/file.c | 16
-rw-r--r--  fs/ocfs2/filecheck.c | 358
-rw-r--r--  fs/ocfs2/filecheck.h | 29
-rw-r--r--  fs/ocfs2/inode.c | 8
-rw-r--r--  fs/ocfs2/namei.c | 6
-rw-r--r--  fs/ocfs2/ocfs2.h | 8
-rw-r--r--  fs/ocfs2/ocfs2_trace.h | 6
-rw-r--r--  fs/ocfs2/refcounttree.c | 10
-rw-r--r--  fs/ocfs2/suballoc.c | 53
-rw-r--r--  fs/ocfs2/super.c | 49
-rw-r--r--  fs/ocfs2/uptodate.c | 3
-rw-r--r--  fs/ocfs2/xattr.c | 2
-rw-r--r--  include/linux/fault-inject.h | 5
-rw-r--r--  include/linux/kasan.h | 4
-rw-r--r--  include/linux/list_lru.h | 3
-rw-r--r--  include/linux/memblock.h | 10
-rw-r--r--  include/linux/memory.h | 3
-rw-r--r--  include/linux/memory_hotplug.h | 53
-rw-r--r--  include/linux/migrate.h | 2
-rw-r--r--  include/linux/mm.h | 56
-rw-r--r--  include/linux/mm_types.h | 2
-rw-r--r--  include/linux/mmdebug.h | 8
-rw-r--r--  include/linux/mmzone.h | 8
-rw-r--r--  include/linux/node.h | 4
-rw-r--r--  include/linux/page-flags.h | 22
-rw-r--r--  include/linux/page_ref.h | 3
-rw-r--r--  include/linux/slab.h | 20
-rw-r--r--  include/linux/slab_def.h | 4
-rw-r--r--  include/linux/slub_def.h | 28
-rw-r--r--  include/linux/swap.h | 38
-rw-r--r--  include/linux/zsmalloc.h | 2
-rw-r--r--  include/net/sock.h | 4
-rw-r--r--  include/trace/events/migrate.h | 2
-rw-r--r--  include/trace/events/vmscan.h | 17
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/sched/core.c | 1
-rw-r--r--  kernel/ucount.c | 1
-rw-r--r--  lib/bitmap.c | 2
-rw-r--r--  lib/test_bitmap.c | 4
-rw-r--r--  lib/test_firmware.c | 1
-rw-r--r--  mm/Makefile | 4
-rw-r--r--  mm/backing-dev.c | 13
-rw-r--r--  mm/cma.c | 6
-rw-r--r--  mm/compaction.c | 9
-rw-r--r--  mm/failslab.c | 2
-rw-r--r--  mm/gup.c | 4
-rw-r--r--  mm/huge_memory.c | 36
-rw-r--r--  mm/hugetlb.c | 27
-rw-r--r--  mm/kasan/kasan.c | 15
-rw-r--r--  mm/kmemleak.c | 12
-rw-r--r--  mm/ksm.c | 36
-rw-r--r--  mm/list_lru.c | 67
-rw-r--r--  mm/memblock.c | 43
-rw-r--r--  mm/memory-failure.c | 16
-rw-r--r--  mm/memory.c | 33
-rw-r--r--  mm/memory_hotplug.c | 50
-rw-r--r--  mm/mmap.c | 14
-rw-r--r--  mm/nommu.c | 12
-rw-r--r--  mm/oom_kill.c | 12
-rw-r--r--  mm/page_alloc.c | 386
-rw-r--r--  mm/page_idle.c | 12
-rw-r--r--  mm/page_isolation.c | 18
-rw-r--r--  mm/page_owner.c | 2
-rw-r--r--  mm/page_poison.c | 2
-rw-r--r--  mm/pagewalk.c | 3
-rw-r--r--  mm/percpu-stats.c | 13
-rw-r--r--  mm/rmap.c | 1
-rw-r--r--  mm/shmem.c | 5
-rw-r--r--  mm/slab.c | 18
-rw-r--r--  mm/slab.h | 27
-rw-r--r--  mm/slab_common.c | 96
-rw-r--r--  mm/slub.c | 150
-rw-r--r--  mm/sparse.c | 8
-rw-r--r--  mm/swap.c | 1
-rw-r--r--  mm/swap_slots.c | 4
-rw-r--r--  mm/swap_state.c | 144
-rw-r--r--  mm/util.c | 10
-rw-r--r--  mm/vmscan.c | 50
-rw-r--r--  mm/z3fold.c | 35
-rw-r--r--  mm/zsmalloc.c | 58
-rw-r--r--  net/9p/client.c | 11
-rw-r--r--  net/core/sysctl_net_core.c | 1
-rw-r--r--  net/ipv4/route.c | 1
-rwxr-xr-x  scripts/faddr2line | 12
-rw-r--r--  security/apparmor/lsm.c | 1
-rw-r--r--  security/keys/big_key.c | 1
151 files changed, 1601 insertions, 1269 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 34dac7cef4cf..3c87a69cffcb 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1840,30 +1840,29 @@
1840 keepinitrd [HW,ARM] 1840 keepinitrd [HW,ARM]
1841 1841
1842 kernelcore= [KNL,X86,IA-64,PPC] 1842 kernelcore= [KNL,X86,IA-64,PPC]
1843 Format: nn[KMGTPE] | "mirror" 1843 Format: nn[KMGTPE] | nn% | "mirror"
1844 This parameter 1844 This parameter specifies the amount of memory usable by
1845 specifies the amount of memory usable by the kernel 1845 the kernel for non-movable allocations. The requested
1846 for non-movable allocations. The requested amount is 1846 amount is spread evenly throughout all nodes in the
1847 spread evenly throughout all nodes in the system. The 1847 system as ZONE_NORMAL. The remaining memory is used for
1848 remaining memory in each node is used for Movable 1848 movable memory in its own zone, ZONE_MOVABLE. In the
1849 pages. In the event, a node is too small to have both 1849 event, a node is too small to have both ZONE_NORMAL and
1850 kernelcore and Movable pages, kernelcore pages will 1850 ZONE_MOVABLE, kernelcore memory will take priority and
1851 take priority and other nodes will have a larger number 1851 other nodes will have a larger ZONE_MOVABLE.
1852 of Movable pages. The Movable zone is used for the 1852
1853 allocation of pages that may be reclaimed or moved 1853 ZONE_MOVABLE is used for the allocation of pages that
1854 by the page migration subsystem. This means that 1854 may be reclaimed or moved by the page migration
1855 HugeTLB pages may not be allocated from this zone. 1855 subsystem. Note that allocations like PTEs-from-HighMem
1856 Note that allocations like PTEs-from-HighMem still 1856 still use the HighMem zone if it exists, and the Normal
1857 use the HighMem zone if it exists, and the Normal
1858 zone if it does not. 1857 zone if it does not.
1859 1858
1860 Instead of specifying the amount of memory (nn[KMGTPE]), 1859 It is possible to specify the exact amount of memory in
1861 you can specify "mirror" option. In case "mirror" 1860 the form of "nn[KMGTPE]", a percentage of total system
1861 memory in the form of "nn%", or "mirror". If "mirror"
1862 option is specified, mirrored (reliable) memory is used 1862 option is specified, mirrored (reliable) memory is used
1863 for non-movable allocations and remaining memory is used 1863 for non-movable allocations and remaining memory is used
1864 for Movable pages. nn[KMGTPE] and "mirror" are exclusive, 1864 for Movable pages. "nn[KMGTPE]", "nn%", and "mirror"
1865 so you can NOT specify nn[KMGTPE] and "mirror" at the same 1865 are exclusive, so you cannot specify multiple forms.
1866 time.
1867 1866
1868 kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port. 1867 kgdbdbgp= [KGDB,HW] kgdb over EHCI usb debug port.
1869 Format: <Controller#>[,poll interval] 1868 Format: <Controller#>[,poll interval]
@@ -2377,13 +2376,14 @@
2377 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices 2376 mousedev.yres= [MOUSE] Vertical screen resolution, used for devices
2378 reporting absolute coordinates, such as tablets 2377 reporting absolute coordinates, such as tablets
2379 2378
2380 movablecore=nn[KMG] [KNL,X86,IA-64,PPC] This parameter 2379 movablecore= [KNL,X86,IA-64,PPC]
2381 is similar to kernelcore except it specifies the 2380 Format: nn[KMGTPE] | nn%
2382 amount of memory used for migratable allocations. 2381 This parameter is the complement to kernelcore=, it
2383 If both kernelcore and movablecore is specified, 2382 specifies the amount of memory used for migratable
2384 then kernelcore will be at *least* the specified 2383 allocations. If both kernelcore and movablecore is
2385 value but may be more. If movablecore on its own 2384 specified, then kernelcore will be at *least* the
2386 is specified, the administrator must be careful 2385 specified value but may be more. If movablecore on its
2386 own is specified, the administrator must be careful
2387 that the amount of memory usable for all allocations 2387 that the amount of memory usable for all allocations
2388 is not too small. 2388 is not too small.
2389 2389
diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
index ba976805853a..66bfd8396877 100644
--- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
+++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl
@@ -111,7 +111,7 @@ my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flag
111my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; 111my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)';
112my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; 112my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)';
113my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; 113my $regex_kswapd_sleep_default = 'nid=([0-9]*)';
114my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; 114my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)';
115my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; 115my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)';
116my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; 116my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)';
117my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; 117my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)';
@@ -201,7 +201,7 @@ $regex_kswapd_sleep = generate_traceevent_regex(
201$regex_wakeup_kswapd = generate_traceevent_regex( 201$regex_wakeup_kswapd = generate_traceevent_regex(
202 "vmscan/mm_vmscan_wakeup_kswapd", 202 "vmscan/mm_vmscan_wakeup_kswapd",
203 $regex_wakeup_kswapd_default, 203 $regex_wakeup_kswapd_default,
204 "nid", "zid", "order"); 204 "nid", "zid", "order", "gfp_flags");
205$regex_lru_isolate = generate_traceevent_regex( 205$regex_lru_isolate = generate_traceevent_regex(
206 "vmscan/mm_vmscan_lru_isolate", 206 "vmscan/mm_vmscan_lru_isolate",
207 $regex_lru_isolate_default, 207 $regex_lru_isolate_default,
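
As a rough illustration (the line is constructed from the regex above, not copied from a real trace),
an mm_vmscan_wakeup_kswapd event that the updated default pattern now parses would look something like:

    mm_vmscan_wakeup_kswapd: nid=0 zid=2 order=3 gfp_flags=GFP_HIGHUSER_MOVABLE

The extra capture group records the gfp_flags field the tracepoint now emits; its definition in
include/trace/events/vmscan.h is updated elsewhere in this series (see the diffstat above).
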
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 2072f3451e9c..9dbe645ee127 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -833,7 +833,7 @@ void flush_dcache_page(struct page *page)
833 } 833 }
834 834
835 /* don't handle anon pages here */ 835 /* don't handle anon pages here */
836 mapping = page_mapping(page); 836 mapping = page_mapping_file(page);
837 if (!mapping) 837 if (!mapping)
838 return; 838 return;
839 839
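
This conversion, and the matching page_mapping() to page_mapping_file() changes in the other
architecture cache-flush hunks below, relies on a small helper added elsewhere in this series
(in include/linux/mm.h, apparently by the "mm: fix races between swapoff and flush dcache" commit
listed above). A minimal sketch of what that helper is expected to look like, for orientation only:

    /*
     * Sketch of the helper these call sites switch to: unlike page_mapping(),
     * it returns NULL for swap-cache pages rather than the swapper address
     * space, so arch dcache-flush code only ever sees file-backed mappings
     * and cannot race with swapoff.
     */
    static inline struct address_space *page_mapping_file(struct page *page)
    {
            if (unlikely(PageSwapCache(page)))
                    return NULL;
            return page_mapping(page);
    }
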
diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c
index 16a8a804e958..e8fe51f4e97a 100644
--- a/arch/arm/boot/compressed/misc.c
+++ b/arch/arm/boot/compressed/misc.c
@@ -128,12 +128,7 @@ asmlinkage void __div0(void)
128 error("Attempting division by 0!"); 128 error("Attempting division by 0!");
129} 129}
130 130
131unsigned long __stack_chk_guard; 131const unsigned long __stack_chk_guard = 0x000a0dff;
132
133void __stack_chk_guard_setup(void)
134{
135 __stack_chk_guard = 0x000a0dff;
136}
137 132
138void __stack_chk_fail(void) 133void __stack_chk_fail(void)
139{ 134{
@@ -150,8 +145,6 @@ decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p,
150{ 145{
151 int ret; 146 int ret;
152 147
153 __stack_chk_guard_setup();
154
155 output_data = (unsigned char *)output_start; 148 output_data = (unsigned char *)output_start;
156 free_mem_ptr = free_mem_ptr_p; 149 free_mem_ptr = free_mem_ptr_p;
157 free_mem_end_ptr = free_mem_ptr_end_p; 150 free_mem_end_ptr = free_mem_ptr_end_p;
diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c
index 1267e64133b9..0224416cba3c 100644
--- a/arch/arm/mm/copypage-v4mc.c
+++ b/arch/arm/mm/copypage-v4mc.c
@@ -70,7 +70,7 @@ void v4_mc_copy_user_highpage(struct page *to, struct page *from,
70 void *kto = kmap_atomic(to); 70 void *kto = kmap_atomic(to);
71 71
72 if (!test_and_set_bit(PG_dcache_clean, &from->flags)) 72 if (!test_and_set_bit(PG_dcache_clean, &from->flags))
73 __flush_dcache_page(page_mapping(from), from); 73 __flush_dcache_page(page_mapping_file(from), from);
74 74
75 raw_spin_lock(&minicache_lock); 75 raw_spin_lock(&minicache_lock);
76 76
diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c
index 70423345da26..a698e575e321 100644
--- a/arch/arm/mm/copypage-v6.c
+++ b/arch/arm/mm/copypage-v6.c
@@ -76,7 +76,7 @@ static void v6_copy_user_highpage_aliasing(struct page *to,
76 unsigned long kfrom, kto; 76 unsigned long kfrom, kto;
77 77
78 if (!test_and_set_bit(PG_dcache_clean, &from->flags)) 78 if (!test_and_set_bit(PG_dcache_clean, &from->flags))
79 __flush_dcache_page(page_mapping(from), from); 79 __flush_dcache_page(page_mapping_file(from), from);
80 80
81 /* FIXME: not highmem safe */ 81 /* FIXME: not highmem safe */
82 discard_old_kernel_data(page_address(to)); 82 discard_old_kernel_data(page_address(to));
diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c
index 0fb85025344d..97972379f4d6 100644
--- a/arch/arm/mm/copypage-xscale.c
+++ b/arch/arm/mm/copypage-xscale.c
@@ -90,7 +90,7 @@ void xscale_mc_copy_user_highpage(struct page *to, struct page *from,
90 void *kto = kmap_atomic(to); 90 void *kto = kmap_atomic(to);
91 91
92 if (!test_and_set_bit(PG_dcache_clean, &from->flags)) 92 if (!test_and_set_bit(PG_dcache_clean, &from->flags))
93 __flush_dcache_page(page_mapping(from), from); 93 __flush_dcache_page(page_mapping_file(from), from);
94 94
95 raw_spin_lock(&minicache_lock); 95 raw_spin_lock(&minicache_lock);
96 96
diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index d9e0d00a6699..4d75dae5ac96 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -195,7 +195,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr,
195 if (page == ZERO_PAGE(0)) 195 if (page == ZERO_PAGE(0))
196 return; 196 return;
197 197
198 mapping = page_mapping(page); 198 mapping = page_mapping_file(page);
199 if (!test_and_set_bit(PG_dcache_clean, &page->flags)) 199 if (!test_and_set_bit(PG_dcache_clean, &page->flags))
200 __flush_dcache_page(mapping, page); 200 __flush_dcache_page(mapping, page);
201 if (mapping) { 201 if (mapping) {
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index f1e6190aa7ea..58469623b015 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -285,7 +285,7 @@ void __sync_icache_dcache(pte_t pteval)
285 285
286 page = pfn_to_page(pfn); 286 page = pfn_to_page(pfn);
287 if (cache_is_vipt_aliasing()) 287 if (cache_is_vipt_aliasing())
288 mapping = page_mapping(page); 288 mapping = page_mapping_file(page);
289 else 289 else
290 mapping = NULL; 290 mapping = NULL;
291 291
@@ -333,7 +333,7 @@ void flush_dcache_page(struct page *page)
333 return; 333 return;
334 } 334 }
335 335
336 mapping = page_mapping(page); 336 mapping = page_mapping_file(page);
337 337
338 if (!cache_ops_need_broadcast() && 338 if (!cache_ops_need_broadcast() &&
339 mapping && !page_mapcount(page)) 339 mapping && !page_mapcount(page))
@@ -363,7 +363,7 @@ void flush_kernel_dcache_page(struct page *page)
363 if (cache_is_vivt() || cache_is_vipt_aliasing()) { 363 if (cache_is_vivt() || cache_is_vipt_aliasing()) {
364 struct address_space *mapping; 364 struct address_space *mapping;
365 365
366 mapping = page_mapping(page); 366 mapping = page_mapping_file(page);
367 367
368 if (!mapping || mapping_mapped(mapping)) { 368 if (!mapping || mapping_mapped(mapping)) {
369 void *addr; 369 void *addr;
diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c
index fdf99e9dd4c3..81df9047e110 100644
--- a/arch/mips/boot/compressed/decompress.c
+++ b/arch/mips/boot/compressed/decompress.c
@@ -76,12 +76,7 @@ void error(char *x)
76#include "../../../../lib/decompress_unxz.c" 76#include "../../../../lib/decompress_unxz.c"
77#endif 77#endif
78 78
79unsigned long __stack_chk_guard; 79const unsigned long __stack_chk_guard = 0x000a0dff;
80
81void __stack_chk_guard_setup(void)
82{
83 __stack_chk_guard = 0x000a0dff;
84}
85 80
86void __stack_chk_fail(void) 81void __stack_chk_fail(void)
87{ 82{
@@ -92,8 +87,6 @@ void decompress_kernel(unsigned long boot_heap_start)
92{ 87{
93 unsigned long zimage_start, zimage_size; 88 unsigned long zimage_start, zimage_size;
94 89
95 __stack_chk_guard_setup();
96
97 zimage_start = (unsigned long)(&__image_begin); 90 zimage_start = (unsigned long)(&__image_begin);
98 zimage_size = (unsigned long)(&__image_end) - 91 zimage_size = (unsigned long)(&__image_end) -
99 (unsigned long)(&__image_begin); 92 (unsigned long)(&__image_begin);
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index 44ac64d51827..0d3c656feba0 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -86,7 +86,7 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes,
86 86
87void __flush_dcache_page(struct page *page) 87void __flush_dcache_page(struct page *page)
88{ 88{
89 struct address_space *mapping = page_mapping(page); 89 struct address_space *mapping = page_mapping_file(page);
90 unsigned long addr; 90 unsigned long addr;
91 91
92 if (mapping && !mapping_mapped(mapping)) { 92 if (mapping && !mapping_mapped(mapping)) {
diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c
index 87bf88ed04c6..506f6e1c86d5 100644
--- a/arch/nios2/mm/cacheflush.c
+++ b/arch/nios2/mm/cacheflush.c
@@ -180,7 +180,7 @@ void flush_dcache_page(struct page *page)
180 if (page == ZERO_PAGE(0)) 180 if (page == ZERO_PAGE(0))
181 return; 181 return;
182 182
183 mapping = page_mapping(page); 183 mapping = page_mapping_file(page);
184 184
185 /* Flush this page if there are aliases. */ 185 /* Flush this page if there are aliases. */
186 if (mapping && !mapping_mapped(mapping)) { 186 if (mapping && !mapping_mapped(mapping)) {
@@ -215,7 +215,7 @@ void update_mmu_cache(struct vm_area_struct *vma,
215 if (page == ZERO_PAGE(0)) 215 if (page == ZERO_PAGE(0))
216 return; 216 return;
217 217
218 mapping = page_mapping(page); 218 mapping = page_mapping_file(page);
219 if (!test_and_set_bit(PG_dcache_clean, &page->flags)) 219 if (!test_and_set_bit(PG_dcache_clean, &page->flags))
220 __flush_dcache_page(mapping, page); 220 __flush_dcache_page(mapping, page);
221 221
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index e3b45546d589..a99da95fc9fd 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -88,7 +88,8 @@ update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep)
88 return; 88 return;
89 89
90 page = pfn_to_page(pfn); 90 page = pfn_to_page(pfn);
91 if (page_mapping(page) && test_bit(PG_dcache_dirty, &page->flags)) { 91 if (page_mapping_file(page) &&
92 test_bit(PG_dcache_dirty, &page->flags)) {
92 flush_kernel_dcache_page_addr(pfn_va(pfn)); 93 flush_kernel_dcache_page_addr(pfn_va(pfn));
93 clear_bit(PG_dcache_dirty, &page->flags); 94 clear_bit(PG_dcache_dirty, &page->flags);
94 } else if (parisc_requires_coherency()) 95 } else if (parisc_requires_coherency())
@@ -304,7 +305,7 @@ __flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr,
304 305
305void flush_dcache_page(struct page *page) 306void flush_dcache_page(struct page *page)
306{ 307{
307 struct address_space *mapping = page_mapping(page); 308 struct address_space *mapping = page_mapping_file(page);
308 struct vm_area_struct *mpnt; 309 struct vm_area_struct *mpnt;
309 unsigned long offset; 310 unsigned long offset;
310 unsigned long addr, old_addr = 0; 311 unsigned long addr, old_addr = 0;
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 1a4847f67ea8..6f6751d3eba9 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -118,12 +118,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
118 unsigned long ceiling); 118 unsigned long ceiling);
119 119
120/* 120/*
121 * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs
122 * to override the version in mm/hugetlb.c
123 */
124#define vma_mmu_pagesize vma_mmu_pagesize
125
126/*
127 * If the arch doesn't supply something else, assume that hugepage 121 * If the arch doesn't supply something else, assume that hugepage
128 * size aligned regions are ok without further preparation. 122 * size aligned regions are ok without further preparation.
129 */ 123 */
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 876da2bc1796..3a08d211d2ee 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -568,10 +568,7 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
568 if (!radix_enabled()) 568 if (!radix_enabled())
569 return 1UL << mmu_psize_to_shift(psize); 569 return 1UL << mmu_psize_to_shift(psize);
570#endif 570#endif
571 if (!is_vm_hugetlb_page(vma)) 571 return vma_kernel_pagesize(vma);
572 return PAGE_SIZE;
573
574 return huge_page_size(hstate_vma(vma));
575} 572}
576 573
577static inline bool is_power_of_4(unsigned long x) 574static inline bool is_power_of_4(unsigned long x)
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index e0a2d8e806ed..9a8a084e4aba 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -112,7 +112,7 @@ static int mm_iommu_move_page_from_cma(struct page *page)
112 put_page(page); /* Drop the gup reference */ 112 put_page(page); /* Drop the gup reference */
113 113
114 ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page, 114 ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
115 NULL, 0, MIGRATE_SYNC, MR_CMA); 115 NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
116 if (ret) { 116 if (ret) {
117 if (!list_empty(&cma_migrate_pages)) 117 if (!list_empty(&cma_migrate_pages))
118 putback_movable_pages(&cma_migrate_pages); 118 putback_movable_pages(&cma_migrate_pages);
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index a6198d4f0f03..5ca3e22d0512 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -38,6 +38,7 @@
38#include <linux/suspend.h> 38#include <linux/suspend.h>
39#include <linux/memblock.h> 39#include <linux/memblock.h>
40#include <linux/gfp.h> 40#include <linux/gfp.h>
41#include <linux/kmemleak.h>
41#include <asm/io.h> 42#include <asm/io.h>
42#include <asm/prom.h> 43#include <asm/prom.h>
43#include <asm/iommu.h> 44#include <asm/iommu.h>
diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c
index c4dae27172b3..6243a7e537d0 100644
--- a/arch/powerpc/sysdev/msi_bitmap.c
+++ b/arch/powerpc/sysdev/msi_bitmap.c
@@ -10,6 +10,7 @@
10 10
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/kmemleak.h>
13#include <linux/bitmap.h> 14#include <linux/bitmap.h>
14#include <linux/bootmem.h> 15#include <linux/bootmem.h>
15#include <asm/msi_bitmap.h> 16#include <asm/msi_bitmap.h>
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index c7a627620e5e..8c867b43c8eb 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -15,7 +15,7 @@
15#include <linux/hardirq.h> 15#include <linux/hardirq.h>
16#include <linux/log2.h> 16#include <linux/log2.h>
17#include <linux/kprobes.h> 17#include <linux/kprobes.h>
18#include <linux/slab.h> 18#include <linux/kmemleak.h>
19#include <linux/time.h> 19#include <linux/time.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/sched/signal.h> 21#include <linux/sched/signal.h>
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index a4a9fe1934e9..2f8f7d7dd9a8 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -27,7 +27,6 @@
27#include <linux/err.h> 27#include <linux/err.h>
28#include <linux/spinlock.h> 28#include <linux/spinlock.h>
29#include <linux/kernel_stat.h> 29#include <linux/kernel_stat.h>
30#include <linux/kmemleak.h>
31#include <linux/delay.h> 30#include <linux/delay.h>
32#include <linux/interrupt.h> 31#include <linux/interrupt.h>
33#include <linux/irqflags.h> 32#include <linux/irqflags.h>
diff --git a/arch/sh/boot/compressed/misc.c b/arch/sh/boot/compressed/misc.c
index 627ce8e75e01..c15cac9251b9 100644
--- a/arch/sh/boot/compressed/misc.c
+++ b/arch/sh/boot/compressed/misc.c
@@ -104,12 +104,7 @@ static void error(char *x)
104 while(1); /* Halt */ 104 while(1); /* Halt */
105} 105}
106 106
107unsigned long __stack_chk_guard; 107const unsigned long __stack_chk_guard = 0x000a0dff;
108
109void __stack_chk_guard_setup(void)
110{
111 __stack_chk_guard = 0x000a0dff;
112}
113 108
114void __stack_chk_fail(void) 109void __stack_chk_fail(void)
115{ 110{
@@ -130,8 +125,6 @@ void decompress_kernel(void)
130{ 125{
131 unsigned long output_addr; 126 unsigned long output_addr;
132 127
133 __stack_chk_guard_setup();
134
135#ifdef CONFIG_SUPERH64 128#ifdef CONFIG_SUPERH64
136 output_addr = (CONFIG_MEMORY_START + 0x2000); 129 output_addr = (CONFIG_MEMORY_START + 0x2000);
137#else 130#else
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c
index 58aaa4f33b81..eee911422cf9 100644
--- a/arch/sh/mm/cache-sh4.c
+++ b/arch/sh/mm/cache-sh4.c
@@ -112,7 +112,7 @@ static void sh4_flush_dcache_page(void *arg)
112 struct page *page = arg; 112 struct page *page = arg;
113 unsigned long addr = (unsigned long)page_address(page); 113 unsigned long addr = (unsigned long)page_address(page);
114#ifndef CONFIG_SMP 114#ifndef CONFIG_SMP
115 struct address_space *mapping = page_mapping(page); 115 struct address_space *mapping = page_mapping_file(page);
116 116
117 if (mapping && !mapping_mapped(mapping)) 117 if (mapping && !mapping_mapped(mapping))
118 clear_bit(PG_dcache_clean, &page->flags); 118 clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c
index 6cd2aa395817..ed25eba80667 100644
--- a/arch/sh/mm/cache-sh7705.c
+++ b/arch/sh/mm/cache-sh7705.c
@@ -136,7 +136,7 @@ static void __flush_dcache_page(unsigned long phys)
136static void sh7705_flush_dcache_page(void *arg) 136static void sh7705_flush_dcache_page(void *arg)
137{ 137{
138 struct page *page = arg; 138 struct page *page = arg;
139 struct address_space *mapping = page_mapping(page); 139 struct address_space *mapping = page_mapping_file(page);
140 140
141 if (mapping && !mapping_mapped(mapping)) 141 if (mapping && !mapping_mapped(mapping))
142 clear_bit(PG_dcache_clean, &page->flags); 142 clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index d66dde833f5e..713670e6d13d 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -22,7 +22,6 @@
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/ftrace.h> 23#include <linux/ftrace.h>
24#include <linux/irq.h> 24#include <linux/irq.h>
25#include <linux/kmemleak.h>
26 25
27#include <asm/ptrace.h> 26#include <asm/ptrace.h>
28#include <asm/processor.h> 27#include <asm/processor.h>
diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index c50182cd2f64..d3ea1f3c06a0 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -929,9 +929,9 @@ static inline void __local_flush_dcache_page(struct page *page)
929#ifdef DCACHE_ALIASING_POSSIBLE 929#ifdef DCACHE_ALIASING_POSSIBLE
930 __flush_dcache_page(page_address(page), 930 __flush_dcache_page(page_address(page),
931 ((tlb_type == spitfire) && 931 ((tlb_type == spitfire) &&
932 page_mapping(page) != NULL)); 932 page_mapping_file(page) != NULL));
933#else 933#else
934 if (page_mapping(page) != NULL && 934 if (page_mapping_file(page) != NULL &&
935 tlb_type == spitfire) 935 tlb_type == spitfire)
936 __flush_icache_page(__pa(page_address(page))); 936 __flush_icache_page(__pa(page_address(page)));
937#endif 937#endif
@@ -958,7 +958,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
958 958
959 if (tlb_type == spitfire) { 959 if (tlb_type == spitfire) {
960 data0 = ((u64)&xcall_flush_dcache_page_spitfire); 960 data0 = ((u64)&xcall_flush_dcache_page_spitfire);
961 if (page_mapping(page) != NULL) 961 if (page_mapping_file(page) != NULL)
962 data0 |= ((u64)1 << 32); 962 data0 |= ((u64)1 << 32);
963 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 963 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
964#ifdef DCACHE_ALIASING_POSSIBLE 964#ifdef DCACHE_ALIASING_POSSIBLE
@@ -994,7 +994,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
994 pg_addr = page_address(page); 994 pg_addr = page_address(page);
995 if (tlb_type == spitfire) { 995 if (tlb_type == spitfire) {
996 data0 = ((u64)&xcall_flush_dcache_page_spitfire); 996 data0 = ((u64)&xcall_flush_dcache_page_spitfire);
997 if (page_mapping(page) != NULL) 997 if (page_mapping_file(page) != NULL)
998 data0 |= ((u64)1 << 32); 998 data0 |= ((u64)1 << 32);
999 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { 999 } else if (tlb_type == cheetah || tlb_type == cheetah_plus) {
1000#ifdef DCACHE_ALIASING_POSSIBLE 1000#ifdef DCACHE_ALIASING_POSSIBLE
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index cb9ebac6663f..8aeb1aabe76e 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -206,9 +206,9 @@ inline void flush_dcache_page_impl(struct page *page)
206#ifdef DCACHE_ALIASING_POSSIBLE 206#ifdef DCACHE_ALIASING_POSSIBLE
207 __flush_dcache_page(page_address(page), 207 __flush_dcache_page(page_address(page),
208 ((tlb_type == spitfire) && 208 ((tlb_type == spitfire) &&
209 page_mapping(page) != NULL)); 209 page_mapping_file(page) != NULL));
210#else 210#else
211 if (page_mapping(page) != NULL && 211 if (page_mapping_file(page) != NULL &&
212 tlb_type == spitfire) 212 tlb_type == spitfire)
213 __flush_icache_page(__pa(page_address(page))); 213 __flush_icache_page(__pa(page_address(page)));
214#endif 214#endif
@@ -490,7 +490,7 @@ void flush_dcache_page(struct page *page)
490 490
491 this_cpu = get_cpu(); 491 this_cpu = get_cpu();
492 492
493 mapping = page_mapping(page); 493 mapping = page_mapping_file(page);
494 if (mapping && !mapping_mapped(mapping)) { 494 if (mapping && !mapping_mapped(mapping)) {
495 int dirty = test_bit(PG_dcache_dirty, &page->flags); 495 int dirty = test_bit(PG_dcache_dirty, &page->flags);
496 if (dirty) { 496 if (dirty) {
diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index b5cfab711651..3d72d2deb13b 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -128,7 +128,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr,
128 goto no_cache_flush; 128 goto no_cache_flush;
129 129
130 /* A real file page? */ 130 /* A real file page? */
131 mapping = page_mapping(page); 131 mapping = page_mapping_file(page);
132 if (!mapping) 132 if (!mapping)
133 goto no_cache_flush; 133 goto no_cache_flush;
134 134
diff --git a/arch/unicore32/mm/flush.c b/arch/unicore32/mm/flush.c
index 6d4c096ffa2a..74f4d636df2d 100644
--- a/arch/unicore32/mm/flush.c
+++ b/arch/unicore32/mm/flush.c
@@ -83,7 +83,7 @@ void flush_dcache_page(struct page *page)
83 if (page == ZERO_PAGE(0)) 83 if (page == ZERO_PAGE(0))
84 return; 84 return;
85 85
86 mapping = page_mapping(page); 86 mapping = page_mapping_file(page);
87 87
88 if (mapping && !mapping_mapped(mapping)) 88 if (mapping && !mapping_mapped(mapping))
89 clear_bit(PG_dcache_clean, &page->flags); 89 clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
index 4f5a532bee13..0c94b7b4514d 100644
--- a/arch/unicore32/mm/mmu.c
+++ b/arch/unicore32/mm/mmu.c
@@ -503,7 +503,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr,
503 if (page == ZERO_PAGE(0)) 503 if (page == ZERO_PAGE(0))
504 return; 504 return;
505 505
506 mapping = page_mapping(page); 506 mapping = page_mapping_file(page);
507 if (!test_and_set_bit(PG_dcache_clean, &page->flags)) 507 if (!test_and_set_bit(PG_dcache_clean, &page->flags))
508 __flush_dcache_page(mapping, page); 508 __flush_dcache_page(mapping, page);
509 if (mapping) 509 if (mapping)
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 14437116ffea..77625b60a510 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -6,7 +6,6 @@
6#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/gfp.h> 7#include <linux/gfp.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/kmemleak.h>
10 9
11#include <asm/proto.h> 10#include <asm/proto.h>
12#include <asm/dma.h> 11#include <asm/dma.h>
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 45241de66785..dca9abf2b85c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1328,14 +1328,39 @@ int kern_addr_valid(unsigned long addr)
1328 return pfn_valid(pte_pfn(*pte)); 1328 return pfn_valid(pte_pfn(*pte));
1329} 1329}
1330 1330
1331/*
1332 * Block size is the minimum amount of memory which can be hotplugged or
1333 * hotremoved. It must be power of two and must be equal or larger than
1334 * MIN_MEMORY_BLOCK_SIZE.
1335 */
1336#define MAX_BLOCK_SIZE (2UL << 30)
1337
1338/* Amount of ram needed to start using large blocks */
1339#define MEM_SIZE_FOR_LARGE_BLOCK (64UL << 30)
1340
1331static unsigned long probe_memory_block_size(void) 1341static unsigned long probe_memory_block_size(void)
1332{ 1342{
1333 unsigned long bz = MIN_MEMORY_BLOCK_SIZE; 1343 unsigned long boot_mem_end = max_pfn << PAGE_SHIFT;
1344 unsigned long bz;
1334 1345
1335 /* if system is UV or has 64GB of RAM or more, use large blocks */ 1346 /* If this is UV system, always set 2G block size */
1336 if (is_uv_system() || ((max_pfn << PAGE_SHIFT) >= (64UL << 30))) 1347 if (is_uv_system()) {
1337 bz = 2UL << 30; /* 2GB */ 1348 bz = MAX_BLOCK_SIZE;
1349 goto done;
1350 }
1338 1351
1352 /* Use regular block if RAM is smaller than MEM_SIZE_FOR_LARGE_BLOCK */
1353 if (boot_mem_end < MEM_SIZE_FOR_LARGE_BLOCK) {
1354 bz = MIN_MEMORY_BLOCK_SIZE;
1355 goto done;
1356 }
1357
1358 /* Find the largest allowed block size that aligns to memory end */
1359 for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) {
1360 if (IS_ALIGNED(boot_mem_end, bz))
1361 break;
1362 }
1363done:
1339 pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20); 1364 pr_info("x86/mm: Memory block size: %ldMB\n", bz >> 20);
1340 1365
1341 return bz; 1366 return bz;
diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c
index 57dc231a0709..9220dcde7520 100644
--- a/arch/xtensa/mm/cache.c
+++ b/arch/xtensa/mm/cache.c
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(copy_user_highpage);
127 127
128void flush_dcache_page(struct page *page) 128void flush_dcache_page(struct page *page)
129{ 129{
130 struct address_space *mapping = page_mapping(page); 130 struct address_space *mapping = page_mapping_file(page);
131 131
132 /* 132 /*
133 * If we have a mapping but the page is not mapped to user-space 133 * If we have a mapping but the page is not mapped to user-space
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index fe4b24f05f6a..79fcd2bae96b 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -187,13 +187,14 @@ int memory_isolate_notify(unsigned long val, void *v)
187} 187}
188 188
189/* 189/*
190 * The probe routines leave the pages reserved, just as the bootmem code does. 190 * The probe routines leave the pages uninitialized, just as the bootmem code
191 * Make sure they're still that way. 191 * does. Make sure we do not access them, but instead use only information from
192 * within sections.
192 */ 193 */
193static bool pages_correctly_reserved(unsigned long start_pfn) 194static bool pages_correctly_probed(unsigned long start_pfn)
194{ 195{
195 int i, j; 196 unsigned long section_nr = pfn_to_section_nr(start_pfn);
196 struct page *page; 197 unsigned long section_nr_end = section_nr + sections_per_block;
197 unsigned long pfn = start_pfn; 198 unsigned long pfn = start_pfn;
198 199
199 /* 200 /*
@@ -201,21 +202,24 @@ static bool pages_correctly_reserved(unsigned long start_pfn)
201 * SPARSEMEM_VMEMMAP. We lookup the page once per section 202 * SPARSEMEM_VMEMMAP. We lookup the page once per section
202 * and assume memmap is contiguous within each section 203 * and assume memmap is contiguous within each section
203 */ 204 */
204 for (i = 0; i < sections_per_block; i++, pfn += PAGES_PER_SECTION) { 205 for (; section_nr < section_nr_end; section_nr++) {
205 if (WARN_ON_ONCE(!pfn_valid(pfn))) 206 if (WARN_ON_ONCE(!pfn_valid(pfn)))
206 return false; 207 return false;
207 page = pfn_to_page(pfn);
208
209 for (j = 0; j < PAGES_PER_SECTION; j++) {
210 if (PageReserved(page + j))
211 continue;
212
213 printk(KERN_WARNING "section number %ld page number %d "
214 "not reserved, was it already online?\n",
215 pfn_to_section_nr(pfn), j);
216 208
209 if (!present_section_nr(section_nr)) {
210 pr_warn("section %ld pfn[%lx, %lx) not present",
211 section_nr, pfn, pfn + PAGES_PER_SECTION);
212 return false;
213 } else if (!valid_section_nr(section_nr)) {
214 pr_warn("section %ld pfn[%lx, %lx) no valid memmap",
215 section_nr, pfn, pfn + PAGES_PER_SECTION);
216 return false;
217 } else if (online_section_nr(section_nr)) {
218 pr_warn("section %ld pfn[%lx, %lx) is already online",
219 section_nr, pfn, pfn + PAGES_PER_SECTION);
217 return false; 220 return false;
218 } 221 }
222 pfn += PAGES_PER_SECTION;
219 } 223 }
220 224
221 return true; 225 return true;
@@ -237,7 +241,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t
237 241
238 switch (action) { 242 switch (action) {
239 case MEM_ONLINE: 243 case MEM_ONLINE:
240 if (!pages_correctly_reserved(start_pfn)) 244 if (!pages_correctly_probed(start_pfn))
241 return -EBUSY; 245 return -EBUSY;
242 246
243 ret = online_pages(start_pfn, nr_pages, online_type); 247 ret = online_pages(start_pfn, nr_pages, online_type);
@@ -708,7 +712,7 @@ static int add_memory_block(int base_section_nr)
708 * need an interface for the VM to add new memory regions, 712 * need an interface for the VM to add new memory regions,
709 * but without onlining it. 713 * but without onlining it.
710 */ 714 */
711int register_new_memory(int nid, struct mem_section *section) 715int hotplug_memory_register(int nid, struct mem_section *section)
712{ 716{
713 int ret = 0; 717 int ret = 0;
714 struct memory_block *mem; 718 struct memory_block *mem;
@@ -727,7 +731,7 @@ int register_new_memory(int nid, struct mem_section *section)
727 } 731 }
728 732
729 if (mem->section_count == sections_per_block) 733 if (mem->section_count == sections_per_block)
730 ret = register_mem_sect_under_node(mem, nid); 734 ret = register_mem_sect_under_node(mem, nid, false);
731out: 735out:
732 mutex_unlock(&mem_sysfs_mutex); 736 mutex_unlock(&mem_sysfs_mutex);
733 return ret; 737 return ret;
diff --git a/drivers/base/node.c b/drivers/base/node.c
index c5f81fc621ac..7a3a580821e0 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -399,13 +399,16 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
399} 399}
400 400
401/* register memory section under specified node if it spans that node */ 401/* register memory section under specified node if it spans that node */
402int register_mem_sect_under_node(struct memory_block *mem_blk, int nid) 402int register_mem_sect_under_node(struct memory_block *mem_blk, int nid,
403 bool check_nid)
403{ 404{
404 int ret; 405 int ret;
405 unsigned long pfn, sect_start_pfn, sect_end_pfn; 406 unsigned long pfn, sect_start_pfn, sect_end_pfn;
406 407
407 if (!mem_blk) 408 if (!mem_blk)
408 return -EFAULT; 409 return -EFAULT;
410
411 mem_blk->nid = nid;
409 if (!node_online(nid)) 412 if (!node_online(nid))
410 return 0; 413 return 0;
411 414
@@ -425,11 +428,18 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, int nid)
425 continue; 428 continue;
426 } 429 }
427 430
428 page_nid = get_nid_for_pfn(pfn); 431 /*
429 if (page_nid < 0) 432 * We need to check if page belongs to nid only for the boot
430 continue; 433 * case, during hotplug we know that all pages in the memory
431 if (page_nid != nid) 434 * block belong to the same node.
432 continue; 435 */
436 if (check_nid) {
437 page_nid = get_nid_for_pfn(pfn);
438 if (page_nid < 0)
439 continue;
440 if (page_nid != nid)
441 continue;
442 }
433 ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, 443 ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
434 &mem_blk->dev.kobj, 444 &mem_blk->dev.kobj,
435 kobject_name(&mem_blk->dev.kobj)); 445 kobject_name(&mem_blk->dev.kobj));
@@ -504,7 +514,7 @@ int link_mem_sections(int nid, unsigned long start_pfn, unsigned long nr_pages)
504 514
505 mem_blk = find_memory_block_hinted(mem_sect, mem_blk); 515 mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
506 516
507 ret = register_mem_sect_under_node(mem_blk, nid); 517 ret = register_mem_sect_under_node(mem_blk, nid, true);
508 if (!err) 518 if (!err)
509 err = ret; 519 err = ret;
510 520
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 71b449613cfa..0f3fadd71230 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -44,6 +44,11 @@ static const char *default_compressor = "lzo";
44 44
45/* Module params (documentation at end) */ 45/* Module params (documentation at end) */
46static unsigned int num_devices = 1; 46static unsigned int num_devices = 1;
47/*
48 * Pages that compress to sizes equals or greater than this are stored
49 * uncompressed in memory.
50 */
51static size_t huge_class_size;
47 52
48static void zram_free_page(struct zram *zram, size_t index); 53static void zram_free_page(struct zram *zram, size_t index);
49 54
@@ -786,6 +791,8 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
786 return false; 791 return false;
787 } 792 }
788 793
794 if (!huge_class_size)
795 huge_class_size = zs_huge_class_size(zram->mem_pool);
789 return true; 796 return true;
790} 797}
791 798
@@ -965,7 +972,7 @@ compress_again:
965 return ret; 972 return ret;
966 } 973 }
967 974
968 if (unlikely(comp_len > max_zpage_size)) { 975 if (unlikely(comp_len >= huge_class_size)) {
969 if (zram_wb_enabled(zram) && allow_wb) { 976 if (zram_wb_enabled(zram) && allow_wb) {
970 zcomp_stream_put(zram->comp); 977 zcomp_stream_put(zram->comp);
971 ret = write_to_bdev(zram, bvec, index, bio, &element); 978 ret = write_to_bdev(zram, bvec, index, bio, &element);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 1e9bf65c0bfb..008861220723 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -21,22 +21,6 @@
21 21
22#include "zcomp.h" 22#include "zcomp.h"
23 23
24/*-- Configurable parameters */
25
26/*
27 * Pages that compress to size greater than this are stored
28 * uncompressed in memory.
29 */
30static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
31
32/*
33 * NOTE: max_zpage_size must be less than or equal to:
34 * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would
35 * always return failure.
36 */
37
38/*-- End of configurable params */
39
40#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) 24#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
41#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) 25#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT)
42#define ZRAM_LOGICAL_BLOCK_SHIFT 12 26#define ZRAM_LOGICAL_BLOCK_SHIFT 12
diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 2137dbc29877..0b61f48f21a6 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -439,10 +439,20 @@ static int dev_dax_split(struct vm_area_struct *vma, unsigned long addr)
439 return 0; 439 return 0;
440} 440}
441 441
442static unsigned long dev_dax_pagesize(struct vm_area_struct *vma)
443{
444 struct file *filp = vma->vm_file;
445 struct dev_dax *dev_dax = filp->private_data;
446 struct dax_region *dax_region = dev_dax->region;
447
448 return dax_region->align;
449}
450
442static const struct vm_operations_struct dax_vm_ops = { 451static const struct vm_operations_struct dax_vm_ops = {
443 .fault = dev_dax_fault, 452 .fault = dev_dax_fault,
444 .huge_fault = dev_dax_huge_fault, 453 .huge_fault = dev_dax_huge_fault,
445 .split = dev_dax_split, 454 .split = dev_dax_split,
455 .pagesize = dev_dax_pagesize,
446}; 456};
447 457
448static int dax_mmap(struct file *filp, struct vm_area_struct *vma) 458static int dax_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c
index 2138102ef611..c5f4f7691b57 100644
--- a/drivers/iommu/exynos-iommu.c
+++ b/drivers/iommu/exynos-iommu.c
@@ -17,6 +17,7 @@
17#include <linux/io.h> 17#include <linux/io.h>
18#include <linux/iommu.h> 18#include <linux/iommu.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kmemleak.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/of.h> 22#include <linux/of.h>
22#include <linux/of_iommu.h> 23#include <linux/of_iommu.h>
diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c
index 542930cd183d..5a96fd14ac22 100644
--- a/drivers/iommu/mtk_iommu_v1.c
+++ b/drivers/iommu/mtk_iommu_v1.c
@@ -25,7 +25,6 @@
25#include <linux/io.h> 25#include <linux/io.h>
26#include <linux/iommu.h> 26#include <linux/iommu.h>
27#include <linux/iopoll.h> 27#include <linux/iopoll.h>
28#include <linux/kmemleak.h>
29#include <linux/list.h> 28#include <linux/list.h>
30#include <linux/of_address.h> 29#include <linux/of_address.h>
31#include <linux/of_iommu.h> 30#include <linux/of_iommu.h>
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 1b4af54a4968..30371274409d 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -35,6 +35,7 @@
35#include <linux/of_net.h> 35#include <linux/of_net.h>
36#include <linux/of_device.h> 36#include <linux/of_device.h>
37#include <linux/if_vlan.h> 37#include <linux/if_vlan.h>
38#include <linux/kmemleak.h>
38 39
39#include <linux/pinctrl/consumer.h> 40#include <linux/pinctrl/consumer.h>
40 41
diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 2437422625bf..57bb8f049e59 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -31,7 +31,6 @@
31#include "efuse.h" 31#include "efuse.h"
32#include <linux/interrupt.h> 32#include <linux/interrupt.h>
33#include <linux/export.h> 33#include <linux/export.h>
34#include <linux/kmemleak.h>
35#include <linux/module.h> 34#include <linux/module.h>
36 35
37MODULE_AUTHOR("lizhaoming <chaoming_li@realsil.com.cn>"); 36MODULE_AUTHOR("lizhaoming <chaoming_li@realsil.com.cn>");
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c
index 015476e3f7e5..f3bff66e85d0 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8192c/fw_common.c
@@ -32,7 +32,6 @@
32#include "../rtl8192ce/def.h" 32#include "../rtl8192ce/def.h"
33#include "fw_common.h" 33#include "fw_common.h"
34#include <linux/export.h> 34#include <linux/export.h>
35#include <linux/kmemleak.h>
36 35
37static void _rtl92c_enable_fw_download(struct ieee80211_hw *hw, bool enable) 36static void _rtl92c_enable_fw_download(struct ieee80211_hw *hw, bool enable)
38{ 37{
diff --git a/drivers/staging/rtl8188eu/hal/fw.c b/drivers/staging/rtl8188eu/hal/fw.c
index 03d091bad13a..6b67b38a6a9f 100644
--- a/drivers/staging/rtl8188eu/hal/fw.c
+++ b/drivers/staging/rtl8188eu/hal/fw.c
@@ -30,7 +30,7 @@
30#include "rtl8188e_hal.h" 30#include "rtl8188e_hal.h"
31 31
32#include <linux/firmware.h> 32#include <linux/firmware.h>
33#include <linux/kmemleak.h> 33#include <linux/slab.h>
34 34
35static void _rtl88e_enable_fw_download(struct adapter *adapt, bool enable) 35static void _rtl88e_enable_fw_download(struct adapter *adapt, bool enable)
36{ 36{
diff --git a/drivers/staging/rtlwifi/pci.c b/drivers/staging/rtlwifi/pci.c
index 70a64a5f564a..d56810eabde7 100644
--- a/drivers/staging/rtlwifi/pci.c
+++ b/drivers/staging/rtlwifi/pci.c
@@ -31,7 +31,6 @@
31#include "efuse.h" 31#include "efuse.h"
32#include <linux/interrupt.h> 32#include <linux/interrupt.h>
33#include <linux/export.h> 33#include <linux/export.h>
34#include <linux/kmemleak.h>
35#include <linux/module.h> 34#include <linux/module.h>
36 35
37MODULE_AUTHOR("lizhaoming <chaoming_li@realsil.com.cn>"); 36MODULE_AUTHOR("lizhaoming <chaoming_li@realsil.com.cn>");
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71458f493cf8..21d464a29cf8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -23,7 +23,6 @@
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/hrtimer.h> 25#include <linux/hrtimer.h>
26#include <linux/kmemleak.h>
27#include <linux/dma-mapping.h> 26#include <linux/dma-mapping.h>
28#include <xen/xen.h> 27#include <xen/xen.h>
29 28
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 8fb89ddc6cc7..e622f0f10502 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -292,6 +292,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
292#ifdef CONFIG_9P_FSCACHE 292#ifdef CONFIG_9P_FSCACHE
293 kfree(v9ses->cachetag); 293 kfree(v9ses->cachetag);
294 v9ses->cachetag = match_strdup(&args[0]); 294 v9ses->cachetag = match_strdup(&args[0]);
295 if (!v9ses->cachetag) {
296 ret = -ENOMEM;
297 goto free_and_return;
298 }
295#endif 299#endif
296 break; 300 break;
297 case Opt_cache: 301 case Opt_cache:
@@ -471,6 +475,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
471 return fid; 475 return fid;
472 476
473err_clnt: 477err_clnt:
478#ifdef CONFIG_9P_FSCACHE
479 kfree(v9ses->cachetag);
480#endif
474 p9_client_destroy(v9ses->clnt); 481 p9_client_destroy(v9ses->clnt);
475err_names: 482err_names:
476 kfree(v9ses->uname); 483 kfree(v9ses->uname);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index bdabb2765d1b..9ee534159cc6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -579,6 +579,24 @@ static int v9fs_at_to_dotl_flags(int flags)
579} 579}
580 580
581/** 581/**
582 * v9fs_dec_count - helper functon to drop i_nlink.
583 *
584 * If a directory had nlink <= 2 (including . and ..), then we should not drop
585 * the link count, which indicates the underlying exported fs doesn't maintain
586 * nlink accurately. e.g.
587 * - overlayfs sets nlink to 1 for merged dir
588 * - ext4 (with dir_nlink feature enabled) sets nlink to 1 if a dir has more
589 * than EXT4_LINK_MAX (65000) links.
590 *
591 * @inode: inode whose nlink is being dropped
592 */
593static void v9fs_dec_count(struct inode *inode)
594{
595 if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
596 drop_nlink(inode);
597}
598
599/**
582 * v9fs_remove - helper function to remove files and directories 600 * v9fs_remove - helper function to remove files and directories
583 * @dir: directory inode that is being deleted 601 * @dir: directory inode that is being deleted
584 * @dentry: dentry that is being deleted 602 * @dentry: dentry that is being deleted
@@ -621,9 +639,9 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
621 */ 639 */
622 if (flags & AT_REMOVEDIR) { 640 if (flags & AT_REMOVEDIR) {
623 clear_nlink(inode); 641 clear_nlink(inode);
624 drop_nlink(dir); 642 v9fs_dec_count(dir);
625 } else 643 } else
626 drop_nlink(inode); 644 v9fs_dec_count(inode);
627 645
628 v9fs_invalidate_inode_attr(inode); 646 v9fs_invalidate_inode_attr(inode);
629 v9fs_invalidate_inode_attr(dir); 647 v9fs_invalidate_inode_attr(dir);
@@ -1024,12 +1042,12 @@ clunk_newdir:
1024 if (S_ISDIR(new_inode->i_mode)) 1042 if (S_ISDIR(new_inode->i_mode))
1025 clear_nlink(new_inode); 1043 clear_nlink(new_inode);
1026 else 1044 else
1027 drop_nlink(new_inode); 1045 v9fs_dec_count(new_inode);
1028 } 1046 }
1029 if (S_ISDIR(old_inode->i_mode)) { 1047 if (S_ISDIR(old_inode->i_mode)) {
1030 if (!new_inode) 1048 if (!new_inode)
1031 inc_nlink(new_dir); 1049 inc_nlink(new_dir);
1032 drop_nlink(old_dir); 1050 v9fs_dec_count(old_dir);
1033 } 1051 }
1034 v9fs_invalidate_inode_attr(old_inode); 1052 v9fs_invalidate_inode_attr(old_inode);
1035 v9fs_invalidate_inode_attr(old_dir); 1053 v9fs_invalidate_inode_attr(old_dir);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index af03c2a901eb..48ce50484e80 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -94,7 +94,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
94 if (v9ses->cache) 94 if (v9ses->cache)
95 sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; 95 sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE;
96 96
97 sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; 97 sb->s_flags |= SB_ACTIVE | SB_DIRSYNC;
98 if (!v9ses->cache) 98 if (!v9ses->cache)
99 sb->s_flags |= SB_SYNCHRONOUS; 99 sb->s_flags |= SB_SYNCHRONOUS;
100 100
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe09ef9c21f3..7a506c55a993 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1324,7 +1324,8 @@ static void flush_disk(struct block_device *bdev, bool kill_dirty)
1324 * @bdev: struct bdev to adjust. 1324 * @bdev: struct bdev to adjust.
1325 * 1325 *
1326 * This routine checks to see if the bdev size does not match the disk size 1326 * This routine checks to see if the bdev size does not match the disk size
1327 * and adjusts it if it differs. 1327 * and adjusts it if it differs. When shrinking the bdev size, all its
1328 * caches are freed.
1328 */ 1329 */
1329void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) 1330void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
1330{ 1331{
@@ -1337,7 +1338,8 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
1337 "%s: detected capacity change from %lld to %lld\n", 1338 "%s: detected capacity change from %lld to %lld\n",
1338 disk->disk_name, bdev_size, disk_size); 1339 disk->disk_name, bdev_size, disk_size);
1339 i_size_write(bdev->bd_inode, disk_size); 1340 i_size_write(bdev->bd_inode, disk_size);
1340 flush_disk(bdev, false); 1341 if (bdev_size > disk_size)
1342 flush_disk(bdev, false);
1341 } 1343 }
1342} 1344}
1343EXPORT_SYMBOL(check_disk_size_change); 1345EXPORT_SYMBOL(check_disk_size_change);
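A minimal sketch, with made-up state variables, of the behaviour change in check_disk_size_change(): the inode size is always brought in line with the disk, but cached data is only flushed when the device got smaller.

#include <stdio.h>

static long long bdev_size = 100;	/* hypothetical current bdev size */
static int flushed;

static void check_disk_size_change(long long disk_size)
{
	if (bdev_size != disk_size) {
		long long old_size = bdev_size;

		bdev_size = disk_size;		/* i_size_write() equivalent */
		if (old_size > disk_size)
			flushed = 1;		/* flush_disk() only on shrink */
	}
}

int main(void)
{
	check_disk_size_change(200);
	printf("after grow:   flushed=%d\n", flushed);	/* 0 */
	check_disk_size_change(50);
	printf("after shrink: flushed=%d\n", flushed);	/* 1 */
	return 0;
}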
diff --git a/fs/buffer.c b/fs/buffer.c
index 9a73924db22f..ec5dd39071e6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1511,7 +1511,7 @@ void block_invalidatepage(struct page *page, unsigned int offset,
1511 * The get_block cached value has been unconditionally invalidated, 1511 * The get_block cached value has been unconditionally invalidated,
1512 * so real IO is not possible anymore. 1512 * so real IO is not possible anymore.
1513 */ 1513 */
1514 if (offset == 0) 1514 if (length == PAGE_SIZE)
1515 try_to_release_page(page, 0); 1515 try_to_release_page(page, 0);
1516out: 1516out:
1517 return; 1517 return;
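Sketch of the predicate change in block_invalidatepage(): buffers may only be released when the whole page is invalidated, whereas a partial invalidation that merely starts at offset 0 must keep the page's buffers. The 4096-byte PAGE_SIZE below is an assumption for illustration.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static bool may_release_old(unsigned int offset, unsigned int length)
{
	(void)length;
	return offset == 0;		/* wrongly true for a partial head range */
}

static bool may_release_new(unsigned int offset, unsigned int length)
{
	(void)offset;
	return length == PAGE_SIZE;	/* only a full-page invalidation */
}

int main(void)
{
	/* Invalidate just the first 512 bytes of the page. */
	printf("old: %d, new: %d\n", may_release_old(0, 512), may_release_new(0, 512));
	return 0;	/* prints "old: 1, new: 0" */
}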
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ba12ee659673..874607bb6e02 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1177,9 +1177,9 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
1177 unsigned blkbits = i_blkbits; 1177 unsigned blkbits = i_blkbits;
1178 unsigned blocksize_mask = (1 << blkbits) - 1; 1178 unsigned blocksize_mask = (1 << blkbits) - 1;
1179 ssize_t retval = -EINVAL; 1179 ssize_t retval = -EINVAL;
1180 size_t count = iov_iter_count(iter); 1180 const size_t count = iov_iter_count(iter);
1181 loff_t offset = iocb->ki_pos; 1181 loff_t offset = iocb->ki_pos;
1182 loff_t end = offset + count; 1182 const loff_t end = offset + count;
1183 struct dio *dio; 1183 struct dio *dio;
1184 struct dio_submit sdio = { 0, }; 1184 struct dio_submit sdio = { 0, };
1185 struct buffer_head map_bh = { 0, }; 1185 struct buffer_head map_bh = { 0, };
@@ -1200,7 +1200,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
1200 } 1200 }
1201 1201
1202 /* watch out for a 0 len io from a tricksy fs */ 1202 /* watch out for a 0 len io from a tricksy fs */
1203 if (iov_iter_rw(iter) == READ && !iov_iter_count(iter)) 1203 if (iov_iter_rw(iter) == READ && !count)
1204 return 0; 1204 return 0;
1205 1205
1206 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); 1206 dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1315,8 +1315,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
1315 1315
1316 dio->should_dirty = (iter->type == ITER_IOVEC); 1316 dio->should_dirty = (iter->type == ITER_IOVEC);
1317 sdio.iter = iter; 1317 sdio.iter = iter;
1318 sdio.final_block_in_request = 1318 sdio.final_block_in_request = end >> blkbits;
1319 (offset + iov_iter_count(iter)) >> blkbits;
1320 1319
1321 /* 1320 /*
1322 * In case of non-aligned buffers, we may need 2 more 1321 * In case of non-aligned buffers, we may need 2 more
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b9a254dcc0e7..d508c7844681 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -138,10 +138,14 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
138 138
139 /* 139 /*
140 * page based offset in vm_pgoff could be sufficiently large to 140 * page based offset in vm_pgoff could be sufficiently large to
141 * overflow a (l)off_t when converted to byte offset. 141 * overflow a loff_t when converted to byte offset. This can
142 * only happen on architectures where sizeof(loff_t) ==
143 * sizeof(unsigned long). So, only check in those instances.
142 */ 144 */
143 if (vma->vm_pgoff & PGOFF_LOFFT_MAX) 145 if (sizeof(unsigned long) == sizeof(loff_t)) {
144 return -EINVAL; 146 if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
147 return -EINVAL;
148 }
145 149
146 /* must be huge page aligned */ 150 /* must be huge page aligned */
147 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 151 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
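Sketch of why the extra hugetlbfs check is only needed when sizeof(unsigned long) == sizeof(loff_t): on 64-bit, a page-based vm_pgoff can be large enough that (pgoff << PAGE_SHIFT) no longer fits in a signed 64-bit byte offset. PAGE_SHIFT == 12 is an assumption for illustration.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* Model a 64-bit vm_pgoff; on a 32-bit arch vm_pgoff is only 32 bits
	 * wide, so the shifted value always fits in loff_t and no check is
	 * needed. */
	uint64_t pgoff = UINT64_C(1) << 51;
	uint64_t bytes = pgoff << PAGE_SHIFT;		/* 2^63 */

	printf("pgoff=%#" PRIx64 " -> %#" PRIx64 " bytes, fits in loff_t: %s\n",
	       pgoff, bytes, bytes <= (uint64_t)INT64_MAX ? "yes" : "no");
	return 0;	/* prints "... fits in loff_t: no" */
}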
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9a876bb07cac..0f157bbd3e0f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7119,7 +7119,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7119 goto out_commit; 7119 goto out_commit;
7120 did_quota = 1; 7120 did_quota = 1;
7121 7121
7122 data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; 7122 data_ac->ac_resv = &oi->ip_la_data_resv;
7123 7123
7124 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off, 7124 ret = ocfs2_claim_clusters(handle, data_ac, 1, &bit_off,
7125 &num); 7125 &num);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e8e205bf2e41..302cd7caa4a7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -346,7 +346,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
346 unlock = 0; 346 unlock = 0;
347 347
348out_alloc: 348out_alloc:
349 up_read(&OCFS2_I(inode)->ip_alloc_sem); 349 up_read(&oi->ip_alloc_sem);
350out_inode_unlock: 350out_inode_unlock:
351 ocfs2_inode_unlock(inode, 0); 351 ocfs2_inode_unlock(inode, 0);
352out: 352out:
@@ -2213,7 +2213,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
2213 down_write(&oi->ip_alloc_sem); 2213 down_write(&oi->ip_alloc_sem);
2214 2214
2215 if (first_get_block) { 2215 if (first_get_block) {
2216 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 2216 if (ocfs2_sparse_alloc(osb))
2217 ret = ocfs2_zero_tail(inode, di_bh, pos); 2217 ret = ocfs2_zero_tail(inode, di_bh, pos);
2218 else 2218 else
2219 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, 2219 ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 8614ff069d99..3494a62ed749 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,7 +78,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
78/* 78/*
79 * Using a named enum representing lock types in terms of #N bit stored in 79 * Using a named enum representing lock types in terms of #N bit stored in
80 * iocb->private, which is going to be used for communication between 80 * iocb->private, which is going to be used for communication between
81 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). 81 * ocfs2_dio_end_io() and ocfs2_file_write/read_iter().
82 */ 82 */
83enum ocfs2_iocb_lock_bits { 83enum ocfs2_iocb_lock_bits {
84 OCFS2_IOCB_RW_LOCK = 0, 84 OCFS2_IOCB_RW_LOCK = 0,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index ea8c551bcd7e..91a8889abf9b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -570,7 +570,16 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
570 current_page, vec_len, vec_start); 570 current_page, vec_len, vec_start);
571 571
572 len = bio_add_page(bio, page, vec_len, vec_start); 572 len = bio_add_page(bio, page, vec_len, vec_start);
573 if (len != vec_len) break; 573 if (len != vec_len) {
574 mlog(ML_ERROR, "Adding page[%d] to bio failed, "
575 "page %p, len %d, vec_len %u, vec_start %u, "
576 "bi_sector %llu\n", current_page, page, len,
577 vec_len, vec_start,
578 (unsigned long long)bio->bi_iter.bi_sector);
579 bio_put(bio);
580 bio = ERR_PTR(-EIO);
581 return bio;
582 }
574 583
575 cs += vec_len / (PAGE_SIZE/spp); 584 cs += vec_len / (PAGE_SIZE/spp);
576 vec_start = 0; 585 vec_start = 0;
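A userspace rendition of the error-pointer convention the heartbeat fix adopts: when bio_add_page() comes up short, release the bio and hand the caller an encoded error instead of a silently truncated bio. ERR_PTR, IS_ERR and setup_one_bio() below are simplified stand-ins, not the kernel definitions.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095
static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p) { return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO; }

static int fake_bio;				/* stand-in for a real bio */

static void *setup_one_bio(int add_len, int want_len)
{
	if (add_len != want_len)		/* bio_add_page() came up short */
		return ERR_PTR(-EIO);		/* report, don't truncate */
	return &fake_bio;
}

int main(void)
{
	void *bio = setup_one_bio(0, 4096);

	if (IS_ERR(bio))
		printf("bio setup failed: %ld\n", PTR_ERR(bio));
	return 0;
}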
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 977763d4c27d..b048d4fa3959 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3072,7 +3072,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
3072 * We need to return the correct block within the 3072 * We need to return the correct block within the
3073 * cluster which should hold our entry. 3073 * cluster which should hold our entry.
3074 */ 3074 */
3075 off = ocfs2_dx_dir_hash_idx(OCFS2_SB(dir->i_sb), 3075 off = ocfs2_dx_dir_hash_idx(osb,
3076 &lookup->dl_hinfo); 3076 &lookup->dl_hinfo);
3077 get_bh(dx_leaves[off]); 3077 get_bh(dx_leaves[off]);
3078 lookup->dl_dx_leaf_bh = dx_leaves[off]; 3078 lookup->dl_dx_leaf_bh = dx_leaves[off];
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index fd6bbbbd7d78..39831fc2fd52 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -224,14 +224,12 @@ void dlm_do_local_ast(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
224 struct dlm_lock *lock) 224 struct dlm_lock *lock)
225{ 225{
226 dlm_astlockfunc_t *fn; 226 dlm_astlockfunc_t *fn;
227 struct dlm_lockstatus *lksb;
228 227
229 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name, 228 mlog(0, "%s: res %.*s, lock %u:%llu, Local AST\n", dlm->name,
230 res->lockname.len, res->lockname.name, 229 res->lockname.len, res->lockname.name,
231 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), 230 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
232 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie))); 231 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
233 232
234 lksb = lock->lksb;
235 fn = lock->ast; 233 fn = lock->ast;
236 BUG_ON(lock->ml.node != dlm->node_num); 234 BUG_ON(lock->ml.node != dlm->node_num);
237 235
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e9f3705c4c9f..d06e27ec4be4 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
140 u8 node_num; 140 u8 node_num;
141 u32 key; 141 u32 key;
142 u8 joining_node; 142 u8 joining_node;
143 u8 migrate_done; /* set to 1 once the node has migrated all lock resources */
143 wait_queue_head_t dlm_join_events; 144 wait_queue_head_t dlm_join_events;
144 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 145 unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
145 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 146 unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -960,13 +961,10 @@ static inline int dlm_send_proxy_ast(struct dlm_ctxt *dlm,
960void dlm_print_one_lock_resource(struct dlm_lock_resource *res); 961void dlm_print_one_lock_resource(struct dlm_lock_resource *res);
961void __dlm_print_one_lock_resource(struct dlm_lock_resource *res); 962void __dlm_print_one_lock_resource(struct dlm_lock_resource *res);
962 963
963u8 dlm_nm_this_node(struct dlm_ctxt *dlm);
964void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); 964void dlm_kick_thread(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
965void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); 965void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
966 966
967 967
968int dlm_nm_init(struct dlm_ctxt *dlm);
969int dlm_heartbeat_init(struct dlm_ctxt *dlm);
970void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data); 968void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data);
971void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data); 969void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data);
972 970
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index e1fea149f50b..425081be6161 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -461,6 +461,19 @@ redo_bucket:
461 cond_resched_lock(&dlm->spinlock); 461 cond_resched_lock(&dlm->spinlock);
462 num += n; 462 num += n;
463 } 463 }
464
465 if (!num) {
466 if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
467 mlog(0, "%s: perhaps there are more lock resources "
468 "that need to be migrated after dlm recovery\n", dlm->name);
469 ret = -EAGAIN;
470 } else {
471 mlog(0, "%s: we won't do dlm recovery after migrating "
472 "all lock resources\n", dlm->name);
473 dlm->migrate_done = 1;
474 }
475 }
476
464 spin_unlock(&dlm->spinlock); 477 spin_unlock(&dlm->spinlock);
465 wake_up(&dlm->dlm_thread_wq); 478 wake_up(&dlm->dlm_thread_wq);
466 479
@@ -675,20 +688,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
675 spin_unlock(&dlm->spinlock); 688 spin_unlock(&dlm->spinlock);
676} 689}
677 690
678int dlm_shutting_down(struct dlm_ctxt *dlm)
679{
680 int ret = 0;
681
682 spin_lock(&dlm_domain_lock);
683
684 if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
685 ret = 1;
686
687 spin_unlock(&dlm_domain_lock);
688
689 return ret;
690}
691
692void dlm_unregister_domain(struct dlm_ctxt *dlm) 691void dlm_unregister_domain(struct dlm_ctxt *dlm)
693{ 692{
694 int leave = 0; 693 int leave = 0;
@@ -2052,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2052 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN; 2051 dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
2053 init_waitqueue_head(&dlm->dlm_join_events); 2052 init_waitqueue_head(&dlm->dlm_join_events);
2054 2053
2054 dlm->migrate_done = 0;
2055
2055 dlm->reco.new_master = O2NM_INVALID_NODE_NUM; 2056 dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
2056 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; 2057 dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
2057 2058
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index fd6122a38dbd..8a9281411c18 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,30 @@
28extern spinlock_t dlm_domain_lock; 28extern spinlock_t dlm_domain_lock;
29extern struct list_head dlm_domains; 29extern struct list_head dlm_domains;
30 30
31int dlm_shutting_down(struct dlm_ctxt *dlm); 31static inline int dlm_joined(struct dlm_ctxt *dlm)
32{
33 int ret = 0;
34
35 spin_lock(&dlm_domain_lock);
36 if (dlm->dlm_state == DLM_CTXT_JOINED)
37 ret = 1;
38 spin_unlock(&dlm_domain_lock);
39
40 return ret;
41}
42
43static inline int dlm_shutting_down(struct dlm_ctxt *dlm)
44{
45 int ret = 0;
46
47 spin_lock(&dlm_domain_lock);
48 if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
49 ret = 1;
50 spin_unlock(&dlm_domain_lock);
51
52 return ret;
53}
54
32void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, 55void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
33 int node_num); 56 int node_num);
34 57
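Sketch of the pattern the new dlm_joined()/dlm_shutting_down() helpers follow: snapshot a shared state field under the lock that protects it and return the snapshot. A pthread mutex stands in for dlm_domain_lock; the names below are illustrative only.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

enum ctxt_state { CTXT_NEW, CTXT_JOINED, CTXT_IN_SHUTDOWN };

static pthread_mutex_t domain_lock = PTHREAD_MUTEX_INITIALIZER;
static enum ctxt_state dlm_state = CTXT_NEW;

static bool state_is(enum ctxt_state s)	/* ~ dlm_joined() / dlm_shutting_down() */
{
	bool ret;

	pthread_mutex_lock(&domain_lock);
	ret = (dlm_state == s);
	pthread_mutex_unlock(&domain_lock);
	return ret;
}

int main(void)
{
	pthread_mutex_lock(&domain_lock);
	dlm_state = CTXT_JOINED;
	pthread_mutex_unlock(&domain_lock);

	printf("joined=%d shutting_down=%d\n",
	       state_is(CTXT_JOINED), state_is(CTXT_IN_SHUTDOWN));
	return 0;
}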
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 66c2a491f68d..74962315794e 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -77,8 +77,7 @@ int dlm_init_lock_cache(void)
77 77
78void dlm_destroy_lock_cache(void) 78void dlm_destroy_lock_cache(void)
79{ 79{
80 if (dlm_lock_cache) 80 kmem_cache_destroy(dlm_lock_cache);
81 kmem_cache_destroy(dlm_lock_cache);
82} 81}
83 82
84/* Tell us whether we can grant a new lock request. 83/* Tell us whether we can grant a new lock request.
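A brief sketch of the idiom behind dropping the "if (cache)" guards here and in dlmmaster.c below: like kfree(), kmem_cache_destroy() simply returns for a NULL cache, so the caller-side check is redundant. destroy_cache() is a hypothetical illustration of that NULL-tolerant contract.

#include <stdio.h>
#include <stdlib.h>

struct cache { const char *name; };

static void destroy_cache(struct cache *c)
{
	if (!c)				/* tolerate NULL, as the kernel API does */
		return;
	printf("destroying %s\n", c->name);
	free(c);
}

int main(void)
{
	struct cache *never_created = NULL;

	destroy_cache(never_created);	/* safe without an if (...) guard */
	return 0;
}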
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a7df226f9449..aaca0949fe53 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -414,8 +414,7 @@ int dlm_init_mle_cache(void)
414 414
415void dlm_destroy_mle_cache(void) 415void dlm_destroy_mle_cache(void)
416{ 416{
417 if (dlm_mle_cache) 417 kmem_cache_destroy(dlm_mle_cache);
418 kmem_cache_destroy(dlm_mle_cache);
419} 418}
420 419
421static void dlm_mle_release(struct kref *kref) 420static void dlm_mle_release(struct kref *kref)
@@ -472,15 +471,11 @@ bail:
472 471
473void dlm_destroy_master_caches(void) 472void dlm_destroy_master_caches(void)
474{ 473{
475 if (dlm_lockname_cache) { 474 kmem_cache_destroy(dlm_lockname_cache);
476 kmem_cache_destroy(dlm_lockname_cache); 475 dlm_lockname_cache = NULL;
477 dlm_lockname_cache = NULL;
478 }
479 476
480 if (dlm_lockres_cache) { 477 kmem_cache_destroy(dlm_lockres_cache);
481 kmem_cache_destroy(dlm_lockres_cache); 478 dlm_lockres_cache = NULL;
482 dlm_lockres_cache = NULL;
483 }
484} 479}
485 480
486static void dlm_lockres_release(struct kref *kref) 481static void dlm_lockres_release(struct kref *kref)
@@ -2495,13 +2490,13 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2495} 2490}
2496 2491
2497/* 2492/*
2498 * A migrateable resource is one that is : 2493 * A migratable resource is one that is :
2499 * 1. locally mastered, and, 2494 * 1. locally mastered, and,
2500 * 2. zero local locks, and, 2495 * 2. zero local locks, and,
2501 * 3. one or more non-local locks, or, one or more references 2496 * 3. one or more non-local locks, or, one or more references
2502 * Returns 1 if yes, 0 if not. 2497 * Returns 1 if yes, 0 if not.
2503 */ 2498 */
2504static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm, 2499static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
2505 struct dlm_lock_resource *res) 2500 struct dlm_lock_resource *res)
2506{ 2501{
2507 enum dlm_lockres_list idx; 2502 enum dlm_lockres_list idx;
@@ -2532,7 +2527,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2532 continue; 2527 continue;
2533 } 2528 }
2534 cookie = be64_to_cpu(lock->ml.cookie); 2529 cookie = be64_to_cpu(lock->ml.cookie);
2535 mlog(0, "%s: Not migrateable res %.*s, lock %u:%llu on " 2530 mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
2536 "%s list\n", dlm->name, res->lockname.len, 2531 "%s list\n", dlm->name, res->lockname.len,
2537 res->lockname.name, 2532 res->lockname.name,
2538 dlm_get_lock_cookie_node(cookie), 2533 dlm_get_lock_cookie_node(cookie),
@@ -2548,7 +2543,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2548 return 0; 2543 return 0;
2549 } 2544 }
2550 2545
2551 mlog(0, "%s: res %.*s, Migrateable\n", dlm->name, res->lockname.len, 2546 mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len,
2552 res->lockname.name); 2547 res->lockname.name);
2553 2548
2554 return 1; 2549 return 1;
@@ -2792,7 +2787,7 @@ int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2792 assert_spin_locked(&dlm->spinlock); 2787 assert_spin_locked(&dlm->spinlock);
2793 2788
2794 spin_lock(&res->spinlock); 2789 spin_lock(&res->spinlock);
2795 if (dlm_is_lockres_migrateable(dlm, res)) 2790 if (dlm_is_lockres_migratable(dlm, res))
2796 target = dlm_pick_migration_target(dlm, res); 2791 target = dlm_pick_migration_target(dlm, res);
2797 spin_unlock(&res->spinlock); 2792 spin_unlock(&res->spinlock);
2798 2793
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ec8f75813beb..802636d50365 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -62,7 +62,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node);
62static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); 62static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node);
63static int dlm_request_all_locks(struct dlm_ctxt *dlm, 63static int dlm_request_all_locks(struct dlm_ctxt *dlm,
64 u8 request_from, u8 dead_node); 64 u8 request_from, u8 dead_node);
65static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node); 65static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm);
66 66
67static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res); 67static inline int dlm_num_locks_in_lockres(struct dlm_lock_resource *res);
68static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres, 68static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
@@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
423 423
424static void dlm_begin_recovery(struct dlm_ctxt *dlm) 424static void dlm_begin_recovery(struct dlm_ctxt *dlm)
425{ 425{
426 spin_lock(&dlm->spinlock); 426 assert_spin_locked(&dlm->spinlock);
427 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 427 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
428 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n", 428 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
429 dlm->name, dlm->reco.dead_node); 429 dlm->name, dlm->reco.dead_node);
430 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 430 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
431 spin_unlock(&dlm->spinlock);
432} 431}
433 432
434static void dlm_end_recovery(struct dlm_ctxt *dlm) 433static void dlm_end_recovery(struct dlm_ctxt *dlm)
@@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
456 455
457 spin_lock(&dlm->spinlock); 456 spin_lock(&dlm->spinlock);
458 457
458 if (dlm->migrate_done) {
459 mlog(0, "%s: no need to do recovery after migrating all "
460 "lock resources\n", dlm->name);
461 spin_unlock(&dlm->spinlock);
462 return 0;
463 }
464
459 /* check to see if the new master has died */ 465 /* check to see if the new master has died */
460 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM && 466 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
461 test_bit(dlm->reco.new_master, dlm->recovery_map)) { 467 test_bit(dlm->reco.new_master, dlm->recovery_map)) {
@@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
490 mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", 496 mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
491 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), 497 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
492 dlm->reco.dead_node); 498 dlm->reco.dead_node);
493 spin_unlock(&dlm->spinlock);
494 499
495 /* take write barrier */ 500 /* take write barrier */
496 /* (stops the list reshuffling thread, proxy ast handling) */ 501 /* (stops the list reshuffling thread, proxy ast handling) */
497 dlm_begin_recovery(dlm); 502 dlm_begin_recovery(dlm);
498 503
504 spin_unlock(&dlm->spinlock);
505
499 if (dlm->reco.new_master == dlm->node_num) 506 if (dlm->reco.new_master == dlm->node_num)
500 goto master_here; 507 goto master_here;
501 508
@@ -739,7 +746,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
739 } 746 }
740 747
741 if (destroy) 748 if (destroy)
742 dlm_destroy_recovery_area(dlm, dead_node); 749 dlm_destroy_recovery_area(dlm);
743 750
744 return status; 751 return status;
745} 752}
@@ -764,7 +771,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
764 771
765 ndata = kzalloc(sizeof(*ndata), GFP_NOFS); 772 ndata = kzalloc(sizeof(*ndata), GFP_NOFS);
766 if (!ndata) { 773 if (!ndata) {
767 dlm_destroy_recovery_area(dlm, dead_node); 774 dlm_destroy_recovery_area(dlm);
768 return -ENOMEM; 775 return -ENOMEM;
769 } 776 }
770 ndata->node_num = num; 777 ndata->node_num = num;
@@ -778,7 +785,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
778 return 0; 785 return 0;
779} 786}
780 787
781static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) 788static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm)
782{ 789{
783 struct dlm_reco_node_data *ndata, *next; 790 struct dlm_reco_node_data *ndata, *next;
784 LIST_HEAD(tmplist); 791 LIST_HEAD(tmplist);
@@ -1378,6 +1385,15 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
1378 if (!dlm_grab(dlm)) 1385 if (!dlm_grab(dlm))
1379 return -EINVAL; 1386 return -EINVAL;
1380 1387
1388 if (!dlm_joined(dlm)) {
1389 mlog(ML_ERROR, "Domain %s not joined! "
1390 "lockres %.*s, master %u\n",
1391 dlm->name, mres->lockname_len,
1392 mres->lockname, mres->master);
1393 dlm_put(dlm);
1394 return -EINVAL;
1395 }
1396
1381 BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION))); 1397 BUG_ON(!(mres->flags & (DLM_MRES_RECOVERY|DLM_MRES_MIGRATION)));
1382 1398
1383 real_master = mres->master; 1399 real_master = mres->master;
@@ -1807,7 +1823,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1807 int i, j, bad; 1823 int i, j, bad;
1808 struct dlm_lock *lock; 1824 struct dlm_lock *lock;
1809 u8 from = O2NM_MAX_NODES; 1825 u8 from = O2NM_MAX_NODES;
1810 unsigned int added = 0;
1811 __be64 c; 1826 __be64 c;
1812 1827
1813 mlog(0, "running %d locks for this lockres\n", mres->num_locks); 1828 mlog(0, "running %d locks for this lockres\n", mres->num_locks);
@@ -1823,7 +1838,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1823 spin_lock(&res->spinlock); 1838 spin_lock(&res->spinlock);
1824 dlm_lockres_set_refmap_bit(dlm, res, from); 1839 dlm_lockres_set_refmap_bit(dlm, res, from);
1825 spin_unlock(&res->spinlock); 1840 spin_unlock(&res->spinlock);
1826 added++;
1827 break; 1841 break;
1828 } 1842 }
1829 BUG_ON(ml->highest_blocked != LKM_IVMODE); 1843 BUG_ON(ml->highest_blocked != LKM_IVMODE);
@@ -1911,7 +1925,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1911 /* do not alter lock refcount. switching lists. */ 1925 /* do not alter lock refcount. switching lists. */
1912 list_move_tail(&lock->list, queue); 1926 list_move_tail(&lock->list, queue);
1913 spin_unlock(&res->spinlock); 1927 spin_unlock(&res->spinlock);
1914 added++;
1915 1928
1916 mlog(0, "just reordered a local lock!\n"); 1929 mlog(0, "just reordered a local lock!\n");
1917 continue; 1930 continue;
@@ -2037,7 +2050,6 @@ skip_lvb:
2037 "setting refmap bit\n", dlm->name, 2050 "setting refmap bit\n", dlm->name,
2038 res->lockname.len, res->lockname.name, ml->node); 2051 res->lockname.len, res->lockname.name, ml->node);
2039 dlm_lockres_set_refmap_bit(dlm, res, ml->node); 2052 dlm_lockres_set_refmap_bit(dlm, res, ml->node);
2040 added++;
2041 } 2053 }
2042 spin_unlock(&res->spinlock); 2054 spin_unlock(&res->spinlock);
2043 } 2055 }
@@ -2331,13 +2343,6 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2331 __dlm_dirty_lockres(dlm, res); 2343 __dlm_dirty_lockres(dlm, res);
2332} 2344}
2333 2345
2334/* if this node is the recovery master, and there are no
2335 * locks for a given lockres owned by this node that are in
2336 * either PR or EX mode, zero out the lvb before requesting.
2337 *
2338 */
2339
2340
2341static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) 2346static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2342{ 2347{
2343 struct dlm_lock_resource *res; 2348 struct dlm_lock_resource *res;
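A toy sketch of the locking change around dlm_begin_recovery(): instead of taking and dropping dlm->spinlock itself, the function now asserts that the caller already holds it, so the migrate_done/new_master checks and the DLM_RECO_STATE_ACTIVE transition share one critical section. The boolean lock_held flag is only a stand-in for assert_spin_locked().

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t dlm_lock = PTHREAD_MUTEX_INITIALIZER;
static bool lock_held;
static bool reco_active;

static void begin_recovery(void)
{
	assert(lock_held);		/* caller must already hold dlm_lock */
	reco_active = true;
}

int main(void)
{
	pthread_mutex_lock(&dlm_lock);
	lock_held = true;
	/* ... checks that must be atomic with the state change go here ... */
	begin_recovery();
	lock_held = false;
	pthread_mutex_unlock(&dlm_lock);

	return reco_active ? 0 : 1;
}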
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b552d1f8508c..97a972efab83 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1756,8 +1756,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1756 1756
1757 level = write ? DLM_LOCK_EX : DLM_LOCK_PR; 1757 level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1758 1758
1759 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, 1759 status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1760 0);
1761 if (status < 0) 1760 if (status < 0)
1762 mlog_errno(status); 1761 mlog_errno(status);
1763 1762
@@ -1796,7 +1795,7 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
1796 write ? "EXMODE" : "PRMODE"); 1795 write ? "EXMODE" : "PRMODE");
1797 1796
1798 if (!ocfs2_mount_local(osb)) 1797 if (!ocfs2_mount_local(osb))
1799 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); 1798 ocfs2_cluster_unlock(osb, lockres, level);
1800} 1799}
1801 1800
1802/* 1801/*
@@ -1816,8 +1815,7 @@ int ocfs2_open_lock(struct inode *inode)
1816 1815
1817 lockres = &OCFS2_I(inode)->ip_open_lockres; 1816 lockres = &OCFS2_I(inode)->ip_open_lockres;
1818 1817
1819 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1818 status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_PR, 0, 0);
1820 DLM_LOCK_PR, 0, 0);
1821 if (status < 0) 1819 if (status < 0)
1822 mlog_errno(status); 1820 mlog_errno(status);
1823 1821
@@ -1854,8 +1852,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1854 * other nodes and the -EAGAIN will indicate to the caller that 1852 * other nodes and the -EAGAIN will indicate to the caller that
1855 * this inode is still in use. 1853 * this inode is still in use.
1856 */ 1854 */
1857 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1855 status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0);
1858 level, DLM_LKF_NOQUEUE, 0);
1859 1856
1860out: 1857out:
1861 return status; 1858 return status;
@@ -1876,11 +1873,9 @@ void ocfs2_open_unlock(struct inode *inode)
1876 goto out; 1873 goto out;
1877 1874
1878 if(lockres->l_ro_holders) 1875 if(lockres->l_ro_holders)
1879 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1876 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_PR);
1880 DLM_LOCK_PR);
1881 if(lockres->l_ex_holders) 1877 if(lockres->l_ex_holders)
1882 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1878 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
1883 DLM_LOCK_EX);
1884 1879
1885out: 1880out:
1886 return; 1881 return;
@@ -2601,9 +2596,9 @@ void ocfs2_inode_unlock(struct inode *inode,
2601 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2596 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2602 ex ? "EXMODE" : "PRMODE"); 2597 ex ? "EXMODE" : "PRMODE");
2603 2598
2604 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && 2599 if (!ocfs2_is_hard_readonly(osb) &&
2605 !ocfs2_mount_local(osb)) 2600 !ocfs2_mount_local(osb))
2606 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); 2601 ocfs2_cluster_unlock(osb, lockres, level);
2607} 2602}
2608 2603
2609/* 2604/*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5d1784a365a3..6ee94bc23f5b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -101,7 +101,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
101 struct ocfs2_inode_info *oi = OCFS2_I(inode); 101 struct ocfs2_inode_info *oi = OCFS2_I(inode);
102 102
103 trace_ocfs2_file_open(inode, file, file->f_path.dentry, 103 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
104 (unsigned long long)OCFS2_I(inode)->ip_blkno, 104 (unsigned long long)oi->ip_blkno,
105 file->f_path.dentry->d_name.len, 105 file->f_path.dentry->d_name.len,
106 file->f_path.dentry->d_name.name, mode); 106 file->f_path.dentry->d_name.name, mode);
107 107
@@ -116,7 +116,7 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
116 /* Check that the inode hasn't been wiped from disk by another 116 /* Check that the inode hasn't been wiped from disk by another
117 * node. If it hasn't then we're safe as long as we hold the 117 * node. If it hasn't then we're safe as long as we hold the
118 * spin lock until our increment of open count. */ 118 * spin lock until our increment of open count. */
119 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 119 if (oi->ip_flags & OCFS2_INODE_DELETED) {
120 spin_unlock(&oi->ip_lock); 120 spin_unlock(&oi->ip_lock);
121 121
122 status = -ENOENT; 122 status = -ENOENT;
@@ -190,7 +190,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
190 bool needs_barrier = false; 190 bool needs_barrier = false;
191 191
192 trace_ocfs2_sync_file(inode, file, file->f_path.dentry, 192 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
193 OCFS2_I(inode)->ip_blkno, 193 oi->ip_blkno,
194 file->f_path.dentry->d_name.len, 194 file->f_path.dentry->d_name.len,
195 file->f_path.dentry->d_name.name, 195 file->f_path.dentry->d_name.name,
196 (unsigned long long)datasync); 196 (unsigned long long)datasync);
@@ -296,7 +296,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
296 ocfs2_journal_dirty(handle, bh); 296 ocfs2_journal_dirty(handle, bh);
297 297
298out_commit: 298out_commit:
299 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 299 ocfs2_commit_trans(osb, handle);
300out: 300out:
301 return ret; 301 return ret;
302} 302}
@@ -2257,7 +2257,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
2257 int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; 2257 int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2258 int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; 2258 int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2259 2259
2260 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, 2260 trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
2261 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2261 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2262 file->f_path.dentry->d_name.len, 2262 file->f_path.dentry->d_name.len,
2263 file->f_path.dentry->d_name.name, 2263 file->f_path.dentry->d_name.name,
@@ -2405,7 +2405,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2405 int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; 2405 int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
2406 int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; 2406 int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0;
2407 2407
2408 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, 2408 trace_ocfs2_file_read_iter(inode, filp, filp->f_path.dentry,
2409 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2409 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2410 filp->f_path.dentry->d_name.len, 2410 filp->f_path.dentry->d_name.len,
2411 filp->f_path.dentry->d_name.name, 2411 filp->f_path.dentry->d_name.name,
@@ -2448,7 +2448,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2448 * 2448 *
2449 * Take and drop the meta data lock to update inode fields 2449 * Take and drop the meta data lock to update inode fields
2450 * like i_size. This allows the checks down below 2450 * like i_size. This allows the checks down below
2451 * generic_file_aio_read() a chance of actually working. 2451 * generic_file_read_iter() a chance of actually working.
2452 */ 2452 */
2453 ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, 2453 ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level,
2454 !nowait); 2454 !nowait);
@@ -2460,7 +2460,7 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
2460 ocfs2_inode_unlock(inode, lock_level); 2460 ocfs2_inode_unlock(inode, lock_level);
2461 2461
2462 ret = generic_file_read_iter(iocb, to); 2462 ret = generic_file_read_iter(iocb, to);
2463 trace_generic_file_aio_read_ret(ret); 2463 trace_generic_file_read_iter_ret(ret);
2464 2464
2465 /* buffered aio wouldn't have proper lock coverage today */ 2465 /* buffered aio wouldn't have proper lock coverage today */
2466 BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); 2466 BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT));
diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c
index 6b92cb241138..f65f2b2f594d 100644
--- a/fs/ocfs2/filecheck.c
+++ b/fs/ocfs2/filecheck.c
@@ -53,36 +53,6 @@ static const char * const ocfs2_filecheck_errs[] = {
53 "UNSUPPORTED" 53 "UNSUPPORTED"
54}; 54};
55 55
56static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock);
57static LIST_HEAD(ocfs2_filecheck_sysfs_list);
58
59struct ocfs2_filecheck {
60 struct list_head fc_head; /* File check entry list head */
61 spinlock_t fc_lock;
62 unsigned int fc_max; /* Maximum number of entry in list */
63 unsigned int fc_size; /* Current entry count in list */
64 unsigned int fc_done; /* Finished entry count in list */
65};
66
67struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per mounting */
68 struct list_head fs_list;
69 atomic_t fs_count;
70 struct super_block *fs_sb;
71 struct kset *fs_devicekset;
72 struct kset *fs_fcheckkset;
73 struct ocfs2_filecheck *fs_fcheck;
74};
75
76#define OCFS2_FILECHECK_MAXSIZE 100
77#define OCFS2_FILECHECK_MINSIZE 10
78
79/* File check operation type */
80enum {
81 OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
82 OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
83 OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
84};
85
86struct ocfs2_filecheck_entry { 56struct ocfs2_filecheck_entry {
87 struct list_head fe_list; 57 struct list_head fe_list;
88 unsigned long fe_ino; 58 unsigned long fe_ino;
@@ -110,35 +80,84 @@ ocfs2_filecheck_error(int errno)
110 return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; 80 return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1];
111} 81}
112 82
113static ssize_t ocfs2_filecheck_show(struct kobject *kobj, 83static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
114 struct kobj_attribute *attr, 84 struct kobj_attribute *attr,
115 char *buf); 85 char *buf);
116static ssize_t ocfs2_filecheck_store(struct kobject *kobj, 86static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
117 struct kobj_attribute *attr, 87 struct kobj_attribute *attr,
118 const char *buf, size_t count); 88 const char *buf, size_t count);
119static struct kobj_attribute ocfs2_attr_filecheck_chk = 89static struct kobj_attribute ocfs2_filecheck_attr_chk =
120 __ATTR(check, S_IRUSR | S_IWUSR, 90 __ATTR(check, S_IRUSR | S_IWUSR,
121 ocfs2_filecheck_show, 91 ocfs2_filecheck_attr_show,
122 ocfs2_filecheck_store); 92 ocfs2_filecheck_attr_store);
123static struct kobj_attribute ocfs2_attr_filecheck_fix = 93static struct kobj_attribute ocfs2_filecheck_attr_fix =
124 __ATTR(fix, S_IRUSR | S_IWUSR, 94 __ATTR(fix, S_IRUSR | S_IWUSR,
125 ocfs2_filecheck_show, 95 ocfs2_filecheck_attr_show,
126 ocfs2_filecheck_store); 96 ocfs2_filecheck_attr_store);
127static struct kobj_attribute ocfs2_attr_filecheck_set = 97static struct kobj_attribute ocfs2_filecheck_attr_set =
128 __ATTR(set, S_IRUSR | S_IWUSR, 98 __ATTR(set, S_IRUSR | S_IWUSR,
129 ocfs2_filecheck_show, 99 ocfs2_filecheck_attr_show,
130 ocfs2_filecheck_store); 100 ocfs2_filecheck_attr_store);
101static struct attribute *ocfs2_filecheck_attrs[] = {
102 &ocfs2_filecheck_attr_chk.attr,
103 &ocfs2_filecheck_attr_fix.attr,
104 &ocfs2_filecheck_attr_set.attr,
105 NULL
106};
107
108static void ocfs2_filecheck_release(struct kobject *kobj)
109{
110 struct ocfs2_filecheck_sysfs_entry *entry = container_of(kobj,
111 struct ocfs2_filecheck_sysfs_entry, fs_kobj);
112
113 complete(&entry->fs_kobj_unregister);
114}
115
116static ssize_t
117ocfs2_filecheck_show(struct kobject *kobj, struct attribute *attr, char *buf)
118{
119 ssize_t ret = -EIO;
120 struct kobj_attribute *kattr = container_of(attr,
121 struct kobj_attribute, attr);
122
123 kobject_get(kobj);
124 if (kattr->show)
125 ret = kattr->show(kobj, kattr, buf);
126 kobject_put(kobj);
127 return ret;
128}
129
130static ssize_t
131ocfs2_filecheck_store(struct kobject *kobj, struct attribute *attr,
132 const char *buf, size_t count)
133{
134 ssize_t ret = -EIO;
135 struct kobj_attribute *kattr = container_of(attr,
136 struct kobj_attribute, attr);
137
138 kobject_get(kobj);
139 if (kattr->store)
140 ret = kattr->store(kobj, kattr, buf, count);
141 kobject_put(kobj);
142 return ret;
143}
144
145static const struct sysfs_ops ocfs2_filecheck_ops = {
146 .show = ocfs2_filecheck_show,
147 .store = ocfs2_filecheck_store,
148};
149
150static struct kobj_type ocfs2_ktype_filecheck = {
151 .default_attrs = ocfs2_filecheck_attrs,
152 .sysfs_ops = &ocfs2_filecheck_ops,
153 .release = ocfs2_filecheck_release,
154};
131 155
132static void 156static void
133ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) 157ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
134{ 158{
135 struct ocfs2_filecheck_entry *p; 159 struct ocfs2_filecheck_entry *p;
136 160
137 if (!atomic_dec_and_test(&entry->fs_count)) {
138 wait_var_event(&entry->fs_count,
139 !atomic_read(&entry->fs_count));
140 }
141
142 spin_lock(&entry->fs_fcheck->fc_lock); 161 spin_lock(&entry->fs_fcheck->fc_lock);
143 while (!list_empty(&entry->fs_fcheck->fc_head)) { 162 while (!list_empty(&entry->fs_fcheck->fc_head)) {
144 p = list_first_entry(&entry->fs_fcheck->fc_head, 163 p = list_first_entry(&entry->fs_fcheck->fc_head,
@@ -149,151 +168,48 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry)
149 } 168 }
150 spin_unlock(&entry->fs_fcheck->fc_lock); 169 spin_unlock(&entry->fs_fcheck->fc_lock);
151 170
152 kset_unregister(entry->fs_fcheckkset);
153 kset_unregister(entry->fs_devicekset);
154 kfree(entry->fs_fcheck); 171 kfree(entry->fs_fcheck);
155 kfree(entry); 172 entry->fs_fcheck = NULL;
156}
157
158static void
159ocfs2_filecheck_sysfs_add(struct ocfs2_filecheck_sysfs_entry *entry)
160{
161 spin_lock(&ocfs2_filecheck_sysfs_lock);
162 list_add_tail(&entry->fs_list, &ocfs2_filecheck_sysfs_list);
163 spin_unlock(&ocfs2_filecheck_sysfs_lock);
164} 173}
165 174
166static int ocfs2_filecheck_sysfs_del(const char *devname) 175int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb)
167{ 176{
168 struct ocfs2_filecheck_sysfs_entry *p; 177 int ret;
169 178 struct ocfs2_filecheck *fcheck;
170 spin_lock(&ocfs2_filecheck_sysfs_lock); 179 struct ocfs2_filecheck_sysfs_entry *entry = &osb->osb_fc_ent;
171 list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
172 if (!strcmp(p->fs_sb->s_id, devname)) {
173 list_del(&p->fs_list);
174 spin_unlock(&ocfs2_filecheck_sysfs_lock);
175 ocfs2_filecheck_sysfs_free(p);
176 return 0;
177 }
178 }
179 spin_unlock(&ocfs2_filecheck_sysfs_lock);
180 return 1;
181}
182
183static void
184ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry)
185{
186 if (atomic_dec_and_test(&entry->fs_count))
187 wake_up_var(&entry->fs_count);
188}
189
190static struct ocfs2_filecheck_sysfs_entry *
191ocfs2_filecheck_sysfs_get(const char *devname)
192{
193 struct ocfs2_filecheck_sysfs_entry *p = NULL;
194
195 spin_lock(&ocfs2_filecheck_sysfs_lock);
196 list_for_each_entry(p, &ocfs2_filecheck_sysfs_list, fs_list) {
197 if (!strcmp(p->fs_sb->s_id, devname)) {
198 atomic_inc(&p->fs_count);
199 spin_unlock(&ocfs2_filecheck_sysfs_lock);
200 return p;
201 }
202 }
203 spin_unlock(&ocfs2_filecheck_sysfs_lock);
204 return NULL;
205}
206
207int ocfs2_filecheck_create_sysfs(struct super_block *sb)
208{
209 int ret = 0;
210 struct kset *device_kset = NULL;
211 struct kset *fcheck_kset = NULL;
212 struct ocfs2_filecheck *fcheck = NULL;
213 struct ocfs2_filecheck_sysfs_entry *entry = NULL;
214 struct attribute **attrs = NULL;
215 struct attribute_group attrgp;
216
217 if (!ocfs2_kset)
218 return -ENOMEM;
219
220 attrs = kmalloc(sizeof(struct attribute *) * 4, GFP_NOFS);
221 if (!attrs) {
222 ret = -ENOMEM;
223 goto error;
224 } else {
225 attrs[0] = &ocfs2_attr_filecheck_chk.attr;
226 attrs[1] = &ocfs2_attr_filecheck_fix.attr;
227 attrs[2] = &ocfs2_attr_filecheck_set.attr;
228 attrs[3] = NULL;
229 memset(&attrgp, 0, sizeof(attrgp));
230 attrgp.attrs = attrs;
231 }
232 180
233 fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS); 181 fcheck = kmalloc(sizeof(struct ocfs2_filecheck), GFP_NOFS);
234 if (!fcheck) { 182 if (!fcheck)
235 ret = -ENOMEM; 183 return -ENOMEM;
236 goto error;
237 } else {
238 INIT_LIST_HEAD(&fcheck->fc_head);
239 spin_lock_init(&fcheck->fc_lock);
240 fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
241 fcheck->fc_size = 0;
242 fcheck->fc_done = 0;
243 }
244
245 if (strlen(sb->s_id) <= 0) {
246 mlog(ML_ERROR,
247 "Cannot get device basename when create filecheck sysfs\n");
248 ret = -ENODEV;
249 goto error;
250 }
251
252 device_kset = kset_create_and_add(sb->s_id, NULL, &ocfs2_kset->kobj);
253 if (!device_kset) {
254 ret = -ENOMEM;
255 goto error;
256 }
257
258 fcheck_kset = kset_create_and_add("filecheck", NULL,
259 &device_kset->kobj);
260 if (!fcheck_kset) {
261 ret = -ENOMEM;
262 goto error;
263 }
264
265 ret = sysfs_create_group(&fcheck_kset->kobj, &attrgp);
266 if (ret)
267 goto error;
268 184
269 entry = kmalloc(sizeof(struct ocfs2_filecheck_sysfs_entry), GFP_NOFS); 185 INIT_LIST_HEAD(&fcheck->fc_head);
270 if (!entry) { 186 spin_lock_init(&fcheck->fc_lock);
271 ret = -ENOMEM; 187 fcheck->fc_max = OCFS2_FILECHECK_MINSIZE;
272 goto error; 188 fcheck->fc_size = 0;
273 } else { 189 fcheck->fc_done = 0;
274 atomic_set(&entry->fs_count, 1); 190
275 entry->fs_sb = sb; 191 entry->fs_kobj.kset = osb->osb_dev_kset;
276 entry->fs_devicekset = device_kset; 192 init_completion(&entry->fs_kobj_unregister);
277 entry->fs_fcheckkset = fcheck_kset; 193 ret = kobject_init_and_add(&entry->fs_kobj, &ocfs2_ktype_filecheck,
278 entry->fs_fcheck = fcheck; 194 NULL, "filecheck");
279 ocfs2_filecheck_sysfs_add(entry); 195 if (ret) {
196 kfree(fcheck);
197 return ret;
280 } 198 }
281 199
282 kfree(attrs); 200 entry->fs_fcheck = fcheck;
283 return 0; 201 return 0;
284
285error:
286 kfree(attrs);
287 kfree(entry);
288 kfree(fcheck);
289 kset_unregister(fcheck_kset);
290 kset_unregister(device_kset);
291 return ret;
292} 202}
293 203
294int ocfs2_filecheck_remove_sysfs(struct super_block *sb) 204void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb)
295{ 205{
296 return ocfs2_filecheck_sysfs_del(sb->s_id); 206 if (!osb->osb_fc_ent.fs_fcheck)
207 return;
208
209 kobject_del(&osb->osb_fc_ent.fs_kobj);
210 kobject_put(&osb->osb_fc_ent.fs_kobj);
211 wait_for_completion(&osb->osb_fc_ent.fs_kobj_unregister);
212 ocfs2_filecheck_sysfs_free(&osb->osb_fc_ent);
297} 213}
298 214
299static int 215static int
@@ -310,7 +226,7 @@ ocfs2_filecheck_adjust_max(struct ocfs2_filecheck_sysfs_entry *ent,
310 226
311 spin_lock(&ent->fs_fcheck->fc_lock); 227 spin_lock(&ent->fs_fcheck->fc_lock);
312 if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) { 228 if (len < (ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done)) {
313 mlog(ML_ERROR, 229 mlog(ML_NOTICE,
314 "Cannot set online file check maximum entry number " 230 "Cannot set online file check maximum entry number "
315 "to %u due to too many pending entries(%u)\n", 231 "to %u due to too many pending entries(%u)\n",
316 len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done); 232 len, ent->fs_fcheck->fc_size - ent->fs_fcheck->fc_done);
@@ -387,7 +303,7 @@ ocfs2_filecheck_args_parse(const char *name, const char *buf, size_t count,
387 return 0; 303 return 0;
388} 304}
389 305
390static ssize_t ocfs2_filecheck_show(struct kobject *kobj, 306static ssize_t ocfs2_filecheck_attr_show(struct kobject *kobj,
391 struct kobj_attribute *attr, 307 struct kobj_attribute *attr,
392 char *buf) 308 char *buf)
393{ 309{
@@ -395,19 +311,12 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
395 ssize_t ret = 0, total = 0, remain = PAGE_SIZE; 311 ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
396 unsigned int type; 312 unsigned int type;
397 struct ocfs2_filecheck_entry *p; 313 struct ocfs2_filecheck_entry *p;
398 struct ocfs2_filecheck_sysfs_entry *ent; 314 struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
315 struct ocfs2_filecheck_sysfs_entry, fs_kobj);
399 316
400 if (ocfs2_filecheck_type_parse(attr->attr.name, &type)) 317 if (ocfs2_filecheck_type_parse(attr->attr.name, &type))
401 return -EINVAL; 318 return -EINVAL;
402 319
403 ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
404 if (!ent) {
405 mlog(ML_ERROR,
406 "Cannot get the corresponding entry via device basename %s\n",
407 kobj->name);
408 return -ENODEV;
409 }
410
411 if (type == OCFS2_FILECHECK_TYPE_SET) { 320 if (type == OCFS2_FILECHECK_TYPE_SET) {
412 spin_lock(&ent->fs_fcheck->fc_lock); 321 spin_lock(&ent->fs_fcheck->fc_lock);
413 total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max); 322 total = snprintf(buf, remain, "%u\n", ent->fs_fcheck->fc_max);
@@ -441,11 +350,26 @@ static ssize_t ocfs2_filecheck_show(struct kobject *kobj,
441 spin_unlock(&ent->fs_fcheck->fc_lock); 350 spin_unlock(&ent->fs_fcheck->fc_lock);
442 351
443exit: 352exit:
444 ocfs2_filecheck_sysfs_put(ent);
445 return total; 353 return total;
446} 354}
447 355
448static int 356static inline int
357ocfs2_filecheck_is_dup_entry(struct ocfs2_filecheck_sysfs_entry *ent,
358 unsigned long ino)
359{
360 struct ocfs2_filecheck_entry *p;
361
362 list_for_each_entry(p, &ent->fs_fcheck->fc_head, fe_list) {
363 if (!p->fe_done) {
364 if (p->fe_ino == ino)
365 return 1;
366 }
367 }
368
369 return 0;
370}
371
372static inline int
449ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent) 373ocfs2_filecheck_erase_entry(struct ocfs2_filecheck_sysfs_entry *ent)
450{ 374{
451 struct ocfs2_filecheck_entry *p; 375 struct ocfs2_filecheck_entry *p;
@@ -484,21 +408,21 @@ static void
484ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent, 408ocfs2_filecheck_done_entry(struct ocfs2_filecheck_sysfs_entry *ent,
485 struct ocfs2_filecheck_entry *entry) 409 struct ocfs2_filecheck_entry *entry)
486{ 410{
487 entry->fe_done = 1;
488 spin_lock(&ent->fs_fcheck->fc_lock); 411 spin_lock(&ent->fs_fcheck->fc_lock);
412 entry->fe_done = 1;
489 ent->fs_fcheck->fc_done++; 413 ent->fs_fcheck->fc_done++;
490 spin_unlock(&ent->fs_fcheck->fc_lock); 414 spin_unlock(&ent->fs_fcheck->fc_lock);
491} 415}
492 416
493static unsigned int 417static unsigned int
494ocfs2_filecheck_handle(struct super_block *sb, 418ocfs2_filecheck_handle(struct ocfs2_super *osb,
495 unsigned long ino, unsigned int flags) 419 unsigned long ino, unsigned int flags)
496{ 420{
497 unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS; 421 unsigned int ret = OCFS2_FILECHECK_ERR_SUCCESS;
498 struct inode *inode = NULL; 422 struct inode *inode = NULL;
499 int rc; 423 int rc;
500 424
501 inode = ocfs2_iget(OCFS2_SB(sb), ino, flags, 0); 425 inode = ocfs2_iget(osb, ino, flags, 0);
502 if (IS_ERR(inode)) { 426 if (IS_ERR(inode)) {
503 rc = (int)(-(long)inode); 427 rc = (int)(-(long)inode);
504 if (rc >= OCFS2_FILECHECK_ERR_START && 428 if (rc >= OCFS2_FILECHECK_ERR_START &&
@@ -516,11 +440,14 @@ static void
516ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent, 440ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
517 struct ocfs2_filecheck_entry *entry) 441 struct ocfs2_filecheck_entry *entry)
518{ 442{
443 struct ocfs2_super *osb = container_of(ent, struct ocfs2_super,
444 osb_fc_ent);
445
519 if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK) 446 if (entry->fe_type == OCFS2_FILECHECK_TYPE_CHK)
520 entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, 447 entry->fe_status = ocfs2_filecheck_handle(osb,
521 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK); 448 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_CHK);
522 else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX) 449 else if (entry->fe_type == OCFS2_FILECHECK_TYPE_FIX)
523 entry->fe_status = ocfs2_filecheck_handle(ent->fs_sb, 450 entry->fe_status = ocfs2_filecheck_handle(osb,
524 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX); 451 entry->fe_ino, OCFS2_FI_FLAG_FILECHECK_FIX);
525 else 452 else
526 entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED; 453 entry->fe_status = OCFS2_FILECHECK_ERR_UNSUPPORTED;
@@ -528,30 +455,21 @@ ocfs2_filecheck_handle_entry(struct ocfs2_filecheck_sysfs_entry *ent,
528 ocfs2_filecheck_done_entry(ent, entry); 455 ocfs2_filecheck_done_entry(ent, entry);
529} 456}
530 457
531static ssize_t ocfs2_filecheck_store(struct kobject *kobj, 458static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj,
532 struct kobj_attribute *attr, 459 struct kobj_attribute *attr,
533 const char *buf, size_t count) 460 const char *buf, size_t count)
534{ 461{
462 ssize_t ret = 0;
535 struct ocfs2_filecheck_args args; 463 struct ocfs2_filecheck_args args;
536 struct ocfs2_filecheck_entry *entry; 464 struct ocfs2_filecheck_entry *entry;
537 struct ocfs2_filecheck_sysfs_entry *ent; 465 struct ocfs2_filecheck_sysfs_entry *ent = container_of(kobj,
538 ssize_t ret = 0; 466 struct ocfs2_filecheck_sysfs_entry, fs_kobj);
539 467
540 if (count == 0) 468 if (count == 0)
541 return count; 469 return count;
542 470
543 if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args)) { 471 if (ocfs2_filecheck_args_parse(attr->attr.name, buf, count, &args))
544 mlog(ML_ERROR, "Invalid arguments for online file check\n");
545 return -EINVAL; 472 return -EINVAL;
546 }
547
548 ent = ocfs2_filecheck_sysfs_get(kobj->parent->name);
549 if (!ent) {
550 mlog(ML_ERROR,
551 "Cannot get the corresponding entry via device basename %s\n",
552 kobj->parent->name);
553 return -ENODEV;
554 }
555 473
556 if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) { 474 if (args.fa_type == OCFS2_FILECHECK_TYPE_SET) {
557 ret = ocfs2_filecheck_adjust_max(ent, args.fa_len); 475 ret = ocfs2_filecheck_adjust_max(ent, args.fa_len);
@@ -565,13 +483,16 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
565 } 483 }
566 484
567 spin_lock(&ent->fs_fcheck->fc_lock); 485 spin_lock(&ent->fs_fcheck->fc_lock);
568 if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && 486 if (ocfs2_filecheck_is_dup_entry(ent, args.fa_ino)) {
569 (ent->fs_fcheck->fc_done == 0)) { 487 ret = -EEXIST;
570 mlog(ML_ERROR, 488 kfree(entry);
489 } else if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
490 (ent->fs_fcheck->fc_done == 0)) {
491 mlog(ML_NOTICE,
571 "Cannot do more file check " 492 "Cannot do more file check "
572 "since file check queue(%u) is full now\n", 493 "since file check queue(%u) is full now\n",
573 ent->fs_fcheck->fc_max); 494 ent->fs_fcheck->fc_max);
574 ret = -EBUSY; 495 ret = -EAGAIN;
575 kfree(entry); 496 kfree(entry);
576 } else { 497 } else {
577 if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) && 498 if ((ent->fs_fcheck->fc_size >= ent->fs_fcheck->fc_max) &&
@@ -596,6 +517,5 @@ static ssize_t ocfs2_filecheck_store(struct kobject *kobj,
596 ocfs2_filecheck_handle_entry(ent, entry); 517 ocfs2_filecheck_handle_entry(ent, entry);
597 518
598exit: 519exit:
599 ocfs2_filecheck_sysfs_put(ent);
600 return (!ret ? count : ret); 520 return (!ret ? count : ret);
601} 521}
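Plain C sketch of the container_of() pattern the reworked filecheck sysfs code relies on: the per-mount entry is recovered from the kobject embedded in it, replacing the old global-list lookup by device name. The structures below are simplified stand-ins for the kernel types.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct kobj { const char *name; };

struct sysfs_entry {
	unsigned int fc_max;
	struct kobj fs_kobj;		/* embedded object handed to callbacks */
};

int main(void)
{
	struct sysfs_entry ent = { .fc_max = 10, .fs_kobj = { "filecheck" } };
	struct kobj *kobj = &ent.fs_kobj;	/* all a show/store callback gets */
	struct sysfs_entry *back = container_of(kobj, struct sysfs_entry, fs_kobj);

	printf("%s fc_max=%u\n", back->fs_kobj.name, back->fc_max);
	return 0;	/* prints "filecheck fc_max=10" */
}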
diff --git a/fs/ocfs2/filecheck.h b/fs/ocfs2/filecheck.h
index e5cd002a2c09..6a22ee79e8d0 100644
--- a/fs/ocfs2/filecheck.h
+++ b/fs/ocfs2/filecheck.h
@@ -43,7 +43,32 @@ enum {
43#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED 43#define OCFS2_FILECHECK_ERR_START OCFS2_FILECHECK_ERR_FAILED
44#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED 44#define OCFS2_FILECHECK_ERR_END OCFS2_FILECHECK_ERR_UNSUPPORTED
45 45
46int ocfs2_filecheck_create_sysfs(struct super_block *sb); 46struct ocfs2_filecheck {
47int ocfs2_filecheck_remove_sysfs(struct super_block *sb); 47 struct list_head fc_head; /* File check entry list head */
48 spinlock_t fc_lock;
49 unsigned int fc_max; /* Maximum number of entry in list */
50 unsigned int fc_size; /* Current entry count in list */
51 unsigned int fc_done; /* Finished entry count in list */
52};
53
54#define OCFS2_FILECHECK_MAXSIZE 100
55#define OCFS2_FILECHECK_MINSIZE 10
56
57/* File check operation type */
58enum {
59 OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file(inode) */
60 OCFS2_FILECHECK_TYPE_FIX, /* Fix a file(inode) */
61 OCFS2_FILECHECK_TYPE_SET = 100 /* Set entry list maximum size */
62};
63
64struct ocfs2_filecheck_sysfs_entry { /* sysfs entry per partition */
65 struct kobject fs_kobj;
66 struct completion fs_kobj_unregister;
67 struct ocfs2_filecheck *fs_fcheck;
68};
69
70
71int ocfs2_filecheck_create_sysfs(struct ocfs2_super *osb);
72void ocfs2_filecheck_remove_sysfs(struct ocfs2_super *osb);
48 73
49#endif /* FILECHECK_H */ 74#endif /* FILECHECK_H */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d51b80edd972..ddc3e9470c87 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1135,7 +1135,7 @@ static void ocfs2_clear_inode(struct inode *inode)
1135 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, 1135 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
1136 inode->i_nlink); 1136 inode->i_nlink);
1137 1137
1138 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1138 mlog_bug_on_msg(osb == NULL,
1139 "Inode=%lu\n", inode->i_ino); 1139 "Inode=%lu\n", inode->i_ino);
1140 1140
1141 dquot_drop(inode); 1141 dquot_drop(inode);
@@ -1150,7 +1150,7 @@ static void ocfs2_clear_inode(struct inode *inode)
1150 ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); 1150 ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
1151 ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); 1151 ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
1152 1152
1153 ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap, 1153 ocfs2_resv_discard(&osb->osb_la_resmap,
1154 &oi->ip_la_data_resv); 1154 &oi->ip_la_data_resv);
1155 ocfs2_resv_init_once(&oi->ip_la_data_resv); 1155 ocfs2_resv_init_once(&oi->ip_la_data_resv);
1156 1156
@@ -1160,7 +1160,7 @@ static void ocfs2_clear_inode(struct inode *inode)
1160 * exception here are successfully wiped inodes - their 1160 * exception here are successfully wiped inodes - their
1161 * metadata can now be considered to be part of the system 1161 * metadata can now be considered to be part of the system
1162 * inodes from which it came. */ 1162 * inodes from which it came. */
1163 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) 1163 if (!(oi->ip_flags & OCFS2_INODE_DELETED))
1164 ocfs2_checkpoint_inode(inode); 1164 ocfs2_checkpoint_inode(inode);
1165 1165
1166 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1166 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers),
@@ -1223,7 +1223,7 @@ static void ocfs2_clear_inode(struct inode *inode)
1223 * the journal is flushed before journal shutdown. Thus it is safe to 1223 * the journal is flushed before journal shutdown. Thus it is safe to
1224 * have inodes get cleaned up after journal shutdown. 1224 * have inodes get cleaned up after journal shutdown.
1225 */ 1225 */
1226 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, 1226 jbd2_journal_release_jbd_inode(osb->journal->j_journal,
1227 &oi->ip_jinode); 1227 &oi->ip_jinode);
1228} 1228}
1229 1229
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c801eddc4bf3..8dd6f703c819 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -525,7 +525,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
525 * these are used by the support functions here and in 525 * these are used by the support functions here and in
526 * callers. */ 526 * callers. */
527 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); 527 inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
528 OCFS2_I(inode)->ip_blkno = fe_blkno; 528 oi->ip_blkno = fe_blkno;
529 spin_lock(&osb->osb_lock); 529 spin_lock(&osb->osb_lock);
530 inode->i_generation = osb->s_next_generation++; 530 inode->i_generation = osb->s_next_generation++;
531 spin_unlock(&osb->osb_lock); 531 spin_unlock(&osb->osb_lock);
@@ -1186,8 +1186,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1186 } 1186 }
1187 1187
1188 trace_ocfs2_double_lock_end( 1188 trace_ocfs2_double_lock_end(
1189 (unsigned long long)OCFS2_I(inode1)->ip_blkno, 1189 (unsigned long long)oi1->ip_blkno,
1190 (unsigned long long)OCFS2_I(inode2)->ip_blkno); 1190 (unsigned long long)oi2->ip_blkno);
1191 1191
1192bail: 1192bail:
1193 if (status) 1193 if (status)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6867eef2e06b..4f86ac0027b5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -50,6 +50,8 @@
50 50
51#include "reservations.h" 51#include "reservations.h"
52 52
53#include "filecheck.h"
54
53/* Caching of metadata buffers */ 55/* Caching of metadata buffers */
54 56
55/* Most user visible OCFS2 inodes will have very few pieces of 57/* Most user visible OCFS2 inodes will have very few pieces of
@@ -472,6 +474,12 @@ struct ocfs2_super
472 * workqueue and schedule on our own. 474 * workqueue and schedule on our own.
473 */ 475 */
474 struct workqueue_struct *ocfs2_wq; 476 struct workqueue_struct *ocfs2_wq;
477
478 /* sysfs directory per partition */
479 struct kset *osb_dev_kset;
480
481 /* file check related stuff */
482 struct ocfs2_filecheck_sysfs_entry osb_fc_ent;
475}; 483};
476 484
477#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) 485#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index e2a11aaece10..2ee76a90ba8f 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1311,11 +1311,11 @@ DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
1311 1311
1312DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file); 1312DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
1313 1313
1314DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write); 1314DEFINE_OCFS2_FILE_OPS(ocfs2_file_write_iter);
1315 1315
1316DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write); 1316DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
1317 1317
1318DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read); 1318DEFINE_OCFS2_FILE_OPS(ocfs2_file_read_iter);
1319 1319
1320DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file); 1320DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
1321 1321
@@ -1467,7 +1467,7 @@ TRACE_EVENT(ocfs2_prepare_inode_for_write,
1467 __entry->saved_pos, __entry->count, __entry->wait) 1467 __entry->saved_pos, __entry->count, __entry->wait)
1468); 1468);
1469 1469
1470DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); 1470DEFINE_OCFS2_INT_EVENT(generic_file_read_iter_ret);
1471 1471
1472/* End of trace events for fs/ocfs2/file.c. */ 1472/* End of trace events for fs/ocfs2/file.c. */
1473 1473
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ab156e35ec00..01c6b3894406 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -573,7 +573,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
573 BUG_ON(ocfs2_is_refcount_inode(inode)); 573 BUG_ON(ocfs2_is_refcount_inode(inode));
574 574
575 trace_ocfs2_create_refcount_tree( 575 trace_ocfs2_create_refcount_tree(
576 (unsigned long long)OCFS2_I(inode)->ip_blkno); 576 (unsigned long long)oi->ip_blkno);
577 577
578 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 578 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
579 if (ret) { 579 if (ret) {
@@ -3359,7 +3359,7 @@ static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
3359 unsigned int ext_flags; 3359 unsigned int ext_flags;
3360 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3360 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3361 3361
3362 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 3362 if (!ocfs2_refcount_tree(osb)) {
3363 return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n", 3363 return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
3364 inode->i_ino); 3364 inode->i_ino);
3365 } 3365 }
@@ -3707,7 +3707,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
3707 trace_ocfs2_add_refcount_flag(ref_blocks, credits); 3707 trace_ocfs2_add_refcount_flag(ref_blocks, credits);
3708 3708
3709 if (ref_blocks) { 3709 if (ref_blocks) {
3710 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 3710 ret = ocfs2_reserve_new_metadata_blocks(osb,
3711 ref_blocks, &meta_ac); 3711 ref_blocks, &meta_ac);
3712 if (ret) { 3712 if (ret) {
3713 mlog_errno(ret); 3713 mlog_errno(ret);
@@ -4766,8 +4766,8 @@ static int ocfs2_reflink_inodes_lock(struct inode *s_inode,
4766 *bh2 = *bh1; 4766 *bh2 = *bh1;
4767 4767
4768 trace_ocfs2_double_lock_end( 4768 trace_ocfs2_double_lock_end(
4769 (unsigned long long)OCFS2_I(inode1)->ip_blkno, 4769 (unsigned long long)oi1->ip_blkno,
4770 (unsigned long long)OCFS2_I(inode2)->ip_blkno); 4770 (unsigned long long)oi2->ip_blkno);
4771 4771
4772 return 0; 4772 return 0;
4773 4773
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d8f5f6ce99dc..f7c972fbed6a 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -79,8 +79,6 @@ static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res)
79 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); 79 return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset);
80} 80}
81 81
82static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
83static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
84static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 82static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
85static int ocfs2_block_group_fill(handle_t *handle, 83static int ocfs2_block_group_fill(handle_t *handle,
86 struct inode *alloc_inode, 84 struct inode *alloc_inode,
@@ -387,7 +385,7 @@ static int ocfs2_block_group_fill(handle_t *handle,
387 385
388 memset(bg, 0, sb->s_blocksize); 386 memset(bg, 0, sb->s_blocksize);
389 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE); 387 strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
390 bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 388 bg->bg_generation = cpu_to_le32(osb->fs_generation);
391 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1, 389 bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb, 1,
392 osb->s_feature_incompat)); 390 osb->s_feature_incompat));
393 bg->bg_chain = cpu_to_le16(my_chain); 391 bg->bg_chain = cpu_to_le16(my_chain);
@@ -1521,7 +1519,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1521 OCFS2_I(inode)->ip_clusters, max_bits); 1519 OCFS2_I(inode)->ip_clusters, max_bits);
1522 } 1520 }
1523 1521
1524 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1522 ret = ocfs2_block_group_find_clear_bits(osb,
1525 group_bh, bits_wanted, 1523 group_bh, bits_wanted,
1526 max_bits, res); 1524 max_bits, res);
1527 if (ret) 1525 if (ret)
@@ -2626,53 +2624,6 @@ int ocfs2_release_clusters(handle_t *handle,
2626 _ocfs2_clear_bit); 2624 _ocfs2_clear_bit);
2627} 2625}
2628 2626
2629static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
2630{
2631 printk("Block Group:\n");
2632 printk("bg_signature: %s\n", bg->bg_signature);
2633 printk("bg_size: %u\n", bg->bg_size);
2634 printk("bg_bits: %u\n", bg->bg_bits);
2635 printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
2636 printk("bg_chain: %u\n", bg->bg_chain);
2637 printk("bg_generation: %u\n", le32_to_cpu(bg->bg_generation));
2638 printk("bg_next_group: %llu\n",
2639 (unsigned long long)bg->bg_next_group);
2640 printk("bg_parent_dinode: %llu\n",
2641 (unsigned long long)bg->bg_parent_dinode);
2642 printk("bg_blkno: %llu\n",
2643 (unsigned long long)bg->bg_blkno);
2644}
2645
2646static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2647{
2648 int i;
2649
2650 printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
2651 printk("i_signature: %s\n", fe->i_signature);
2652 printk("i_size: %llu\n",
2653 (unsigned long long)fe->i_size);
2654 printk("i_clusters: %u\n", fe->i_clusters);
2655 printk("i_generation: %u\n",
2656 le32_to_cpu(fe->i_generation));
2657 printk("id1.bitmap1.i_used: %u\n",
2658 le32_to_cpu(fe->id1.bitmap1.i_used));
2659 printk("id1.bitmap1.i_total: %u\n",
2660 le32_to_cpu(fe->id1.bitmap1.i_total));
2661 printk("id2.i_chain.cl_cpg: %u\n", fe->id2.i_chain.cl_cpg);
2662 printk("id2.i_chain.cl_bpc: %u\n", fe->id2.i_chain.cl_bpc);
2663 printk("id2.i_chain.cl_count: %u\n", fe->id2.i_chain.cl_count);
2664 printk("id2.i_chain.cl_next_free_rec: %u\n",
2665 fe->id2.i_chain.cl_next_free_rec);
2666 for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
2667 printk("fe->id2.i_chain.cl_recs[%d].c_free: %u\n", i,
2668 fe->id2.i_chain.cl_recs[i].c_free);
2669 printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2670 fe->id2.i_chain.cl_recs[i].c_total);
2671 printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2672 (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
2673 }
2674}
2675
2676/* 2627/*
2677 * For a given allocation, determine which allocators will need to be 2628 * For a given allocation, determine which allocators will need to be
2678 * accessed, and lock them, reserving the appropriate number of bits. 2629 * accessed, and lock them, reserving the appropriate number of bits.
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ffa4952d432b..3415e0b09398 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -423,10 +423,10 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
423 ocfs2_schedule_truncate_log_flush(osb, 0); 423 ocfs2_schedule_truncate_log_flush(osb, 0);
424 } 424 }
425 425
426 if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, 426 if (jbd2_journal_start_commit(osb->journal->j_journal,
427 &target)) { 427 &target)) {
428 if (wait) 428 if (wait)
429 jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, 429 jbd2_log_wait_commit(osb->journal->j_journal,
430 target); 430 target);
431 } 431 }
432 return 0; 432 return 0;
@@ -1161,6 +1161,23 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1161 1161
1162 ocfs2_complete_mount_recovery(osb); 1162 ocfs2_complete_mount_recovery(osb);
1163 1163
1164 osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
1165 &ocfs2_kset->kobj);
1166 if (!osb->osb_dev_kset) {
1167 status = -ENOMEM;
1168 mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id);
1169 goto read_super_error;
1170 }
1171
1172 /* Create filecheck sysfs related directories/files at
1173 * /sys/fs/ocfs2/<devname>/filecheck */
1174 if (ocfs2_filecheck_create_sysfs(osb)) {
1175 status = -ENOMEM;
1176 mlog(ML_ERROR, "Unable to create filecheck sysfs directory at "
1177 "/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id);
1178 goto read_super_error;
1179 }
1180
1164 if (ocfs2_mount_local(osb)) 1181 if (ocfs2_mount_local(osb))
1165 snprintf(nodestr, sizeof(nodestr), "local"); 1182 snprintf(nodestr, sizeof(nodestr), "local");
1166 else 1183 else
@@ -1199,9 +1216,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1199 /* Start this when the mount is almost sure of being successful */ 1216 /* Start this when the mount is almost sure of being successful */
1200 ocfs2_orphan_scan_start(osb); 1217 ocfs2_orphan_scan_start(osb);
1201 1218
1202 /* Create filecheck sysfile /sys/fs/ocfs2/<devname>/filecheck */
1203 ocfs2_filecheck_create_sysfs(sb);
1204
1205 return status; 1219 return status;
1206 1220
1207read_super_error: 1221read_super_error:
@@ -1653,7 +1667,6 @@ static void ocfs2_put_super(struct super_block *sb)
1653 1667
1654 ocfs2_sync_blockdev(sb); 1668 ocfs2_sync_blockdev(sb);
1655 ocfs2_dismount_volume(sb, 0); 1669 ocfs2_dismount_volume(sb, 0);
1656 ocfs2_filecheck_remove_sysfs(sb);
1657} 1670}
1658 1671
1659static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 1672static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1768,12 +1781,9 @@ static int ocfs2_initialize_mem_caches(void)
1768 NULL); 1781 NULL);
1769 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || 1782 if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
1770 !ocfs2_qf_chunk_cachep) { 1783 !ocfs2_qf_chunk_cachep) {
1771 if (ocfs2_inode_cachep) 1784 kmem_cache_destroy(ocfs2_inode_cachep);
1772 kmem_cache_destroy(ocfs2_inode_cachep); 1785 kmem_cache_destroy(ocfs2_dquot_cachep);
1773 if (ocfs2_dquot_cachep) 1786 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1774 kmem_cache_destroy(ocfs2_dquot_cachep);
1775 if (ocfs2_qf_chunk_cachep)
1776 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1777 return -ENOMEM; 1787 return -ENOMEM;
1778 } 1788 }
1779 1789
@@ -1787,16 +1797,13 @@ static void ocfs2_free_mem_caches(void)
1787 * destroy cache. 1797 * destroy cache.
1788 */ 1798 */
1789 rcu_barrier(); 1799 rcu_barrier();
1790 if (ocfs2_inode_cachep) 1800 kmem_cache_destroy(ocfs2_inode_cachep);
1791 kmem_cache_destroy(ocfs2_inode_cachep);
1792 ocfs2_inode_cachep = NULL; 1801 ocfs2_inode_cachep = NULL;
1793 1802
1794 if (ocfs2_dquot_cachep) 1803 kmem_cache_destroy(ocfs2_dquot_cachep);
1795 kmem_cache_destroy(ocfs2_dquot_cachep);
1796 ocfs2_dquot_cachep = NULL; 1804 ocfs2_dquot_cachep = NULL;
1797 1805
1798 if (ocfs2_qf_chunk_cachep) 1806 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1799 kmem_cache_destroy(ocfs2_qf_chunk_cachep);
1800 ocfs2_qf_chunk_cachep = NULL; 1807 ocfs2_qf_chunk_cachep = NULL;
1801} 1808}
1802 1809
@@ -1899,6 +1906,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1899 osb = OCFS2_SB(sb); 1906 osb = OCFS2_SB(sb);
1900 BUG_ON(!osb); 1907 BUG_ON(!osb);
1901 1908
1909 /* Remove file check sysfs related directores/files,
1910 * and wait for the pending file check operations */
1911 ocfs2_filecheck_remove_sysfs(osb);
1912
1913 kset_unregister(osb->osb_dev_kset);
1914
1902 debugfs_remove(osb->osb_ctxt); 1915 debugfs_remove(osb->osb_ctxt);
1903 1916
1904 /* Orphan scan should be stopped as early as possible */ 1917 /* Orphan scan should be stopped as early as possible */
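The per-device kset added above is plain kobject-core plumbing; a minimal sketch of the same create/tear-down pairing, assuming ocfs2_kset is the kset behind /sys/fs/ocfs2 and "example" stands in for sb->s_id:

	static struct kset *example_dev_kset;

	static int example_register_dev_kset(struct kset *fs_kset)
	{
		/* Creates /sys/fs/ocfs2/example/; per-device entries such
		 * as the filecheck directory hang off this kset. */
		example_dev_kset = kset_create_and_add("example", NULL,
						       &fs_kset->kobj);
		return example_dev_kset ? 0 : -ENOMEM;
	}

	static void example_unregister_dev_kset(void)
	{
		kset_unregister(example_dev_kset);
	}
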
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 82e17b076ce7..78f09c76ab3c 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -633,6 +633,5 @@ int __init init_ocfs2_uptodate_cache(void)
633 633
634void exit_ocfs2_uptodate_cache(void) 634void exit_ocfs2_uptodate_cache(void)
635{ 635{
636 if (ocfs2_uptodate_cachep) 636 kmem_cache_destroy(ocfs2_uptodate_cachep);
637 kmem_cache_destroy(ocfs2_uptodate_cachep);
638} 637}
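This cleanup, like the earlier ones in super.c, relies on kmem_cache_destroy() treating a NULL cache as a no-op, so error and exit paths need no per-cache guards; a minimal sketch of the resulting pattern, with foo_cachep as a hypothetical cache:

	#include <linux/slab.h>

	static struct kmem_cache *foo_cachep;

	static int foo_caches_init(void)
	{
		foo_cachep = kmem_cache_create("foo_cache", 128, 0,
					       SLAB_HWCACHE_ALIGN, NULL);
		return foo_cachep ? 0 : -ENOMEM;
	}

	static void foo_caches_exit(void)
	{
		kmem_cache_destroy(foo_cachep);	/* NULL is silently ignored */
		foo_cachep = NULL;
	}
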
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index c261c1dfd374..3a24ce3deb01 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3564,7 +3564,7 @@ int ocfs2_xattr_set(struct inode *inode,
3564 .not_found = -ENODATA, 3564 .not_found = -ENODATA,
3565 }; 3565 };
3566 3566
3567 if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) 3567 if (!ocfs2_supports_xattr(osb))
3568 return -EOPNOTSUPP; 3568 return -EOPNOTSUPP;
3569 3569
3570 /* 3570 /*
diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h
index c3c95d18bf43..7e6c77740413 100644
--- a/include/linux/fault-inject.h
+++ b/include/linux/fault-inject.h
@@ -64,10 +64,11 @@ static inline struct dentry *fault_create_debugfs_attr(const char *name,
64 64
65struct kmem_cache; 65struct kmem_cache;
66 66
67int should_failslab(struct kmem_cache *s, gfp_t gfpflags);
67#ifdef CONFIG_FAILSLAB 68#ifdef CONFIG_FAILSLAB
68extern bool should_failslab(struct kmem_cache *s, gfp_t gfpflags); 69extern bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags);
69#else 70#else
70static inline bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) 71static inline bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
71{ 72{
72 return false; 73 return false;
73} 74}
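With should_failslab() now declared unconditionally, the configuration-dependent check moves behind __should_failslab(); one plausible shape for the always-built entry point (a sketch, not necessarily the exact mm/ implementation) is a noinline wrapper that reports failure as an errno so error injection can hook it:

	/* Sketch only: forward to the CONFIG_FAILSLAB-only helper. */
	noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
	{
		if (__should_failslab(s, gfpflags))
			return -ENOMEM;
		return 0;
	}
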
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index d6459bd1376d..de784fd11d12 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -43,7 +43,7 @@ void kasan_unpoison_stack_above_sp_to(const void *watermark);
43void kasan_alloc_pages(struct page *page, unsigned int order); 43void kasan_alloc_pages(struct page *page, unsigned int order);
44void kasan_free_pages(struct page *page, unsigned int order); 44void kasan_free_pages(struct page *page, unsigned int order);
45 45
46void kasan_cache_create(struct kmem_cache *cache, size_t *size, 46void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
47 slab_flags_t *flags); 47 slab_flags_t *flags);
48void kasan_cache_shrink(struct kmem_cache *cache); 48void kasan_cache_shrink(struct kmem_cache *cache);
49void kasan_cache_shutdown(struct kmem_cache *cache); 49void kasan_cache_shutdown(struct kmem_cache *cache);
@@ -92,7 +92,7 @@ static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
92static inline void kasan_free_pages(struct page *page, unsigned int order) {} 92static inline void kasan_free_pages(struct page *page, unsigned int order) {}
93 93
94static inline void kasan_cache_create(struct kmem_cache *cache, 94static inline void kasan_cache_create(struct kmem_cache *cache,
95 size_t *size, 95 unsigned int *size,
96 slab_flags_t *flags) {} 96 slab_flags_t *flags) {}
97static inline void kasan_cache_shrink(struct kmem_cache *cache) {} 97static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
98static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} 98static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index bb8129a3474d..96def9d15b1b 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -32,6 +32,7 @@ struct list_lru_one {
32}; 32};
33 33
34struct list_lru_memcg { 34struct list_lru_memcg {
35 struct rcu_head rcu;
35 /* array of per cgroup lists, indexed by memcg_cache_id */ 36 /* array of per cgroup lists, indexed by memcg_cache_id */
36 struct list_lru_one *lru[0]; 37 struct list_lru_one *lru[0];
37}; 38};
@@ -43,7 +44,7 @@ struct list_lru_node {
43 struct list_lru_one lru; 44 struct list_lru_one lru;
44#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) 45#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
45 /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ 46 /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
46 struct list_lru_memcg *memcg_lrus; 47 struct list_lru_memcg __rcu *memcg_lrus;
47#endif 48#endif
48 long nr_items; 49 long nr_items;
49} ____cacheline_aligned_in_smp; 50} ____cacheline_aligned_in_smp;
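With memcg_lrus now annotated __rcu and freed via the embedded rcu_head, lock-free readers must dereference it under RCU; a minimal sketch of the read side, with list_lru_from_memcg_idx() used here purely as an illustrative helper name:

	static struct list_lru_one *
	list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
	{
		struct list_lru_memcg *memcg_lrus;

		/* Caller holds rcu_read_lock() or nlru->lock. */
		memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
						   lockdep_is_held(&nlru->lock));
		if (memcg_lrus && idx >= 0)
			return memcg_lrus->lru[idx];
		return &nlru->lru;
	}
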
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f92ea7783652..0257aee7ab4b 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -416,21 +416,11 @@ static inline void early_memtest(phys_addr_t start, phys_addr_t end)
416{ 416{
417} 417}
418#endif 418#endif
419
420extern unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
421 phys_addr_t end_addr);
422#else 419#else
423static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) 420static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
424{ 421{
425 return 0; 422 return 0;
426} 423}
427
428static inline unsigned long memblock_reserved_memory_within(phys_addr_t start_addr,
429 phys_addr_t end_addr)
430{
431 return 0;
432}
433
434#endif /* CONFIG_HAVE_MEMBLOCK */ 424#endif /* CONFIG_HAVE_MEMBLOCK */
435 425
436#endif /* __KERNEL__ */ 426#endif /* __KERNEL__ */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index f71e732c77b2..31ca3e28b0eb 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -33,6 +33,7 @@ struct memory_block {
33 void *hw; /* optional pointer to fw/hw data */ 33 void *hw; /* optional pointer to fw/hw data */
34 int (*phys_callback)(struct memory_block *); 34 int (*phys_callback)(struct memory_block *);
35 struct device dev; 35 struct device dev;
36 int nid; /* NID for this memory block */
36}; 37};
37 38
38int arch_get_memory_phys_device(unsigned long start_pfn); 39int arch_get_memory_phys_device(unsigned long start_pfn);
@@ -109,7 +110,7 @@ extern int register_memory_notifier(struct notifier_block *nb);
109extern void unregister_memory_notifier(struct notifier_block *nb); 110extern void unregister_memory_notifier(struct notifier_block *nb);
110extern int register_memory_isolate_notifier(struct notifier_block *nb); 111extern int register_memory_isolate_notifier(struct notifier_block *nb);
111extern void unregister_memory_isolate_notifier(struct notifier_block *nb); 112extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
112extern int register_new_memory(int, struct mem_section *); 113int hotplug_memory_register(int nid, struct mem_section *section);
113#ifdef CONFIG_MEMORY_HOTREMOVE 114#ifdef CONFIG_MEMORY_HOTREMOVE
114extern int unregister_memory_section(struct mem_section *); 115extern int unregister_memory_section(struct mem_section *);
115#endif 116#endif
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index aba5f86eb038..2b0265265c28 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -52,24 +52,6 @@ enum {
52}; 52};
53 53
54/* 54/*
55 * pgdat resizing functions
56 */
57static inline
58void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
59{
60 spin_lock_irqsave(&pgdat->node_size_lock, *flags);
61}
62static inline
63void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
64{
65 spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
66}
67static inline
68void pgdat_resize_init(struct pglist_data *pgdat)
69{
70 spin_lock_init(&pgdat->node_size_lock);
71}
72/*
73 * Zone resizing functions 55 * Zone resizing functions
74 * 56 *
75 * Note: any attempt to resize a zone should has pgdat_resize_lock() 57 * Note: any attempt to resize a zone should has pgdat_resize_lock()
@@ -246,13 +228,6 @@ extern void clear_zone_contiguous(struct zone *zone);
246 ___page; \ 228 ___page; \
247 }) 229 })
248 230
249/*
250 * Stub functions for when hotplug is off
251 */
252static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
253static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
254static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
255
256static inline unsigned zone_span_seqbegin(struct zone *zone) 231static inline unsigned zone_span_seqbegin(struct zone *zone)
257{ 232{
258 return 0; 233 return 0;
@@ -293,6 +268,34 @@ static inline bool movable_node_is_enabled(void)
293} 268}
294#endif /* ! CONFIG_MEMORY_HOTPLUG */ 269#endif /* ! CONFIG_MEMORY_HOTPLUG */
295 270
271#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
272/*
273 * pgdat resizing functions
274 */
275static inline
276void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
277{
278 spin_lock_irqsave(&pgdat->node_size_lock, *flags);
279}
280static inline
281void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
282{
283 spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
284}
285static inline
286void pgdat_resize_init(struct pglist_data *pgdat)
287{
288 spin_lock_init(&pgdat->node_size_lock);
289}
290#else /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
291/*
292 * Stub functions for when hotplug is off
293 */
294static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
295static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
296static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
297#endif /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
298
296#ifdef CONFIG_MEMORY_HOTREMOVE 299#ifdef CONFIG_MEMORY_HOTREMOVE
297 300
298extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); 301extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
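Moving the resize helpers under CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT lets deferred struct-page initialization serialize against hotplug with the same primitive; the calling convention is unchanged (sketch):

	static void example_update_node_span(pg_data_t *pgdat,
					     unsigned long start_pfn)
	{
		unsigned long flags;

		pgdat_resize_lock(pgdat, &flags);
		/* node_start_pfn / node_spanned_pages may safely be read
		 * or updated while the resize lock is held. */
		if (start_pfn < pgdat->node_start_pfn)
			pgdat->node_start_pfn = start_pfn;
		pgdat_resize_unlock(pgdat, &flags);
	}
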
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a2246cf670ba..ab45f8a0d288 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -25,7 +25,7 @@ enum migrate_reason {
25 MR_SYSCALL, /* also applies to cpusets */ 25 MR_SYSCALL, /* also applies to cpusets */
26 MR_MEMPOLICY_MBIND, 26 MR_MEMPOLICY_MBIND,
27 MR_NUMA_MISPLACED, 27 MR_NUMA_MISPLACED,
28 MR_CMA, 28 MR_CONTIG_RANGE,
29 MR_TYPES 29 MR_TYPES
30}; 30};
31 31
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f945dff34925..3ad632366973 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -386,17 +386,19 @@ struct vm_operations_struct {
386 void (*close)(struct vm_area_struct * area); 386 void (*close)(struct vm_area_struct * area);
387 int (*split)(struct vm_area_struct * area, unsigned long addr); 387 int (*split)(struct vm_area_struct * area, unsigned long addr);
388 int (*mremap)(struct vm_area_struct * area); 388 int (*mremap)(struct vm_area_struct * area);
389 int (*fault)(struct vm_fault *vmf); 389 vm_fault_t (*fault)(struct vm_fault *vmf);
390 int (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); 390 vm_fault_t (*huge_fault)(struct vm_fault *vmf,
391 enum page_entry_size pe_size);
391 void (*map_pages)(struct vm_fault *vmf, 392 void (*map_pages)(struct vm_fault *vmf,
392 pgoff_t start_pgoff, pgoff_t end_pgoff); 393 pgoff_t start_pgoff, pgoff_t end_pgoff);
394 unsigned long (*pagesize)(struct vm_area_struct * area);
393 395
394 /* notification that a previously read-only page is about to become 396 /* notification that a previously read-only page is about to become
395 * writable, if an error is returned it will cause a SIGBUS */ 397 * writable, if an error is returned it will cause a SIGBUS */
396 int (*page_mkwrite)(struct vm_fault *vmf); 398 vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);
397 399
398 /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ 400 /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
399 int (*pfn_mkwrite)(struct vm_fault *vmf); 401 vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);
400 402
401 /* called by access_process_vm when get_user_pages() fails, typically 403 /* called by access_process_vm when get_user_pages() fails, typically
402 * for use by special VMAs that can switch between memory and hardware 404 * for use by special VMAs that can switch between memory and hardware
@@ -903,7 +905,9 @@ extern int page_to_nid(const struct page *page);
903#else 905#else
904static inline int page_to_nid(const struct page *page) 906static inline int page_to_nid(const struct page *page)
905{ 907{
906 return (page->flags >> NODES_PGSHIFT) & NODES_MASK; 908 struct page *p = (struct page *)page;
909
910 return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK;
907} 911}
908#endif 912#endif
909 913
@@ -1152,6 +1156,7 @@ static inline pgoff_t page_index(struct page *page)
1152 1156
1153bool page_mapped(struct page *page); 1157bool page_mapped(struct page *page);
1154struct address_space *page_mapping(struct page *page); 1158struct address_space *page_mapping(struct page *page);
1159struct address_space *page_mapping_file(struct page *page);
1155 1160
1156/* 1161/*
1157 * Return true only if the page has been allocated with 1162 * Return true only if the page has been allocated with
@@ -2420,6 +2425,44 @@ int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr,
2420 pfn_t pfn); 2425 pfn_t pfn);
2421int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); 2426int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
2422 2427
2428static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
2429 unsigned long addr, struct page *page)
2430{
2431 int err = vm_insert_page(vma, addr, page);
2432
2433 if (err == -ENOMEM)
2434 return VM_FAULT_OOM;
2435 if (err < 0 && err != -EBUSY)
2436 return VM_FAULT_SIGBUS;
2437
2438 return VM_FAULT_NOPAGE;
2439}
2440
2441static inline vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma,
2442 unsigned long addr, pfn_t pfn)
2443{
2444 int err = vm_insert_mixed(vma, addr, pfn);
2445
2446 if (err == -ENOMEM)
2447 return VM_FAULT_OOM;
2448 if (err < 0 && err != -EBUSY)
2449 return VM_FAULT_SIGBUS;
2450
2451 return VM_FAULT_NOPAGE;
2452}
2453
2454static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma,
2455 unsigned long addr, unsigned long pfn)
2456{
2457 int err = vm_insert_pfn(vma, addr, pfn);
2458
2459 if (err == -ENOMEM)
2460 return VM_FAULT_OOM;
2461 if (err < 0 && err != -EBUSY)
2462 return VM_FAULT_SIGBUS;
2463
2464 return VM_FAULT_NOPAGE;
2465}
2423 2466
2424struct page *follow_page_mask(struct vm_area_struct *vma, 2467struct page *follow_page_mask(struct vm_area_struct *vma,
2425 unsigned long address, unsigned int foll_flags, 2468 unsigned long address, unsigned int foll_flags,
@@ -2589,7 +2632,7 @@ extern int get_hwpoison_page(struct page *page);
2589extern int sysctl_memory_failure_early_kill; 2632extern int sysctl_memory_failure_early_kill;
2590extern int sysctl_memory_failure_recovery; 2633extern int sysctl_memory_failure_recovery;
2591extern void shake_page(struct page *p, int access); 2634extern void shake_page(struct page *p, int access);
2592extern atomic_long_t num_poisoned_pages; 2635extern atomic_long_t num_poisoned_pages __read_mostly;
2593extern int soft_offline_page(struct page *page, int flags); 2636extern int soft_offline_page(struct page *page, int flags);
2594 2637
2595 2638
@@ -2611,6 +2654,7 @@ enum mf_action_page_type {
2611 MF_MSG_POISONED_HUGE, 2654 MF_MSG_POISONED_HUGE,
2612 MF_MSG_HUGE, 2655 MF_MSG_HUGE,
2613 MF_MSG_FREE_HUGE, 2656 MF_MSG_FREE_HUGE,
2657 MF_MSG_NON_PMD_HUGE,
2614 MF_MSG_UNMAP_FAILED, 2658 MF_MSG_UNMAP_FAILED,
2615 MF_MSG_DIRTY_SWAPCACHE, 2659 MF_MSG_DIRTY_SWAPCACHE,
2616 MF_MSG_CLEAN_SWAPCACHE, 2660 MF_MSG_CLEAN_SWAPCACHE,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fd1af6b9591d..21612347d311 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -22,6 +22,8 @@
22#endif 22#endif
23#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) 23#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
24 24
25typedef int vm_fault_t;
26
25struct address_space; 27struct address_space;
26struct mem_cgroup; 28struct mem_cgroup;
27struct hmm; 29struct hmm;
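Together with the vm_fault_t typedef added here, the vmf_insert_*() wrappers in mm.h above let fault handlers return fault codes directly instead of hand-translating errnos; a minimal sketch of a ->fault implementation for a hypothetical driver that maps one pfn per page offset:

	struct example_dev {			/* hypothetical driver state */
		unsigned long base_pfn;
	};

	static vm_fault_t example_dev_fault(struct vm_fault *vmf)
	{
		struct example_dev *dev = vmf->vma->vm_private_data;

		/* errno -> VM_FAULT_* conversion happens in the wrapper. */
		return vmf_insert_pfn(vmf->vma, vmf->address,
				      dev->base_pfn + vmf->pgoff);
	}

	static const struct vm_operations_struct example_dev_vm_ops = {
		.fault = example_dev_fault,
	};
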
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index 57b0030d3800..2ad72d2c8cc5 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -37,10 +37,10 @@ void dump_mm(const struct mm_struct *mm);
37 BUG(); \ 37 BUG(); \
38 } \ 38 } \
39 } while (0) 39 } while (0)
40#define VM_WARN_ON(cond) WARN_ON(cond) 40#define VM_WARN_ON(cond) (void)WARN_ON(cond)
41#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) 41#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
42#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) 42#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
43#define VM_WARN(cond, format...) WARN(cond, format) 43#define VM_WARN(cond, format...) (void)WARN(cond, format)
44#else 44#else
45#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) 45#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
46#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) 46#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
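The (void) casts make the VM_WARN* macros statement-only: unlike WARN_ON(), their result can no longer feed an if(), which keeps CONFIG_DEBUG_VM and non-debug builds in agreement. Sketch of the allowed form:

	static void example_check(struct page *page)
	{
		/* Fine: used purely as a statement. */
		VM_WARN_ON_ONCE(page_mapcount(page) < 0);

		/*
		 * No longer accepted with CONFIG_DEBUG_VM=y:
		 *	if (VM_WARN_ON(page_mapcount(page) < 0))
		 *		return;
		 */
	}
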
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a2db4576e499..f11ae29005f1 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -633,14 +633,15 @@ typedef struct pglist_data {
633#ifndef CONFIG_NO_BOOTMEM 633#ifndef CONFIG_NO_BOOTMEM
634 struct bootmem_data *bdata; 634 struct bootmem_data *bdata;
635#endif 635#endif
636#ifdef CONFIG_MEMORY_HOTPLUG 636#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
637 /* 637 /*
638 * Must be held any time you expect node_start_pfn, node_present_pages 638 * Must be held any time you expect node_start_pfn, node_present_pages
639 * or node_spanned_pages stay constant. Holding this will also 639 * or node_spanned_pages stay constant. Holding this will also
640 * guarantee that any pfn_valid() stays that way. 640 * guarantee that any pfn_valid() stays that way.
641 * 641 *
642 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to 642 * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
643 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG. 643 * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
644 * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
644 * 645 *
645 * Nests above zone->lock and zone->span_seqlock 646 * Nests above zone->lock and zone->span_seqlock
646 */ 647 */
@@ -775,7 +776,8 @@ static inline bool is_dev_zone(const struct zone *zone)
775#include <linux/memory_hotplug.h> 776#include <linux/memory_hotplug.h>
776 777
777void build_all_zonelists(pg_data_t *pgdat); 778void build_all_zonelists(pg_data_t *pgdat);
778void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); 779void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
780 enum zone_type classzone_idx);
779bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, 781bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
780 int classzone_idx, unsigned int alloc_flags, 782 int classzone_idx, unsigned int alloc_flags,
781 long free_pages); 783 long free_pages);
diff --git a/include/linux/node.h b/include/linux/node.h
index 4ece0fee0ffc..41f171861dcc 100644
--- a/include/linux/node.h
+++ b/include/linux/node.h
@@ -67,7 +67,7 @@ extern void unregister_one_node(int nid);
67extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); 67extern int register_cpu_under_node(unsigned int cpu, unsigned int nid);
68extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); 68extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid);
69extern int register_mem_sect_under_node(struct memory_block *mem_blk, 69extern int register_mem_sect_under_node(struct memory_block *mem_blk,
70 int nid); 70 int nid, bool check_nid);
71extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk, 71extern int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
72 unsigned long phys_index); 72 unsigned long phys_index);
73 73
@@ -97,7 +97,7 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
97 return 0; 97 return 0;
98} 98}
99static inline int register_mem_sect_under_node(struct memory_block *mem_blk, 99static inline int register_mem_sect_under_node(struct memory_block *mem_blk,
100 int nid) 100 int nid, bool check_nid)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 50c2b8786831..e34a27727b9a 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -156,9 +156,18 @@ static __always_inline int PageCompound(struct page *page)
156 return test_bit(PG_head, &page->flags) || PageTail(page); 156 return test_bit(PG_head, &page->flags) || PageTail(page);
157} 157}
158 158
159#define PAGE_POISON_PATTERN -1l
160static inline int PagePoisoned(const struct page *page)
161{
162 return page->flags == PAGE_POISON_PATTERN;
163}
164
159/* 165/*
160 * Page flags policies wrt compound pages 166 * Page flags policies wrt compound pages
161 * 167 *
168 * PF_POISONED_CHECK
169 * check if this struct page poisoned/uninitialized
170 *
162 * PF_ANY: 171 * PF_ANY:
163 * the page flag is relevant for small, head and tail pages. 172 * the page flag is relevant for small, head and tail pages.
164 * 173 *
@@ -176,17 +185,20 @@ static __always_inline int PageCompound(struct page *page)
176 * PF_NO_COMPOUND: 185 * PF_NO_COMPOUND:
177 * the page flag is not relevant for compound pages. 186 * the page flag is not relevant for compound pages.
178 */ 187 */
179#define PF_ANY(page, enforce) page 188#define PF_POISONED_CHECK(page) ({ \
180#define PF_HEAD(page, enforce) compound_head(page) 189 VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \
190 page; })
191#define PF_ANY(page, enforce) PF_POISONED_CHECK(page)
192#define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page))
181#define PF_ONLY_HEAD(page, enforce) ({ \ 193#define PF_ONLY_HEAD(page, enforce) ({ \
182 VM_BUG_ON_PGFLAGS(PageTail(page), page); \ 194 VM_BUG_ON_PGFLAGS(PageTail(page), page); \
183 page;}) 195 PF_POISONED_CHECK(page); })
184#define PF_NO_TAIL(page, enforce) ({ \ 196#define PF_NO_TAIL(page, enforce) ({ \
185 VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ 197 VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \
186 compound_head(page);}) 198 PF_POISONED_CHECK(compound_head(page)); })
187#define PF_NO_COMPOUND(page, enforce) ({ \ 199#define PF_NO_COMPOUND(page, enforce) ({ \
188 VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ 200 VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \
189 page;}) 201 PF_POISONED_CHECK(page); })
190 202
191/* 203/*
192 * Macros to create function definitions for page flags 204 * Macros to create function definitions for page flags
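Routing every page-flag policy through PF_POISONED_CHECK() means a flag test on a struct page whose flags still hold the all-ones poison pattern trips VM_BUG_ON_PGFLAGS() (with CONFIG_DEBUG_VM_PGFLAGS) instead of silently returning garbage; sketch:

	static bool example_page_on_lru(unsigned long pfn)
	{
		struct page *page = pfn_to_page(pfn);	/* pfn assumed valid */

		/* PageLRU() expands through PF_HEAD, so an uninitialized
		 * (poisoned) struct page is caught right here. */
		return PageLRU(page);
	}
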
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 760d74a0e9a9..14d14beb1f7f 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -175,8 +175,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
175 VM_BUG_ON_PAGE(page_count(page) != 0, page); 175 VM_BUG_ON_PAGE(page_count(page) != 0, page);
176 VM_BUG_ON(count == 0); 176 VM_BUG_ON(count == 0);
177 177
178 smp_mb(); 178 atomic_set_release(&page->_refcount, count);
179 atomic_set(&page->_refcount, count);
180 if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) 179 if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
181 __page_ref_unfreeze(page, count); 180 __page_ref_unfreeze(page, count);
182} 181}
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 231abc8976c5..81ebd71f8c03 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -125,7 +125,6 @@
125#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ 125#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
126 (unsigned long)ZERO_SIZE_PTR) 126 (unsigned long)ZERO_SIZE_PTR)
127 127
128#include <linux/kmemleak.h>
129#include <linux/kasan.h> 128#include <linux/kasan.h>
130 129
131struct mem_cgroup; 130struct mem_cgroup;
@@ -137,12 +136,13 @@ bool slab_is_available(void);
137 136
138extern bool usercopy_fallback; 137extern bool usercopy_fallback;
139 138
140struct kmem_cache *kmem_cache_create(const char *name, size_t size, 139struct kmem_cache *kmem_cache_create(const char *name, unsigned int size,
141 size_t align, slab_flags_t flags, 140 unsigned int align, slab_flags_t flags,
142 void (*ctor)(void *)); 141 void (*ctor)(void *));
143struct kmem_cache *kmem_cache_create_usercopy(const char *name, 142struct kmem_cache *kmem_cache_create_usercopy(const char *name,
144 size_t size, size_t align, slab_flags_t flags, 143 unsigned int size, unsigned int align,
145 size_t useroffset, size_t usersize, 144 slab_flags_t flags,
145 unsigned int useroffset, unsigned int usersize,
146 void (*ctor)(void *)); 146 void (*ctor)(void *));
147void kmem_cache_destroy(struct kmem_cache *); 147void kmem_cache_destroy(struct kmem_cache *);
148int kmem_cache_shrink(struct kmem_cache *); 148int kmem_cache_shrink(struct kmem_cache *);
@@ -308,7 +308,7 @@ extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
308 * 2 = 129 .. 192 bytes 308 * 2 = 129 .. 192 bytes
309 * n = 2^(n-1)+1 .. 2^n 309 * n = 2^(n-1)+1 .. 2^n
310 */ 310 */
311static __always_inline int kmalloc_index(size_t size) 311static __always_inline unsigned int kmalloc_index(size_t size)
312{ 312{
313 if (!size) 313 if (!size)
314 return 0; 314 return 0;
@@ -504,7 +504,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
504 return kmalloc_large(size, flags); 504 return kmalloc_large(size, flags);
505#ifndef CONFIG_SLOB 505#ifndef CONFIG_SLOB
506 if (!(flags & GFP_DMA)) { 506 if (!(flags & GFP_DMA)) {
507 int index = kmalloc_index(size); 507 unsigned int index = kmalloc_index(size);
508 508
509 if (!index) 509 if (!index)
510 return ZERO_SIZE_PTR; 510 return ZERO_SIZE_PTR;
@@ -522,11 +522,11 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
522 * return size or 0 if a kmalloc cache for that 522 * return size or 0 if a kmalloc cache for that
523 * size does not exist 523 * size does not exist
524 */ 524 */
525static __always_inline int kmalloc_size(int n) 525static __always_inline unsigned int kmalloc_size(unsigned int n)
526{ 526{
527#ifndef CONFIG_SLOB 527#ifndef CONFIG_SLOB
528 if (n > 2) 528 if (n > 2)
529 return 1 << n; 529 return 1U << n;
530 530
531 if (n == 1 && KMALLOC_MIN_SIZE <= 32) 531 if (n == 1 && KMALLOC_MIN_SIZE <= 32)
532 return 96; 532 return 96;
@@ -542,7 +542,7 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
542#ifndef CONFIG_SLOB 542#ifndef CONFIG_SLOB
543 if (__builtin_constant_p(size) && 543 if (__builtin_constant_p(size) &&
544 size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { 544 size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
545 int i = kmalloc_index(size); 545 unsigned int i = kmalloc_index(size);
546 546
547 if (!i) 547 if (!i)
548 return ZERO_SIZE_PTR; 548 return ZERO_SIZE_PTR;
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 7385547c04b1..d9228e4d0320 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -85,8 +85,8 @@ struct kmem_cache {
85 unsigned int *random_seq; 85 unsigned int *random_seq;
86#endif 86#endif
87 87
88 size_t useroffset; /* Usercopy region offset */ 88 unsigned int useroffset; /* Usercopy region offset */
89 size_t usersize; /* Usercopy region size */ 89 unsigned int usersize; /* Usercopy region size */
90 90
91 struct kmem_cache_node *node[MAX_NUMNODES]; 91 struct kmem_cache_node *node[MAX_NUMNODES];
92}; 92};
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 8ad99c47b19c..3773e26c08c1 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -73,7 +73,7 @@ struct kmem_cache_cpu {
73 * given order would contain. 73 * given order would contain.
74 */ 74 */
75struct kmem_cache_order_objects { 75struct kmem_cache_order_objects {
76 unsigned long x; 76 unsigned int x;
77}; 77};
78 78
79/* 79/*
@@ -84,11 +84,12 @@ struct kmem_cache {
84 /* Used for retriving partial slabs etc */ 84 /* Used for retriving partial slabs etc */
85 slab_flags_t flags; 85 slab_flags_t flags;
86 unsigned long min_partial; 86 unsigned long min_partial;
87 int size; /* The size of an object including meta data */ 87 unsigned int size; /* The size of an object including meta data */
88 int object_size; /* The size of an object without meta data */ 88 unsigned int object_size;/* The size of an object without meta data */
89 int offset; /* Free pointer offset. */ 89 unsigned int offset; /* Free pointer offset. */
90#ifdef CONFIG_SLUB_CPU_PARTIAL 90#ifdef CONFIG_SLUB_CPU_PARTIAL
91 int cpu_partial; /* Number of per cpu partial objects to keep around */ 91 /* Number of per cpu partial objects to keep around */
92 unsigned int cpu_partial;
92#endif 93#endif
93 struct kmem_cache_order_objects oo; 94 struct kmem_cache_order_objects oo;
94 95
@@ -98,10 +99,10 @@ struct kmem_cache {
98 gfp_t allocflags; /* gfp flags to use on each alloc */ 99 gfp_t allocflags; /* gfp flags to use on each alloc */
99 int refcount; /* Refcount for slab cache destroy */ 100 int refcount; /* Refcount for slab cache destroy */
100 void (*ctor)(void *); 101 void (*ctor)(void *);
101 int inuse; /* Offset to metadata */ 102 unsigned int inuse; /* Offset to metadata */
102 int align; /* Alignment */ 103 unsigned int align; /* Alignment */
103 int reserved; /* Reserved bytes at the end of slabs */ 104 unsigned int reserved; /* Reserved bytes at the end of slabs */
104 int red_left_pad; /* Left redzone padding size */ 105 unsigned int red_left_pad; /* Left redzone padding size */
105 const char *name; /* Name (only for display!) */ 106 const char *name; /* Name (only for display!) */
106 struct list_head list; /* List of slab caches */ 107 struct list_head list; /* List of slab caches */
107#ifdef CONFIG_SYSFS 108#ifdef CONFIG_SYSFS
@@ -110,7 +111,8 @@ struct kmem_cache {
110#endif 111#endif
111#ifdef CONFIG_MEMCG 112#ifdef CONFIG_MEMCG
112 struct memcg_cache_params memcg_params; 113 struct memcg_cache_params memcg_params;
113 int max_attr_size; /* for propagation, maximum size of a stored attr */ 114 /* for propagation, maximum size of a stored attr */
115 unsigned int max_attr_size;
114#ifdef CONFIG_SYSFS 116#ifdef CONFIG_SYSFS
115 struct kset *memcg_kset; 117 struct kset *memcg_kset;
116#endif 118#endif
@@ -124,7 +126,7 @@ struct kmem_cache {
124 /* 126 /*
125 * Defragmentation by allocating from a remote node. 127 * Defragmentation by allocating from a remote node.
126 */ 128 */
127 int remote_node_defrag_ratio; 129 unsigned int remote_node_defrag_ratio;
128#endif 130#endif
129 131
130#ifdef CONFIG_SLAB_FREELIST_RANDOM 132#ifdef CONFIG_SLAB_FREELIST_RANDOM
@@ -135,8 +137,8 @@ struct kmem_cache {
135 struct kasan_cache kasan_info; 137 struct kasan_cache kasan_info;
136#endif 138#endif
137 139
138 size_t useroffset; /* Usercopy region offset */ 140 unsigned int useroffset; /* Usercopy region offset */
139 size_t usersize; /* Usercopy region size */ 141 unsigned int usersize; /* Usercopy region size */
140 142
141 struct kmem_cache_node *node[MAX_NUMNODES]; 143 struct kmem_cache_node *node[MAX_NUMNODES];
142}; 144};
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a1a3f4ed94ce..2417d288e016 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -400,7 +400,6 @@ int generic_swapfile_activate(struct swap_info_struct *, struct file *,
400#define SWAP_ADDRESS_SPACE_SHIFT 14 400#define SWAP_ADDRESS_SPACE_SHIFT 14
401#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT) 401#define SWAP_ADDRESS_SPACE_PAGES (1 << SWAP_ADDRESS_SPACE_SHIFT)
402extern struct address_space *swapper_spaces[]; 402extern struct address_space *swapper_spaces[];
403extern bool swap_vma_readahead;
404#define swap_address_space(entry) \ 403#define swap_address_space(entry) \
405 (&swapper_spaces[swp_type(entry)][swp_offset(entry) \ 404 (&swapper_spaces[swp_type(entry)][swp_offset(entry) \
406 >> SWAP_ADDRESS_SPACE_SHIFT]) 405 >> SWAP_ADDRESS_SPACE_SHIFT])
@@ -422,14 +421,10 @@ extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
422extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t, 421extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
423 struct vm_area_struct *vma, unsigned long addr, 422 struct vm_area_struct *vma, unsigned long addr,
424 bool *new_page_allocated); 423 bool *new_page_allocated);
425extern struct page *swapin_readahead(swp_entry_t, gfp_t, 424extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
426 struct vm_area_struct *vma, unsigned long addr); 425 struct vm_fault *vmf);
427 426extern struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
428extern struct page *swap_readahead_detect(struct vm_fault *vmf, 427 struct vm_fault *vmf);
429 struct vma_swap_readahead *swap_ra);
430extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
431 struct vm_fault *vmf,
432 struct vma_swap_readahead *swap_ra);
433 428
434/* linux/mm/swapfile.c */ 429/* linux/mm/swapfile.c */
435extern atomic_long_t nr_swap_pages; 430extern atomic_long_t nr_swap_pages;
@@ -437,11 +432,6 @@ extern long total_swap_pages;
437extern atomic_t nr_rotate_swap; 432extern atomic_t nr_rotate_swap;
438extern bool has_usable_swap(void); 433extern bool has_usable_swap(void);
439 434
440static inline bool swap_use_vma_readahead(void)
441{
442 return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap);
443}
444
445/* Swap 50% full? Release swapcache more aggressively.. */ 435/* Swap 50% full? Release swapcache more aggressively.. */
446static inline bool vm_swap_full(void) 436static inline bool vm_swap_full(void)
447{ 437{
@@ -537,26 +527,14 @@ static inline void put_swap_page(struct page *page, swp_entry_t swp)
537{ 527{
538} 528}
539 529
540static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask, 530static inline struct page *swap_cluster_readahead(swp_entry_t entry,
541 struct vm_area_struct *vma, unsigned long addr) 531 gfp_t gfp_mask, struct vm_fault *vmf)
542{ 532{
543 return NULL; 533 return NULL;
544} 534}
545 535
546static inline bool swap_use_vma_readahead(void) 536static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
547{ 537 struct vm_fault *vmf)
548 return false;
549}
550
551static inline struct page *swap_readahead_detect(
552 struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
553{
554 return NULL;
555}
556
557static inline struct page *do_swap_page_readahead(
558 swp_entry_t fentry, gfp_t gfp_mask,
559 struct vm_fault *vmf, struct vma_swap_readahead *swap_ra)
560{ 538{
561 return NULL; 539 return NULL;
562} 540}
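After the consolidation, callers no longer pick between cluster and VMA readahead themselves; fault-path code simply hands over the fault descriptor (sketch, with entry being the swp_entry_t decoded from the faulting PTE):

	static struct page *example_swapin(struct vm_fault *vmf, swp_entry_t entry)
	{
		return swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
	}
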
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 57a8e98f2708..2219cce81ca4 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -47,6 +47,8 @@ void zs_destroy_pool(struct zs_pool *pool);
47unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); 47unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
48void zs_free(struct zs_pool *pool, unsigned long obj); 48void zs_free(struct zs_pool *pool, unsigned long obj);
49 49
50size_t zs_huge_class_size(struct zs_pool *pool);
51
50void *zs_map_object(struct zs_pool *pool, unsigned long handle, 52void *zs_map_object(struct zs_pool *pool, unsigned long handle,
51 enum zs_mapmode mm); 53 enum zs_mapmode mm);
52void zs_unmap_object(struct zs_pool *pool, unsigned long handle); 54void zs_unmap_object(struct zs_pool *pool, unsigned long handle);
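zram can now ask zsmalloc for the threshold above which an object occupies a whole zspage, rather than hard-coding max_zpage_size; a hedged sketch of the intended use:

	/* Store the page uncompressed once compression stops paying off. */
	static bool example_store_huge(struct zs_pool *pool, size_t comp_len)
	{
		return comp_len >= zs_huge_class_size(pool);
	}
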
diff --git a/include/net/sock.h b/include/net/sock.h
index 49bd2c1796b0..74d725fdbe0f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1114,8 +1114,8 @@ struct proto {
1114 struct kmem_cache *slab; 1114 struct kmem_cache *slab;
1115 unsigned int obj_size; 1115 unsigned int obj_size;
1116 slab_flags_t slab_flags; 1116 slab_flags_t slab_flags;
1117 size_t useroffset; /* Usercopy region offset */ 1117 unsigned int useroffset; /* Usercopy region offset */
1118 size_t usersize; /* Usercopy region size */ 1118 unsigned int usersize; /* Usercopy region size */
1119 1119
1120 struct percpu_counter *orphan_count; 1120 struct percpu_counter *orphan_count;
1121 1121
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index bcf4daccd6be..711372845945 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -20,7 +20,7 @@
20 EM( MR_SYSCALL, "syscall_or_cpuset") \ 20 EM( MR_SYSCALL, "syscall_or_cpuset") \
21 EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \ 21 EM( MR_MEMPOLICY_MBIND, "mempolicy_mbind") \
22 EM( MR_NUMA_MISPLACED, "numa_misplaced") \ 22 EM( MR_NUMA_MISPLACED, "numa_misplaced") \
23 EMe(MR_CMA, "cma") 23 EMe(MR_CONTIG_RANGE, "contig_range")
24 24
25/* 25/*
26 * First define the enums in the above macros to be exported to userspace 26 * First define the enums in the above macros to be exported to userspace
diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index e0b8b9173e1c..6570c5b45ba1 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -78,26 +78,29 @@ TRACE_EVENT(mm_vmscan_kswapd_wake,
78 78
79TRACE_EVENT(mm_vmscan_wakeup_kswapd, 79TRACE_EVENT(mm_vmscan_wakeup_kswapd,
80 80
81 TP_PROTO(int nid, int zid, int order), 81 TP_PROTO(int nid, int zid, int order, gfp_t gfp_flags),
82 82
83 TP_ARGS(nid, zid, order), 83 TP_ARGS(nid, zid, order, gfp_flags),
84 84
85 TP_STRUCT__entry( 85 TP_STRUCT__entry(
86 __field( int, nid ) 86 __field( int, nid )
87 __field( int, zid ) 87 __field( int, zid )
88 __field( int, order ) 88 __field( int, order )
89 __field( gfp_t, gfp_flags )
89 ), 90 ),
90 91
91 TP_fast_assign( 92 TP_fast_assign(
92 __entry->nid = nid; 93 __entry->nid = nid;
93 __entry->zid = zid; 94 __entry->zid = zid;
94 __entry->order = order; 95 __entry->order = order;
96 __entry->gfp_flags = gfp_flags;
95 ), 97 ),
96 98
97 TP_printk("nid=%d zid=%d order=%d", 99 TP_printk("nid=%d zid=%d order=%d gfp_flags=%s",
98 __entry->nid, 100 __entry->nid,
99 __entry->zid, 101 __entry->zid,
100 __entry->order) 102 __entry->order,
103 show_gfp_flags(__entry->gfp_flags))
101); 104);
102 105
103DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, 106DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template,
diff --git a/kernel/fork.c b/kernel/fork.c
index f71b67dc156d..242c8c93d285 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -595,6 +595,8 @@ static void check_mm(struct mm_struct *mm)
595void __mmdrop(struct mm_struct *mm) 595void __mmdrop(struct mm_struct *mm)
596{ 596{
597 BUG_ON(mm == &init_mm); 597 BUG_ON(mm == &init_mm);
598 WARN_ON_ONCE(mm == current->mm);
599 WARN_ON_ONCE(mm == current->active_mm);
598 mm_free_pgd(mm); 600 mm_free_pgd(mm);
599 destroy_context(mm); 601 destroy_context(mm);
600 hmm_mm_destroy(mm); 602 hmm_mm_destroy(mm);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 28b68995a417..e8afd6086f23 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5560,6 +5560,7 @@ void idle_task_exit(void)
5560 5560
5561 if (mm != &init_mm) { 5561 if (mm != &init_mm) {
5562 switch_mm(mm, &init_mm, current); 5562 switch_mm(mm, &init_mm, current);
5563 current->active_mm = &init_mm;
5563 finish_arch_post_lock_switch(); 5564 finish_arch_post_lock_switch();
5564 } 5565 }
5565 mmdrop(mm); 5566 mmdrop(mm);
diff --git a/kernel/ucount.c b/kernel/ucount.c
index b4eeee03934f..f48d1b6376a4 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -10,6 +10,7 @@
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/cred.h> 11#include <linux/cred.h>
12#include <linux/hash.h> 12#include <linux/hash.h>
13#include <linux/kmemleak.h>
13#include <linux/user_namespace.h> 14#include <linux/user_namespace.h>
14 15
15#define UCOUNTS_HASHTABLE_BITS 10 16#define UCOUNTS_HASHTABLE_BITS 10
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 9e498c77ed0e..a42eff7e8c48 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -607,7 +607,7 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen,
607 /* if no digit is after '-', it's wrong*/ 607 /* if no digit is after '-', it's wrong*/
608 if (at_start && in_range) 608 if (at_start && in_range)
609 return -EINVAL; 609 return -EINVAL;
610 if (!(a <= b) || !(used_size <= group_size)) 610 if (!(a <= b) || group_size == 0 || !(used_size <= group_size))
611 return -EINVAL; 611 return -EINVAL;
612 if (b >= nmaskbits) 612 if (b >= nmaskbits)
613 return -ERANGE; 613 return -ERANGE;
diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c
index b3f235baa05d..413367cf569e 100644
--- a/lib/test_bitmap.c
+++ b/lib/test_bitmap.c
@@ -255,6 +255,10 @@ static const struct test_bitmap_parselist parselist_tests[] __initconst = {
255 {-EINVAL, "-1", NULL, 8, 0}, 255 {-EINVAL, "-1", NULL, 8, 0},
256 {-EINVAL, "-0", NULL, 8, 0}, 256 {-EINVAL, "-0", NULL, 8, 0},
257 {-EINVAL, "10-1", NULL, 8, 0}, 257 {-EINVAL, "10-1", NULL, 8, 0},
258 {-EINVAL, "0-31:", NULL, 8, 0},
259 {-EINVAL, "0-31:0", NULL, 8, 0},
260 {-EINVAL, "0-31:0/0", NULL, 8, 0},
261 {-EINVAL, "0-31:1/0", NULL, 8, 0},
258 {-EINVAL, "0-31:10/1", NULL, 8, 0}, 262 {-EINVAL, "0-31:10/1", NULL, 8, 0},
259}; 263};
260 264
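The new test vectors exercise the N-M:U/G grouping syntax with an empty or zero-sized group, which __bitmap_parselist() now rejects; a valid string and the kernel-side call look like this (sketch):

	#include <linux/bitmap.h>

	static int example_parse(void)
	{
		DECLARE_BITMAP(mask, 32);

		/* Sets bits 0,4,8,...,28: 1 bit used per group of 4 over 0-31. */
		return bitmap_parselist("0-31:1/4", mask, 32);
		/* A zero group size such as "0-31:1/0" now fails with -EINVAL. */
	}
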
diff --git a/lib/test_firmware.c b/lib/test_firmware.c
index 078a61480573..cee000ac54d8 100644
--- a/lib/test_firmware.c
+++ b/lib/test_firmware.c
@@ -21,6 +21,7 @@
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/delay.h> 22#include <linux/delay.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/vmalloc.h>
24 25
25#define TEST_FIRMWARE_NAME "test-firmware.bin" 26#define TEST_FIRMWARE_NAME "test-firmware.bin"
26#define TEST_FIRMWARE_NUM_REQS 4 27#define TEST_FIRMWARE_NUM_REQS 4
diff --git a/mm/Makefile b/mm/Makefile
index e669f02c5a54..b4e54a9ae9c5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ obj-y := filemap.o mempool.o oom_kill.o \
37 readahead.o swap.o truncate.o vmscan.o shmem.o \ 37 readahead.o swap.o truncate.o vmscan.o shmem.o \
38 util.o mmzone.o vmstat.o backing-dev.o \ 38 util.o mmzone.o vmstat.o backing-dev.o \
39 mm_init.o mmu_context.o percpu.o slab_common.o \ 39 mm_init.o mmu_context.o percpu.o slab_common.o \
40 compaction.o vmacache.o swap_slots.o \ 40 compaction.o vmacache.o \
41 interval_tree.o list_lru.o workingset.o \ 41 interval_tree.o list_lru.o workingset.o \
42 debug.o $(mmu-y) 42 debug.o $(mmu-y)
43 43
@@ -55,7 +55,7 @@ ifdef CONFIG_MMU
55endif 55endif
56obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 56obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
57 57
58obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o 58obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
59obj-$(CONFIG_FRONTSWAP) += frontswap.o 59obj-$(CONFIG_FRONTSWAP) += frontswap.o
60obj-$(CONFIG_ZSWAP) += zswap.o 60obj-$(CONFIG_ZSWAP) += zswap.o
61obj-$(CONFIG_HAS_DMA) += dmapool.o 61obj-$(CONFIG_HAS_DMA) += dmapool.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d2984e9fcf08..08b9aab631ab 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -100,18 +100,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
100 100
101 return 0; 101 return 0;
102} 102}
103 103DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
104static int bdi_debug_stats_open(struct inode *inode, struct file *file)
105{
106 return single_open(file, bdi_debug_stats_show, inode->i_private);
107}
108
109static const struct file_operations bdi_debug_stats_fops = {
110 .open = bdi_debug_stats_open,
111 .read = seq_read,
112 .llseek = seq_lseek,
113 .release = single_release,
114};
115 104
116static int bdi_debug_register(struct backing_dev_info *bdi, const char *name) 105static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
117{ 106{
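
For readers unfamiliar with the helper: DEFINE_SHOW_ATTRIBUTE() from <linux/seq_file.h> generates roughly the boilerplate that the deleted lines spelled out by hand, an open wrapper around single_open() plus the matching file_operations. A sketch of the macro's effect for this caller (approximate, not the literal header text):

/* Roughly what DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats) provides: */
static int bdi_debug_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, bdi_debug_stats_show, inode->i_private);
}

static const struct file_operations bdi_debug_stats_fops = {
	.owner		= THIS_MODULE,
	.open		= bdi_debug_stats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
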
diff --git a/mm/cma.c b/mm/cma.c
index 0607729abf3b..5809bbe360d7 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,6 +35,7 @@
35#include <linux/cma.h> 35#include <linux/cma.h>
36#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/io.h> 37#include <linux/io.h>
38#include <linux/kmemleak.h>
38#include <trace/events/cma.h> 39#include <trace/events/cma.h>
39 40
40#include "cma.h" 41#include "cma.h"
@@ -165,6 +166,9 @@ core_initcall(cma_init_reserved_areas);
165 * @base: Base address of the reserved area 166 * @base: Base address of the reserved area
166 * @size: Size of the reserved area (in bytes), 167 * @size: Size of the reserved area (in bytes),
167 * @order_per_bit: Order of pages represented by one bit on bitmap. 168 * @order_per_bit: Order of pages represented by one bit on bitmap.
169 * @name: The name of the area. If this parameter is NULL, the name of
170 * the area will be set to "cmaN", where N is a running counter of
171 * used areas.
168 * @res_cma: Pointer to store the created cma region. 172 * @res_cma: Pointer to store the created cma region.
169 * 173 *
170 * This function creates custom contiguous area from already reserved memory. 174 * This function creates custom contiguous area from already reserved memory.
@@ -227,6 +231,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
227 * @alignment: Alignment for the CMA area, should be power of 2 or zero 231 * @alignment: Alignment for the CMA area, should be power of 2 or zero
228 * @order_per_bit: Order of pages represented by one bit on bitmap. 232 * @order_per_bit: Order of pages represented by one bit on bitmap.
229 * @fixed: hint about where to place the reserved area 233 * @fixed: hint about where to place the reserved area
234 * @name: The name of the area. See function cma_init_reserved_mem()
230 * @res_cma: Pointer to store the created cma region. 235 * @res_cma: Pointer to store the created cma region.
231 * 236 *
232 * This function reserves memory from early allocator. It should be 237 * This function reserves memory from early allocator. It should be
@@ -390,6 +395,7 @@ static inline void cma_debug_show_areas(struct cma *cma) { }
390 * @cma: Contiguous memory region for which the allocation is performed. 395 * @cma: Contiguous memory region for which the allocation is performed.
391 * @count: Requested number of pages. 396 * @count: Requested number of pages.
392 * @align: Requested alignment of pages (in PAGE_SIZE order). 397 * @align: Requested alignment of pages (in PAGE_SIZE order).
398 * @gfp_mask: GFP mask to use during compaction
393 * 399 *
394 * This function allocates part of contiguous memory on specific 400 * This function allocates part of contiguous memory on specific
395 * contiguous memory area. 401 * contiguous memory area.
diff --git a/mm/compaction.c b/mm/compaction.c
index 2c8999d027ab..88d01a50a015 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -576,6 +576,7 @@ isolate_fail:
576 576
577/** 577/**
578 * isolate_freepages_range() - isolate free pages. 578 * isolate_freepages_range() - isolate free pages.
579 * @cc: Compaction control structure.
579 * @start_pfn: The first PFN to start isolating. 580 * @start_pfn: The first PFN to start isolating.
580 * @end_pfn: The one-past-last PFN. 581 * @end_pfn: The one-past-last PFN.
581 * 582 *
@@ -1988,6 +1989,14 @@ static void kcompactd_do_work(pg_data_t *pgdat)
1988 compaction_defer_reset(zone, cc.order, false); 1989 compaction_defer_reset(zone, cc.order, false);
1989 } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { 1990 } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
1990 /* 1991 /*
1992 * Buddy pages may become stranded on pcps that could
1993 * otherwise coalesce on the zone's free area for
1994 * order >= cc.order. This is ratelimited by the
1995 * upcoming deferral.
1996 */
1997 drain_all_pages(zone);
1998
1999 /*
1991 * We use sync migration mode here, so we defer like 2000 * We use sync migration mode here, so we defer like
1992 * sync direct compaction does. 2001 * sync direct compaction does.
1993 */ 2002 */
diff --git a/mm/failslab.c b/mm/failslab.c
index 8087d976a809..1f2f248e3601 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -14,7 +14,7 @@ static struct {
14 .cache_filter = false, 14 .cache_filter = false,
15}; 15};
16 16
17bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) 17bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags)
18{ 18{
19 /* No fault-injection for bootstrap cache */ 19 /* No fault-injection for bootstrap cache */
20 if (unlikely(s == kmem_cache)) 20 if (unlikely(s == kmem_cache))
diff --git a/mm/gup.c b/mm/gup.c
index 6afae32571ca..f296df6cf666 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -531,7 +531,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
531 * reCOWed by userspace write). 531 * reCOWed by userspace write).
532 */ 532 */
533 if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) 533 if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
534 *flags |= FOLL_COW; 534 *flags |= FOLL_COW;
535 return 0; 535 return 0;
536} 536}
537 537
@@ -1638,7 +1638,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1638 PMD_SHIFT, next, write, pages, nr)) 1638 PMD_SHIFT, next, write, pages, nr))
1639 return 0; 1639 return 0;
1640 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) 1640 } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
1641 return 0; 1641 return 0;
1642 } while (pmdp++, addr = next, addr != end); 1642 } while (pmdp++, addr = next, addr != end);
1643 1643
1644 return 1; 1644 return 1;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5a68730eebd6..f0ae8d1d4329 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2356,26 +2356,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
2356 struct page *page_tail = head + tail; 2356 struct page *page_tail = head + tail;
2357 2357
2358 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); 2358 VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2359 VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
2360 2359
2361 /* 2360 /*
2362 * tail_page->_refcount is zero and not changing from under us. But 2361 * Clone page flags before unfreezing refcount.
2363 * get_page_unless_zero() may be running from under us on the 2362 *
2364 * tail_page. If we used atomic_set() below instead of atomic_inc() or 2363 * After successful get_page_unless_zero() might follow flags change,
2365 * atomic_add(), we would then run atomic_set() concurrently with 2364 * for exmaple lock_page() which set PG_waiters.
2366 * get_page_unless_zero(), and atomic_set() is implemented in C not
2367 * using locked ops. spin_unlock on x86 sometime uses locked ops
2368 * because of PPro errata 66, 92, so unless somebody can guarantee
2369 * atomic_set() here would be safe on all archs (and not only on x86),
2370 * it's safer to use atomic_inc()/atomic_add().
2371 */ 2365 */
2372 if (PageAnon(head) && !PageSwapCache(head)) {
2373 page_ref_inc(page_tail);
2374 } else {
2375 /* Additional pin to radix tree */
2376 page_ref_add(page_tail, 2);
2377 }
2378
2379 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 2366 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2380 page_tail->flags |= (head->flags & 2367 page_tail->flags |= (head->flags &
2381 ((1L << PG_referenced) | 2368 ((1L << PG_referenced) |
@@ -2388,14 +2375,21 @@ static void __split_huge_page_tail(struct page *head, int tail,
2388 (1L << PG_unevictable) | 2375 (1L << PG_unevictable) |
2389 (1L << PG_dirty))); 2376 (1L << PG_dirty)));
2390 2377
2391 /* 2378 /* Page flags must be visible before we make the page non-compound. */
2392 * After clearing PageTail the gup refcount can be released.
2393 * Page flags also must be visible before we make the page non-compound.
2394 */
2395 smp_wmb(); 2379 smp_wmb();
2396 2380
2381 /*
2382 * Clear PageTail before unfreezing page refcount.
2383 *
2384 * After successful get_page_unless_zero() might follow put_page()
2385 * which needs correct compound_head().
2386 */
2397 clear_compound_head(page_tail); 2387 clear_compound_head(page_tail);
2398 2388
2389 /* Finally unfreeze refcount. Additional reference from page cache. */
2390 page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
2391 PageSwapCache(head)));
2392
2399 if (page_is_young(head)) 2393 if (page_is_young(head))
2400 set_page_young(page_tail); 2394 set_page_young(page_tail);
2401 if (page_is_idle(head)) 2395 if (page_is_idle(head))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 976bbc5646fe..218679138255 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -637,29 +637,22 @@ EXPORT_SYMBOL_GPL(linear_hugepage_index);
637 */ 637 */
638unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) 638unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
639{ 639{
640 struct hstate *hstate; 640 if (vma->vm_ops && vma->vm_ops->pagesize)
641 641 return vma->vm_ops->pagesize(vma);
642 if (!is_vm_hugetlb_page(vma)) 642 return PAGE_SIZE;
643 return PAGE_SIZE;
644
645 hstate = hstate_vma(vma);
646
647 return 1UL << huge_page_shift(hstate);
648} 643}
649EXPORT_SYMBOL_GPL(vma_kernel_pagesize); 644EXPORT_SYMBOL_GPL(vma_kernel_pagesize);
650 645
651/* 646/*
652 * Return the page size being used by the MMU to back a VMA. In the majority 647 * Return the page size being used by the MMU to back a VMA. In the majority
653 * of cases, the page size used by the kernel matches the MMU size. On 648 * of cases, the page size used by the kernel matches the MMU size. On
654 * architectures where it differs, an architecture-specific version of this 649 * architectures where it differs, an architecture-specific 'strong'
655 * function is required. 650 * version of this symbol is required.
656 */ 651 */
657#ifndef vma_mmu_pagesize 652__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
658unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
659{ 653{
660 return vma_kernel_pagesize(vma); 654 return vma_kernel_pagesize(vma);
661} 655}
662#endif
663 656
664/* 657/*
665 * Flags for MAP_PRIVATE reservations. These are stored in the bottom 658 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
@@ -3153,6 +3146,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
3153 return 0; 3146 return 0;
3154} 3147}
3155 3148
3149static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
3150{
3151 struct hstate *hstate = hstate_vma(vma);
3152
3153 return 1UL << huge_page_shift(hstate);
3154}
3155
3156/* 3156/*
3157 * We cannot handle pagefaults against hugetlb pages at all. They cause 3157 * We cannot handle pagefaults against hugetlb pages at all. They cause
3158 * handle_mm_fault() to try to instantiate regular-sized pages in the 3158 * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -3170,6 +3170,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
3170 .open = hugetlb_vm_op_open, 3170 .open = hugetlb_vm_op_open,
3171 .close = hugetlb_vm_op_close, 3171 .close = hugetlb_vm_op_close,
3172 .split = hugetlb_vm_op_split, 3172 .split = hugetlb_vm_op_split,
3173 .pagesize = hugetlb_vm_op_pagesize,
3173}; 3174};
3174 3175
3175static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 3176static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
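
Any mapping backed by pages larger than PAGE_SIZE can now report its size through the new ->pagesize() hook, so vma_kernel_pagesize() no longer needs hugetlb-specific knowledge. A hypothetical driver could wire it up the same way hugetlb_vm_op_pagesize() does; the names and the 2MB figure below are illustrative only, not taken from this series:

#include <linux/mm.h>
#include <linux/sizes.h>

/* Hypothetical: every VMA created by this driver is backed by 2MB pages. */
static unsigned long mydrv_vm_pagesize(struct vm_area_struct *vma)
{
	return SZ_2M;
}

static const struct vm_operations_struct mydrv_vm_ops = {
	/* a real driver would also set .fault, .open, .close, ... */
	.pagesize	= mydrv_vm_pagesize,
};
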
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index e13d911251e7..bc0e68f7dc75 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -323,9 +323,9 @@ void kasan_free_pages(struct page *page, unsigned int order)
323 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. 323 * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
324 * For larger allocations larger redzones are used. 324 * For larger allocations larger redzones are used.
325 */ 325 */
326static size_t optimal_redzone(size_t object_size) 326static unsigned int optimal_redzone(unsigned int object_size)
327{ 327{
328 int rz = 328 return
329 object_size <= 64 - 16 ? 16 : 329 object_size <= 64 - 16 ? 16 :
330 object_size <= 128 - 32 ? 32 : 330 object_size <= 128 - 32 ? 32 :
331 object_size <= 512 - 64 ? 64 : 331 object_size <= 512 - 64 ? 64 :
@@ -333,14 +333,13 @@ static size_t optimal_redzone(size_t object_size)
333 object_size <= (1 << 14) - 256 ? 256 : 333 object_size <= (1 << 14) - 256 ? 256 :
334 object_size <= (1 << 15) - 512 ? 512 : 334 object_size <= (1 << 15) - 512 ? 512 :
335 object_size <= (1 << 16) - 1024 ? 1024 : 2048; 335 object_size <= (1 << 16) - 1024 ? 1024 : 2048;
336 return rz;
337} 336}
338 337
339void kasan_cache_create(struct kmem_cache *cache, size_t *size, 338void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
340 slab_flags_t *flags) 339 slab_flags_t *flags)
341{ 340{
341 unsigned int orig_size = *size;
342 int redzone_adjust; 342 int redzone_adjust;
343 int orig_size = *size;
344 343
345 /* Add alloc meta. */ 344 /* Add alloc meta. */
346 cache->kasan_info.alloc_meta_offset = *size; 345 cache->kasan_info.alloc_meta_offset = *size;
@@ -358,7 +357,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
358 if (redzone_adjust > 0) 357 if (redzone_adjust > 0)
359 *size += redzone_adjust; 358 *size += redzone_adjust;
360 359
361 *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + 360 *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
361 max(*size, cache->object_size +
362 optimal_redzone(cache->object_size))); 362 optimal_redzone(cache->object_size)));
363 363
364 /* 364 /*
@@ -382,7 +382,8 @@ void kasan_cache_shrink(struct kmem_cache *cache)
382 382
383void kasan_cache_shutdown(struct kmem_cache *cache) 383void kasan_cache_shutdown(struct kmem_cache *cache)
384{ 384{
385 quarantine_remove_cache(cache); 385 if (!__kmem_cache_empty(cache))
386 quarantine_remove_cache(cache);
386} 387}
387 388
388size_t kasan_metadata_size(struct kmem_cache *cache) 389size_t kasan_metadata_size(struct kmem_cache *cache)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 46c2290a08f1..9a085d525bbc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1187,6 +1187,11 @@ EXPORT_SYMBOL(kmemleak_no_scan);
1187/** 1187/**
1188 * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical 1188 * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical
1189 * address argument 1189 * address argument
1190 * @phys: physical address of the object
1191 * @size: size of the object
1192 * @min_count: minimum number of references to this object.
1193 * See kmemleak_alloc()
1194 * @gfp: kmalloc() flags used for kmemleak internal memory allocations
1190 */ 1195 */
1191void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, 1196void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
1192 gfp_t gfp) 1197 gfp_t gfp)
@@ -1199,6 +1204,9 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
1199/** 1204/**
1200 * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a 1205 * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a
1201 * physical address argument 1206 * physical address argument
1207 * @phys: physical address of the beginning or inside an object. This
1208 * also represents the start of the range to be freed
1209 * @size: size to be unregistered
1202 */ 1210 */
1203void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) 1211void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
1204{ 1212{
@@ -1210,6 +1218,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys);
1210/** 1218/**
1211 * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical 1219 * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
1212 * address argument 1220 * address argument
1221 * @phys: physical address of the object
1213 */ 1222 */
1214void __ref kmemleak_not_leak_phys(phys_addr_t phys) 1223void __ref kmemleak_not_leak_phys(phys_addr_t phys)
1215{ 1224{
@@ -1221,6 +1230,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
1221/** 1230/**
1222 * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical 1231 * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
1223 * address argument 1232 * address argument
1233 * @phys: physical address of the object
1224 */ 1234 */
1225void __ref kmemleak_ignore_phys(phys_addr_t phys) 1235void __ref kmemleak_ignore_phys(phys_addr_t phys)
1226{ 1236{
@@ -1963,7 +1973,7 @@ static void kmemleak_disable(void)
1963/* 1973/*
1964 * Allow boot-time kmemleak disabling (enabled by default). 1974 * Allow boot-time kmemleak disabling (enabled by default).
1965 */ 1975 */
1966static int kmemleak_boot_config(char *str) 1976static int __init kmemleak_boot_config(char *str)
1967{ 1977{
1968 if (!str) 1978 if (!str)
1969 return -EINVAL; 1979 return -EINVAL;
diff --git a/mm/ksm.c b/mm/ksm.c
index adb5f991da8e..e8d6c6210b80 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1318,10 +1318,10 @@ bool is_page_sharing_candidate(struct stable_node *stable_node)
1318 return __is_page_sharing_candidate(stable_node, 0); 1318 return __is_page_sharing_candidate(stable_node, 0);
1319} 1319}
1320 1320
1321struct page *stable_node_dup(struct stable_node **_stable_node_dup, 1321static struct page *stable_node_dup(struct stable_node **_stable_node_dup,
1322 struct stable_node **_stable_node, 1322 struct stable_node **_stable_node,
1323 struct rb_root *root, 1323 struct rb_root *root,
1324 bool prune_stale_stable_nodes) 1324 bool prune_stale_stable_nodes)
1325{ 1325{
1326 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; 1326 struct stable_node *dup, *found = NULL, *stable_node = *_stable_node;
1327 struct hlist_node *hlist_safe; 1327 struct hlist_node *hlist_safe;
@@ -2082,8 +2082,22 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2082 tree_rmap_item = 2082 tree_rmap_item =
2083 unstable_tree_search_insert(rmap_item, page, &tree_page); 2083 unstable_tree_search_insert(rmap_item, page, &tree_page);
2084 if (tree_rmap_item) { 2084 if (tree_rmap_item) {
2085 bool split;
2086
2085 kpage = try_to_merge_two_pages(rmap_item, page, 2087 kpage = try_to_merge_two_pages(rmap_item, page,
2086 tree_rmap_item, tree_page); 2088 tree_rmap_item, tree_page);
2089 /*
2090 * If both pages we tried to merge belong to the same compound
2091 * page, then we actually ended up increasing the reference
2092 * count of the same compound page twice, and split_huge_page
2093 * failed.
2094 * Here we set a flag if that happened, and we use it later to
2095 * try split_huge_page again. Since we call put_page right
2096 * afterwards, the reference count will be correct and
2097 * split_huge_page should succeed.
2098 */
2099 split = PageTransCompound(page)
2100 && compound_head(page) == compound_head(tree_page);
2087 put_page(tree_page); 2101 put_page(tree_page);
2088 if (kpage) { 2102 if (kpage) {
2089 /* 2103 /*
@@ -2110,6 +2124,20 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
2110 break_cow(tree_rmap_item); 2124 break_cow(tree_rmap_item);
2111 break_cow(rmap_item); 2125 break_cow(rmap_item);
2112 } 2126 }
2127 } else if (split) {
2128 /*
2129 * We are here if we tried to merge two pages and
2130 * failed because they both belonged to the same
2131 * compound page. We will split the page now, but no
2132 * merging will take place.
2133 * We do not want to add the cost of a full lock; if
2134 * the page is locked, it is better to skip it and
2135 * perhaps try again later.
2136 */
2137 if (!trylock_page(page))
2138 return;
2139 split_huge_page(page);
2140 unlock_page(page);
2113 } 2141 }
2114 } 2142 }
2115} 2143}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fd41e969ede5..fcfb6c89ed47 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -52,14 +52,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru)
52static inline struct list_lru_one * 52static inline struct list_lru_one *
53list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) 53list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
54{ 54{
55 struct list_lru_memcg *memcg_lrus;
55 /* 56 /*
56 * The lock protects the array of per cgroup lists from relocation 57 * Either lock or RCU protects the array of per cgroup lists
57 * (see memcg_update_list_lru_node). 58 * from relocation (see memcg_update_list_lru_node).
58 */ 59 */
59 lockdep_assert_held(&nlru->lock); 60 memcg_lrus = rcu_dereference_check(nlru->memcg_lrus,
60 if (nlru->memcg_lrus && idx >= 0) 61 lockdep_is_held(&nlru->lock));
61 return nlru->memcg_lrus->lru[idx]; 62 if (memcg_lrus && idx >= 0)
62 63 return memcg_lrus->lru[idx];
63 return &nlru->lru; 64 return &nlru->lru;
64} 65}
65 66
@@ -168,10 +169,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru,
168 struct list_lru_one *l; 169 struct list_lru_one *l;
169 unsigned long count; 170 unsigned long count;
170 171
171 spin_lock(&nlru->lock); 172 rcu_read_lock();
172 l = list_lru_from_memcg_idx(nlru, memcg_idx); 173 l = list_lru_from_memcg_idx(nlru, memcg_idx);
173 count = l->nr_items; 174 count = l->nr_items;
174 spin_unlock(&nlru->lock); 175 rcu_read_unlock();
175 176
176 return count; 177 return count;
177} 178}
@@ -324,24 +325,41 @@ fail:
324 325
325static int memcg_init_list_lru_node(struct list_lru_node *nlru) 326static int memcg_init_list_lru_node(struct list_lru_node *nlru)
326{ 327{
328 struct list_lru_memcg *memcg_lrus;
327 int size = memcg_nr_cache_ids; 329 int size = memcg_nr_cache_ids;
328 330
329 nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); 331 memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
330 if (!nlru->memcg_lrus) 332 size * sizeof(void *), GFP_KERNEL);
333 if (!memcg_lrus)
331 return -ENOMEM; 334 return -ENOMEM;
332 335
333 if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { 336 if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
334 kvfree(nlru->memcg_lrus); 337 kvfree(memcg_lrus);
335 return -ENOMEM; 338 return -ENOMEM;
336 } 339 }
340 RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus);
337 341
338 return 0; 342 return 0;
339} 343}
340 344
341static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) 345static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
342{ 346{
343 __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); 347 struct list_lru_memcg *memcg_lrus;
344 kvfree(nlru->memcg_lrus); 348 /*
349 * This is called when shrinker has already been unregistered,
350 * and nobody can use it. So, there is no need to use kvfree_rcu().
351 */
352 memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
353 __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
354 kvfree(memcg_lrus);
355}
356
357static void kvfree_rcu(struct rcu_head *head)
358{
359 struct list_lru_memcg *mlru;
360
361 mlru = container_of(head, struct list_lru_memcg, rcu);
362 kvfree(mlru);
345} 363}
346 364
347static int memcg_update_list_lru_node(struct list_lru_node *nlru, 365static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -351,8 +369,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
351 369
352 BUG_ON(old_size > new_size); 370 BUG_ON(old_size > new_size);
353 371
354 old = nlru->memcg_lrus; 372 old = rcu_dereference_protected(nlru->memcg_lrus,
355 new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); 373 lockdep_is_held(&list_lrus_mutex));
374 new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
356 if (!new) 375 if (!new)
357 return -ENOMEM; 376 return -ENOMEM;
358 377
@@ -361,29 +380,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
361 return -ENOMEM; 380 return -ENOMEM;
362 } 381 }
363 382
364 memcpy(new, old, old_size * sizeof(void *)); 383 memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
365 384
366 /* 385 /*
367 * The lock guarantees that we won't race with a reader 386 * The locking below allows readers that hold nlru->lock avoid taking
368 * (see list_lru_from_memcg_idx). 387 * rcu_read_lock (see list_lru_from_memcg_idx).
369 * 388 *
370 * Since list_lru_{add,del} may be called under an IRQ-safe lock, 389 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
371 * we have to use IRQ-safe primitives here to avoid deadlock. 390 * we have to use IRQ-safe primitives here to avoid deadlock.
372 */ 391 */
373 spin_lock_irq(&nlru->lock); 392 spin_lock_irq(&nlru->lock);
374 nlru->memcg_lrus = new; 393 rcu_assign_pointer(nlru->memcg_lrus, new);
375 spin_unlock_irq(&nlru->lock); 394 spin_unlock_irq(&nlru->lock);
376 395
377 kvfree(old); 396 call_rcu(&old->rcu, kvfree_rcu);
378 return 0; 397 return 0;
379} 398}
380 399
381static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, 400static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
382 int old_size, int new_size) 401 int old_size, int new_size)
383{ 402{
403 struct list_lru_memcg *memcg_lrus;
404
405 memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus,
406 lockdep_is_held(&list_lrus_mutex));
384 /* do not bother shrinking the array back to the old size, because we 407 /* do not bother shrinking the array back to the old size, because we
385 * cannot handle allocation failures here */ 408 * cannot handle allocation failures here */
386 __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); 409 __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
387} 410}
388 411
389static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 412static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
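
The lockless __list_lru_count_one() path above works because nlru->memcg_lrus is now an RCU-protected pointer: readers take rcu_read_lock() and may observe a slightly stale array, while updaters publish the replacement with rcu_assign_pointer() under nlru->lock and free the old array only after a grace period (here via call_rcu() and a custom callback, since the array may be kvmalloc()-ed and kfree_rcu() only handles kmalloc()-ed memory). A generic sketch of the same pattern, with illustrative names unrelated to list_lru:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct cfg {
	struct rcu_head rcu;
	int nr;
};

static struct cfg __rcu *active_cfg;
static DEFINE_SPINLOCK(cfg_lock);

/* Reader: no lock; tolerates seeing the old object for one grace period. */
static int cfg_read_nr(void)
{
	struct cfg *c;
	int nr;

	rcu_read_lock();
	c = rcu_dereference(active_cfg);
	nr = c ? c->nr : 0;
	rcu_read_unlock();
	return nr;
}

/* Updater: publish the new object, free the old one after a grace period. */
static void cfg_replace(struct cfg *new)
{
	struct cfg *old;

	spin_lock(&cfg_lock);
	old = rcu_dereference_protected(active_cfg,
					lockdep_is_held(&cfg_lock));
	rcu_assign_pointer(active_cfg, new);
	spin_unlock(&cfg_lock);

	if (old)
		kfree_rcu(old, rcu);	/* kmalloc()-ed here, so kfree_rcu() suffices */
}
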
diff --git a/mm/memblock.c b/mm/memblock.c
index 48376bd33274..9b04568ad42a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -17,6 +17,7 @@
17#include <linux/poison.h> 17#include <linux/poison.h>
18#include <linux/pfn.h> 18#include <linux/pfn.h>
19#include <linux/debugfs.h> 19#include <linux/debugfs.h>
20#include <linux/kmemleak.h>
20#include <linux/seq_file.h> 21#include <linux/seq_file.h>
21#include <linux/memblock.h> 22#include <linux/memblock.h>
22 23
@@ -924,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags,
924 r = &type_b->regions[idx_b]; 925 r = &type_b->regions[idx_b];
925 r_start = idx_b ? r[-1].base + r[-1].size : 0; 926 r_start = idx_b ? r[-1].base + r[-1].size : 0;
926 r_end = idx_b < type_b->cnt ? 927 r_end = idx_b < type_b->cnt ?
927 r->base : ULLONG_MAX; 928 r->base : (phys_addr_t)ULLONG_MAX;
928 929
929 /* 930 /*
930 * if idx_b advanced past idx_a, 931 * if idx_b advanced past idx_a,
@@ -1040,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags,
1040 r = &type_b->regions[idx_b]; 1041 r = &type_b->regions[idx_b];
1041 r_start = idx_b ? r[-1].base + r[-1].size : 0; 1042 r_start = idx_b ? r[-1].base + r[-1].size : 0;
1042 r_end = idx_b < type_b->cnt ? 1043 r_end = idx_b < type_b->cnt ?
1043 r->base : ULLONG_MAX; 1044 r->base : (phys_addr_t)ULLONG_MAX;
1044 /* 1045 /*
1045 * if idx_b advanced past idx_a, 1046 * if idx_b advanced past idx_a,
1046 * break out to advance idx_a 1047 * break out to advance idx_a
@@ -1345,7 +1346,7 @@ void * __init memblock_virt_alloc_try_nid_raw(
1345 min_addr, max_addr, nid); 1346 min_addr, max_addr, nid);
1346#ifdef CONFIG_DEBUG_VM 1347#ifdef CONFIG_DEBUG_VM
1347 if (ptr && size > 0) 1348 if (ptr && size > 0)
1348 memset(ptr, 0xff, size); 1349 memset(ptr, PAGE_POISON_PATTERN, size);
1349#endif 1350#endif
1350 return ptr; 1351 return ptr;
1351} 1352}
@@ -1750,29 +1751,6 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
1750 } 1751 }
1751} 1752}
1752 1753
1753extern unsigned long __init_memblock
1754memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr)
1755{
1756 struct memblock_region *rgn;
1757 unsigned long size = 0;
1758 int idx;
1759
1760 for_each_memblock_type(idx, (&memblock.reserved), rgn) {
1761 phys_addr_t start, end;
1762
1763 if (rgn->base + rgn->size < start_addr)
1764 continue;
1765 if (rgn->base > end_addr)
1766 continue;
1767
1768 start = rgn->base;
1769 end = start + rgn->size;
1770 size += end - start;
1771 }
1772
1773 return size;
1774}
1775
1776void __init_memblock __memblock_dump_all(void) 1754void __init_memblock __memblock_dump_all(void)
1777{ 1755{
1778 pr_info("MEMBLOCK configuration:\n"); 1756 pr_info("MEMBLOCK configuration:\n");
@@ -1818,18 +1796,7 @@ static int memblock_debug_show(struct seq_file *m, void *private)
1818 } 1796 }
1819 return 0; 1797 return 0;
1820} 1798}
1821 1799DEFINE_SHOW_ATTRIBUTE(memblock_debug);
1822static int memblock_debug_open(struct inode *inode, struct file *file)
1823{
1824 return single_open(file, memblock_debug_show, inode->i_private);
1825}
1826
1827static const struct file_operations memblock_debug_fops = {
1828 .open = memblock_debug_open,
1829 .read = seq_read,
1830 .llseek = seq_lseek,
1831 .release = single_release,
1832};
1833 1800
1834static int __init memblock_init_debugfs(void) 1801static int __init memblock_init_debugfs(void)
1835{ 1802{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8291b75f42c8..2d4bf647cf01 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -502,6 +502,7 @@ static const char * const action_page_types[] = {
502 [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", 502 [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
503 [MF_MSG_HUGE] = "huge page", 503 [MF_MSG_HUGE] = "huge page",
504 [MF_MSG_FREE_HUGE] = "free huge page", 504 [MF_MSG_FREE_HUGE] = "free huge page",
505 [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
505 [MF_MSG_UNMAP_FAILED] = "unmapping failed page", 506 [MF_MSG_UNMAP_FAILED] = "unmapping failed page",
506 [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", 507 [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
507 [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", 508 [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
@@ -1084,6 +1085,21 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
1084 return 0; 1085 return 0;
1085 } 1086 }
1086 1087
1088 /*
1089 * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
1090 * simply disable it. In order to make it work properly, we need to
1091 * make sure that:
1092 * - conversion of a pud that maps an error hugetlb into hwpoison
1093 * entry properly works, and
1094 * - other mm code walking over page table is aware of pud-aligned
1095 * hwpoison entries.
1096 */
1097 if (huge_page_size(page_hstate(head)) > PMD_SIZE) {
1098 action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED);
1099 res = -EBUSY;
1100 goto out;
1101 }
1102
1087 if (!hwpoison_user_mappings(p, pfn, flags, &head)) { 1103 if (!hwpoison_user_mappings(p, pfn, flags, &head)) {
1088 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); 1104 action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
1089 res = -EBUSY; 1105 res = -EBUSY;
diff --git a/mm/memory.c b/mm/memory.c
index aed37325d94e..01f5464e0fd2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2883,26 +2883,16 @@ EXPORT_SYMBOL(unmap_mapping_range);
2883int do_swap_page(struct vm_fault *vmf) 2883int do_swap_page(struct vm_fault *vmf)
2884{ 2884{
2885 struct vm_area_struct *vma = vmf->vma; 2885 struct vm_area_struct *vma = vmf->vma;
2886 struct page *page = NULL, *swapcache = NULL; 2886 struct page *page = NULL, *swapcache;
2887 struct mem_cgroup *memcg; 2887 struct mem_cgroup *memcg;
2888 struct vma_swap_readahead swap_ra;
2889 swp_entry_t entry; 2888 swp_entry_t entry;
2890 pte_t pte; 2889 pte_t pte;
2891 int locked; 2890 int locked;
2892 int exclusive = 0; 2891 int exclusive = 0;
2893 int ret = 0; 2892 int ret = 0;
2894 bool vma_readahead = swap_use_vma_readahead();
2895 2893
2896 if (vma_readahead) { 2894 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2897 page = swap_readahead_detect(vmf, &swap_ra);
2898 swapcache = page;
2899 }
2900
2901 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) {
2902 if (page)
2903 put_page(page);
2904 goto out; 2895 goto out;
2905 }
2906 2896
2907 entry = pte_to_swp_entry(vmf->orig_pte); 2897 entry = pte_to_swp_entry(vmf->orig_pte);
2908 if (unlikely(non_swap_entry(entry))) { 2898 if (unlikely(non_swap_entry(entry))) {
@@ -2928,11 +2918,8 @@ int do_swap_page(struct vm_fault *vmf)
2928 2918
2929 2919
2930 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2920 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2931 if (!page) { 2921 page = lookup_swap_cache(entry, vma, vmf->address);
2932 page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, 2922 swapcache = page;
2933 vmf->address);
2934 swapcache = page;
2935 }
2936 2923
2937 if (!page) { 2924 if (!page) {
2938 struct swap_info_struct *si = swp_swap_info(entry); 2925 struct swap_info_struct *si = swp_swap_info(entry);
@@ -2940,7 +2927,8 @@ int do_swap_page(struct vm_fault *vmf)
2940 if (si->flags & SWP_SYNCHRONOUS_IO && 2927 if (si->flags & SWP_SYNCHRONOUS_IO &&
2941 __swap_count(si, entry) == 1) { 2928 __swap_count(si, entry) == 1) {
2942 /* skip swapcache */ 2929 /* skip swapcache */
2943 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); 2930 page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2931 vmf->address);
2944 if (page) { 2932 if (page) {
2945 __SetPageLocked(page); 2933 __SetPageLocked(page);
2946 __SetPageSwapBacked(page); 2934 __SetPageSwapBacked(page);
@@ -2949,12 +2937,8 @@ int do_swap_page(struct vm_fault *vmf)
2949 swap_readpage(page, true); 2937 swap_readpage(page, true);
2950 } 2938 }
2951 } else { 2939 } else {
2952 if (vma_readahead) 2940 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
2953 page = do_swap_page_readahead(entry, 2941 vmf);
2954 GFP_HIGHUSER_MOVABLE, vmf, &swap_ra);
2955 else
2956 page = swapin_readahead(entry,
2957 GFP_HIGHUSER_MOVABLE, vma, vmf->address);
2958 swapcache = page; 2942 swapcache = page;
2959 } 2943 }
2960 2944
@@ -2982,7 +2966,6 @@ int do_swap_page(struct vm_fault *vmf)
2982 */ 2966 */
2983 ret = VM_FAULT_HWPOISON; 2967 ret = VM_FAULT_HWPOISON;
2984 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2968 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2985 swapcache = page;
2986 goto out_release; 2969 goto out_release;
2987 } 2970 }
2988 2971
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b2bd52ff7605..cc6dfa5832ca 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -250,7 +250,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
250 struct vmem_altmap *altmap, bool want_memblock) 250 struct vmem_altmap *altmap, bool want_memblock)
251{ 251{
252 int ret; 252 int ret;
253 int i;
254 253
255 if (pfn_valid(phys_start_pfn)) 254 if (pfn_valid(phys_start_pfn))
256 return -EEXIST; 255 return -EEXIST;
@@ -259,27 +258,10 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
259 if (ret < 0) 258 if (ret < 0)
260 return ret; 259 return ret;
261 260
262 /*
263 * Make all the pages reserved so that nobody will stumble over half
264 * initialized state.
265 * FIXME: We also have to associate it with a node because page_to_nid
266 * relies on having page with the proper node.
267 */
268 for (i = 0; i < PAGES_PER_SECTION; i++) {
269 unsigned long pfn = phys_start_pfn + i;
270 struct page *page;
271 if (!pfn_valid(pfn))
272 continue;
273
274 page = pfn_to_page(pfn);
275 set_page_node(page, nid);
276 SetPageReserved(page);
277 }
278
279 if (!want_memblock) 261 if (!want_memblock)
280 return 0; 262 return 0;
281 263
282 return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); 264 return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
283} 265}
284 266
285/* 267/*
@@ -559,6 +541,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
559 * @zone: zone from which pages need to be removed 541 * @zone: zone from which pages need to be removed
560 * @phys_start_pfn: starting pageframe (must be aligned to start of a section) 542 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
561 * @nr_pages: number of pages to remove (must be multiple of section size) 543 * @nr_pages: number of pages to remove (must be multiple of section size)
544 * @altmap: alternative device page map or %NULL if default memmap is used
562 * 545 *
563 * Generic helper function to remove section mappings and sysfs entries 546 * Generic helper function to remove section mappings and sysfs entries
564 * for the section of the memory we are removing. Caller needs to make 547 * for the section of the memory we are removing. Caller needs to make
@@ -908,8 +891,15 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
908 int nid; 891 int nid;
909 int ret; 892 int ret;
910 struct memory_notify arg; 893 struct memory_notify arg;
894 struct memory_block *mem;
895
896 /*
897 * We can't use pfn_to_nid() because nid might be stored in struct page
898 * which is not yet initialized. Instead, we find nid from memory block.
899 */
900 mem = find_memory_block(__pfn_to_section(pfn));
901 nid = mem->nid;
911 902
912 nid = pfn_to_nid(pfn);
913 /* associate pfn range with the zone */ 903 /* associate pfn range with the zone */
914 zone = move_pfn_range(online_type, nid, pfn, nr_pages); 904 zone = move_pfn_range(online_type, nid, pfn, nr_pages);
915 905
@@ -1055,6 +1045,7 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
1055 1045
1056/** 1046/**
1057 * try_online_node - online a node if offlined 1047 * try_online_node - online a node if offlined
1048 * @nid: the node ID
1058 * 1049 *
1059 * called by cpu_up() to online a node without onlined memory. 1050 * called by cpu_up() to online a node without onlined memory.
1060 */ 1051 */
@@ -1083,15 +1074,16 @@ out:
1083 1074
1084static int check_hotplug_memory_range(u64 start, u64 size) 1075static int check_hotplug_memory_range(u64 start, u64 size)
1085{ 1076{
1086 u64 start_pfn = PFN_DOWN(start); 1077 unsigned long block_sz = memory_block_size_bytes();
1078 u64 block_nr_pages = block_sz >> PAGE_SHIFT;
1087 u64 nr_pages = size >> PAGE_SHIFT; 1079 u64 nr_pages = size >> PAGE_SHIFT;
1080 u64 start_pfn = PFN_DOWN(start);
1088 1081
1089 /* Memory range must be aligned with section */ 1082 /* memory range must be block size aligned */
1090 if ((start_pfn & ~PAGE_SECTION_MASK) || 1083 if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
1091 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { 1084 !IS_ALIGNED(nr_pages, block_nr_pages)) {
1092 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", 1085 pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
1093 (unsigned long long)start, 1086 block_sz, start, size);
1094 (unsigned long long)size);
1095 return -EINVAL; 1087 return -EINVAL;
1096 } 1088 }
1097 1089
@@ -1814,6 +1806,7 @@ static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1814 1806
1815/** 1807/**
1816 * try_offline_node 1808 * try_offline_node
1809 * @nid: the node ID
1817 * 1810 *
1818 * Offline a node if all memory sections and cpus of the node are removed. 1811 * Offline a node if all memory sections and cpus of the node are removed.
1819 * 1812 *
@@ -1857,6 +1850,9 @@ EXPORT_SYMBOL(try_offline_node);
1857 1850
1858/** 1851/**
1859 * remove_memory 1852 * remove_memory
1853 * @nid: the node ID
1854 * @start: physical address of the region to remove
1855 * @size: size of the region to remove
1860 * 1856 *
1861 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug 1857 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1862 * and online/offline operations before this call, as required by 1858 * and online/offline operations before this call, as required by
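
The rewritten check_hotplug_memory_range() requires both the start pfn and the page count to be aligned to the memory block size rather than just the section size. A small, self-contained illustration of the arithmetic; the 128MB block and 4K page size are assumptions here, the real values come from memory_block_size_bytes() and PAGE_SHIFT:

#include <linux/kernel.h>	/* IS_ALIGNED() */
#include <linux/types.h>

/* Illustration only, not the kernel's helper. */
static bool example_hotplug_range_ok(u64 start, u64 size)
{
	unsigned long block_sz = 128UL << 20;	/* assumed 128MB block size  */
	u64 block_nr_pages = block_sz >> 12;	/* assumed 4K pages -> 32768 */
	u64 start_pfn = start >> 12;
	u64 nr_pages = size >> 12;

	return nr_pages && IS_ALIGNED(start_pfn, block_nr_pages) &&
	       IS_ALIGNED(nr_pages, block_nr_pages);
}

/*
 * example_hotplug_range_ok(256ULL << 20, 256ULL << 20) -> true
 * example_hotplug_range_ok(256ULL << 20,  64ULL << 20) -> false, the -EINVAL path
 */
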
diff --git a/mm/mmap.c b/mm/mmap.c
index aa0dc8231c0d..f2154fc2548b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3191,13 +3191,15 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
3191 if (rlimit(RLIMIT_DATA) == 0 && 3191 if (rlimit(RLIMIT_DATA) == 0 &&
3192 mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) 3192 mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
3193 return true; 3193 return true;
3194 if (!ignore_rlimit_data) { 3194
3195 pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n", 3195 pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
3196 current->comm, current->pid, 3196 current->comm, current->pid,
3197 (mm->data_vm + npages) << PAGE_SHIFT, 3197 (mm->data_vm + npages) << PAGE_SHIFT,
3198 rlimit(RLIMIT_DATA)); 3198 rlimit(RLIMIT_DATA),
3199 ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");
3200
3201 if (!ignore_rlimit_data)
3199 return false; 3202 return false;
3200 }
3201 } 3203 }
3202 3204
3203 return true; 3205 return true;
diff --git a/mm/nommu.c b/mm/nommu.c
index 4f8720243ae7..13723736d38f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -457,18 +457,6 @@ void __weak vmalloc_sync_all(void)
457{ 457{
458} 458}
459 459
460/**
461 * alloc_vm_area - allocate a range of kernel address space
462 * @size: size of the area
463 *
464 * Returns: NULL on failure, vm_struct on success
465 *
466 * This function reserves a range of kernel address space, and
467 * allocates pagetables to map that range. No actual mappings
468 * are created. If the kernel address space is not shared
469 * between processes, it syncs the pagetable across all
470 * processes.
471 */
472struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) 460struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
473{ 461{
474 BUG(); 462 BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f2e7dfb81eee..ff992fa8760a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -185,6 +185,8 @@ static bool is_dump_unreclaim_slabs(void)
185 * oom_badness - heuristic function to determine which candidate task to kill 185 * oom_badness - heuristic function to determine which candidate task to kill
186 * @p: task struct of which task we should calculate 186 * @p: task struct of which task we should calculate
187 * @totalpages: total present RAM allowed for page allocation 187 * @totalpages: total present RAM allowed for page allocation
188 * @memcg: task's memory controller, if constrained
189 * @nodemask: nodemask passed to page allocator for mempolicy ooms
188 * 190 *
189 * The heuristic for determining which task to kill is made to be as simple and 191 * The heuristic for determining which task to kill is made to be as simple and
190 * predictable as possible. The goal is to return the highest value for the 192 * predictable as possible. The goal is to return the highest value for the
@@ -224,13 +226,6 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
224 mm_pgtables_bytes(p->mm) / PAGE_SIZE; 226 mm_pgtables_bytes(p->mm) / PAGE_SIZE;
225 task_unlock(p); 227 task_unlock(p);
226 228
227 /*
228 * Root processes get 3% bonus, just like the __vm_enough_memory()
229 * implementation used by LSMs.
230 */
231 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
232 points -= (points * 3) / 100;
233
234 /* Normalize to oom_score_adj units */ 229 /* Normalize to oom_score_adj units */
235 adj *= totalpages / 1000; 230 adj *= totalpages / 1000;
236 points += adj; 231 points += adj;
@@ -595,7 +590,8 @@ static void oom_reap_task(struct task_struct *tsk)
595 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) 590 while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm))
596 schedule_timeout_idle(HZ/10); 591 schedule_timeout_idle(HZ/10);
597 592
598 if (attempts <= MAX_OOM_REAP_RETRIES) 593 if (attempts <= MAX_OOM_REAP_RETRIES ||
594 test_bit(MMF_OOM_SKIP, &mm->flags))
599 goto done; 595 goto done;
600 596
601 597
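
To put the removed CAP_SYS_ADMIN bonus in oom_badness() above into numbers: with the old "points -= (points * 3) / 100;", a root-owned task whose footprint scored 1,000,000 badness points was credited 30,000 points, so it could consume roughly 3% more memory than an identical unprivileged task before becoming the preferred OOM victim. With the bonus gone, equal usage now yields equal scores regardless of CAP_SYS_ADMIN.
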
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ea018263210..0b97b8ece4a9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -265,17 +265,19 @@ int min_free_kbytes = 1024;
265int user_min_free_kbytes = -1; 265int user_min_free_kbytes = -1;
266int watermark_scale_factor = 10; 266int watermark_scale_factor = 10;
267 267
268static unsigned long __meminitdata nr_kernel_pages; 268static unsigned long nr_kernel_pages __meminitdata;
269static unsigned long __meminitdata nr_all_pages; 269static unsigned long nr_all_pages __meminitdata;
270static unsigned long __meminitdata dma_reserve; 270static unsigned long dma_reserve __meminitdata;
271 271
272#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 272#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
273static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 273static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
274static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 274static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
275static unsigned long __initdata required_kernelcore; 275static unsigned long required_kernelcore __initdata;
276static unsigned long __initdata required_movablecore; 276static unsigned long required_kernelcore_percent __initdata;
277static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 277static unsigned long required_movablecore __initdata;
278static bool mirrored_kernelcore; 278static unsigned long required_movablecore_percent __initdata;
279static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
280static bool mirrored_kernelcore __meminitdata;
279 281
280/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 282/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
281int movable_zone; 283int movable_zone;
@@ -292,40 +294,6 @@ EXPORT_SYMBOL(nr_online_nodes);
292int page_group_by_mobility_disabled __read_mostly; 294int page_group_by_mobility_disabled __read_mostly;
293 295
294#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 296#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
295
296/*
297 * Determine how many pages need to be initialized during early boot
298 * (non-deferred initialization).
299 * The value of first_deferred_pfn will be set later, once non-deferred pages
300 * are initialized, but for now set it ULONG_MAX.
301 */
302static inline void reset_deferred_meminit(pg_data_t *pgdat)
303{
304 phys_addr_t start_addr, end_addr;
305 unsigned long max_pgcnt;
306 unsigned long reserved;
307
308 /*
309 * Initialise at least 2G of a node but also take into account that
310 * two large system hashes that can take up 1GB for 0.25TB/node.
311 */
312 max_pgcnt = max(2UL << (30 - PAGE_SHIFT),
313 (pgdat->node_spanned_pages >> 8));
314
315 /*
316 * Compensate the all the memblock reservations (e.g. crash kernel)
317 * from the initial estimation to make sure we will initialize enough
318 * memory to boot.
319 */
320 start_addr = PFN_PHYS(pgdat->node_start_pfn);
321 end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt);
322 reserved = memblock_reserved_memory_within(start_addr, end_addr);
323 max_pgcnt += PHYS_PFN(reserved);
324
325 pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages);
326 pgdat->first_deferred_pfn = ULONG_MAX;
327}
328
329/* Returns true if the struct page for the pfn is uninitialised */ 297/* Returns true if the struct page for the pfn is uninitialised */
330static inline bool __meminit early_page_uninitialised(unsigned long pfn) 298static inline bool __meminit early_page_uninitialised(unsigned long pfn)
331{ 299{
@@ -361,10 +329,6 @@ static inline bool update_defer_init(pg_data_t *pgdat,
361 return true; 329 return true;
362} 330}
363#else 331#else
364static inline void reset_deferred_meminit(pg_data_t *pgdat)
365{
366}
367
368static inline bool early_page_uninitialised(unsigned long pfn) 332static inline bool early_page_uninitialised(unsigned long pfn)
369{ 333{
370 return false; 334 return false;
@@ -1099,6 +1063,15 @@ static bool bulkfree_pcp_prepare(struct page *page)
1099} 1063}
1100#endif /* CONFIG_DEBUG_VM */ 1064#endif /* CONFIG_DEBUG_VM */
1101 1065
1066static inline void prefetch_buddy(struct page *page)
1067{
1068 unsigned long pfn = page_to_pfn(page);
1069 unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
1070 struct page *buddy = page + (buddy_pfn - pfn);
1071
1072 prefetch(buddy);
1073}
1074
1102/* 1075/*
1103 * Frees a number of pages from the PCP lists 1076 * Frees a number of pages from the PCP lists
1104 * Assumes all pages on list are in same zone, and of same order. 1077 * Assumes all pages on list are in same zone, and of same order.
@@ -1115,13 +1088,12 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1115{ 1088{
1116 int migratetype = 0; 1089 int migratetype = 0;
1117 int batch_free = 0; 1090 int batch_free = 0;
1091 int prefetch_nr = 0;
1118 bool isolated_pageblocks; 1092 bool isolated_pageblocks;
1119 1093 struct page *page, *tmp;
1120 spin_lock(&zone->lock); 1094 LIST_HEAD(head);
1121 isolated_pageblocks = has_isolate_pageblock(zone);
1122 1095
1123 while (count) { 1096 while (count) {
1124 struct page *page;
1125 struct list_head *list; 1097 struct list_head *list;
1126 1098
1127 /* 1099 /*
@@ -1143,26 +1115,48 @@ static void free_pcppages_bulk(struct zone *zone, int count,
1143 batch_free = count; 1115 batch_free = count;
1144 1116
1145 do { 1117 do {
1146 int mt; /* migratetype of the to-be-freed page */
1147
1148 page = list_last_entry(list, struct page, lru); 1118 page = list_last_entry(list, struct page, lru);
1149 /* must delete as __free_one_page list manipulates */ 1119 /* must delete to avoid corrupting pcp list */
1150 list_del(&page->lru); 1120 list_del(&page->lru);
1151 1121 pcp->count--;
1152 mt = get_pcppage_migratetype(page);
1153 /* MIGRATE_ISOLATE page should not go to pcplists */
1154 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1155 /* Pageblock could have been isolated meanwhile */
1156 if (unlikely(isolated_pageblocks))
1157 mt = get_pageblock_migratetype(page);
1158 1122
1159 if (bulkfree_pcp_prepare(page)) 1123 if (bulkfree_pcp_prepare(page))
1160 continue; 1124 continue;
1161 1125
1162 __free_one_page(page, page_to_pfn(page), zone, 0, mt); 1126 list_add_tail(&page->lru, &head);
1163 trace_mm_page_pcpu_drain(page, 0, mt); 1127
1128 /*
1129 * We are going to put the page back to the global
1130 * pool, prefetch its buddy to speed up later access
1131 * under zone->lock. It is believed the overhead of
1132 * an additional test and calculating buddy_pfn here
1133 * can be offset by reduced memory latency later. To
1134 * avoid excessive prefetching due to large count, only
1135 * prefetch buddy for the first pcp->batch nr of pages.
1136 */
1137 if (prefetch_nr++ < pcp->batch)
1138 prefetch_buddy(page);
1164 } while (--count && --batch_free && !list_empty(list)); 1139 } while (--count && --batch_free && !list_empty(list));
1165 } 1140 }
1141
1142 spin_lock(&zone->lock);
1143 isolated_pageblocks = has_isolate_pageblock(zone);
1144
1145 /*
1146 * Use safe version since after __free_one_page(),
1147 * page->lru.next will not point to original list.
1148 */
1149 list_for_each_entry_safe(page, tmp, &head, lru) {
1150 int mt = get_pcppage_migratetype(page);
1151 /* MIGRATE_ISOLATE page should not go to pcplists */
1152 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1153 /* Pageblock could have been isolated meanwhile */
1154 if (unlikely(isolated_pageblocks))
1155 mt = get_pageblock_migratetype(page);
1156
1157 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1158 trace_mm_page_pcpu_drain(page, 0, mt);
1159 }
1166 spin_unlock(&zone->lock); 1160 spin_unlock(&zone->lock);
1167} 1161}
1168 1162
@@ -1181,10 +1175,9 @@ static void free_one_page(struct zone *zone,
1181} 1175}
1182 1176
1183static void __meminit __init_single_page(struct page *page, unsigned long pfn, 1177static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1184 unsigned long zone, int nid, bool zero) 1178 unsigned long zone, int nid)
1185{ 1179{
1186 if (zero) 1180 mm_zero_struct_page(page);
1187 mm_zero_struct_page(page);
1188 set_page_links(page, zone, nid, pfn); 1181 set_page_links(page, zone, nid, pfn);
1189 init_page_count(page); 1182 init_page_count(page);
1190 page_mapcount_reset(page); 1183 page_mapcount_reset(page);
@@ -1198,12 +1191,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1198#endif 1191#endif
1199} 1192}
1200 1193
1201static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
1202 int nid, bool zero)
1203{
1204 return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero);
1205}
1206
1207#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT 1194#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1208static void __meminit init_reserved_page(unsigned long pfn) 1195static void __meminit init_reserved_page(unsigned long pfn)
1209{ 1196{
@@ -1222,7 +1209,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
1222 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) 1209 if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1223 break; 1210 break;
1224 } 1211 }
1225 __init_single_pfn(pfn, zid, nid, true); 1212 __init_single_page(pfn_to_page(pfn), pfn, zid, nid);
1226} 1213}
1227#else 1214#else
1228static inline void init_reserved_page(unsigned long pfn) 1215static inline void init_reserved_page(unsigned long pfn)
@@ -1506,7 +1493,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
1506 } else if (!(pfn & nr_pgmask)) { 1493 } else if (!(pfn & nr_pgmask)) {
1507 deferred_free_range(pfn - nr_free, nr_free); 1494 deferred_free_range(pfn - nr_free, nr_free);
1508 nr_free = 1; 1495 nr_free = 1;
1509 cond_resched(); 1496 touch_nmi_watchdog();
1510 } else { 1497 } else {
1511 nr_free++; 1498 nr_free++;
1512 } 1499 }
@@ -1535,11 +1522,11 @@ static unsigned long __init deferred_init_pages(int nid, int zid,
1535 continue; 1522 continue;
1536 } else if (!page || !(pfn & nr_pgmask)) { 1523 } else if (!page || !(pfn & nr_pgmask)) {
1537 page = pfn_to_page(pfn); 1524 page = pfn_to_page(pfn);
1538 cond_resched(); 1525 touch_nmi_watchdog();
1539 } else { 1526 } else {
1540 page++; 1527 page++;
1541 } 1528 }
1542 __init_single_page(page, pfn, zid, nid, true); 1529 __init_single_page(page, pfn, zid, nid);
1543 nr_pages++; 1530 nr_pages++;
1544 } 1531 }
1545 return (nr_pages); 1532 return (nr_pages);
@@ -1552,23 +1539,25 @@ static int __init deferred_init_memmap(void *data)
 	int nid = pgdat->node_id;
 	unsigned long start = jiffies;
 	unsigned long nr_pages = 0;
-	unsigned long spfn, epfn;
+	unsigned long spfn, epfn, first_init_pfn, flags;
 	phys_addr_t spa, epa;
 	int zid;
 	struct zone *zone;
-	unsigned long first_init_pfn = pgdat->first_deferred_pfn;
 	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 	u64 i;
 
+	/* Bind memory initialisation thread to a local node if possible */
+	if (!cpumask_empty(cpumask))
+		set_cpus_allowed_ptr(current, cpumask);
+
+	pgdat_resize_lock(pgdat, &flags);
+	first_init_pfn = pgdat->first_deferred_pfn;
 	if (first_init_pfn == ULONG_MAX) {
+		pgdat_resize_unlock(pgdat, &flags);
 		pgdat_init_report_one_done();
 		return 0;
 	}
 
-	/* Bind memory initialisation thread to a local node if possible */
-	if (!cpumask_empty(cpumask))
-		set_cpus_allowed_ptr(current, cpumask);
-
 	/* Sanity check boundaries */
 	BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
 	BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
@@ -1598,6 +1587,7 @@ static int __init deferred_init_memmap(void *data)
 		epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
 		deferred_free_pages(nid, zid, spfn, epfn);
 	}
+	pgdat_resize_unlock(pgdat, &flags);
 
 	/* Sanity check that the next zone really is unpopulated */
 	WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
@@ -1608,6 +1598,117 @@ static int __init deferred_init_memmap(void *data)
1608 pgdat_init_report_one_done(); 1598 pgdat_init_report_one_done();
1609 return 0; 1599 return 0;
1610} 1600}
1601
1602/*
1603 * During boot we initialize deferred pages on-demand, as needed, but once
1604 * page_alloc_init_late() has finished, the deferred pages are all initialized,
1605 * and we can permanently disable that path.
1606 */
1607static DEFINE_STATIC_KEY_TRUE(deferred_pages);
1608
1609/*
1610 * If this zone has deferred pages, try to grow it by initializing enough
1611 * deferred pages to satisfy the allocation specified by order, rounded up to
1612 * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
1613 * of SECTION_SIZE bytes by initializing struct pages in increments of
1614 * PAGES_PER_SECTION * sizeof(struct page) bytes.
1615 *
1616 * Return true when zone was grown, otherwise return false. We return true even
1617 * when we grow less than requested, to let the caller decide if there are
1618 * enough pages to satisfy the allocation.
1619 *
1620 * Note: We use noinline because this function is needed only during boot, and
1621 * it is called from a __ref function _deferred_grow_zone. This way we are
1622 * making sure that it is not inlined into permanent text section.
1623 */
1624static noinline bool __init
1625deferred_grow_zone(struct zone *zone, unsigned int order)
1626{
1627 int zid = zone_idx(zone);
1628 int nid = zone_to_nid(zone);
1629 pg_data_t *pgdat = NODE_DATA(nid);
1630 unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
1631 unsigned long nr_pages = 0;
1632 unsigned long first_init_pfn, spfn, epfn, t, flags;
1633 unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
1634 phys_addr_t spa, epa;
1635 u64 i;
1636
1637 /* Only the last zone may have deferred pages */
1638 if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
1639 return false;
1640
1641 pgdat_resize_lock(pgdat, &flags);
1642
1643 /*
1644 * If deferred pages have been initialized while we were waiting for
1645 * the lock, return true, as the zone was grown. The caller will retry
1646 * this zone. We won't return to this function since the caller also
1647 * has this static branch.
1648 */
1649 if (!static_branch_unlikely(&deferred_pages)) {
1650 pgdat_resize_unlock(pgdat, &flags);
1651 return true;
1652 }
1653
1654 /*
1655 * If someone grew this zone while we were waiting for spinlock, return
1656 * true, as there might be enough pages already.
1657 */
1658 if (first_deferred_pfn != pgdat->first_deferred_pfn) {
1659 pgdat_resize_unlock(pgdat, &flags);
1660 return true;
1661 }
1662
1663 first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
1664
1665 if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
1666 pgdat_resize_unlock(pgdat, &flags);
1667 return false;
1668 }
1669
1670 for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1671 spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1672 epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
1673
1674 while (spfn < epfn && nr_pages < nr_pages_needed) {
1675 t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
1676 first_deferred_pfn = min(t, epfn);
1677 nr_pages += deferred_init_pages(nid, zid, spfn,
1678 first_deferred_pfn);
1679 spfn = first_deferred_pfn;
1680 }
1681
1682 if (nr_pages >= nr_pages_needed)
1683 break;
1684 }
1685
1686 for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
1687 spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
1688 epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
1689 deferred_free_pages(nid, zid, spfn, epfn);
1690
1691 if (first_deferred_pfn == epfn)
1692 break;
1693 }
1694 pgdat->first_deferred_pfn = first_deferred_pfn;
1695 pgdat_resize_unlock(pgdat, &flags);
1696
1697 return nr_pages > 0;
1698}
1699
1700/*
1701 * deferred_grow_zone() is __init, but it is called from
1702 * get_page_from_freelist() during early boot until deferred_pages permanently
1703 * disables this call. This is why we have refdata wrapper to avoid warning,
1704 * and to ensure that the function body gets unloaded.
1705 */
1706static bool __ref
1707_deferred_grow_zone(struct zone *zone, unsigned int order)
1708{
1709 return deferred_grow_zone(zone, order);
1710}
1711
1611#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 1712#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1612 1713
1613void __init page_alloc_init_late(void) 1714void __init page_alloc_init_late(void)
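The growth step used by deferred_grow_zone() above is nothing more than the request rounded up to whole memory sections. A minimal standalone sketch of that arithmetic, assuming 4 KiB pages and PAGES_PER_SECTION == 32768 (an x86_64 value; the real constant is arch- and config-dependent), with ALIGN re-derived locally rather than taken from kernel headers:

#include <stdio.h>

#define PAGES_PER_SECTION 32768UL			/* assumed: 128 MiB sections of 4 KiB pages */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))	/* same rounding as the kernel macro */

int main(void)
{
	for (unsigned int order = 0; order <= 10; order += 5) {
		unsigned long want = 1UL << order;
		unsigned long grow = ALIGN(want, PAGES_PER_SECTION);

		printf("order %2u: %4lu pages requested -> zone grown by %lu pages\n",
		       order, want, grow);
	}
	return 0;
}

Under these assumptions every request below one section grows the zone by exactly one section (32768 pages, 128 MiB), which is why the comment talks about SECTION_SIZE increments.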
@@ -1626,6 +1727,12 @@ void __init page_alloc_init_late(void)
1626 /* Block until all are initialised */ 1727 /* Block until all are initialised */
1627 wait_for_completion(&pgdat_init_all_done_comp); 1728 wait_for_completion(&pgdat_init_all_done_comp);
1628 1729
1730 /*
1731 * We initialized the rest of the deferred pages. Permanently disable
1732 * on-demand struct page initialization.
1733 */
1734 static_branch_disable(&deferred_pages);
1735
1629 /* Reinit limits that are based on free pages after the kernel is up */ 1736 /* Reinit limits that are based on free pages after the kernel is up */
1630 files_maxfiles_init(); 1737 files_maxfiles_init();
1631#endif 1738#endif
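For readers unfamiliar with static keys: the deferred_pages key above behaves like a boolean that starts true and is cleared exactly once, except that disabling it patches the test out of the generated code. A userspace analogue with a plain flag, purely illustrative and not the jump-label mechanism itself:

#include <stdbool.h>
#include <stdio.h>

static bool deferred_pages = true;		/* stands in for the static key */

static bool grow_zone_if_deferred(void)
{
	if (deferred_pages) {			/* static_branch_unlikely() in the kernel */
		printf("watermark miss: growing zone from deferred pages\n");
		return true;
	}
	return false;				/* steady state: do nothing extra */
}

int main(void)
{
	grow_zone_if_deferred();		/* early boot allocation */
	deferred_pages = false;			/* page_alloc_init_late() finished */
	grow_zone_if_deferred();		/* from now on the check is inert */
	return 0;
}

In the kernel version, once static_branch_disable() has run, the static_branch_unlikely(&deferred_pages) checks added to get_page_from_freelist() cost a patched-out jump rather than a load and compare.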
@@ -2418,10 +2525,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	local_irq_save(flags);
 	batch = READ_ONCE(pcp->batch);
 	to_drain = min(pcp->count, batch);
-	if (to_drain > 0) {
+	if (to_drain > 0)
 		free_pcppages_bulk(zone, to_drain, pcp);
-		pcp->count -= to_drain;
-	}
 	local_irq_restore(flags);
 }
 #endif
@@ -2443,10 +2548,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 	pset = per_cpu_ptr(zone->pageset, cpu);
 
 	pcp = &pset->pcp;
-	if (pcp->count) {
+	if (pcp->count)
 		free_pcppages_bulk(zone, pcp->count, pcp);
-		pcp->count = 0;
-	}
 	local_irq_restore(flags);
 }
 
@@ -2670,7 +2773,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
 	if (pcp->count >= pcp->high) {
 		unsigned long batch = READ_ONCE(pcp->batch);
 		free_pcppages_bulk(zone, batch, pcp);
-		pcp->count -= batch;
 	}
 }
 
@@ -3205,6 +3307,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3205 ac_classzone_idx(ac), alloc_flags)) { 3307 ac_classzone_idx(ac), alloc_flags)) {
3206 int ret; 3308 int ret;
3207 3309
3310#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3311 /*
3312 * Watermark failed for this zone, but see if we can
3313 * grow this zone if it contains deferred pages.
3314 */
3315 if (static_branch_unlikely(&deferred_pages)) {
3316 if (_deferred_grow_zone(zone, order))
3317 goto try_this_zone;
3318 }
3319#endif
3208 /* Checked here to keep the fast path fast */ 3320 /* Checked here to keep the fast path fast */
3209 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 3321 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3210 if (alloc_flags & ALLOC_NO_WATERMARKS) 3322 if (alloc_flags & ALLOC_NO_WATERMARKS)
@@ -3246,6 +3358,14 @@ try_this_zone:
3246 reserve_highatomic_pageblock(page, zone, order); 3358 reserve_highatomic_pageblock(page, zone, order);
3247 3359
3248 return page; 3360 return page;
3361 } else {
3362#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
3363 /* Try again if zone has deferred pages */
3364 if (static_branch_unlikely(&deferred_pages)) {
3365 if (_deferred_grow_zone(zone, order))
3366 goto try_this_zone;
3367 }
3368#endif
3249 } 3369 }
3250 } 3370 }
3251 3371
@@ -3685,16 +3805,18 @@ retry:
 	return page;
 }
 
-static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
+static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+			     const struct alloc_context *ac)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	pg_data_t *last_pgdat = NULL;
+	enum zone_type high_zoneidx = ac->high_zoneidx;
 
-	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
-					ac->high_zoneidx, ac->nodemask) {
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+					ac->nodemask) {
 		if (last_pgdat != zone->zone_pgdat)
-			wakeup_kswapd(zone, order, ac->high_zoneidx);
+			wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
 		last_pgdat = zone->zone_pgdat;
 	}
 }
@@ -3973,7 +4095,7 @@ retry_cpuset:
 		goto nopage;
 
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-		wake_all_kswapds(order, ac);
+		wake_all_kswapds(order, gfp_mask, ac);
 
 	/*
 	 * The adjusted alloc_flags might result in immediate success, so try
@@ -4031,7 +4153,7 @@ retry_cpuset:
 retry:
 	/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
-		wake_all_kswapds(order, ac);
+		wake_all_kswapds(order, gfp_mask, ac);
 
 	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
 	if (reserve_flags)
@@ -5334,6 +5456,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5334 pg_data_t *pgdat = NODE_DATA(nid); 5456 pg_data_t *pgdat = NODE_DATA(nid);
5335 unsigned long pfn; 5457 unsigned long pfn;
5336 unsigned long nr_initialised = 0; 5458 unsigned long nr_initialised = 0;
5459 struct page *page;
5337#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5460#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5338 struct memblock_region *r = NULL, *tmp; 5461 struct memblock_region *r = NULL, *tmp;
5339#endif 5462#endif
@@ -5386,6 +5509,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5386#endif 5509#endif
5387 5510
5388not_early: 5511not_early:
5512 page = pfn_to_page(pfn);
5513 __init_single_page(page, pfn, zone, nid);
5514 if (context == MEMMAP_HOTPLUG)
5515 SetPageReserved(page);
5516
5389 /* 5517 /*
5390 * Mark the block movable so that blocks are reserved for 5518 * Mark the block movable so that blocks are reserved for
5391 * movable at startup. This will force kernel allocations 5519 * movable at startup. This will force kernel allocations
@@ -5402,15 +5530,8 @@ not_early:
5402 * because this is done early in sparse_add_one_section 5530 * because this is done early in sparse_add_one_section
5403 */ 5531 */
5404 if (!(pfn & (pageblock_nr_pages - 1))) { 5532 if (!(pfn & (pageblock_nr_pages - 1))) {
5405 struct page *page = pfn_to_page(pfn);
5406
5407 __init_single_page(page, pfn, zone, nid,
5408 context != MEMMAP_HOTPLUG);
5409 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 5533 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5410 cond_resched(); 5534 cond_resched();
5411 } else {
5412 __init_single_pfn(pfn, zone, nid,
5413 context != MEMMAP_HOTPLUG);
5414 } 5535 }
5415 } 5536 }
5416} 5537}
@@ -6241,7 +6362,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6241 6362
6242 alloc_node_mem_map(pgdat); 6363 alloc_node_mem_map(pgdat);
6243 6364
6244 reset_deferred_meminit(pgdat); 6365#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
6366 /*
6367 * We start only with one section of pages, more pages are added as
6368 * needed until the rest of deferred pages are initialized.
6369 */
6370 pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
6371 pgdat->node_spanned_pages);
6372 pgdat->first_deferred_pfn = ULONG_MAX;
6373#endif
6245 free_area_init_core(pgdat); 6374 free_area_init_core(pgdat);
6246} 6375}
6247 6376
@@ -6471,7 +6600,18 @@ static void __init find_zone_movable_pfns_for_nodes(void)
6471 } 6600 }
6472 6601
6473 /* 6602 /*
6474 * If movablecore=nn[KMG] was specified, calculate what size of 6603 * If kernelcore=nn% or movablecore=nn% was specified, calculate the
6604 * amount of necessary memory.
6605 */
6606 if (required_kernelcore_percent)
6607 required_kernelcore = (totalpages * 100 * required_kernelcore_percent) /
6608 10000UL;
6609 if (required_movablecore_percent)
6610 required_movablecore = (totalpages * 100 * required_movablecore_percent) /
6611 10000UL;
6612
6613 /*
6614 * If movablecore= was specified, calculate what size of
6475 * kernelcore that corresponds so that memory usable for 6615 * kernelcore that corresponds so that memory usable for
6476 * any allocation type is evenly spread. If both kernelcore 6616 * any allocation type is evenly spread. If both kernelcore
6477 * and movablecore are specified, then the value of kernelcore 6617 * and movablecore are specified, then the value of kernelcore
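The percentage handling added above reduces to one line of integer arithmetic. A standalone sketch with made-up inputs (1 Mi pages of 4 KiB, i.e. 4 GiB, and kernelcore=30%), mirroring the expression from the hunk:

#include <stdio.h>

int main(void)
{
	unsigned long totalpages = 1UL << 20;	/* 4 GiB of 4 KiB pages (made up) */
	unsigned long percent = 30;		/* kernelcore=30% */
	unsigned long required_kernelcore =
		(totalpages * 100 * percent) / 10000UL;	/* expression from the hunk */

	printf("%lu of %lu pages reserved for kernelcore\n",
	       required_kernelcore, totalpages);
	return 0;
}

This prints 314572 pages, roughly 1.2 GiB reserved for kernelcore under those assumed inputs.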
@@ -6711,18 +6851,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6711 zero_resv_unavail(); 6851 zero_resv_unavail();
6712} 6852}
6713 6853
6714static int __init cmdline_parse_core(char *p, unsigned long *core) 6854static int __init cmdline_parse_core(char *p, unsigned long *core,
6855 unsigned long *percent)
6715{ 6856{
6716 unsigned long long coremem; 6857 unsigned long long coremem;
6858 char *endptr;
6859
6717 if (!p) 6860 if (!p)
6718 return -EINVAL; 6861 return -EINVAL;
6719 6862
6720 coremem = memparse(p, &p); 6863 /* Value may be a percentage of total memory, otherwise bytes */
6721 *core = coremem >> PAGE_SHIFT; 6864 coremem = simple_strtoull(p, &endptr, 0);
6865 if (*endptr == '%') {
6866 /* Paranoid check for percent values greater than 100 */
6867 WARN_ON(coremem > 100);
6722 6868
6723 /* Paranoid check that UL is enough for the coremem value */ 6869 *percent = coremem;
6724 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); 6870 } else {
6871 coremem = memparse(p, &p);
6872 /* Paranoid check that UL is enough for the coremem value */
6873 WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
6725 6874
6875 *core = coremem >> PAGE_SHIFT;
6876 *percent = 0UL;
6877 }
6726 return 0; 6878 return 0;
6727} 6879}
6728 6880
@@ -6738,7 +6890,8 @@ static int __init cmdline_parse_kernelcore(char *p)
6738 return 0; 6890 return 0;
6739 } 6891 }
6740 6892
6741 return cmdline_parse_core(p, &required_kernelcore); 6893 return cmdline_parse_core(p, &required_kernelcore,
6894 &required_kernelcore_percent);
6742} 6895}
6743 6896
6744/* 6897/*
@@ -6747,7 +6900,8 @@ static int __init cmdline_parse_kernelcore(char *p)
6747 */ 6900 */
6748static int __init cmdline_parse_movablecore(char *p) 6901static int __init cmdline_parse_movablecore(char *p)
6749{ 6902{
6750 return cmdline_parse_core(p, &required_movablecore); 6903 return cmdline_parse_core(p, &required_movablecore,
6904 &required_movablecore_percent);
6751} 6905}
6752 6906
6753early_param("kernelcore", cmdline_parse_kernelcore); 6907early_param("kernelcore", cmdline_parse_kernelcore);
@@ -7591,7 +7745,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
7591 cc->nr_migratepages -= nr_reclaimed; 7745 cc->nr_migratepages -= nr_reclaimed;
7592 7746
7593 ret = migrate_pages(&cc->migratepages, alloc_migrate_target, 7747 ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
7594 NULL, 0, cc->mode, MR_CMA); 7748 NULL, 0, cc->mode, MR_CONTIG_RANGE);
7595 } 7749 }
7596 if (ret < 0) { 7750 if (ret < 0) {
7597 putback_movable_pages(&cc->migratepages); 7751 putback_movable_pages(&cc->migratepages);
@@ -7611,11 +7765,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
7611 * @gfp_mask: GFP mask to use during compaction 7765 * @gfp_mask: GFP mask to use during compaction
7612 * 7766 *
7613 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES 7767 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
7614 * aligned, however it's the caller's responsibility to guarantee that 7768 * aligned. The PFN range must belong to a single zone.
7615 * we are the only thread that changes migrate type of pageblocks the
7616 * pages fall in.
7617 * 7769 *
7618 * The PFN range must belong to a single zone. 7770 * The first thing this routine does is attempt to MIGRATE_ISOLATE all
7771 * pageblocks in the range. Once isolated, the pageblocks should not
7772 * be modified by others.
7619 * 7773 *
7620 * Returns zero on success or negative error code. On success all 7774 * Returns zero on success or negative error code. On success all
7621 * pages which PFN is in [start, end) are allocated for the caller and 7775 * pages which PFN is in [start, end) are allocated for the caller and
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 0a49374e6931..e412a63b2b74 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -65,11 +65,15 @@ static bool page_idle_clear_pte_refs_one(struct page *page,
65 while (page_vma_mapped_walk(&pvmw)) { 65 while (page_vma_mapped_walk(&pvmw)) {
66 addr = pvmw.address; 66 addr = pvmw.address;
67 if (pvmw.pte) { 67 if (pvmw.pte) {
68 referenced = ptep_clear_young_notify(vma, addr, 68 /*
69 pvmw.pte); 69 * For PTE-mapped THP, one sub page is referenced,
70 * the whole THP is referenced.
71 */
72 if (ptep_clear_young_notify(vma, addr, pvmw.pte))
73 referenced = true;
70 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { 74 } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
71 referenced = pmdp_clear_young_notify(vma, addr, 75 if (pmdp_clear_young_notify(vma, addr, pvmw.pmd))
72 pvmw.pmd); 76 referenced = true;
73 } else { 77 } else {
74 /* unexpected pmd-mapped page? */ 78 /* unexpected pmd-mapped page? */
75 WARN_ON_ONCE(1); 79 WARN_ON_ONCE(1);
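The page_idle change above is the classic difference between assigning a flag on every loop iteration and latching it. A toy standalone program showing why the old assignment lost earlier hits when several PTEs map one THP:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	/* pretend three PTEs map one THP; only the first subpage is young */
	bool young[] = { true, false, false };
	bool overwrite = false, accumulate = false;

	for (int i = 0; i < 3; i++) {
		overwrite = young[i];		/* old pattern: last result wins */
		if (young[i])			/* new pattern: any hit sticks */
			accumulate = true;
	}
	printf("overwrite=%d accumulate=%d\n", overwrite, accumulate);
	return 0;
}

The output is overwrite=0 accumulate=1: only the OR-style accumulation remembers that one subpage was referenced, which is what the new comment means by "the whole THP is referenced".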
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 165ed8117bd1..61dee77bb211 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -28,6 +28,14 @@ static int set_migratetype_isolate(struct page *page, int migratetype,
28 28
29 spin_lock_irqsave(&zone->lock, flags); 29 spin_lock_irqsave(&zone->lock, flags);
30 30
31 /*
32 * We assume the caller intended to SET migrate type to isolate.
33 * If it is already set, then someone else must have raced and
34 * set it before us. Return -EBUSY
35 */
36 if (is_migrate_isolate_page(page))
37 goto out;
38
31 pfn = page_to_pfn(page); 39 pfn = page_to_pfn(page);
32 arg.start_pfn = pfn; 40 arg.start_pfn = pfn;
33 arg.nr_pages = pageblock_nr_pages; 41 arg.nr_pages = pageblock_nr_pages;
@@ -166,7 +174,15 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
166 * future will not be allocated again. 174 * future will not be allocated again.
167 * 175 *
168 * start_pfn/end_pfn must be aligned to pageblock_order. 176 * start_pfn/end_pfn must be aligned to pageblock_order.
169 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 177 * Return 0 on success and -EBUSY if any part of range cannot be isolated.
178 *
179 * There is no high level synchronization mechanism that prevents two threads
180 * from trying to isolate overlapping ranges. If this happens, one thread
181 * will notice pageblocks in the overlapping range already set to isolate.
182 * This happens in set_migratetype_isolate, and set_migratetype_isolate
183 * returns an error. We then clean up by restoring the migration type on
184 * pageblocks we may have modified and return -EBUSY to caller. This
185 * prevents two threads from simultaneously working on overlapping ranges.
170 */ 186 */
171int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 187int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
172 unsigned migratetype, bool skip_hwpoisoned_pages) 188 unsigned migratetype, bool skip_hwpoisoned_pages)
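The comment added above describes a back-off protocol rather than a lock-free trick. A toy single-process model of it (plain C with a pthread mutex, not kernel code; block and range sizes are made up):

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int isolated[4];				/* four pretend pageblocks */

static int set_isolate(int blk)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	if (isolated[blk])
		ret = -EBUSY;			/* another range got here first */
	else
		isolated[blk] = 1;
	pthread_mutex_unlock(&lock);
	return ret;
}

static void unset_isolate(int blk)
{
	pthread_mutex_lock(&lock);
	isolated[blk] = 0;
	pthread_mutex_unlock(&lock);
}

static int isolate_range(int start, int end)
{
	for (int i = start; i < end; i++) {
		if (set_isolate(i)) {
			while (--i >= start)	/* roll back our own blocks */
				unset_isolate(i);
			return -EBUSY;
		}
	}
	return 0;
}

int main(void)
{
	printf("first caller : %d\n", isolate_range(0, 3));
	printf("second caller: %d\n", isolate_range(2, 4));	/* overlaps -> -EBUSY */
	return 0;
}

The first caller isolates blocks 0-2 and returns 0; the second, overlapping caller sees block 2 already isolated, rolls back anything it had set, and returns -EBUSY, which is the behaviour the new start_isolate_page_range() comment documents.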
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 7172e0a80e13..75d21a2259b3 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -35,7 +35,7 @@ static depot_stack_handle_t early_handle;
35 35
36static void init_early_allocated_pages(void); 36static void init_early_allocated_pages(void);
37 37
38static int early_page_owner_param(char *buf) 38static int __init early_page_owner_param(char *buf)
39{ 39{
40 if (!buf) 40 if (!buf)
41 return -EINVAL; 41 return -EINVAL;
diff --git a/mm/page_poison.c b/mm/page_poison.c
index e83fd44867de..aa2b3d34e8ea 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -9,7 +9,7 @@
9 9
10static bool want_page_poisoning __read_mostly; 10static bool want_page_poisoning __read_mostly;
11 11
12static int early_page_poison_param(char *buf) 12static int __init early_page_poison_param(char *buf)
13{ 13{
14 if (!buf) 14 if (!buf)
15 return -EINVAL; 15 return -EINVAL;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8d2da5dec1e0..c3084ff2569d 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -258,6 +258,9 @@ static int __walk_page_range(unsigned long start, unsigned long end,
258 258
259/** 259/**
260 * walk_page_range - walk page table with caller specific callbacks 260 * walk_page_range - walk page table with caller specific callbacks
261 * @start: start address of the virtual address range
262 * @end: end address of the virtual address range
263 * @walk: mm_walk structure defining the callbacks and the target address space
261 * 264 *
262 * Recursively walk the page table tree of the process represented by @walk->mm 265 * Recursively walk the page table tree of the process represented by @walk->mm
263 * within the virtual address range [@start, @end). During walking, we can do 266 * within the virtual address range [@start, @end). During walking, we can do
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 7a58460bfd27..063ff60ecd90 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -223,18 +223,7 @@ alloc_buffer:
223 223
224 return 0; 224 return 0;
225} 225}
226 226DEFINE_SHOW_ATTRIBUTE(percpu_stats);
227static int percpu_stats_open(struct inode *inode, struct file *filp)
228{
229 return single_open(filp, percpu_stats_show, NULL);
230}
231
232static const struct file_operations percpu_stats_fops = {
233 .open = percpu_stats_open,
234 .read = seq_read,
235 .llseek = seq_lseek,
236 .release = single_release,
237};
238 227
239static int __init init_percpu_stats_debugfs(void) 228static int __init init_percpu_stats_debugfs(void)
240{ 229{
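DEFINE_SHOW_ATTRIBUTE(percpu_stats) regenerates essentially what the deleted lines contained. Roughly what the macro expands to, paraphrased from include/linux/seq_file.h of this era and trimmed (the header is authoritative; this sketch is for orientation only and is not standalone-compilable):

static int percpu_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, percpu_stats_show, inode->i_private);
}

static const struct file_operations percpu_stats_fops = {
	.owner   = THIS_MODULE,
	.open    = percpu_stats_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};

So the conversion is purely a de-duplication: the same single_open()/seq_read() plumbing, now stamped out by one macro per debugfs "show" file.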
diff --git a/mm/rmap.c b/mm/rmap.c
index 144c66e688a9..9122787c4947 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1171,6 +1171,7 @@ void page_add_new_anon_rmap(struct page *page,
1171/** 1171/**
1172 * page_add_file_rmap - add pte mapping to a file page 1172 * page_add_file_rmap - add pte mapping to a file page
1173 * @page: the page to add the mapping to 1173 * @page: the page to add the mapping to
1174 * @compound: charge the page as compound or small page
1174 * 1175 *
1175 * The caller needs to hold the pte lock. 1176 * The caller needs to hold the pte lock.
1176 */ 1177 */
diff --git a/mm/shmem.c b/mm/shmem.c
index b85919243399..4424fc0c33aa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1422,9 +1422,12 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1422{ 1422{
1423 struct vm_area_struct pvma; 1423 struct vm_area_struct pvma;
1424 struct page *page; 1424 struct page *page;
1425 struct vm_fault vmf;
1425 1426
1426 shmem_pseudo_vma_init(&pvma, info, index); 1427 shmem_pseudo_vma_init(&pvma, info, index);
1427 page = swapin_readahead(swap, gfp, &pvma, 0); 1428 vmf.vma = &pvma;
1429 vmf.address = 0;
1430 page = swap_cluster_readahead(swap, gfp, &vmf);
1428 shmem_pseudo_vma_destroy(&pvma); 1431 shmem_pseudo_vma_destroy(&pvma);
1429 1432
1430 return page; 1433 return page;
diff --git a/mm/slab.c b/mm/slab.c
index 9095c3945425..e3a9b8e23306 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1869,7 +1869,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
1869 return 0; 1869 return 0;
1870} 1870}
1871 1871
1872slab_flags_t kmem_cache_flags(unsigned long object_size, 1872slab_flags_t kmem_cache_flags(unsigned int object_size,
1873 slab_flags_t flags, const char *name, 1873 slab_flags_t flags, const char *name,
1874 void (*ctor)(void *)) 1874 void (*ctor)(void *))
1875{ 1875{
@@ -1877,7 +1877,7 @@ slab_flags_t kmem_cache_flags(unsigned long object_size,
1877} 1877}
1878 1878
1879struct kmem_cache * 1879struct kmem_cache *
1880__kmem_cache_alias(const char *name, size_t size, size_t align, 1880__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
1881 slab_flags_t flags, void (*ctor)(void *)) 1881 slab_flags_t flags, void (*ctor)(void *))
1882{ 1882{
1883 struct kmem_cache *cachep; 1883 struct kmem_cache *cachep;
@@ -1994,7 +1994,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
1994 size_t ralign = BYTES_PER_WORD; 1994 size_t ralign = BYTES_PER_WORD;
1995 gfp_t gfp; 1995 gfp_t gfp;
1996 int err; 1996 int err;
1997 size_t size = cachep->size; 1997 unsigned int size = cachep->size;
1998 1998
1999#if DEBUG 1999#if DEBUG
2000#if FORCED_DEBUG 2000#if FORCED_DEBUG
@@ -2291,6 +2291,18 @@ out:
2291 return nr_freed; 2291 return nr_freed;
2292} 2292}
2293 2293
2294bool __kmem_cache_empty(struct kmem_cache *s)
2295{
2296 int node;
2297 struct kmem_cache_node *n;
2298
2299 for_each_kmem_cache_node(s, node, n)
2300 if (!list_empty(&n->slabs_full) ||
2301 !list_empty(&n->slabs_partial))
2302 return false;
2303 return true;
2304}
2305
2294int __kmem_cache_shrink(struct kmem_cache *cachep) 2306int __kmem_cache_shrink(struct kmem_cache *cachep)
2295{ 2307{
2296 int ret = 0; 2308 int ret = 0;
diff --git a/mm/slab.h b/mm/slab.h
index 51813236e773..68bdf498da3b 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -22,8 +22,8 @@ struct kmem_cache {
22 unsigned int size; /* The aligned/padded/added on size */ 22 unsigned int size; /* The aligned/padded/added on size */
23 unsigned int align; /* Alignment as calculated */ 23 unsigned int align; /* Alignment as calculated */
24 slab_flags_t flags; /* Active flags on the slab */ 24 slab_flags_t flags; /* Active flags on the slab */
25 size_t useroffset; /* Usercopy region offset */ 25 unsigned int useroffset;/* Usercopy region offset */
26 size_t usersize; /* Usercopy region size */ 26 unsigned int usersize; /* Usercopy region size */
27 const char *name; /* Slab name for sysfs */ 27 const char *name; /* Slab name for sysfs */
28 int refcount; /* Use counter */ 28 int refcount; /* Use counter */
29 void (*ctor)(void *); /* Called on object slot creation */ 29 void (*ctor)(void *); /* Called on object slot creation */
@@ -77,7 +77,7 @@ extern struct kmem_cache *kmem_cache;
77/* A table of kmalloc cache names and sizes */ 77/* A table of kmalloc cache names and sizes */
78extern const struct kmalloc_info_struct { 78extern const struct kmalloc_info_struct {
79 const char *name; 79 const char *name;
80 unsigned long size; 80 unsigned int size;
81} kmalloc_info[]; 81} kmalloc_info[];
82 82
83#ifndef CONFIG_SLOB 83#ifndef CONFIG_SLOB
@@ -93,31 +93,31 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t);
93/* Functions provided by the slab allocators */ 93/* Functions provided by the slab allocators */
94int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); 94int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
95 95
96extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, 96struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size,
97 slab_flags_t flags, size_t useroffset, 97 slab_flags_t flags, unsigned int useroffset,
98 size_t usersize); 98 unsigned int usersize);
99extern void create_boot_cache(struct kmem_cache *, const char *name, 99extern void create_boot_cache(struct kmem_cache *, const char *name,
100 size_t size, slab_flags_t flags, size_t useroffset, 100 unsigned int size, slab_flags_t flags,
101 size_t usersize); 101 unsigned int useroffset, unsigned int usersize);
102 102
103int slab_unmergeable(struct kmem_cache *s); 103int slab_unmergeable(struct kmem_cache *s);
104struct kmem_cache *find_mergeable(size_t size, size_t align, 104struct kmem_cache *find_mergeable(unsigned size, unsigned align,
105 slab_flags_t flags, const char *name, void (*ctor)(void *)); 105 slab_flags_t flags, const char *name, void (*ctor)(void *));
106#ifndef CONFIG_SLOB 106#ifndef CONFIG_SLOB
107struct kmem_cache * 107struct kmem_cache *
108__kmem_cache_alias(const char *name, size_t size, size_t align, 108__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
109 slab_flags_t flags, void (*ctor)(void *)); 109 slab_flags_t flags, void (*ctor)(void *));
110 110
111slab_flags_t kmem_cache_flags(unsigned long object_size, 111slab_flags_t kmem_cache_flags(unsigned int object_size,
112 slab_flags_t flags, const char *name, 112 slab_flags_t flags, const char *name,
113 void (*ctor)(void *)); 113 void (*ctor)(void *));
114#else 114#else
115static inline struct kmem_cache * 115static inline struct kmem_cache *
116__kmem_cache_alias(const char *name, size_t size, size_t align, 116__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
117 slab_flags_t flags, void (*ctor)(void *)) 117 slab_flags_t flags, void (*ctor)(void *))
118{ return NULL; } 118{ return NULL; }
119 119
120static inline slab_flags_t kmem_cache_flags(unsigned long object_size, 120static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
121 slab_flags_t flags, const char *name, 121 slab_flags_t flags, const char *name,
122 void (*ctor)(void *)) 122 void (*ctor)(void *))
123{ 123{
@@ -166,6 +166,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size,
166 SLAB_TEMPORARY | \ 166 SLAB_TEMPORARY | \
167 SLAB_ACCOUNT) 167 SLAB_ACCOUNT)
168 168
169bool __kmem_cache_empty(struct kmem_cache *);
169int __kmem_cache_shutdown(struct kmem_cache *); 170int __kmem_cache_shutdown(struct kmem_cache *);
170void __kmem_cache_release(struct kmem_cache *); 171void __kmem_cache_release(struct kmem_cache *);
171int __kmem_cache_shrink(struct kmem_cache *); 172int __kmem_cache_shrink(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 10f127b2de7c..98dcdc352062 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -10,6 +10,7 @@
10#include <linux/poison.h> 10#include <linux/poison.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/memory.h> 12#include <linux/memory.h>
13#include <linux/cache.h>
13#include <linux/compiler.h> 14#include <linux/compiler.h>
14#include <linux/module.h> 15#include <linux/module.h>
15#include <linux/cpu.h> 16#include <linux/cpu.h>
@@ -81,38 +82,19 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
81EXPORT_SYMBOL(kmem_cache_size); 82EXPORT_SYMBOL(kmem_cache_size);
82 83
83#ifdef CONFIG_DEBUG_VM 84#ifdef CONFIG_DEBUG_VM
84static int kmem_cache_sanity_check(const char *name, size_t size) 85static int kmem_cache_sanity_check(const char *name, unsigned int size)
85{ 86{
86 struct kmem_cache *s = NULL;
87
88 if (!name || in_interrupt() || size < sizeof(void *) || 87 if (!name || in_interrupt() || size < sizeof(void *) ||
89 size > KMALLOC_MAX_SIZE) { 88 size > KMALLOC_MAX_SIZE) {
90 pr_err("kmem_cache_create(%s) integrity check failed\n", name); 89 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
91 return -EINVAL; 90 return -EINVAL;
92 } 91 }
93 92
94 list_for_each_entry(s, &slab_caches, list) {
95 char tmp;
96 int res;
97
98 /*
99 * This happens when the module gets unloaded and doesn't
100 * destroy its slab cache and no-one else reuses the vmalloc
101 * area of the module. Print a warning.
102 */
103 res = probe_kernel_address(s->name, tmp);
104 if (res) {
105 pr_err("Slab cache with size %d has lost its name\n",
106 s->object_size);
107 continue;
108 }
109 }
110
111 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 93 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
112 return 0; 94 return 0;
113} 95}
114#else 96#else
115static inline int kmem_cache_sanity_check(const char *name, size_t size) 97static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
116{ 98{
117 return 0; 99 return 0;
118} 100}
@@ -279,8 +261,8 @@ static inline void memcg_unlink_cache(struct kmem_cache *s)
279 * Figure out what the alignment of the objects will be given a set of 261 * Figure out what the alignment of the objects will be given a set of
280 * flags, a user specified alignment and the size of the objects. 262 * flags, a user specified alignment and the size of the objects.
281 */ 263 */
282static unsigned long calculate_alignment(unsigned long flags, 264static unsigned int calculate_alignment(slab_flags_t flags,
283 unsigned long align, unsigned long size) 265 unsigned int align, unsigned int size)
284{ 266{
285 /* 267 /*
286 * If the user wants hardware cache aligned objects then follow that 268 * If the user wants hardware cache aligned objects then follow that
@@ -290,7 +272,7 @@ static unsigned long calculate_alignment(unsigned long flags,
290 * alignment though. If that is greater then use it. 272 * alignment though. If that is greater then use it.
291 */ 273 */
292 if (flags & SLAB_HWCACHE_ALIGN) { 274 if (flags & SLAB_HWCACHE_ALIGN) {
293 unsigned long ralign; 275 unsigned int ralign;
294 276
295 ralign = cache_line_size(); 277 ralign = cache_line_size();
296 while (size <= ralign / 2) 278 while (size <= ralign / 2)
@@ -330,7 +312,7 @@ int slab_unmergeable(struct kmem_cache *s)
330 return 0; 312 return 0;
331} 313}
332 314
333struct kmem_cache *find_mergeable(size_t size, size_t align, 315struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
334 slab_flags_t flags, const char *name, void (*ctor)(void *)) 316 slab_flags_t flags, const char *name, void (*ctor)(void *))
335{ 317{
336 struct kmem_cache *s; 318 struct kmem_cache *s;
@@ -378,9 +360,9 @@ struct kmem_cache *find_mergeable(size_t size, size_t align,
378} 360}
379 361
380static struct kmem_cache *create_cache(const char *name, 362static struct kmem_cache *create_cache(const char *name,
381 size_t object_size, size_t size, size_t align, 363 unsigned int object_size, unsigned int align,
382 slab_flags_t flags, size_t useroffset, 364 slab_flags_t flags, unsigned int useroffset,
383 size_t usersize, void (*ctor)(void *), 365 unsigned int usersize, void (*ctor)(void *),
384 struct mem_cgroup *memcg, struct kmem_cache *root_cache) 366 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
385{ 367{
386 struct kmem_cache *s; 368 struct kmem_cache *s;
@@ -395,8 +377,7 @@ static struct kmem_cache *create_cache(const char *name,
395 goto out; 377 goto out;
396 378
397 s->name = name; 379 s->name = name;
398 s->object_size = object_size; 380 s->size = s->object_size = object_size;
399 s->size = size;
400 s->align = align; 381 s->align = align;
401 s->ctor = ctor; 382 s->ctor = ctor;
402 s->useroffset = useroffset; 383 s->useroffset = useroffset;
@@ -451,8 +432,10 @@ out_free_cache:
451 * as davem. 432 * as davem.
452 */ 433 */
453struct kmem_cache * 434struct kmem_cache *
454kmem_cache_create_usercopy(const char *name, size_t size, size_t align, 435kmem_cache_create_usercopy(const char *name,
455 slab_flags_t flags, size_t useroffset, size_t usersize, 436 unsigned int size, unsigned int align,
437 slab_flags_t flags,
438 unsigned int useroffset, unsigned int usersize,
456 void (*ctor)(void *)) 439 void (*ctor)(void *))
457{ 440{
458 struct kmem_cache *s = NULL; 441 struct kmem_cache *s = NULL;
@@ -500,7 +483,7 @@ kmem_cache_create_usercopy(const char *name, size_t size, size_t align,
500 goto out_unlock; 483 goto out_unlock;
501 } 484 }
502 485
503 s = create_cache(cache_name, size, size, 486 s = create_cache(cache_name, size,
504 calculate_alignment(flags, align, size), 487 calculate_alignment(flags, align, size),
505 flags, useroffset, usersize, ctor, NULL, NULL); 488 flags, useroffset, usersize, ctor, NULL, NULL);
506 if (IS_ERR(s)) { 489 if (IS_ERR(s)) {
@@ -531,7 +514,7 @@ out_unlock:
531EXPORT_SYMBOL(kmem_cache_create_usercopy); 514EXPORT_SYMBOL(kmem_cache_create_usercopy);
532 515
533struct kmem_cache * 516struct kmem_cache *
534kmem_cache_create(const char *name, size_t size, size_t align, 517kmem_cache_create(const char *name, unsigned int size, unsigned int align,
535 slab_flags_t flags, void (*ctor)(void *)) 518 slab_flags_t flags, void (*ctor)(void *))
536{ 519{
537 return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, 520 return kmem_cache_create_usercopy(name, size, align, flags, 0, 0,
@@ -647,7 +630,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
647 goto out_unlock; 630 goto out_unlock;
648 631
649 s = create_cache(cache_name, root_cache->object_size, 632 s = create_cache(cache_name, root_cache->object_size,
650 root_cache->size, root_cache->align, 633 root_cache->align,
651 root_cache->flags & CACHE_CREATE_MASK, 634 root_cache->flags & CACHE_CREATE_MASK,
652 root_cache->useroffset, root_cache->usersize, 635 root_cache->useroffset, root_cache->usersize,
653 root_cache->ctor, memcg, root_cache); 636 root_cache->ctor, memcg, root_cache);
@@ -916,8 +899,9 @@ bool slab_is_available(void)
916 899
917#ifndef CONFIG_SLOB 900#ifndef CONFIG_SLOB
918/* Create a cache during boot when no slab services are available yet */ 901/* Create a cache during boot when no slab services are available yet */
919void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, 902void __init create_boot_cache(struct kmem_cache *s, const char *name,
920 slab_flags_t flags, size_t useroffset, size_t usersize) 903 unsigned int size, slab_flags_t flags,
904 unsigned int useroffset, unsigned int usersize)
921{ 905{
922 int err; 906 int err;
923 907
@@ -932,15 +916,15 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
932 err = __kmem_cache_create(s, flags); 916 err = __kmem_cache_create(s, flags);
933 917
934 if (err) 918 if (err)
935 panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", 919 panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n",
936 name, size, err); 920 name, size, err);
937 921
938 s->refcount = -1; /* Exempt from merging for now */ 922 s->refcount = -1; /* Exempt from merging for now */
939} 923}
940 924
941struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, 925struct kmem_cache *__init create_kmalloc_cache(const char *name,
942 slab_flags_t flags, size_t useroffset, 926 unsigned int size, slab_flags_t flags,
943 size_t usersize) 927 unsigned int useroffset, unsigned int usersize)
944{ 928{
945 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 929 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
946 930
@@ -954,11 +938,11 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
954 return s; 938 return s;
955} 939}
956 940
957struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; 941struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
958EXPORT_SYMBOL(kmalloc_caches); 942EXPORT_SYMBOL(kmalloc_caches);
959 943
960#ifdef CONFIG_ZONE_DMA 944#ifdef CONFIG_ZONE_DMA
961struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; 945struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
962EXPORT_SYMBOL(kmalloc_dma_caches); 946EXPORT_SYMBOL(kmalloc_dma_caches);
963#endif 947#endif
964 948
@@ -968,7 +952,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches);
968 * of two cache sizes there. The size of larger slabs can be determined using 952 * of two cache sizes there. The size of larger slabs can be determined using
969 * fls. 953 * fls.
970 */ 954 */
971static s8 size_index[24] = { 955static u8 size_index[24] __ro_after_init = {
972 3, /* 8 */ 956 3, /* 8 */
973 4, /* 16 */ 957 4, /* 16 */
974 5, /* 24 */ 958 5, /* 24 */
@@ -995,7 +979,7 @@ static s8 size_index[24] = {
995 2 /* 192 */ 979 2 /* 192 */
996}; 980};
997 981
998static inline int size_index_elem(size_t bytes) 982static inline unsigned int size_index_elem(unsigned int bytes)
999{ 983{
1000 return (bytes - 1) / 8; 984 return (bytes - 1) / 8;
1001} 985}
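The index arithmetic being retyped here is easy to demo on its own: a small kmalloc() request of n bytes (n <= 192) selects its cache via size_index[(n - 1) / 8]. A standalone illustration that reproduces only the index calculation, not the bucket contents:

#include <stdio.h>

static unsigned int size_index_elem(unsigned int bytes)
{
	return (bytes - 1) / 8;			/* same arithmetic as above */
}

int main(void)
{
	const unsigned int sizes[] = { 1, 8, 9, 24, 100, 192 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("kmalloc(%3u) -> size_index[%2u]\n",
		       sizes[i], size_index_elem(sizes[i]));
	return 0;
}

kmalloc(9), for example, lands in size_index[1], the same slot as every other size from 9 to 16 bytes; larger requests fall back to the fls()-based power-of-two lookup mentioned in the comment.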
@@ -1006,7 +990,7 @@ static inline int size_index_elem(size_t bytes)
1006 */ 990 */
1007struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) 991struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
1008{ 992{
1009 int index; 993 unsigned int index;
1010 994
1011 if (unlikely(size > KMALLOC_MAX_SIZE)) { 995 if (unlikely(size > KMALLOC_MAX_SIZE)) {
1012 WARN_ON_ONCE(!(flags & __GFP_NOWARN)); 996 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
@@ -1064,13 +1048,13 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = {
1064 */ 1048 */
1065void __init setup_kmalloc_cache_index_table(void) 1049void __init setup_kmalloc_cache_index_table(void)
1066{ 1050{
1067 int i; 1051 unsigned int i;
1068 1052
1069 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 1053 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
1070 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 1054 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
1071 1055
1072 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 1056 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
1073 int elem = size_index_elem(i); 1057 unsigned int elem = size_index_elem(i);
1074 1058
1075 if (elem >= ARRAY_SIZE(size_index)) 1059 if (elem >= ARRAY_SIZE(size_index))
1076 break; 1060 break;
@@ -1137,9 +1121,9 @@ void __init create_kmalloc_caches(slab_flags_t flags)
1137 struct kmem_cache *s = kmalloc_caches[i]; 1121 struct kmem_cache *s = kmalloc_caches[i];
1138 1122
1139 if (s) { 1123 if (s) {
1140 int size = kmalloc_size(i); 1124 unsigned int size = kmalloc_size(i);
1141 char *n = kasprintf(GFP_NOWAIT, 1125 char *n = kasprintf(GFP_NOWAIT,
1142 "dma-kmalloc-%d", size); 1126 "dma-kmalloc-%u", size);
1143 1127
1144 BUG_ON(!n); 1128 BUG_ON(!n);
1145 kmalloc_dma_caches[i] = create_kmalloc_cache(n, 1129 kmalloc_dma_caches[i] = create_kmalloc_cache(n,
@@ -1182,10 +1166,10 @@ EXPORT_SYMBOL(kmalloc_order_trace);
1182#ifdef CONFIG_SLAB_FREELIST_RANDOM 1166#ifdef CONFIG_SLAB_FREELIST_RANDOM
1183/* Randomize a generic freelist */ 1167/* Randomize a generic freelist */
1184static void freelist_randomize(struct rnd_state *state, unsigned int *list, 1168static void freelist_randomize(struct rnd_state *state, unsigned int *list,
1185 size_t count) 1169 unsigned int count)
1186{ 1170{
1187 size_t i;
1188 unsigned int rand; 1171 unsigned int rand;
1172 unsigned int i;
1189 1173
1190 for (i = 0; i < count; i++) 1174 for (i = 0; i < count; i++)
1191 list[i] = i; 1175 list[i] = i;
@@ -1532,3 +1516,11 @@ EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
1532EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); 1516EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
1533EXPORT_TRACEPOINT_SYMBOL(kfree); 1517EXPORT_TRACEPOINT_SYMBOL(kfree);
1534EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); 1518EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
1519
1520int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
1521{
1522 if (__should_failslab(s, gfpflags))
1523 return -ENOMEM;
1524 return 0;
1525}
1526ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
diff --git a/mm/slub.c b/mm/slub.c
index e381728a3751..4fb037c98782 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -311,18 +311,18 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
311 __p += (__s)->size, __idx++) 311 __p += (__s)->size, __idx++)
312 312
313/* Determine object index from a given position */ 313/* Determine object index from a given position */
314static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 314static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
315{ 315{
316 return (p - addr) / s->size; 316 return (p - addr) / s->size;
317} 317}
318 318
319static inline int order_objects(int order, unsigned long size, int reserved) 319static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved)
320{ 320{
321 return ((PAGE_SIZE << order) - reserved) / size; 321 return (((unsigned int)PAGE_SIZE << order) - reserved) / size;
322} 322}
323 323
324static inline struct kmem_cache_order_objects oo_make(int order, 324static inline struct kmem_cache_order_objects oo_make(unsigned int order,
325 unsigned long size, int reserved) 325 unsigned int size, unsigned int reserved)
326{ 326{
327 struct kmem_cache_order_objects x = { 327 struct kmem_cache_order_objects x = {
328 (order << OO_SHIFT) + order_objects(order, size, reserved) 328 (order << OO_SHIFT) + order_objects(order, size, reserved)
@@ -331,12 +331,12 @@ static inline struct kmem_cache_order_objects oo_make(int order,
331 return x; 331 return x;
332} 332}
333 333
334static inline int oo_order(struct kmem_cache_order_objects x) 334static inline unsigned int oo_order(struct kmem_cache_order_objects x)
335{ 335{
336 return x.x >> OO_SHIFT; 336 return x.x >> OO_SHIFT;
337} 337}
338 338
339static inline int oo_objects(struct kmem_cache_order_objects x) 339static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
340{ 340{
341 return x.x & OO_MASK; 341 return x.x & OO_MASK;
342} 342}
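The kmem_cache_order_objects word touched above packs two values into one integer: the page order in the high bits and the object count in the low OO_SHIFT bits. A standalone sketch of the packing, taking OO_SHIFT == 16 and 4 KiB pages as assumptions for the demo rather than reading them from kernel headers:

#include <stdio.h>

#define PAGE_SIZE 4096u				/* assumed */
#define OO_SHIFT  16				/* assumed to match mm/slub.c */
#define OO_MASK   ((1u << OO_SHIFT) - 1)

static unsigned int order_objects(unsigned int order, unsigned int size,
				  unsigned int reserved)
{
	return ((PAGE_SIZE << order) - reserved) / size;
}

int main(void)
{
	unsigned int order = 3, size = 256, reserved = 0;
	unsigned int x = (order << OO_SHIFT) + order_objects(order, size, reserved);

	printf("order=%u objects=%u packed=0x%x\n",
	       x >> OO_SHIFT, x & OO_MASK, x);
	return 0;
}

An order-3 slab of 256-byte objects packs to 0x30080: order 3 in the top half, 128 objects in the bottom half, exactly what oo_order() and oo_objects() unpack.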
@@ -466,7 +466,7 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
466 set_bit(slab_index(p, s, addr), map); 466 set_bit(slab_index(p, s, addr), map);
467} 467}
468 468
469static inline int size_from_object(struct kmem_cache *s) 469static inline unsigned int size_from_object(struct kmem_cache *s)
470{ 470{
471 if (s->flags & SLAB_RED_ZONE) 471 if (s->flags & SLAB_RED_ZONE)
472 return s->size - s->red_left_pad; 472 return s->size - s->red_left_pad;
@@ -598,13 +598,13 @@ static void init_tracking(struct kmem_cache *s, void *object)
598 set_track(s, object, TRACK_ALLOC, 0UL); 598 set_track(s, object, TRACK_ALLOC, 0UL);
599} 599}
600 600
601static void print_track(const char *s, struct track *t) 601static void print_track(const char *s, struct track *t, unsigned long pr_time)
602{ 602{
603 if (!t->addr) 603 if (!t->addr)
604 return; 604 return;
605 605
606 pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", 606 pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
607 s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); 607 s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
608#ifdef CONFIG_STACKTRACE 608#ifdef CONFIG_STACKTRACE
609 { 609 {
610 int i; 610 int i;
@@ -619,11 +619,12 @@ static void print_track(const char *s, struct track *t)
619 619
620static void print_tracking(struct kmem_cache *s, void *object) 620static void print_tracking(struct kmem_cache *s, void *object)
621{ 621{
622 unsigned long pr_time = jiffies;
622 if (!(s->flags & SLAB_STORE_USER)) 623 if (!(s->flags & SLAB_STORE_USER))
623 return; 624 return;
624 625
625 print_track("Allocated", get_track(s, object, TRACK_ALLOC)); 626 print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time);
626 print_track("Freed", get_track(s, object, TRACK_FREE)); 627 print_track("Freed", get_track(s, object, TRACK_FREE), pr_time);
627} 628}
628 629
629static void print_page_info(struct page *page) 630static void print_page_info(struct page *page)
@@ -680,7 +681,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
680 print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); 681 print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
681 682
682 print_section(KERN_ERR, "Object ", p, 683 print_section(KERN_ERR, "Object ", p,
683 min_t(unsigned long, s->object_size, PAGE_SIZE)); 684 min_t(unsigned int, s->object_size, PAGE_SIZE));
684 if (s->flags & SLAB_RED_ZONE) 685 if (s->flags & SLAB_RED_ZONE)
685 print_section(KERN_ERR, "Redzone ", p + s->object_size, 686 print_section(KERN_ERR, "Redzone ", p + s->object_size,
686 s->inuse - s->object_size); 687 s->inuse - s->object_size);
@@ -1292,7 +1293,7 @@ out:
1292 1293
1293__setup("slub_debug", setup_slub_debug); 1294__setup("slub_debug", setup_slub_debug);
1294 1295
1295slab_flags_t kmem_cache_flags(unsigned long object_size, 1296slab_flags_t kmem_cache_flags(unsigned int object_size,
1296 slab_flags_t flags, const char *name, 1297 slab_flags_t flags, const char *name,
1297 void (*ctor)(void *)) 1298 void (*ctor)(void *))
1298{ 1299{
@@ -1325,7 +1326,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1325 struct page *page) {} 1326 struct page *page) {}
1326static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, 1327static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
1327 struct page *page) {} 1328 struct page *page) {}
1328slab_flags_t kmem_cache_flags(unsigned long object_size, 1329slab_flags_t kmem_cache_flags(unsigned int object_size,
1329 slab_flags_t flags, const char *name, 1330 slab_flags_t flags, const char *name,
1330 void (*ctor)(void *)) 1331 void (*ctor)(void *))
1331{ 1332{
@@ -1435,7 +1436,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1435 gfp_t flags, int node, struct kmem_cache_order_objects oo) 1436 gfp_t flags, int node, struct kmem_cache_order_objects oo)
1436{ 1437{
1437 struct page *page; 1438 struct page *page;
1438 int order = oo_order(oo); 1439 unsigned int order = oo_order(oo);
1439 1440
1440 if (node == NUMA_NO_NODE) 1441 if (node == NUMA_NO_NODE)
1441 page = alloc_pages(flags, order); 1442 page = alloc_pages(flags, order);
@@ -1454,8 +1455,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1454/* Pre-initialize the random sequence cache */ 1455/* Pre-initialize the random sequence cache */
1455static int init_cache_random_seq(struct kmem_cache *s) 1456static int init_cache_random_seq(struct kmem_cache *s)
1456{ 1457{
1458 unsigned int count = oo_objects(s->oo);
1457 int err; 1459 int err;
1458 unsigned long i, count = oo_objects(s->oo);
1459 1460
1460 /* Bailout if already initialised */ 1461 /* Bailout if already initialised */
1461 if (s->random_seq) 1462 if (s->random_seq)
@@ -1470,6 +1471,8 @@ static int init_cache_random_seq(struct kmem_cache *s)
1470 1471
1471 /* Transform to an offset on the set of pages */ 1472 /* Transform to an offset on the set of pages */
1472 if (s->random_seq) { 1473 if (s->random_seq) {
1474 unsigned int i;
1475
1473 for (i = 0; i < count; i++) 1476 for (i = 0; i < count; i++)
1474 s->random_seq[i] *= s->size; 1477 s->random_seq[i] *= s->size;
1475 } 1478 }
@@ -1811,7 +1814,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1811{ 1814{
1812 struct page *page, *page2; 1815 struct page *page, *page2;
1813 void *object = NULL; 1816 void *object = NULL;
1814 int available = 0; 1817 unsigned int available = 0;
1815 int objects; 1818 int objects;
1816 1819
1817 /* 1820 /*
@@ -2398,7 +2401,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2398 2401
2399 pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", 2402 pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n",
2400 nid, gfpflags, &gfpflags); 2403 nid, gfpflags, &gfpflags);
2401 pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", 2404 pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n",
2402 s->name, s->object_size, s->size, oo_order(s->oo), 2405 s->name, s->object_size, s->size, oo_order(s->oo),
2403 oo_order(s->min)); 2406 oo_order(s->min));
2404 2407
@@ -3181,9 +3184,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk);
3181 * and increases the number of allocations possible without having to 3184 * and increases the number of allocations possible without having to
3182 * take the list_lock. 3185 * take the list_lock.
3183 */ 3186 */
3184static int slub_min_order; 3187static unsigned int slub_min_order;
3185static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; 3188static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
3186static int slub_min_objects; 3189static unsigned int slub_min_objects;
3187 3190
3188/* 3191/*
3189 * Calculate the order of allocation given an slab object size. 3192 * Calculate the order of allocation given an slab object size.
@@ -3210,20 +3213,21 @@ static int slub_min_objects;
3210 * requested a higher mininum order then we start with that one instead of 3213 * requested a higher mininum order then we start with that one instead of
3211 * the smallest order which will fit the object. 3214 * the smallest order which will fit the object.
3212 */ 3215 */
3213static inline int slab_order(int size, int min_objects, 3216static inline unsigned int slab_order(unsigned int size,
3214 int max_order, int fract_leftover, int reserved) 3217 unsigned int min_objects, unsigned int max_order,
3218 unsigned int fract_leftover, unsigned int reserved)
3215{ 3219{
3216 int order; 3220 unsigned int min_order = slub_min_order;
3217 int rem; 3221 unsigned int order;
3218 int min_order = slub_min_order;
3219 3222
3220 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) 3223 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
3221 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 3224 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
3222 3225
3223 for (order = max(min_order, get_order(min_objects * size + reserved)); 3226 for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved));
3224 order <= max_order; order++) { 3227 order <= max_order; order++) {
3225 3228
3226 unsigned long slab_size = PAGE_SIZE << order; 3229 unsigned int slab_size = (unsigned int)PAGE_SIZE << order;
3230 unsigned int rem;
3227 3231
3228 rem = (slab_size - reserved) % size; 3232 rem = (slab_size - reserved) % size;
3229 3233
@@ -3234,12 +3238,11 @@ static inline int slab_order(int size, int min_objects,
3234 return order; 3238 return order;
3235} 3239}
3236 3240
3237static inline int calculate_order(int size, int reserved) 3241static inline int calculate_order(unsigned int size, unsigned int reserved)
3238{ 3242{
3239 int order; 3243 unsigned int order;
3240 int min_objects; 3244 unsigned int min_objects;
3241 int fraction; 3245 unsigned int max_objects;
3242 int max_objects;
3243 3246
3244 /* 3247 /*
3245 * Attempt to find best configuration for a slab. This 3248 * Attempt to find best configuration for a slab. This
@@ -3256,6 +3259,8 @@ static inline int calculate_order(int size, int reserved)
3256 min_objects = min(min_objects, max_objects); 3259 min_objects = min(min_objects, max_objects);
3257 3260
3258 while (min_objects > 1) { 3261 while (min_objects > 1) {
3262 unsigned int fraction;
3263
3259 fraction = 16; 3264 fraction = 16;
3260 while (fraction >= 4) { 3265 while (fraction >= 4) {
3261 order = slab_order(size, min_objects, 3266 order = slab_order(size, min_objects,
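The search that calculate_order() and slab_order() perform above can be followed with a toy version: walk the orders from small to large and stop at the first one whose leftover space is no more than size/fraction. This simplified sketch drops the reserved bytes and the get_order() starting point, so it is an approximation of the real routine, not a copy:

#include <stdio.h>

#define PAGE_SIZE 4096u				/* assumed page size */

static unsigned int slab_order(unsigned int size, unsigned int min_objects,
			       unsigned int max_order, unsigned int fract_leftover)
{
	unsigned int order;

	for (order = 0; order <= max_order; order++) {
		unsigned int slab_size = PAGE_SIZE << order;
		unsigned int rem;

		if (slab_size < min_objects * size)
			continue;		/* too small to hold min_objects */
		rem = slab_size % size;
		if (rem <= slab_size / fract_leftover)
			break;			/* waste is acceptable */
	}
	return order;				/* max_order + 1 means nothing fit */
}

int main(void)
{
	printf("700-byte objects, >=16 per slab -> order %u\n",
	       slab_order(700, 16, 3, 16));
	return 0;
}

For 700-byte objects with at least 16 objects per slab and a fraction of 16, it settles on order 2: a 16 KiB slab holds 23 objects and wastes only 284 bytes.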
@@ -3457,8 +3462,8 @@ static void set_cpu_partial(struct kmem_cache *s)
3457static int calculate_sizes(struct kmem_cache *s, int forced_order) 3462static int calculate_sizes(struct kmem_cache *s, int forced_order)
3458{ 3463{
3459 slab_flags_t flags = s->flags; 3464 slab_flags_t flags = s->flags;
3460 size_t size = s->object_size; 3465 unsigned int size = s->object_size;
3461 int order; 3466 unsigned int order;
3462 3467
3463 /* 3468 /*
3464 * Round up object size to the next word boundary. We can only 3469 * Round up object size to the next word boundary. We can only
@@ -3548,7 +3553,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
3548 else 3553 else
3549 order = calculate_order(size, s->reserved); 3554 order = calculate_order(size, s->reserved);
3550 3555
3551 if (order < 0) 3556 if ((int)order < 0)
3552 return 0; 3557 return 0;
3553 3558
3554 s->allocflags = 0; 3559 s->allocflags = 0;
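calculate_order() still reports failure with a negative value returned through a plain int, but `order` in calculate_sizes() is now unsigned, so a bare `order < 0` would be constant-false; the `(int)order < 0` cast keeps the error check meaningful. A minimal user-space demonstration of the pitfall:

/* Why the (int) cast matters once "order" is unsigned: a user-space demo. */
#include <stdio.h>

static int calc(void)
{
	return -1;		/* stand-in for a negative error from calculate_order() */
}

int main(void)
{
	unsigned int order = calc();	/* -1 converts to UINT_MAX */

	if (order < 0)			/* always false (compilers usually warn) */
		puts("unsigned compare caught the error (never printed)");
	if ((int)order < 0)		/* true: the cast restores the sign */
		puts("(int) cast caught the error");
	return 0;
}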
@@ -3632,8 +3637,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
3632 free_kmem_cache_nodes(s); 3637 free_kmem_cache_nodes(s);
3633error: 3638error:
3634 if (flags & SLAB_PANIC) 3639 if (flags & SLAB_PANIC)
3635 panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", 3640 panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n",
3636 s->name, (unsigned long)s->size, s->size, 3641 s->name, s->size, s->size,
3637 oo_order(s->oo), s->offset, (unsigned long)flags); 3642 oo_order(s->oo), s->offset, (unsigned long)flags);
3638 return -EINVAL; 3643 return -EINVAL;
3639} 3644}
@@ -3691,6 +3696,17 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3691 discard_slab(s, page); 3696 discard_slab(s, page);
3692} 3697}
3693 3698
3699bool __kmem_cache_empty(struct kmem_cache *s)
3700{
3701 int node;
3702 struct kmem_cache_node *n;
3703
3704 for_each_kmem_cache_node(s, node, n)
3705 if (n->nr_partial || slabs_node(s, node))
3706 return false;
3707 return true;
3708}
3709
3694/* 3710/*
3695 * Release all resources used by a slab cache. 3711 * Release all resources used by a slab cache.
3696 */ 3712 */
@@ -3716,7 +3732,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
3716 3732
3717static int __init setup_slub_min_order(char *str) 3733static int __init setup_slub_min_order(char *str)
3718{ 3734{
3719 get_option(&str, &slub_min_order); 3735 get_option(&str, (int *)&slub_min_order);
3720 3736
3721 return 1; 3737 return 1;
3722} 3738}
@@ -3725,8 +3741,8 @@ __setup("slub_min_order=", setup_slub_min_order);
3725 3741
3726static int __init setup_slub_max_order(char *str) 3742static int __init setup_slub_max_order(char *str)
3727{ 3743{
3728 get_option(&str, &slub_max_order); 3744 get_option(&str, (int *)&slub_max_order);
3729 slub_max_order = min(slub_max_order, MAX_ORDER - 1); 3745 slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1);
3730 3746
3731 return 1; 3747 return 1;
3732} 3748}
@@ -3735,7 +3751,7 @@ __setup("slub_max_order=", setup_slub_max_order);
3735 3751
3736static int __init setup_slub_min_objects(char *str) 3752static int __init setup_slub_min_objects(char *str)
3737{ 3753{
3738 get_option(&str, &slub_min_objects); 3754 get_option(&str, (int *)&slub_min_objects);
3739 3755
3740 return 1; 3756 return 1;
3741} 3757}
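The slub_min_order/slub_max_order/slub_min_objects parameters are now unsigned while get_option() still writes through an `int *`, hence the casts. An alternative that avoids the cast is to parse into a temporary int and assign after a sanity check; a hedged kernel-context sketch for a made-up boot parameter:

/* Kernel-context sketch (not standalone): parsing an unsigned boot parameter
 * with get_option(), which only knows about int.  "my_min_order" is a made-up
 * example mirroring setup_slub_min_order() above. */
#include <linux/kernel.h>
#include <linux/init.h>

static unsigned int my_min_order;

static int __init setup_my_min_order(char *str)
{
	int val;

	if (get_option(&str, &val) && val >= 0)
		my_min_order = val;	/* keep the variable unsigned, reject negatives */

	return 1;
}
__setup("my_min_order=", setup_my_min_order);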
@@ -3824,7 +3840,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
3824 bool to_user) 3840 bool to_user)
3825{ 3841{
3826 struct kmem_cache *s; 3842 struct kmem_cache *s;
3827 unsigned long offset; 3843 unsigned int offset;
3828 size_t object_size; 3844 size_t object_size;
3829 3845
3830 /* Find object and usable object size. */ 3846 /* Find object and usable object size. */
@@ -4230,7 +4246,7 @@ void __init kmem_cache_init(void)
4230 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, 4246 cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
4231 slub_cpu_dead); 4247 slub_cpu_dead);
4232 4248
4233 pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", 4249 pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
4234 cache_line_size(), 4250 cache_line_size(),
4235 slub_min_order, slub_max_order, slub_min_objects, 4251 slub_min_order, slub_max_order, slub_min_objects,
4236 nr_cpu_ids, nr_node_ids); 4252 nr_cpu_ids, nr_node_ids);
@@ -4241,7 +4257,7 @@ void __init kmem_cache_init_late(void)
4241} 4257}
4242 4258
4243struct kmem_cache * 4259struct kmem_cache *
4244__kmem_cache_alias(const char *name, size_t size, size_t align, 4260__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
4245 slab_flags_t flags, void (*ctor)(void *)) 4261 slab_flags_t flags, void (*ctor)(void *))
4246{ 4262{
4247 struct kmem_cache *s, *c; 4263 struct kmem_cache *s, *c;
@@ -4254,13 +4270,12 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
4254 * Adjust the object sizes so that we clear 4270 * Adjust the object sizes so that we clear
4255 * the complete object on kzalloc. 4271 * the complete object on kzalloc.
4256 */ 4272 */
4257 s->object_size = max(s->object_size, (int)size); 4273 s->object_size = max(s->object_size, size);
4258 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 4274 s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
4259 4275
4260 for_each_memcg_cache(c, s) { 4276 for_each_memcg_cache(c, s) {
4261 c->object_size = s->object_size; 4277 c->object_size = s->object_size;
4262 c->inuse = max_t(int, c->inuse, 4278 c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
4263 ALIGN(size, sizeof(void *)));
4264 } 4279 }
4265 4280
4266 if (sysfs_slab_alias(s, name)) { 4281 if (sysfs_slab_alias(s, name)) {
@@ -4889,35 +4904,35 @@ struct slab_attribute {
4889 4904
4890static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4905static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4891{ 4906{
4892 return sprintf(buf, "%d\n", s->size); 4907 return sprintf(buf, "%u\n", s->size);
4893} 4908}
4894SLAB_ATTR_RO(slab_size); 4909SLAB_ATTR_RO(slab_size);
4895 4910
4896static ssize_t align_show(struct kmem_cache *s, char *buf) 4911static ssize_t align_show(struct kmem_cache *s, char *buf)
4897{ 4912{
4898 return sprintf(buf, "%d\n", s->align); 4913 return sprintf(buf, "%u\n", s->align);
4899} 4914}
4900SLAB_ATTR_RO(align); 4915SLAB_ATTR_RO(align);
4901 4916
4902static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4917static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4903{ 4918{
4904 return sprintf(buf, "%d\n", s->object_size); 4919 return sprintf(buf, "%u\n", s->object_size);
4905} 4920}
4906SLAB_ATTR_RO(object_size); 4921SLAB_ATTR_RO(object_size);
4907 4922
4908static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) 4923static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
4909{ 4924{
4910 return sprintf(buf, "%d\n", oo_objects(s->oo)); 4925 return sprintf(buf, "%u\n", oo_objects(s->oo));
4911} 4926}
4912SLAB_ATTR_RO(objs_per_slab); 4927SLAB_ATTR_RO(objs_per_slab);
4913 4928
4914static ssize_t order_store(struct kmem_cache *s, 4929static ssize_t order_store(struct kmem_cache *s,
4915 const char *buf, size_t length) 4930 const char *buf, size_t length)
4916{ 4931{
4917 unsigned long order; 4932 unsigned int order;
4918 int err; 4933 int err;
4919 4934
4920 err = kstrtoul(buf, 10, &order); 4935 err = kstrtouint(buf, 10, &order);
4921 if (err) 4936 if (err)
4922 return err; 4937 return err;
4923 4938
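order_store() above, and cpu_partial_store() in the next hunk, switch from kstrtoul() into a temporary unsigned long to kstrtouint() straight into an unsigned int, which also rejects values that do not fit. A minimal kernel-context sketch of the parse-then-range-check shape for a hypothetical attribute:

/* Kernel-context sketch: parse an unsigned sysfs value with kstrtouint() and
 * range-check it before storing.  "demo_limit" and DEMO_MAX are hypothetical. */
#include <linux/kernel.h>
#include <linux/errno.h>

#define DEMO_MAX	64U

static unsigned int demo_limit;

static ssize_t demo_limit_store(const char *buf, size_t length)
{
	unsigned int val;
	int err;

	err = kstrtouint(buf, 10, &val);	/* -EINVAL / -ERANGE on bad input */
	if (err)
		return err;
	if (val > DEMO_MAX)
		return -ERANGE;

	demo_limit = val;
	return length;
}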
@@ -4930,7 +4945,7 @@ static ssize_t order_store(struct kmem_cache *s,
4930 4945
4931static ssize_t order_show(struct kmem_cache *s, char *buf) 4946static ssize_t order_show(struct kmem_cache *s, char *buf)
4932{ 4947{
4933 return sprintf(buf, "%d\n", oo_order(s->oo)); 4948 return sprintf(buf, "%u\n", oo_order(s->oo));
4934} 4949}
4935SLAB_ATTR(order); 4950SLAB_ATTR(order);
4936 4951
@@ -4962,10 +4977,10 @@ static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4962static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, 4977static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4963 size_t length) 4978 size_t length)
4964{ 4979{
4965 unsigned long objects; 4980 unsigned int objects;
4966 int err; 4981 int err;
4967 4982
4968 err = kstrtoul(buf, 10, &objects); 4983 err = kstrtouint(buf, 10, &objects);
4969 if (err) 4984 if (err)
4970 return err; 4985 return err;
4971 if (objects && !kmem_cache_has_cpu_partial(s)) 4986 if (objects && !kmem_cache_has_cpu_partial(s))
@@ -5081,7 +5096,7 @@ SLAB_ATTR_RO(cache_dma);
5081 5096
5082static ssize_t usersize_show(struct kmem_cache *s, char *buf) 5097static ssize_t usersize_show(struct kmem_cache *s, char *buf)
5083{ 5098{
5084 return sprintf(buf, "%zu\n", s->usersize); 5099 return sprintf(buf, "%u\n", s->usersize);
5085} 5100}
5086SLAB_ATTR_RO(usersize); 5101SLAB_ATTR_RO(usersize);
5087 5102
@@ -5093,7 +5108,7 @@ SLAB_ATTR_RO(destroy_by_rcu);
5093 5108
5094static ssize_t reserved_show(struct kmem_cache *s, char *buf) 5109static ssize_t reserved_show(struct kmem_cache *s, char *buf)
5095{ 5110{
5096 return sprintf(buf, "%d\n", s->reserved); 5111 return sprintf(buf, "%u\n", s->reserved);
5097} 5112}
5098SLAB_ATTR_RO(reserved); 5113SLAB_ATTR_RO(reserved);
5099 5114
@@ -5288,21 +5303,22 @@ SLAB_ATTR(shrink);
5288#ifdef CONFIG_NUMA 5303#ifdef CONFIG_NUMA
5289static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 5304static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
5290{ 5305{
5291 return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); 5306 return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10);
5292} 5307}
5293 5308
5294static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, 5309static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
5295 const char *buf, size_t length) 5310 const char *buf, size_t length)
5296{ 5311{
5297 unsigned long ratio; 5312 unsigned int ratio;
5298 int err; 5313 int err;
5299 5314
5300 err = kstrtoul(buf, 10, &ratio); 5315 err = kstrtouint(buf, 10, &ratio);
5301 if (err) 5316 if (err)
5302 return err; 5317 return err;
5318 if (ratio > 100)
5319 return -ERANGE;
5303 5320
5304 if (ratio <= 100) 5321 s->remote_node_defrag_ratio = ratio * 10;
5305 s->remote_node_defrag_ratio = ratio * 10;
5306 5322
5307 return length; 5323 return length;
5308} 5324}
@@ -5663,7 +5679,7 @@ static char *create_unique_id(struct kmem_cache *s)
5663 *p++ = 'A'; 5679 *p++ = 'A';
5664 if (p != name + 1) 5680 if (p != name + 1)
5665 *p++ = '-'; 5681 *p++ = '-';
5666 p += sprintf(p, "%07d", s->size); 5682 p += sprintf(p, "%07u", s->size);
5667 5683
5668 BUG_ON(p > name + ID_STR_LENGTH - 1); 5684 BUG_ON(p > name + ID_STR_LENGTH - 1);
5669 return name; 5685 return name;
diff --git a/mm/sparse.c b/mm/sparse.c
index 58cab483e81b..62eef264a7bd 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -779,7 +779,13 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat,
779 goto out; 779 goto out;
780 } 780 }
781 781
782 memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); 782#ifdef CONFIG_DEBUG_VM
783 /*
784 * Poison uninitialized struct pages in order to catch invalid flags
785 * combinations.
786 */
787 memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION);
788#endif
783 789
784 section_mark_present(ms); 790 section_mark_present(ms);
785 791
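Under CONFIG_DEBUG_VM the hot-added section's memmap is now poisoned rather than zeroed, so code that inspects a struct page before the onlining path initializes it hits an obviously bogus bit pattern instead of silently reading zeros. A user-space sketch of the poison-then-verify idea (the all-ones pattern below is an assumption standing in for PAGE_POISON_PATTERN):

/* User-space sketch of poison-then-verify; the all-ones byte is an assumption
 * standing in for PAGE_POISON_PATTERN. */
#include <assert.h>
#include <string.h>

#define POISON_BYTE 0xff

struct fake_page { unsigned long flags; void *mapping; };

int main(void)
{
	struct fake_page section[8];

	memset(section, POISON_BYTE, sizeof(section));

	/* An initialization pass is expected to overwrite every field; a
	 * checker can then assert that no poisoned word survived. */
	for (unsigned int i = 0; i < 8; i++)
		section[i] = (struct fake_page){ .flags = 0, .mapping = NULL };

	for (unsigned int i = 0; i < 8; i++)
		assert(section[i].flags != ~0UL);
	return 0;
}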
diff --git a/mm/swap.c b/mm/swap.c
index 0f17330dd0e5..3dd518832096 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -707,7 +707,6 @@ void lru_add_drain_all(void)
707 * release_pages - batched put_page() 707 * release_pages - batched put_page()
708 * @pages: array of pages to release 708 * @pages: array of pages to release
709 * @nr: number of pages 709 * @nr: number of pages
710 * @cold: whether the pages are cache cold
711 * 710 *
712 * Decrement the reference count on all the pages in @pages. If it 711 * Decrement the reference count on all the pages in @pages. If it
713 * fell to zero, remove the page from the LRU and free it. 712 * fell to zero, remove the page from the LRU and free it.
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index bebc19292018..f2641894f440 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -34,8 +34,6 @@
34#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/mm.h> 35#include <linux/mm.h>
36 36
37#ifdef CONFIG_SWAP
38
39static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); 37static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots);
40static bool swap_slot_cache_active; 38static bool swap_slot_cache_active;
41bool swap_slot_cache_enabled; 39bool swap_slot_cache_enabled;
@@ -356,5 +354,3 @@ repeat:
356 354
357 return entry; 355 return entry;
358} 356}
359
360#endif /* CONFIG_SWAP */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 39ae7cfad90f..f233dccd3b1b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -38,7 +38,7 @@ static const struct address_space_operations swap_aops = {
38 38
39struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; 39struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
40static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; 40static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
41bool swap_vma_readahead __read_mostly = true; 41static bool enable_vma_readahead __read_mostly = true;
42 42
43#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) 43#define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2)
44#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) 44#define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1)
@@ -322,6 +322,11 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
322 release_pages(pagep, nr); 322 release_pages(pagep, nr);
323} 323}
324 324
325static inline bool swap_use_vma_readahead(void)
326{
327 return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
328}
329
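swap_use_vma_readahead() reads enable_vma_readahead with READ_ONCE() because the sysfs knob can flip the flag concurrently with faults; the annotation forces a single load so each decision sees one consistent value. A tiny kernel-context sketch of that pattern with an illustrative flag:

/* Kernel-context sketch: read a concurrently-writable flag exactly once per
 * decision.  "feature_enabled" is an illustrative flag, not a kernel symbol. */
#include <linux/compiler.h>
#include <linux/cache.h>
#include <linux/types.h>

static bool feature_enabled __read_mostly = true;

static bool feature_should_run(void)
{
	/* One load; the compiler may not re-read the flag mid-decision. */
	return READ_ONCE(feature_enabled);
}

static void feature_set(bool on)
{
	WRITE_ONCE(feature_enabled, on);	/* paired store, e.g. from a sysfs handler */
}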
325/* 330/*
326 * Lookup a swap entry in the swap cache. A found page will be returned 331 * Lookup a swap entry in the swap cache. A found page will be returned
327 * unlocked and with its refcount incremented - we rely on the kernel 332 * unlocked and with its refcount incremented - we rely on the kernel
@@ -332,32 +337,43 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
332 unsigned long addr) 337 unsigned long addr)
333{ 338{
334 struct page *page; 339 struct page *page;
335 unsigned long ra_info;
336 int win, hits, readahead;
337 340
338 page = find_get_page(swap_address_space(entry), swp_offset(entry)); 341 page = find_get_page(swap_address_space(entry), swp_offset(entry));
339 342
340 INC_CACHE_INFO(find_total); 343 INC_CACHE_INFO(find_total);
341 if (page) { 344 if (page) {
345 bool vma_ra = swap_use_vma_readahead();
346 bool readahead;
347
342 INC_CACHE_INFO(find_success); 348 INC_CACHE_INFO(find_success);
349 /*
350 * At the moment, we don't support PG_readahead for anon THP
351 * so let's bail out rather than confusing the readahead stat.
352 */
343 if (unlikely(PageTransCompound(page))) 353 if (unlikely(PageTransCompound(page)))
344 return page; 354 return page;
355
345 readahead = TestClearPageReadahead(page); 356 readahead = TestClearPageReadahead(page);
346 if (vma) { 357 if (vma && vma_ra) {
347 ra_info = GET_SWAP_RA_VAL(vma); 358 unsigned long ra_val;
348 win = SWAP_RA_WIN(ra_info); 359 int win, hits;
349 hits = SWAP_RA_HITS(ra_info); 360
361 ra_val = GET_SWAP_RA_VAL(vma);
362 win = SWAP_RA_WIN(ra_val);
363 hits = SWAP_RA_HITS(ra_val);
350 if (readahead) 364 if (readahead)
351 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); 365 hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
352 atomic_long_set(&vma->swap_readahead_info, 366 atomic_long_set(&vma->swap_readahead_info,
353 SWAP_RA_VAL(addr, win, hits)); 367 SWAP_RA_VAL(addr, win, hits));
354 } 368 }
369
355 if (readahead) { 370 if (readahead) {
356 count_vm_event(SWAP_RA_HIT); 371 count_vm_event(SWAP_RA_HIT);
357 if (!vma) 372 if (!vma || !vma_ra)
358 atomic_inc(&swapin_readahead_hits); 373 atomic_inc(&swapin_readahead_hits);
359 } 374 }
360 } 375 }
376
361 return page; 377 return page;
362} 378}
363 379
@@ -533,11 +549,10 @@ static unsigned long swapin_nr_pages(unsigned long offset)
533} 549}
534 550
535/** 551/**
536 * swapin_readahead - swap in pages in hope we need them soon 552 * swap_cluster_readahead - swap in pages in hope we need them soon
537 * @entry: swap entry of this memory 553 * @entry: swap entry of this memory
538 * @gfp_mask: memory allocation flags 554 * @gfp_mask: memory allocation flags
539 * @vma: user vma this address belongs to 555 * @vmf: fault information
540 * @addr: target address for mempolicy
541 * 556 *
542 * Returns the struct page for entry and addr, after queueing swapin. 557 * Returns the struct page for entry and addr, after queueing swapin.
543 * 558 *
@@ -549,10 +564,10 @@ static unsigned long swapin_nr_pages(unsigned long offset)
549 * This has been extended to use the NUMA policies from the mm triggering 564 * This has been extended to use the NUMA policies from the mm triggering
550 * the readahead. 565 * the readahead.
551 * 566 *
552 * Caller must hold down_read on the vma->vm_mm if vma is not NULL. 567 * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
553 */ 568 */
554struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 569struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
555 struct vm_area_struct *vma, unsigned long addr) 570 struct vm_fault *vmf)
556{ 571{
557 struct page *page; 572 struct page *page;
558 unsigned long entry_offset = swp_offset(entry); 573 unsigned long entry_offset = swp_offset(entry);
@@ -562,6 +577,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
562 struct swap_info_struct *si = swp_swap_info(entry); 577 struct swap_info_struct *si = swp_swap_info(entry);
563 struct blk_plug plug; 578 struct blk_plug plug;
564 bool do_poll = true, page_allocated; 579 bool do_poll = true, page_allocated;
580 struct vm_area_struct *vma = vmf->vma;
581 unsigned long addr = vmf->address;
565 582
566 mask = swapin_nr_pages(offset) - 1; 583 mask = swapin_nr_pages(offset) - 1;
567 if (!mask) 584 if (!mask)
@@ -586,8 +603,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
586 continue; 603 continue;
587 if (page_allocated) { 604 if (page_allocated) {
588 swap_readpage(page, false); 605 swap_readpage(page, false);
589 if (offset != entry_offset && 606 if (offset != entry_offset) {
590 likely(!PageTransCompound(page))) {
591 SetPageReadahead(page); 607 SetPageReadahead(page);
592 count_vm_event(SWAP_RA); 608 count_vm_event(SWAP_RA);
593 } 609 }
@@ -649,16 +665,15 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
649 PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); 665 PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
650} 666}
651 667
652struct page *swap_readahead_detect(struct vm_fault *vmf, 668static void swap_ra_info(struct vm_fault *vmf,
653 struct vma_swap_readahead *swap_ra) 669 struct vma_swap_readahead *ra_info)
654{ 670{
655 struct vm_area_struct *vma = vmf->vma; 671 struct vm_area_struct *vma = vmf->vma;
656 unsigned long swap_ra_info; 672 unsigned long ra_val;
657 struct page *page;
658 swp_entry_t entry; 673 swp_entry_t entry;
659 unsigned long faddr, pfn, fpfn; 674 unsigned long faddr, pfn, fpfn;
660 unsigned long start, end; 675 unsigned long start, end;
661 pte_t *pte; 676 pte_t *pte, *orig_pte;
662 unsigned int max_win, hits, prev_win, win, left; 677 unsigned int max_win, hits, prev_win, win, left;
663#ifndef CONFIG_64BIT 678#ifndef CONFIG_64BIT
664 pte_t *tpte; 679 pte_t *tpte;
@@ -667,30 +682,32 @@ struct page *swap_readahead_detect(struct vm_fault *vmf,
667 max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), 682 max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
668 SWAP_RA_ORDER_CEILING); 683 SWAP_RA_ORDER_CEILING);
669 if (max_win == 1) { 684 if (max_win == 1) {
670 swap_ra->win = 1; 685 ra_info->win = 1;
671 return NULL; 686 return;
672 } 687 }
673 688
674 faddr = vmf->address; 689 faddr = vmf->address;
675 entry = pte_to_swp_entry(vmf->orig_pte); 690 orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
676 if ((unlikely(non_swap_entry(entry)))) 691 entry = pte_to_swp_entry(*pte);
677 return NULL; 692 if ((unlikely(non_swap_entry(entry)))) {
678 page = lookup_swap_cache(entry, vma, faddr); 693 pte_unmap(orig_pte);
679 if (page) 694 return;
680 return page; 695 }
681 696
682 fpfn = PFN_DOWN(faddr); 697 fpfn = PFN_DOWN(faddr);
683 swap_ra_info = GET_SWAP_RA_VAL(vma); 698 ra_val = GET_SWAP_RA_VAL(vma);
684 pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); 699 pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
685 prev_win = SWAP_RA_WIN(swap_ra_info); 700 prev_win = SWAP_RA_WIN(ra_val);
686 hits = SWAP_RA_HITS(swap_ra_info); 701 hits = SWAP_RA_HITS(ra_val);
687 swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, 702 ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
688 max_win, prev_win); 703 max_win, prev_win);
689 atomic_long_set(&vma->swap_readahead_info, 704 atomic_long_set(&vma->swap_readahead_info,
690 SWAP_RA_VAL(faddr, win, 0)); 705 SWAP_RA_VAL(faddr, win, 0));
691 706
692 if (win == 1) 707 if (win == 1) {
693 return NULL; 708 pte_unmap(orig_pte);
709 return;
710 }
694 711
695 /* Copy the PTEs because the page table may be unmapped */ 712 /* Copy the PTEs because the page table may be unmapped */
696 if (fpfn == pfn + 1) 713 if (fpfn == pfn + 1)
@@ -703,23 +720,21 @@ struct page *swap_readahead_detect(struct vm_fault *vmf,
703 swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, 720 swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
704 &start, &end); 721 &start, &end);
705 } 722 }
706 swap_ra->nr_pte = end - start; 723 ra_info->nr_pte = end - start;
707 swap_ra->offset = fpfn - start; 724 ra_info->offset = fpfn - start;
708 pte = vmf->pte - swap_ra->offset; 725 pte -= ra_info->offset;
709#ifdef CONFIG_64BIT 726#ifdef CONFIG_64BIT
710 swap_ra->ptes = pte; 727 ra_info->ptes = pte;
711#else 728#else
712 tpte = swap_ra->ptes; 729 tpte = ra_info->ptes;
713 for (pfn = start; pfn != end; pfn++) 730 for (pfn = start; pfn != end; pfn++)
714 *tpte++ = *pte++; 731 *tpte++ = *pte++;
715#endif 732#endif
716 733 pte_unmap(orig_pte);
717 return NULL;
718} 734}
719 735
720struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, 736static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
721 struct vm_fault *vmf, 737 struct vm_fault *vmf)
722 struct vma_swap_readahead *swap_ra)
723{ 738{
724 struct blk_plug plug; 739 struct blk_plug plug;
725 struct vm_area_struct *vma = vmf->vma; 740 struct vm_area_struct *vma = vmf->vma;
@@ -728,12 +743,14 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
728 swp_entry_t entry; 743 swp_entry_t entry;
729 unsigned int i; 744 unsigned int i;
730 bool page_allocated; 745 bool page_allocated;
746 struct vma_swap_readahead ra_info = {0,};
731 747
732 if (swap_ra->win == 1) 748 swap_ra_info(vmf, &ra_info);
749 if (ra_info.win == 1)
733 goto skip; 750 goto skip;
734 751
735 blk_start_plug(&plug); 752 blk_start_plug(&plug);
736 for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; 753 for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
737 i++, pte++) { 754 i++, pte++) {
738 pentry = *pte; 755 pentry = *pte;
739 if (pte_none(pentry)) 756 if (pte_none(pentry))
@@ -749,8 +766,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
749 continue; 766 continue;
750 if (page_allocated) { 767 if (page_allocated) {
751 swap_readpage(page, false); 768 swap_readpage(page, false);
752 if (i != swap_ra->offset && 769 if (i != ra_info.offset) {
753 likely(!PageTransCompound(page))) {
754 SetPageReadahead(page); 770 SetPageReadahead(page);
755 count_vm_event(SWAP_RA); 771 count_vm_event(SWAP_RA);
756 } 772 }
@@ -761,23 +777,43 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
761 lru_add_drain(); 777 lru_add_drain();
762skip: 778skip:
763 return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, 779 return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
764 swap_ra->win == 1); 780 ra_info.win == 1);
781}
782
783/**
784 * swapin_readahead - swap in pages in hope we need them soon
785 * @entry: swap entry of this memory
786 * @gfp_mask: memory allocation flags
787 * @vmf: fault information
788 *
789 * Returns the struct page for entry and addr, after queueing swapin.
790 *
 791 * It's the main entry function for swap readahead. Depending on the config,
 792 * it reads ahead blocks using cluster-based (i.e. physical disk based) or
 793 * VMA-based (i.e. virtual address based on the faulting address) readahead.
794 */
795struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
796 struct vm_fault *vmf)
797{
798 return swap_use_vma_readahead() ?
799 swap_vma_readahead(entry, gfp_mask, vmf) :
800 swap_cluster_readahead(entry, gfp_mask, vmf);
765} 801}
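swapin_readahead() now takes the whole struct vm_fault rather than a vma/address pair, so either readahead flavour can pull what it needs from the fault. A hedged sketch of the shape of a caller (this is not the real do_swap_page() code; error handling and locking are omitted):

/* Kernel-context sketch of a caller; simplified, not the real fault path. */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>

static struct page *demo_swap_in(struct vm_fault *vmf)
{
	swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
	struct page *page;

	/* Try the swap cache first ... */
	page = lookup_swap_cache(entry, vmf->vma, vmf->address);
	if (page)
		return page;

	/* ... then let swapin_readahead() pick cluster- or VMA-based readahead. */
	return swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf);
}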
766 802
767#ifdef CONFIG_SYSFS 803#ifdef CONFIG_SYSFS
768static ssize_t vma_ra_enabled_show(struct kobject *kobj, 804static ssize_t vma_ra_enabled_show(struct kobject *kobj,
769 struct kobj_attribute *attr, char *buf) 805 struct kobj_attribute *attr, char *buf)
770{ 806{
771 return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); 807 return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
772} 808}
773static ssize_t vma_ra_enabled_store(struct kobject *kobj, 809static ssize_t vma_ra_enabled_store(struct kobject *kobj,
774 struct kobj_attribute *attr, 810 struct kobj_attribute *attr,
775 const char *buf, size_t count) 811 const char *buf, size_t count)
776{ 812{
777 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) 813 if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
778 swap_vma_readahead = true; 814 enable_vma_readahead = true;
779 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) 815 else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
780 swap_vma_readahead = false; 816 enable_vma_readahead = false;
781 else 817 else
782 return -EINVAL; 818 return -EINVAL;
783 819
diff --git a/mm/util.c b/mm/util.c
index c1250501364f..029fc2f3b395 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -515,6 +515,16 @@ struct address_space *page_mapping(struct page *page)
515} 515}
516EXPORT_SYMBOL(page_mapping); 516EXPORT_SYMBOL(page_mapping);
517 517
518/*
519 * For file cache pages, return the address_space, otherwise return NULL
520 */
521struct address_space *page_mapping_file(struct page *page)
522{
523 if (unlikely(PageSwapCache(page)))
524 return NULL;
525 return page_mapping(page);
526}
527
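page_mapping_file() lets callers that only care about file-backed pages skip swap-cache pages, whose page_mapping() would be the swap address_space. A hedged sketch of the kind of cache-maintenance check this helper is aimed at (the function below is made up):

/* Kernel-context sketch: only act on pages that map a file, ignoring the
 * swap cache.  "demo_flush_needed" is an illustrative helper. */
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/fs.h>

static bool demo_flush_needed(struct page *page)
{
	struct address_space *mapping = page_mapping_file(page);

	/* NULL for anonymous and swap-cache pages: nothing file-backed to sync. */
	if (!mapping)
		return false;

	/* Only bother if somebody has the file mapped into user space. */
	return mapping_mapped(mapping);
}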
518/* Slow path of page_mapcount() for compound pages */ 528/* Slow path of page_mapcount() for compound pages */
519int __page_mapcount(struct page *page) 529int __page_mapcount(struct page *page)
520{ 530{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index cd5dc3faaa57..4390a8d5be41 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -442,16 +442,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
442 if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) 442 if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
443 return 0; 443 return 0;
444 444
445 if (!down_read_trylock(&shrinker_rwsem)) { 445 if (!down_read_trylock(&shrinker_rwsem))
446 /*
447 * If we would return 0, our callers would understand that we
448 * have nothing else to shrink and give up trying. By returning
449 * 1 we keep it going and assume we'll be able to shrink next
450 * time.
451 */
452 freed = 1;
453 goto out; 446 goto out;
454 }
455 447
456 list_for_each_entry(shrinker, &shrinker_list, list) { 448 list_for_each_entry(shrinker, &shrinker_list, list) {
457 struct shrink_control sc = { 449 struct shrink_control sc = {
@@ -3547,16 +3539,21 @@ kswapd_try_sleep:
3547} 3539}
3548 3540
3549/* 3541/*
3550 * A zone is low on free memory, so wake its kswapd task to service it. 3542 * A zone is low on free memory or too fragmented for high-order memory. If
3543 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
3544 * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
3545 * has failed or is not needed, still wake up kcompactd if only compaction is
3546 * needed.
3551 */ 3547 */
3552void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) 3548void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3549 enum zone_type classzone_idx)
3553{ 3550{
3554 pg_data_t *pgdat; 3551 pg_data_t *pgdat;
3555 3552
3556 if (!managed_zone(zone)) 3553 if (!managed_zone(zone))
3557 return; 3554 return;
3558 3555
3559 if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) 3556 if (!cpuset_zone_allowed(zone, gfp_flags))
3560 return; 3557 return;
3561 pgdat = zone->zone_pgdat; 3558 pgdat = zone->zone_pgdat;
3562 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, 3559 pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
@@ -3565,14 +3562,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3565 if (!waitqueue_active(&pgdat->kswapd_wait)) 3562 if (!waitqueue_active(&pgdat->kswapd_wait))
3566 return; 3563 return;
3567 3564
3568 /* Hopeless node, leave it to direct reclaim */ 3565 /* Hopeless node, leave it to direct reclaim if possible */
3569 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) 3566 if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3570 return; 3567 pgdat_balanced(pgdat, order, classzone_idx)) {
3571 3568 /*
3572 if (pgdat_balanced(pgdat, order, classzone_idx)) 3569 * There may be plenty of free memory available, but it's too
3570 * fragmented for high-order allocations. Wake up kcompactd
3571 * and rely on compaction_suitable() to determine if it's
3572 * needed. If it fails, it will defer subsequent attempts to
3573 * ratelimit its work.
3574 */
3575 if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3576 wakeup_kcompactd(pgdat, order, classzone_idx);
3573 return; 3577 return;
3578 }
3574 3579
3575 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); 3580 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
3581 gfp_flags);
3576 wake_up_interruptible(&pgdat->kswapd_wait); 3582 wake_up_interruptible(&pgdat->kswapd_wait);
3577} 3583}
3578 3584
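Taken together, the new wakeup_kswapd() answers two questions: should kswapd reclaim at all, and if not, is kicking kcompactd still worthwhile for a caller that cannot reclaim directly? A distilled in-file sketch of that decision flow (it leans on the same mm/vmscan.c helpers as the hunk above and omits the early checks and tracing; not a drop-in replacement):

/* In-file sketch for mm/vmscan.c: distilled decision flow of wakeup_kswapd(). */
static void wakeup_kswapd_sketch(struct zone *zone, gfp_t gfp_flags, int order,
				 enum zone_type classzone_idx)
{
	pg_data_t *pgdat = zone->zone_pgdat;

	if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
	    pgdat_balanced(pgdat, order, classzone_idx)) {
		/* Reclaim is hopeless or not needed; memory may still be too
		 * fragmented, so wake kcompactd on behalf of callers that
		 * cannot enter direct reclaim themselves. */
		if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
			wakeup_kcompactd(pgdat, order, classzone_idx);
		return;
	}

	wake_up_interruptible(&pgdat->kswapd_wait);	/* normal kswapd wakeup */
}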
@@ -3877,7 +3883,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
3877 */ 3883 */
3878int page_evictable(struct page *page) 3884int page_evictable(struct page *page)
3879{ 3885{
3880 return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); 3886 int ret;
3887
3888 /* Prevent address_space of inode and swap cache from being freed */
3889 rcu_read_lock();
3890 ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
3891 rcu_read_unlock();
3892 return ret;
3881} 3893}
3882 3894
3883#ifdef CONFIG_SHMEM 3895#ifdef CONFIG_SHMEM
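page_evictable() can race with the inode or swap cache tearing down the page's address_space, so the mapping is now inspected under rcu_read_lock(), which keeps the structure alive for the duration of the check. The same pin-inspect-unpin idiom in a hedged kernel-context sketch:

/* Kernel-context sketch of the pin-inspect-unpin pattern used above; the
 * predicate mirrors page_evictable() purely for illustration. */
#include <linux/rcupdate.h>
#include <linux/pagemap.h>
#include <linux/mm.h>

static bool demo_page_is_unevictable(struct page *page)
{
	bool ret;

	rcu_read_lock();	/* pin: the address_space may otherwise be freed */
	ret = mapping_unevictable(page_mapping(page)) || PageMlocked(page);
	rcu_read_unlock();	/* unpin */

	return ret;
}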
diff --git a/mm/z3fold.c b/mm/z3fold.c
index d589d318727f..f579ad4a8100 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -620,24 +620,27 @@ lookup:
620 bud = FIRST; 620 bud = FIRST;
621 } 621 }
622 622
623 spin_lock(&pool->stale_lock); 623 page = NULL;
624 zhdr = list_first_entry_or_null(&pool->stale, 624 if (can_sleep) {
625 struct z3fold_header, buddy); 625 spin_lock(&pool->stale_lock);
626 /* 626 zhdr = list_first_entry_or_null(&pool->stale,
627 * Before allocating a page, let's see if we can take one from the 627 struct z3fold_header, buddy);
628 * stale pages list. cancel_work_sync() can sleep so we must make 628 /*
629 * sure it won't be called in case we're in atomic context. 629 * Before allocating a page, let's see if we can take one from
630 */ 630 * the stale pages list. cancel_work_sync() can sleep so we
631 if (zhdr && (can_sleep || !work_pending(&zhdr->work))) { 631 * limit this case to the contexts where we can sleep
632 list_del(&zhdr->buddy); 632 */
633 spin_unlock(&pool->stale_lock); 633 if (zhdr) {
634 if (can_sleep) 634 list_del(&zhdr->buddy);
635 spin_unlock(&pool->stale_lock);
635 cancel_work_sync(&zhdr->work); 636 cancel_work_sync(&zhdr->work);
636 page = virt_to_page(zhdr); 637 page = virt_to_page(zhdr);
637 } else { 638 } else {
638 spin_unlock(&pool->stale_lock); 639 spin_unlock(&pool->stale_lock);
639 page = alloc_page(gfp); 640 }
640 } 641 }
642 if (!page)
643 page = alloc_page(gfp);
641 644
642 if (!page) 645 if (!page)
643 return -ENOMEM; 646 return -ENOMEM;
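The reworked z3fold allocation path only consults the stale list when the caller may sleep, because recycling a stale page can involve cancel_work_sync(); otherwise it falls straight through to alloc_page(). A hedged sketch of that gating, assuming can_sleep is derived from gfpflags_allow_blocking() (the derivation is not shown in this hunk):

/* Kernel-context sketch: only take the code path that may sleep when the gfp
 * mask allows blocking. */
#include <linux/gfp.h>

static struct page *demo_get_page(gfp_t gfp)
{
	struct page *page = NULL;

	if (gfpflags_allow_blocking(gfp)) {
		/* Safe to call sleeping primitives such as cancel_work_sync()
		 * here, e.g. to recycle a stale page before allocating. */
	}

	if (!page)			/* common fallback for both cases */
		page = alloc_page(gfp);

	return page;
}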
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b7f61cd1c709..61cb05dc950c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -193,6 +193,7 @@ static struct vfsmount *zsmalloc_mnt;
193 * (see: fix_fullness_group()) 193 * (see: fix_fullness_group())
194 */ 194 */
195static const int fullness_threshold_frac = 4; 195static const int fullness_threshold_frac = 4;
196static size_t huge_class_size;
196 197
197struct size_class { 198struct size_class {
198 spinlock_t lock; 199 spinlock_t lock;
@@ -642,18 +643,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
642 643
643 return 0; 644 return 0;
644} 645}
645 646DEFINE_SHOW_ATTRIBUTE(zs_stats_size);
646static int zs_stats_size_open(struct inode *inode, struct file *file)
647{
648 return single_open(file, zs_stats_size_show, inode->i_private);
649}
650
651static const struct file_operations zs_stat_size_ops = {
652 .open = zs_stats_size_open,
653 .read = seq_read,
654 .llseek = seq_lseek,
655 .release = single_release,
656};
657 647
658static void zs_pool_stat_create(struct zs_pool *pool, const char *name) 648static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
659{ 649{
@@ -672,7 +662,7 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
672 pool->stat_dentry = entry; 662 pool->stat_dentry = entry;
673 663
674 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, 664 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
675 pool->stat_dentry, pool, &zs_stat_size_ops); 665 pool->stat_dentry, pool, &zs_stats_size_fops);
676 if (!entry) { 666 if (!entry) {
677 pr_warn("%s: debugfs file entry <%s> creation failed\n", 667 pr_warn("%s: debugfs file entry <%s> creation failed\n",
678 name, "classes"); 668 name, "classes");
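DEFINE_SHOW_ATTRIBUTE(zs_stats_size) generates the open handler and file_operations that were previously written out by hand; the generated names end in _open and _fops, which is why debugfs_create_file() now passes &zs_stats_size_fops. Roughly, the macro expands to the equivalent of the deleted boilerplate (see include/linux/seq_file.h for the authoritative definition):

/* Approximate expansion of DEFINE_SHOW_ATTRIBUTE(zs_stats_size); compare with
 * the boilerplate removed in the hunk above. */
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/fs.h>

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stats_size_fops = {
	.owner		= THIS_MODULE,
	.open		= zs_stats_size_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};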
@@ -861,6 +851,7 @@ static struct page *get_next_page(struct page *page)
861 851
862/** 852/**
863 * obj_to_location - get (<page>, <obj_idx>) from encoded object value 853 * obj_to_location - get (<page>, <obj_idx>) from encoded object value
854 * @obj: the encoded object value
864 * @page: page object resides in zspage 855 * @page: page object resides in zspage
865 * @obj_idx: object index 856 * @obj_idx: object index
866 */ 857 */
@@ -1311,6 +1302,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
1311 * zs_map_object - get address of allocated object from handle. 1302 * zs_map_object - get address of allocated object from handle.
1312 * @pool: pool from which the object was allocated 1303 * @pool: pool from which the object was allocated
1313 * @handle: handle returned from zs_malloc 1304 * @handle: handle returned from zs_malloc
 1305 * @mm: mapping mode to use
1314 * 1306 *
1315 * Before using an object allocated from zs_malloc, it must be mapped using 1307 * Before using an object allocated from zs_malloc, it must be mapped using
1316 * this function. When done with the object, it must be unmapped using 1308 * this function. When done with the object, it must be unmapped using
@@ -1418,6 +1410,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1418} 1410}
1419EXPORT_SYMBOL_GPL(zs_unmap_object); 1411EXPORT_SYMBOL_GPL(zs_unmap_object);
1420 1412
1413/**
1414 * zs_huge_class_size() - Returns the size (in bytes) of the first huge
1415 * zsmalloc &size_class.
1416 * @pool: zsmalloc pool to use
1417 *
1418 * The function returns the size of the first huge class - any object of equal
 1419 * or bigger size will be stored in a zspage consisting of a single physical
1420 * page.
1421 *
1422 * Context: Any context.
1423 *
1424 * Return: the size (in bytes) of the first huge zsmalloc &size_class.
1425 */
1426size_t zs_huge_class_size(struct zs_pool *pool)
1427{
1428 return huge_class_size;
1429}
1430EXPORT_SYMBOL_GPL(zs_huge_class_size);
1431
1421static unsigned long obj_malloc(struct size_class *class, 1432static unsigned long obj_malloc(struct size_class *class,
1422 struct zspage *zspage, unsigned long handle) 1433 struct zspage *zspage, unsigned long handle)
1423{ 1434{
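With zs_huge_class_size() exported, a zsmalloc user such as zram can ask the pool where the huge classes begin instead of hard-coding a max_zpage_size threshold. A hedged kernel-context usage sketch (what the caller does with the value is up to it; error handling trimmed):

/* Kernel-context sketch: query the first huge class size after creating a
 * pool and use it as a "don't bother compressing further" threshold. */
#include <linux/zsmalloc.h>
#include <linux/kernel.h>

static size_t demo_huge_threshold(void)
{
	struct zs_pool *pool;
	size_t huge;

	pool = zs_create_pool("demo");
	if (!pool)
		return 0;

	/* Objects of this size or larger occupy a whole page each. */
	huge = zs_huge_class_size(pool);
	pr_info("zsmalloc huge class starts at %zu bytes\n", huge);

	zs_destroy_pool(pool);
	return huge;
}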
@@ -2375,6 +2386,27 @@ struct zs_pool *zs_create_pool(const char *name)
2375 objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; 2386 objs_per_zspage = pages_per_zspage * PAGE_SIZE / size;
2376 2387
2377 /* 2388 /*
2389 * We iterate from biggest down to smallest classes,
2390 * so huge_class_size holds the size of the first huge
2391 * class. Any object bigger than or equal to that will
 2392 * end up in the huge class.
2393 */
2394 if (pages_per_zspage != 1 && objs_per_zspage != 1 &&
2395 !huge_class_size) {
2396 huge_class_size = size;
2397 /*
2398 * The object uses ZS_HANDLE_SIZE bytes to store the
2399 * handle. We need to subtract it, because zs_malloc()
2400 * unconditionally adds handle size before it performs
2401 * size class search - so object may be smaller than
2402 * huge class size, yet it still can end up in the huge
2403 * class because it grows by ZS_HANDLE_SIZE extra bytes
2404 * right before class lookup.
2405 */
2406 huge_class_size -= (ZS_HANDLE_SIZE - 1);
2407 }
2408
2409 /*
2378 * size_class is used for normal zsmalloc operation such 2410 * size_class is used for normal zsmalloc operation such
2379 * as alloc/free for that size. Although it is natural that we 2411 * as alloc/free for that size. Although it is natural that we
2380 * have one size_class for each size, there is a chance that we 2412 * have one size_class for each size, there is a chance that we
diff --git a/net/9p/client.c b/net/9p/client.c
index b433aff5ff13..21e6df1cc70f 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -190,7 +190,9 @@ static int parse_opts(char *opts, struct p9_client *clnt)
190 p9_debug(P9_DEBUG_ERROR, 190 p9_debug(P9_DEBUG_ERROR,
191 "problem allocating copy of trans arg\n"); 191 "problem allocating copy of trans arg\n");
192 goto free_and_return; 192 goto free_and_return;
193 } 193 }
194
195 v9fs_put_trans(clnt->trans_mod);
194 clnt->trans_mod = v9fs_get_trans_by_name(s); 196 clnt->trans_mod = v9fs_get_trans_by_name(s);
195 if (clnt->trans_mod == NULL) { 197 if (clnt->trans_mod == NULL) {
196 pr_info("Could not find request transport: %s\n", 198 pr_info("Could not find request transport: %s\n",
@@ -226,6 +228,7 @@ static int parse_opts(char *opts, struct p9_client *clnt)
226 } 228 }
227 229
228free_and_return: 230free_and_return:
231 v9fs_put_trans(clnt->trans_mod);
229 kfree(tmp_options); 232 kfree(tmp_options);
230 return ret; 233 return ret;
231} 234}
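The parse_opts() hunks pair v9fs_get_trans_by_name() with v9fs_put_trans() so that overwriting clnt->trans_mod no longer leaks a reference to the previously selected transport module. A simplified kernel-context sketch of that replace-and-release discipline (not the full option parser):

/* Kernel-context sketch of the get/put balancing for clnt->trans_mod. */
#include <net/9p/client.h>
#include <net/9p/transport.h>

static int demo_set_transport(struct p9_client *clnt, char *name)
{
	/* Drop whatever transport we were holding before overwriting it ... */
	v9fs_put_trans(clnt->trans_mod);

	/* ... and take a reference on the newly requested one. */
	clnt->trans_mod = v9fs_get_trans_by_name(name);
	if (!clnt->trans_mod)
		return -EINVAL;

	return 0;
}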
@@ -769,7 +772,7 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
769 if (err < 0) { 772 if (err < 0) {
770 if (err != -ERESTARTSYS && err != -EFAULT) 773 if (err != -ERESTARTSYS && err != -EFAULT)
771 c->status = Disconnected; 774 c->status = Disconnected;
772 goto reterr; 775 goto recalc_sigpending;
773 } 776 }
774again: 777again:
775 /* Wait for the response */ 778 /* Wait for the response */
@@ -804,6 +807,7 @@ again:
804 if (req->status == REQ_STATUS_RCVD) 807 if (req->status == REQ_STATUS_RCVD)
805 err = 0; 808 err = 0;
806 } 809 }
810recalc_sigpending:
807 if (sigpending) { 811 if (sigpending) {
808 spin_lock_irqsave(&current->sighand->siglock, flags); 812 spin_lock_irqsave(&current->sighand->siglock, flags);
809 recalc_sigpending(); 813 recalc_sigpending();
@@ -867,7 +871,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
867 if (err == -EIO) 871 if (err == -EIO)
868 c->status = Disconnected; 872 c->status = Disconnected;
869 if (err != -ERESTARTSYS) 873 if (err != -ERESTARTSYS)
870 goto reterr; 874 goto recalc_sigpending;
871 } 875 }
872 if (req->status == REQ_STATUS_ERROR) { 876 if (req->status == REQ_STATUS_ERROR) {
873 p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err); 877 p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
@@ -885,6 +889,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
885 if (req->status == REQ_STATUS_RCVD) 889 if (req->status == REQ_STATUS_RCVD)
886 err = 0; 890 err = 0;
887 } 891 }
892recalc_sigpending:
888 if (sigpending) { 893 if (sigpending) {
889 spin_lock_irqsave(&current->sighand->siglock, flags); 894 spin_lock_irqsave(&current->sighand->siglock, flags);
890 recalc_sigpending(); 895 recalc_sigpending();
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b3b609f0eeb5..b1a2c5e38530 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -15,7 +15,6 @@
15#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/kmemleak.h>
19 18
20#include <net/ip.h> 19#include <net/ip.h>
21#include <net/sock.h> 20#include <net/sock.h>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8322e479f299..594a1c605c92 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -108,7 +108,6 @@
108#include <net/rtnetlink.h> 108#include <net/rtnetlink.h>
109#ifdef CONFIG_SYSCTL 109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h> 110#include <linux/sysctl.h>
111#include <linux/kmemleak.h>
112#endif 111#endif
113#include <net/secure_seq.h> 112#include <net/secure_seq.h>
114#include <net/ip_tunnels.h> 113#include <net/ip_tunnels.h>
diff --git a/scripts/faddr2line b/scripts/faddr2line
index 7721d5b2b0c0..9e5735a4d3a5 100755
--- a/scripts/faddr2line
+++ b/scripts/faddr2line
@@ -163,7 +163,17 @@ __faddr2line() {
163 163
164 # pass real address to addr2line 164 # pass real address to addr2line
165 echo "$func+$offset/$sym_size:" 165 echo "$func+$offset/$sym_size:"
166 ${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;" 166 local file_lines=$(${ADDR2LINE} -fpie $objfile $addr | sed "s; $dir_prefix\(\./\)*; ;")
167 [[ -z $file_lines ]] && return
168
169 # show each line with context
170 echo "$file_lines" | while read -r line
171 do
172 echo $line
173 eval $(echo $line | awk -F "[ :]" '{printf("n1=%d;n2=%d;f=%s",$NF-5, $NF+5, $(NF-1))}')
174 awk 'NR>=strtonum("'$n1'") && NR<=strtonum("'$n2'") {printf("%d\t%s\n", NR, $0)}' $f
175 done
176
167 DONE=1 177 DONE=1
168 178
169 done < <(${NM} -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }') 179 done < <(${NM} -n $objfile | awk -v fn=$func -v end=$file_end '$3 == fn { found=1; line=$0; start=$1; next } found == 1 { found=0; print line, "0x"$1 } END {if (found == 1) print line, end; }')
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index 9a65eeaf7dfa..6134302c143c 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -23,7 +23,6 @@
23#include <linux/sysctl.h> 23#include <linux/sysctl.h>
24#include <linux/audit.h> 24#include <linux/audit.h>
25#include <linux/user_namespace.h> 25#include <linux/user_namespace.h>
26#include <linux/kmemleak.h>
27#include <net/sock.h> 26#include <net/sock.h>
28 27
29#include "include/apparmor.h" 28#include "include/apparmor.h"
diff --git a/security/keys/big_key.c b/security/keys/big_key.c
index fa728f662a6f..933623784ccd 100644
--- a/security/keys/big_key.c
+++ b/security/keys/big_key.c
@@ -18,6 +18,7 @@
18#include <linux/err.h> 18#include <linux/err.h>
19#include <linux/scatterlist.h> 19#include <linux/scatterlist.h>
20#include <linux/random.h> 20#include <linux/random.h>
21#include <linux/vmalloc.h>
21#include <keys/user-type.h> 22#include <keys/user-type.h>
22#include <keys/big_key-type.h> 23#include <keys/big_key-type.h>
23#include <crypto/aead.h> 24#include <crypto/aead.h>