author     Linus Torvalds <torvalds@linux-foundation.org>   2016-12-12 23:50:02 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-12-12 23:50:02 -0500
commit     e34bac726d27056081d0250c0e173e4b155aa340 (patch)
tree       85607d0b3b185380fb3267866020c6a4372b9298
parent     fe6bce8d30a86c693bf7cfbf4759cbafd121289f (diff)
parent     39a0e975c37dee93fa1b8ea5f7eacd1c4c8a586e (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - various misc bits

 - most of MM (quite a lot of MM material is awaiting the merge of
   linux-next dependencies)

 - kasan

 - printk updates

 - procfs updates

 - MAINTAINERS

 - /lib updates

 - checkpatch updates

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (123 commits)
  init: reduce rootwait polling interval time to 5ms
  binfmt_elf: use vmalloc() for allocation of vma_filesz
  checkpatch: don't emit unified-diff error for rename-only patches
  checkpatch: don't check c99 types like uint8_t under tools
  checkpatch: avoid multiple line dereferences
  checkpatch: don't check .pl files, improve absolute path commit log test
  scripts/checkpatch.pl: fix spelling
  checkpatch: don't try to get maintained status when --no-tree is given
  lib/ida: document locking requirements a bit better
  lib/rbtree.c: fix typo in comment of ____rb_erase_color
  lib/Kconfig.debug: make CONFIG_STRICT_DEVMEM depend on CONFIG_DEVMEM
  MAINTAINERS: add drm and drm/i915 irc channels
  MAINTAINERS: add "C:" for URI for chat where developers hang out
  MAINTAINERS: add drm and drm/i915 bug filing info
  MAINTAINERS: add "B:" for URI where to file bugs
  get_maintainer: look for arbitrary letter prefixes in sections
  printk: add Kconfig option to set default console loglevel
  printk/sound: handle more message headers
  printk/btrfs: handle more message headers
  printk/kdb: handle more message headers
  ...
-rw-r--r--  Documentation/devicetree/booting-without-of.txt | 7
-rw-r--r--  Documentation/filesystems/proc.txt | 2
-rw-r--r--  Documentation/kernel-parameters.txt | 2
-rw-r--r--  Documentation/vm/transhuge.txt | 5
-rw-r--r--  MAINTAINERS | 8
-rw-r--r--  arch/arm/include/asm/tlb.h | 21
-rw-r--r--  arch/ia64/include/asm/tlb.h | 25
-rw-r--r--  arch/m32r/Kconfig | 2
-rw-r--r--  arch/m32r/include/asm/device.h | 6
-rw-r--r--  arch/m32r/include/asm/dma-mapping.h | 32
-rw-r--r--  arch/m32r/platforms/m32700ut/setup.c | 2
-rw-r--r--  arch/powerpc/include/asm/book3s/64/pgtable.h | 13
-rw-r--r--  arch/powerpc/include/asm/tlb.h | 16
-rw-r--r--  arch/powerpc/mm/numa.c | 13
-rw-r--r--  arch/s390/include/asm/tlb.h | 14
-rw-r--r--  arch/s390/mm/gmap.c | 2
-rw-r--r--  arch/sh/include/asm/tlb.h | 15
-rw-r--r--  arch/um/include/asm/tlb.h | 15
-rw-r--r--  arch/x86/kernel/ldt.c | 2
-rw-r--r--  arch/x86/kernel/setup.c | 24
-rw-r--r--  block/blk-settings.c | 1
-rw-r--r--  block/blk-sysfs.c | 1
-rw-r--r--  drivers/of/fdt.c | 19
-rw-r--r--  drivers/pcmcia/m32r_pcc.c | 41
-rw-r--r--  drivers/sh/intc/virq.c | 2
-rw-r--r--  fs/binfmt_elf.c | 6
-rw-r--r--  fs/btrfs/super.c | 26
-rw-r--r--  fs/dax.c | 10
-rw-r--r--  fs/fs-writeback.c | 16
-rw-r--r--  fs/ocfs2/aops.c | 7
-rw-r--r--  fs/ocfs2/aops.h | 3
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 2
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 11
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 2
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/journal.c | 4
-rw-r--r--  fs/ocfs2/mmap.c | 3
-rw-r--r--  fs/ocfs2/namei.c | 6
-rw-r--r--  fs/ocfs2/ocfs2.h | 2
-rw-r--r--  fs/ocfs2/refcounttree.c | 1
-rw-r--r--  fs/ocfs2/super.c | 2
-rw-r--r--  fs/proc/array.c | 7
-rw-r--r--  fs/proc/base.c | 31
-rw-r--r--  fs/proc/inode.c | 37
-rw-r--r--  fs/proc/internal.h | 3
-rw-r--r--  fs/proc/root.c | 1
-rw-r--r--  fs/proc/task_mmu.c | 1
-rw-r--r--  include/asm-generic/pgtable.h | 13
-rw-r--r--  include/asm-generic/tlb.h | 83
-rw-r--r--  include/linux/backing-dev-defs.h | 3
-rw-r--r--  include/linux/cma.h | 3
-rw-r--r--  include/linux/compiler-gcc.h | 2
-rw-r--r--  include/linux/huge_mm.h | 2
-rw-r--r--  include/linux/kthread.h | 2
-rw-r--r--  include/linux/mempolicy.h | 8
-rw-r--r--  include/linux/of_fdt.h | 1
-rw-r--r--  include/linux/printk.h | 17
-rw-r--r--  include/linux/radix-tree.h | 34
-rw-r--r--  include/linux/rmap.h | 10
-rw-r--r--  include/linux/sched.h | 6
-rw-r--r--  include/linux/swap.h | 34
-rw-r--r--  include/linux/vmalloc.h | 1
-rw-r--r--  init/do_mounts.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/hung_task.c | 3
-rw-r--r--  kernel/kthread.c | 5
-rw-r--r--  kernel/printk/nmi.c | 83
-rw-r--r--  kernel/sys.c | 10
-rw-r--r--  lib/Kconfig.debug | 21
-rw-r--r--  lib/idr.c | 11
-rw-r--r--  lib/radix-tree.c | 297
-rw-r--r--  lib/rbtree.c | 23
-rw-r--r--  mm/Kconfig | 8
-rw-r--r--  mm/compaction.c | 25
-rw-r--r--  mm/debug.c | 4
-rw-r--r--  mm/filemap.c | 68
-rw-r--r--  mm/gup.c | 19
-rw-r--r--  mm/huge_memory.c | 53
-rw-r--r--  mm/hugetlb.c | 25
-rw-r--r--  mm/kasan/quarantine.c | 94
-rw-r--r--  mm/kasan/report.c | 2
-rw-r--r--  mm/khugepaged.c | 37
-rw-r--r--  mm/kmemleak.c | 2
-rw-r--r--  mm/madvise.c | 1
-rw-r--r--  mm/memcontrol.c | 15
-rw-r--r--  mm/memory.c | 92
-rw-r--r--  mm/memory_hotplug.c | 20
-rw-r--r--  mm/mempolicy.c | 30
-rw-r--r--  mm/migrate.c | 19
-rw-r--r--  mm/mprotect.c | 19
-rw-r--r--  mm/page_alloc.c | 75
-rw-r--r--  mm/percpu.c | 16
-rw-r--r--  mm/readahead.c | 39
-rw-r--r--  mm/rmap.c | 69
-rw-r--r--  mm/shmem.c | 15
-rw-r--r--  mm/slab.c | 129
-rw-r--r--  mm/slab.h | 20
-rw-r--r--  mm/slab_common.c | 33
-rw-r--r--  mm/slob.c | 2
-rw-r--r--  mm/slub.c | 21
-rw-r--r--  mm/swapfile.c | 13
-rw-r--r--  mm/truncate.c | 21
-rw-r--r--  mm/vmalloc.c | 196
-rw-r--r--  mm/vmscan.c | 14
-rw-r--r--  mm/workingset.c | 114
-rw-r--r--  scripts/Makefile.kasan | 2
-rwxr-xr-x  scripts/bloat-o-meter | 25
-rwxr-xr-x  scripts/checkpatch.pl | 50
-rwxr-xr-x  scripts/get_maintainer.pl | 12
-rwxr-xr-x  scripts/tags.sh | 19
-rw-r--r--  sound/core/misc.c | 20
-rw-r--r--  tools/testing/radix-tree/multiorder.c | 2
113 files changed, 1550 insertions, 1041 deletions
diff --git a/Documentation/devicetree/booting-without-of.txt b/Documentation/devicetree/booting-without-of.txt
index 3f1437fbca6b..280d283304bb 100644
--- a/Documentation/devicetree/booting-without-of.txt
+++ b/Documentation/devicetree/booting-without-of.txt
@@ -974,6 +974,13 @@ compatibility.
974 4Gb. Some vendors prefer splitting those ranges into smaller 974 4Gb. Some vendors prefer splitting those ranges into smaller
975 segments, but the kernel doesn't care. 975 segments, but the kernel doesn't care.
976 976
977 Additional properties:
978
979 - hotpluggable : The presence of this property provides an explicit
980 hint to the operating system that this memory may potentially be
981 removed later. The kernel can take this into consideration when
982 doing nonmovable allocations and when laying out memory zones.
983
977 e) The /chosen node 984 e) The /chosen node
978 985
979 This node is a bit "special". Normally, that's where Open Firmware 986 This node is a bit "special". Normally, that's where Open Firmware
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 74329fd0add2..c03f2f91c6ab 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -191,6 +191,7 @@ read the file /proc/PID/status:
191 CapPrm: 0000000000000000 191 CapPrm: 0000000000000000
192 CapEff: 0000000000000000 192 CapEff: 0000000000000000
193 CapBnd: ffffffffffffffff 193 CapBnd: ffffffffffffffff
194 NoNewPrivs: 0
194 Seccomp: 0 195 Seccomp: 0
195 voluntary_ctxt_switches: 0 196 voluntary_ctxt_switches: 0
196 nonvoluntary_ctxt_switches: 1 197 nonvoluntary_ctxt_switches: 1
@@ -262,6 +263,7 @@ Table 1-2: Contents of the status files (as of 4.1)
262 CapPrm bitmap of permitted capabilities 263 CapPrm bitmap of permitted capabilities
263 CapEff bitmap of effective capabilities 264 CapEff bitmap of effective capabilities
264 CapBnd bitmap of capabilities bounding set 265 CapBnd bitmap of capabilities bounding set
266 NoNewPrivs no_new_privs, like prctl(PR_GET_NO_NEW_PRIV, ...)
265 Seccomp seccomp mode, like prctl(PR_GET_SECCOMP, ...) 267 Seccomp seccomp mode, like prctl(PR_GET_SECCOMP, ...)
266 Cpus_allowed mask of CPUs on which this process may run 268 Cpus_allowed mask of CPUs on which this process may run
267 Cpus_allowed_list Same as previous, but in "list format" 269 Cpus_allowed_list Same as previous, but in "list format"
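
As an aside on the new NoNewPrivs field: it reports the task's no_new_privs bit, which user space can also query directly via prctl(). A minimal user-space sketch, not part of this patch (the PR_GET_NO_NEW_PRIVS fallback value below comes from include/uapi/linux/prctl.h), that reads the same flag the status file now shows:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_GET_NO_NEW_PRIVS
#define PR_GET_NO_NEW_PRIVS 39	/* from include/uapi/linux/prctl.h */
#endif

int main(void)
{
	/* Returns 0 or 1, matching the NoNewPrivs line in /proc/self/status. */
	int nnp = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);

	if (nnp < 0) {
		perror("prctl(PR_GET_NO_NEW_PRIVS)");
		return 1;
	}
	printf("NoNewPrivs: %d\n", nnp);
	return 0;
}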
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index c5f1546a440f..6c6141c76eaa 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2397,7 +2397,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
2397 that the amount of memory usable for all allocations 2397 that the amount of memory usable for all allocations
2398 is not too small. 2398 is not too small.
2399 2399
2400 movable_node [KNL,X86] Boot-time switch to enable the effects 2400 movable_node [KNL] Boot-time switch to enable the effects
2401 of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. 2401 of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
2402 2402
2403 MTD_Partition= [MTD] 2403 MTD_Partition= [MTD]
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 2ec6adb5a4ce..c4171e4519c2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -136,6 +136,11 @@ or enable it back by writing 1:
136echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page 136echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
137echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page 137echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
138 138
139Some userspace (such as a test program, or an optimized memory allocation
140library) may want to know the size (in bytes) of a transparent hugepage:
141
142cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
143
139khugepaged will be automatically started when 144khugepaged will be automatically started when
140transparent_hugepage/enabled is set to "always" or "madvise, and it'll 145transparent_hugepage/enabled is set to "always" or "madvise, and it'll
141be automatically shutdown if it's set to "never". 146be automatically shutdown if it's set to "never".
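
A possible consumer of the new hpage_pmd_size file, sketched here for illustration only (not part of this patch): a user-space allocator that wants to size or align its arenas to the THP PMD size can read the value with a few lines of C.

#include <stdio.h>

/*
 * Read the transparent hugepage PMD size (in bytes) from the sysfs file
 * documented above; returns -1 if THP is not available on this kernel.
 */
static long read_hpage_pmd_size(void)
{
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");
	long size = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%ld", &size) != 1)
		size = -1;
	fclose(f);
	return size;
}

int main(void)
{
	long size = read_hpage_pmd_size();

	if (size < 0) {
		fprintf(stderr, "transparent hugepages not available\n");
		return 1;
	}
	printf("THP PMD size: %ld bytes\n", size);
	return 0;
}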
diff --git a/MAINTAINERS b/MAINTAINERS
index 4e62a0e67df9..88315cfcfb39 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -74,6 +74,10 @@ Descriptions of section entries:
74 These reviewers should be CCed on patches. 74 These reviewers should be CCed on patches.
75 L: Mailing list that is relevant to this area 75 L: Mailing list that is relevant to this area
76 W: Web-page with status/info 76 W: Web-page with status/info
77 B: URI for where to file bugs. A web-page with detailed bug
78 filing info, a direct bug tracker link, or a mailto: URI.
79 C: URI for chat protocol, server and channel where developers
80 usually hang out, for example irc://server/channel.
77 Q: Patchwork web based patch tracking system site 81 Q: Patchwork web based patch tracking system site
78 T: SCM tree type and location. 82 T: SCM tree type and location.
79 Type is one of: git, hg, quilt, stgit, topgit 83 Type is one of: git, hg, quilt, stgit, topgit
@@ -4024,6 +4028,8 @@ DRM DRIVERS
4024M: David Airlie <airlied@linux.ie> 4028M: David Airlie <airlied@linux.ie>
4025L: dri-devel@lists.freedesktop.org 4029L: dri-devel@lists.freedesktop.org
4026T: git git://people.freedesktop.org/~airlied/linux 4030T: git git://people.freedesktop.org/~airlied/linux
4031B: https://bugs.freedesktop.org/
4032C: irc://chat.freenode.net/dri-devel
4027S: Maintained 4033S: Maintained
4028F: drivers/gpu/drm/ 4034F: drivers/gpu/drm/
4029F: drivers/gpu/vga/ 4035F: drivers/gpu/vga/
@@ -4076,6 +4082,8 @@ M: Jani Nikula <jani.nikula@linux.intel.com>
4076L: intel-gfx@lists.freedesktop.org 4082L: intel-gfx@lists.freedesktop.org
4077L: dri-devel@lists.freedesktop.org 4083L: dri-devel@lists.freedesktop.org
4078W: https://01.org/linuxgraphics/ 4084W: https://01.org/linuxgraphics/
4085B: https://01.org/linuxgraphics/documentation/how-report-bugs
4086C: irc://chat.freenode.net/intel-gfx
4079Q: http://patchwork.freedesktop.org/project/intel-gfx/ 4087Q: http://patchwork.freedesktop.org/project/intel-gfx/
4080T: git git://anongit.freedesktop.org/drm-intel 4088T: git git://anongit.freedesktop.org/drm-intel
4081S: Supported 4089S: Supported
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 1e25cd80589e..3f2eb76243e3 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -186,6 +186,8 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr)
186 tlb_add_flush(tlb, addr); 186 tlb_add_flush(tlb, addr);
187} 187}
188 188
189#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
190 tlb_remove_tlb_entry(tlb, ptep, address)
189/* 191/*
190 * In the case of tlb vma handling, we can optimise these away in the 192 * In the case of tlb vma handling, we can optimise these away in the
191 * case where we're doing a full MM flush. When we're doing a munmap, 193 * case where we're doing a full MM flush. When we're doing a munmap,
@@ -211,18 +213,17 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
211 213
212static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 214static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
213{ 215{
216 tlb->pages[tlb->nr++] = page;
217 VM_WARN_ON(tlb->nr > tlb->max);
214 if (tlb->nr == tlb->max) 218 if (tlb->nr == tlb->max)
215 return true; 219 return true;
216 tlb->pages[tlb->nr++] = page;
217 return false; 220 return false;
218} 221}
219 222
220static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) 223static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
221{ 224{
222 if (__tlb_remove_page(tlb, page)) { 225 if (__tlb_remove_page(tlb, page))
223 tlb_flush_mmu(tlb); 226 tlb_flush_mmu(tlb);
224 __tlb_remove_page(tlb, page);
225 }
226} 227}
227 228
228static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, 229static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
@@ -231,12 +232,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
231 return __tlb_remove_page(tlb, page); 232 return __tlb_remove_page(tlb, page);
232} 233}
233 234
234static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
235 struct page *page)
236{
237 return __tlb_remove_page(tlb, page);
238}
239
240static inline void tlb_remove_page_size(struct mmu_gather *tlb, 235static inline void tlb_remove_page_size(struct mmu_gather *tlb,
241 struct page *page, int page_size) 236 struct page *page, int page_size)
242{ 237{
@@ -284,5 +279,11 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr
284 279
285#define tlb_migrate_finish(mm) do { } while (0) 280#define tlb_migrate_finish(mm) do { } while (0)
286 281
282#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
283static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
284 unsigned int page_size)
285{
286}
287
287#endif /* CONFIG_MMU */ 288#endif /* CONFIG_MMU */
288#endif 289#endif
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 77e541cf0e5d..fced197b9626 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -207,15 +207,15 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
207 */ 207 */
208static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 208static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
209{ 209{
210 if (tlb->nr == tlb->max)
211 return true;
212
213 tlb->need_flush = 1; 210 tlb->need_flush = 1;
214 211
215 if (!tlb->nr && tlb->pages == tlb->local) 212 if (!tlb->nr && tlb->pages == tlb->local)
216 __tlb_alloc_page(tlb); 213 __tlb_alloc_page(tlb);
217 214
218 tlb->pages[tlb->nr++] = page; 215 tlb->pages[tlb->nr++] = page;
216 VM_WARN_ON(tlb->nr > tlb->max);
217 if (tlb->nr == tlb->max)
218 return true;
219 return false; 219 return false;
220} 220}
221 221
@@ -236,10 +236,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
236 236
237static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) 237static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
238{ 238{
239 if (__tlb_remove_page(tlb, page)) { 239 if (__tlb_remove_page(tlb, page))
240 tlb_flush_mmu(tlb); 240 tlb_flush_mmu(tlb);
241 __tlb_remove_page(tlb, page);
242 }
243} 241}
244 242
245static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, 243static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
@@ -248,12 +246,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
248 return __tlb_remove_page(tlb, page); 246 return __tlb_remove_page(tlb, page);
249} 247}
250 248
251static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
252 struct page *page)
253{
254 return __tlb_remove_page(tlb, page);
255}
256
257static inline void tlb_remove_page_size(struct mmu_gather *tlb, 249static inline void tlb_remove_page_size(struct mmu_gather *tlb,
258 struct page *page, int page_size) 250 struct page *page, int page_size)
259{ 251{
@@ -283,6 +275,15 @@ do { \
283 __tlb_remove_tlb_entry(tlb, ptep, addr); \ 275 __tlb_remove_tlb_entry(tlb, ptep, addr); \
284} while (0) 276} while (0)
285 277
278#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
279 tlb_remove_tlb_entry(tlb, ptep, address)
280
281#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
282static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
283 unsigned int page_size)
284{
285}
286
286#define pte_free_tlb(tlb, ptep, address) \ 287#define pte_free_tlb(tlb, ptep, address) \
287do { \ 288do { \
288 tlb->need_flush = 1; \ 289 tlb->need_flush = 1; \
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index 3cc8498fe0fe..d227a6988d6b 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -34,7 +34,7 @@ config NO_IOPORT_MAP
34 def_bool y 34 def_bool y
35 35
36config NO_DMA 36config NO_DMA
37 def_bool y 37 def_bool n
38 38
39config HZ 39config HZ
40 int 40 int
diff --git a/arch/m32r/include/asm/device.h b/arch/m32r/include/asm/device.h
index d8f9872b0e2d..4a9f35e0973f 100644
--- a/arch/m32r/include/asm/device.h
+++ b/arch/m32r/include/asm/device.h
@@ -3,5 +3,9 @@
3 * 3 *
4 * This file is released under the GPLv2 4 * This file is released under the GPLv2
5 */ 5 */
6#include <asm-generic/device.h> 6struct dev_archdata {
7 struct dma_map_ops *dma_ops;
8};
7 9
10struct pdev_archdata {
11};
diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h
new file mode 100644
index 000000000000..2c43a77fe942
--- /dev/null
+++ b/arch/m32r/include/asm/dma-mapping.h
@@ -0,0 +1,32 @@
1#ifndef _ASM_M32R_DMA_MAPPING_H
2#define _ASM_M32R_DMA_MAPPING_H
3
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/mm.h>
7#include <linux/scatterlist.h>
8#include <linux/dma-debug.h>
9#include <linux/io.h>
10
11#define DMA_ERROR_CODE (~(dma_addr_t)0x0)
12
13static inline struct dma_map_ops *get_dma_ops(struct device *dev)
14{
15 if (dev && dev->archdata.dma_ops)
16 return dev->archdata.dma_ops;
17 return &dma_noop_ops;
18}
19
20static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size,
21 enum dma_data_direction direction)
22{
23}
24
25static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
26{
27 if (!dev->dma_mask)
28 return false;
29 return addr + size - 1 <= *dev->dma_mask;
30}
31
32#endif /* _ASM_M32R_DMA_MAPPING_H */
diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c
index 9a4ba8a8589d..349eb341752c 100644
--- a/arch/m32r/platforms/m32700ut/setup.c
+++ b/arch/m32r/platforms/m32700ut/setup.c
@@ -201,6 +201,7 @@ static struct irq_chip m32700ut_lanpld_irq_type =
201#define lcdpldirq2port(x) (unsigned long)((int)M32700UT_LCD_ICUCR1 + \ 201#define lcdpldirq2port(x) (unsigned long)((int)M32700UT_LCD_ICUCR1 + \
202 (((x) - 1) * sizeof(unsigned short))) 202 (((x) - 1) * sizeof(unsigned short)))
203 203
204#ifdef CONFIG_USB
204static pld_icu_data_t lcdpld_icu_data[M32700UT_NUM_LCD_PLD_IRQ]; 205static pld_icu_data_t lcdpld_icu_data[M32700UT_NUM_LCD_PLD_IRQ];
205 206
206static void disable_m32700ut_lcdpld_irq(unsigned int irq) 207static void disable_m32700ut_lcdpld_irq(unsigned int irq)
@@ -253,6 +254,7 @@ static struct irq_chip m32700ut_lcdpld_irq_type =
253 .irq_mask = mask_m32700ut_lcdpld, 254 .irq_mask = mask_m32700ut_lcdpld,
254 .irq_unmask = unmask_m32700ut_lcdpld, 255 .irq_unmask = unmask_m32700ut_lcdpld,
255}; 256};
257#endif
256 258
257void __init init_IRQ(void) 259void __init init_IRQ(void)
258{ 260{
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 9fd77f8794a0..0ebfbc8f0449 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
1009#define pmd_move_must_withdraw pmd_move_must_withdraw 1009#define pmd_move_must_withdraw pmd_move_must_withdraw
1010struct spinlock; 1010struct spinlock;
1011static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, 1011static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
1012 struct spinlock *old_pmd_ptl) 1012 struct spinlock *old_pmd_ptl,
1013 struct vm_area_struct *vma)
1013{ 1014{
1014 if (radix_enabled()) 1015 if (radix_enabled())
1015 return false; 1016 return false;
@@ -1020,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
1020 */ 1021 */
1021 return true; 1022 return true;
1022} 1023}
1024
1025
1026#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
1027static inline bool arch_needs_pgtable_deposit(void)
1028{
1029 if (radix_enabled())
1030 return false;
1031 return true;
1032}
1033
1023#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1034#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1024#endif /* __ASSEMBLY__ */ 1035#endif /* __ASSEMBLY__ */
1025#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ 1036#endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index 99e1397b71da..609557569f65 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -28,6 +28,7 @@
28#define tlb_start_vma(tlb, vma) do { } while (0) 28#define tlb_start_vma(tlb, vma) do { } while (0)
29#define tlb_end_vma(tlb, vma) do { } while (0) 29#define tlb_end_vma(tlb, vma) do { } while (0)
30#define __tlb_remove_tlb_entry __tlb_remove_tlb_entry 30#define __tlb_remove_tlb_entry __tlb_remove_tlb_entry
31#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
31 32
32extern void tlb_flush(struct mmu_gather *tlb); 33extern void tlb_flush(struct mmu_gather *tlb);
33 34
@@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep,
46#endif 47#endif
47} 48}
48 49
50static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
51 unsigned int page_size)
52{
53 if (!tlb->page_size)
54 tlb->page_size = page_size;
55 else if (tlb->page_size != page_size) {
56 tlb_flush_mmu(tlb);
57 /*
58 * update the page size after flush for the new
59 * mmu_gather.
60 */
61 tlb->page_size = page_size;
62 }
63}
64
49#ifdef CONFIG_SMP 65#ifdef CONFIG_SMP
50static inline int mm_is_core_local(struct mm_struct *mm) 66static inline int mm_is_core_local(struct mm_struct *mm)
51{ 67{
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index a51c188b81f3..0cb6bd8bfccf 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr)
1085int hot_add_scn_to_nid(unsigned long scn_addr) 1085int hot_add_scn_to_nid(unsigned long scn_addr)
1086{ 1086{
1087 struct device_node *memory = NULL; 1087 struct device_node *memory = NULL;
1088 int nid, found = 0; 1088 int nid;
1089 1089
1090 if (!numa_enabled || (min_common_depth < 0)) 1090 if (!numa_enabled || (min_common_depth < 0))
1091 return first_online_node; 1091 return first_online_node;
@@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
1101 if (nid < 0 || !node_online(nid)) 1101 if (nid < 0 || !node_online(nid))
1102 nid = first_online_node; 1102 nid = first_online_node;
1103 1103
1104 if (NODE_DATA(nid)->node_spanned_pages)
1105 return nid;
1106
1107 for_each_online_node(nid) {
1108 if (NODE_DATA(nid)->node_spanned_pages) {
1109 found = 1;
1110 break;
1111 }
1112 }
1113
1114 BUG_ON(!found);
1115 return nid; 1104 return nid;
1116} 1105}
1117 1106
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 15711de10403..853b2a3d8dee 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -104,12 +104,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
104 return __tlb_remove_page(tlb, page); 104 return __tlb_remove_page(tlb, page);
105} 105}
106 106
107static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
108 struct page *page)
109{
110 return __tlb_remove_page(tlb, page);
111}
112
113static inline void tlb_remove_page_size(struct mmu_gather *tlb, 107static inline void tlb_remove_page_size(struct mmu_gather *tlb,
114 struct page *page, int page_size) 108 struct page *page, int page_size)
115{ 109{
@@ -162,5 +156,13 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
162#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) 156#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0)
163#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) 157#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0)
164#define tlb_migrate_finish(mm) do { } while (0) 158#define tlb_migrate_finish(mm) do { } while (0)
159#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
160 tlb_remove_tlb_entry(tlb, ptep, address)
161
162#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
163static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
164 unsigned int page_size)
165{
166}
165 167
166#endif /* _S390_TLB_H */ 168#endif /* _S390_TLB_H */
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 3ba622702ce4..ec1f0dedb948 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1015 if (slot) { 1015 if (slot) {
1016 rmap->next = radix_tree_deref_slot_protected(slot, 1016 rmap->next = radix_tree_deref_slot_protected(slot,
1017 &sg->guest_table_lock); 1017 &sg->guest_table_lock);
1018 radix_tree_replace_slot(slot, rmap); 1018 radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1019 } else { 1019 } else {
1020 rmap->next = NULL; 1020 rmap->next = NULL;
1021 radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, 1021 radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 025cdb1032f6..46e0d635e36f 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -65,6 +65,9 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
65 tlb->end = address + PAGE_SIZE; 65 tlb->end = address + PAGE_SIZE;
66} 66}
67 67
68#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
69 tlb_remove_tlb_entry(tlb, ptep, address)
70
68/* 71/*
69 * In the case of tlb vma handling, we can optimise these away in the 72 * In the case of tlb vma handling, we can optimise these away in the
70 * case where we're doing a full MM flush. When we're doing a munmap, 73 * case where we're doing a full MM flush. When we're doing a munmap,
@@ -115,18 +118,18 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
115 return __tlb_remove_page(tlb, page); 118 return __tlb_remove_page(tlb, page);
116} 119}
117 120
118static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
119 struct page *page)
120{
121 return __tlb_remove_page(tlb, page);
122}
123
124static inline void tlb_remove_page_size(struct mmu_gather *tlb, 121static inline void tlb_remove_page_size(struct mmu_gather *tlb,
125 struct page *page, int page_size) 122 struct page *page, int page_size)
126{ 123{
127 return tlb_remove_page(tlb, page); 124 return tlb_remove_page(tlb, page);
128} 125}
129 126
127#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
128static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
129 unsigned int page_size)
130{
131}
132
130#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) 133#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep)
131#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) 134#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp)
132#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) 135#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp)
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index 821ff0acfe17..600a2e9bfee2 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -116,12 +116,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
116 return __tlb_remove_page(tlb, page); 116 return __tlb_remove_page(tlb, page);
117} 117}
118 118
119static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
120 struct page *page)
121{
122 return __tlb_remove_page(tlb, page);
123}
124
125static inline void tlb_remove_page_size(struct mmu_gather *tlb, 119static inline void tlb_remove_page_size(struct mmu_gather *tlb,
126 struct page *page, int page_size) 120 struct page *page, int page_size)
127{ 121{
@@ -141,6 +135,15 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb,
141 __tlb_remove_tlb_entry(tlb, ptep, address); \ 135 __tlb_remove_tlb_entry(tlb, ptep, address); \
142 } while (0) 136 } while (0)
143 137
138#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
139 tlb_remove_tlb_entry(tlb, ptep, address)
140
141#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
142static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
143 unsigned int page_size)
144{
145}
146
144#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) 147#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr)
145 148
146#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) 149#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index f09df2ff1bcc..d4a15831ac58 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt)
93 93
94 paravirt_free_ldt(ldt->entries, ldt->size); 94 paravirt_free_ldt(ldt->entries, ldt->size);
95 if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) 95 if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
96 vfree(ldt->entries); 96 vfree_atomic(ldt->entries);
97 else 97 else
98 free_page((unsigned long)ldt->entries); 98 free_page((unsigned long)ldt->entries);
99 kfree(ldt); 99 kfree(ldt);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9c337b0e8ba7..4cfba947d774 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p)
985 985
986 parse_early_param(); 986 parse_early_param();
987 987
988#ifdef CONFIG_MEMORY_HOTPLUG
989 /*
990 * Memory used by the kernel cannot be hot-removed because Linux
991 * cannot migrate the kernel pages. When memory hotplug is
992 * enabled, we should prevent memblock from allocating memory
993 * for the kernel.
994 *
995 * ACPI SRAT records all hotpluggable memory ranges. But before
996 * SRAT is parsed, we don't know about it.
997 *
998 * The kernel image is loaded into memory at very early time. We
999 * cannot prevent this anyway. So on NUMA system, we set any
1000 * node the kernel resides in as un-hotpluggable.
1001 *
1002 * Since on modern servers, one node could have double-digit
1003 * gigabytes memory, we can assume the memory around the kernel
1004 * image is also un-hotpluggable. So before SRAT is parsed, just
1005 * allocate memory near the kernel image to try the best to keep
1006 * the kernel away from hotpluggable memory.
1007 */
1008 if (movable_node_is_enabled())
1009 memblock_set_bottom_up(true);
1010#endif
1011
988 x86_report_nx(); 1012 x86_report_nx();
989 1013
990 /* after early param, so could get panic from serial */ 1014 /* after early param, so could get panic from serial */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index f679ae122843..65f16cf4f850 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
249 max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); 249 max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
250 max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); 250 max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
251 limits->max_sectors = max_sectors; 251 limits->max_sectors = max_sectors;
252 q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9);
252} 253}
253EXPORT_SYMBOL(blk_queue_max_hw_sectors); 254EXPORT_SYMBOL(blk_queue_max_hw_sectors);
254 255
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cc8d7c5439a..ea374e820775 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
212 212
213 spin_lock_irq(q->queue_lock); 213 spin_lock_irq(q->queue_lock);
214 q->limits.max_sectors = max_sectors_kb << 1; 214 q->limits.max_sectors = max_sectors_kb << 1;
215 q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
215 spin_unlock_irq(q->queue_lock); 216 spin_unlock_irq(q->queue_lock);
216 217
217 return ret; 218 return ret;
diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c
index c89d5d231a0e..c9b5cac03b36 100644
--- a/drivers/of/fdt.c
+++ b/drivers/of/fdt.c
@@ -1015,6 +1015,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
1015 const char *type = of_get_flat_dt_prop(node, "device_type", NULL); 1015 const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
1016 const __be32 *reg, *endp; 1016 const __be32 *reg, *endp;
1017 int l; 1017 int l;
1018 bool hotpluggable;
1018 1019
1019 /* We are scanning "memory" nodes only */ 1020 /* We are scanning "memory" nodes only */
1020 if (type == NULL) { 1021 if (type == NULL) {
@@ -1034,6 +1035,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
1034 return 0; 1035 return 0;
1035 1036
1036 endp = reg + (l / sizeof(__be32)); 1037 endp = reg + (l / sizeof(__be32));
1038 hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL);
1037 1039
1038 pr_debug("memory scan node %s, reg size %d,\n", uname, l); 1040 pr_debug("memory scan node %s, reg size %d,\n", uname, l);
1039 1041
@@ -1049,6 +1051,13 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
1049 (unsigned long long)size); 1051 (unsigned long long)size);
1050 1052
1051 early_init_dt_add_memory_arch(base, size); 1053 early_init_dt_add_memory_arch(base, size);
1054
1055 if (!hotpluggable)
1056 continue;
1057
1058 if (early_init_dt_mark_hotplug_memory_arch(base, size))
1059 pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n",
1060 base, base + size);
1052 } 1061 }
1053 1062
1054 return 0; 1063 return 0;
@@ -1146,6 +1155,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
1146 memblock_add(base, size); 1155 memblock_add(base, size);
1147} 1156}
1148 1157
1158int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
1159{
1160 return memblock_mark_hotplug(base, size);
1161}
1162
1149int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, 1163int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
1150 phys_addr_t size, bool nomap) 1164 phys_addr_t size, bool nomap)
1151{ 1165{
@@ -1168,6 +1182,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size)
1168 WARN_ON(1); 1182 WARN_ON(1);
1169} 1183}
1170 1184
1185int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size)
1186{
1187 return -ENOSYS;
1188}
1189
1171int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, 1190int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base,
1172 phys_addr_t size, bool nomap) 1191 phys_addr_t size, bool nomap)
1173{ 1192{
diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c
index eb126b98ed8a..e50bbf826188 100644
--- a/drivers/pcmcia/m32r_pcc.c
+++ b/drivers/pcmcia/m32r_pcc.c
@@ -296,10 +296,11 @@ static int __init is_alive(u_short sock)
296 return 0; 296 return 0;
297} 297}
298 298
299static void add_pcc_socket(ulong base, int irq, ulong mapaddr, 299static int add_pcc_socket(ulong base, int irq, ulong mapaddr,
300 unsigned int ioaddr) 300 unsigned int ioaddr)
301{ 301{
302 pcc_socket_t *t = &socket[pcc_sockets]; 302 pcc_socket_t *t = &socket[pcc_sockets];
303 int err;
303 304
304 /* add sockets */ 305 /* add sockets */
305 t->ioaddr = ioaddr; 306 t->ioaddr = ioaddr;
@@ -328,11 +329,16 @@ static void add_pcc_socket(ulong base, int irq, ulong mapaddr,
328 t->socket.irq_mask = 0; 329 t->socket.irq_mask = 0;
329 t->socket.pci_irq = 2 + pcc_sockets; /* XXX */ 330 t->socket.pci_irq = 2 + pcc_sockets; /* XXX */
330 331
331 request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt); 332 err = request_irq(irq, pcc_interrupt, 0, "m32r-pcc", pcc_interrupt);
333 if (err) {
334 if (t->base > 0)
335 release_region(t->base, 0x20);
336 return err;
337 }
332 338
333 pcc_sockets++; 339 pcc_sockets++;
334 340
335 return; 341 return 0;
336} 342}
337 343
338 344
@@ -683,26 +689,29 @@ static int __init init_m32r_pcc(void)
683 return ret; 689 return ret;
684 690
685 ret = platform_device_register(&pcc_device); 691 ret = platform_device_register(&pcc_device);
686 if (ret){ 692 if (ret)
687 platform_driver_unregister(&pcc_driver); 693 goto unreg_driv;
688 return ret;
689 }
690 694
691 printk(KERN_INFO "m32r PCC probe:\n"); 695 printk(KERN_INFO "m32r PCC probe:\n");
692 696
693 pcc_sockets = 0; 697 pcc_sockets = 0;
694 698
695 add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE, 0x1000); 699 ret = add_pcc_socket(M32R_PCC0_BASE, PCC0_IRQ, M32R_PCC0_MAPBASE,
700 0x1000);
701 if (ret)
702 goto unreg_dev;
696 703
697#ifdef CONFIG_M32RPCC_SLOT2 704#ifdef CONFIG_M32RPCC_SLOT2
698 add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE, 0x2000); 705 ret = add_pcc_socket(M32R_PCC1_BASE, PCC1_IRQ, M32R_PCC1_MAPBASE,
706 0x2000);
707 if (ret)
708 goto unreg_dev;
699#endif 709#endif
700 710
701 if (pcc_sockets == 0) { 711 if (pcc_sockets == 0) {
702 printk("socket is not found.\n"); 712 printk("socket is not found.\n");
703 platform_device_unregister(&pcc_device); 713 ret = -ENODEV;
704 platform_driver_unregister(&pcc_driver); 714 goto unreg_dev;
705 return -ENODEV;
706 } 715 }
707 716
708 /* Set up interrupt handler(s) */ 717 /* Set up interrupt handler(s) */
@@ -728,6 +737,12 @@ static int __init init_m32r_pcc(void)
728 } 737 }
729 738
730 return 0; 739 return 0;
740
741unreg_dev:
742 platform_device_unregister(&pcc_device);
743unreg_driv:
744 platform_driver_unregister(&pcc_driver);
745 return ret;
731} /* init_m32r_pcc */ 746} /* init_m32r_pcc */
732 747
733static void __exit exit_m32r_pcc(void) 748static void __exit exit_m32r_pcc(void)
diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c
index e7899624aa0b..35bbe288ddb4 100644
--- a/drivers/sh/intc/virq.c
+++ b/drivers/sh/intc/virq.c
@@ -254,7 +254,7 @@ restart:
254 254
255 radix_tree_tag_clear(&d->tree, entry->enum_id, 255 radix_tree_tag_clear(&d->tree, entry->enum_id,
256 INTC_TAG_VIRQ_NEEDS_ALLOC); 256 INTC_TAG_VIRQ_NEEDS_ALLOC);
257 radix_tree_replace_slot((void **)entries[i], 257 radix_tree_replace_slot(&d->tree, (void **)entries[i],
258 &intc_irq_xlate[irq]); 258 &intc_irq_xlate[irq]);
259 } 259 }
260 260
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2798c7..e6c1bd443806 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm)
2204 2204
2205 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 2205 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
2206 2206
2207 vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); 2207 if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
2208 goto end_coredump;
2209 vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
2208 if (!vma_filesz) 2210 if (!vma_filesz)
2209 goto end_coredump; 2211 goto end_coredump;
2210 2212
@@ -2311,7 +2313,7 @@ end_coredump:
2311cleanup: 2313cleanup:
2312 free_note_info(&info); 2314 free_note_info(&info);
2313 kfree(shdr4extnum); 2315 kfree(shdr4extnum);
2314 kfree(vma_filesz); 2316 vfree(vma_filesz);
2315 kfree(phdr4note); 2317 kfree(phdr4note);
2316 kfree(elf); 2318 kfree(elf);
2317out: 2319out:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 74ed5aae6cea..180f910339f4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -202,27 +202,31 @@ static struct ratelimit_state printk_limits[] = {
202void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 202void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
203{ 203{
204 struct super_block *sb = fs_info->sb; 204 struct super_block *sb = fs_info->sb;
205 char lvl[4]; 205 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1];
206 struct va_format vaf; 206 struct va_format vaf;
207 va_list args; 207 va_list args;
208 const char *type = logtypes[4]; 208 const char *type = NULL;
209 int kern_level; 209 int kern_level;
210 struct ratelimit_state *ratelimit; 210 struct ratelimit_state *ratelimit;
211 211
212 va_start(args, fmt); 212 va_start(args, fmt);
213 213
214 kern_level = printk_get_level(fmt); 214 while ((kern_level = printk_get_level(fmt)) != 0) {
215 if (kern_level) {
216 size_t size = printk_skip_level(fmt) - fmt; 215 size_t size = printk_skip_level(fmt) - fmt;
217 memcpy(lvl, fmt, size); 216
218 lvl[size] = '\0'; 217 if (kern_level >= '0' && kern_level <= '7') {
218 memcpy(lvl, fmt, size);
219 lvl[size] = '\0';
220 type = logtypes[kern_level - '0'];
221 ratelimit = &printk_limits[kern_level - '0'];
222 }
219 fmt += size; 223 fmt += size;
220 type = logtypes[kern_level - '0']; 224 }
221 ratelimit = &printk_limits[kern_level - '0']; 225
222 } else { 226 if (!type) {
223 *lvl = '\0'; 227 *lvl = '\0';
224 /* Default to debug output */ 228 type = logtypes[4];
225 ratelimit = &printk_limits[7]; 229 ratelimit = &printk_limits[4];
226 } 230 }
227 231
228 vaf.fmt = fmt; 232 vaf.fmt = fmt;
diff --git a/fs/dax.c b/fs/dax.c
index 014defd2e744..6916ed37d463 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot)
342 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 342 radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
343 343
344 entry |= RADIX_DAX_ENTRY_LOCK; 344 entry |= RADIX_DAX_ENTRY_LOCK;
345 radix_tree_replace_slot(slot, (void *)entry); 345 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
346 return (void *)entry; 346 return (void *)entry;
347} 347}
348 348
@@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot)
356 radix_tree_deref_slot_protected(slot, &mapping->tree_lock); 356 radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
357 357
358 entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; 358 entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
359 radix_tree_replace_slot(slot, (void *)entry); 359 radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry);
360 return (void *)entry; 360 return (void *)entry;
361} 361}
362 362
@@ -643,12 +643,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
643 } 643 }
644 mapping->nrexceptional++; 644 mapping->nrexceptional++;
645 } else { 645 } else {
646 struct radix_tree_node *node;
646 void **slot; 647 void **slot;
647 void *ret; 648 void *ret;
648 649
649 ret = __radix_tree_lookup(page_tree, index, NULL, &slot); 650 ret = __radix_tree_lookup(page_tree, index, &node, &slot);
650 WARN_ON_ONCE(ret != entry); 651 WARN_ON_ONCE(ret != entry);
651 radix_tree_replace_slot(slot, new_entry); 652 __radix_tree_replace(page_tree, node, slot,
653 new_entry, NULL, NULL);
652 } 654 }
653 if (vmf->flags & FAULT_FLAG_WRITE) 655 if (vmf->flags & FAULT_FLAG_WRITE)
654 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); 656 radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 05713a5da083..ef600591d96f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb,
1769 * become available for writeback. Otherwise 1769 * become available for writeback. Otherwise
1770 * we'll just busyloop. 1770 * we'll just busyloop.
1771 */ 1771 */
1772 if (!list_empty(&wb->b_more_io)) { 1772 trace_writeback_wait(wb, work);
1773 trace_writeback_wait(wb, work); 1773 inode = wb_inode(wb->b_more_io.prev);
1774 inode = wb_inode(wb->b_more_io.prev); 1774 spin_lock(&inode->i_lock);
1775 spin_lock(&inode->i_lock); 1775 spin_unlock(&wb->list_lock);
1776 spin_unlock(&wb->list_lock); 1776 /* This function drops i_lock... */
1777 /* This function drops i_lock... */ 1777 inode_sleep_on_writeback(inode);
1778 inode_sleep_on_writeback(inode); 1778 spin_lock(&wb->list_lock);
1779 spin_lock(&wb->list_lock);
1780 }
1781 } 1779 }
1782 spin_unlock(&wb->list_lock); 1780 spin_unlock(&wb->list_lock);
1783 blk_finish_plug(&plug); 1781 blk_finish_plug(&plug);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c5c5b9748ea3..9a88984f9f6f 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1950,8 +1950,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1950} 1950}
1951 1951
1952int ocfs2_write_end_nolock(struct address_space *mapping, 1952int ocfs2_write_end_nolock(struct address_space *mapping,
1953 loff_t pos, unsigned len, unsigned copied, 1953 loff_t pos, unsigned len, unsigned copied, void *fsdata)
1954 struct page *page, void *fsdata)
1955{ 1954{
1956 int i, ret; 1955 int i, ret;
1957 unsigned from, to, start = pos & (PAGE_SIZE - 1); 1956 unsigned from, to, start = pos & (PAGE_SIZE - 1);
@@ -2064,7 +2063,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping,
2064 int ret; 2063 int ret;
2065 struct inode *inode = mapping->host; 2064 struct inode *inode = mapping->host;
2066 2065
2067 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); 2066 ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata);
2068 2067
2069 up_write(&OCFS2_I(inode)->ip_alloc_sem); 2068 up_write(&OCFS2_I(inode)->ip_alloc_sem);
2070 ocfs2_inode_unlock(inode, 1); 2069 ocfs2_inode_unlock(inode, 1);
@@ -2241,7 +2240,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock,
2241 dwc->dw_zero_count++; 2240 dwc->dw_zero_count++;
2242 } 2241 }
2243 2242
2244 ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); 2243 ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc);
2245 BUG_ON(ret != len); 2244 BUG_ON(ret != len);
2246 ret = 0; 2245 ret = 0;
2247unlock: 2246unlock:
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index b1c9f28a57b1..8614ff069d99 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle,
44 struct buffer_head *bh)); 44 struct buffer_head *bh));
45 45
46int ocfs2_write_end_nolock(struct address_space *mapping, 46int ocfs2_write_end_nolock(struct address_space *mapping,
47 loff_t pos, unsigned len, unsigned copied, 47 loff_t pos, unsigned len, unsigned copied, void *fsdata);
48 struct page *page, void *fsdata);
49 48
50typedef enum { 49typedef enum {
51 OCFS2_WRITE_BUFFER = 0, 50 OCFS2_WRITE_BUFFER = 0,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 636abcbd4650..9158c9825094 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg,
741 hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; 741 hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
742 memset(hb_block, 0, reg->hr_block_bytes); 742 memset(hb_block, 0, reg->hr_block_bytes);
743 /* TODO: time stuff */ 743 /* TODO: time stuff */
744 cputime = CURRENT_TIME.tv_sec; 744 cputime = ktime_get_real_seconds();
745 if (!cputime) 745 if (!cputime)
746 cputime = 1; 746 cputime = 1;
747 747
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 3f828a187049..a464c8088170 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1609,8 +1609,6 @@ way_up_top:
1609 __dlm_insert_mle(dlm, mle); 1609 __dlm_insert_mle(dlm, mle);
1610 response = DLM_MASTER_RESP_NO; 1610 response = DLM_MASTER_RESP_NO;
1611 } else { 1611 } else {
1612 // mlog(0, "mle was found\n");
1613 set_maybe = 1;
1614 spin_lock(&tmpmle->spinlock); 1612 spin_lock(&tmpmle->spinlock);
1615 if (tmpmle->master == dlm->node_num) { 1613 if (tmpmle->master == dlm->node_num) {
1616 mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); 1614 mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
@@ -1625,8 +1623,7 @@ way_up_top:
1625 response = DLM_MASTER_RESP_NO; 1623 response = DLM_MASTER_RESP_NO;
1626 } else 1624 } else
1627 response = DLM_MASTER_RESP_MAYBE; 1625 response = DLM_MASTER_RESP_MAYBE;
1628 if (set_maybe) 1626 set_bit(request->node_idx, tmpmle->maybe_map);
1629 set_bit(request->node_idx, tmpmle->maybe_map);
1630 spin_unlock(&tmpmle->spinlock); 1627 spin_unlock(&tmpmle->spinlock);
1631 } 1628 }
1632 spin_unlock(&dlm->master_lock); 1629 spin_unlock(&dlm->master_lock);
@@ -1644,12 +1641,6 @@ send_response:
1644 * dlm_assert_master_worker() isn't called, we drop it here. 1641 * dlm_assert_master_worker() isn't called, we drop it here.
1645 */ 1642 */
1646 if (dispatch_assert) { 1643 if (dispatch_assert) {
1647 if (response != DLM_MASTER_RESP_YES)
1648 mlog(ML_ERROR, "invalid response %d\n", response);
1649 if (!res) {
1650 mlog(ML_ERROR, "bad lockres while trying to assert!\n");
1651 BUG();
1652 }
1653 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", 1644 mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1654 dlm->node_num, res->lockname.len, res->lockname.name); 1645 dlm->node_num, res->lockname.len, res->lockname.name);
1655 spin_lock(&res->spinlock); 1646 spin_lock(&res->spinlock);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index dd5cb8bcefd1..74407c6dd592 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
2966 spin_unlock(&dlm->spinlock); 2966 spin_unlock(&dlm->spinlock);
2967 dlm_kick_recovery_thread(dlm); 2967 dlm_kick_recovery_thread(dlm);
2968 break; 2968 break;
2969 default:
2970 BUG();
2971 } 2969 }
2972 2970
2973 mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", 2971 mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c56a7679df93..382401d3e88f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode,
703 goto bail_commit; 703 goto bail_commit;
704 } 704 }
705 705
706 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 706 di->i_dtime = cpu_to_le64(ktime_get_real_seconds());
707 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 707 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL));
708 ocfs2_journal_dirty(handle, di_bh); 708 ocfs2_journal_dirty(handle, di_bh);
709 709
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index a244f14c6b87..d5e5fa7f0743 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1947 */ 1947 */
1948 seqno++; 1948 seqno++;
1949 os->os_count++; 1949 os->os_count++;
1950 os->os_scantime = CURRENT_TIME; 1950 os->os_scantime = ktime_get_seconds();
1951unlock: 1951unlock:
1952 ocfs2_orphan_scan_unlock(osb, seqno); 1952 ocfs2_orphan_scan_unlock(osb, seqno);
1953out: 1953out:
@@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
2004 struct ocfs2_orphan_scan *os; 2004 struct ocfs2_orphan_scan *os;
2005 2005
2006 os = &osb->osb_orphan_scan; 2006 os = &osb->osb_orphan_scan;
2007 os->os_scantime = CURRENT_TIME; 2007 os->os_scantime = ktime_get_seconds();
2008 if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) 2008 if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
2009 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); 2009 atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
2010 else { 2010 else {
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 71545ad4628c..429088786e93 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
120 ret = VM_FAULT_NOPAGE; 120 ret = VM_FAULT_NOPAGE;
121 goto out; 121 goto out;
122 } 122 }
123 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, 123 ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata);
124 fsdata);
125 BUG_ON(ret != len); 124 BUG_ON(ret != len);
126 ret = VM_FAULT_LOCKED; 125 ret = VM_FAULT_LOCKED;
127out: 126out:
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8d887c75765c..3b0a10d9b36f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
516 struct ocfs2_extent_list *fel; 516 struct ocfs2_extent_list *fel;
517 u16 feat; 517 u16 feat;
518 struct ocfs2_inode_info *oi = OCFS2_I(inode); 518 struct ocfs2_inode_info *oi = OCFS2_I(inode);
519 struct timespec64 ts;
519 520
520 *new_fe_bh = NULL; 521 *new_fe_bh = NULL;
521 522
@@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir,
564 fe->i_last_eb_blk = 0; 565 fe->i_last_eb_blk = 0;
565 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); 566 strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE);
566 fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); 567 fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL);
568 ktime_get_real_ts64(&ts);
567 fe->i_atime = fe->i_ctime = fe->i_mtime = 569 fe->i_atime = fe->i_ctime = fe->i_mtime =
568 cpu_to_le64(CURRENT_TIME.tv_sec); 570 cpu_to_le64(ts.tv_sec);
569 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = 571 fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec =
570 cpu_to_le32(CURRENT_TIME.tv_nsec); 572 cpu_to_le32(ts.tv_nsec);
571 fe->i_dtime = 0; 573 fe->i_dtime = 0;
572 574
573 /* 575 /*
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index e63af7ddfe68..7e5958b0be6b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,7 +224,7 @@ struct ocfs2_orphan_scan {
224 struct ocfs2_super *os_osb; 224 struct ocfs2_super *os_osb;
225 struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ 225 struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */
226 struct delayed_work os_orphan_scan_work; 226 struct delayed_work os_orphan_scan_work;
227 struct timespec os_scantime; /* time this node ran the scan */ 227 time64_t os_scantime; /* time this node ran the scan */
228 u32 os_count; /* tracks node specific scans */ 228 u32 os_count; /* tracks node specific scans */
229 u32 os_seqno; /* tracks cluster wide scans */ 229 u32 os_seqno; /* tracks cluster wide scans */
230 atomic_t os_state; /* ACTIVE or INACTIVE */ 230 atomic_t os_state; /* ACTIVE or INACTIVE */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19238512a324..738b4ea8e990 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -478,7 +478,6 @@ again:
478 if (ret) { 478 if (ret) {
479 mlog_errno(ret); 479 mlog_errno(ret);
480 ocfs2_unlock_refcount_tree(osb, tree, rw); 480 ocfs2_unlock_refcount_tree(osb, tree, rw);
481 ocfs2_refcount_tree_put(tree);
482 goto out; 481 goto out;
483 } 482 }
484 483
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f56fe39fab04..c894d945b084 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len)
337 out += snprintf(buf + out, len - out, "Disabled\n"); 337 out += snprintf(buf + out, len - out, "Disabled\n");
338 else 338 else
339 out += snprintf(buf + out, len - out, "%lu seconds ago\n", 339 out += snprintf(buf + out, len - out, "%lu seconds ago\n",
340 (get_seconds() - os->os_scantime.tv_sec)); 340 (unsigned long)(ktime_get_seconds() - os->os_scantime));
341 341
342 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", 342 out += snprintf(buf + out, len - out, "%10s => %3s %10s\n",
343 "Slots", "Num", "RecoGen"); 343 "Slots", "Num", "RecoGen");
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 81818adb8e9e..51a4213afa2e 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header,
245 if (sigismember(set, i+2)) x |= 2; 245 if (sigismember(set, i+2)) x |= 2;
246 if (sigismember(set, i+3)) x |= 4; 246 if (sigismember(set, i+3)) x |= 4;
247 if (sigismember(set, i+4)) x |= 8; 247 if (sigismember(set, i+4)) x |= 8;
248 seq_printf(m, "%x", x); 248 seq_putc(m, hex_asc[x]);
249 } while (i >= 4); 249 } while (i >= 4);
250 250
251 seq_putc(m, '\n'); 251 seq_putc(m, '\n');
@@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
342 342
343static inline void task_seccomp(struct seq_file *m, struct task_struct *p) 343static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
344{ 344{
345 seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p));
345#ifdef CONFIG_SECCOMP 346#ifdef CONFIG_SECCOMP
346 seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); 347 seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode);
347 seq_putc(m, '\n');
348#endif 348#endif
349 seq_putc(m, '\n');
349} 350}
350 351
351static inline void task_context_switch_counts(struct seq_file *m, 352static inline void task_context_switch_counts(struct seq_file *m,
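For context on the render_sigset_t() change above: four membership bits are packed into one nibble and emitted through a lookup table (the kernel's hex_asc[]) instead of a printf-style "%x" conversion. A small userspace sketch of the lookup-table rendering follows, with a plain 64-bit mask standing in for sigset_t; it is not the kernel routine.

    #include <stdio.h>

    static const char hex_asc[] = "0123456789abcdef";

    /* Print a 64-bit mask as 16 hex digits, one table lookup per nibble,
     * mirroring the seq_putc(m, hex_asc[x]) pattern above. */
    static void render_mask(unsigned long long mask)
    {
        for (int shift = 60; shift >= 0; shift -= 4)
            putchar(hex_asc[(mask >> shift) & 0xf]);
        putchar('\n');
    }

    int main(void)
    {
        render_mask(0x0000000180000000ULL);  /* e.g. two pending signals */
        return 0;
    }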
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ca651ac00660..9b99df4893a4 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -104,9 +104,12 @@
104 * in /proc for a task before it execs a suid executable. 104 * in /proc for a task before it execs a suid executable.
105 */ 105 */
106 106
107static u8 nlink_tid;
108static u8 nlink_tgid;
109
107struct pid_entry { 110struct pid_entry {
108 const char *name; 111 const char *name;
109 int len; 112 unsigned int len;
110 umode_t mode; 113 umode_t mode;
111 const struct inode_operations *iop; 114 const struct inode_operations *iop;
112 const struct file_operations *fop; 115 const struct file_operations *fop;
@@ -139,13 +142,13 @@ struct pid_entry {
139 * Count the number of hardlinks for the pid_entry table, excluding the . 142 * Count the number of hardlinks for the pid_entry table, excluding the .
140 * and .. links. 143 * and .. links.
141 */ 144 */
142static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, 145static unsigned int __init pid_entry_nlink(const struct pid_entry *entries,
143 unsigned int n) 146 unsigned int n)
144{ 147{
145 unsigned int i; 148 unsigned int i;
146 unsigned int count; 149 unsigned int count;
147 150
148 count = 0; 151 count = 2;
149 for (i = 0; i < n; ++i) { 152 for (i = 0; i < n; ++i) {
150 if (S_ISDIR(entries[i].mode)) 153 if (S_ISDIR(entries[i].mode))
151 ++count; 154 ++count;
@@ -1967,7 +1970,7 @@ out:
1967 1970
1968struct map_files_info { 1971struct map_files_info {
1969 fmode_t mode; 1972 fmode_t mode;
1970 unsigned long len; 1973 unsigned int len;
1971 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1974 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1972}; 1975};
1973 1976
@@ -2412,14 +2415,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
2412 * Yes, it does not scale. And it should not. Don't add 2415 * Yes, it does not scale. And it should not. Don't add
2413 * new entries into /proc/<tgid>/ without very good reasons. 2416 * new entries into /proc/<tgid>/ without very good reasons.
2414 */ 2417 */
2415 last = &ents[nents - 1]; 2418 last = &ents[nents];
2416 for (p = ents; p <= last; p++) { 2419 for (p = ents; p < last; p++) {
2417 if (p->len != dentry->d_name.len) 2420 if (p->len != dentry->d_name.len)
2418 continue; 2421 continue;
2419 if (!memcmp(dentry->d_name.name, p->name, p->len)) 2422 if (!memcmp(dentry->d_name.name, p->name, p->len))
2420 break; 2423 break;
2421 } 2424 }
2422 if (p > last) 2425 if (p >= last)
2423 goto out; 2426 goto out;
2424 2427
2425 error = proc_pident_instantiate(dir, dentry, task, p); 2428 error = proc_pident_instantiate(dir, dentry, task, p);
@@ -2444,7 +2447,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
2444 if (ctx->pos >= nents + 2) 2447 if (ctx->pos >= nents + 2)
2445 goto out; 2448 goto out;
2446 2449
2447 for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { 2450 for (p = ents + (ctx->pos - 2); p < ents + nents; p++) {
2448 if (!proc_fill_cache(file, ctx, p->name, p->len, 2451 if (!proc_fill_cache(file, ctx, p->name, p->len,
2449 proc_pident_instantiate, task, p)) 2452 proc_pident_instantiate, task, p))
2450 break; 2453 break;
@@ -3068,8 +3071,7 @@ static int proc_pid_instantiate(struct inode *dir,
3068 inode->i_fop = &proc_tgid_base_operations; 3071 inode->i_fop = &proc_tgid_base_operations;
3069 inode->i_flags|=S_IMMUTABLE; 3072 inode->i_flags|=S_IMMUTABLE;
3070 3073
3071 set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, 3074 set_nlink(inode, nlink_tgid);
3072 ARRAY_SIZE(tgid_base_stuff)));
3073 3075
3074 d_set_d_op(dentry, &pid_dentry_operations); 3076 d_set_d_op(dentry, &pid_dentry_operations);
3075 3077
@@ -3361,8 +3363,7 @@ static int proc_task_instantiate(struct inode *dir,
3361 inode->i_fop = &proc_tid_base_operations; 3363 inode->i_fop = &proc_tid_base_operations;
3362 inode->i_flags|=S_IMMUTABLE; 3364 inode->i_flags|=S_IMMUTABLE;
3363 3365
3364 set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, 3366 set_nlink(inode, nlink_tid);
3365 ARRAY_SIZE(tid_base_stuff)));
3366 3367
3367 d_set_d_op(dentry, &pid_dentry_operations); 3368 d_set_d_op(dentry, &pid_dentry_operations);
3368 3369
@@ -3552,3 +3553,9 @@ static const struct file_operations proc_task_operations = {
3552 .iterate_shared = proc_task_readdir, 3553 .iterate_shared = proc_task_readdir,
3553 .llseek = generic_file_llseek, 3554 .llseek = generic_file_llseek,
3554}; 3555};
3556
3557void __init set_proc_pid_nlink(void)
3558{
3559 nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
3560 nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
3561}
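The fs/proc/base.c changes above compute the directory link counts once at boot (set_proc_pid_nlink()) instead of walking the entry table on every instantiation, and the count now starts at 2 to cover "." and "..". A hedged userspace sketch of that computation; the table below is hypothetical, not the real tgid_base_stuff/tid_base_stuff tables.

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/stat.h>

    struct entry {
        const char *name;
        mode_t mode;
    };

    /* Hypothetical stand-in for a pid_entry table. */
    static const struct entry table[] = {
        { "task", S_IFDIR | 0555 },
        { "fd",   S_IFDIR | 0500 },
        { "stat", S_IFREG | 0444 },
    };

    static unsigned int table_nlink(const struct entry *e, unsigned int n)
    {
        unsigned int count = 2;          /* "." and ".." */

        for (unsigned int i = 0; i < n; i++)
            if (S_ISDIR(e[i].mode))
                count++;
        return count;
    }

    int main(void)
    {
        printf("nlink = %u\n",
               table_nlink(table, sizeof(table) / sizeof(table[0])));
        return 0;
    }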
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index e69ebe648a34..783bc19644d1 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde)
138/* pde is locked */ 138/* pde is locked */
139static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) 139static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
140{ 140{
141 /*
142 * close() (proc_reg_release()) can't delete an entry and proceed:
143 * ->release hook needs to be available at the right moment.
144 *
145 * rmmod (remove_proc_entry() et al) can't delete an entry and proceed:
146 * "struct file" needs to be available at the right moment.
147 *
148 * Therefore, first process to enter this function does ->release() and
149 * signals its completion to the other process which does nothing.
150 */
141 if (pdeo->closing) { 151 if (pdeo->closing) {
142 /* somebody else is doing that, just wait */ 152 /* somebody else is doing that, just wait */
143 DECLARE_COMPLETION_ONSTACK(c); 153 DECLARE_COMPLETION_ONSTACK(c);
@@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo)
147 spin_lock(&pde->pde_unload_lock); 157 spin_lock(&pde->pde_unload_lock);
148 } else { 158 } else {
149 struct file *file; 159 struct file *file;
150 pdeo->closing = 1; 160 pdeo->closing = true;
151 spin_unlock(&pde->pde_unload_lock); 161 spin_unlock(&pde->pde_unload_lock);
152 file = pdeo->file; 162 file = pdeo->file;
153 pde->proc_fops->release(file_inode(file), file); 163 pde->proc_fops->release(file_inode(file), file);
154 spin_lock(&pde->pde_unload_lock); 164 spin_lock(&pde->pde_unload_lock);
155 list_del_init(&pdeo->lh); 165 /* After ->release. */
166 list_del(&pdeo->lh);
156 if (pdeo->c) 167 if (pdeo->c)
157 complete(pdeo->c); 168 complete(pdeo->c);
158 kfree(pdeo); 169 kfree(pdeo);
@@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de)
167 if (atomic_add_return(BIAS, &de->in_use) != BIAS) 178 if (atomic_add_return(BIAS, &de->in_use) != BIAS)
168 wait_for_completion(&c); 179 wait_for_completion(&c);
169 180
181 /* ->pde_openers list can't grow from now on. */
182
170 spin_lock(&de->pde_unload_lock); 183 spin_lock(&de->pde_unload_lock);
171 while (!list_empty(&de->pde_openers)) { 184 while (!list_empty(&de->pde_openers)) {
172 struct pde_opener *pdeo; 185 struct pde_opener *pdeo;
@@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file)
312 struct pde_opener *pdeo; 325 struct pde_opener *pdeo;
313 326
314 /* 327 /*
315 * What for, you ask? Well, we can have open, rmmod, remove_proc_entry 328 * Ensure that
316 * sequence. ->release won't be called because ->proc_fops will be 329 * 1) PDE's ->release hook will be called no matter what
317 * cleared. Depending on complexity of ->release, consequences vary. 330 * either normally by close()/->release, or forcefully by
331 * rmmod/remove_proc_entry.
332 *
333 * 2) rmmod isn't blocked by opening file in /proc and sitting on
334 * the descriptor (including "rmmod foo </proc/foo" scenario).
318 * 335 *
319 * We can't wait for mercy when close will be done for real, it's 336 * Save every "struct file" with custom ->release hook.
320 * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release
321 * by hand in remove_proc_entry(). For this, save opener's credentials
322 * for later.
323 */ 337 */
324 pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); 338 pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL);
325 if (!pdeo) 339 if (!pdeo)
326 return -ENOMEM; 340 return -ENOMEM;
327 341
@@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file)
338 if (rv == 0 && release) { 352 if (rv == 0 && release) {
339 /* To know what to release. */ 353 /* To know what to release. */
340 pdeo->file = file; 354 pdeo->file = file;
341 /* Strictly for "too late" ->release in proc_reg_release(). */ 355 pdeo->closing = false;
356 pdeo->c = NULL;
342 spin_lock(&pde->pde_unload_lock); 357 spin_lock(&pde->pde_unload_lock);
343 list_add(&pdeo->lh, &pde->pde_openers); 358 list_add(&pdeo->lh, &pde->pde_openers);
344 spin_unlock(&pde->pde_unload_lock); 359 spin_unlock(&pde->pde_unload_lock);
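The comment added to close_pdeo() above describes a handoff: the first task to arrive marks the opener as closing, drops the lock, runs ->release(), then signals completion, while any later task only waits. A rough userspace sketch of that pattern, with pthread primitives standing in for the kernel's spinlock and completion; this is not the kernel code.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
    static bool closing, released;

    static void do_release(void)
    {
        printf("release() runs exactly once\n");
    }

    static void *closer(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&lock);
        if (closing) {
            while (!released)                    /* somebody else is doing it */
                pthread_cond_wait(&done, &lock);
        } else {
            closing = true;
            pthread_mutex_unlock(&lock);
            do_release();                        /* without the lock held */
            pthread_mutex_lock(&lock);
            released = true;
            pthread_cond_broadcast(&done);
        }
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;

        pthread_create(&a, NULL, closer, NULL);
        pthread_create(&b, NULL, closer, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
    }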
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 5378441ec1b7..bbba5d22aada 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -203,7 +203,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name);
203struct pde_opener { 203struct pde_opener {
204 struct file *file; 204 struct file *file;
205 struct list_head lh; 205 struct list_head lh;
206 int closing; 206 bool closing;
207 struct completion *c; 207 struct completion *c;
208}; 208};
209extern const struct inode_operations proc_link_inode_operations; 209extern const struct inode_operations proc_link_inode_operations;
@@ -211,6 +211,7 @@ extern const struct inode_operations proc_link_inode_operations;
211extern const struct inode_operations proc_pid_link_inode_operations; 211extern const struct inode_operations proc_pid_link_inode_operations;
212 212
213extern void proc_init_inodecache(void); 213extern void proc_init_inodecache(void);
214void set_proc_pid_nlink(void);
214extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); 215extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
215extern int proc_fill_super(struct super_block *, void *data, int flags); 216extern int proc_fill_super(struct super_block *, void *data, int flags);
216extern void proc_entry_rundown(struct proc_dir_entry *); 217extern void proc_entry_rundown(struct proc_dir_entry *);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 8d3e484055a6..4bd0373576b5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -122,6 +122,7 @@ void __init proc_root_init(void)
122 int err; 122 int err;
123 123
124 proc_init_inodecache(); 124 proc_init_inodecache();
125 set_proc_pid_nlink();
125 err = register_filesystem(&proc_fs_type); 126 err = register_filesystem(&proc_fs_type);
126 if (err) 127 if (err)
127 return; 128 return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 35b92d81692f..958f32545064 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1588 1588
1589 } while (pte++, addr += PAGE_SIZE, addr != end); 1589 } while (pte++, addr += PAGE_SIZE, addr != end);
1590 pte_unmap_unlock(orig_pte, ptl); 1590 pte_unmap_unlock(orig_pte, ptl);
1591 cond_resched();
1591 return 0; 1592 return 0;
1592} 1593}
1593#ifdef CONFIG_HUGETLB_PAGE 1594#ifdef CONFIG_HUGETLB_PAGE
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 41b95d82a185..18af2bcefe6a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -652,18 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
652} 652}
653#endif 653#endif
654 654
655#ifndef pmd_move_must_withdraw 655#ifndef arch_needs_pgtable_deposit
656static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, 656#define arch_needs_pgtable_deposit() (false)
657 spinlock_t *old_pmd_ptl)
658{
659 /*
660 * With split pmd lock we also need to move preallocated
661 * PTE page table if new_pmd is on different PMD page table.
662 */
663 return new_pmd_ptl != old_pmd_ptl;
664}
665#endif 657#endif
666
667/* 658/*
668 * This function is meant to be used by sites walking pagetables with 659 * This function is meant to be used by sites walking pagetables with
669 * the mmap_sem hold in read mode to protect against MADV_DONTNEED and 660 * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index c6d667187608..7eed8cf3130a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -107,11 +107,6 @@ struct mmu_gather {
107 struct mmu_gather_batch local; 107 struct mmu_gather_batch local;
108 struct page *__pages[MMU_GATHER_BUNDLE]; 108 struct page *__pages[MMU_GATHER_BUNDLE];
109 unsigned int batch_count; 109 unsigned int batch_count;
110 /*
111 * __tlb_adjust_range will track the new addr here,
112 * that that we can adjust the range after the flush
113 */
114 unsigned long addr;
115 int page_size; 110 int page_size;
116}; 111};
117 112
@@ -125,16 +120,11 @@ extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
125 int page_size); 120 int page_size);
126 121
127static inline void __tlb_adjust_range(struct mmu_gather *tlb, 122static inline void __tlb_adjust_range(struct mmu_gather *tlb,
128 unsigned long address) 123 unsigned long address,
124 unsigned int range_size)
129{ 125{
130 tlb->start = min(tlb->start, address); 126 tlb->start = min(tlb->start, address);
131 tlb->end = max(tlb->end, address + PAGE_SIZE); 127 tlb->end = max(tlb->end, address + range_size);
132 /*
133 * Track the last address with which we adjusted the range. This
134 * will be used later to adjust again after a mmu_flush due to
135 * failed __tlb_remove_page
136 */
137 tlb->addr = address;
138} 128}
139 129
140static inline void __tlb_reset_range(struct mmu_gather *tlb) 130static inline void __tlb_reset_range(struct mmu_gather *tlb)
@@ -150,15 +140,11 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
150static inline void tlb_remove_page_size(struct mmu_gather *tlb, 140static inline void tlb_remove_page_size(struct mmu_gather *tlb,
151 struct page *page, int page_size) 141 struct page *page, int page_size)
152{ 142{
153 if (__tlb_remove_page_size(tlb, page, page_size)) { 143 if (__tlb_remove_page_size(tlb, page, page_size))
154 tlb_flush_mmu(tlb); 144 tlb_flush_mmu(tlb);
155 tlb->page_size = page_size;
156 __tlb_adjust_range(tlb, tlb->addr);
157 __tlb_remove_page_size(tlb, page, page_size);
158 }
159} 145}
160 146
161static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 147static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
162{ 148{
163 return __tlb_remove_page_size(tlb, page, PAGE_SIZE); 149 return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
164} 150}
@@ -172,14 +158,21 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
172 return tlb_remove_page_size(tlb, page, PAGE_SIZE); 158 return tlb_remove_page_size(tlb, page, PAGE_SIZE);
173} 159}
174 160
175static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) 161#ifndef tlb_remove_check_page_size_change
162#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
163static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
164 unsigned int page_size)
176{ 165{
177 /* active->nr should be zero when we call this */ 166 /*
178 VM_BUG_ON_PAGE(tlb->active->nr, page); 167 * We don't care about page size change, just update
179 tlb->page_size = PAGE_SIZE; 168 * mmu_gather page size here so that debug checks
180 __tlb_adjust_range(tlb, tlb->addr); 169 * doesn't throw false warning.
181 return __tlb_remove_page(tlb, page); 170 */
171#ifdef CONFIG_DEBUG_VM
172 tlb->page_size = page_size;
173#endif
182} 174}
175#endif
183 176
184/* 177/*
185 * In the case of tlb vma handling, we can optimise these away in the 178 * In the case of tlb vma handling, we can optimise these away in the
@@ -215,10 +208,16 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa
215 */ 208 */
216#define tlb_remove_tlb_entry(tlb, ptep, address) \ 209#define tlb_remove_tlb_entry(tlb, ptep, address) \
217 do { \ 210 do { \
218 __tlb_adjust_range(tlb, address); \ 211 __tlb_adjust_range(tlb, address, PAGE_SIZE); \
219 __tlb_remove_tlb_entry(tlb, ptep, address); \ 212 __tlb_remove_tlb_entry(tlb, ptep, address); \
220 } while (0) 213 } while (0)
221 214
215#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \
216 do { \
217 __tlb_adjust_range(tlb, address, huge_page_size(h)); \
218 __tlb_remove_tlb_entry(tlb, ptep, address); \
219 } while (0)
220
222/** 221/**
223 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation 222 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
224 * This is a nop so far, because only x86 needs it. 223 * This is a nop so far, because only x86 needs it.
@@ -227,29 +226,47 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa
227#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0) 226#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
228#endif 227#endif
229 228
230#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \ 229#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address) \
231 do { \ 230 do { \
232 __tlb_adjust_range(tlb, address); \ 231 __tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE); \
233 __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \ 232 __tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
234 } while (0) 233 } while (0)
235 234
235/*
236 * For things like page tables caches (ie caching addresses "inside" the
237 * page tables, like x86 does), for legacy reasons, flushing an
238 * individual page had better flush the page table caches behind it. This
239 * is definitely how x86 works, for example. And if you have an
240 * architected non-legacy page table cache (which I'm not aware of
241 * anybody actually doing), you're going to have some architecturally
242 * explicit flushing for that, likely *separate* from a regular TLB entry
243 * flush, and thus you'd need more than just some range expansion.
244 *
245 * So if we ever find an architecture
246 * that would want something that odd, I think it is up to that
247 * architecture to do its own odd thing, not cause pain for others
248 * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
249 *
250 * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
251 */
252
236#define pte_free_tlb(tlb, ptep, address) \ 253#define pte_free_tlb(tlb, ptep, address) \
237 do { \ 254 do { \
238 __tlb_adjust_range(tlb, address); \ 255 __tlb_adjust_range(tlb, address, PAGE_SIZE); \
239 __pte_free_tlb(tlb, ptep, address); \ 256 __pte_free_tlb(tlb, ptep, address); \
240 } while (0) 257 } while (0)
241 258
242#ifndef __ARCH_HAS_4LEVEL_HACK 259#ifndef __ARCH_HAS_4LEVEL_HACK
243#define pud_free_tlb(tlb, pudp, address) \ 260#define pud_free_tlb(tlb, pudp, address) \
244 do { \ 261 do { \
245 __tlb_adjust_range(tlb, address); \ 262 __tlb_adjust_range(tlb, address, PAGE_SIZE); \
246 __pud_free_tlb(tlb, pudp, address); \ 263 __pud_free_tlb(tlb, pudp, address); \
247 } while (0) 264 } while (0)
248#endif 265#endif
249 266
250#define pmd_free_tlb(tlb, pmdp, address) \ 267#define pmd_free_tlb(tlb, pmdp, address) \
251 do { \ 268 do { \
252 __tlb_adjust_range(tlb, address); \ 269 __tlb_adjust_range(tlb, address, PAGE_SIZE); \
253 __pmd_free_tlb(tlb, pmdp, address); \ 270 __pmd_free_tlb(tlb, pmdp, address); \
254 } while (0) 271 } while (0)
255 272
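After the tlb.h rework above, __tlb_adjust_range() takes an explicit range_size, so huge-page and PMD-level removals widen the pending flush range by their real size instead of always PAGE_SIZE. A small sketch of that min/max accumulation follows; the struct gather and constants here are illustrative, not the kernel's mmu_gather.

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    struct gather {
        unsigned long start, end;
    };

    /* Widen [start, end) by the size of the mapping being removed. */
    static void adjust_range(struct gather *g, unsigned long addr,
                             unsigned long size)
    {
        if (addr < g->start)
            g->start = addr;
        if (addr + size > g->end)
            g->end = addr + size;
    }

    int main(void)
    {
        struct gather g = { .start = ~0UL, .end = 0 };

        adjust_range(&g, 0x10000, PAGE_SIZE);    /* one base page */
        adjust_range(&g, 0x200000, 2UL << 20);   /* one 2MiB huge page */

        printf("flush [%#lx, %#lx)\n", g.start, g.end);
        return 0;
    }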
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c357f27d5483..0b5b1af35e5e 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -136,12 +136,13 @@ struct bdi_writeback {
136struct backing_dev_info { 136struct backing_dev_info {
137 struct list_head bdi_list; 137 struct list_head bdi_list;
138 unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ 138 unsigned long ra_pages; /* max readahead in PAGE_SIZE units */
139 unsigned int capabilities; /* Device capabilities */ 139 unsigned long io_pages; /* max allowed IO size */
140 congested_fn *congested_fn; /* Function pointer if device is md/dm */ 140 congested_fn *congested_fn; /* Function pointer if device is md/dm */
141 void *congested_data; /* Pointer to aux data for congested func */ 141 void *congested_data; /* Pointer to aux data for congested func */
142 142
143 char *name; 143 char *name;
144 144
145 unsigned int capabilities; /* Device capabilities */
145 unsigned int min_ratio; 146 unsigned int min_ratio;
146 unsigned int max_ratio, max_prop_frac; 147 unsigned int max_ratio, max_prop_frac;
147 148
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 29f9e774ab76..6f0a91b37f68 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -1,6 +1,9 @@
1#ifndef __CMA_H__ 1#ifndef __CMA_H__
2#define __CMA_H__ 2#define __CMA_H__
3 3
4#include <linux/init.h>
5#include <linux/types.h>
6
4/* 7/*
5 * There is always at least global CMA area and a few optional 8 * There is always at least global CMA area and a few optional
6 * areas configured in kernel .config. 9 * areas configured in kernel .config.
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 928e5ca0caee..0444b1336268 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -21,7 +21,7 @@
21 * clobbered. The issue is as follows: while the inline asm might 21 * clobbered. The issue is as follows: while the inline asm might
22 * access any memory it wants, the compiler could have fit all of 22 * access any memory it wants, the compiler could have fit all of
23 * @ptr into memory registers instead, and since @ptr never escaped 23 * @ptr into memory registers instead, and since @ptr never escaped
24 * from that, it proofed that the inline asm wasn't touching any of 24 * from that, it proved that the inline asm wasn't touching any of
25 * it. This version works well with both compilers, i.e. we're telling 25 * it. This version works well with both compilers, i.e. we're telling
26 * the compiler that the inline asm absolutely may see the contents 26 * the compiler that the inline asm absolutely may see the contents
27 * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495 27 * of @ptr. See also: https://llvm.org/bugs/show_bug.cgi?id=15495
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index e35e6de633b9..1f782aa1d8e6 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -189,6 +189,8 @@ static inline void deferred_split_huge_page(struct page *page) {}
189#define split_huge_pmd(__vma, __pmd, __address) \ 189#define split_huge_pmd(__vma, __pmd, __address) \
190 do { } while (0) 190 do { } while (0)
191 191
192static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
193 unsigned long address, bool freeze, struct page *page) {}
192static inline void split_huge_pmd_address(struct vm_area_struct *vma, 194static inline void split_huge_pmd_address(struct vm_area_struct *vma,
193 unsigned long address, bool freeze, struct page *page) {} 195 unsigned long address, bool freeze, struct page *page) {}
194 196
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index c1c3e63d52c1..4fec8b775895 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -175,7 +175,7 @@ __printf(2, 3)
175struct kthread_worker * 175struct kthread_worker *
176kthread_create_worker(unsigned int flags, const char namefmt[], ...); 176kthread_create_worker(unsigned int flags, const char namefmt[], ...);
177 177
178struct kthread_worker * 178__printf(3, 4) struct kthread_worker *
179kthread_create_worker_on_cpu(int cpu, unsigned int flags, 179kthread_create_worker_on_cpu(int cpu, unsigned int flags,
180 const char namefmt[], ...); 180 const char namefmt[], ...);
181 181
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 5e5b2969d931..5f4d8281832b 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -7,6 +7,7 @@
7 7
8 8
9#include <linux/mmzone.h> 9#include <linux/mmzone.h>
10#include <linux/dax.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/rbtree.h> 12#include <linux/rbtree.h>
12#include <linux/spinlock.h> 13#include <linux/spinlock.h>
@@ -177,6 +178,13 @@ static inline bool vma_migratable(struct vm_area_struct *vma)
177 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 178 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
178 return false; 179 return false;
179 180
181 /*
182 * DAX device mappings require predictable access latency, so avoid
183 * incurring periodic faults.
184 */
185 if (vma_is_dax(vma))
186 return false;
187
180#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION 188#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
181 if (vma->vm_flags & VM_HUGETLB) 189 if (vma->vm_flags & VM_HUGETLB)
182 return false; 190 return false;
diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h
index 4341f32516d8..271b3fdf0070 100644
--- a/include/linux/of_fdt.h
+++ b/include/linux/of_fdt.h
@@ -71,6 +71,7 @@ extern int early_init_dt_scan_chosen_stdout(void);
71extern void early_init_fdt_scan_reserved_mem(void); 71extern void early_init_fdt_scan_reserved_mem(void);
72extern void early_init_fdt_reserve_self(void); 72extern void early_init_fdt_reserve_self(void);
73extern void early_init_dt_add_memory_arch(u64 base, u64 size); 73extern void early_init_dt_add_memory_arch(u64 base, u64 size);
74extern int early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size);
74extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, 75extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size,
75 bool no_map); 76 bool no_map);
76extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align); 77extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align);
diff --git a/include/linux/printk.h b/include/linux/printk.h
index eac1af8502bb..3472cc6b7a60 100644
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -10,6 +10,8 @@
10extern const char linux_banner[]; 10extern const char linux_banner[];
11extern const char linux_proc_banner[]; 11extern const char linux_proc_banner[];
12 12
13#define PRINTK_MAX_SINGLE_HEADER_LEN 2
14
13static inline int printk_get_level(const char *buffer) 15static inline int printk_get_level(const char *buffer)
14{ 16{
15 if (buffer[0] == KERN_SOH_ASCII && buffer[1]) { 17 if (buffer[0] == KERN_SOH_ASCII && buffer[1]) {
@@ -31,6 +33,14 @@ static inline const char *printk_skip_level(const char *buffer)
31 return buffer; 33 return buffer;
32} 34}
33 35
36static inline const char *printk_skip_headers(const char *buffer)
37{
38 while (printk_get_level(buffer))
39 buffer = printk_skip_level(buffer);
40
41 return buffer;
42}
43
34#define CONSOLE_EXT_LOG_MAX 8192 44#define CONSOLE_EXT_LOG_MAX 8192
35 45
36/* printk's without a loglevel use this.. */ 46/* printk's without a loglevel use this.. */
@@ -40,10 +50,15 @@ static inline const char *printk_skip_level(const char *buffer)
40#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ 50#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */
41#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ 51#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */
42#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ 52#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */
43#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */
44#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ 53#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */
45#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ 54#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */
46 55
56/*
57 * Default used to be hard-coded at 7, we're now allowing it to be set from
58 * kernel config.
59 */
60#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
61
47extern int console_printk[]; 62extern int console_printk[];
48 63
49#define console_loglevel (console_printk[0]) 64#define console_loglevel (console_printk[0])
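printk_skip_headers(), added above, strips every leading "<SOH><level>" pair so consumers such as kdb see only the message body. A simplified userspace sketch of that loop; unlike printk_get_level() it does not validate the level character, so treat it as an approximation.

    #include <stdio.h>

    #define SOH '\001'

    /* Skip all leading SOH+level header pairs, as printk_skip_headers()
     * does for real printk records. */
    static const char *skip_headers(const char *s)
    {
        while (s[0] == SOH && s[1] != '\0')
            s += 2;
        return s;
    }

    int main(void)
    {
        const char msg[] = "\001" "6" "\001" "c" "hello\n";

        fputs(skip_headers(msg), stdout);
        return 0;
    }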
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index af3581b8a451..744486057e9e 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -80,14 +80,11 @@ static inline bool radix_tree_is_internal_node(void *ptr)
80#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ 80#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
81 RADIX_TREE_MAP_SHIFT)) 81 RADIX_TREE_MAP_SHIFT))
82 82
83/* Internally used bits of node->count */
84#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1)
85#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1)
86
87struct radix_tree_node { 83struct radix_tree_node {
88 unsigned char shift; /* Bits remaining in each slot */ 84 unsigned char shift; /* Bits remaining in each slot */
89 unsigned char offset; /* Slot offset in parent */ 85 unsigned char offset; /* Slot offset in parent */
90 unsigned int count; 86 unsigned char count; /* Total entry count */
87 unsigned char exceptional; /* Exceptional entry count */
91 union { 88 union {
92 struct { 89 struct {
93 /* Used when ascending tree */ 90 /* Used when ascending tree */
@@ -248,20 +245,6 @@ static inline int radix_tree_exception(void *arg)
248 return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); 245 return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK);
249} 246}
250 247
251/**
252 * radix_tree_replace_slot - replace item in a slot
253 * @pslot: pointer to slot, returned by radix_tree_lookup_slot
254 * @item: new item to store in the slot.
255 *
256 * For use with radix_tree_lookup_slot(). Caller must hold tree write locked
257 * across slot lookup and replacement.
258 */
259static inline void radix_tree_replace_slot(void **pslot, void *item)
260{
261 BUG_ON(radix_tree_is_internal_node(item));
262 rcu_assign_pointer(*pslot, item);
263}
264
265int __radix_tree_create(struct radix_tree_root *root, unsigned long index, 248int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
266 unsigned order, struct radix_tree_node **nodep, 249 unsigned order, struct radix_tree_node **nodep,
267 void ***slotp); 250 void ***slotp);
@@ -276,7 +259,14 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
276 struct radix_tree_node **nodep, void ***slotp); 259 struct radix_tree_node **nodep, void ***slotp);
277void *radix_tree_lookup(struct radix_tree_root *, unsigned long); 260void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
278void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); 261void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
279bool __radix_tree_delete_node(struct radix_tree_root *root, 262typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *);
263void __radix_tree_replace(struct radix_tree_root *root,
264 struct radix_tree_node *node,
265 void **slot, void *item,
266 radix_tree_update_node_t update_node, void *private);
267void radix_tree_replace_slot(struct radix_tree_root *root,
268 void **slot, void *item);
269void __radix_tree_delete_node(struct radix_tree_root *root,
280 struct radix_tree_node *node); 270 struct radix_tree_node *node);
281void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); 271void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
282void *radix_tree_delete(struct radix_tree_root *, unsigned long); 272void *radix_tree_delete(struct radix_tree_root *, unsigned long);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b46bb5620a76..15321fb1df6b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -137,11 +137,19 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
137 * anon_vma helper functions. 137 * anon_vma helper functions.
138 */ 138 */
139void anon_vma_init(void); /* create anon_vma_cachep */ 139void anon_vma_init(void); /* create anon_vma_cachep */
140int anon_vma_prepare(struct vm_area_struct *); 140int __anon_vma_prepare(struct vm_area_struct *);
141void unlink_anon_vmas(struct vm_area_struct *); 141void unlink_anon_vmas(struct vm_area_struct *);
142int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); 142int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
143int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); 143int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
144 144
145static inline int anon_vma_prepare(struct vm_area_struct *vma)
146{
147 if (likely(vma->anon_vma))
148 return 0;
149
150 return __anon_vma_prepare(vma);
151}
152
145static inline void anon_vma_merge(struct vm_area_struct *vma, 153static inline void anon_vma_merge(struct vm_area_struct *vma,
146 struct vm_area_struct *next) 154 struct vm_area_struct *next)
147{ 155{
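anon_vma_prepare() above becomes a static inline fast path: the common "anon_vma already set" case costs a single test, and only the first fault falls through to the out-of-line __anon_vma_prepare(). A generic sketch of that fast-path/slow-path split with hypothetical names, not the kernel code:

    #include <stdio.h>
    #include <stdlib.h>

    struct object {
        void *state;
    };

    static int slow_prepare(struct object *o)    /* rare path, out of line */
    {
        o->state = malloc(64);
        return o->state ? 0 : -1;
    }

    static inline int prepare(struct object *o)  /* hot path, inline check */
    {
        if (o->state)                            /* likely() in the kernel */
            return 0;
        return slow_prepare(o);
    }

    int main(void)
    {
        struct object o = { 0 };

        printf("first call:  %d\n", prepare(&o)); /* takes the slow path */
        printf("second call: %d\n", prepare(&o)); /* pure inline check */
        free(o.state);
        return 0;
    }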
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7551d3e2ab70..0e90f2973719 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -540,7 +540,11 @@ static inline int get_dumpable(struct mm_struct *mm)
540 /* leave room for more dump flags */ 540 /* leave room for more dump flags */
541#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ 541#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
542#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ 542#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
543#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ 543/*
544 * This one-shot flag was dropped because the exe link may need to change
545 * once again on NFS restore
546 */
547//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
544 548
545#define MMF_HAS_UPROBES 19 /* has uprobes */ 549#define MMF_HAS_UPROBES 19 /* has uprobes */
546#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ 550#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index a56523cefb9b..09b212d37f1d 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -246,39 +246,7 @@ struct swap_info_struct {
246void *workingset_eviction(struct address_space *mapping, struct page *page); 246void *workingset_eviction(struct address_space *mapping, struct page *page);
247bool workingset_refault(void *shadow); 247bool workingset_refault(void *shadow);
248void workingset_activation(struct page *page); 248void workingset_activation(struct page *page);
249extern struct list_lru workingset_shadow_nodes; 249void workingset_update_node(struct radix_tree_node *node, void *private);
250
251static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
252{
253 return node->count & RADIX_TREE_COUNT_MASK;
254}
255
256static inline void workingset_node_pages_inc(struct radix_tree_node *node)
257{
258 node->count++;
259}
260
261static inline void workingset_node_pages_dec(struct radix_tree_node *node)
262{
263 VM_WARN_ON_ONCE(!workingset_node_pages(node));
264 node->count--;
265}
266
267static inline unsigned int workingset_node_shadows(struct radix_tree_node *node)
268{
269 return node->count >> RADIX_TREE_COUNT_SHIFT;
270}
271
272static inline void workingset_node_shadows_inc(struct radix_tree_node *node)
273{
274 node->count += 1U << RADIX_TREE_COUNT_SHIFT;
275}
276
277static inline void workingset_node_shadows_dec(struct radix_tree_node *node)
278{
279 VM_WARN_ON_ONCE(!workingset_node_shadows(node));
280 node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
281}
282 250
283/* linux/mm/page_alloc.c */ 251/* linux/mm/page_alloc.c */
284extern unsigned long totalram_pages; 252extern unsigned long totalram_pages;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 3d9d786a943c..d68edffbf142 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -82,6 +82,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
82 const void *caller); 82 const void *caller);
83 83
84extern void vfree(const void *addr); 84extern void vfree(const void *addr);
85extern void vfree_atomic(const void *addr);
85 86
86extern void *vmap(struct page **pages, unsigned int count, 87extern void *vmap(struct page **pages, unsigned int count,
87 unsigned long flags, pgprot_t prot); 88 unsigned long flags, pgprot_t prot);
diff --git a/init/do_mounts.c b/init/do_mounts.c
index dea5de95c2dd..c2de5104aad2 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -588,7 +588,7 @@ void __init prepare_namespace(void)
588 saved_root_name); 588 saved_root_name);
589 while (driver_probe_done() != 0 || 589 while (driver_probe_done() != 0 ||
590 (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0) 590 (ROOT_DEV = name_to_dev_t(saved_root_name)) == 0)
591 msleep(100); 591 msleep(5);
592 async_synchronize_full(); 592 async_synchronize_full();
593 } 593 }
594 594
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index fc1ef736253c..98c9011eac78 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -697,7 +697,7 @@ kdb_printit:
697 * Write to all consoles. 697 * Write to all consoles.
698 */ 698 */
699 retlen = strlen(kdb_buffer); 699 retlen = strlen(kdb_buffer);
700 cp = (char *) printk_skip_level(kdb_buffer); 700 cp = (char *) printk_skip_headers(kdb_buffer);
701 if (!dbg_kdb_mode && kgdb_connected) { 701 if (!dbg_kdb_mode && kgdb_connected) {
702 gdbstub_msg_write(cp, retlen - (cp - kdb_buffer)); 702 gdbstub_msg_write(cp, retlen - (cp - kdb_buffer));
703 } else { 703 } else {
diff --git a/kernel/fork.c b/kernel/fork.c
index 5957cf8b4c4b..7377f414f3ce 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -229,7 +229,7 @@ static inline void free_thread_stack(struct task_struct *tsk)
229 } 229 }
230 local_irq_restore(flags); 230 local_irq_restore(flags);
231 231
232 vfree(tsk->stack); 232 vfree_atomic(tsk->stack);
233 return; 233 return;
234 } 234 }
235#endif 235#endif
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 2b59c82cc3e1..40c07e4fa116 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -106,7 +106,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
106 * complain: 106 * complain:
107 */ 107 */
108 if (sysctl_hung_task_warnings) { 108 if (sysctl_hung_task_warnings) {
109 sysctl_hung_task_warnings--; 109 if (sysctl_hung_task_warnings > 0)
110 sysctl_hung_task_warnings--;
110 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", 111 pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
111 t->comm, t->pid, timeout); 112 t->comm, t->pid, timeout);
112 pr_err(" %s %s %.*s\n", 113 pr_err(" %s %s %.*s\n",
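The hung_task change above only decrements sysctl_hung_task_warnings when it is positive, so a negative setting now means an unlimited warning budget while zero still silences the report. A tiny sketch of those semantics; the variable below mirrors the sysctl but is just a local stand-in.

    #include <stdio.h>

    static int warnings_budget = -1;   /* stands in for sysctl_hung_task_warnings */

    static void maybe_warn(const char *task)
    {
        if (warnings_budget) {
            if (warnings_budget > 0)
                warnings_budget--;     /* only a positive budget is consumed */
            printf("INFO: task %s blocked too long\n", task);
        }
    }

    int main(void)
    {
        maybe_warn("kworker/0:1");
        maybe_warn("kworker/0:1");     /* still printed: budget is unlimited */
        return 0;
    }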
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 956495f0efaf..2318fba86277 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -261,7 +261,8 @@ static void create_kthread(struct kthread_create_info *create)
261 } 261 }
262} 262}
263 263
264static struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), 264static __printf(4, 0)
265struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
265 void *data, int node, 266 void *data, int node,
266 const char namefmt[], 267 const char namefmt[],
267 va_list args) 268 va_list args)
@@ -635,7 +636,7 @@ repeat:
635} 636}
636EXPORT_SYMBOL_GPL(kthread_worker_fn); 637EXPORT_SYMBOL_GPL(kthread_worker_fn);
637 638
638static struct kthread_worker * 639static __printf(3, 0) struct kthread_worker *
639__kthread_create_worker(int cpu, unsigned int flags, 640__kthread_create_worker(int cpu, unsigned int flags,
640 const char namefmt[], va_list args) 641 const char namefmt[], va_list args)
641{ 642{
diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c
index 16bab471c7e2..f011aaef583c 100644
--- a/kernel/printk/nmi.c
+++ b/kernel/printk/nmi.c
@@ -67,7 +67,8 @@ static int vprintk_nmi(const char *fmt, va_list args)
67again: 67again:
68 len = atomic_read(&s->len); 68 len = atomic_read(&s->len);
69 69
70 if (len >= sizeof(s->buffer)) { 70 /* The trailing '\0' is not counted in len. */
71 if (len >= sizeof(s->buffer) - 1) {
71 atomic_inc(&nmi_message_lost); 72 atomic_inc(&nmi_message_lost);
72 return 0; 73 return 0;
73 } 74 }
@@ -79,7 +80,7 @@ again:
79 if (!len) 80 if (!len)
80 smp_rmb(); 81 smp_rmb();
81 82
82 add = vsnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args); 83 add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, args);
83 84
84 /* 85 /*
85 * Do it once again if the buffer has been flushed in the meantime. 86 * Do it once again if the buffer has been flushed in the meantime.
@@ -113,16 +114,51 @@ static void printk_nmi_flush_line(const char *text, int len)
113 114
114} 115}
115 116
116/* 117/* printk part of the temporary buffer line by line */
117 * printk one line from the temporary buffer from @start index until 118static int printk_nmi_flush_buffer(const char *start, size_t len)
118 * and including the @end index.
119 */
120static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
121 int start, int end)
122{ 119{
123 const char *buf = s->buffer + start; 120 const char *c, *end;
121 bool header;
122
123 c = start;
124 end = start + len;
125 header = true;
126
127 /* Print line by line. */
128 while (c < end) {
129 if (*c == '\n') {
130 printk_nmi_flush_line(start, c - start + 1);
131 start = ++c;
132 header = true;
133 continue;
134 }
135
136 /* Handle continuous lines or missing new line. */
137 if ((c + 1 < end) && printk_get_level(c)) {
138 if (header) {
139 c = printk_skip_level(c);
140 continue;
141 }
142
143 printk_nmi_flush_line(start, c - start);
144 start = c++;
145 header = true;
146 continue;
147 }
148
149 header = false;
150 c++;
151 }
124 152
125 printk_nmi_flush_line(buf, (end - start) + 1); 153 /* Check if there was a partial line. Ignore pure header. */
154 if (start < end && !header) {
155 static const char newline[] = KERN_CONT "\n";
156
157 printk_nmi_flush_line(start, end - start);
158 printk_nmi_flush_line(newline, strlen(newline));
159 }
160
161 return len;
126} 162}
127 163
128/* 164/*
@@ -135,8 +171,8 @@ static void __printk_nmi_flush(struct irq_work *work)
135 __RAW_SPIN_LOCK_INITIALIZER(read_lock); 171 __RAW_SPIN_LOCK_INITIALIZER(read_lock);
136 struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work); 172 struct nmi_seq_buf *s = container_of(work, struct nmi_seq_buf, work);
137 unsigned long flags; 173 unsigned long flags;
138 size_t len, size; 174 size_t len;
139 int i, last_i; 175 int i;
140 176
141 /* 177 /*
142 * The lock has two functions. First, one reader has to flush all 178 * The lock has two functions. First, one reader has to flush all
@@ -154,12 +190,14 @@ more:
154 /* 190 /*
155 * This is just a paranoid check that nobody has manipulated 191 * This is just a paranoid check that nobody has manipulated
156 * the buffer an unexpected way. If we printed something then 192 * the buffer an unexpected way. If we printed something then
157 * @len must only increase. 193 * @len must only increase. Also it should never overflow the
194 * buffer size.
158 */ 195 */
159 if (i && i >= len) { 196 if ((i && i >= len) || len > sizeof(s->buffer)) {
160 const char *msg = "printk_nmi_flush: internal error\n"; 197 const char *msg = "printk_nmi_flush: internal error\n";
161 198
162 printk_nmi_flush_line(msg, strlen(msg)); 199 printk_nmi_flush_line(msg, strlen(msg));
200 len = 0;
163 } 201 }
164 202
165 if (!len) 203 if (!len)
@@ -167,22 +205,7 @@ more:
167 205
168 /* Make sure that data has been written up to the @len */ 206 /* Make sure that data has been written up to the @len */
169 smp_rmb(); 207 smp_rmb();
170 208 i += printk_nmi_flush_buffer(s->buffer + i, len - i);
171 size = min(len, sizeof(s->buffer));
172 last_i = i;
173
174 /* Print line by line. */
175 for (; i < size; i++) {
176 if (s->buffer[i] == '\n') {
177 printk_nmi_flush_seq_line(s, last_i, i);
178 last_i = i + 1;
179 }
180 }
181 /* Check if there was a partial line. */
182 if (last_i < size) {
183 printk_nmi_flush_seq_line(s, last_i, size - 1);
184 printk_nmi_flush_line("\n", strlen("\n"));
185 }
186 209
187 /* 210 /*
188 * Check that nothing has got added in the meantime and truncate 211 * Check that nothing has got added in the meantime and truncate
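The rewritten NMI flush above walks the buffer once, emits each complete line, and terminates a trailing partial line with an explicit newline; it also re-parses printk headers per line. A reduced userspace sketch of the line-splitting part only, with header handling deliberately omitted:

    #include <stdio.h>
    #include <string.h>

    static void flush_line(const char *s, size_t len)
    {
        fwrite(s, 1, len, stdout);
    }

    /* Emit complete lines as-is; give a trailing partial line a newline. */
    static size_t flush_buffer(const char *start, size_t len)
    {
        const char *c = start, *end = start + len;

        while (c < end) {
            if (*c == '\n') {
                flush_line(start, c - start + 1);
                start = ++c;
                continue;
            }
            c++;
        }

        if (start < end) {             /* partial line without '\n' */
            flush_line(start, end - start);
            flush_line("\n", 1);
        }
        return len;
    }

    int main(void)
    {
        const char buf[] = "first line\nsecond line\npartial";

        flush_buffer(buf, strlen(buf));
        return 0;
    }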
diff --git a/kernel/sys.c b/kernel/sys.c
index 78c9fb7dd680..9758892a2d09 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1697,16 +1697,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1697 fput(exe_file); 1697 fput(exe_file);
1698 } 1698 }
1699 1699
1700 /*
1701 * The symlink can be changed only once, just to disallow arbitrary
1702 * transitions malicious software might bring in. This means one
1703 * could make a snapshot over all processes running and monitor
1704 * /proc/pid/exe changes to notice unusual activity if needed.
1705 */
1706 err = -EPERM;
1707 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1708 goto exit;
1709
1710 err = 0; 1700 err = 0;
1711 /* set the new file, lockless */ 1701 /* set the new file, lockless */
1712 get_file(exe.file); 1702 get_file(exe.file);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9bb7d825ba14..e40a0715f422 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -15,6 +15,21 @@ config PRINTK_TIME
15 The behavior is also controlled by the kernel command line 15 The behavior is also controlled by the kernel command line
16 parameter printk.time=1. See Documentation/kernel-parameters.txt 16 parameter printk.time=1. See Documentation/kernel-parameters.txt
17 17
18config CONSOLE_LOGLEVEL_DEFAULT
19 int "Default console loglevel (1-15)"
20 range 1 15
21 default "7"
22 help
23 Default loglevel to determine what will be printed on the console.
24
25 Setting a default here is equivalent to passing in loglevel=<x> in
26 the kernel bootargs. loglevel=<x> continues to override whatever
27 value is specified here as well.
28
29 Note: This does not affect the log level of un-prefixed printk()
30 usage in the kernel. That is controlled by the MESSAGE_LOGLEVEL_DEFAULT
31 option.
32
18config MESSAGE_LOGLEVEL_DEFAULT 33config MESSAGE_LOGLEVEL_DEFAULT
19 int "Default message log level (1-7)" 34 int "Default message log level (1-7)"
20 range 1 7 35 range 1 7
@@ -26,6 +41,10 @@ config MESSAGE_LOGLEVEL_DEFAULT
26 that are auditing their logs closely may want to set it to a lower 41 that are auditing their logs closely may want to set it to a lower
27 priority. 42 priority.
28 43
44 Note: This does not affect what message level gets printed on the console
45 by default. To change that, use loglevel=<x> in the kernel bootargs,
46 or pick a different CONSOLE_LOGLEVEL_DEFAULT configuration value.
47
29config BOOT_PRINTK_DELAY 48config BOOT_PRINTK_DELAY
30 bool "Delay each boot printk message by N milliseconds" 49 bool "Delay each boot printk message by N milliseconds"
31 depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY 50 depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
@@ -1986,7 +2005,7 @@ config ARCH_HAS_DEVMEM_IS_ALLOWED
1986 2005
1987config STRICT_DEVMEM 2006config STRICT_DEVMEM
1988 bool "Filter access to /dev/mem" 2007 bool "Filter access to /dev/mem"
1989 depends on MMU 2008 depends on MMU && DEVMEM
1990 depends on ARCH_HAS_DEVMEM_IS_ALLOWED 2009 depends on ARCH_HAS_DEVMEM_IS_ALLOWED
1991 default y if TILE || PPC 2010 default y if TILE || PPC
1992 ---help--- 2011 ---help---
diff --git a/lib/idr.c b/lib/idr.c
index 6098336df267..52d2979a05e8 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -927,6 +927,9 @@ EXPORT_SYMBOL(ida_pre_get);
927 * and go back to the ida_pre_get() call. If the ida is full, it will 927 * and go back to the ida_pre_get() call. If the ida is full, it will
928 * return %-ENOSPC. 928 * return %-ENOSPC.
929 * 929 *
930 * Note that callers must ensure that concurrent access to @ida is not possible.
931 * See ida_simple_get() for a variant which takes care of locking.
932 *
930 * @p_id returns a value in the range @starting_id ... %0x7fffffff. 933 * @p_id returns a value in the range @starting_id ... %0x7fffffff.
931 */ 934 */
932int ida_get_new_above(struct ida *ida, int starting_id, int *p_id) 935int ida_get_new_above(struct ida *ida, int starting_id, int *p_id)
@@ -1073,6 +1076,9 @@ EXPORT_SYMBOL(ida_destroy);
1073 * Allocates an id in the range start <= id < end, or returns -ENOSPC. 1076 * Allocates an id in the range start <= id < end, or returns -ENOSPC.
1074 * On memory allocation failure, returns -ENOMEM. 1077 * On memory allocation failure, returns -ENOMEM.
1075 * 1078 *
1079 * Compared to ida_get_new_above() this function does its own locking, and
1080 * should be used unless there are special requirements.
1081 *
1076 * Use ida_simple_remove() to get rid of an id. 1082 * Use ida_simple_remove() to get rid of an id.
1077 */ 1083 */
1078int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end, 1084int ida_simple_get(struct ida *ida, unsigned int start, unsigned int end,
@@ -1119,6 +1125,11 @@ EXPORT_SYMBOL(ida_simple_get);
1119 * ida_simple_remove - remove an allocated id. 1125 * ida_simple_remove - remove an allocated id.
1120 * @ida: the (initialized) ida. 1126 * @ida: the (initialized) ida.
1121 * @id: the id returned by ida_simple_get. 1127 * @id: the id returned by ida_simple_get.
1128 *
1129 * Use to release an id allocated with ida_simple_get().
1130 *
1131 * Compared to ida_remove() this function does its own locking, and should be
1132 * used unless there are special requirements.
1122 */ 1133 */
1123void ida_simple_remove(struct ida *ida, unsigned int id) 1134void ida_simple_remove(struct ida *ida, unsigned int id)
1124{ 1135{
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 4b8bb3618b83..2e8c6f7aa56e 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
220{ 220{
221 unsigned long i; 221 unsigned long i;
222 222
223 pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", 223 pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n",
224 node, node->offset, 224 node, node->offset,
225 node->tags[0][0], node->tags[1][0], node->tags[2][0], 225 node->tags[0][0], node->tags[1][0], node->tags[2][0],
226 node->shift, node->count, node->parent); 226 node->shift, node->count, node->exceptional, node->parent);
227 227
228 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { 228 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
229 unsigned long first = index | (i << node->shift); 229 unsigned long first = index | (i << node->shift);
@@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
325 tag_clear(node, i, 0); 325 tag_clear(node, i, 0);
326 326
327 node->slots[0] = NULL; 327 node->slots[0] = NULL;
328 node->count = 0;
329 328
330 kmem_cache_free(radix_tree_node_cachep, node); 329 kmem_cache_free(radix_tree_node_cachep, node);
331} 330}
@@ -522,8 +521,13 @@ static int radix_tree_extend(struct radix_tree_root *root,
522 node->offset = 0; 521 node->offset = 0;
523 node->count = 1; 522 node->count = 1;
524 node->parent = NULL; 523 node->parent = NULL;
525 if (radix_tree_is_internal_node(slot)) 524 if (radix_tree_is_internal_node(slot)) {
526 entry_to_node(slot)->parent = node; 525 entry_to_node(slot)->parent = node;
526 } else {
527 /* Moving an exceptional root->rnode to a node */
528 if (radix_tree_exceptional_entry(slot))
529 node->exceptional = 1;
530 }
527 node->slots[0] = slot; 531 node->slots[0] = slot;
528 slot = node_to_entry(node); 532 slot = node_to_entry(node);
529 rcu_assign_pointer(root->rnode, slot); 533 rcu_assign_pointer(root->rnode, slot);
@@ -534,6 +538,104 @@ out:
534} 538}
535 539
536/** 540/**
541 * radix_tree_shrink - shrink radix tree to minimum height
542 * @root: radix tree root
543 */
544static inline void radix_tree_shrink(struct radix_tree_root *root,
545 radix_tree_update_node_t update_node,
546 void *private)
547{
548 for (;;) {
549 struct radix_tree_node *node = root->rnode;
550 struct radix_tree_node *child;
551
552 if (!radix_tree_is_internal_node(node))
553 break;
554 node = entry_to_node(node);
555
556 /*
557 * If the candidate node has more than one child, or its child
558 * is not at the leftmost slot, or the child is a multiorder
559 * entry, we cannot shrink.
560 */
561 if (node->count != 1)
562 break;
563 child = node->slots[0];
564 if (!child)
565 break;
566 if (!radix_tree_is_internal_node(child) && node->shift)
567 break;
568
569 if (radix_tree_is_internal_node(child))
570 entry_to_node(child)->parent = NULL;
571
572 /*
573 * We don't need rcu_assign_pointer(), since we are simply
574 * moving the node from one part of the tree to another: if it
575 * was safe to dereference the old pointer to it
576 * (node->slots[0]), it will be safe to dereference the new
577 * one (root->rnode) as far as dependent read barriers go.
578 */
579 root->rnode = child;
580
581 /*
582 * We have a dilemma here. The node's slot[0] must not be
583 * NULLed in case there are concurrent lookups expecting to
584 * find the item. However if this was a bottom-level node,
585 * then it may be subject to the slot pointer being visible
586 * to callers dereferencing it. If the item corresponding to
587 * slot[0] is subsequently deleted, these callers would expect
588 * their slot to become empty sooner or later.
589 *
590 * For example, lockless pagecache will look up a slot, deref
591 * the page pointer, and if the page has 0 refcount it means it
592 * was concurrently deleted from pagecache so try the deref
593 * again. Fortunately there is already a requirement for logic
594 * to retry the entire slot lookup -- the indirect pointer
595 * problem (replacing direct root node with an indirect pointer
596 * also results in a stale slot). So tag the slot as indirect
597 * to force callers to retry.
598 */
599 node->count = 0;
600 if (!radix_tree_is_internal_node(child)) {
601 node->slots[0] = RADIX_TREE_RETRY;
602 if (update_node)
603 update_node(node, private);
604 }
605
606 radix_tree_node_free(node);
607 }
608}
609
610static void delete_node(struct radix_tree_root *root,
611 struct radix_tree_node *node,
612 radix_tree_update_node_t update_node, void *private)
613{
614 do {
615 struct radix_tree_node *parent;
616
617 if (node->count) {
618 if (node == entry_to_node(root->rnode))
619 radix_tree_shrink(root, update_node, private);
620 return;
621 }
622
623 parent = node->parent;
624 if (parent) {
625 parent->slots[node->offset] = NULL;
626 parent->count--;
627 } else {
628 root_tag_clear_all(root);
629 root->rnode = NULL;
630 }
631
632 radix_tree_node_free(node);
633
634 node = parent;
635 } while (node);
636}
637
638/**
537 * __radix_tree_create - create a slot in a radix tree 639 * __radix_tree_create - create a slot in a radix tree
538 * @root: radix tree root 640 * @root: radix tree root
539 * @index: index key 641 * @index: index key
@@ -649,6 +751,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
649 if (node) { 751 if (node) {
650 unsigned offset = get_slot_offset(node, slot); 752 unsigned offset = get_slot_offset(node, slot);
651 node->count++; 753 node->count++;
754 if (radix_tree_exceptional_entry(item))
755 node->exceptional++;
652 BUG_ON(tag_get(node, 0, offset)); 756 BUG_ON(tag_get(node, 0, offset));
653 BUG_ON(tag_get(node, 1, offset)); 757 BUG_ON(tag_get(node, 1, offset));
654 BUG_ON(tag_get(node, 2, offset)); 758 BUG_ON(tag_get(node, 2, offset));
@@ -746,6 +850,85 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
746} 850}
747EXPORT_SYMBOL(radix_tree_lookup); 851EXPORT_SYMBOL(radix_tree_lookup);
748 852
853static void replace_slot(struct radix_tree_root *root,
854 struct radix_tree_node *node,
855 void **slot, void *item,
856 bool warn_typeswitch)
857{
858 void *old = rcu_dereference_raw(*slot);
859 int count, exceptional;
860
861 WARN_ON_ONCE(radix_tree_is_internal_node(item));
862
863 count = !!item - !!old;
864 exceptional = !!radix_tree_exceptional_entry(item) -
865 !!radix_tree_exceptional_entry(old);
866
867 WARN_ON_ONCE(warn_typeswitch && (count || exceptional));
868
869 if (node) {
870 node->count += count;
871 node->exceptional += exceptional;
872 }
873
874 rcu_assign_pointer(*slot, item);
875}
876
877/**
878 * __radix_tree_replace - replace item in a slot
879 * @root: radix tree root
880 * @node: pointer to tree node
881 * @slot: pointer to slot in @node
882 * @item: new item to store in the slot.
883 * @update_node: callback for changing leaf nodes
884 * @private: private data to pass to @update_node
885 *
886	 * For use with __radix_tree_lookup(). The caller must hold the tree
887	 * write-locked across slot lookup and replacement.
888 */
889void __radix_tree_replace(struct radix_tree_root *root,
890 struct radix_tree_node *node,
891 void **slot, void *item,
892 radix_tree_update_node_t update_node, void *private)
893{
894 /*
895 * This function supports replacing exceptional entries and
896 * deleting entries, but that needs accounting against the
897 * node unless the slot is root->rnode.
898 */
899 replace_slot(root, node, slot, item,
900 !node && slot != (void **)&root->rnode);
901
902 if (!node)
903 return;
904
905 if (update_node)
906 update_node(node, private);
907
908 delete_node(root, node, update_node, private);
909}
910
911/**
912 * radix_tree_replace_slot - replace item in a slot
913 * @root: radix tree root
914 * @slot: pointer to slot
915 * @item: new item to store in the slot.
916 *
917 * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(),
918	 * radix_tree_gang_lookup_tag_slot(). The caller must hold the tree
919	 * write-locked across slot lookup and replacement.
920 *
921 * NOTE: This cannot be used to switch between non-entries (empty slots),
922 * regular entries, and exceptional entries, as that requires accounting
923	 * inside the radix tree node. When switching between entry types or
924	 * deleting entries, use __radix_tree_lookup() and __radix_tree_replace().
925 */
926void radix_tree_replace_slot(struct radix_tree_root *root,
927 void **slot, void *item)
928{
929 replace_slot(root, NULL, slot, item, true);
930}
931
749/** 932/**
750 * radix_tree_tag_set - set a tag on a radix tree node 933 * radix_tree_tag_set - set a tag on a radix tree node
751 * @root: radix tree root 934 * @root: radix tree root
@@ -1394,75 +1577,6 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
1394#endif /* CONFIG_SHMEM && CONFIG_SWAP */ 1577#endif /* CONFIG_SHMEM && CONFIG_SWAP */
1395 1578
1396/** 1579/**
1397 * radix_tree_shrink - shrink radix tree to minimum height
1398 * @root radix tree root
1399 */
1400static inline bool radix_tree_shrink(struct radix_tree_root *root)
1401{
1402 bool shrunk = false;
1403
1404 for (;;) {
1405 struct radix_tree_node *node = root->rnode;
1406 struct radix_tree_node *child;
1407
1408 if (!radix_tree_is_internal_node(node))
1409 break;
1410 node = entry_to_node(node);
1411
1412 /*
1413 * The candidate node has more than one child, or its child
1414 * is not at the leftmost slot, or the child is a multiorder
1415 * entry, we cannot shrink.
1416 */
1417 if (node->count != 1)
1418 break;
1419 child = node->slots[0];
1420 if (!child)
1421 break;
1422 if (!radix_tree_is_internal_node(child) && node->shift)
1423 break;
1424
1425 if (radix_tree_is_internal_node(child))
1426 entry_to_node(child)->parent = NULL;
1427
1428 /*
1429 * We don't need rcu_assign_pointer(), since we are simply
1430 * moving the node from one part of the tree to another: if it
1431 * was safe to dereference the old pointer to it
1432 * (node->slots[0]), it will be safe to dereference the new
1433 * one (root->rnode) as far as dependent read barriers go.
1434 */
1435 root->rnode = child;
1436
1437 /*
1438 * We have a dilemma here. The node's slot[0] must not be
1439 * NULLed in case there are concurrent lookups expecting to
1440 * find the item. However if this was a bottom-level node,
1441 * then it may be subject to the slot pointer being visible
1442 * to callers dereferencing it. If item corresponding to
1443 * slot[0] is subsequently deleted, these callers would expect
1444 * their slot to become empty sooner or later.
1445 *
1446 * For example, lockless pagecache will look up a slot, deref
1447 * the page pointer, and if the page has 0 refcount it means it
1448 * was concurrently deleted from pagecache so try the deref
1449 * again. Fortunately there is already a requirement for logic
1450 * to retry the entire slot lookup -- the indirect pointer
1451 * problem (replacing direct root node with an indirect pointer
1452 * also results in a stale slot). So tag the slot as indirect
1453 * to force callers to retry.
1454 */
1455 if (!radix_tree_is_internal_node(child))
1456 node->slots[0] = RADIX_TREE_RETRY;
1457
1458 radix_tree_node_free(node);
1459 shrunk = true;
1460 }
1461
1462 return shrunk;
1463}
1464
1465/**
1466 * __radix_tree_delete_node - try to free node after clearing a slot 1580 * __radix_tree_delete_node - try to free node after clearing a slot
1467 * @root: radix tree root 1581 * @root: radix tree root
1468 * @node: node containing @index 1582 * @node: node containing @index
@@ -1470,39 +1584,11 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root)
1470 * After clearing the slot at @index in @node from radix tree 1584 * After clearing the slot at @index in @node from radix tree
1471 * rooted at @root, call this function to attempt freeing the 1585 * rooted at @root, call this function to attempt freeing the
1472 * node and shrinking the tree. 1586 * node and shrinking the tree.
1473 *
1474 * Returns %true if @node was freed, %false otherwise.
1475 */ 1587 */
1476bool __radix_tree_delete_node(struct radix_tree_root *root, 1588void __radix_tree_delete_node(struct radix_tree_root *root,
1477 struct radix_tree_node *node) 1589 struct radix_tree_node *node)
1478{ 1590{
1479 bool deleted = false; 1591 delete_node(root, node, NULL, NULL);
1480
1481 do {
1482 struct radix_tree_node *parent;
1483
1484 if (node->count) {
1485 if (node == entry_to_node(root->rnode))
1486 deleted |= radix_tree_shrink(root);
1487 return deleted;
1488 }
1489
1490 parent = node->parent;
1491 if (parent) {
1492 parent->slots[node->offset] = NULL;
1493 parent->count--;
1494 } else {
1495 root_tag_clear_all(root);
1496 root->rnode = NULL;
1497 }
1498
1499 radix_tree_node_free(node);
1500 deleted = true;
1501
1502 node = parent;
1503 } while (node);
1504
1505 return deleted;
1506} 1592}
1507 1593
1508static inline void delete_sibling_entries(struct radix_tree_node *node, 1594static inline void delete_sibling_entries(struct radix_tree_node *node,
@@ -1559,10 +1645,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
1559 node_tag_clear(root, node, tag, offset); 1645 node_tag_clear(root, node, tag, offset);
1560 1646
1561 delete_sibling_entries(node, node_to_entry(slot), offset); 1647 delete_sibling_entries(node, node_to_entry(slot), offset);
1562 node->slots[offset] = NULL; 1648 __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
1563 node->count--;
1564
1565 __radix_tree_delete_node(root, node);
1566 1649
1567 return entry; 1650 return entry;
1568} 1651}
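
The radix-tree hunks above add exceptional-entry accounting and route deletion through __radix_tree_replace(). Below is a minimal sketch of how a caller is expected to use the __radix_tree_lookup()/__radix_tree_replace() pair, under the tree lock, to swap a tracked entry for a shadow entry. The names my_update_node() and replace_with_shadow() are illustrative placeholders, not part of this patch.

#include <linux/radix-tree.h>

/* Hypothetical per-node callback; invoked for leaf nodes on replacement. */
static void my_update_node(struct radix_tree_node *node, void *private)
{
	/* e.g. move the node on or off a shadow-node LRU list */
}

/* Caller must hold the tree write-locked, as the kerneldoc above requires. */
static void replace_with_shadow(struct radix_tree_root *root,
				unsigned long index, void *shadow)
{
	struct radix_tree_node *node;
	void **slot;

	if (!__radix_tree_lookup(root, index, &node, &slot))
		return;
	/* node->count and node->exceptional are adjusted inside */
	__radix_tree_replace(root, node, slot, shadow, my_update_node, NULL);
}
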
diff --git a/lib/rbtree.c b/lib/rbtree.c
index eb8a19fee110..1f8b112a7c35 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -296,11 +296,26 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
296 * 296 *
297 * (p) (p) 297 * (p) (p)
298 * / \ / \ 298 * / \ / \
299 * N S --> N Sl 299 * N S --> N sl
300 * / \ \ 300 * / \ \
301 * sl Sr s 301 * sl Sr S
302 * \ 302 * \
303 * Sr 303 * Sr
304 *
305 * Note: p might be red, and then both
306	 * p and sl are red after the rotation (which
307	 * breaks property 4). This is fixed in
308	 * Case 4 (in __rb_rotate_set_parents(),
309	 * which sets sl to the color of p
310	 * and sets p to RB_BLACK)
311 *
312 * (p) (sl)
313 * / \ / \
314 * N sl --> P S
315 * \ / \
316 * S N Sr
317 * \
318 * Sr
304 */ 319 */
305 tmp1 = tmp2->rb_right; 320 tmp1 = tmp2->rb_right;
306 WRITE_ONCE(sibling->rb_left, tmp1); 321 WRITE_ONCE(sibling->rb_left, tmp1);
@@ -365,7 +380,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
365 } 380 }
366 break; 381 break;
367 } 382 }
368 /* Case 3 - right rotate at sibling */ 383 /* Case 3 - left rotate at sibling */
369 tmp1 = tmp2->rb_left; 384 tmp1 = tmp2->rb_left;
370 WRITE_ONCE(sibling->rb_right, tmp1); 385 WRITE_ONCE(sibling->rb_right, tmp1);
371 WRITE_ONCE(tmp2->rb_left, sibling); 386 WRITE_ONCE(tmp2->rb_left, sibling);
@@ -377,7 +392,7 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
377 tmp1 = sibling; 392 tmp1 = sibling;
378 sibling = tmp2; 393 sibling = tmp2;
379 } 394 }
380 /* Case 4 - left rotate at parent + color flips */ 395 /* Case 4 - right rotate at parent + color flips */
381 tmp2 = sibling->rb_right; 396 tmp2 = sibling->rb_right;
382 WRITE_ONCE(parent->rb_left, tmp2); 397 WRITE_ONCE(parent->rb_left, tmp2);
383 WRITE_ONCE(sibling->rb_right, parent); 398 WRITE_ONCE(sibling->rb_right, parent);
diff --git a/mm/Kconfig b/mm/Kconfig
index 86e3e0e74d20..9b8fccb969dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -153,7 +153,7 @@ config MOVABLE_NODE
153 bool "Enable to assign a node which has only movable memory" 153 bool "Enable to assign a node which has only movable memory"
154 depends on HAVE_MEMBLOCK 154 depends on HAVE_MEMBLOCK
155 depends on NO_BOOTMEM 155 depends on NO_BOOTMEM
156 depends on X86_64 156 depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG
157 depends on NUMA 157 depends on NUMA
158 default n 158 default n
159 help 159 help
@@ -447,13 +447,9 @@ choice
447 benefit. 447 benefit.
448endchoice 448endchoice
449 449
450#
451# We don't deposit page tables on file THP mapping,
452# but Power makes use of them to address MMU quirk.
453#
454config TRANSPARENT_HUGE_PAGECACHE 450config TRANSPARENT_HUGE_PAGECACHE
455 def_bool y 451 def_bool y
456 depends on TRANSPARENT_HUGEPAGE && !PPC 452 depends on TRANSPARENT_HUGEPAGE
457 453
458# 454#
459# UP and nommu archs use km based percpu allocator 455# UP and nommu archs use km based percpu allocator
diff --git a/mm/compaction.c b/mm/compaction.c
index 0d37192d9423..223464227299 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc,
634 return pfn; 634 return pfn;
635} 635}
636 636
637/* Update the number of anon and file isolated pages in the zone */
638static void acct_isolated(struct zone *zone, struct compact_control *cc)
639{
640 struct page *page;
641 unsigned int count[2] = { 0, };
642
643 if (list_empty(&cc->migratepages))
644 return;
645
646 list_for_each_entry(page, &cc->migratepages, lru)
647 count[!!page_is_file_cache(page)]++;
648
649 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]);
650 mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]);
651}
652
653/* Similar to reclaim, but different enough that they don't share logic */ 637/* Similar to reclaim, but different enough that they don't share logic */
654static bool too_many_isolated(struct zone *zone) 638static bool too_many_isolated(struct zone *zone)
655{ 639{
@@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
866 850
867 /* Successfully isolated */ 851 /* Successfully isolated */
868 del_page_from_lru_list(page, lruvec, page_lru(page)); 852 del_page_from_lru_list(page, lruvec, page_lru(page));
853 inc_node_page_state(page,
854 NR_ISOLATED_ANON + page_is_file_cache(page));
869 855
870isolate_success: 856isolate_success:
871 list_add(&page->lru, &cc->migratepages); 857 list_add(&page->lru, &cc->migratepages);
@@ -902,7 +888,6 @@ isolate_fail:
902 spin_unlock_irqrestore(zone_lru_lock(zone), flags); 888 spin_unlock_irqrestore(zone_lru_lock(zone), flags);
903 locked = false; 889 locked = false;
904 } 890 }
905 acct_isolated(zone, cc);
906 putback_movable_pages(&cc->migratepages); 891 putback_movable_pages(&cc->migratepages);
907 cc->nr_migratepages = 0; 892 cc->nr_migratepages = 0;
908 cc->last_migrated_pfn = 0; 893 cc->last_migrated_pfn = 0;
@@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
988 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 973 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
989 break; 974 break;
990 } 975 }
991 acct_isolated(cc->zone, cc);
992 976
993 return pfn; 977 return pfn;
994} 978}
@@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1258 low_pfn = isolate_migratepages_block(cc, low_pfn, 1242 low_pfn = isolate_migratepages_block(cc, low_pfn,
1259 block_end_pfn, isolate_mode); 1243 block_end_pfn, isolate_mode);
1260 1244
1261 if (!low_pfn || cc->contended) { 1245 if (!low_pfn || cc->contended)
1262 acct_isolated(zone, cc);
1263 return ISOLATE_ABORT; 1246 return ISOLATE_ABORT;
1264 }
1265 1247
1266 /* 1248 /*
1267 * Either we isolated something and proceed with migration. Or 1249 * Either we isolated something and proceed with migration. Or
@@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1271 break; 1253 break;
1272 } 1254 }
1273 1255
1274 acct_isolated(zone, cc);
1275 /* Record where migration scanner will be restarted. */ 1256 /* Record where migration scanner will be restarted. */
1276 cc->migrate_pfn = low_pfn; 1257 cc->migrate_pfn = low_pfn;
1277 1258
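
With acct_isolated() removed, the isolated-page counters are bumped one page at a time, right where the page is taken off the LRU. The idiom in the hunk above relies on NR_ISOLATED_FILE sitting directly after NR_ISOLATED_ANON in the node stat enum; a tiny sketch of that indexing trick (the helper name is made up):

/* Sketch: page_is_file_cache() is 0 for anon, 1 for file-backed pages. */
static void account_isolated_page(struct page *page)
{
	inc_node_page_state(page,
			    NR_ISOLATED_ANON + page_is_file_cache(page));
}
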
diff --git a/mm/debug.c b/mm/debug.c
index 9feb699c5d25..db1cd26d8752 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason)
59 59
60 pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); 60 pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
61 61
62 print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
63 sizeof(unsigned long), page,
64 sizeof(struct page), false);
65
62 if (reason) 66 if (reason)
63 pr_alert("page dumped because: %s\n", reason); 67 pr_alert("page dumped because: %s\n", reason);
64 68
diff --git a/mm/filemap.c b/mm/filemap.c
index 50b52fe51937..5b4dd03130da 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -132,44 +132,29 @@ static int page_cache_tree_insert(struct address_space *mapping,
132 if (!dax_mapping(mapping)) { 132 if (!dax_mapping(mapping)) {
133 if (shadowp) 133 if (shadowp)
134 *shadowp = p; 134 *shadowp = p;
135 if (node)
136 workingset_node_shadows_dec(node);
137 } else { 135 } else {
138 /* DAX can replace empty locked entry with a hole */ 136 /* DAX can replace empty locked entry with a hole */
139 WARN_ON_ONCE(p != 137 WARN_ON_ONCE(p !=
140 (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | 138 (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
141 RADIX_DAX_ENTRY_LOCK)); 139 RADIX_DAX_ENTRY_LOCK));
142 /* DAX accounts exceptional entries as normal pages */
143 if (node)
144 workingset_node_pages_dec(node);
145 /* Wakeup waiters for exceptional entry lock */ 140 /* Wakeup waiters for exceptional entry lock */
146 dax_wake_mapping_entry_waiter(mapping, page->index, 141 dax_wake_mapping_entry_waiter(mapping, page->index,
147 false); 142 false);
148 } 143 }
149 } 144 }
150 radix_tree_replace_slot(slot, page); 145 __radix_tree_replace(&mapping->page_tree, node, slot, page,
146 workingset_update_node, mapping);
151 mapping->nrpages++; 147 mapping->nrpages++;
152 if (node) {
153 workingset_node_pages_inc(node);
154 /*
155 * Don't track node that contains actual pages.
156 *
157 * Avoid acquiring the list_lru lock if already
158 * untracked. The list_empty() test is safe as
159 * node->private_list is protected by
160 * mapping->tree_lock.
161 */
162 if (!list_empty(&node->private_list))
163 list_lru_del(&workingset_shadow_nodes,
164 &node->private_list);
165 }
166 return 0; 148 return 0;
167} 149}
168 150
169static void page_cache_tree_delete(struct address_space *mapping, 151static void page_cache_tree_delete(struct address_space *mapping,
170 struct page *page, void *shadow) 152 struct page *page, void *shadow)
171{ 153{
172 int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); 154 int i, nr;
155
156 /* hugetlb pages are represented by one entry in the radix tree */
157 nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
173 158
174 VM_BUG_ON_PAGE(!PageLocked(page), page); 159 VM_BUG_ON_PAGE(!PageLocked(page), page);
175 VM_BUG_ON_PAGE(PageTail(page), page); 160 VM_BUG_ON_PAGE(PageTail(page), page);
@@ -182,44 +167,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
182 __radix_tree_lookup(&mapping->page_tree, page->index + i, 167 __radix_tree_lookup(&mapping->page_tree, page->index + i,
183 &node, &slot); 168 &node, &slot);
184 169
185 radix_tree_clear_tags(&mapping->page_tree, node, slot); 170 VM_BUG_ON_PAGE(!node && nr != 1, page);
186
187 if (!node) {
188 VM_BUG_ON_PAGE(nr != 1, page);
189 /*
190 * We need a node to properly account shadow
191 * entries. Don't plant any without. XXX
192 */
193 shadow = NULL;
194 }
195
196 radix_tree_replace_slot(slot, shadow);
197 171
198 if (!node) 172 radix_tree_clear_tags(&mapping->page_tree, node, slot);
199 break; 173 __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
200 174 workingset_update_node, mapping);
201 workingset_node_pages_dec(node);
202 if (shadow)
203 workingset_node_shadows_inc(node);
204 else
205 if (__radix_tree_delete_node(&mapping->page_tree, node))
206 continue;
207
208 /*
209 * Track node that only contains shadow entries. DAX mappings
210 * contain no shadow entries and may contain other exceptional
211 * entries so skip those.
212 *
213 * Avoid acquiring the list_lru lock if already tracked.
214 * The list_empty() test is safe as node->private_list is
215 * protected by mapping->tree_lock.
216 */
217 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
218 list_empty(&node->private_list)) {
219 node->private_data = mapping;
220 list_lru_add(&workingset_shadow_nodes,
221 &node->private_list);
222 }
223 } 175 }
224 176
225 if (shadow) { 177 if (shadow) {
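
The filemap hunks above fold the open-coded shadow-node tracking into a single callback passed to __radix_tree_replace(). The sketch below shows what such a radix_tree_update_node_t callback might look like, mirroring the logic the hunks remove; the real implementation is workingset_update_node() in mm/workingset.c and is not shown in this diff, so treat the body as an assumption.

/* Illustrative only; runs under mapping->tree_lock at the call sites above. */
static void example_update_node(struct radix_tree_node *node, void *private)
{
	struct address_space *mapping = private;

	if (node->count == node->exceptional) {
		/* only shadow entries left: start tracking the node */
		if (list_empty(&node->private_list)) {
			node->private_data = mapping;
			list_lru_add(&workingset_shadow_nodes,
				     &node->private_list);
		}
	} else if (!list_empty(&node->private_list)) {
		/* real pages present again: stop tracking */
		list_lru_del(&workingset_shadow_nodes,
			     &node->private_list);
	}
}
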
diff --git a/mm/gup.c b/mm/gup.c
index ec4f82704b6f..e50178c58b97 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -632,7 +632,8 @@ next_page:
632 return i; 632 return i;
633} 633}
634 634
635bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) 635static bool vma_permits_fault(struct vm_area_struct *vma,
636 unsigned int fault_flags)
636{ 637{
637 bool write = !!(fault_flags & FAULT_FLAG_WRITE); 638 bool write = !!(fault_flags & FAULT_FLAG_WRITE);
638 bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); 639 bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
@@ -857,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
857EXPORT_SYMBOL(get_user_pages_locked); 858EXPORT_SYMBOL(get_user_pages_locked);
858 859
859/* 860/*
860 * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to 861 * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for
861 * pass additional gup_flags as last parameter (like FOLL_HWPOISON). 862 * tsk, mm to be specified.
862 * 863 *
863 * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the 864 * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
864 * caller if required (just like with __get_user_pages). "FOLL_GET", 865 * caller if required (just like with __get_user_pages). "FOLL_GET"
865 * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed 866 * is set implicitly if "pages" is non-NULL.
866 * according to the parameters "pages", "write", "force"
867 * respectively.
868 */ 867 */
869__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 868__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
870 unsigned long start, unsigned long nr_pages, 869 unsigned long start, unsigned long nr_pages,
@@ -894,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked);
894 * get_user_pages_unlocked(tsk, mm, ..., pages); 893 * get_user_pages_unlocked(tsk, mm, ..., pages);
895 * 894 *
896 * It is functionally equivalent to get_user_pages_fast so 895 * It is functionally equivalent to get_user_pages_fast so
897 * get_user_pages_fast should be used instead, if the two parameters 896 * get_user_pages_fast should be used instead if specific gup_flags
898 * "tsk" and "mm" are respectively equal to current and current->mm, 897 * (e.g. FOLL_FORCE) are not required.
899 * or if "force" shall be set to 1 (get_user_pages_fast misses the
900 * "force" parameter).
901 */ 898 */
902long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 899long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
903 struct page **pages, unsigned int gup_flags) 900 struct page **pages, unsigned int gup_flags)
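
Per the reworded comment above, get_user_pages_unlocked() is only worth calling directly when specific gup_flags such as FOLL_FORCE are needed; otherwise get_user_pages_fast() is preferred. A minimal, hedged sketch of such a caller (the function name and its parameters are placeholders):

#include <linux/mm.h>

/* Pin nr user pages for writing, forcing the mapping writable if necessary. */
static long pin_user_buffer(unsigned long uaddr, unsigned long nr,
			    struct page **pages)
{
	return get_user_pages_unlocked(uaddr, nr, pages,
				       FOLL_WRITE | FOLL_FORCE);
}
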
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f8e35cc66d32..cee42cf05477 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
285} 285}
286static struct kobj_attribute use_zero_page_attr = 286static struct kobj_attribute use_zero_page_attr =
287 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); 287 __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
288
289static ssize_t hpage_pmd_size_show(struct kobject *kobj,
290 struct kobj_attribute *attr, char *buf)
291{
292 return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
293}
294static struct kobj_attribute hpage_pmd_size_attr =
295 __ATTR_RO(hpage_pmd_size);
296
288#ifdef CONFIG_DEBUG_VM 297#ifdef CONFIG_DEBUG_VM
289static ssize_t debug_cow_show(struct kobject *kobj, 298static ssize_t debug_cow_show(struct kobject *kobj,
290 struct kobj_attribute *attr, char *buf) 299 struct kobj_attribute *attr, char *buf)
@@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = {
307 &enabled_attr.attr, 316 &enabled_attr.attr,
308 &defrag_attr.attr, 317 &defrag_attr.attr,
309 &use_zero_page_attr.attr, 318 &use_zero_page_attr.attr,
319 &hpage_pmd_size_attr.attr,
310#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) 320#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
311 &shmem_enabled_attr.attr, 321 &shmem_enabled_attr.attr,
312#endif 322#endif
@@ -1323,6 +1333,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1323 struct mm_struct *mm = tlb->mm; 1333 struct mm_struct *mm = tlb->mm;
1324 bool ret = false; 1334 bool ret = false;
1325 1335
1336 tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
1337
1326 ptl = pmd_trans_huge_lock(pmd, vma); 1338 ptl = pmd_trans_huge_lock(pmd, vma);
1327 if (!ptl) 1339 if (!ptl)
1328 goto out_unlocked; 1340 goto out_unlocked;
@@ -1378,12 +1390,23 @@ out_unlocked:
1378 return ret; 1390 return ret;
1379} 1391}
1380 1392
1393static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1394{
1395 pgtable_t pgtable;
1396
1397 pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1398 pte_free(mm, pgtable);
1399 atomic_long_dec(&mm->nr_ptes);
1400}
1401
1381int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1402int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1382 pmd_t *pmd, unsigned long addr) 1403 pmd_t *pmd, unsigned long addr)
1383{ 1404{
1384 pmd_t orig_pmd; 1405 pmd_t orig_pmd;
1385 spinlock_t *ptl; 1406 spinlock_t *ptl;
1386 1407
1408 tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
1409
1387 ptl = __pmd_trans_huge_lock(pmd, vma); 1410 ptl = __pmd_trans_huge_lock(pmd, vma);
1388 if (!ptl) 1411 if (!ptl)
1389 return 0; 1412 return 0;
@@ -1399,12 +1422,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1399 if (vma_is_dax(vma)) { 1422 if (vma_is_dax(vma)) {
1400 spin_unlock(ptl); 1423 spin_unlock(ptl);
1401 if (is_huge_zero_pmd(orig_pmd)) 1424 if (is_huge_zero_pmd(orig_pmd))
1402 tlb_remove_page(tlb, pmd_page(orig_pmd)); 1425 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
1403 } else if (is_huge_zero_pmd(orig_pmd)) { 1426 } else if (is_huge_zero_pmd(orig_pmd)) {
1404 pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); 1427 pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd));
1405 atomic_long_dec(&tlb->mm->nr_ptes); 1428 atomic_long_dec(&tlb->mm->nr_ptes);
1406 spin_unlock(ptl); 1429 spin_unlock(ptl);
1407 tlb_remove_page(tlb, pmd_page(orig_pmd)); 1430 tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
1408 } else { 1431 } else {
1409 struct page *page = pmd_page(orig_pmd); 1432 struct page *page = pmd_page(orig_pmd);
1410 page_remove_rmap(page, true); 1433 page_remove_rmap(page, true);
@@ -1417,6 +1440,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1417 atomic_long_dec(&tlb->mm->nr_ptes); 1440 atomic_long_dec(&tlb->mm->nr_ptes);
1418 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); 1441 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1419 } else { 1442 } else {
1443 if (arch_needs_pgtable_deposit())
1444 zap_deposited_table(tlb->mm, pmd);
1420 add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); 1445 add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
1421 } 1446 }
1422 spin_unlock(ptl); 1447 spin_unlock(ptl);
@@ -1425,6 +1450,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1425 return 1; 1450 return 1;
1426} 1451}
1427 1452
1453#ifndef pmd_move_must_withdraw
1454static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1455 spinlock_t *old_pmd_ptl,
1456 struct vm_area_struct *vma)
1457{
1458 /*
1459 * With split pmd lock we also need to move preallocated
1460 * PTE page table if new_pmd is on different PMD page table.
1461 *
1462 * We also don't deposit and withdraw tables for file pages.
1463 */
1464 return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1465}
1466#endif
1467
1428bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, 1468bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1429 unsigned long new_addr, unsigned long old_end, 1469 unsigned long new_addr, unsigned long old_end,
1430 pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) 1470 pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
@@ -1462,8 +1502,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1462 force_flush = true; 1502 force_flush = true;
1463 VM_BUG_ON(!pmd_none(*new_pmd)); 1503 VM_BUG_ON(!pmd_none(*new_pmd));
1464 1504
1465 if (pmd_move_must_withdraw(new_ptl, old_ptl) && 1505 if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1466 vma_is_anonymous(vma)) {
1467 pgtable_t pgtable; 1506 pgtable_t pgtable;
1468 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); 1507 pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1469 pgtable_trans_huge_deposit(mm, new_pmd, pgtable); 1508 pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -1589,6 +1628,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
1589 1628
1590 if (!vma_is_anonymous(vma)) { 1629 if (!vma_is_anonymous(vma)) {
1591 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1630 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
1631 /*
1632	 * We are going to unmap this huge page, so
1633	 * just go ahead and zap it.
1634 */
1635 if (arch_needs_pgtable_deposit())
1636 zap_deposited_table(mm, pmd);
1592 if (vma_is_dax(vma)) 1637 if (vma_is_dax(vma))
1593 return; 1638 return;
1594 page = pmd_page(_pmd); 1639 page = pmd_page(_pmd);
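
The first huge_memory.c hunk adds a read-only hpage_pmd_size attribute to the transparent_hugepage sysfs group, so userspace no longer has to guess the PMD size. A small userspace sketch that reads it, assuming the attribute appears under /sys/kernel/mm/transparent_hugepage/ alongside the other attributes in this group:

#include <stdio.h>

int main(void)
{
	unsigned long sz = 0;
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lu", &sz) == 1)
		printf("PMD-sized huge page: %lu bytes\n", sz);
	fclose(f);
	return 0;
}
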
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 418bf01a50ed..3edb759c5c7d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3286 BUG_ON(start & ~huge_page_mask(h)); 3286 BUG_ON(start & ~huge_page_mask(h));
3287 BUG_ON(end & ~huge_page_mask(h)); 3287 BUG_ON(end & ~huge_page_mask(h));
3288 3288
3289 /*
3290	 * This is a hugetlb vma; all the pte entries should point
3291	 * to a huge page.
3292 */
3293 tlb_remove_check_page_size_change(tlb, sz);
3289 tlb_start_vma(tlb, vma); 3294 tlb_start_vma(tlb, vma);
3290 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3295 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3291 address = start; 3296 address = start;
@@ -3336,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3336 } 3341 }
3337 3342
3338 pte = huge_ptep_get_and_clear(mm, address, ptep); 3343 pte = huge_ptep_get_and_clear(mm, address, ptep);
3339 tlb_remove_tlb_entry(tlb, ptep, address); 3344 tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
3340 if (huge_pte_dirty(pte)) 3345 if (huge_pte_dirty(pte))
3341 set_page_dirty(page); 3346 set_page_dirty(page);
3342 3347
@@ -3450,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
3450 * Keep the pte_same checks anyway to make transition from the mutex easier. 3455 * Keep the pte_same checks anyway to make transition from the mutex easier.
3451 */ 3456 */
3452static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 3457static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
3453 unsigned long address, pte_t *ptep, pte_t pte, 3458 unsigned long address, pte_t *ptep,
3454 struct page *pagecache_page, spinlock_t *ptl) 3459 struct page *pagecache_page, spinlock_t *ptl)
3455{ 3460{
3461 pte_t pte;
3456 struct hstate *h = hstate_vma(vma); 3462 struct hstate *h = hstate_vma(vma);
3457 struct page *old_page, *new_page; 3463 struct page *old_page, *new_page;
3458 int ret = 0, outside_reserve = 0; 3464 int ret = 0, outside_reserve = 0;
3459 unsigned long mmun_start; /* For mmu_notifiers */ 3465 unsigned long mmun_start; /* For mmu_notifiers */
3460 unsigned long mmun_end; /* For mmu_notifiers */ 3466 unsigned long mmun_end; /* For mmu_notifiers */
3461 3467
3468 pte = huge_ptep_get(ptep);
3462 old_page = pte_page(pte); 3469 old_page = pte_page(pte);
3463 3470
3464retry_avoidcopy: 3471retry_avoidcopy:
@@ -3711,8 +3718,7 @@ retry:
3711 vma_end_reservation(h, vma, address); 3718 vma_end_reservation(h, vma, address);
3712 } 3719 }
3713 3720
3714 ptl = huge_pte_lockptr(h, mm, ptep); 3721 ptl = huge_pte_lock(h, mm, ptep);
3715 spin_lock(ptl);
3716 size = i_size_read(mapping->host) >> huge_page_shift(h); 3722 size = i_size_read(mapping->host) >> huge_page_shift(h);
3717 if (idx >= size) 3723 if (idx >= size)
3718 goto backout; 3724 goto backout;
@@ -3733,7 +3739,7 @@ retry:
3733 hugetlb_count_add(pages_per_huge_page(h), mm); 3739 hugetlb_count_add(pages_per_huge_page(h), mm);
3734 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { 3740 if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
3735 /* Optimization, do the COW without a second fault */ 3741 /* Optimization, do the COW without a second fault */
3736 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); 3742 ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
3737 } 3743 }
3738 3744
3739 spin_unlock(ptl); 3745 spin_unlock(ptl);
@@ -3888,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3888 3894
3889 if (flags & FAULT_FLAG_WRITE) { 3895 if (flags & FAULT_FLAG_WRITE) {
3890 if (!huge_pte_write(entry)) { 3896 if (!huge_pte_write(entry)) {
3891 ret = hugetlb_cow(mm, vma, address, ptep, entry, 3897 ret = hugetlb_cow(mm, vma, address, ptep,
3892 pagecache_page, ptl); 3898 pagecache_page, ptl);
3893 goto out_put_page; 3899 goto out_put_page;
3894 } 3900 }
3895 entry = huge_pte_mkdirty(entry); 3901 entry = huge_pte_mkdirty(entry);
@@ -4330,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
4330 if (!spte) 4336 if (!spte)
4331 goto out; 4337 goto out;
4332 4338
4333 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); 4339 ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
4334 spin_lock(ptl);
4335 if (pud_none(*pud)) { 4340 if (pud_none(*pud)) {
4336 pud_populate(mm, pud, 4341 pud_populate(mm, pud,
4337 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 4342 (pmd_t *)((unsigned long)spte & PAGE_MASK));
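
Two hunks above replace the huge_pte_lockptr() + spin_lock() pair with a single huge_pte_lock() call. The helper itself is not part of this diff; inferring from the call sites it replaces, it presumably amounts to the following sketch:

/* Sketch inferred from the replaced call sites; not the actual helper. */
static inline spinlock_t *huge_pte_lock_sketch(struct hstate *h,
					       struct mm_struct *mm,
					       pte_t *pte)
{
	spinlock_t *ptl = huge_pte_lockptr(h, mm, pte);

	spin_lock(ptl);
	return ptl;
}
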
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index baabaad4a4aa..dae929c02bbb 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
86 qlist_init(from); 86 qlist_init(from);
87} 87}
88 88
89static void qlist_move(struct qlist_head *from, struct qlist_node *last, 89#define QUARANTINE_PERCPU_SIZE (1 << 20)
90 struct qlist_head *to, size_t size) 90#define QUARANTINE_BATCHES \
91{ 91 (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS)
92 if (unlikely(last == from->tail)) {
93 qlist_move_all(from, to);
94 return;
95 }
96 if (qlist_empty(to))
97 to->head = from->head;
98 else
99 to->tail->next = from->head;
100 to->tail = last;
101 from->head = last->next;
102 last->next = NULL;
103 from->bytes -= size;
104 to->bytes += size;
105}
106
107 92
108/* 93/*
109 * The object quarantine consists of per-cpu queues and a global queue, 94 * The object quarantine consists of per-cpu queues and a global queue,
@@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last,
111 */ 96 */
112static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); 97static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
113 98
114static struct qlist_head global_quarantine; 99/* Round-robin FIFO array of batches. */
100static struct qlist_head global_quarantine[QUARANTINE_BATCHES];
101static int quarantine_head;
102static int quarantine_tail;
103/* Total size of all objects in global_quarantine across all batches. */
104static unsigned long quarantine_size;
115static DEFINE_SPINLOCK(quarantine_lock); 105static DEFINE_SPINLOCK(quarantine_lock);
116 106
117/* Maximum size of the global queue. */ 107/* Maximum size of the global queue. */
118static unsigned long quarantine_size; 108static unsigned long quarantine_max_size;
109
110/*
111 * Target size of a batch in global_quarantine.
112 * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM.
113 */
114static unsigned long quarantine_batch_size;
119 115
120/* 116/*
121 * The fraction of physical memory the quarantine is allowed to occupy. 117 * The fraction of physical memory the quarantine is allowed to occupy.
@@ -124,9 +120,6 @@ static unsigned long quarantine_size;
124 */ 120 */
125#define QUARANTINE_FRACTION 32 121#define QUARANTINE_FRACTION 32
126 122
127#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
128#define QUARANTINE_PERCPU_SIZE (1 << 20)
129
130static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) 123static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
131{ 124{
132 return virt_to_head_page(qlink)->slab_cache; 125 return virt_to_head_page(qlink)->slab_cache;
@@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
191 184
192 if (unlikely(!qlist_empty(&temp))) { 185 if (unlikely(!qlist_empty(&temp))) {
193 spin_lock_irqsave(&quarantine_lock, flags); 186 spin_lock_irqsave(&quarantine_lock, flags);
194 qlist_move_all(&temp, &global_quarantine); 187 WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
188 qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
189 if (global_quarantine[quarantine_tail].bytes >=
190 READ_ONCE(quarantine_batch_size)) {
191 int new_tail;
192
193 new_tail = quarantine_tail + 1;
194 if (new_tail == QUARANTINE_BATCHES)
195 new_tail = 0;
196 if (new_tail != quarantine_head)
197 quarantine_tail = new_tail;
198 }
195 spin_unlock_irqrestore(&quarantine_lock, flags); 199 spin_unlock_irqrestore(&quarantine_lock, flags);
196 } 200 }
197} 201}
198 202
199void quarantine_reduce(void) 203void quarantine_reduce(void)
200{ 204{
201 size_t new_quarantine_size, percpu_quarantines; 205 size_t total_size, new_quarantine_size, percpu_quarantines;
202 unsigned long flags; 206 unsigned long flags;
203 struct qlist_head to_free = QLIST_INIT; 207 struct qlist_head to_free = QLIST_INIT;
204 size_t size_to_free = 0;
205 struct qlist_node *last;
206 208
207 if (likely(READ_ONCE(global_quarantine.bytes) <= 209 if (likely(READ_ONCE(quarantine_size) <=
208 READ_ONCE(quarantine_size))) 210 READ_ONCE(quarantine_max_size)))
209 return; 211 return;
210 212
211 spin_lock_irqsave(&quarantine_lock, flags); 213 spin_lock_irqsave(&quarantine_lock, flags);
@@ -214,24 +216,23 @@ void quarantine_reduce(void)
214 * Update quarantine size in case of hotplug. Allocate a fraction of 216 * Update quarantine size in case of hotplug. Allocate a fraction of
215 * the installed memory to quarantine minus per-cpu queue limits. 217 * the installed memory to quarantine minus per-cpu queue limits.
216 */ 218 */
217 new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / 219 total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
218 QUARANTINE_FRACTION; 220 QUARANTINE_FRACTION;
219 percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); 221 percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
220 new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? 222 new_quarantine_size = (total_size < percpu_quarantines) ?
221 0 : new_quarantine_size - percpu_quarantines; 223 0 : total_size - percpu_quarantines;
222 WRITE_ONCE(quarantine_size, new_quarantine_size); 224 WRITE_ONCE(quarantine_max_size, new_quarantine_size);
223 225 /* Aim at consuming at most 1/2 of slots in quarantine. */
224 last = global_quarantine.head; 226 WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE,
225 while (last) { 227 2 * total_size / QUARANTINE_BATCHES));
226 struct kmem_cache *cache = qlink_to_cache(last); 228
227 229 if (likely(quarantine_size > quarantine_max_size)) {
228 size_to_free += cache->size; 230 qlist_move_all(&global_quarantine[quarantine_head], &to_free);
229 if (!last->next || size_to_free > 231 WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes);
230 global_quarantine.bytes - QUARANTINE_LOW_SIZE) 232 quarantine_head++;
231 break; 233 if (quarantine_head == QUARANTINE_BATCHES)
232 last = last->next; 234 quarantine_head = 0;
233 } 235 }
234 qlist_move(&global_quarantine, last, &to_free, size_to_free);
235 236
236 spin_unlock_irqrestore(&quarantine_lock, flags); 237 spin_unlock_irqrestore(&quarantine_lock, flags);
237 238
@@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg)
275 276
276void quarantine_remove_cache(struct kmem_cache *cache) 277void quarantine_remove_cache(struct kmem_cache *cache)
277{ 278{
278 unsigned long flags; 279 unsigned long flags, i;
279 struct qlist_head to_free = QLIST_INIT; 280 struct qlist_head to_free = QLIST_INIT;
280 281
281 on_each_cpu(per_cpu_remove_cache, cache, 1); 282 on_each_cpu(per_cpu_remove_cache, cache, 1);
282 283
283 spin_lock_irqsave(&quarantine_lock, flags); 284 spin_lock_irqsave(&quarantine_lock, flags);
284 qlist_move_cache(&global_quarantine, &to_free, cache); 285 for (i = 0; i < QUARANTINE_BATCHES; i++)
286 qlist_move_cache(&global_quarantine[i], &to_free, cache);
285 spin_unlock_irqrestore(&quarantine_lock, flags); 287 spin_unlock_irqrestore(&quarantine_lock, flags);
286 288
287 qlist_free_all(&to_free, cache); 289 qlist_free_all(&to_free, cache);
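
The quarantine now spreads objects over a fixed ring of QUARANTINE_BATCHES qlist heads with separate head and tail indices, so reclaim can free one whole batch at a time. A self-contained toy model of the index arithmetic (sizes made up), showing why the tail refuses to step onto the head and instead keeps overfilling the current batch:

#include <stdio.h>

#define QUARANTINE_BATCHES 4	/* made-up size for the toy model */

static int head, tail;

static void advance_tail(void)
{
	int new_tail = tail + 1;

	if (new_tail == QUARANTINE_BATCHES)
		new_tail = 0;
	if (new_tail != head)	/* never collide with the reclaim head */
		tail = new_tail;
}

int main(void)
{
	for (int i = 0; i < 6; i++) {
		advance_tail();
		printf("after push %d: tail=%d head=%d\n", i, tail, head);
	}
	return 0;
}
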
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 073325aedc68..b82b3e215157 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags)
136 pr_err("==================================================================\n"); 136 pr_err("==================================================================\n");
137 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 137 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
138 spin_unlock_irqrestore(&report_lock, *flags); 138 spin_unlock_irqrestore(&report_lock, *flags);
139 if (panic_on_warn)
140 panic("panic_on_warn set ...\n");
139 kasan_enable_current(); 141 kasan_enable_current();
140} 142}
141 143
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 87e1a7ca3846..09460955e818 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1242 struct vm_area_struct *vma; 1242 struct vm_area_struct *vma;
1243 unsigned long addr; 1243 unsigned long addr;
1244 pmd_t *pmd, _pmd; 1244 pmd_t *pmd, _pmd;
1245 bool deposited = false;
1245 1246
1246 i_mmap_lock_write(mapping); 1247 i_mmap_lock_write(mapping);
1247 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1248 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1266 spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); 1267 spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
1267 /* assume page table is clear */ 1268 /* assume page table is clear */
1268 _pmd = pmdp_collapse_flush(vma, addr, pmd); 1269 _pmd = pmdp_collapse_flush(vma, addr, pmd);
1270 /*
1271	 * Now deposit the pgtable for archs that need it;
1272	 * otherwise free it.
1273 */
1274 if (arch_needs_pgtable_deposit()) {
1275 /*
1276	 * The deposit should be visible only after
1277 * collapse is seen by others.
1278 */
1279 smp_wmb();
1280 pgtable_trans_huge_deposit(vma->vm_mm, pmd,
1281 pmd_pgtable(_pmd));
1282 deposited = true;
1283 }
1269 spin_unlock(ptl); 1284 spin_unlock(ptl);
1270 up_write(&vma->vm_mm->mmap_sem); 1285 up_write(&vma->vm_mm->mmap_sem);
1271 atomic_long_dec(&vma->vm_mm->nr_ptes); 1286 if (!deposited) {
1272 pte_free(vma->vm_mm, pmd_pgtable(_pmd)); 1287 atomic_long_dec(&vma->vm_mm->nr_ptes);
1288 pte_free(vma->vm_mm, pmd_pgtable(_pmd));
1289 }
1273 } 1290 }
1274 } 1291 }
1275 i_mmap_unlock_write(mapping); 1292 i_mmap_unlock_write(mapping);
@@ -1403,6 +1420,9 @@ static void collapse_shmem(struct mm_struct *mm,
1403 1420
1404 spin_lock_irq(&mapping->tree_lock); 1421 spin_lock_irq(&mapping->tree_lock);
1405 1422
1423 slot = radix_tree_lookup_slot(&mapping->page_tree, index);
1424 VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
1425 &mapping->tree_lock), page);
1406 VM_BUG_ON_PAGE(page_mapped(page), page); 1426 VM_BUG_ON_PAGE(page_mapped(page), page);
1407 1427
1408 /* 1428 /*
@@ -1423,9 +1443,10 @@ static void collapse_shmem(struct mm_struct *mm,
1423 list_add_tail(&page->lru, &pagelist); 1443 list_add_tail(&page->lru, &pagelist);
1424 1444
1425 /* Finally, replace with the new page. */ 1445 /* Finally, replace with the new page. */
1426 radix_tree_replace_slot(slot, 1446 radix_tree_replace_slot(&mapping->page_tree, slot,
1427 new_page + (index % HPAGE_PMD_NR)); 1447 new_page + (index % HPAGE_PMD_NR));
1428 1448
1449 slot = radix_tree_iter_next(&iter);
1429 index++; 1450 index++;
1430 continue; 1451 continue;
1431out_lru: 1452out_lru:
@@ -1521,9 +1542,11 @@ tree_unlocked:
1521 if (!page || iter.index < page->index) { 1542 if (!page || iter.index < page->index) {
1522 if (!nr_none) 1543 if (!nr_none)
1523 break; 1544 break;
1524 /* Put holes back where they were */
1525 radix_tree_replace_slot(slot, NULL);
1526 nr_none--; 1545 nr_none--;
1546 /* Put holes back where they were */
1547 radix_tree_delete(&mapping->page_tree,
1548 iter.index);
1549 slot = radix_tree_iter_next(&iter);
1527 continue; 1550 continue;
1528 } 1551 }
1529 1552
@@ -1532,11 +1555,13 @@ tree_unlocked:
1532 /* Unfreeze the page. */ 1555 /* Unfreeze the page. */
1533 list_del(&page->lru); 1556 list_del(&page->lru);
1534 page_ref_unfreeze(page, 2); 1557 page_ref_unfreeze(page, 2);
1535 radix_tree_replace_slot(slot, page); 1558 radix_tree_replace_slot(&mapping->page_tree,
1559 slot, page);
1536 spin_unlock_irq(&mapping->tree_lock); 1560 spin_unlock_irq(&mapping->tree_lock);
1537 putback_lru_page(page); 1561 putback_lru_page(page);
1538 unlock_page(page); 1562 unlock_page(page);
1539 spin_lock_irq(&mapping->tree_lock); 1563 spin_lock_irq(&mapping->tree_lock);
1564 slot = radix_tree_iter_next(&iter);
1540 } 1565 }
1541 VM_BUG_ON(nr_none); 1566 VM_BUG_ON(nr_none);
1542 spin_unlock_irq(&mapping->tree_lock); 1567 spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d1380ed93fdf..da3436953022 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -19,7 +19,7 @@
19 * 19 *
20 * 20 *
21 * For more information on the algorithm and kmemleak usage, please see 21 * For more information on the algorithm and kmemleak usage, please see
22 * Documentation/kmemleak.txt. 22 * Documentation/dev-tools/kmemleak.rst.
23 * 23 *
24 * Notes on locking 24 * Notes on locking
25 * ---------------- 25 * ----------------
diff --git a/mm/madvise.c b/mm/madvise.c
index 93fb63e88b5e..0e3828eae9f8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
281 if (pmd_trans_unstable(pmd)) 281 if (pmd_trans_unstable(pmd))
282 return 0; 282 return 0;
283 283
284 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
284 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 285 orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
285 arch_enter_lazy_mmu_mode(); 286 arch_enter_lazy_mmu_mode();
286 for (; addr != end; pte++, addr += PAGE_SIZE) { 287 for (; addr != end; pte++, addr += PAGE_SIZE) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c2043509fb5..175ec51c346d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2145,6 +2145,8 @@ struct memcg_kmem_cache_create_work {
2145 struct work_struct work; 2145 struct work_struct work;
2146}; 2146};
2147 2147
2148static struct workqueue_struct *memcg_kmem_cache_create_wq;
2149
2148static void memcg_kmem_cache_create_func(struct work_struct *w) 2150static void memcg_kmem_cache_create_func(struct work_struct *w)
2149{ 2151{
2150 struct memcg_kmem_cache_create_work *cw = 2152 struct memcg_kmem_cache_create_work *cw =
@@ -2176,7 +2178,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2176 cw->cachep = cachep; 2178 cw->cachep = cachep;
2177 INIT_WORK(&cw->work, memcg_kmem_cache_create_func); 2179 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2178 2180
2179 schedule_work(&cw->work); 2181 queue_work(memcg_kmem_cache_create_wq, &cw->work);
2180} 2182}
2181 2183
2182static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, 2184static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
@@ -5774,6 +5776,17 @@ static int __init mem_cgroup_init(void)
5774{ 5776{
5775 int cpu, node; 5777 int cpu, node;
5776 5778
5779#ifndef CONFIG_SLOB
5780 /*
5781 * Kmem cache creation is mostly done with the slab_mutex held,
5782 * so use a special workqueue to avoid stalling all worker
5783 * threads in case lots of cgroups are created simultaneously.
5784 */
5785 memcg_kmem_cache_create_wq =
5786 alloc_ordered_workqueue("memcg_kmem_cache_create", 0);
5787 BUG_ON(!memcg_kmem_cache_create_wq);
5788#endif
5789
5777 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, 5790 cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
5778 memcg_hotplug_cpu_dead); 5791 memcg_hotplug_cpu_dead);
5779 5792
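
The memcontrol.c hunks move kmem cache creation onto a dedicated ordered workqueue so that work items which block on slab_mutex cannot tie up the shared worker pool. A hedged sketch of the same pattern in isolation; the workqueue name and work function below are placeholders, not kernel symbols:

#include <linux/init.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_work_fn(struct work_struct *work)
{
	/* long-running work that may hold a heavily contended mutex */
}
static DECLARE_WORK(example_work, example_work_fn);

static int __init example_init(void)
{
	/* ordered: one item at a time, never spread across the worker pool */
	example_wq = alloc_ordered_workqueue("example_wq", 0);
	if (!example_wq)
		return -ENOMEM;
	queue_work(example_wq, &example_work);
	return 0;
}
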
diff --git a/mm/memory.c b/mm/memory.c
index 33f45edf8272..32e9b7aec366 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
300 struct mmu_gather_batch *batch; 300 struct mmu_gather_batch *batch;
301 301
302 VM_BUG_ON(!tlb->end); 302 VM_BUG_ON(!tlb->end);
303 303 VM_WARN_ON(tlb->page_size != page_size);
304 if (!tlb->page_size)
305 tlb->page_size = page_size;
306 else {
307 if (page_size != tlb->page_size)
308 return true;
309 }
310 304
311 batch = tlb->active; 305 batch = tlb->active;
306 /*
307	 * Add the page and check if we are full. If so,
308 * force a flush.
309 */
310 batch->pages[batch->nr++] = page;
312 if (batch->nr == batch->max) { 311 if (batch->nr == batch->max) {
313 if (!tlb_next_batch(tlb)) 312 if (!tlb_next_batch(tlb))
314 return true; 313 return true;
@@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
316 } 315 }
317 VM_BUG_ON_PAGE(batch->nr > batch->max, page); 316 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
318 317
319 batch->pages[batch->nr++] = page;
320 return false; 318 return false;
321} 319}
322 320
@@ -528,7 +526,11 @@ void free_pgd_range(struct mmu_gather *tlb,
528 end -= PMD_SIZE; 526 end -= PMD_SIZE;
529 if (addr > end - 1) 527 if (addr > end - 1)
530 return; 528 return;
531 529 /*
530	 * We add page table cache pages with PAGE_SIZE
531	 * (see pte_free_tlb()), so flush the tlb if the page size changes.
532 */
533 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
532 pgd = pgd_offset(tlb->mm, addr); 534 pgd = pgd_offset(tlb->mm, addr);
533 do { 535 do {
534 next = pgd_addr_end(addr, end); 536 next = pgd_addr_end(addr, end);
@@ -1118,8 +1120,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1118 pte_t *start_pte; 1120 pte_t *start_pte;
1119 pte_t *pte; 1121 pte_t *pte;
1120 swp_entry_t entry; 1122 swp_entry_t entry;
1121 struct page *pending_page = NULL;
1122 1123
1124 tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
1123again: 1125again:
1124 init_rss_vec(rss); 1126 init_rss_vec(rss);
1125 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1127 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1172,7 +1174,6 @@ again:
1172 print_bad_pte(vma, addr, ptent, page); 1174 print_bad_pte(vma, addr, ptent, page);
1173 if (unlikely(__tlb_remove_page(tlb, page))) { 1175 if (unlikely(__tlb_remove_page(tlb, page))) {
1174 force_flush = 1; 1176 force_flush = 1;
1175 pending_page = page;
1176 addr += PAGE_SIZE; 1177 addr += PAGE_SIZE;
1177 break; 1178 break;
1178 } 1179 }
@@ -1213,11 +1214,6 @@ again:
1213 if (force_flush) { 1214 if (force_flush) {
1214 force_flush = 0; 1215 force_flush = 0;
1215 tlb_flush_mmu_free(tlb); 1216 tlb_flush_mmu_free(tlb);
1216 if (pending_page) {
1217 /* remove the page with new size */
1218 __tlb_remove_pte_page(tlb, pending_page);
1219 pending_page = NULL;
1220 }
1221 if (addr != end) 1217 if (addr != end)
1222 goto again; 1218 goto again;
1223 } 1219 }
@@ -1240,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1240 if (next - addr != HPAGE_PMD_SIZE) { 1236 if (next - addr != HPAGE_PMD_SIZE) {
1241 VM_BUG_ON_VMA(vma_is_anonymous(vma) && 1237 VM_BUG_ON_VMA(vma_is_anonymous(vma) &&
1242 !rwsem_is_locked(&tlb->mm->mmap_sem), vma); 1238 !rwsem_is_locked(&tlb->mm->mmap_sem), vma);
1243 split_huge_pmd(vma, pmd, addr); 1239 __split_huge_pmd(vma, pmd, addr, false, NULL);
1244 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1240 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1245 goto next; 1241 goto next;
1246 /* fall through */ 1242 /* fall through */
@@ -2939,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
2939 return true; 2935 return true;
2940} 2936}
2941 2937
2938static void deposit_prealloc_pte(struct fault_env *fe)
2939{
2940 struct vm_area_struct *vma = fe->vma;
2941
2942 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
2943 /*
2944 * We are going to consume the prealloc table,
2945	 * so count it in nr_ptes.
2946 */
2947 atomic_long_inc(&vma->vm_mm->nr_ptes);
2948 fe->prealloc_pte = 0;
2949}
2950
2942static int do_set_pmd(struct fault_env *fe, struct page *page) 2951static int do_set_pmd(struct fault_env *fe, struct page *page)
2943{ 2952{
2944 struct vm_area_struct *vma = fe->vma; 2953 struct vm_area_struct *vma = fe->vma;
@@ -2953,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2953 ret = VM_FAULT_FALLBACK; 2962 ret = VM_FAULT_FALLBACK;
2954 page = compound_head(page); 2963 page = compound_head(page);
2955 2964
2965 /*
2966	 * Archs like ppc64 need additional space to store information
2967	 * related to the pte entry. Use the preallocated table for that.
2968 */
2969 if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
2970 fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
2971 if (!fe->prealloc_pte)
2972 return VM_FAULT_OOM;
2973 smp_wmb(); /* See comment in __pte_alloc() */
2974 }
2975
2956 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 2976 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
2957 if (unlikely(!pmd_none(*fe->pmd))) 2977 if (unlikely(!pmd_none(*fe->pmd)))
2958 goto out; 2978 goto out;
@@ -2966,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2966 2986
2967 add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); 2987 add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
2968 page_add_file_rmap(page, true); 2988 page_add_file_rmap(page, true);
2989 /*
2990 * deposit and withdraw with pmd lock held
2991 */
2992 if (arch_needs_pgtable_deposit())
2993 deposit_prealloc_pte(fe);
2969 2994
2970 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 2995 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
2971 2996
@@ -2975,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2975 ret = 0; 3000 ret = 0;
2976 count_vm_event(THP_FILE_MAPPED); 3001 count_vm_event(THP_FILE_MAPPED);
2977out: 3002out:
3003 /*
3004	 * If we are going to fall back to a pte mapping, do the
3005	 * withdraw with the pmd lock held.
3006 */
3007 if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
3008 fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
3009 fe->pmd);
2978 spin_unlock(fe->ptl); 3010 spin_unlock(fe->ptl);
2979 return ret; 3011 return ret;
2980} 3012}
@@ -3014,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
3014 3046
3015 ret = do_set_pmd(fe, page); 3047 ret = do_set_pmd(fe, page);
3016 if (ret != VM_FAULT_FALLBACK) 3048 if (ret != VM_FAULT_FALLBACK)
3017 return ret; 3049 goto fault_handled;
3018 } 3050 }
3019 3051
3020 if (!fe->pte) { 3052 if (!fe->pte) {
3021 ret = pte_alloc_one_map(fe); 3053 ret = pte_alloc_one_map(fe);
3022 if (ret) 3054 if (ret)
3023 return ret; 3055 goto fault_handled;
3024 } 3056 }
3025 3057
3026 /* Re-check under ptl */ 3058 /* Re-check under ptl */
3027 if (unlikely(!pte_none(*fe->pte))) 3059 if (unlikely(!pte_none(*fe->pte))) {
3028 return VM_FAULT_NOPAGE; 3060 ret = VM_FAULT_NOPAGE;
3061 goto fault_handled;
3062 }
3029 3063
3030 flush_icache_page(vma, page); 3064 flush_icache_page(vma, page);
3031 entry = mk_pte(page, vma->vm_page_prot); 3065 entry = mk_pte(page, vma->vm_page_prot);
@@ -3045,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
3045 3079
3046 /* no need to invalidate: a not-present page won't be cached */ 3080 /* no need to invalidate: a not-present page won't be cached */
3047 update_mmu_cache(vma, fe->address, fe->pte); 3081 update_mmu_cache(vma, fe->address, fe->pte);
3082 ret = 0;
3048 3083
3049 return 0; 3084fault_handled:
3085 /* preallocated pagetable is unused: free it */
3086 if (fe->prealloc_pte) {
3087 pte_free(fe->vma->vm_mm, fe->prealloc_pte);
3088 fe->prealloc_pte = 0;
3089 }
3090 return ret;
3050} 3091}
3051 3092
3052static unsigned long fault_around_bytes __read_mostly = 3093static unsigned long fault_around_bytes __read_mostly =
@@ -3145,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
3145 3186
3146 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); 3187 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
3147 3188
3148 /* preallocated pagetable is unused: free it */
3149 if (fe->prealloc_pte) {
3150 pte_free(fe->vma->vm_mm, fe->prealloc_pte);
3151 fe->prealloc_pte = 0;
3152 }
3153 /* Huge page is mapped? Page fault is solved */ 3189 /* Huge page is mapped? Page fault is solved */
3154 if (pmd_trans_huge(*fe->pmd)) { 3190 if (pmd_trans_huge(*fe->pmd)) {
3155 ret = VM_FAULT_NOPAGE; 3191 ret = VM_FAULT_NOPAGE;
@@ -3454,7 +3490,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
3454 3490
3455 /* COW handled on pte level: split pmd */ 3491 /* COW handled on pte level: split pmd */
3456 VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); 3492 VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
3457 split_huge_pmd(fe->vma, fe->pmd, fe->address); 3493 __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL);
3458 3494
3459 return VM_FAULT_FALLBACK; 3495 return VM_FAULT_FALLBACK;
3460} 3496}
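
Several call sites above now invoke tlb_remove_check_page_size_change() before gathering pages, and __tlb_remove_page_size() merely warns if the size still mismatches. The checker itself is not in this diff; based on those call sites, it is assumed to behave roughly like the sketch below, flushing the gather whenever the page size being batched changes:

/* Assumed behaviour only; the real helper lives in the asm-generic tlb code. */
static inline void
tlb_check_page_size_change_sketch(struct mmu_gather *tlb, unsigned int page_size)
{
	if (tlb->page_size && tlb->page_size != page_size)
		tlb_flush_mmu(tlb);	/* drain pages of the old size first */

	tlb->page_size = page_size;
}
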
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index cad4b9125695..e43142c15631 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1727static int __init cmdline_parse_movable_node(char *p) 1727static int __init cmdline_parse_movable_node(char *p)
1728{ 1728{
1729#ifdef CONFIG_MOVABLE_NODE 1729#ifdef CONFIG_MOVABLE_NODE
1730 /*
1731 * Memory used by the kernel cannot be hot-removed because Linux
1732 * cannot migrate the kernel pages. When memory hotplug is
1733 * enabled, we should prevent memblock from allocating memory
1734 * for the kernel.
1735 *
1736 * ACPI SRAT records all hotpluggable memory ranges. But before
1737 * SRAT is parsed, we don't know about it.
1738 *
1739 * The kernel image is loaded into memory at very early time. We
1740 * cannot prevent this anyway. So on NUMA system, we set any
1741 * node the kernel resides in as un-hotpluggable.
1742 *
1743 * Since on modern servers, one node could have double-digit
1744 * gigabytes memory, we can assume the memory around the kernel
1745 * image is also un-hotpluggable. So before SRAT is parsed, just
1746 * allocate memory near the kernel image to try the best to keep
1747 * the kernel away from hotpluggable memory.
1748 */
1749 memblock_set_bottom_up(true);
1750 movable_node_enabled = true; 1730 movable_node_enabled = true;
1751#else 1731#else
1752 pr_warn("movable_node option not supported\n"); 1732 pr_warn("movable_node option not supported\n");
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0b859af06b87..6d3639e1f254 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
276 return ERR_PTR(-EINVAL); 276 return ERR_PTR(-EINVAL);
277 } 277 }
278 } else if (mode == MPOL_LOCAL) { 278 } else if (mode == MPOL_LOCAL) {
279 if (!nodes_empty(*nodes)) 279 if (!nodes_empty(*nodes) ||
280 (flags & MPOL_F_STATIC_NODES) ||
281 (flags & MPOL_F_RELATIVE_NODES))
280 return ERR_PTR(-EINVAL); 282 return ERR_PTR(-EINVAL);
281 mode = MPOL_PREFERRED; 283 mode = MPOL_PREFERRED;
282 } else if (nodes_empty(*nodes)) 284 } else if (nodes_empty(*nodes))
@@ -496,7 +498,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
496 page = pmd_page(*pmd); 498 page = pmd_page(*pmd);
497 if (is_huge_zero_page(page)) { 499 if (is_huge_zero_page(page)) {
498 spin_unlock(ptl); 500 spin_unlock(ptl);
499 split_huge_pmd(vma, pmd, addr); 501 __split_huge_pmd(vma, pmd, addr, false, NULL);
500 } else { 502 } else {
501 get_page(page); 503 get_page(page);
502 spin_unlock(ptl); 504 spin_unlock(ptl);
@@ -1679,25 +1681,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1679static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, 1681static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1680 int nd) 1682 int nd)
1681{ 1683{
1682 switch (policy->mode) { 1684 if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
1683 case MPOL_PREFERRED: 1685 nd = policy->v.preferred_node;
1684 if (!(policy->flags & MPOL_F_LOCAL)) 1686 else {
1685 nd = policy->v.preferred_node;
1686 break;
1687 case MPOL_BIND:
1688 /* 1687 /*
1689 * Normally, MPOL_BIND allocations are node-local within the 1688 * __GFP_THISNODE shouldn't even be used with the bind policy
1690 * allowed nodemask. However, if __GFP_THISNODE is set and the 1689 * because we might easily break the expectation to stay on the
1691 * current node isn't part of the mask, we use the zonelist for 1690 * requested node and not break the policy.
1692 * the first node in the mask instead.
1693 */ 1691 */
1694 if (unlikely(gfp & __GFP_THISNODE) && 1692 WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
1695 unlikely(!node_isset(nd, policy->v.nodes)))
1696 nd = first_node(policy->v.nodes);
1697 break;
1698 default:
1699 BUG();
1700 } 1693 }
1694
1701 return node_zonelist(nd, gfp); 1695 return node_zonelist(nd, gfp);
1702} 1696}
1703 1697
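
The mpol_new() hunk above tightens validation of MPOL_LOCAL: besides requiring an empty nodemask, it now also rejects MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES. A rough stand-alone model of that check, with made-up flag names rather than the kernel's:

```c
#include <stdbool.h>
#include <stdio.h>

/* Illustrative validation for a "local" policy, mirroring the stricter
 * check added to mpol_new(): a local policy may carry neither a nodemask
 * nor the static/relative-nodes flags.  Flag names are local to this
 * sketch, not the kernel's. */
#define F_STATIC_NODES   0x1
#define F_RELATIVE_NODES 0x2

static bool local_policy_valid(unsigned long nodemask, unsigned short flags)
{
	if (nodemask != 0)
		return false;
	if (flags & (F_STATIC_NODES | F_RELATIVE_NODES))
		return false;
	return true;
}

int main(void)
{
	printf("%d\n", local_policy_valid(0, 0));               /* 1: accepted */
	printf("%d\n", local_policy_valid(0, F_STATIC_NODES));  /* 0: rejected */
	printf("%d\n", local_policy_valid(0x4, 0));             /* 0: rejected */
	return 0;
}
```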
diff --git a/mm/migrate.c b/mm/migrate.c
index 99250aee1ac1..0ed24b1fa77b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l)
168 continue; 168 continue;
169 } 169 }
170 list_del(&page->lru); 170 list_del(&page->lru);
171 dec_node_page_state(page, NR_ISOLATED_ANON +
172 page_is_file_cache(page));
173 /* 171 /*
174 * We isolated non-lru movable page so here we can use 172 * We isolated non-lru movable page so here we can use
175 * __PageMovable because LRU page's mapping cannot have 173 * __PageMovable because LRU page's mapping cannot have
@@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l)
186 put_page(page); 184 put_page(page);
187 } else { 185 } else {
188 putback_lru_page(page); 186 putback_lru_page(page);
187 dec_node_page_state(page, NR_ISOLATED_ANON +
188 page_is_file_cache(page));
189 } 189 }
190 } 190 }
191} 191}
@@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
482 SetPageDirty(newpage); 482 SetPageDirty(newpage);
483 } 483 }
484 484
485 radix_tree_replace_slot(pslot, newpage); 485 radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
486 486
487 /* 487 /*
488 * Drop cache reference from old page by unfreezing 488 * Drop cache reference from old page by unfreezing
@@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
556 556
557 get_page(newpage); 557 get_page(newpage);
558 558
559 radix_tree_replace_slot(pslot, newpage); 559 radix_tree_replace_slot(&mapping->page_tree, pslot, newpage);
560 560
561 page_ref_unfreeze(page, expected_count - 1); 561 page_ref_unfreeze(page, expected_count - 1);
562 562
@@ -1121,8 +1121,15 @@ out:
1121 * restored. 1121 * restored.
1122 */ 1122 */
1123 list_del(&page->lru); 1123 list_del(&page->lru);
1124 dec_node_page_state(page, NR_ISOLATED_ANON + 1124
1125 page_is_file_cache(page)); 1125 /*
1126 * Compaction can migrate also non-LRU pages which are
1127 * not accounted to NR_ISOLATED_*. They can be recognized
1128 * as __PageMovable
1129 */
1130 if (likely(!__PageMovable(page)))
1131 dec_node_page_state(page, NR_ISOLATED_ANON +
1132 page_is_file_cache(page));
1126 } 1133 }
1127 1134
1128 /* 1135 /*
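
The migrate.c hunks above move the NR_ISOLATED_* decrement so it only happens for LRU pages, since non-LRU movable pages were never counted when they were isolated. A toy model of that asymmetric accounting; the counter and helper below are illustrative only:

```c
#include <stdbool.h>
#include <stdio.h>

/* Toy model of the accounting fix in putback_movable_pages() and the
 * migration-completion path: NR_ISOLATED_* is only ever incremented for
 * LRU pages, so the matching decrement must be skipped for non-LRU
 * movable pages. */
static long nr_isolated;

static void putback_page(bool page_is_lru)
{
	if (page_is_lru) {
		/* putback_lru_page() case: undo the isolation accounting */
		nr_isolated--;
	}
	/* non-LRU movable pages were never accounted, so nothing to undo */
}

int main(void)
{
	nr_isolated = 2;          /* two LRU pages were isolated */
	putback_page(true);       /* LRU page goes back: 1 */
	putback_page(false);      /* non-LRU movable page: still 1 */
	putback_page(true);       /* second LRU page: 0 */
	printf("nr_isolated = %ld\n", nr_isolated);
	return 0;
}
```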
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 11936526b08b..cc2459c57f60 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
69 pte_t *pte, oldpte; 69 pte_t *pte, oldpte;
70 spinlock_t *ptl; 70 spinlock_t *ptl;
71 unsigned long pages = 0; 71 unsigned long pages = 0;
72 int target_node = NUMA_NO_NODE;
72 73
73 pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); 74 pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl);
74 if (!pte) 75 if (!pte)
75 return 0; 76 return 0;
76 77
78 /* Get target node for single threaded private VMAs */
79 if (prot_numa && !(vma->vm_flags & VM_SHARED) &&
80 atomic_read(&vma->vm_mm->mm_users) == 1)
81 target_node = numa_node_id();
82
77 arch_enter_lazy_mmu_mode(); 83 arch_enter_lazy_mmu_mode();
78 do { 84 do {
79 oldpte = *pte; 85 oldpte = *pte;
@@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
95 /* Avoid TLB flush if possible */ 101 /* Avoid TLB flush if possible */
96 if (pte_protnone(oldpte)) 102 if (pte_protnone(oldpte))
97 continue; 103 continue;
104
105 /*
106 * Don't mess with PTEs if page is already on the node
107 * a single-threaded process is running on.
108 */
109 if (target_node == page_to_nid(page))
110 continue;
98 } 111 }
99 112
100 ptent = ptep_modify_prot_start(mm, addr, pte); 113 ptent = ptep_modify_prot_start(mm, addr, pte);
@@ -163,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
163 176
164 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { 177 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
165 if (next - addr != HPAGE_PMD_SIZE) { 178 if (next - addr != HPAGE_PMD_SIZE) {
166 split_huge_pmd(vma, pmd, addr); 179 __split_huge_pmd(vma, pmd, addr, false, NULL);
167 if (pmd_trans_unstable(pmd)) 180 if (pmd_trans_unstable(pmd))
168 continue; 181 continue;
169 } else { 182 } else {
@@ -484,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
484 return do_mprotect_pkey(start, len, prot, -1); 497 return do_mprotect_pkey(start, len, prot, -1);
485} 498}
486 499
500#ifdef CONFIG_ARCH_HAS_PKEYS
501
487SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, 502SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len,
488 unsigned long, prot, int, pkey) 503 unsigned long, prot, int, pkey)
489{ 504{
@@ -534,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
534 */ 549 */
535 return ret; 550 return ret;
536} 551}
552
553#endif /* CONFIG_ARCH_HAS_PKEYS */
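
For the change_pte_range() hunk above, a small sketch may help: with prot_numa, a private VMA in a single-user mm picks the local node as target and leaves PTEs alone when the page already sits on that node, so only remote pages receive NUMA-hinting protection. Everything below is an illustrative model, not kernel code:

```c
#include <stdio.h>

#define NUMA_NO_NODE (-1)

/* Sketch of the prot_numa fast path added to change_pte_range(): pages
 * already on the target node are skipped instead of being made PROT_NONE.
 * The node numbers are made up for the example. */
static int pages_touched(int target_node, const int *page_nodes, int n)
{
	int touched = 0;

	for (int i = 0; i < n; i++) {
		/* Don't mess with PTEs if the page already lives on the
		 * node the single-threaded task is running on. */
		if (target_node != NUMA_NO_NODE && page_nodes[i] == target_node)
			continue;
		touched++;	/* would be marked for NUMA hinting faults */
	}
	return touched;
}

int main(void)
{
	int nodes[] = { 0, 0, 1, 0, 2 };

	printf("target node 0: %d touched\n", pages_touched(0, nodes, 5));
	printf("no target:     %d touched\n", pages_touched(NUMA_NO_NODE, nodes, 5));
	return 0;
}
```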
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3dcc54da5637..f64e7bcb43b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2058,8 +2058,12 @@ out_unlock:
2058 * potentially hurts the reliability of high-order allocations when under 2058 * potentially hurts the reliability of high-order allocations when under
2059 * intense memory pressure but failed atomic allocations should be easier 2059 * intense memory pressure but failed atomic allocations should be easier
2060 * to recover from than an OOM. 2060 * to recover from than an OOM.
2061 *
2062 * If @force is true, try to unreserve a pageblock even though highatomic
2063 * pageblock is exhausted.
2061 */ 2064 */
2062static void unreserve_highatomic_pageblock(const struct alloc_context *ac) 2065static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2066 bool force)
2063{ 2067{
2064 struct zonelist *zonelist = ac->zonelist; 2068 struct zonelist *zonelist = ac->zonelist;
2065 unsigned long flags; 2069 unsigned long flags;
@@ -2067,11 +2071,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
2067 struct zone *zone; 2071 struct zone *zone;
2068 struct page *page; 2072 struct page *page;
2069 int order; 2073 int order;
2074 bool ret;
2070 2075
2071 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, 2076 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2072 ac->nodemask) { 2077 ac->nodemask) {
2073 /* Preserve at least one pageblock */ 2078 /*
2074 if (zone->nr_reserved_highatomic <= pageblock_nr_pages) 2079 * Preserve at least one pageblock unless memory pressure
2080 * is really high.
2081 */
2082 if (!force && zone->nr_reserved_highatomic <=
2083 pageblock_nr_pages)
2075 continue; 2084 continue;
2076 2085
2077 spin_lock_irqsave(&zone->lock, flags); 2086 spin_lock_irqsave(&zone->lock, flags);
@@ -2085,13 +2094,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
2085 continue; 2094 continue;
2086 2095
2087 /* 2096 /*
2088 * It should never happen but changes to locking could 2097 * In page freeing path, migratetype change is racy so
2089 * inadvertently allow a per-cpu drain to add pages 2098 * we can encounter several free pages in a pageblock
2090 * to MIGRATE_HIGHATOMIC while unreserving so be safe 2099 * in this loop although we changed the pageblock type
2091 * and watch for underflows. 2100 * from highatomic to ac->migratetype. So we should
2101 * adjust the count once.
2092 */ 2102 */
2093 zone->nr_reserved_highatomic -= min(pageblock_nr_pages, 2103 if (get_pageblock_migratetype(page) ==
2094 zone->nr_reserved_highatomic); 2104 MIGRATE_HIGHATOMIC) {
2105 /*
2106 * It should never happen but changes to
2107 * locking could inadvertently allow a per-cpu
2108 * drain to add pages to MIGRATE_HIGHATOMIC
2109 * while unreserving so be safe and watch for
2110 * underflows.
2111 */
2112 zone->nr_reserved_highatomic -= min(
2113 pageblock_nr_pages,
2114 zone->nr_reserved_highatomic);
2115 }
2095 2116
2096 /* 2117 /*
2097 * Convert to ac->migratetype and avoid the normal 2118 * Convert to ac->migratetype and avoid the normal
@@ -2103,12 +2124,16 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
2103 * may increase. 2124 * may increase.
2104 */ 2125 */
2105 set_pageblock_migratetype(page, ac->migratetype); 2126 set_pageblock_migratetype(page, ac->migratetype);
2106 move_freepages_block(zone, page, ac->migratetype); 2127 ret = move_freepages_block(zone, page, ac->migratetype);
2107 spin_unlock_irqrestore(&zone->lock, flags); 2128 if (ret) {
2108 return; 2129 spin_unlock_irqrestore(&zone->lock, flags);
2130 return ret;
2131 }
2109 } 2132 }
2110 spin_unlock_irqrestore(&zone->lock, flags); 2133 spin_unlock_irqrestore(&zone->lock, flags);
2111 } 2134 }
2135
2136 return false;
2112} 2137}
2113 2138
2114/* Remove an element from the buddy allocator from the fallback list */ 2139/* Remove an element from the buddy allocator from the fallback list */
@@ -2133,7 +2158,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
2133 2158
2134 page = list_first_entry(&area->free_list[fallback_mt], 2159 page = list_first_entry(&area->free_list[fallback_mt],
2135 struct page, lru); 2160 struct page, lru);
2136 if (can_steal) 2161 if (can_steal &&
2162 get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
2137 steal_suitable_fallback(zone, page, start_migratetype); 2163 steal_suitable_fallback(zone, page, start_migratetype);
2138 2164
2139 /* Remove the page from the freelists */ 2165 /* Remove the page from the freelists */
@@ -2192,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2192 unsigned long count, struct list_head *list, 2218 unsigned long count, struct list_head *list,
2193 int migratetype, bool cold) 2219 int migratetype, bool cold)
2194{ 2220{
2195 int i; 2221 int i, alloced = 0;
2196 2222
2197 spin_lock(&zone->lock); 2223 spin_lock(&zone->lock);
2198 for (i = 0; i < count; ++i) { 2224 for (i = 0; i < count; ++i) {
@@ -2217,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
2217 else 2243 else
2218 list_add_tail(&page->lru, list); 2244 list_add_tail(&page->lru, list);
2219 list = &page->lru; 2245 list = &page->lru;
2246 alloced++;
2220 if (is_migrate_cma(get_pcppage_migratetype(page))) 2247 if (is_migrate_cma(get_pcppage_migratetype(page)))
2221 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 2248 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2222 -(1 << order)); 2249 -(1 << order));
2223 } 2250 }
2251
2252 /*
2253 * i pages were removed from the buddy list even if some leak due
2254 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
2255 * on i. Do not confuse with 'alloced' which is the number of
2256 * pages added to the pcp list.
2257 */
2224 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 2258 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2225 spin_unlock(&zone->lock); 2259 spin_unlock(&zone->lock);
2226 return i; 2260 return alloced;
2227} 2261}
2228 2262
2229#ifdef CONFIG_NUMA 2263#ifdef CONFIG_NUMA
@@ -2534,7 +2568,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
2534 struct page *endpage = page + (1 << order) - 1; 2568 struct page *endpage = page + (1 << order) - 1;
2535 for (; page < endpage; page += pageblock_nr_pages) { 2569 for (; page < endpage; page += pageblock_nr_pages) {
2536 int mt = get_pageblock_migratetype(page); 2570 int mt = get_pageblock_migratetype(page);
2537 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) 2571 if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
2572 && mt != MIGRATE_HIGHATOMIC)
2538 set_pageblock_migratetype(page, 2573 set_pageblock_migratetype(page,
2539 MIGRATE_MOVABLE); 2574 MIGRATE_MOVABLE);
2540 } 2575 }
@@ -3305,7 +3340,7 @@ retry:
3305 * Shrink them and try again 3340 * Shrink them and try again
3306 */ 3341 */
3307 if (!page && !drained) { 3342 if (!page && !drained) {
3308 unreserve_highatomic_pageblock(ac); 3343 unreserve_highatomic_pageblock(ac, false);
3309 drain_all_pages(NULL); 3344 drain_all_pages(NULL);
3310 drained = true; 3345 drained = true;
3311 goto retry; 3346 goto retry;
@@ -3422,8 +3457,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3422 * Make sure we converge to OOM if we cannot make any progress 3457 * Make sure we converge to OOM if we cannot make any progress
3423 * several times in a row. 3458 * several times in a row.
3424 */ 3459 */
3425 if (*no_progress_loops > MAX_RECLAIM_RETRIES) 3460 if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3426 return false; 3461 /* Before OOM, exhaust highatomic_reserve */
3462 return unreserve_highatomic_pageblock(ac, true);
3463 }
3427 3464
3428 /* 3465 /*
3429 * Keep reclaiming pages while there is a chance this will lead 3466 * Keep reclaiming pages while there is a chance this will lead
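
Two of the page_alloc.c hunks above are easy to miss: unreserve_highatomic_pageblock() now takes a force flag and reports whether it freed anything (used as a last resort before declaring OOM), and rmqueue_bulk() distinguishes the number of pages pulled off the buddy list (i, used for the NR_FREE_PAGES adjustment) from the number actually handed to the per-cpu list (alloced, the return value). The second point is modelled below with invented names and a fake leak_every failure knob:

```c
#include <stdio.h>

/* Toy model of the rmqueue_bulk() fix: 'i' pages leave the buddy free list
 * even when a page fails the post-allocation check, so the free-page
 * counter must be adjusted by i, while the caller only gets 'alloced'
 * usable pages. */
static long nr_free_pages = 100;

static int bulk_alloc(int count, int leak_every, int *alloced_out)
{
	int i, alloced = 0;

	for (i = 0; i < count; i++) {
		int bad = leak_every && ((i + 1) % leak_every == 0);

		if (!bad)
			alloced++;	/* page actually added to the pcp list */
		/* a "bad" page is dropped but was still removed from buddy */
	}
	nr_free_pages -= i;		/* account every page taken from buddy */
	*alloced_out = alloced;
	return i;
}

int main(void)
{
	int alloced;
	int removed = bulk_alloc(10, 5, &alloced);

	printf("removed from buddy: %d, usable: %d, free left: %ld\n",
	       removed, alloced, nr_free_pages);
	return 0;
}
```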
diff --git a/mm/percpu.c b/mm/percpu.c
index 255714302394..f696385bcc44 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
2093 size_t pages_size; 2093 size_t pages_size;
2094 struct page **pages; 2094 struct page **pages;
2095 int unit, i, j, rc; 2095 int unit, i, j, rc;
2096 int upa;
2097 int nr_g0_units;
2096 2098
2097 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); 2099 snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
2098 2100
@@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
2100 if (IS_ERR(ai)) 2102 if (IS_ERR(ai))
2101 return PTR_ERR(ai); 2103 return PTR_ERR(ai);
2102 BUG_ON(ai->nr_groups != 1); 2104 BUG_ON(ai->nr_groups != 1);
2103 BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); 2105 upa = ai->alloc_size/ai->unit_size;
2106 nr_g0_units = roundup(num_possible_cpus(), upa);
2107 if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
2108 pcpu_free_alloc_info(ai);
2109 return -EINVAL;
2110 }
2104 2111
2105 unit_pages = ai->unit_size >> PAGE_SHIFT; 2112 unit_pages = ai->unit_size >> PAGE_SHIFT;
2106 2113
@@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
2111 2118
2112 /* allocate pages */ 2119 /* allocate pages */
2113 j = 0; 2120 j = 0;
2114 for (unit = 0; unit < num_possible_cpus(); unit++) 2121 for (unit = 0; unit < num_possible_cpus(); unit++) {
2122 unsigned int cpu = ai->groups[0].cpu_map[unit];
2115 for (i = 0; i < unit_pages; i++) { 2123 for (i = 0; i < unit_pages; i++) {
2116 unsigned int cpu = ai->groups[0].cpu_map[unit];
2117 void *ptr; 2124 void *ptr;
2118 2125
2119 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); 2126 ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
2120 if (!ptr) { 2127 if (!ptr) {
2121 pr_warn("failed to allocate %s page for cpu%u\n", 2128 pr_warn("failed to allocate %s page for cpu%u\n",
2122 psize_str, cpu); 2129 psize_str, cpu);
2123 goto enomem; 2130 goto enomem;
2124 } 2131 }
2125 /* kmemleak tracks the percpu allocations separately */ 2132 /* kmemleak tracks the percpu allocations separately */
2126 kmemleak_free(ptr); 2133 kmemleak_free(ptr);
2127 pages[j++] = virt_to_page(ptr); 2134 pages[j++] = virt_to_page(ptr);
2128 } 2135 }
2136 }
2129 2137
2130 /* allocate vm area, map the pages and copy static data */ 2138 /* allocate vm area, map the pages and copy static data */
2131 vm.flags = VM_ALLOC; 2139 vm.flags = VM_ALLOC;
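
The pcpu_page_first_chunk() hunk above replaces a BUG_ON with a recoverable check: group 0 is expected to contain num_possible_cpus() rounded up to the units-per-allocation (upa) granularity, and anything else now fails with -EINVAL. A stand-alone illustration of that arithmetic, with made-up sizes:

```c
#include <stdio.h>

/* Sketch of the sanity check added to pcpu_page_first_chunk().  The sizes
 * below are invented for the example. */
static unsigned long roundup_to(unsigned long x, unsigned long step)
{
	return ((x + step - 1) / step) * step;
}

int main(void)
{
	unsigned long alloc_size = 4 * 4096;	/* hypothetical */
	unsigned long unit_size  = 4096;	/* hypothetical */
	unsigned long upa = alloc_size / unit_size;
	unsigned long possible_cpus = 6;
	unsigned long nr_g0_units = roundup_to(possible_cpus, upa);

	/* With 6 possible CPUs and 4 units per allocation, group 0 is
	 * expected to hold 8 units; any other count is rejected. */
	printf("upa=%lu expected group-0 units=%lu\n", upa, nr_g0_units);
	return 0;
}
```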
diff --git a/mm/readahead.c b/mm/readahead.c
index c8a955b1297e..c4ca70239233 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -207,12 +207,21 @@ out:
207 * memory at once. 207 * memory at once.
208 */ 208 */
209int force_page_cache_readahead(struct address_space *mapping, struct file *filp, 209int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
210 pgoff_t offset, unsigned long nr_to_read) 210 pgoff_t offset, unsigned long nr_to_read)
211{ 211{
212 struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
213 struct file_ra_state *ra = &filp->f_ra;
214 unsigned long max_pages;
215
212 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) 216 if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
213 return -EINVAL; 217 return -EINVAL;
214 218
215 nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); 219 /*
220 * If the request exceeds the readahead window, allow the read to
221 * be up to the optimal hardware IO size
222 */
223 max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
224 nr_to_read = min(nr_to_read, max_pages);
216 while (nr_to_read) { 225 while (nr_to_read) {
217 int err; 226 int err;
218 227
@@ -369,10 +378,18 @@ ondemand_readahead(struct address_space *mapping,
369 bool hit_readahead_marker, pgoff_t offset, 378 bool hit_readahead_marker, pgoff_t offset,
370 unsigned long req_size) 379 unsigned long req_size)
371{ 380{
372 unsigned long max = ra->ra_pages; 381 struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
382 unsigned long max_pages = ra->ra_pages;
373 pgoff_t prev_offset; 383 pgoff_t prev_offset;
374 384
375 /* 385 /*
386 * If the request exceeds the readahead window, allow the read to
387 * be up to the optimal hardware IO size
388 */
389 if (req_size > max_pages && bdi->io_pages > max_pages)
390 max_pages = min(req_size, bdi->io_pages);
391
392 /*
376 * start of file 393 * start of file
377 */ 394 */
378 if (!offset) 395 if (!offset)
@@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping,
385 if ((offset == (ra->start + ra->size - ra->async_size) || 402 if ((offset == (ra->start + ra->size - ra->async_size) ||
386 offset == (ra->start + ra->size))) { 403 offset == (ra->start + ra->size))) {
387 ra->start += ra->size; 404 ra->start += ra->size;
388 ra->size = get_next_ra_size(ra, max); 405 ra->size = get_next_ra_size(ra, max_pages);
389 ra->async_size = ra->size; 406 ra->async_size = ra->size;
390 goto readit; 407 goto readit;
391 } 408 }
@@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping,
400 pgoff_t start; 417 pgoff_t start;
401 418
402 rcu_read_lock(); 419 rcu_read_lock();
403 start = page_cache_next_hole(mapping, offset + 1, max); 420 start = page_cache_next_hole(mapping, offset + 1, max_pages);
404 rcu_read_unlock(); 421 rcu_read_unlock();
405 422
406 if (!start || start - offset > max) 423 if (!start || start - offset > max_pages)
407 return 0; 424 return 0;
408 425
409 ra->start = start; 426 ra->start = start;
410 ra->size = start - offset; /* old async_size */ 427 ra->size = start - offset; /* old async_size */
411 ra->size += req_size; 428 ra->size += req_size;
412 ra->size = get_next_ra_size(ra, max); 429 ra->size = get_next_ra_size(ra, max_pages);
413 ra->async_size = ra->size; 430 ra->async_size = ra->size;
414 goto readit; 431 goto readit;
415 } 432 }
@@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping,
417 /* 434 /*
418 * oversize read 435 * oversize read
419 */ 436 */
420 if (req_size > max) 437 if (req_size > max_pages)
421 goto initial_readahead; 438 goto initial_readahead;
422 439
423 /* 440 /*
@@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping,
433 * Query the page cache and look for the traces(cached history pages) 450 * Query the page cache and look for the traces(cached history pages)
434 * that a sequential stream would leave behind. 451 * that a sequential stream would leave behind.
435 */ 452 */
436 if (try_context_readahead(mapping, ra, offset, req_size, max)) 453 if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
437 goto readit; 454 goto readit;
438 455
439 /* 456 /*
@@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping,
444 461
445initial_readahead: 462initial_readahead:
446 ra->start = offset; 463 ra->start = offset;
447 ra->size = get_init_ra_size(req_size, max); 464 ra->size = get_init_ra_size(req_size, max_pages);
448 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 465 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
449 466
450readit: 467readit:
@@ -454,7 +471,7 @@ readit:
454 * the resulting next readahead window into the current one. 471 * the resulting next readahead window into the current one.
455 */ 472 */
456 if (offset == ra->start && ra->size == ra->async_size) { 473 if (offset == ra->start && ra->size == ra->async_size) {
457 ra->async_size = get_next_ra_size(ra, max); 474 ra->async_size = get_next_ra_size(ra, max_pages);
458 ra->size += ra->async_size; 475 ra->size += ra->async_size;
459 } 476 }
460 477
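
The readahead hunks above let a request larger than the per-file readahead window grow up to the device's optimal IO size (bdi->io_pages) instead of being clipped to ra_pages. A rough model of the window selection, with invented numbers:

```c
#include <stdio.h>

/* Model of the readahead-window change: when the request exceeds the
 * configured window, the read may grow up to the hardware-optimal IO size
 * rather than being clamped to ra_pages. */
static unsigned long ra_window(unsigned long req_size,
			       unsigned long ra_pages,
			       unsigned long io_pages)
{
	unsigned long max_pages = ra_pages;

	if (req_size > max_pages && io_pages > max_pages)
		max_pages = req_size < io_pages ? req_size : io_pages;
	return max_pages;
}

int main(void)
{
	/* small request: stays within the normal readahead window */
	printf("%lu\n", ra_window(16, 32, 256));   /* 32 */
	/* large request: allowed to grow up to the device's io_pages */
	printf("%lu\n", ra_window(128, 32, 256));  /* 128 */
	/* huge request: clamped at io_pages */
	printf("%lu\n", ra_window(512, 32, 256));  /* 256 */
	return 0;
}
```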
diff --git a/mm/rmap.c b/mm/rmap.c
index 1ef36404e7b2..91619fd70939 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
141} 141}
142 142
143/** 143/**
144 * anon_vma_prepare - attach an anon_vma to a memory region 144 * __anon_vma_prepare - attach an anon_vma to a memory region
145 * @vma: the memory region in question 145 * @vma: the memory region in question
146 * 146 *
147 * This makes sure the memory mapping described by 'vma' has 147 * This makes sure the memory mapping described by 'vma' has
148 * an 'anon_vma' attached to it, so that we can associate the 148 * an 'anon_vma' attached to it, so that we can associate the
149 * anonymous pages mapped into it with that anon_vma. 149 * anonymous pages mapped into it with that anon_vma.
150 * 150 *
151 * The common case will be that we already have one, but if 151 * The common case will be that we already have one, which
152 * is handled inline by anon_vma_prepare(). But if
152 * not we either need to find an adjacent mapping that we 153 * not we either need to find an adjacent mapping that we
153 * can re-use the anon_vma from (very common when the only 154 * can re-use the anon_vma from (very common when the only
154 * reason for splitting a vma has been mprotect()), or we 155 * reason for splitting a vma has been mprotect()), or we
@@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
167 * 168 *
168 * This must be called with the mmap_sem held for reading. 169 * This must be called with the mmap_sem held for reading.
169 */ 170 */
170int anon_vma_prepare(struct vm_area_struct *vma) 171int __anon_vma_prepare(struct vm_area_struct *vma)
171{ 172{
172 struct anon_vma *anon_vma = vma->anon_vma; 173 struct mm_struct *mm = vma->vm_mm;
174 struct anon_vma *anon_vma, *allocated;
173 struct anon_vma_chain *avc; 175 struct anon_vma_chain *avc;
174 176
175 might_sleep(); 177 might_sleep();
176 if (unlikely(!anon_vma)) {
177 struct mm_struct *mm = vma->vm_mm;
178 struct anon_vma *allocated;
179 178
180 avc = anon_vma_chain_alloc(GFP_KERNEL); 179 avc = anon_vma_chain_alloc(GFP_KERNEL);
181 if (!avc) 180 if (!avc)
182 goto out_enomem; 181 goto out_enomem;
182
183 anon_vma = find_mergeable_anon_vma(vma);
184 allocated = NULL;
185 if (!anon_vma) {
186 anon_vma = anon_vma_alloc();
187 if (unlikely(!anon_vma))
188 goto out_enomem_free_avc;
189 allocated = anon_vma;
190 }
183 191
184 anon_vma = find_mergeable_anon_vma(vma); 192 anon_vma_lock_write(anon_vma);
193 /* page_table_lock to protect against threads */
194 spin_lock(&mm->page_table_lock);
195 if (likely(!vma->anon_vma)) {
196 vma->anon_vma = anon_vma;
197 anon_vma_chain_link(vma, avc, anon_vma);
198 /* vma reference or self-parent link for new root */
199 anon_vma->degree++;
185 allocated = NULL; 200 allocated = NULL;
186 if (!anon_vma) { 201 avc = NULL;
187 anon_vma = anon_vma_alloc(); 202 }
188 if (unlikely(!anon_vma)) 203 spin_unlock(&mm->page_table_lock);
189 goto out_enomem_free_avc; 204 anon_vma_unlock_write(anon_vma);
190 allocated = anon_vma;
191 }
192 205
193 anon_vma_lock_write(anon_vma); 206 if (unlikely(allocated))
194 /* page_table_lock to protect against threads */ 207 put_anon_vma(allocated);
195 spin_lock(&mm->page_table_lock); 208 if (unlikely(avc))
196 if (likely(!vma->anon_vma)) { 209 anon_vma_chain_free(avc);
197 vma->anon_vma = anon_vma;
198 anon_vma_chain_link(vma, avc, anon_vma);
199 /* vma reference or self-parent link for new root */
200 anon_vma->degree++;
201 allocated = NULL;
202 avc = NULL;
203 }
204 spin_unlock(&mm->page_table_lock);
205 anon_vma_unlock_write(anon_vma);
206 210
207 if (unlikely(allocated))
208 put_anon_vma(allocated);
209 if (unlikely(avc))
210 anon_vma_chain_free(avc);
211 }
212 return 0; 211 return 0;
213 212
214 out_enomem_free_avc: 213 out_enomem_free_avc:
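
The rmap.c hunk above renames the slow path to __anon_vma_prepare() because the common already-prepared case is now handled by an inline wrapper in the header. The shape of that split, reduced to a toy example (toy_vma and the helpers are stand-ins, not kernel types):

```c
#include <stdio.h>
#include <stdlib.h>

/* Sketch of the wrapper/slow-path split behind the __anon_vma_prepare()
 * rename: the "already prepared" case is a cheap inline check, and only
 * the rare allocation path calls the out-of-line function. */
struct toy_vma {
	void *anon_vma;
};

static int toy_slow_prepare(struct toy_vma *vma)	/* "__anon_vma_prepare" */
{
	vma->anon_vma = malloc(32);
	return vma->anon_vma ? 0 : -1;
}

static inline int toy_prepare(struct toy_vma *vma)	/* "anon_vma_prepare" */
{
	if (vma->anon_vma)		/* common case: nothing to do */
		return 0;
	return toy_slow_prepare(vma);	/* rare case: allocate and attach */
}

int main(void)
{
	struct toy_vma vma = { .anon_vma = NULL };

	printf("first call:  %d\n", toy_prepare(&vma));	/* takes slow path */
	printf("second call: %d\n", toy_prepare(&vma));	/* pure fast path */
	free(vma.anon_vma);
	return 0;
}
```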
diff --git a/mm/shmem.c b/mm/shmem.c
index 9d32e1cb9f38..abd7403aba41 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -300,18 +300,19 @@ void shmem_uncharge(struct inode *inode, long pages)
300static int shmem_radix_tree_replace(struct address_space *mapping, 300static int shmem_radix_tree_replace(struct address_space *mapping,
301 pgoff_t index, void *expected, void *replacement) 301 pgoff_t index, void *expected, void *replacement)
302{ 302{
303 struct radix_tree_node *node;
303 void **pslot; 304 void **pslot;
304 void *item; 305 void *item;
305 306
306 VM_BUG_ON(!expected); 307 VM_BUG_ON(!expected);
307 VM_BUG_ON(!replacement); 308 VM_BUG_ON(!replacement);
308 pslot = radix_tree_lookup_slot(&mapping->page_tree, index); 309 item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot);
309 if (!pslot) 310 if (!item)
310 return -ENOENT; 311 return -ENOENT;
311 item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
312 if (item != expected) 312 if (item != expected)
313 return -ENOENT; 313 return -ENOENT;
314 radix_tree_replace_slot(pslot, replacement); 314 __radix_tree_replace(&mapping->page_tree, node, pslot,
315 replacement, NULL, NULL);
315 return 0; 316 return 0;
316} 317}
317 318
@@ -370,6 +371,7 @@ static bool shmem_confirm_swap(struct address_space *mapping,
370 371
371int shmem_huge __read_mostly; 372int shmem_huge __read_mostly;
372 373
374#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
373static int shmem_parse_huge(const char *str) 375static int shmem_parse_huge(const char *str)
374{ 376{
375 if (!strcmp(str, "never")) 377 if (!strcmp(str, "never"))
@@ -407,6 +409,7 @@ static const char *shmem_format_huge(int huge)
407 return "bad_val"; 409 return "bad_val";
408 } 410 }
409} 411}
412#endif
410 413
411static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, 414static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
412 struct shrink_control *sc, unsigned long nr_to_split) 415 struct shrink_control *sc, unsigned long nr_to_split)
@@ -1539,7 +1542,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1539 struct mm_struct *fault_mm, int *fault_type) 1542 struct mm_struct *fault_mm, int *fault_type)
1540{ 1543{
1541 struct address_space *mapping = inode->i_mapping; 1544 struct address_space *mapping = inode->i_mapping;
1542 struct shmem_inode_info *info; 1545 struct shmem_inode_info *info = SHMEM_I(inode);
1543 struct shmem_sb_info *sbinfo; 1546 struct shmem_sb_info *sbinfo;
1544 struct mm_struct *charge_mm; 1547 struct mm_struct *charge_mm;
1545 struct mem_cgroup *memcg; 1548 struct mem_cgroup *memcg;
@@ -1589,7 +1592,6 @@ repeat:
1589 * Fast cache lookup did not find it: 1592 * Fast cache lookup did not find it:
1590 * bring it back from swap or allocate. 1593 * bring it back from swap or allocate.
1591 */ 1594 */
1592 info = SHMEM_I(inode);
1593 sbinfo = SHMEM_SB(inode->i_sb); 1595 sbinfo = SHMEM_SB(inode->i_sb);
1594 charge_mm = fault_mm ? : current->mm; 1596 charge_mm = fault_mm ? : current->mm;
1595 1597
@@ -1837,7 +1839,6 @@ unlock:
1837 put_page(page); 1839 put_page(page);
1838 } 1840 }
1839 if (error == -ENOSPC && !once++) { 1841 if (error == -ENOSPC && !once++) {
1840 info = SHMEM_I(inode);
1841 spin_lock_irq(&info->lock); 1842 spin_lock_irq(&info->lock);
1842 shmem_recalc_inode(inode); 1843 shmem_recalc_inode(inode);
1843 spin_unlock_irq(&info->lock); 1844 spin_unlock_irq(&info->lock);
diff --git a/mm/slab.c b/mm/slab.c
index 0b0550ca85b4..87b29e76cafd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
227 INIT_LIST_HEAD(&parent->slabs_full); 227 INIT_LIST_HEAD(&parent->slabs_full);
228 INIT_LIST_HEAD(&parent->slabs_partial); 228 INIT_LIST_HEAD(&parent->slabs_partial);
229 INIT_LIST_HEAD(&parent->slabs_free); 229 INIT_LIST_HEAD(&parent->slabs_free);
230 parent->total_slabs = 0;
231 parent->free_slabs = 0;
230 parent->shared = NULL; 232 parent->shared = NULL;
231 parent->alien = NULL; 233 parent->alien = NULL;
232 parent->colour_next = 0; 234 parent->colour_next = 0;
233 spin_lock_init(&parent->list_lock); 235 spin_lock_init(&parent->list_lock);
234 parent->free_objects = 0; 236 parent->free_objects = 0;
235 parent->free_touched = 0; 237 parent->free_touched = 0;
236 parent->num_slabs = 0;
237} 238}
238 239
239#define MAKE_LIST(cachep, listp, slab, nodeid) \ 240#define MAKE_LIST(cachep, listp, slab, nodeid) \
@@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1366{ 1367{
1367#if DEBUG 1368#if DEBUG
1368 struct kmem_cache_node *n; 1369 struct kmem_cache_node *n;
1369 struct page *page;
1370 unsigned long flags; 1370 unsigned long flags;
1371 int node; 1371 int node;
1372 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, 1372 static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -1381,32 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1381 cachep->name, cachep->size, cachep->gfporder); 1381 cachep->name, cachep->size, cachep->gfporder);
1382 1382
1383 for_each_kmem_cache_node(cachep, node, n) { 1383 for_each_kmem_cache_node(cachep, node, n) {
1384 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1384 unsigned long total_slabs, free_slabs, free_objs;
1385 unsigned long active_slabs = 0, num_slabs = 0;
1386 unsigned long num_slabs_partial = 0, num_slabs_free = 0;
1387 unsigned long num_slabs_full;
1388 1385
1389 spin_lock_irqsave(&n->list_lock, flags); 1386 spin_lock_irqsave(&n->list_lock, flags);
1390 num_slabs = n->num_slabs; 1387 total_slabs = n->total_slabs;
1391 list_for_each_entry(page, &n->slabs_partial, lru) { 1388 free_slabs = n->free_slabs;
1392 active_objs += page->active; 1389 free_objs = n->free_objects;
1393 num_slabs_partial++;
1394 }
1395 list_for_each_entry(page, &n->slabs_free, lru)
1396 num_slabs_free++;
1397
1398 free_objects += n->free_objects;
1399 spin_unlock_irqrestore(&n->list_lock, flags); 1390 spin_unlock_irqrestore(&n->list_lock, flags);
1400 1391
1401 num_objs = num_slabs * cachep->num; 1392 pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n",
1402 active_slabs = num_slabs - num_slabs_free; 1393 node, total_slabs - free_slabs, total_slabs,
1403 num_slabs_full = num_slabs - 1394 (total_slabs * cachep->num) - free_objs,
1404 (num_slabs_partial + num_slabs_free); 1395 total_slabs * cachep->num);
1405 active_objs += (num_slabs_full * cachep->num);
1406
1407 pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n",
1408 node, active_slabs, num_slabs, active_objs, num_objs,
1409 free_objects);
1410 } 1396 }
1411#endif 1397#endif
1412} 1398}
@@ -2318,7 +2304,8 @@ static int drain_freelist(struct kmem_cache *cache,
2318 2304
2319 page = list_entry(p, struct page, lru); 2305 page = list_entry(p, struct page, lru);
2320 list_del(&page->lru); 2306 list_del(&page->lru);
2321 n->num_slabs--; 2307 n->free_slabs--;
2308 n->total_slabs--;
2322 /* 2309 /*
2323 * Safe to drop the lock. The slab is no longer linked 2310 * Safe to drop the lock. The slab is no longer linked
2324 * to the cache. 2311 * to the cache.
@@ -2332,7 +2319,7 @@ out:
2332 return nr_freed; 2319 return nr_freed;
2333} 2320}
2334 2321
2335int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) 2322int __kmem_cache_shrink(struct kmem_cache *cachep)
2336{ 2323{
2337 int ret = 0; 2324 int ret = 0;
2338 int node; 2325 int node;
@@ -2352,7 +2339,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
2352 2339
2353int __kmem_cache_shutdown(struct kmem_cache *cachep) 2340int __kmem_cache_shutdown(struct kmem_cache *cachep)
2354{ 2341{
2355 return __kmem_cache_shrink(cachep, false); 2342 return __kmem_cache_shrink(cachep);
2356} 2343}
2357 2344
2358void __kmem_cache_release(struct kmem_cache *cachep) 2345void __kmem_cache_release(struct kmem_cache *cachep)
@@ -2753,12 +2740,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
2753 n = get_node(cachep, page_to_nid(page)); 2740 n = get_node(cachep, page_to_nid(page));
2754 2741
2755 spin_lock(&n->list_lock); 2742 spin_lock(&n->list_lock);
2756 if (!page->active) 2743 n->total_slabs++;
2744 if (!page->active) {
2757 list_add_tail(&page->lru, &(n->slabs_free)); 2745 list_add_tail(&page->lru, &(n->slabs_free));
2758 else 2746 n->free_slabs++;
2747 } else
2759 fixup_slab_list(cachep, n, page, &list); 2748 fixup_slab_list(cachep, n, page, &list);
2760 2749
2761 n->num_slabs++;
2762 STATS_INC_GROWN(cachep); 2750 STATS_INC_GROWN(cachep);
2763 n->free_objects += cachep->num - page->active; 2751 n->free_objects += cachep->num - page->active;
2764 spin_unlock(&n->list_lock); 2752 spin_unlock(&n->list_lock);
@@ -2903,9 +2891,10 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
2903 2891
2904 /* Move pfmemalloc slab to the end of list to speed up next search */ 2892 /* Move pfmemalloc slab to the end of list to speed up next search */
2905 list_del(&page->lru); 2893 list_del(&page->lru);
2906 if (!page->active) 2894 if (!page->active) {
2907 list_add_tail(&page->lru, &n->slabs_free); 2895 list_add_tail(&page->lru, &n->slabs_free);
2908 else 2896 n->free_slabs++;
2897 } else
2909 list_add_tail(&page->lru, &n->slabs_partial); 2898 list_add_tail(&page->lru, &n->slabs_partial);
2910 2899
2911 list_for_each_entry(page, &n->slabs_partial, lru) { 2900 list_for_each_entry(page, &n->slabs_partial, lru) {
@@ -2913,9 +2902,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n,
2913 return page; 2902 return page;
2914 } 2903 }
2915 2904
2905 n->free_touched = 1;
2916 list_for_each_entry(page, &n->slabs_free, lru) { 2906 list_for_each_entry(page, &n->slabs_free, lru) {
2917 if (!PageSlabPfmemalloc(page)) 2907 if (!PageSlabPfmemalloc(page)) {
2908 n->free_slabs--;
2918 return page; 2909 return page;
2910 }
2919 } 2911 }
2920 2912
2921 return NULL; 2913 return NULL;
@@ -2925,16 +2917,18 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
2925{ 2917{
2926 struct page *page; 2918 struct page *page;
2927 2919
2928 page = list_first_entry_or_null(&n->slabs_partial, 2920 assert_spin_locked(&n->list_lock);
2929 struct page, lru); 2921 page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
2930 if (!page) { 2922 if (!page) {
2931 n->free_touched = 1; 2923 n->free_touched = 1;
2932 page = list_first_entry_or_null(&n->slabs_free, 2924 page = list_first_entry_or_null(&n->slabs_free, struct page,
2933 struct page, lru); 2925 lru);
2926 if (page)
2927 n->free_slabs--;
2934 } 2928 }
2935 2929
2936 if (sk_memalloc_socks()) 2930 if (sk_memalloc_socks())
2937 return get_valid_first_slab(n, page, pfmemalloc); 2931 page = get_valid_first_slab(n, page, pfmemalloc);
2938 2932
2939 return page; 2933 return page;
2940} 2934}
@@ -3434,9 +3428,10 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
3434 STATS_DEC_ACTIVE(cachep); 3428 STATS_DEC_ACTIVE(cachep);
3435 3429
3436 /* fixup slab chains */ 3430 /* fixup slab chains */
3437 if (page->active == 0) 3431 if (page->active == 0) {
3438 list_add(&page->lru, &n->slabs_free); 3432 list_add(&page->lru, &n->slabs_free);
3439 else { 3433 n->free_slabs++;
3434 } else {
3440 /* Unconditionally move a slab to the end of the 3435 /* Unconditionally move a slab to the end of the
3441 * partial list on free - maximum time for the 3436 * partial list on free - maximum time for the
3442 * other objects to be freed, too. 3437 * other objects to be freed, too.
@@ -3450,7 +3445,8 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
3450 3445
3451 page = list_last_entry(&n->slabs_free, struct page, lru); 3446 page = list_last_entry(&n->slabs_free, struct page, lru);
3452 list_move(&page->lru, list); 3447 list_move(&page->lru, list);
3453 n->num_slabs--; 3448 n->free_slabs--;
3449 n->total_slabs--;
3454 } 3450 }
3455} 3451}
3456 3452
@@ -4102,64 +4098,33 @@ out:
4102#ifdef CONFIG_SLABINFO 4098#ifdef CONFIG_SLABINFO
4103void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) 4099void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
4104{ 4100{
4105 struct page *page; 4101 unsigned long active_objs, num_objs, active_slabs;
4106 unsigned long active_objs; 4102 unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0;
4107 unsigned long num_objs; 4103 unsigned long free_slabs = 0;
4108 unsigned long active_slabs = 0;
4109 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
4110 unsigned long num_slabs_partial = 0, num_slabs_free = 0;
4111 unsigned long num_slabs_full = 0;
4112 const char *name;
4113 char *error = NULL;
4114 int node; 4104 int node;
4115 struct kmem_cache_node *n; 4105 struct kmem_cache_node *n;
4116 4106
4117 active_objs = 0;
4118 num_slabs = 0;
4119 for_each_kmem_cache_node(cachep, node, n) { 4107 for_each_kmem_cache_node(cachep, node, n) {
4120
4121 check_irq_on(); 4108 check_irq_on();
4122 spin_lock_irq(&n->list_lock); 4109 spin_lock_irq(&n->list_lock);
4123 4110
4124 num_slabs += n->num_slabs; 4111 total_slabs += n->total_slabs;
4112 free_slabs += n->free_slabs;
4113 free_objs += n->free_objects;
4125 4114
4126 list_for_each_entry(page, &n->slabs_partial, lru) {
4127 if (page->active == cachep->num && !error)
4128 error = "slabs_partial accounting error";
4129 if (!page->active && !error)
4130 error = "slabs_partial accounting error";
4131 active_objs += page->active;
4132 num_slabs_partial++;
4133 }
4134
4135 list_for_each_entry(page, &n->slabs_free, lru) {
4136 if (page->active && !error)
4137 error = "slabs_free accounting error";
4138 num_slabs_free++;
4139 }
4140
4141 free_objects += n->free_objects;
4142 if (n->shared) 4115 if (n->shared)
4143 shared_avail += n->shared->avail; 4116 shared_avail += n->shared->avail;
4144 4117
4145 spin_unlock_irq(&n->list_lock); 4118 spin_unlock_irq(&n->list_lock);
4146 } 4119 }
4147 num_objs = num_slabs * cachep->num; 4120 num_objs = total_slabs * cachep->num;
4148 active_slabs = num_slabs - num_slabs_free; 4121 active_slabs = total_slabs - free_slabs;
4149 num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); 4122 active_objs = num_objs - free_objs;
4150 active_objs += (num_slabs_full * cachep->num);
4151
4152 if (num_objs - active_objs != free_objects && !error)
4153 error = "free_objects accounting error";
4154
4155 name = cachep->name;
4156 if (error)
4157 pr_err("slab: cache %s error: %s\n", name, error);
4158 4123
4159 sinfo->active_objs = active_objs; 4124 sinfo->active_objs = active_objs;
4160 sinfo->num_objs = num_objs; 4125 sinfo->num_objs = num_objs;
4161 sinfo->active_slabs = active_slabs; 4126 sinfo->active_slabs = active_slabs;
4162 sinfo->num_slabs = num_slabs; 4127 sinfo->num_slabs = total_slabs;
4163 sinfo->shared_avail = shared_avail; 4128 sinfo->shared_avail = shared_avail;
4164 sinfo->limit = cachep->limit; 4129 sinfo->limit = cachep->limit;
4165 sinfo->batchcount = cachep->batchcount; 4130 sinfo->batchcount = cachep->batchcount;
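
The slab.c and get_slabinfo() hunks above replace the per-node list walks with two counters, total_slabs and free_slabs, from which the remaining statistics are derived. The arithmetic, with sample numbers chosen purely for illustration:

```c
#include <stdio.h>

/* Model of the slab statistics rework: each node keeps total_slabs and
 * free_slabs counters, and the remaining figures are derived instead of
 * being recomputed by walking the slab lists under the node lock. */
int main(void)
{
	unsigned long total_slabs = 40, free_slabs = 10, free_objs = 700;
	unsigned long objs_per_slab = 64;	/* cachep->num in the kernel */

	unsigned long num_objs     = total_slabs * objs_per_slab;
	unsigned long active_slabs = total_slabs - free_slabs;
	unsigned long active_objs  = num_objs - free_objs;

	printf("slabs: %lu/%lu, objs: %lu/%lu\n",
	       active_slabs, total_slabs, active_objs, num_objs);
	return 0;
}
```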
diff --git a/mm/slab.h b/mm/slab.h
index bc05fdc3edce..de6579dc362c 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -142,11 +142,26 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
142#define SLAB_CACHE_FLAGS (0) 142#define SLAB_CACHE_FLAGS (0)
143#endif 143#endif
144 144
145/* Common flags available with current configuration */
145#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) 146#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
146 147
148/* Common flags permitted for kmem_cache_create */
149#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \
150 SLAB_RED_ZONE | \
151 SLAB_POISON | \
152 SLAB_STORE_USER | \
153 SLAB_TRACE | \
154 SLAB_CONSISTENCY_CHECKS | \
155 SLAB_MEM_SPREAD | \
156 SLAB_NOLEAKTRACE | \
157 SLAB_RECLAIM_ACCOUNT | \
158 SLAB_TEMPORARY | \
159 SLAB_NOTRACK | \
160 SLAB_ACCOUNT)
161
147int __kmem_cache_shutdown(struct kmem_cache *); 162int __kmem_cache_shutdown(struct kmem_cache *);
148void __kmem_cache_release(struct kmem_cache *); 163void __kmem_cache_release(struct kmem_cache *);
149int __kmem_cache_shrink(struct kmem_cache *, bool); 164int __kmem_cache_shrink(struct kmem_cache *);
150void slab_kmem_cache_release(struct kmem_cache *); 165void slab_kmem_cache_release(struct kmem_cache *);
151 166
152struct seq_file; 167struct seq_file;
@@ -432,7 +447,8 @@ struct kmem_cache_node {
432 struct list_head slabs_partial; /* partial list first, better asm code */ 447 struct list_head slabs_partial; /* partial list first, better asm code */
433 struct list_head slabs_full; 448 struct list_head slabs_full;
434 struct list_head slabs_free; 449 struct list_head slabs_free;
435 unsigned long num_slabs; 450 unsigned long total_slabs; /* length of all slab lists */
451 unsigned long free_slabs; /* length of free slab list only */
436 unsigned long free_objects; 452 unsigned long free_objects;
437 unsigned int free_limit; 453 unsigned int free_limit;
438 unsigned int colour_next; /* Per-node cache coloring */ 454 unsigned int colour_next; /* Per-node cache coloring */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 329b03843863..ae323841adb1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
404 goto out_unlock; 404 goto out_unlock;
405 } 405 }
406 406
407 /* Refuse requests with allocator specific flags */
408 if (flags & ~SLAB_FLAGS_PERMITTED) {
409 err = -EINVAL;
410 goto out_unlock;
411 }
412
407 /* 413 /*
408 * Some allocators will constraint the set of valid flags to a subset 414 * Some allocators will constraint the set of valid flags to a subset
409 * of all flags. We expect them to define CACHE_CREATE_MASK in this 415 * of all flags. We expect them to define CACHE_CREATE_MASK in this
@@ -573,6 +579,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
573 get_online_cpus(); 579 get_online_cpus();
574 get_online_mems(); 580 get_online_mems();
575 581
582#ifdef CONFIG_SLUB
583 /*
584 * In case of SLUB, we need to disable empty slab caching to
585 * avoid pinning the offline memory cgroup by freeable kmem
586 * pages charged to it. SLAB doesn't need this, as it
587 * periodically purges unused slabs.
588 */
589 mutex_lock(&slab_mutex);
590 list_for_each_entry(s, &slab_caches, list) {
591 c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL;
592 if (c) {
593 c->cpu_partial = 0;
594 c->min_partial = 0;
595 }
596 }
597 mutex_unlock(&slab_mutex);
598 /*
599 * kmem_cache->cpu_partial is checked locklessly (see
600 * put_cpu_partial()). Make sure the change is visible.
601 */
602 synchronize_sched();
603#endif
604
576 mutex_lock(&slab_mutex); 605 mutex_lock(&slab_mutex);
577 list_for_each_entry(s, &slab_caches, list) { 606 list_for_each_entry(s, &slab_caches, list) {
578 if (!is_root_cache(s)) 607 if (!is_root_cache(s))
@@ -584,7 +613,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
584 if (!c) 613 if (!c)
585 continue; 614 continue;
586 615
587 __kmem_cache_shrink(c, true); 616 __kmem_cache_shrink(c);
588 arr->entries[idx] = NULL; 617 arr->entries[idx] = NULL;
589 } 618 }
590 mutex_unlock(&slab_mutex); 619 mutex_unlock(&slab_mutex);
@@ -755,7 +784,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
755 get_online_cpus(); 784 get_online_cpus();
756 get_online_mems(); 785 get_online_mems();
757 kasan_cache_shrink(cachep); 786 kasan_cache_shrink(cachep);
758 ret = __kmem_cache_shrink(cachep, false); 787 ret = __kmem_cache_shrink(cachep);
759 put_online_mems(); 788 put_online_mems();
760 put_online_cpus(); 789 put_online_cpus();
761 return ret; 790 return ret;
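
The kmem_cache_create() hunk above adds a whitelist check: any flag outside SLAB_FLAGS_PERMITTED makes the call fail with -EINVAL instead of being accepted or silently masked. A compact model of such a flag whitelist, with arbitrary stand-in flag values:

```c
#include <stdio.h>

/* Sketch of the flag whitelist added to kmem_cache_create(): requests
 * carrying bits outside the permitted mask are refused up front.  The
 * flag values here are arbitrary stand-ins. */
#define F_RED_ZONE	0x01
#define F_POISON	0x02
#define F_ACCOUNT	0x04
#define F_INTERNAL_ONLY	0x80	/* allocator-private, not for callers */

#define FLAGS_PERMITTED	(F_RED_ZONE | F_POISON | F_ACCOUNT)
#define EINVAL		22

static int create_cache(unsigned long flags)
{
	/* Refuse requests with allocator specific flags */
	if (flags & ~FLAGS_PERMITTED)
		return -EINVAL;
	return 0;	/* cache creation would proceed */
}

int main(void)
{
	printf("%d\n", create_cache(F_RED_ZONE | F_POISON));	/* 0 */
	printf("%d\n", create_cache(F_INTERNAL_ONLY));		/* -22 */
	return 0;
}
```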
diff --git a/mm/slob.c b/mm/slob.c
index 5ec158054ffe..eac04d4357ec 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c)
634{ 634{
635} 635}
636 636
637int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) 637int __kmem_cache_shrink(struct kmem_cache *d)
638{ 638{
639 return 0; 639 return 0;
640} 640}
diff --git a/mm/slub.c b/mm/slub.c
index 2b3e740609e9..067598a00849 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3076,7 +3076,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
3076 struct detached_freelist df; 3076 struct detached_freelist df;
3077 3077
3078 size = build_detached_freelist(s, size, p, &df); 3078 size = build_detached_freelist(s, size, p, &df);
3079 if (unlikely(!df.page)) 3079 if (!df.page)
3080 continue; 3080 continue;
3081 3081
3082 slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); 3082 slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
@@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree);
3883 * being allocated from last increasing the chance that the last objects 3883 * being allocated from last increasing the chance that the last objects
3884 * are freed in them. 3884 * are freed in them.
3885 */ 3885 */
3886int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) 3886int __kmem_cache_shrink(struct kmem_cache *s)
3887{ 3887{
3888 int node; 3888 int node;
3889 int i; 3889 int i;
@@ -3895,21 +3895,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
3895 unsigned long flags; 3895 unsigned long flags;
3896 int ret = 0; 3896 int ret = 0;
3897 3897
3898 if (deactivate) {
3899 /*
3900 * Disable empty slabs caching. Used to avoid pinning offline
3901 * memory cgroups by kmem pages that can be freed.
3902 */
3903 s->cpu_partial = 0;
3904 s->min_partial = 0;
3905
3906 /*
3907 * s->cpu_partial is checked locklessly (see put_cpu_partial),
3908 * so we have to make sure the change is visible.
3909 */
3910 synchronize_sched();
3911 }
3912
3913 flush_all(s); 3898 flush_all(s);
3914 for_each_kmem_cache_node(s, node, n) { 3899 for_each_kmem_cache_node(s, node, n) {
3915 INIT_LIST_HEAD(&discard); 3900 INIT_LIST_HEAD(&discard);
@@ -3966,7 +3951,7 @@ static int slab_mem_going_offline_callback(void *arg)
3966 3951
3967 mutex_lock(&slab_mutex); 3952 mutex_lock(&slab_mutex);
3968 list_for_each_entry(s, &slab_caches, list) 3953 list_for_each_entry(s, &slab_caches, list)
3969 __kmem_cache_shrink(s, false); 3954 __kmem_cache_shrink(s);
3970 mutex_unlock(&slab_mutex); 3955 mutex_unlock(&slab_mutex);
3971 3956
3972 return 0; 3957 return 0;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f30438970cd1..1c6e0321205d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1234,6 +1234,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
1234 1234
1235 pmd = pmd_offset(pud, addr); 1235 pmd = pmd_offset(pud, addr);
1236 do { 1236 do {
1237 cond_resched();
1237 next = pmd_addr_end(addr, end); 1238 next = pmd_addr_end(addr, end);
1238 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 1239 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1239 continue; 1240 continue;
@@ -1313,6 +1314,7 @@ static int unuse_mm(struct mm_struct *mm,
1313 for (vma = mm->mmap; vma; vma = vma->vm_next) { 1314 for (vma = mm->mmap; vma; vma = vma->vm_next) {
1314 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) 1315 if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
1315 break; 1316 break;
1317 cond_resched();
1316 } 1318 }
1317 up_read(&mm->mmap_sem); 1319 up_read(&mm->mmap_sem);
1318 return (ret < 0)? ret: 0; 1320 return (ret < 0)? ret: 0;
@@ -1350,15 +1352,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1350 prev = 0; 1352 prev = 0;
1351 i = 1; 1353 i = 1;
1352 } 1354 }
1353 if (frontswap) {
1354 if (frontswap_test(si, i))
1355 break;
1356 else
1357 continue;
1358 }
1359 count = READ_ONCE(si->swap_map[i]); 1355 count = READ_ONCE(si->swap_map[i]);
1360 if (count && swap_count(count) != SWAP_MAP_BAD) 1356 if (count && swap_count(count) != SWAP_MAP_BAD)
1361 break; 1357 if (!frontswap || frontswap_test(si, i))
1358 break;
1359 if ((i % LATENCY_LIMIT) == 0)
1360 cond_resched();
1362 } 1361 }
1363 return i; 1362 return i;
1364} 1363}
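
The find_next_to_unuse() hunk above folds the frontswap filter into the in-use test and yields periodically on long scans. A user-space approximation of that scan; LATENCY_LIMIT and the maps below are invented for the example:

```c
#include <stdbool.h>
#include <stdio.h>

/* Model of the reworked find_next_to_unuse() scan: a slot is a candidate
 * only if it is in use, and (when frontswap filtering is requested) also
 * present in frontswap; a long scan yields periodically. */
#define LATENCY_LIMIT 256

static int find_next(const unsigned char *map, const bool *fs_map,
		     int max, bool frontswap)
{
	for (int i = 1; i < max; i++) {
		if (map[i] && (!frontswap || fs_map[i]))
			return i;
		if ((i % LATENCY_LIMIT) == 0) {
			/* the kernel calls cond_resched() here to bound latency */
		}
	}
	return 0;	/* nothing left to unuse */
}

int main(void)
{
	unsigned char map[8]  = { 0, 0, 1, 0, 1, 0, 0, 0 };
	bool frontswap_map[8] = { 0, 0, 0, 0, 1, 0, 0, 0 };

	printf("any in-use slot:       %d\n", find_next(map, frontswap_map, 8, false)); /* 2 */
	printf("frontswap-backed slot: %d\n", find_next(map, frontswap_map, 8, true));  /* 4 */
	return 0;
}
```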
diff --git a/mm/truncate.c b/mm/truncate.c
index 8d8c62d89e6d..fd97f1dbce29 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping,
44 * without the tree itself locked. These unlocked entries 44 * without the tree itself locked. These unlocked entries
45 * need verification under the tree lock. 45 * need verification under the tree lock.
46 */ 46 */
47 if (!__radix_tree_lookup(&mapping->page_tree, index, &node, 47 if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
48 &slot))
49 goto unlock; 48 goto unlock;
50 if (*slot != entry) 49 if (*slot != entry)
51 goto unlock; 50 goto unlock;
52 radix_tree_replace_slot(slot, NULL); 51 __radix_tree_replace(&mapping->page_tree, node, slot, NULL,
52 workingset_update_node, mapping);
53 mapping->nrexceptional--; 53 mapping->nrexceptional--;
54 if (!node)
55 goto unlock;
56 workingset_node_shadows_dec(node);
57 /*
58 * Don't track node without shadow entries.
59 *
60 * Avoid acquiring the list_lru lock if already untracked.
61 * The list_empty() test is safe as node->private_list is
62 * protected by mapping->tree_lock.
63 */
64 if (!workingset_node_shadows(node) &&
65 !list_empty(&node->private_list))
66 list_lru_del(&workingset_shadow_nodes,
67 &node->private_list);
68 __radix_tree_delete_node(&mapping->page_tree, node);
69unlock: 54unlock:
70 spin_unlock_irq(&mapping->tree_lock); 55 spin_unlock_irq(&mapping->tree_lock);
71} 56}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f2481cb4e6b2..a5584384eabc 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
365 BUG_ON(offset_in_page(size)); 365 BUG_ON(offset_in_page(size));
366 BUG_ON(!is_power_of_2(align)); 366 BUG_ON(!is_power_of_2(align));
367 367
368 might_sleep_if(gfpflags_allow_blocking(gfp_mask)); 368 might_sleep();
369 369
370 va = kmalloc_node(sizeof(struct vmap_area), 370 va = kmalloc_node(sizeof(struct vmap_area),
371 gfp_mask & GFP_RECLAIM_MASK, node); 371 gfp_mask & GFP_RECLAIM_MASK, node);
@@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void)
601 601
602static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); 602static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
603 603
604/*
605 * Serialize vmap purging. There is no actual critical section protected
606 * by this lock, but we want to avoid concurrent calls for performance
607 * reasons and to make the pcpu_get_vm_areas more deterministic.
608 */
609static DEFINE_MUTEX(vmap_purge_lock);
610
604/* for per-CPU blocks */ 611/* for per-CPU blocks */
605static void purge_fragmented_blocks_allcpus(void); 612static void purge_fragmented_blocks_allcpus(void);
606 613
@@ -615,59 +622,40 @@ void set_iounmap_nonlazy(void)
615 622
616/* 623/*
617 * Purges all lazily-freed vmap areas. 624 * Purges all lazily-freed vmap areas.
618 *
619 * If sync is 0 then don't purge if there is already a purge in progress.
620 * If force_flush is 1, then flush kernel TLBs between *start and *end even
621 * if we found no lazy vmap areas to unmap (callers can use this to optimise
622 * their own TLB flushing).
623 * Returns with *start = min(*start, lowest purged address)
624 * *end = max(*end, highest purged address)
625 */ 625 */
626static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, 626static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
627 int sync, int force_flush)
628{ 627{
629 static DEFINE_SPINLOCK(purge_lock);
630 struct llist_node *valist; 628 struct llist_node *valist;
631 struct vmap_area *va; 629 struct vmap_area *va;
632 struct vmap_area *n_va; 630 struct vmap_area *n_va;
633 int nr = 0; 631 bool do_free = false;
634 632
635 /* 633 lockdep_assert_held(&vmap_purge_lock);
636 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
637 * should not expect such behaviour. This just simplifies locking for
638 * the case that isn't actually used at the moment anyway.
639 */
640 if (!sync && !force_flush) {
641 if (!spin_trylock(&purge_lock))
642 return;
643 } else
644 spin_lock(&purge_lock);
645
646 if (sync)
647 purge_fragmented_blocks_allcpus();
648 634
649 valist = llist_del_all(&vmap_purge_list); 635 valist = llist_del_all(&vmap_purge_list);
650 llist_for_each_entry(va, valist, purge_list) { 636 llist_for_each_entry(va, valist, purge_list) {
651 if (va->va_start < *start) 637 if (va->va_start < start)
652 *start = va->va_start; 638 start = va->va_start;
653 if (va->va_end > *end) 639 if (va->va_end > end)
654 *end = va->va_end; 640 end = va->va_end;
655 nr += (va->va_end - va->va_start) >> PAGE_SHIFT; 641 do_free = true;
656 } 642 }
657 643
658 if (nr) 644 if (!do_free)
659 atomic_sub(nr, &vmap_lazy_nr); 645 return false;
660 646
661 if (nr || force_flush) 647 flush_tlb_kernel_range(start, end);
662 flush_tlb_kernel_range(*start, *end);
663 648
664 if (nr) { 649 spin_lock(&vmap_area_lock);
665 spin_lock(&vmap_area_lock); 650 llist_for_each_entry_safe(va, n_va, valist, purge_list) {
666 llist_for_each_entry_safe(va, n_va, valist, purge_list) 651 int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
667 __free_vmap_area(va); 652
668 spin_unlock(&vmap_area_lock); 653 __free_vmap_area(va);
654 atomic_sub(nr, &vmap_lazy_nr);
655 cond_resched_lock(&vmap_area_lock);
669 } 656 }
670 spin_unlock(&purge_lock); 657 spin_unlock(&vmap_area_lock);
658 return true;
671} 659}
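The rewritten loop frees each area under vmap_area_lock but calls cond_resched_lock() between iterations, so a large backlog of lazily freed areas can no longer cause scheduling stalls; this is safe because the llist was detached with llist_del_all() and is private to the caller. A minimal sketch of the same pattern, with invented names:

#include <linux/llist.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_item {
	struct llist_node node;
};

static LLIST_HEAD(demo_pending);	/* filled locklessly by producers */
static DEFINE_SPINLOCK(demo_lock);	/* protects shared bookkeeping only */
static unsigned long demo_count;

/* Drain the pending llist; the spinlock covers just the shared counter. */
static void demo_drain(void)
{
	struct llist_node *head = llist_del_all(&demo_pending);
	struct demo_item *item, *tmp;

	spin_lock(&demo_lock);
	llist_for_each_entry_safe(item, tmp, head, node) {
		demo_count++;
		kfree(item);
		/*
		 * The detached llist is private to us, so it is safe to
		 * drop and retake demo_lock between iterations.
		 */
		cond_resched_lock(&demo_lock);
	}
	spin_unlock(&demo_lock);
}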
672 660
673/* 661/*
@@ -676,9 +664,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
676 */ 664 */
677static void try_purge_vmap_area_lazy(void) 665static void try_purge_vmap_area_lazy(void)
678{ 666{
679 unsigned long start = ULONG_MAX, end = 0; 667 if (mutex_trylock(&vmap_purge_lock)) {
680 668 __purge_vmap_area_lazy(ULONG_MAX, 0);
681 __purge_vmap_area_lazy(&start, &end, 0, 0); 669 mutex_unlock(&vmap_purge_lock);
670 }
682} 671}
683 672
684/* 673/*
@@ -686,9 +675,10 @@ static void try_purge_vmap_area_lazy(void)
686 */ 675 */
687static void purge_vmap_area_lazy(void) 676static void purge_vmap_area_lazy(void)
688{ 677{
689 unsigned long start = ULONG_MAX, end = 0; 678 mutex_lock(&vmap_purge_lock);
690 679 purge_fragmented_blocks_allcpus();
691 __purge_vmap_area_lazy(&start, &end, 1, 0); 680 __purge_vmap_area_lazy(ULONG_MAX, 0);
681 mutex_unlock(&vmap_purge_lock);
692} 682}
693 683
694/* 684/*
@@ -711,22 +701,13 @@ static void free_vmap_area_noflush(struct vmap_area *va)
711} 701}
712 702
713/* 703/*
714 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
715 * called for the correct range previously.
716 */
717static void free_unmap_vmap_area_noflush(struct vmap_area *va)
718{
719 unmap_vmap_area(va);
720 free_vmap_area_noflush(va);
721}
722
723/*
724 * Free and unmap a vmap area 704 * Free and unmap a vmap area
725 */ 705 */
726static void free_unmap_vmap_area(struct vmap_area *va) 706static void free_unmap_vmap_area(struct vmap_area *va)
727{ 707{
728 flush_cache_vunmap(va->va_start, va->va_end); 708 flush_cache_vunmap(va->va_start, va->va_end);
729 free_unmap_vmap_area_noflush(va); 709 unmap_vmap_area(va);
710 free_vmap_area_noflush(va);
730} 711}
731 712
732static struct vmap_area *find_vmap_area(unsigned long addr) 713static struct vmap_area *find_vmap_area(unsigned long addr)
@@ -740,16 +721,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr)
740 return va; 721 return va;
741} 722}
742 723
743static void free_unmap_vmap_area_addr(unsigned long addr)
744{
745 struct vmap_area *va;
746
747 va = find_vmap_area(addr);
748 BUG_ON(!va);
749 free_unmap_vmap_area(va);
750}
751
752
753/*** Per cpu kva allocator ***/ 724/*** Per cpu kva allocator ***/
754 725
755/* 726/*
@@ -1070,6 +1041,8 @@ void vm_unmap_aliases(void)
1070 if (unlikely(!vmap_initialized)) 1041 if (unlikely(!vmap_initialized))
1071 return; 1042 return;
1072 1043
1044 might_sleep();
1045
1073 for_each_possible_cpu(cpu) { 1046 for_each_possible_cpu(cpu) {
1074 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); 1047 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1075 struct vmap_block *vb; 1048 struct vmap_block *vb;
@@ -1094,7 +1067,11 @@ void vm_unmap_aliases(void)
1094 rcu_read_unlock(); 1067 rcu_read_unlock();
1095 } 1068 }
1096 1069
1097 __purge_vmap_area_lazy(&start, &end, 1, flush); 1070 mutex_lock(&vmap_purge_lock);
1071 purge_fragmented_blocks_allcpus();
1072 if (!__purge_vmap_area_lazy(start, end) && flush)
1073 flush_tlb_kernel_range(start, end);
1074 mutex_unlock(&vmap_purge_lock);
1098} 1075}
1099EXPORT_SYMBOL_GPL(vm_unmap_aliases); 1076EXPORT_SYMBOL_GPL(vm_unmap_aliases);
1100 1077
@@ -1107,7 +1084,9 @@ void vm_unmap_ram(const void *mem, unsigned int count)
1107{ 1084{
1108 unsigned long size = (unsigned long)count << PAGE_SHIFT; 1085 unsigned long size = (unsigned long)count << PAGE_SHIFT;
1109 unsigned long addr = (unsigned long)mem; 1086 unsigned long addr = (unsigned long)mem;
1087 struct vmap_area *va;
1110 1088
1089 might_sleep();
1111 BUG_ON(!addr); 1090 BUG_ON(!addr);
1112 BUG_ON(addr < VMALLOC_START); 1091 BUG_ON(addr < VMALLOC_START);
1113 BUG_ON(addr > VMALLOC_END); 1092 BUG_ON(addr > VMALLOC_END);
@@ -1116,10 +1095,14 @@ void vm_unmap_ram(const void *mem, unsigned int count)
1116 debug_check_no_locks_freed(mem, size); 1095 debug_check_no_locks_freed(mem, size);
1117 vmap_debug_free_range(addr, addr+size); 1096 vmap_debug_free_range(addr, addr+size);
1118 1097
1119 if (likely(count <= VMAP_MAX_ALLOC)) 1098 if (likely(count <= VMAP_MAX_ALLOC)) {
1120 vb_free(mem, size); 1099 vb_free(mem, size);
1121 else 1100 return;
1122 free_unmap_vmap_area_addr(addr); 1101 }
1102
1103 va = find_vmap_area(addr);
1104 BUG_ON(!va);
1105 free_unmap_vmap_area(va);
1123} 1106}
1124EXPORT_SYMBOL(vm_unmap_ram); 1107EXPORT_SYMBOL(vm_unmap_ram);
1125 1108
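vm_unmap_ram() is now annotated with might_sleep() and, for mappings larger than VMAP_MAX_ALLOC, looks up and frees the vmap_area directly. A rough usage sketch, assuming the vm_map_ram() prototype at this point in the tree (pages, count, node, prot) and a sleepable caller:

#include <linux/mm.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Map 'nr' pages temporarily, zero them, unmap again; may sleep. */
static int demo_zero_pages(struct page **pages, unsigned int nr)
{
	void *addr;

	addr = vm_map_ram(pages, nr, NUMA_NO_NODE, PAGE_KERNEL);
	if (!addr)
		return -ENOMEM;

	memset(addr, 0, (size_t)nr << PAGE_SHIFT);

	/* Now annotated with might_sleep(): no spinlocks held, IRQs on. */
	vm_unmap_ram(addr, nr);
	return 0;
}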
@@ -1455,6 +1438,8 @@ struct vm_struct *remove_vm_area(const void *addr)
1455{ 1438{
1456 struct vmap_area *va; 1439 struct vmap_area *va;
1457 1440
1441 might_sleep();
1442
1458 va = find_vmap_area((unsigned long)addr); 1443 va = find_vmap_area((unsigned long)addr);
1459 if (va && va->flags & VM_VM_AREA) { 1444 if (va && va->flags & VM_VM_AREA) {
1460 struct vm_struct *vm = va->vm; 1445 struct vm_struct *vm = va->vm;
@@ -1510,7 +1495,39 @@ static void __vunmap(const void *addr, int deallocate_pages)
1510 kfree(area); 1495 kfree(area);
1511 return; 1496 return;
1512} 1497}
1513 1498
1499static inline void __vfree_deferred(const void *addr)
1500{
1501 /*
1502 * Use raw_cpu_ptr() because this can be called from preemptible
1503 * context. Preemption is absolutely fine here, because the llist_add()
1504 * implementation is lockless, so it works even if we are adding to
1505 * another cpu's list. schedule_work() should be fine with this too.
1506 */
1507 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
1508
1509 if (llist_add((struct llist_node *)addr, &p->list))
1510 schedule_work(&p->wq);
1511}
1512
1513/**
1514 * vfree_atomic - release memory allocated by vmalloc()
1515 * @addr: memory base address
1516 *
1517 * This one is just like vfree() but can be called in any atomic context
1518 * except NMIs.
1519 */
1520void vfree_atomic(const void *addr)
1521{
1522 BUG_ON(in_nmi());
1523
1524 kmemleak_free(addr);
1525
1526 if (!addr)
1527 return;
1528 __vfree_deferred(addr);
1529}
1530
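vfree_atomic() is intended for callers that hold a spinlock or otherwise cannot sleep; it defers the real free to a workqueue via the per-CPU vfree_deferred list. A minimal sketch of the intended use, with a made-up structure and lock:

#include <linux/spinlock.h>
#include <linux/vmalloc.h>

struct demo_ctx {
	spinlock_t lock;
	void *buf;		/* allocated with vmalloc() */
};

/* Detach and free the buffer without leaving atomic context. */
static void demo_drop_buf(struct demo_ctx *ctx)
{
	void *buf;

	spin_lock(&ctx->lock);
	buf = ctx->buf;
	ctx->buf = NULL;
	/* vfree() might sleep here; vfree_atomic() defers the real work. */
	vfree_atomic(buf);
	spin_unlock(&ctx->lock);
}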
1514/** 1531/**
1515 * vfree - release memory allocated by vmalloc() 1532 * vfree - release memory allocated by vmalloc()
1516 * @addr: memory base address 1533 * @addr: memory base address
@@ -1533,11 +1550,9 @@ void vfree(const void *addr)
1533 1550
1534 if (!addr) 1551 if (!addr)
1535 return; 1552 return;
1536 if (unlikely(in_interrupt())) { 1553 if (unlikely(in_interrupt()))
1537 struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); 1554 __vfree_deferred(addr);
1538 if (llist_add((struct llist_node *)addr, &p->list)) 1555 else
1539 schedule_work(&p->wq);
1540 } else
1541 __vunmap(addr, 1); 1556 __vunmap(addr, 1);
1542} 1557}
1543EXPORT_SYMBOL(vfree); 1558EXPORT_SYMBOL(vfree);
@@ -2574,32 +2589,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2574static void *s_start(struct seq_file *m, loff_t *pos) 2589static void *s_start(struct seq_file *m, loff_t *pos)
2575 __acquires(&vmap_area_lock) 2590 __acquires(&vmap_area_lock)
2576{ 2591{
2577 loff_t n = *pos;
2578 struct vmap_area *va;
2579
2580 spin_lock(&vmap_area_lock); 2592 spin_lock(&vmap_area_lock);
2581 va = list_first_entry(&vmap_area_list, typeof(*va), list); 2593 return seq_list_start(&vmap_area_list, *pos);
2582 while (n > 0 && &va->list != &vmap_area_list) {
2583 n--;
2584 va = list_next_entry(va, list);
2585 }
2586 if (!n && &va->list != &vmap_area_list)
2587 return va;
2588
2589 return NULL;
2590
2591} 2594}
2592 2595
2593static void *s_next(struct seq_file *m, void *p, loff_t *pos) 2596static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2594{ 2597{
2595 struct vmap_area *va = p, *next; 2598 return seq_list_next(p, &vmap_area_list, pos);
2596
2597 ++*pos;
2598 next = list_next_entry(va, list);
2599 if (&next->list != &vmap_area_list)
2600 return next;
2601
2602 return NULL;
2603} 2599}
2604 2600
2605static void s_stop(struct seq_file *m, void *p) 2601static void s_stop(struct seq_file *m, void *p)
@@ -2634,9 +2630,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2634 2630
2635static int s_show(struct seq_file *m, void *p) 2631static int s_show(struct seq_file *m, void *p)
2636{ 2632{
2637 struct vmap_area *va = p; 2633 struct vmap_area *va;
2638 struct vm_struct *v; 2634 struct vm_struct *v;
2639 2635
2636 va = list_entry(p, struct vmap_area, list);
2637
2640 /* 2638 /*
2641 * s_show can encounter race with remove_vm_area, !VM_VM_AREA on 2639 * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
2642 * behalf of a vmap area being torn down or a vm_map_ram allocation. 2640
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0c8f28a6d89f..6aa5b01d3e75 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
291 int nid = shrinkctl->nid; 291 int nid = shrinkctl->nid;
292 long batch_size = shrinker->batch ? shrinker->batch 292 long batch_size = shrinker->batch ? shrinker->batch
293 : SHRINK_BATCH; 293 : SHRINK_BATCH;
294 long scanned = 0, next_deferred;
294 295
295 freeable = shrinker->count_objects(shrinker, shrinkctl); 296 freeable = shrinker->count_objects(shrinker, shrinkctl);
296 if (freeable == 0) 297 if (freeable == 0)
@@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
312 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", 313 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
313 shrinker->scan_objects, total_scan); 314 shrinker->scan_objects, total_scan);
314 total_scan = freeable; 315 total_scan = freeable;
315 } 316 next_deferred = nr;
317 } else
318 next_deferred = total_scan;
316 319
317 /* 320 /*
318 * We need to avoid excessive windup on filesystem shrinkers 321 * We need to avoid excessive windup on filesystem shrinkers
@@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
369 372
370 count_vm_events(SLABS_SCANNED, nr_to_scan); 373 count_vm_events(SLABS_SCANNED, nr_to_scan);
371 total_scan -= nr_to_scan; 374 total_scan -= nr_to_scan;
375 scanned += nr_to_scan;
372 376
373 cond_resched(); 377 cond_resched();
374 } 378 }
375 379
380 if (next_deferred >= scanned)
381 next_deferred -= scanned;
382 else
383 next_deferred = 0;
376 /* 384 /*
377 * move the unused scan count back into the shrinker in a 385 * move the unused scan count back into the shrinker in a
378 * manner that handles concurrent updates. If we exhausted the 386 * manner that handles concurrent updates. If we exhausted the
379 * scan, there is no need to do an update. 387 * scan, there is no need to do an update.
380 */ 388 */
381 if (total_scan > 0) 389 if (next_deferred > 0)
382 new_nr = atomic_long_add_return(total_scan, 390 new_nr = atomic_long_add_return(next_deferred,
383 &shrinker->nr_deferred[nid]); 391 &shrinker->nr_deferred[nid]);
384 else 392 else
385 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); 393 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
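The effect is that work actually performed in this call is subtracted before anything is pushed back into nr_deferred, so shrinkers that bail out early no longer inflate the deferred count without bound. A tiny sketch of the normal (non-clamped) path with made-up numbers:

/* Normal path of the new accounting; the values are invented. */
static long demo_next_deferred(void)
{
	long total_scan = 640;		/* previously deferred + this call's share */
	long scanned = 384;		/* what the scan loop actually got through */
	long next_deferred = total_scan;

	if (next_deferred >= scanned)
		next_deferred -= scanned;
	else
		next_deferred = 0;

	/* 256: only the work that was not done gets re-deferred. */
	return next_deferred;
}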
diff --git a/mm/workingset.c b/mm/workingset.c
index fb1f9183d89a..241fa5d6b3b2 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -10,6 +10,7 @@
10#include <linux/atomic.h> 10#include <linux/atomic.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/swap.h> 12#include <linux/swap.h>
13#include <linux/dax.h>
13#include <linux/fs.h> 14#include <linux/fs.h>
14#include <linux/mm.h> 15#include <linux/mm.h>
15 16
@@ -334,48 +335,81 @@ out:
334 * point where they would still be useful. 335 * point where they would still be useful.
335 */ 336 */
336 337
337struct list_lru workingset_shadow_nodes; 338static struct list_lru shadow_nodes;
339
340void workingset_update_node(struct radix_tree_node *node, void *private)
341{
342 struct address_space *mapping = private;
343
344 /* Only regular page cache has shadow entries */
345 if (dax_mapping(mapping) || shmem_mapping(mapping))
346 return;
347
348 /*
349 * Track non-empty nodes that contain only shadow entries;
350 * unlink those that contain pages or are being freed.
351 *
352 * Avoid acquiring the list_lru lock when the nodes are
353 * already where they should be. The list_empty() test is safe
354 * as node->private_list is protected by &mapping->tree_lock.
355 */
356 if (node->count && node->count == node->exceptional) {
357 if (list_empty(&node->private_list)) {
358 node->private_data = mapping;
359 list_lru_add(&shadow_nodes, &node->private_list);
360 }
361 } else {
362 if (!list_empty(&node->private_list))
363 list_lru_del(&shadow_nodes, &node->private_list);
364 }
365}
338 366
339static unsigned long count_shadow_nodes(struct shrinker *shrinker, 367static unsigned long count_shadow_nodes(struct shrinker *shrinker,
340 struct shrink_control *sc) 368 struct shrink_control *sc)
341{ 369{
342 unsigned long shadow_nodes;
343 unsigned long max_nodes; 370 unsigned long max_nodes;
344 unsigned long pages; 371 unsigned long nodes;
372 unsigned long cache;
345 373
346 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 374 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
347 local_irq_disable(); 375 local_irq_disable();
348 shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); 376 nodes = list_lru_shrink_count(&shadow_nodes, sc);
349 local_irq_enable(); 377 local_irq_enable();
350 378
351 if (sc->memcg) {
352 pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
353 LRU_ALL_FILE);
354 } else {
355 pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
356 node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
357 }
358
359 /* 379 /*
360 * Active cache pages are limited to 50% of memory, and shadow 380 * Approximate a reasonable limit for the radix tree nodes
361 * entries that represent a refault distance bigger than that 381 * containing shadow entries. We don't need to keep more
362 * do not have any effect. Limit the number of shadow nodes 382 * shadow entries than possible pages on the active list,
363 * such that shadow entries do not exceed the number of active 383 * since refault distances bigger than that are dismissed.
364 * cache pages, assuming a worst-case node population density 384 *
365 * of 1/8th on average. 385 * The size of the active list converges toward 100% of
386 * overall page cache as memory grows, with only a tiny
387 * inactive list. Assume the total cache size for that.
388 *
389 * Nodes might be sparsely populated, with only one shadow
390 * entry in the extreme case. Obviously, we cannot keep one
391 * node for every eligible shadow entry, so compromise on a
392 * worst-case density of 1/8th. Below that, not all eligible
393 * refaults can be detected anymore.
366 * 394 *
367 * On 64-bit with 7 radix_tree_nodes per page and 64 slots 395 * On 64-bit with 7 radix_tree_nodes per page and 64 slots
368 * each, this will reclaim shadow entries when they consume 396 * each, this will reclaim shadow entries when they consume
369 * ~2% of available memory: 397 * ~1.8% of available memory:
370 * 398 *
371 * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE 399 * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
372 */ 400 */
373 max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); 401 if (sc->memcg) {
402 cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
403 LRU_ALL_FILE);
404 } else {
405 cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
406 node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
407 }
408 max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
374 409
375 if (shadow_nodes <= max_nodes) 410 if (nodes <= max_nodes)
376 return 0; 411 return 0;
377 412 return nodes - max_nodes;
378 return shadow_nodes - max_nodes;
379} 413}
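Plugging the 64-bit constants into the new limit shows where the ~1.8% figure comes from: RADIX_TREE_MAP_SHIFT is 6, so max_nodes is cache/8, and with roughly 7 radix_tree_nodes per page that is about cache/56 pages of node memory. A small sketch of the arithmetic, constants hard-coded for illustration only:

/* Worked example of the limit above; values are only for scale. */
static unsigned long demo_shadow_node_pages(unsigned long cache_pages)
{
	/* RADIX_TREE_MAP_SHIFT == 6 on 64-bit: 64 slots per node. */
	unsigned long max_nodes = cache_pages >> (6 - 3);	/* 1/8 density */

	/* ~7 radix_tree_nodes per page => cache/56, i.e. ~1.8% of the cache. */
	return max_nodes / 7;
}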
380 414
381static enum lru_status shadow_lru_isolate(struct list_head *item, 415static enum lru_status shadow_lru_isolate(struct list_head *item,
@@ -418,23 +452,30 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
418 * no pages, so we expect to be able to remove them all and 452 * no pages, so we expect to be able to remove them all and
419 * delete and free the empty node afterwards. 453 * delete and free the empty node afterwards.
420 */ 454 */
421 BUG_ON(!workingset_node_shadows(node)); 455 if (WARN_ON_ONCE(!node->exceptional))
422 BUG_ON(workingset_node_pages(node)); 456 goto out_invalid;
423 457 if (WARN_ON_ONCE(node->count != node->exceptional))
458 goto out_invalid;
424 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { 459 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
425 if (node->slots[i]) { 460 if (node->slots[i]) {
426 BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); 461 if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
462 goto out_invalid;
463 if (WARN_ON_ONCE(!node->exceptional))
464 goto out_invalid;
465 if (WARN_ON_ONCE(!mapping->nrexceptional))
466 goto out_invalid;
427 node->slots[i] = NULL; 467 node->slots[i] = NULL;
428 workingset_node_shadows_dec(node); 468 node->exceptional--;
429 BUG_ON(!mapping->nrexceptional); 469 node->count--;
430 mapping->nrexceptional--; 470 mapping->nrexceptional--;
431 } 471 }
432 } 472 }
433 BUG_ON(workingset_node_shadows(node)); 473 if (WARN_ON_ONCE(node->exceptional))
474 goto out_invalid;
434 inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); 475 inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
435 if (!__radix_tree_delete_node(&mapping->page_tree, node)) 476 __radix_tree_delete_node(&mapping->page_tree, node);
436 BUG();
437 477
478out_invalid:
438 spin_unlock(&mapping->tree_lock); 479 spin_unlock(&mapping->tree_lock);
439 ret = LRU_REMOVED_RETRY; 480 ret = LRU_REMOVED_RETRY;
440out: 481out:
@@ -452,8 +493,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
452 493
453 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 494 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
454 local_irq_disable(); 495 local_irq_disable();
455 ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, 496 ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
456 shadow_lru_isolate, NULL);
457 local_irq_enable(); 497 local_irq_enable();
458 return ret; 498 return ret;
459} 499}
@@ -492,7 +532,7 @@ static int __init workingset_init(void)
492 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", 532 pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
493 timestamp_bits, max_order, bucket_order); 533 timestamp_bits, max_order, bucket_order);
494 534
495 ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); 535 ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key);
496 if (ret) 536 if (ret)
497 goto err; 537 goto err;
498 ret = register_shrinker(&workingset_shadow_shrinker); 538 ret = register_shrinker(&workingset_shadow_shrinker);
@@ -500,7 +540,7 @@ static int __init workingset_init(void)
500 goto err_list_lru; 540 goto err_list_lru;
501 return 0; 541 return 0;
502err_list_lru: 542err_list_lru:
503 list_lru_destroy(&workingset_shadow_nodes); 543 list_lru_destroy(&shadow_nodes);
504err: 544err:
505 return ret; 545 return ret;
506} 546}
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan
index 37323b0df374..9576775a86f6 100644
--- a/scripts/Makefile.kasan
+++ b/scripts/Makefile.kasan
@@ -28,4 +28,6 @@ else
28 CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL) 28 CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL)
29 endif 29 endif
30endif 30endif
31
32CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope)
31endif 33endif
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index d9ff038c1b28..a27677146410 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -16,19 +16,22 @@ if len(sys.argv) != 3:
16 sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0]) 16 sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0])
17 sys.exit(-1) 17 sys.exit(-1)
18 18
19re_NUMBER = re.compile(r'\.[0-9]+')
20
19def getsizes(file): 21def getsizes(file):
20 sym = {} 22 sym = {}
21 for l in os.popen("nm --size-sort " + file).readlines(): 23 with os.popen("nm --size-sort " + file) as f:
22 size, type, name = l[:-1].split() 24 for line in f:
23 if type in "tTdDbBrR": 25 size, type, name = line.split()
24 # strip generated symbols 26 if type in "tTdDbBrR":
25 if name.startswith("__mod_"): continue 27 # strip generated symbols
26 if name.startswith("SyS_"): continue 28 if name.startswith("__mod_"): continue
27 if name.startswith("compat_SyS_"): continue 29 if name.startswith("SyS_"): continue
28 if name == "linux_banner": continue 30 if name.startswith("compat_SyS_"): continue
29 # statics and some other optimizations adds random .NUMBER 31 if name == "linux_banner": continue
30 name = re.sub(r'\.[0-9]+', '', name) 32 # statics and some other optimizations add random .NUMBER
31 sym[name] = sym.get(name, 0) + int(size, 16) 33 name = re_NUMBER.sub('', name)
34 sym[name] = sym.get(name, 0) + int(size, 16)
32 return sym 35 return sym
33 36
34old = getsizes(sys.argv[1]) 37old = getsizes(sys.argv[1])
diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 23f462f64a3f..ac5656ef2aec 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -761,7 +761,7 @@ sub seed_camelcase_file {
761sub is_maintained_obsolete { 761sub is_maintained_obsolete {
762 my ($filename) = @_; 762 my ($filename) = @_;
763 763
764 return 0 if (!(-e "$root/scripts/get_maintainer.pl")); 764 return 0 if (!$tree || !(-e "$root/scripts/get_maintainer.pl"));
765 765
766 my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback -f $filename 2>&1`; 766 my $status = `perl $root/scripts/get_maintainer.pl --status --nom --nol --nogit --nogit-fallback -f $filename 2>&1`;
767 767
@@ -2589,6 +2589,7 @@ sub process {
2589 $line =~ /^rename (?:from|to) [\w\/\.\-]+\s*$/ || 2589 $line =~ /^rename (?:from|to) [\w\/\.\-]+\s*$/ ||
2590 ($line =~ /\{\s*([\w\/\.\-]*)\s*\=\>\s*([\w\/\.\-]*)\s*\}/ && 2590 ($line =~ /\{\s*([\w\/\.\-]*)\s*\=\>\s*([\w\/\.\-]*)\s*\}/ &&
2591 (defined($1) || defined($2))))) { 2591 (defined($1) || defined($2))))) {
2592 $is_patch = 1;
2592 $reported_maintainer_file = 1; 2593 $reported_maintainer_file = 1;
2593 WARN("FILE_PATH_CHANGES", 2594 WARN("FILE_PATH_CHANGES",
2594 "added, moved or deleted file(s), does MAINTAINERS need updating?\n" . $herecurr); 2595 "added, moved or deleted file(s), does MAINTAINERS need updating?\n" . $herecurr);
@@ -2601,20 +2602,6 @@ sub process {
2601 $herecurr) if (!$emitted_corrupt++); 2602 $herecurr) if (!$emitted_corrupt++);
2602 } 2603 }
2603 2604
2604# Check for absolute kernel paths.
2605 if ($tree) {
2606 while ($line =~ m{(?:^|\s)(/\S*)}g) {
2607 my $file = $1;
2608
2609 if ($file =~ m{^(.*?)(?::\d+)+:?$} &&
2610 check_absolute_file($1, $herecurr)) {
2611 #
2612 } else {
2613 check_absolute_file($file, $herecurr);
2614 }
2615 }
2616 }
2617
2618# UTF-8 regex found at http://www.w3.org/International/questions/qa-forms-utf-8.en.php 2605# UTF-8 regex found at http://www.w3.org/International/questions/qa-forms-utf-8.en.php
2619 if (($realfile =~ /^$/ || $line =~ /^\+/) && 2606 if (($realfile =~ /^$/ || $line =~ /^\+/) &&
2620 $rawline !~ m/^$UTF8*$/) { 2607 $rawline !~ m/^$UTF8*$/) {
@@ -2652,6 +2639,20 @@ sub process {
2652 "8-bit UTF-8 used in possible commit log\n" . $herecurr); 2639 "8-bit UTF-8 used in possible commit log\n" . $herecurr);
2653 } 2640 }
2654 2641
2642# Check for absolute kernel paths in commit message
2643 if ($tree && $in_commit_log) {
2644 while ($line =~ m{(?:^|\s)(/\S*)}g) {
2645 my $file = $1;
2646
2647 if ($file =~ m{^(.*?)(?::\d+)+:?$} &&
2648 check_absolute_file($1, $herecurr)) {
2649 #
2650 } else {
2651 check_absolute_file($file, $herecurr);
2652 }
2653 }
2654 }
2655
2655# Check for various typo / spelling mistakes 2656# Check for various typo / spelling mistakes
2656 if (defined($misspellings) && 2657 if (defined($misspellings) &&
2657 ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) { 2658 ($in_commit_log || $line =~ /^(?:\+|Subject:)/i)) {
@@ -2805,7 +2806,7 @@ sub process {
2805 } 2806 }
2806 2807
2807# check we are in a valid source file if not then ignore this hunk 2808# check we are in a valid source file if not then ignore this hunk
2808 next if ($realfile !~ /\.(h|c|s|S|pl|sh|dtsi|dts)$/); 2809 next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/);
2809 2810
2810# line length limit (with some exclusions) 2811# line length limit (with some exclusions)
2811# 2812#
@@ -3440,6 +3441,18 @@ sub process {
3440#ignore lines not being added 3441#ignore lines not being added
3441 next if ($line =~ /^[^\+]/); 3442 next if ($line =~ /^[^\+]/);
3442 3443
3444# check for dereferences that span multiple lines
3445 if ($prevline =~ /^\+.*$Lval\s*(?:\.|->)\s*$/ &&
3446 $line =~ /^\+\s*(?!\#\s*(?!define\s+|if))\s*$Lval/) {
3447 $prevline =~ /($Lval\s*(?:\.|->))\s*$/;
3448 my $ref = $1;
3449 $line =~ /^.\s*($Lval)/;
3450 $ref .= $1;
3451 $ref =~ s/\s//g;
3452 WARN("MULTILINE_DEREFERENCE",
3453 "Avoid multiple line dereference - prefer '$ref'\n" . $hereprev);
3454 }
3455
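The new test fires when a dereference chain is split across lines, typically to dodge the 80-column limit. A made-up fragment showing what is now flagged and the form checkpatch suggests instead:

#include <linux/netdevice.h>

/* Illustration only; the function name is made up. */
static unsigned long demo_rx_dropped(struct net_device *dev)
{
	unsigned long val;

	/* Flagged: "Avoid multiple line dereference - prefer 'dev->stats.rx_dropped'" */
	val = dev->stats.
		rx_dropped;

	/* Accepted: keep the whole dereference on one line. */
	val = dev->stats.rx_dropped;

	return val;
}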
3443# check for declarations of signed or unsigned without int 3456# check for declarations of signed or unsigned without int
3444 while ($line =~ m{\b($Declare)\s*(?!char\b|short\b|int\b|long\b)\s*($Ident)?\s*[=,;\[\)\(]}g) { 3457 while ($line =~ m{\b($Declare)\s*(?!char\b|short\b|int\b|long\b)\s*($Ident)?\s*[=,;\[\)\(]}g) {
3445 my $type = $1; 3458 my $type = $1;
@@ -5548,8 +5561,9 @@ sub process {
5548 "Using weak declarations can have unintended link defects\n" . $herecurr); 5561 "Using weak declarations can have unintended link defects\n" . $herecurr);
5549 } 5562 }
5550 5563
5551# check for c99 types like uint8_t used outside of uapi/ 5564# check for c99 types like uint8_t used outside of uapi/ and tools/
5552 if ($realfile !~ m@\binclude/uapi/@ && 5565 if ($realfile !~ m@\binclude/uapi/@ &&
5566 $realfile !~ m@\btools/@ &&
5553 $line =~ /\b($Declare)\s*$Ident\s*[=;,\[]/) { 5567 $line =~ /\b($Declare)\s*$Ident\s*[=;,\[]/) {
5554 my $type = $1; 5568 my $type = $1;
5555 if ($type =~ /\b($typeC99Typedefs)\b/) { 5569 if ($type =~ /\b($typeC99Typedefs)\b/) {
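With tools/ now excluded, the check applies only to regular kernel sources, where the kernel's own fixed-width types are preferred over the C99 spellings. A made-up illustration of what is accepted versus what gets flagged:

#include <linux/types.h>

/* Outside include/uapi/ and tools/, checkpatch prefers the kernel types... */
struct demo_hdr {
	u8 type;
	u16 flags;
	u32 len;
};

/* ...and warns about the C99 spellings, e.g.: */
struct demo_hdr_c99 {
	uint8_t type;
	uint16_t flags;
	uint32_t len;
};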
@@ -5925,7 +5939,7 @@ sub process {
5925 } 5939 }
5926 if (!$has_break && $has_statement) { 5940 if (!$has_break && $has_statement) {
5927 WARN("MISSING_BREAK", 5941 WARN("MISSING_BREAK",
5928 "Possible switch case/default not preceeded by break or fallthrough comment\n" . $herecurr); 5942 "Possible switch case/default not preceded by break or fallthrough comment\n" . $herecurr);
5929 } 5943 }
5930 } 5944 }
5931 5945
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index aed4511f0304..633f2dd3de27 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -49,6 +49,7 @@ my $scm = 0;
49my $web = 0; 49my $web = 0;
50my $subsystem = 0; 50my $subsystem = 0;
51my $status = 0; 51my $status = 0;
52my $letters = "";
52my $keywords = 1; 53my $keywords = 1;
53my $sections = 0; 54my $sections = 0;
54my $file_emails = 0; 55my $file_emails = 0;
@@ -241,6 +242,7 @@ if (!GetOptions(
241 'status!' => \$status, 242 'status!' => \$status,
242 'scm!' => \$scm, 243 'scm!' => \$scm,
243 'web!' => \$web, 244 'web!' => \$web,
245 'letters=s' => \$letters,
244 'pattern-depth=i' => \$pattern_depth, 246 'pattern-depth=i' => \$pattern_depth,
245 'k|keywords!' => \$keywords, 247 'k|keywords!' => \$keywords,
246 'sections!' => \$sections, 248 'sections!' => \$sections,
@@ -271,7 +273,8 @@ $output_multiline = 0 if ($output_separator ne ", ");
271$output_rolestats = 1 if ($interactive); 273$output_rolestats = 1 if ($interactive);
272$output_roles = 1 if ($output_rolestats); 274$output_roles = 1 if ($output_rolestats);
273 275
274if ($sections) { 276if ($sections || $letters ne "") {
277 $sections = 1;
275 $email = 0; 278 $email = 0;
276 $email_list = 0; 279 $email_list = 0;
277 $scm = 0; 280 $scm = 0;
@@ -682,8 +685,10 @@ sub get_maintainers {
682 $line =~ s/\\\./\./g; ##Convert \. to . 685 $line =~ s/\\\./\./g; ##Convert \. to .
683 $line =~ s/\.\*/\*/g; ##Convert .* to * 686 $line =~ s/\.\*/\*/g; ##Convert .* to *
684 } 687 }
685 $line =~ s/^([A-Z]):/$1:\t/g; 688 my $count = $line =~ s/^([A-Z]):/$1:\t/g;
686 print("$line\n"); 689 if ($letters eq "" || (!$count || $letters =~ /$1/i)) {
690 print("$line\n");
691 }
687 } 692 }
688 print("\n"); 693 print("\n");
689 } 694 }
@@ -814,6 +819,7 @@ Other options:
814 --pattern-depth => Number of pattern directory traversals (default: 0 (all)) 819 --pattern-depth => Number of pattern directory traversals (default: 0 (all))
815 --keywords => scan patch for keywords (default: $keywords) 820 --keywords => scan patch for keywords (default: $keywords)
816 --sections => print all of the subsystem sections with pattern matches 821 --sections => print all of the subsystem sections with pattern matches
822 --letters => print all matching 'letter' types from all matching sections
817 --mailmap => use .mailmap file (default: $email_use_mailmap) 823 --mailmap => use .mailmap file (default: $email_use_mailmap)
818 --version => show version 824 --version => show version
819 --help => show this help information 825 --help => show this help information
diff --git a/scripts/tags.sh b/scripts/tags.sh
index a2ff3388e5ea..df5fa777d300 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -304,11 +304,26 @@ if [ "${ARCH}" = "um" ]; then
304elif [ "${SRCARCH}" = "arm" -a "${SUBARCH}" != "" ]; then 304elif [ "${SRCARCH}" = "arm" -a "${SUBARCH}" != "" ]; then
305 subarchdir=$(find ${tree}arch/$SRCARCH/ -name "mach-*" -type d -o \ 305 subarchdir=$(find ${tree}arch/$SRCARCH/ -name "mach-*" -type d -o \
306 -name "plat-*" -type d); 306 -name "plat-*" -type d);
307 mach_suffix=$SUBARCH
308 plat_suffix=$SUBARCH
309
310 # Special cases when $plat_suffix != $mach_suffix
311 case $mach_suffix in
312 "omap1" | "omap2")
313 plat_suffix="omap"
314 ;;
315 esac
316
317 if [ ! -d ${tree}arch/$SRCARCH/mach-$mach_suffix ]; then
318 echo "Warning: arch/arm/mach-$mach_suffix/ not found." >&2
319 echo " Fix your \$SUBARCH appropriately" >&2
320 fi
321
307 for i in $subarchdir; do 322 for i in $subarchdir; do
308 case "$i" in 323 case "$i" in
309 *"mach-"${SUBARCH}) 324 *"mach-"${mach_suffix})
310 ;; 325 ;;
311 *"plat-"${SUBARCH}) 326 *"plat-"${plat_suffix})
312 ;; 327 ;;
313 *) 328 *)
314 subarchprune="$subarchprune \ 329 subarchprune="$subarchprune \
diff --git a/sound/core/misc.c b/sound/core/misc.c
index f2e8226c88fb..21b228046e88 100644
--- a/sound/core/misc.c
+++ b/sound/core/misc.c
@@ -71,6 +71,7 @@ void __snd_printk(unsigned int level, const char *path, int line,
71 int kern_level; 71 int kern_level;
72 struct va_format vaf; 72 struct va_format vaf;
73 char verbose_fmt[] = KERN_DEFAULT "ALSA %s:%d %pV"; 73 char verbose_fmt[] = KERN_DEFAULT "ALSA %s:%d %pV";
74 bool level_found = false;
74#endif 75#endif
75 76
76#ifdef CONFIG_SND_DEBUG 77#ifdef CONFIG_SND_DEBUG
@@ -83,15 +84,22 @@ void __snd_printk(unsigned int level, const char *path, int line,
83 vaf.fmt = format; 84 vaf.fmt = format;
84 vaf.va = &args; 85 vaf.va = &args;
85 86
86 kern_level = printk_get_level(format); 87 while ((kern_level = printk_get_level(vaf.fmt)) != 0) {
87 if (kern_level) { 88 const char *end_of_header = printk_skip_level(vaf.fmt);
88 const char *end_of_header = printk_skip_level(format); 89
89 memcpy(verbose_fmt, format, end_of_header - format); 90 /* Ignore KERN_CONT. We print filename:line for each piece. */
91 if (kern_level >= '0' && kern_level <= '7') {
92 memcpy(verbose_fmt, vaf.fmt, end_of_header - vaf.fmt);
93 level_found = true;
94 }
95
90 vaf.fmt = end_of_header; 96 vaf.fmt = end_of_header;
91 } else if (level) 97 }
98
99 if (!level_found && level)
92 memcpy(verbose_fmt, KERN_DEBUG, sizeof(KERN_DEBUG) - 1); 100 memcpy(verbose_fmt, KERN_DEBUG, sizeof(KERN_DEBUG) - 1);
93 printk(verbose_fmt, sanity_file_name(path), line, &vaf);
94 101
102 printk(verbose_fmt, sanity_file_name(path), line, &vaf);
95#else 103#else
96 vprintk(format, args); 104 vprintk(format, args);
97#endif 105#endif
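The loop now consumes every printk level prefix at the front of the format, keeping only the last real severity and skipping KERN_CONT, so callers that stack headers still print at the intended level. A stand-alone sketch of that stripping step, using the stock printk_get_level()/printk_skip_level() helpers:

#include <linux/printk.h>

/*
 * Strip every leading level header from 'fmt', remembering the last real
 * level ('0'..'7') and ignoring KERN_CONT ('c'), as the loop above does.
 */
static const char *demo_skip_levels(const char *fmt, int *level)
{
	int kern_level;

	*level = 0;
	while ((kern_level = printk_get_level(fmt)) != 0) {
		if (kern_level >= '0' && kern_level <= '7')
			*level = kern_level;
		fmt = printk_skip_level(fmt);
	}
	return fmt;
}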
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index 05d7bc488971..d1be94667a30 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order)
146 146
147 slot = radix_tree_lookup_slot(&tree, index); 147 slot = radix_tree_lookup_slot(&tree, index);
148 free(*slot); 148 free(*slot);
149 radix_tree_replace_slot(slot, item2); 149 radix_tree_replace_slot(&tree, slot, item2);
150 for (i = min; i < max; i++) { 150 for (i = min; i < max; i++) {
151 struct item *item = item_lookup(&tree, i); 151 struct item *item = item_lookup(&tree, i);
152 assert(item != 0); 152 assert(item != 0);