author	Linus Torvalds <torvalds@linux-foundation.org>	2016-12-14 20:25:18 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-12-14 20:25:18 -0500
commit	a57cb1c1d7974c62a5c80f7869e35b492ace12cd (patch)
tree	5a42ee9a668f171143464bc86013954c1bbe94ad
parent	cf1b3341afab9d3ad02a76b3a619ea027dcf4e28 (diff)
parent	e1e14ab8411df344a17687821f8f78f0a1e73cbb (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:

 - a few misc things

 - kexec updates

 - DMA-mapping updates to better support networking DMA operations

 - IPC updates

 - various MM changes to improve DAX fault handling

 - lots of radix-tree changes, mainly to the test suite. All leading up
   to reimplementing the IDA/IDR code to be a wrapper layer over the
   radix-tree. However the final trigger-pulling patch is held off for
   4.11.

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits)
  radix tree test suite: delete unused rcupdate.c
  radix tree test suite: add new tag check
  radix-tree: ensure counts are initialised
  radix tree test suite: cache recently freed objects
  radix tree test suite: add some more functionality
  idr: reduce the number of bits per level from 8 to 6
  rxrpc: abstract away knowledge of IDR internals
  tpm: use idr_find(), not idr_find_slowpath()
  idr: add ida_is_empty
  radix tree test suite: check multiorder iteration
  radix-tree: fix replacement for multiorder entries
  radix-tree: add radix_tree_split_preload()
  radix-tree: add radix_tree_split
  radix-tree: add radix_tree_join
  radix-tree: delete radix_tree_range_tag_if_tagged()
  radix-tree: delete radix_tree_locate_item()
  radix-tree: improve multiorder iterators
  btrfs: fix race in btrfs_free_dummy_fs_info()
  radix-tree: improve dump output
  radix-tree: make radix_tree_find_next_bit more useful
  ...
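[Editor's note] The DMA-mapping updates in this pile thread a DMA_ATTR_SKIP_CPU_SYNC check through every architecture's dma_map_ops, so callers that manage their own cache maintenance (notably network drivers recycling receive buffers) can skip the sync on map/unmap. The per-architecture hunks below all follow the same shape; the following is a minimal, illustrative sketch of that pattern only, with placeholder names (sketch_dma_map_page(), arch_cache_sync()) that are not taken from any file in this merge:

/*
 * Illustrative sketch, not code from this merge.  arch_cache_sync() and
 * sketch_dma_map_page() are placeholder names; the attrs test mirrors
 * the per-architecture hunks below.
 */
#include <linux/dma-mapping.h>
#include <linux/mm.h>

static void arch_cache_sync(phys_addr_t paddr, size_t size,
			    enum dma_data_direction dir)
{
	/* architecture-specific cache writeback/invalidate would go here */
}

static dma_addr_t sketch_dma_map_page(struct device *dev, struct page *page,
				      unsigned long offset, size_t size,
				      enum dma_data_direction dir,
				      unsigned long attrs)
{
	phys_addr_t paddr = page_to_phys(page) + offset;

	/*
	 * Callers that resync a buffer themselves before reuse (e.g. a
	 * network driver recycling Rx pages) pass DMA_ATTR_SKIP_CPU_SYNC
	 * so the map/unmap paths skip the cache maintenance.
	 */
	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		arch_cache_sync(paddr, size, dir);

	return (dma_addr_t)paddr;
}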
-rw-r--r--	Documentation/filesystems/Locking	2
-rw-r--r--	arch/arc/mm/dma.c	5
-rw-r--r--	arch/arm/common/dmabounce.c	16
-rw-r--r--	arch/avr32/mm/dma-coherent.c	7
-rw-r--r--	arch/blackfin/kernel/dma-mapping.c	8
-rw-r--r--	arch/c6x/kernel/dma.c	14
-rw-r--r--	arch/frv/mb93090-mb00/pci-dma-nommu.c	14
-rw-r--r--	arch/frv/mb93090-mb00/pci-dma.c	9
-rw-r--r--	arch/hexagon/kernel/dma.c	6
-rw-r--r--	arch/m68k/kernel/dma.c	8
-rw-r--r--	arch/metag/kernel/dma.c	16
-rw-r--r--	arch/microblaze/kernel/dma.c	10
-rw-r--r--	arch/mips/loongson64/common/dma-swiotlb.c	2
-rw-r--r--	arch/mips/mm/dma-default.c	8
-rw-r--r--	arch/nios2/mm/dma-mapping.c	26
-rw-r--r--	arch/openrisc/kernel/dma.c	3
-rw-r--r--	arch/parisc/kernel/pci-dma.c	20
-rw-r--r--	arch/powerpc/kernel/dma.c	9
-rw-r--r--	arch/powerpc/platforms/cell/spufs/file.c	8
-rw-r--r--	arch/sh/kernel/dma-nommu.c	7
-rw-r--r--	arch/sparc/kernel/iommu.c	4
-rw-r--r--	arch/sparc/kernel/ioport.c	4
-rw-r--r--	arch/sparc/kernel/nmi.c	44
-rw-r--r--	arch/tile/kernel/pci-dma.c	12
-rw-r--r--	arch/x86/entry/vdso/vma.c	4
-rw-r--r--	arch/x86/kernel/machine_kexec_64.c	6
-rw-r--r--	arch/xtensa/kernel/pci-dma.c	7
-rw-r--r--	drivers/char/agp/alpha-agp.c	3
-rw-r--r--	drivers/char/mspec.c	2
-rw-r--r--	drivers/char/tpm/tpm-chip.c	4
-rw-r--r--	drivers/dax/dax.c	3
-rw-r--r--	drivers/gpu/drm/armada/armada_gem.c	5
-rw-r--r--	drivers/gpu/drm/drm_vm.c	10
-rw-r--r--	drivers/gpu/drm/etnaviv/etnaviv_gem.c	9
-rw-r--r--	drivers/gpu/drm/exynos/exynos_drm_gem.c	6
-rw-r--r--	drivers/gpu/drm/gma500/framebuffer.c	2
-rw-r--r--	drivers/gpu/drm/gma500/gem.c	5
-rw-r--r--	drivers/gpu/drm/i915/i915_gem.c	3
-rw-r--r--	drivers/gpu/drm/i915/i915_gem_userptr.c	2
-rw-r--r--	drivers/gpu/drm/msm/msm_gem.c	8
-rw-r--r--	drivers/gpu/drm/omapdrm/omap_gem.c	20
-rw-r--r--	drivers/gpu/drm/tegra/gem.c	4
-rw-r--r--	drivers/gpu/drm/ttm/ttm_bo_vm.c	2
-rw-r--r--	drivers/gpu/drm/udl/udl_gem.c	5
-rw-r--r--	drivers/gpu/drm/vgem/vgem_drv.c	2
-rw-r--r--	drivers/infiniband/core/umem_odp.c	2
-rw-r--r--	drivers/media/v4l2-core/videobuf-dma-sg.c	5
-rw-r--r--	drivers/misc/cxl/context.c	5
-rw-r--r--	drivers/misc/sgi-gru/grumain.c	2
-rw-r--r--	drivers/net/ethernet/intel/igb/igb.h	7
-rw-r--r--	drivers/net/ethernet/intel/igb/igb_main.c	77
-rw-r--r--	drivers/net/wireless/intel/iwlwifi/dvm/calib.c	3
-rw-r--r--	drivers/staging/android/ion/ion.c	2
-rw-r--r--	drivers/staging/lustre/lustre/llite/vvp_io.c	6
-rw-r--r--	drivers/usb/gadget/function/f_hid.c	6
-rw-r--r--	drivers/usb/gadget/function/f_printer.c	6
-rw-r--r--	drivers/vfio/vfio_iommu_type1.c	2
-rw-r--r--	drivers/xen/privcmd.c	2
-rw-r--r--	fs/btrfs/super.c	12
-rw-r--r--	fs/btrfs/tests/btrfs-tests.c	1
-rw-r--r--	fs/dax.c	208
-rw-r--r--	fs/exec.c	2
-rw-r--r--	fs/userfaultfd.c	22
-rw-r--r--	include/linux/dax.h	7
-rw-r--r--	include/linux/dma-mapping.h	20
-rw-r--r--	include/linux/gfp.h	2
-rw-r--r--	include/linux/huge_mm.h	10
-rw-r--r--	include/linux/idr.h	40
-rw-r--r--	include/linux/kdb.h	2
-rw-r--r--	include/linux/kexec.h	6
-rw-r--r--	include/linux/mm.h	46
-rw-r--r--	include/linux/nmi.h	24
-rw-r--r--	include/linux/radix-tree.h	174
-rw-r--r--	include/linux/signal.h	17
-rw-r--r--	include/linux/userfaultfd_k.h	4
-rw-r--r--	ipc/msg.c	5
-rw-r--r--	ipc/sem.c	512
-rw-r--r--	ipc/shm.c	13
-rw-r--r--	kernel/Makefile	1
-rw-r--r--	kernel/debug/debug_core.c	4
-rw-r--r--	kernel/debug/kdb/kdb_io.c	37
-rw-r--r--	kernel/debug/kdb/kdb_main.c	1
-rw-r--r--	kernel/debug/kdb/kdb_private.h	1
-rw-r--r--	kernel/events/uprobes.c	4
-rw-r--r--	kernel/kcov.c	5
-rw-r--r--	kernel/kexec_core.c	5
-rw-r--r--	kernel/printk/printk.c	3
-rw-r--r--	kernel/relay.c	4
-rw-r--r--	kernel/signal.c	7
-rw-r--r--	kernel/sysctl.c	8
-rw-r--r--	kernel/sysctl_binary.c	4
-rw-r--r--	kernel/time/alarmtimer.c	3
-rw-r--r--	kernel/watchdog.c	270
-rw-r--r--	kernel/watchdog_hld.c	227
-rw-r--r--	lib/Kconfig.debug	8
-rw-r--r--	lib/Kconfig.ubsan	3
-rw-r--r--	lib/radix-tree.c	890
-rw-r--r--	mm/compaction.c	17
-rw-r--r--	mm/filemap.c	14
-rw-r--r--	mm/gup.c	20
-rw-r--r--	mm/huge_memory.c	173
-rw-r--r--	mm/internal.h	2
-rw-r--r--	mm/khugepaged.c	31
-rw-r--r--	mm/memory.c	859
-rw-r--r--	mm/nommu.c	10
-rw-r--r--	mm/page-writeback.c	28
-rw-r--r--	mm/page_alloc.c	14
-rw-r--r--	mm/process_vm_access.c	12
-rw-r--r--	mm/shmem.c	32
-rw-r--r--	net/rxrpc/af_rxrpc.c	11
-rw-r--r--	net/rxrpc/conn_client.c	4
-rw-r--r--	security/tomoyo/domain.c	2
-rw-r--r--	tools/include/asm/bug.h	11
-rw-r--r--	tools/include/linux/bitmap.h	26
-rwxr-xr-x	tools/testing/ktest/ktest.pl	8
-rw-r--r--	tools/testing/radix-tree/Makefile	15
-rw-r--r--	tools/testing/radix-tree/benchmark.c	98
-rw-r--r--	tools/testing/radix-tree/find_next_bit.c	57
-rw-r--r--	tools/testing/radix-tree/iteration_check.c	123
-rw-r--r--	tools/testing/radix-tree/linux.c	67
-rw-r--r--	tools/testing/radix-tree/linux/bitops.h	40
-rw-r--r--	tools/testing/radix-tree/linux/bitops/non-atomic.h	13
-rw-r--r--	tools/testing/radix-tree/linux/bug.h	2
-rw-r--r--	tools/testing/radix-tree/linux/gfp.h	22
-rw-r--r--	tools/testing/radix-tree/linux/kernel.h	18
-rw-r--r--	tools/testing/radix-tree/linux/preempt.h	6
-rw-r--r--	tools/testing/radix-tree/linux/slab.h	11
-rw-r--r--	tools/testing/radix-tree/linux/types.h	2
-rw-r--r--	tools/testing/radix-tree/main.c	77
-rw-r--r--	tools/testing/radix-tree/multiorder.c	326
-rw-r--r--	tools/testing/radix-tree/rcupdate.c	86
-rw-r--r--	tools/testing/radix-tree/regression2.c	3
-rw-r--r--	tools/testing/radix-tree/regression3.c	8
-rw-r--r--	tools/testing/radix-tree/tag_check.c	12
-rw-r--r--	tools/testing/radix-tree/test.c	92
-rw-r--r--	tools/testing/radix-tree/test.h	21
-rw-r--r--	usr/Kconfig	127
-rw-r--r--	usr/Makefile	20
-rw-r--r--	virt/kvm/async_pf.c	10
-rw-r--r--	virt/kvm/kvm_main.c	5
140 files changed, 3428 insertions, 2218 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 1b5f15653b1b..69e2387ca278 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must
 not block. If it's not possible to reach a page without blocking,
 filesystem should skip it. Filesystem should use do_set_pte() to setup
 page table entry. Pointer to entry associated with the page is passed in
-"pte" field in fault_env structure. Pointers to entries for other offsets
+"pte" field in vm_fault structure. Pointers to entries for other offsets
 should be calculated relative to "pte".
 
 	->page_mkwrite() is called when a previously read-only pte is
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index cd8aad8226dd..08450a1a5b5f 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -158,7 +158,10 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page,
 		unsigned long attrs)
 {
 	phys_addr_t paddr = page_to_phys(page) + offset;
-	_dma_cache_sync(paddr, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_cache_sync(paddr, size, dir);
+
 	return plat_phys_to_dma(dev, paddr);
 }
 
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 301281645d08..75055df1cda3 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -243,7 +243,8 @@ static int needs_bounce(struct device *dev, dma_addr_t dma_addr, size_t size)
 }
 
 static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
-		enum dma_data_direction dir)
+		enum dma_data_direction dir,
+		unsigned long attrs)
 {
 	struct dmabounce_device_info *device_info = dev->archdata.dmabounce;
 	struct safe_buffer *buf;
@@ -262,7 +263,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 		__func__, buf->ptr, virt_to_dma(dev, buf->ptr),
 		buf->safe, buf->safe_dma_addr);
 
-	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n",
 			__func__, ptr, buf->safe, size);
 		memcpy(buf->safe, ptr, size);
@@ -272,7 +274,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size,
 }
 
 static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
-		size_t size, enum dma_data_direction dir)
+		size_t size, enum dma_data_direction dir,
+		unsigned long attrs)
 {
 	BUG_ON(buf->size != size);
 	BUG_ON(buf->direction != dir);
@@ -283,7 +286,8 @@ static inline void unmap_single(struct device *dev, struct safe_buffer *buf,
 
 	DO_STATS(dev->archdata.dmabounce->bounce_count++);
 
-	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) {
+	if ((dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) &&
+	    !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		void *ptr = buf->ptr;
 
 		dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n",
@@ -334,7 +338,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, struct page *page,
 		return DMA_ERROR_CODE;
 	}
 
-	return map_single(dev, page_address(page) + offset, size, dir);
+	return map_single(dev, page_address(page) + offset, size, dir, attrs);
 }
 
 /*
@@ -357,7 +361,7 @@ static void dmabounce_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t
 		return;
 	}
 
-	unmap_single(dev, buf, size, dir);
+	unmap_single(dev, buf, size, dir, attrs);
 }
 
 static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr,
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index 58610d0df7ed..54534e5d0781 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -146,7 +146,8 @@ static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page,
 {
 	void *cpu_addr = page_address(page) + offset;
 
-	dma_cache_sync(dev, cpu_addr, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_cache_sync(dev, cpu_addr, size, direction);
 	return virt_to_bus(cpu_addr);
 }
 
@@ -162,6 +163,10 @@ static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 		sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset;
 		virt = sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_cache_sync(dev, virt, sg->length, direction);
 	}
 
diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index 53fbbb61aa86..a27a74a18fb0 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
@@ -118,6 +118,10 @@ static int bfin_dma_map_sg(struct device *dev, struct scatterlist *sg_list,
 
 	for_each_sg(sg_list, sg, nents, i) {
 		sg->dma_address = (dma_addr_t) sg_virt(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction);
 	}
 
@@ -143,7 +147,9 @@ static dma_addr_t bfin_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = (dma_addr_t)(page_address(page) + offset);
 
-	_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		_dma_sync(handle, size, dir);
+
 	return handle;
 }
 
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
index db4a6a301f5e..6752df32ef06 100644
--- a/arch/c6x/kernel/dma.c
+++ b/arch/c6x/kernel/dma.c
@@ -42,14 +42,17 @@ static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = virt_to_phys(page_address(page) + offset);
 
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
+
 	return handle;
 }
 
 static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
-	c6x_dma_sync(handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		c6x_dma_sync(handle, size, dir);
 }
 
 static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -60,7 +63,8 @@ static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		c6x_dma_sync(sg->dma_address, sg->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			c6x_dma_sync(sg->dma_address, sg->length, dir);
 	}
 
 	return nents;
@@ -72,9 +76,11 @@ static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *sg;
 	int i;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	for_each_sg(sglist, sg, nents, i)
 		c6x_dma_sync(sg_dma_address(sg), sg->length, dir);
-
 }
 
 static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle,
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index 90f2e4cb33d6..187688128c65 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
@@ -109,16 +109,19 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		int nents, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	int i;
 	struct scatterlist *sg;
+	int i;
+
+	BUG_ON(direction == DMA_NONE);
+
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return nents;
 
 	for_each_sg(sglist, sg, nents, i) {
 		frv_cache_wback_inv(sg_dma_address(sg),
 				    sg_dma_address(sg) + sg_dma_len(sg));
 	}
 
-	BUG_ON(direction == DMA_NONE);
-
 	return nents;
 }
 
@@ -127,7 +130,10 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page,
 		enum dma_data_direction direction, unsigned long attrs)
 {
 	BUG_ON(direction == DMA_NONE);
-	flush_dcache_page(page);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_dcache_page(page);
+
 	return (dma_addr_t) page_to_phys(page) + offset;
 }
 
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index f585745b1abc..dba7df918144 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -40,13 +40,16 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		int nents, enum dma_data_direction direction,
 		unsigned long attrs)
 {
+	struct scatterlist *sg;
 	unsigned long dampr2;
 	void *vaddr;
 	int i;
-	struct scatterlist *sg;
 
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return nents;
+
 	dampr2 = __get_DAMPR(2);
 
 	for_each_sg(sglist, sg, nents, i) {
@@ -70,7 +73,9 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size,
 		enum dma_data_direction direction, unsigned long attrs)
 {
-	flush_dcache_page(page);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_dcache_page(page);
+
 	return (dma_addr_t) page_to_phys(page) + offset;
 }
 
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index b9017785fb71..dbc4f1003da4 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -119,6 +119,9 @@ static int hexagon_map_sg(struct device *hwdev, struct scatterlist *sg,
 
 		s->dma_length = s->length;
 
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		flush_dcache_range(dma_addr_to_virt(s->dma_address),
 				   dma_addr_to_virt(s->dma_address + s->length));
 	}
@@ -180,7 +183,8 @@ static dma_addr_t hexagon_map_page(struct device *dev, struct page *page,
 	if (!check_addr("map_single", dev, bus, size))
 		return bad_dma_address;
 
-	dma_sync(dma_addr_to_virt(bus), size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync(dma_addr_to_virt(bus), size, dir);
 
 	return bus;
 }
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index 8cf97cbadc91..07070065a425 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -134,7 +134,9 @@ static dma_addr_t m68k_dma_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t handle = page_to_phys(page) + offset;
 
-	dma_sync_single_for_device(dev, handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_single_for_device(dev, handle, size, dir);
+
 	return handle;
 }
 
@@ -146,6 +148,10 @@ static int m68k_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_single_for_device(dev, sg->dma_address, sg->length,
 					   dir);
 	}
diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c
index 0db31e24c541..91968d92652b 100644
--- a/arch/metag/kernel/dma.c
+++ b/arch/metag/kernel/dma.c
@@ -484,8 +484,9 @@ static dma_addr_t metag_dma_map_page(struct device *dev, struct page *page,
 		unsigned long offset, size_t size,
 		enum dma_data_direction direction, unsigned long attrs)
 {
-	dma_sync_for_device((void *)(page_to_phys(page) + offset), size,
-			    direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_for_device((void *)(page_to_phys(page) + offset),
+				    size, direction);
 	return page_to_phys(page) + offset;
 }
 
@@ -493,7 +494,8 @@ static void metag_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 		size_t size, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
 }
 
 static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -507,6 +509,10 @@ static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_for_device(sg_virt(sg), sg->length, direction);
 	}
 
@@ -525,6 +531,10 @@ static void metag_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 		BUG_ON(!sg_page(sg));
 
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		dma_sync_for_cpu(sg_virt(sg), sg->length, direction);
 	}
 }
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index ec04dc1e2527..818daf230eb4 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -61,6 +61,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
 	/* FIXME this part of code is untested */
 	for_each_sg(sgl, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync(page_to_phys(sg_page(sg)) + sg->offset,
 			   sg->length, direction);
 	}
@@ -80,7 +84,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 					     enum dma_data_direction direction,
 					     unsigned long attrs)
 {
-	__dma_sync(page_to_phys(page) + offset, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync(page_to_phys(page) + offset, size, direction);
 	return page_to_phys(page) + offset;
 }
 
@@ -95,7 +100,8 @@ static inline void dma_direct_unmap_page(struct device *dev,
  * phys_to_virt is here because in __dma_sync_page is __virt_to_phys and
  * dma_address is physical address
  */
-	__dma_sync(dma_address, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync(dma_address, size, direction);
 }
 
 static inline void
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index 1a80b6f73ab2..aab4fd681e1f 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -61,7 +61,7 @@ static int loongson_dma_map_sg(struct device *dev, struct scatterlist *sg,
 				int nents, enum dma_data_direction dir,
 				unsigned long attrs)
 {
-	int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, 0);
+	int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, attrs);
 	mb();
 
 	return r;
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index 46d5696c4f27..a39c36af97ad 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -293,7 +293,7 @@ static inline void __dma_sync(struct page *page,
 static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	size_t size, enum dma_data_direction direction, unsigned long attrs)
 {
-	if (cpu_needs_post_dma_flush(dev))
+	if (cpu_needs_post_dma_flush(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		__dma_sync(dma_addr_to_page(dev, dma_addr),
 			   dma_addr & ~PAGE_MASK, size, direction);
 	plat_post_dma_flush(dev);
@@ -307,7 +307,8 @@ static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *sg;
 
 	for_each_sg(sglist, sg, nents, i) {
-		if (!plat_device_is_coherent(dev))
+		if (!plat_device_is_coherent(dev) &&
+		    !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 			__dma_sync(sg_page(sg), sg->offset, sg->length,
 				   direction);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
@@ -324,7 +325,7 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page,
 	unsigned long offset, size_t size, enum dma_data_direction direction,
 	unsigned long attrs)
 {
-	if (!plat_device_is_coherent(dev))
+	if (!plat_device_is_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		__dma_sync(page, offset, size, direction);
 
 	return plat_map_dma_mem_page(dev, page) + offset;
@@ -339,6 +340,7 @@ static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nhwentries, i) {
 		if (!plat_device_is_coherent(dev) &&
+		    !(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
 		    direction != DMA_TO_DEVICE)
 			__dma_sync(sg_page(sg), sg->offset, sg->length,
 				   direction);
diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c
index d800fad87896..f6a5dcf9d682 100644
--- a/arch/nios2/mm/dma-mapping.c
+++ b/arch/nios2/mm/dma-mapping.c
@@ -98,13 +98,17 @@ static int nios2_dma_map_sg(struct device *dev, struct scatterlist *sg,
 	int i;
 
 	for_each_sg(sg, sg, nents, i) {
-		void *addr;
+		void *addr = sg_virt(sg);
 
-		addr = sg_virt(sg);
-		if (addr) {
-			__dma_sync_for_device(addr, sg->length, direction);
-			sg->dma_address = sg_phys(sg);
-		}
+		if (!addr)
+			continue;
+
+		sg->dma_address = sg_phys(sg);
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
+		__dma_sync_for_device(addr, sg->length, direction);
 	}
 
 	return nents;
@@ -117,7 +121,9 @@ static dma_addr_t nios2_dma_map_page(struct device *dev, struct page *page,
 {
 	void *addr = page_address(page) + offset;
 
-	__dma_sync_for_device(addr, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_for_device(addr, size, direction);
+
 	return page_to_phys(page) + offset;
 }
 
@@ -125,7 +131,8 @@ static void nios2_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 		size_t size, enum dma_data_direction direction,
 		unsigned long attrs)
 {
-	__dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_for_cpu(phys_to_virt(dma_address), size, direction);
 }
 
 static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
@@ -138,6 +145,9 @@ static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 	if (direction == DMA_TO_DEVICE)
 		return;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	for_each_sg(sg, sg, nhwentries, i) {
 		addr = sg_virt(sg);
 		if (addr)
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 140c99140649..906998bac957 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -141,6 +141,9 @@ or1k_map_page(struct device *dev, struct page *page,
 	unsigned long cl;
 	dma_addr_t addr = page_to_phys(page) + offset;
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return addr;
+
 	switch (dir) {
 	case DMA_TO_DEVICE:
 		/* Flush the dcache for the requested range */
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 494ff6e8c88a..b6298a85e8ae 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -459,7 +459,9 @@ static dma_addr_t pa11_dma_map_page(struct device *dev, struct page *page,
 	void *addr = page_address(page) + offset;
 	BUG_ON(direction == DMA_NONE);
 
-	flush_kernel_dcache_range((unsigned long) addr, size);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		flush_kernel_dcache_range((unsigned long) addr, size);
+
 	return virt_to_phys(addr);
 }
 
@@ -469,8 +471,11 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
 {
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	if (direction == DMA_TO_DEVICE)
 		return;
 
 	/*
 	 * For PCI_DMA_FROMDEVICE this flush is not necessary for the
@@ -479,7 +484,6 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
 	 */
 
 	flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size);
-	return;
 }
 
 static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist,
@@ -496,6 +500,10 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 		sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr);
 		sg_dma_len(sg) = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		flush_kernel_dcache_range(vaddr, sg->length);
 	}
 	return nents;
@@ -510,14 +518,16 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 
 	BUG_ON(direction == DMA_NONE);
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	if (direction == DMA_TO_DEVICE)
 		return;
 
 	/* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */
 
 	for_each_sg(sglist, sg, nents, i)
 		flush_kernel_vmap_range(sg_virt(sg), sg->length);
-	return;
 }
 
 static void pa11_dma_sync_single_for_cpu(struct device *dev,
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index e64a6016fba7..6877e3fa95bb 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl,
 	for_each_sg(sgl, sg, nents, i) {
 		sg->dma_address = sg_phys(sg) + get_dma_offset(dev);
 		sg->dma_length = sg->length;
+
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+
 		__dma_sync_page(sg_page(sg), sg->offset, sg->length, direction);
 	}
 
@@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev,
 					     unsigned long attrs)
 {
 	BUG_ON(dir == DMA_NONE);
-	__dma_sync_page(page, offset, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_sync_page(page, offset, size, dir);
+
 	return page_to_phys(page) + offset + get_dma_offset(dev);
 }
 
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 06254467e4dd..3a147122bc98 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -236,7 +236,6 @@ static int
 spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct spu_context *ctx = vma->vm_file->private_data;
-	unsigned long address = (unsigned long)vmf->virtual_address;
 	unsigned long pfn, offset;
 
 	offset = vmf->pgoff << PAGE_SHIFT;
@@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 
 	pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n",
-			address, offset);
+			vmf->address, offset);
 
 	if (spu_acquire(ctx))
 		return VM_FAULT_NOPAGE;
@@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
 		pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT;
 	}
-	vm_insert_pfn(vma, address, pfn);
+	vm_insert_pfn(vma, vmf->address, pfn);
 
 	spu_release(ctx);
 
@@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma,
 		down_read(&current->mm->mmap_sem);
 	} else {
 		area = ctx->spu->problem_phys + ps_offs;
-		vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
-					(area + offset) >> PAGE_SHIFT);
+		vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
 		spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu);
 	}
 
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index eadb669a7329..47fee3b6e29c 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
@@ -18,7 +18,9 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
 	dma_addr_t addr = page_to_phys(page) + offset;
 
 	WARN_ON(size == 0);
-	dma_cache_sync(dev, page_address(page) + offset, size, dir);
+
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		dma_cache_sync(dev, page_address(page) + offset, size, dir);
 
 	return addr;
 }
@@ -35,7 +37,8 @@ static int nommu_map_sg(struct device *dev, struct scatterlist *sg,
 	for_each_sg(sg, s, nents, i) {
 		BUG_ON(!sg_page(s));
 
-		dma_cache_sync(dev, sg_virt(s), s->length, dir);
+		if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+			dma_cache_sync(dev, sg_virt(s), s->length, dir);
 
 		s->dma_address = sg_phys(s);
 		s->dma_length = s->length;
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 852a3291db96..9df997995f6b 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -415,7 +415,7 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr,
 		ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL;
 
 	/* Step 1: Kick data out of streaming buffers if necessary. */
-	if (strbuf->strbuf_enabled)
+	if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		strbuf_flush(strbuf, iommu, bus_addr, ctx,
 			     npages, direction);
 
@@ -640,7 +640,7 @@ static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	base = iommu->page_table + entry;
 
 	dma_handle &= IO_PAGE_MASK;
-	if (strbuf->strbuf_enabled)
+	if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		strbuf_flush(strbuf, iommu, dma_handle, ctx,
 			     npages, direction);
 
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 2344103414d1..6ffaec44931a 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -527,7 +527,7 @@ static dma_addr_t pci32_map_page(struct device *dev, struct page *page,
 static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size,
 			     enum dma_data_direction dir, unsigned long attrs)
 {
-	if (dir != PCI_DMA_TODEVICE)
+	if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
 		dma_make_coherent(ba, PAGE_ALIGN(size));
 }
 
@@ -572,7 +572,7 @@ static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int n;
 
-	if (dir != PCI_DMA_TODEVICE) {
+	if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
 		for_each_sg(sgl, sg, nents, n) {
 			dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length));
 		}
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a9973bb4a1b2..95e73c63c99d 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -42,7 +42,7 @@ static int panic_on_timeout;
  */
 atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
 EXPORT_SYMBOL(nmi_active);
-
+static int nmi_init_done;
 static unsigned int nmi_hz = HZ;
 static DEFINE_PER_CPU(short, wd_enabled);
 static int endflag __initdata;
@@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count)
 
 void stop_nmi_watchdog(void *unused)
 {
+	if (!__this_cpu_read(wd_enabled))
+		return;
 	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
 	__this_cpu_write(wd_enabled, 0);
 	atomic_dec(&nmi_active);
@@ -207,6 +209,9 @@ error:
 
 void start_nmi_watchdog(void *unused)
 {
+	if (__this_cpu_read(wd_enabled))
+		return;
+
 	__this_cpu_write(wd_enabled, 1);
 	atomic_inc(&nmi_active);
 
@@ -259,6 +264,8 @@ int __init nmi_init(void)
 		}
 	}
 
+	nmi_init_done = 1;
+
 	return err;
 }
 
@@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str)
 	return 0;
 }
 __setup("nmi_watchdog=", setup_nmi_watchdog);
+
+/*
+ * sparc specific NMI watchdog enable function.
+ * Enables watchdog if it is not enabled already.
+ */
+int watchdog_nmi_enable(unsigned int cpu)
+{
+	if (atomic_read(&nmi_active) == -1) {
+		pr_warn("NMI watchdog cannot be enabled or disabled\n");
+		return -1;
+	}
+
+	/*
+	 * watchdog thread could start even before nmi_init is called.
+	 * Just Return in that case. Let nmi_init finish the init
+	 * process first.
+	 */
+	if (!nmi_init_done)
+		return 0;
+
+	smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1);
+
+	return 0;
+}
+/*
+ * sparc specific NMI watchdog disable function.
+ * Disables watchdog if it is not disabled already.
+ */
+void watchdog_nmi_disable(unsigned int cpu)
+{
+	if (atomic_read(&nmi_active) == -1)
+		pr_warn_once("NMI watchdog cannot be enabled or disabled\n");
+	else
+		smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1);
+}
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 09bb774b39cd..24e0f8c21f2f 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -213,10 +213,12 @@ static int tile_dma_map_sg(struct device *dev, struct scatterlist *sglist,
 
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
-		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
 		sg->dma_length = sg->length;
 #endif
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
+		__dma_prep_pa_range(sg->dma_address, sg->length, direction);
 	}
 
 	return nents;
@@ -232,6 +234,8 @@ static void tile_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 	BUG_ON(!valid_dma_direction(direction));
 	for_each_sg(sglist, sg, nents, i) {
 		sg->dma_address = sg_phys(sg);
+		if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+			continue;
 		__dma_complete_pa_range(sg->dma_address, sg->length,
 					direction);
 	}
@@ -245,7 +249,8 @@ static dma_addr_t tile_dma_map_page(struct device *dev, struct page *page,
 	BUG_ON(!valid_dma_direction(direction));
 
 	BUG_ON(offset + size > PAGE_SIZE);
-	__dma_prep_page(page, offset, size, direction);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		__dma_prep_page(page, offset, size, direction);
 
 	return page_to_pa(page) + offset;
 }
@@ -256,6 +261,9 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address,
 {
 	BUG_ON(!valid_dma_direction(direction));
 
+	if (attrs & DMA_ATTR_SKIP_CPU_SYNC)
+		return;
+
 	__dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)),
 			    dma_address & (PAGE_SIZE - 1), size, direction);
 }
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index e739002427ed..40121d14d34d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 		return VM_FAULT_SIGBUS;
 
 	if (sym_offset == image->sym_vvar_page) {
-		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
+		ret = vm_insert_pfn(vma, vmf->address,
 				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
 	} else if (sym_offset == image->sym_pvclock_page) {
 		struct pvclock_vsyscall_time_info *pvti =
@@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
 			ret = vm_insert_pfn(
 				vma,
-				(unsigned long)vmf->virtual_address,
+				vmf->address,
 				__pa(pvti) >> PAGE_SHIFT);
 		}
 	}
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 8c1f218926d7..307b1f4543de 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
-	VMCOREINFO_SYMBOL(phys_base);
+	VMCOREINFO_NUMBER(phys_base);
 	VMCOREINFO_SYMBOL(init_level4_pgt);
 
 #ifdef CONFIG_NUMA
@@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 	vmcoreinfo_append_str("KERNELOFFSET=%lx\n",
 			      kaslr_offset());
-	VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET);
-	VMCOREINFO_VMALLOC_START(VMALLOC_START);
-	VMCOREINFO_VMEMMAP_START(VMEMMAP_START);
+	VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
 }
 
 /* arch-dependent functionality related to kexec file-based syscall */
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 1e68806d6695..6a16decf278f 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -189,7 +189,9 @@ static dma_addr_t xtensa_map_page(struct device *dev, struct page *page,
 {
 	dma_addr_t dma_handle = page_to_phys(page) + offset;
 
-	xtensa_sync_single_for_device(dev, dma_handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		xtensa_sync_single_for_device(dev, dma_handle, size, dir);
+
 	return dma_handle;
 }
 
@@ -197,7 +199,8 @@ static void xtensa_unmap_page(struct device *dev, dma_addr_t dma_handle,
 			     size_t size, enum dma_data_direction dir,
 			     unsigned long attrs)
 {
-	xtensa_sync_single_for_cpu(dev, dma_handle, size, dir);
+	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+		xtensa_sync_single_for_cpu(dev, dma_handle, size, dir);
 }
 
 static int xtensa_map_sg(struct device *dev, struct scatterlist *sg,
diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index 199b8e99f7d7..737187865269 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
@@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma,
 	unsigned long pa;
 	struct page *page;
 
-	dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start
-						+ agp->aperture.bus_base;
+	dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base;
 	pa = agp->ops->translate(agp, dma_addr);
 
 	if (pa == (unsigned long)-EINVAL)
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index f3f92d5fcda0..a697ca0cab1e 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * be because another thread has installed the pte first, so it
 	 * is no problem.
 	 */
-	vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
+	vm_insert_pfn(vma, vmf->address, pfn);
 
 	return VM_FAULT_NOPAGE;
 }
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 7a4869151d3b..a77262d31911 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(tpm_put_ops);
  *
  * The return'd chip has been tpm_try_get_ops'd and must be released via
  * tpm_put_ops
- */
+ */
 struct tpm_chip *tpm_chip_find_get(int chip_num)
 {
 	struct tpm_chip *chip, *res = NULL;
@@ -103,7 +103,7 @@ struct tpm_chip *tpm_chip_find_get(int chip_num)
 			}
 		} while (chip_prev != chip_num);
 	} else {
-		chip = idr_find_slowpath(&dev_nums_idr, chip_num);
+		chip = idr_find(&dev_nums_idr, chip_num);
 		if (chip && !tpm_try_get_ops(chip))
 			res = chip;
 	}
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 286447a83dab..26ec39ddf21f 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
 static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 		struct vm_fault *vmf)
 {
-	unsigned long vaddr = (unsigned long) vmf->virtual_address;
 	struct device *dev = &dax_dev->dev;
 	struct dax_region *dax_region;
 	int rc = VM_FAULT_SIGBUS;
@@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
 
 	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
 
-	rc = vm_insert_mixed(vma, vaddr, pfn);
+	rc = vm_insert_mixed(vma, vmf->address, pfn);
 
 	if (rc == -ENOMEM)
 		return VM_FAULT_OOM;
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 768087ddb046..a293c8be232c 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -17,12 +17,11 @@
 static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data);
-	unsigned long addr = (unsigned long)vmf->virtual_address;
 	unsigned long pfn = obj->phys_addr >> PAGE_SHIFT;
 	int ret;
 
-	pfn += (addr - vma->vm_start) >> PAGE_SHIFT;
-	ret = vm_insert_pfn(vma, addr, pfn);
+	pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT;
+	ret = vm_insert_pfn(vma, vmf->address, pfn);
 
 	switch (ret) {
 	case 0:
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index caa4e4ca616d..bd311c77c254 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Using vm_pgoff as a selector forces us to use this unusual
 	 * addressing scheme.
 	 */
-	resource_size_t offset = (unsigned long)vmf->virtual_address -
-		vma->vm_start;
+	resource_size_t offset = vmf->address - vma->vm_start;
 	resource_size_t baddr = map->offset + offset;
 	struct drm_agp_mem *agpmem;
 	struct page *page;
@@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!map)
 		return VM_FAULT_SIGBUS;	/* Nothing allocated */
 
-	offset = (unsigned long)vmf->virtual_address - vma->vm_start;
+	offset = vmf->address - vma->vm_start;
 	i = (unsigned long)map->handle + offset;
 	page = vmalloc_to_page((void *)i);
 	if (!page)
@@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (!dma->pagelist)
 		return VM_FAULT_SIGBUS;	/* Nothing allocated */
 
-	offset = (unsigned long)vmf->virtual_address - vma->vm_start;	/* vm_[pg]off[set] should be 0 */
+	offset = vmf->address - vma->vm_start;
+					/* vm_[pg]off[set] should be 0 */
 	page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */
 	page = virt_to_page((void *)dma->pagelist[page_nr]);
 
@@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
337 if (!entry->pagelist) 337 if (!entry->pagelist)
338 return VM_FAULT_SIGBUS; /* Nothing allocated */ 338 return VM_FAULT_SIGBUS; /* Nothing allocated */
339 339
340 offset = (unsigned long)vmf->virtual_address - vma->vm_start; 340 offset = vmf->address - vma->vm_start;
341 map_offset = map->offset - (unsigned long)dev->sg->virtual; 341 map_offset = map->offset - (unsigned long)dev->sg->virtual;
342 page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); 342 page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT);
343 page = entry->pagelist[page_offset]; 343 page = entry->pagelist[page_offset];
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
index 7d066a91d778..114dddbd297b 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
@@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
202 } 202 }
203 203
204 /* We don't use vmf->pgoff since that has the fake offset: */ 204 /* We don't use vmf->pgoff since that has the fake offset: */
205 pgoff = ((unsigned long)vmf->virtual_address - 205 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
206 vma->vm_start) >> PAGE_SHIFT;
207 206
208 page = pages[pgoff]; 207 page = pages[pgoff];
209 208
210 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 209 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
211 page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); 210 page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT);
212 211
213 ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); 212 ret = vm_insert_page(vma, vmf->address, page);
214 213
215out: 214out:
216 switch (ret) { 215 switch (ret) {
@@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
759 down_read(&mm->mmap_sem); 758 down_read(&mm->mmap_sem);
760 while (pinned < npages) { 759 while (pinned < npages) {
761 ret = get_user_pages_remote(task, mm, ptr, npages - pinned, 760 ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
762 flags, pvec + pinned, NULL); 761 flags, pvec + pinned, NULL, NULL);
763 if (ret < 0) 762 if (ret < 0)
764 break; 763 break;
765 764
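
The get_user_pages_remote() caller above (and the i915, InfiniBand, VFIO and exec callers later in this diff) gains one extra trailing argument, passed as NULL throughout the series; it appears to be an `int *locked` out-parameter that lets GUP drop mmap_sem internally, and passing NULL preserves the old behaviour. A hedged sketch of the updated call:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Sketch only: pin nr_pages of a foreign mm for writing.  The final
     * NULL is the new "locked" argument; NULL keeps the old semantics. */
    static long example_pin_remote(struct task_struct *task, struct mm_struct *mm,
                                   unsigned long start, unsigned long nr_pages,
                                   struct page **pages)
    {
            long pinned;

            down_read(&mm->mmap_sem);
            pinned = get_user_pages_remote(task, mm, start, nr_pages, FOLL_WRITE,
                                           pages, NULL /* vmas */,
                                           NULL /* locked */);
            up_read(&mm->mmap_sem);
            return pinned;
    }
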
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c
index ea7a18230888..57b81460fec8 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_gem.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c
@@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
455 pgoff_t page_offset; 455 pgoff_t page_offset;
456 int ret; 456 int ret;
457 457
458 page_offset = ((unsigned long)vmf->virtual_address - 458 page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
459 vma->vm_start) >> PAGE_SHIFT;
460 459
461 if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { 460 if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) {
462 DRM_ERROR("invalid page offset\n"); 461 DRM_ERROR("invalid page offset\n");
@@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
465 } 464 }
466 465
467 pfn = page_to_pfn(exynos_gem->pages[page_offset]); 466 pfn = page_to_pfn(exynos_gem->pages[page_offset]);
468 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, 467 ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
469 __pfn_to_pfn_t(pfn, PFN_DEV));
470 468
471out: 469out:
472 switch (ret) { 470 switch (ret) {
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c
index 4071b2d1e8cf..8b44fa542562 100644
--- a/drivers/gpu/drm/gma500/framebuffer.c
+++ b/drivers/gpu/drm/gma500/framebuffer.c
@@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
125 psbfb->gtt->offset; 125 psbfb->gtt->offset;
126 126
127 page_num = vma_pages(vma); 127 page_num = vma_pages(vma);
128 address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT); 128 address = vmf->address - (vmf->pgoff << PAGE_SHIFT);
129 129
130 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 130 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
131 131
diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c
index 6d1cb6b370b1..527c62917660 100644
--- a/drivers/gpu/drm/gma500/gem.c
+++ b/drivers/gpu/drm/gma500/gem.c
@@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
197 197
198 /* Page relative to the VMA start - we must calculate this ourselves 198 /* Page relative to the VMA start - we must calculate this ourselves
199 because vmf->pgoff is the fake GEM offset */ 199 because vmf->pgoff is the fake GEM offset */
200 page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start) 200 page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
201 >> PAGE_SHIFT;
202 201
203 /* CPU view of the page, don't go via the GART for CPU writes */ 202 /* CPU view of the page, don't go via the GART for CPU writes */
204 if (r->stolen) 203 if (r->stolen)
205 pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; 204 pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT;
206 else 205 else
207 pfn = page_to_pfn(r->pages[page_offset]); 206 pfn = page_to_pfn(r->pages[page_offset]);
208 ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); 207 ret = vm_insert_pfn(vma, vmf->address, pfn);
209 208
210fail: 209fail:
211 mutex_unlock(&dev_priv->mmap_mutex); 210 mutex_unlock(&dev_priv->mmap_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d0dcaf35b429..412f3513f269 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf)
1796 int ret; 1796 int ret;
1797 1797
1798 /* We don't use vmf->pgoff since that has the fake offset */ 1798 /* We don't use vmf->pgoff since that has the fake offset */
1799 page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> 1799 page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT;
1800 PAGE_SHIFT;
1801 1800
1802 trace_i915_gem_object_fault(obj, page_offset, true, write); 1801 trace_i915_gem_object_fault(obj, page_offset, true, write);
1803 1802
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 107ddf51065e..d068af2ec3a3 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work)
515 obj->userptr.ptr + pinned * PAGE_SIZE, 515 obj->userptr.ptr + pinned * PAGE_SIZE,
516 npages - pinned, 516 npages - pinned,
517 flags, 517 flags,
518 pvec + pinned, NULL); 518 pvec + pinned, NULL, NULL);
519 if (ret < 0) 519 if (ret < 0)
520 break; 520 break;
521 521
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c
index cd06cfd94687..d8bc59c7e261 100644
--- a/drivers/gpu/drm/msm/msm_gem.c
+++ b/drivers/gpu/drm/msm/msm_gem.c
@@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
225 } 225 }
226 226
227 /* We don't use vmf->pgoff since that has the fake offset: */ 227 /* We don't use vmf->pgoff since that has the fake offset: */
228 pgoff = ((unsigned long)vmf->virtual_address - 228 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
229 vma->vm_start) >> PAGE_SHIFT;
230 229
231 pfn = page_to_pfn(pages[pgoff]); 230 pfn = page_to_pfn(pages[pgoff]);
232 231
233 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 232 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
234 pfn, pfn << PAGE_SHIFT); 233 pfn, pfn << PAGE_SHIFT);
235 234
236 ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, 235 ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
237 __pfn_to_pfn_t(pfn, PFN_DEV));
238 236
239out_unlock: 237out_unlock:
240 mutex_unlock(&dev->struct_mutex); 238 mutex_unlock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c
index d4e1e11466f8..4a90c690f09e 100644
--- a/drivers/gpu/drm/omapdrm/omap_gem.c
+++ b/drivers/gpu/drm/omapdrm/omap_gem.c
@@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj,
398 pgoff_t pgoff; 398 pgoff_t pgoff;
399 399
400 /* We don't use vmf->pgoff since that has the fake offset: */ 400 /* We don't use vmf->pgoff since that has the fake offset: */
401 pgoff = ((unsigned long)vmf->virtual_address - 401 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
402 vma->vm_start) >> PAGE_SHIFT;
403 402
404 if (omap_obj->pages) { 403 if (omap_obj->pages) {
405 omap_gem_cpu_sync(obj, pgoff); 404 omap_gem_cpu_sync(obj, pgoff);
@@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj,
409 pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; 408 pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff;
410 } 409 }
411 410
412 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 411 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
413 pfn, pfn << PAGE_SHIFT); 412 pfn, pfn << PAGE_SHIFT);
414 413
415 return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, 414 return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV));
416 __pfn_to_pfn_t(pfn, PFN_DEV));
417} 415}
418 416
419/* Special handling for the case of faulting in 2d tiled buffers */ 417/* Special handling for the case of faulting in 2d tiled buffers */
@@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj,
427 struct page *pages[64]; /* XXX is this too much to have on stack? */ 425 struct page *pages[64]; /* XXX is this too much to have on stack? */
428 unsigned long pfn; 426 unsigned long pfn;
429 pgoff_t pgoff, base_pgoff; 427 pgoff_t pgoff, base_pgoff;
430 void __user *vaddr; 428 unsigned long vaddr;
431 int i, ret, slots; 429 int i, ret, slots;
432 430
433 /* 431 /*
@@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj,
447 const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); 445 const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE);
448 446
449 /* We don't use vmf->pgoff since that has the fake offset: */ 447 /* We don't use vmf->pgoff since that has the fake offset: */
450 pgoff = ((unsigned long)vmf->virtual_address - 448 pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
451 vma->vm_start) >> PAGE_SHIFT;
452 449
453 /* 450 /*
454 * Actual address we start mapping at is rounded down to previous slot 451 * Actual address we start mapping at is rounded down to previous slot
@@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj,
459 /* figure out buffer width in slots */ 456 /* figure out buffer width in slots */
460 slots = omap_obj->width >> priv->usergart[fmt].slot_shift; 457 slots = omap_obj->width >> priv->usergart[fmt].slot_shift;
461 458
462 vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT); 459 vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT);
463 460
464 entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; 461 entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last];
465 462
@@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj,
503 500
504 pfn = entry->paddr >> PAGE_SHIFT; 501 pfn = entry->paddr >> PAGE_SHIFT;
505 502
506 VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, 503 VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address,
507 pfn, pfn << PAGE_SHIFT); 504 pfn, pfn << PAGE_SHIFT);
508 505
509 for (i = n; i > 0; i--) { 506 for (i = n; i > 0; i--) {
510 vm_insert_mixed(vma, (unsigned long)vaddr, 507 vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV));
511 __pfn_to_pfn_t(pfn, PFN_DEV));
512 pfn += priv->usergart[fmt].stride_pfn; 508 pfn += priv->usergart[fmt].stride_pfn;
513 vaddr += PAGE_SIZE * m; 509 vaddr += PAGE_SIZE * m;
514 } 510 }
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index c08e5279eeac..7d853e6b5ff0 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
452 if (!bo->pages) 452 if (!bo->pages)
453 return VM_FAULT_SIGBUS; 453 return VM_FAULT_SIGBUS;
454 454
455 offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; 455 offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
456 page = bo->pages[offset]; 456 page = bo->pages[offset];
457 457
458 err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); 458 err = vm_insert_page(vma, vmf->address, page);
459 switch (err) { 459 switch (err) {
460 case -EAGAIN: 460 case -EAGAIN:
461 case 0: 461 case 0:
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
index 4748aedc933a..68ef993ab431 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
@@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
101 struct page *page; 101 struct page *page;
102 int ret; 102 int ret;
103 int i; 103 int i;
104 unsigned long address = (unsigned long)vmf->virtual_address; 104 unsigned long address = vmf->address;
105 int retval = VM_FAULT_NOPAGE; 105 int retval = VM_FAULT_NOPAGE;
106 struct ttm_mem_type_manager *man = 106 struct ttm_mem_type_manager *man =
107 &bdev->man[bo->mem.mem_type]; 107 &bdev->man[bo->mem.mem_type];
diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c
index 818e70712b18..3c0c4bd3f750 100644
--- a/drivers/gpu/drm/udl/udl_gem.c
+++ b/drivers/gpu/drm/udl/udl_gem.c
@@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
107 unsigned int page_offset; 107 unsigned int page_offset;
108 int ret = 0; 108 int ret = 0;
109 109
110 page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> 110 page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
111 PAGE_SHIFT;
112 111
113 if (!obj->pages) 112 if (!obj->pages)
114 return VM_FAULT_SIGBUS; 113 return VM_FAULT_SIGBUS;
115 114
116 page = obj->pages[page_offset]; 115 page = obj->pages[page_offset];
117 ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); 116 ret = vm_insert_page(vma, vmf->address, page);
118 switch (ret) { 117 switch (ret) {
119 case -EAGAIN: 118 case -EAGAIN:
120 case 0: 119 case 0:
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c
index f36c14729b55..477e07f0ecb6 100644
--- a/drivers/gpu/drm/vgem/vgem_drv.c
+++ b/drivers/gpu/drm/vgem/vgem_drv.c
@@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
54{ 54{
55 struct drm_vgem_gem_object *obj = vma->vm_private_data; 55 struct drm_vgem_gem_object *obj = vma->vm_private_data;
56 /* We don't use vmf->pgoff since that has the fake offset */ 56 /* We don't use vmf->pgoff since that has the fake offset */
57 unsigned long vaddr = (unsigned long)vmf->virtual_address; 57 unsigned long vaddr = vmf->address;
58 struct page *page; 58 struct page *page;
59 59
60 page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, 60 page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping,
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 1f0fe3217f23..6b079a31dced 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
578 */ 578 */
579 npages = get_user_pages_remote(owning_process, owning_mm, 579 npages = get_user_pages_remote(owning_process, owning_mm,
580 user_virt, gup_num_pages, 580 user_virt, gup_num_pages,
581 flags, local_page_list, NULL); 581 flags, local_page_list, NULL, NULL);
582 up_read(&owning_mm->mmap_sem); 582 up_read(&owning_mm->mmap_sem);
583 583
584 if (npages < 0) 584 if (npages < 0)
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 1db0af6c7f94..ba63ca57ed7e 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
439 struct page *page; 439 struct page *page;
440 440
441 dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", 441 dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n",
442 (unsigned long)vmf->virtual_address, 442 vmf->address, vma->vm_start, vma->vm_end);
443 vma->vm_start, vma->vm_end);
444 443
445 page = alloc_page(GFP_USER | __GFP_DMA32); 444 page = alloc_page(GFP_USER | __GFP_DMA32);
446 if (!page) 445 if (!page)
447 return VM_FAULT_OOM; 446 return VM_FAULT_OOM;
448 clear_user_highpage(page, (unsigned long)vmf->virtual_address); 447 clear_user_highpage(page, vmf->address);
449 vmf->page = page; 448 vmf->page = page;
450 449
451 return 0; 450 return 0;
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 5e506c19108a..5d36dcc7f47e 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -117,13 +117,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
117static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 117static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
118{ 118{
119 struct cxl_context *ctx = vma->vm_file->private_data; 119 struct cxl_context *ctx = vma->vm_file->private_data;
120 unsigned long address = (unsigned long)vmf->virtual_address;
121 u64 area, offset; 120 u64 area, offset;
122 121
123 offset = vmf->pgoff << PAGE_SHIFT; 122 offset = vmf->pgoff << PAGE_SHIFT;
124 123
125 pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", 124 pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n",
126 __func__, ctx->pe, address, offset); 125 __func__, ctx->pe, vmf->address, offset);
127 126
128 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { 127 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
129 area = ctx->afu->psn_phys; 128 area = ctx->afu->psn_phys;
@@ -155,7 +154,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
155 return VM_FAULT_SIGBUS; 154 return VM_FAULT_SIGBUS;
156 } 155 }
157 156
158 vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT); 157 vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT);
159 158
160 mutex_unlock(&ctx->status_mutex); 159 mutex_unlock(&ctx->status_mutex);
161 160
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c
index 33741ad4a74a..af2e077da4b8 100644
--- a/drivers/misc/sgi-gru/grumain.c
+++ b/drivers/misc/sgi-gru/grumain.c
@@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
932 unsigned long paddr, vaddr; 932 unsigned long paddr, vaddr;
933 unsigned long expires; 933 unsigned long expires;
934 934
935 vaddr = (unsigned long)vmf->virtual_address; 935 vaddr = vmf->address;
936 gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", 936 gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n",
937 vma, vaddr, GSEG_BASE(vaddr)); 937 vma, vaddr, GSEG_BASE(vaddr));
938 STAT(nopfn); 938 STAT(nopfn);
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index d11093dce1b9..acbc3abe2ddd 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -210,7 +210,12 @@ struct igb_tx_buffer {
210struct igb_rx_buffer { 210struct igb_rx_buffer {
211 dma_addr_t dma; 211 dma_addr_t dma;
212 struct page *page; 212 struct page *page;
213 unsigned int page_offset; 213#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
214 __u32 page_offset;
215#else
216 __u16 page_offset;
217#endif
218 __u16 pagecnt_bias;
214}; 219};
215 220
216struct igb_tx_queue_stats { 221struct igb_tx_queue_stats {
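
The new pagecnt_bias field lets the driver batch its page reference accounting: rather than bumping the page refcount on every reuse, it remembers how many references it owns and restocks them in bulk (capped at USHRT_MAX in the igb_main.c hunks below), which is why a __u16 is wide enough whenever the page offset also fits in 16 bits. A tiny sketch of the reuse test this enables (the function name is illustrative):

    #include <linux/mm.h>
    #include <linux/page_ref.h>

    /* The page may be recycled only while the driver's bias accounts for
     * every reference on it, i.e. no skb or other user still holds it. */
    static bool example_rx_page_is_exclusive(struct page *page,
                                             unsigned int pagecnt_bias)
    {
            return page_ref_count(page) == pagecnt_bias;
    }
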
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index cae24a8ccf47..a761001308dc 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -3947,11 +3947,23 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
3947 if (!buffer_info->page) 3947 if (!buffer_info->page)
3948 continue; 3948 continue;
3949 3949
3950 dma_unmap_page(rx_ring->dev, 3950 /* Invalidate cache lines that may have been written to by
3951 buffer_info->dma, 3951 * device so that we avoid corrupting memory.
3952 PAGE_SIZE, 3952 */
3953 DMA_FROM_DEVICE); 3953 dma_sync_single_range_for_cpu(rx_ring->dev,
3954 __free_page(buffer_info->page); 3954 buffer_info->dma,
3955 buffer_info->page_offset,
3956 IGB_RX_BUFSZ,
3957 DMA_FROM_DEVICE);
3958
3959 /* free resources associated with mapping */
3960 dma_unmap_page_attrs(rx_ring->dev,
3961 buffer_info->dma,
3962 PAGE_SIZE,
3963 DMA_FROM_DEVICE,
3964 DMA_ATTR_SKIP_CPU_SYNC);
3965 __page_frag_drain(buffer_info->page, 0,
3966 buffer_info->pagecnt_bias);
3955 3967
3956 buffer_info->page = NULL; 3968 buffer_info->page = NULL;
3957 } 3969 }
@@ -6812,12 +6824,6 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
6812 6824
6813 /* transfer page from old buffer to new buffer */ 6825 /* transfer page from old buffer to new buffer */
6814 *new_buff = *old_buff; 6826 *new_buff = *old_buff;
6815
6816 /* sync the buffer for use by the device */
6817 dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma,
6818 old_buff->page_offset,
6819 IGB_RX_BUFSZ,
6820 DMA_FROM_DEVICE);
6821} 6827}
6822 6828
6823static inline bool igb_page_is_reserved(struct page *page) 6829static inline bool igb_page_is_reserved(struct page *page)
@@ -6829,13 +6835,15 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
6829 struct page *page, 6835 struct page *page,
6830 unsigned int truesize) 6836 unsigned int truesize)
6831{ 6837{
6838 unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--;
6839
6832 /* avoid re-using remote pages */ 6840 /* avoid re-using remote pages */
6833 if (unlikely(igb_page_is_reserved(page))) 6841 if (unlikely(igb_page_is_reserved(page)))
6834 return false; 6842 return false;
6835 6843
6836#if (PAGE_SIZE < 8192) 6844#if (PAGE_SIZE < 8192)
6837 /* if we are only owner of page we can reuse it */ 6845 /* if we are only owner of page we can reuse it */
6838 if (unlikely(page_count(page) != 1)) 6846 if (unlikely(page_ref_count(page) != pagecnt_bias))
6839 return false; 6847 return false;
6840 6848
6841 /* flip page offset to other buffer */ 6849 /* flip page offset to other buffer */
@@ -6848,10 +6856,14 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
6848 return false; 6856 return false;
6849#endif 6857#endif
6850 6858
6851 /* Even if we own the page, we are not allowed to use atomic_set() 6859 /* If we have drained the page fragment pool we need to update
6852 * This would break get_page_unless_zero() users. 6860 * the pagecnt_bias and page count so that we fully restock the
6861 * number of references the driver holds.
6853 */ 6862 */
6854 page_ref_inc(page); 6863 if (unlikely(pagecnt_bias == 1)) {
6864 page_ref_add(page, USHRT_MAX);
6865 rx_buffer->pagecnt_bias = USHRT_MAX;
6866 }
6855 6867
6856 return true; 6868 return true;
6857} 6869}
@@ -6903,7 +6915,6 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring,
6903 return true; 6915 return true;
6904 6916
6905 /* this page cannot be reused so discard it */ 6917 /* this page cannot be reused so discard it */
6906 __free_page(page);
6907 return false; 6918 return false;
6908 } 6919 }
6909 6920
@@ -6938,6 +6949,13 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
6938 page = rx_buffer->page; 6949 page = rx_buffer->page;
6939 prefetchw(page); 6950 prefetchw(page);
6940 6951
6952 /* we are reusing so sync this buffer for CPU use */
6953 dma_sync_single_range_for_cpu(rx_ring->dev,
6954 rx_buffer->dma,
6955 rx_buffer->page_offset,
6956 size,
6957 DMA_FROM_DEVICE);
6958
6941 if (likely(!skb)) { 6959 if (likely(!skb)) {
6942 void *page_addr = page_address(page) + 6960 void *page_addr = page_address(page) +
6943 rx_buffer->page_offset; 6961 rx_buffer->page_offset;
@@ -6962,21 +6980,18 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
6962 prefetchw(skb->data); 6980 prefetchw(skb->data);
6963 } 6981 }
6964 6982
6965 /* we are reusing so sync this buffer for CPU use */
6966 dma_sync_single_range_for_cpu(rx_ring->dev,
6967 rx_buffer->dma,
6968 rx_buffer->page_offset,
6969 size,
6970 DMA_FROM_DEVICE);
6971
6972 /* pull page into skb */ 6983 /* pull page into skb */
6973 if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { 6984 if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) {
6974 /* hand second half of page back to the ring */ 6985 /* hand second half of page back to the ring */
6975 igb_reuse_rx_page(rx_ring, rx_buffer); 6986 igb_reuse_rx_page(rx_ring, rx_buffer);
6976 } else { 6987 } else {
6977 /* we are not reusing the buffer so unmap it */ 6988 /* We are not reusing the buffer so unmap it and free
6978 dma_unmap_page(rx_ring->dev, rx_buffer->dma, 6989 * any references we are holding to it
6979 PAGE_SIZE, DMA_FROM_DEVICE); 6990 */
6991 dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma,
6992 PAGE_SIZE, DMA_FROM_DEVICE,
6993 DMA_ATTR_SKIP_CPU_SYNC);
6994 __page_frag_drain(page, 0, rx_buffer->pagecnt_bias);
6980 } 6995 }
6981 6996
6982 /* clear contents of rx_buffer */ 6997 /* clear contents of rx_buffer */
@@ -7234,7 +7249,8 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
7234 } 7249 }
7235 7250
7236 /* map page for use */ 7251 /* map page for use */
7237 dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); 7252 dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
7253 DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
7238 7254
7239 /* if mapping failed free memory back to system since 7255 /* if mapping failed free memory back to system since
7240 * there isn't much point in holding memory we can't use 7256 * there isn't much point in holding memory we can't use
@@ -7249,6 +7265,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
7249 bi->dma = dma; 7265 bi->dma = dma;
7250 bi->page = page; 7266 bi->page = page;
7251 bi->page_offset = 0; 7267 bi->page_offset = 0;
7268 bi->pagecnt_bias = 1;
7252 7269
7253 return true; 7270 return true;
7254} 7271}
@@ -7275,6 +7292,12 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
7275 if (!igb_alloc_mapped_page(rx_ring, bi)) 7292 if (!igb_alloc_mapped_page(rx_ring, bi))
7276 break; 7293 break;
7277 7294
7295 /* sync the buffer for use by the device */
7296 dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
7297 bi->page_offset,
7298 IGB_RX_BUFSZ,
7299 DMA_FROM_DEVICE);
7300
7278 /* Refresh the desc even if buffer_addrs didn't change 7301 /* Refresh the desc even if buffer_addrs didn't change
7279 * because each write-back erases this info. 7302 * because each write-back erases this info.
7280 */ 7303 */
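
Taken together, the igb hunks above switch the receive path from unmap/remap-per-frame to a map-once scheme: the page is mapped with DMA_ATTR_SKIP_CPU_SYNC so the streaming-DMA code skips its implicit cache maintenance, and the driver issues the cheap per-buffer dma_sync_single_range_for_{cpu,device}() calls itself at the ownership transitions. A condensed sketch of that lifecycle, with ring bookkeeping and buffer sizes simplified:

    #include <linux/dma-mapping.h>

    /* Map an RX page once; the explicit syncs below replace the cache work
     * that dma_map_page()/dma_unmap_page() would otherwise do implicitly. */
    static int example_map_rx_page(struct device *dev, struct page *page,
                                   dma_addr_t *dma)
    {
            *dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE,
                                      DMA_ATTR_SKIP_CPU_SYNC);
            return dma_mapping_error(dev, *dma) ? -ENOMEM : 0;
    }

    static void example_rx_lifecycle(struct device *dev, dma_addr_t dma,
                                     unsigned int offset, unsigned int len)
    {
            /* before handing the buffer to the device */
            dma_sync_single_range_for_device(dev, dma, offset, len,
                                             DMA_FROM_DEVICE);
            /* ... device writes a frame ... */

            /* before the CPU reads the received data */
            dma_sync_single_range_for_cpu(dev, dma, offset, len,
                                          DMA_FROM_DEVICE);

            /* when the page is finally retired */
            dma_unmap_page_attrs(dev, dma, PAGE_SIZE, DMA_FROM_DEVICE,
                                 DMA_ATTR_SKIP_CPU_SYNC);
    }
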
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
index e9cef9de9ed8..c96f9b1d948a 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c
@@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv,
900 900
901 /* bound gain by 2 bits value max, 3rd bit is sign */ 901 /* bound gain by 2 bits value max, 3rd bit is sign */
902 data->delta_gain_code[i] = 902 data->delta_gain_code[i] =
903 min(abs(delta_g), 903 min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
904 (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE);
905 904
906 if (delta_g < 0) 905 if (delta_g < 0)
907 /* 906 /*
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c
index d5cc3070e83f..b653451843c8 100644
--- a/drivers/staging/android/ion/ion.c
+++ b/drivers/staging/android/ion/ion.c
@@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
882 BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); 882 BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]);
883 883
884 pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); 884 pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff]));
885 ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); 885 ret = vm_insert_pfn(vma, vmf->address, pfn);
886 mutex_unlock(&buffer->lock); 886 mutex_unlock(&buffer->lock);
887 if (ret) 887 if (ret)
888 return VM_FAULT_ERROR; 888 return VM_FAULT_ERROR;
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c
index 0b6d388d8aa4..697cbfbe9374 100644
--- a/drivers/staging/lustre/lustre/llite/vvp_io.c
+++ b/drivers/staging/lustre/lustre/llite/vvp_io.c
@@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
1014 "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", 1014 "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n",
1015 vmf->page, vmf->page->mapping, vmf->page->index, 1015 vmf->page, vmf->page->mapping, vmf->page->index,
1016 (long)vmf->page->flags, page_count(vmf->page), 1016 (long)vmf->page->flags, page_count(vmf->page),
1017 page_private(vmf->page), vmf->virtual_address); 1017 page_private(vmf->page), (void *)vmf->address);
1018 if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { 1018 if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) {
1019 lock_page(vmf->page); 1019 lock_page(vmf->page);
1020 cfio->ft_flags |= VM_FAULT_LOCKED; 1020 cfio->ft_flags |= VM_FAULT_LOCKED;
@@ -1025,12 +1025,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio)
1025 } 1025 }
1026 1026
1027 if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { 1027 if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) {
1028 CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); 1028 CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address);
1029 return -EFAULT; 1029 return -EFAULT;
1030 } 1030 }
1031 1031
1032 if (cfio->ft_flags & VM_FAULT_OOM) { 1032 if (cfio->ft_flags & VM_FAULT_OOM) {
1033 CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); 1033 CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address);
1034 return -ENOMEM; 1034 return -ENOMEM;
1035 } 1035 }
1036 1036
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c
index 7abd70b2a588..3151d2a0fe59 100644
--- a/drivers/usb/gadget/function/f_hid.c
+++ b/drivers/usb/gadget/function/f_hid.c
@@ -905,7 +905,7 @@ static void hidg_free_inst(struct usb_function_instance *f)
905 mutex_lock(&hidg_ida_lock); 905 mutex_lock(&hidg_ida_lock);
906 906
907 hidg_put_minor(opts->minor); 907 hidg_put_minor(opts->minor);
908 if (idr_is_empty(&hidg_ida.idr)) 908 if (ida_is_empty(&hidg_ida))
909 ghid_cleanup(); 909 ghid_cleanup();
910 910
911 mutex_unlock(&hidg_ida_lock); 911 mutex_unlock(&hidg_ida_lock);
@@ -931,7 +931,7 @@ static struct usb_function_instance *hidg_alloc_inst(void)
931 931
932 mutex_lock(&hidg_ida_lock); 932 mutex_lock(&hidg_ida_lock);
933 933
934 if (idr_is_empty(&hidg_ida.idr)) { 934 if (ida_is_empty(&hidg_ida)) {
935 status = ghid_setup(NULL, HIDG_MINORS); 935 status = ghid_setup(NULL, HIDG_MINORS);
936 if (status) { 936 if (status) {
937 ret = ERR_PTR(status); 937 ret = ERR_PTR(status);
@@ -944,7 +944,7 @@ static struct usb_function_instance *hidg_alloc_inst(void)
944 if (opts->minor < 0) { 944 if (opts->minor < 0) {
945 ret = ERR_PTR(opts->minor); 945 ret = ERR_PTR(opts->minor);
946 kfree(opts); 946 kfree(opts);
947 if (idr_is_empty(&hidg_ida.idr)) 947 if (ida_is_empty(&hidg_ida))
948 ghid_cleanup(); 948 ghid_cleanup();
949 goto unlock; 949 goto unlock;
950 } 950 }
diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c
index 0de36cda6e41..8054da9276dd 100644
--- a/drivers/usb/gadget/function/f_printer.c
+++ b/drivers/usb/gadget/function/f_printer.c
@@ -1265,7 +1265,7 @@ static void gprinter_free_inst(struct usb_function_instance *f)
1265 mutex_lock(&printer_ida_lock); 1265 mutex_lock(&printer_ida_lock);
1266 1266
1267 gprinter_put_minor(opts->minor); 1267 gprinter_put_minor(opts->minor);
1268 if (idr_is_empty(&printer_ida.idr)) 1268 if (ida_is_empty(&printer_ida))
1269 gprinter_cleanup(); 1269 gprinter_cleanup();
1270 1270
1271 mutex_unlock(&printer_ida_lock); 1271 mutex_unlock(&printer_ida_lock);
@@ -1289,7 +1289,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void)
1289 1289
1290 mutex_lock(&printer_ida_lock); 1290 mutex_lock(&printer_ida_lock);
1291 1291
1292 if (idr_is_empty(&printer_ida.idr)) { 1292 if (ida_is_empty(&printer_ida)) {
1293 status = gprinter_setup(PRINTER_MINORS); 1293 status = gprinter_setup(PRINTER_MINORS);
1294 if (status) { 1294 if (status) {
1295 ret = ERR_PTR(status); 1295 ret = ERR_PTR(status);
@@ -1302,7 +1302,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void)
1302 if (opts->minor < 0) { 1302 if (opts->minor < 0) {
1303 ret = ERR_PTR(opts->minor); 1303 ret = ERR_PTR(opts->minor);
1304 kfree(opts); 1304 kfree(opts);
1305 if (idr_is_empty(&printer_ida.idr)) 1305 if (ida_is_empty(&printer_ida))
1306 gprinter_cleanup(); 1306 gprinter_cleanup();
1307 goto unlock; 1307 goto unlock;
1308 } 1308 }
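
Both gadget functions above stop peeking at the IDA's embedded IDR (idr_is_empty(&x_ida.idr)) and use the new ida_is_empty() helper instead, keeping the minor-number bookkeeping independent of the IDA's internal representation. A sketch of the idiom with a made-up pool:

    #include <linux/idr.h>
    #include <linux/printk.h>

    static DEFINE_IDA(example_minor_ida);  /* illustrative minor-number pool */

    static void example_put_minor(int minor)
    {
            ida_simple_remove(&example_minor_ida, minor);

            /* tear shared state down once the last minor is released */
            if (ida_is_empty(&example_minor_ida))
                    pr_debug("example: no minors left, cleaning up\n");
    }
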
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 9815e45c23c4..f3726ba12aa6 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
362 362
363 down_read(&mm->mmap_sem); 363 down_read(&mm->mmap_sem);
364 ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, 364 ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
365 NULL); 365 NULL, NULL);
366 up_read(&mm->mmap_sem); 366 up_read(&mm->mmap_sem);
367 } 367 }
368 368
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 702040fe2001..6e3306f4a525 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
602{ 602{
603 printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", 603 printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
604 vma, vma->vm_start, vma->vm_end, 604 vma, vma->vm_start, vma->vm_end,
605 vmf->pgoff, vmf->virtual_address); 605 vmf->pgoff, (void *)vmf->address);
606 606
607 return VM_FAULT_SIGBUS; 607 return VM_FAULT_SIGBUS;
608} 608}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 180f910339f4..3b713b6fcc26 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -202,12 +202,12 @@ static struct ratelimit_state printk_limits[] = {
202void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 202void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
203{ 203{
204 struct super_block *sb = fs_info->sb; 204 struct super_block *sb = fs_info->sb;
205 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1]; 205 char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
206 struct va_format vaf; 206 struct va_format vaf;
207 va_list args; 207 va_list args;
208 const char *type = NULL;
209 int kern_level; 208 int kern_level;
210 struct ratelimit_state *ratelimit; 209 const char *type = logtypes[4];
210 struct ratelimit_state *ratelimit = &printk_limits[4];
211 211
212 va_start(args, fmt); 212 va_start(args, fmt);
213 213
@@ -223,12 +223,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
223 fmt += size; 223 fmt += size;
224 } 224 }
225 225
226 if (!type) {
227 *lvl = '\0';
228 type = logtypes[4];
229 ratelimit = &printk_limits[4];
230 }
231
232 vaf.fmt = fmt; 226 vaf.fmt = fmt;
233 vaf.va = &args; 227 vaf.va = &args;
234 228
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index bf62ad919a95..00ee006a8aa2 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
162 slot = radix_tree_iter_retry(&iter); 162 slot = radix_tree_iter_retry(&iter);
163 continue; 163 continue;
164 } 164 }
165 slot = radix_tree_iter_resume(slot, &iter);
165 spin_unlock(&fs_info->buffer_lock); 166 spin_unlock(&fs_info->buffer_lock);
166 free_extent_buffer_stale(eb); 167 free_extent_buffer_stale(eb);
167 spin_lock(&fs_info->buffer_lock); 168 spin_lock(&fs_info->buffer_lock);
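
The one-line addition above uses the new radix_tree_iter_resume() helper: once it runs, the iterator no longer depends on the current slot pointer, so the spinlock protecting the tree can be dropped around the per-item work and the walk continues from the next index when the lock is retaken. A sketch of the pattern with a hypothetical item type (retry-entry handling elided):

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>

    struct example_item;                            /* hypothetical */
    void example_release(struct example_item *it);  /* hypothetical */

    static void example_drain(struct radix_tree_root *tree, spinlock_t *lock)
    {
            struct radix_tree_iter iter;
            void **slot;

            spin_lock(lock);
            radix_tree_for_each_slot(slot, tree, &iter, 0) {
                    struct example_item *it =
                            radix_tree_deref_slot_protected(slot, lock);

                    if (!it)
                            continue;
                    /* detach the iterator from this slot before unlocking */
                    slot = radix_tree_iter_resume(slot, &iter);
                    spin_unlock(lock);
                    example_release(it);
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }
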
diff --git a/fs/dax.c b/fs/dax.c
index 5ae8e11ad786..a8732fbed381 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -31,6 +31,7 @@
31#include <linux/vmstat.h> 31#include <linux/vmstat.h>
32#include <linux/pfn_t.h> 32#include <linux/pfn_t.h>
33#include <linux/sizes.h> 33#include <linux/sizes.h>
34#include <linux/mmu_notifier.h>
34#include <linux/iomap.h> 35#include <linux/iomap.h>
35#include "internal.h" 36#include "internal.h"
36 37
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping,
240 } 241 }
241} 242}
242 243
244static void dax_unlock_mapping_entry(struct address_space *mapping,
245 pgoff_t index)
246{
247 void *entry, **slot;
248
249 spin_lock_irq(&mapping->tree_lock);
250 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
251 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
252 !slot_locked(mapping, slot))) {
253 spin_unlock_irq(&mapping->tree_lock);
254 return;
255 }
256 unlock_slot(mapping, slot);
257 spin_unlock_irq(&mapping->tree_lock);
258 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
259}
260
243static void put_locked_mapping_entry(struct address_space *mapping, 261static void put_locked_mapping_entry(struct address_space *mapping,
244 pgoff_t index, void *entry) 262 pgoff_t index, void *entry)
245{ 263{
@@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
433 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); 451 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
434} 452}
435 453
436void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
437{
438 void *entry, **slot;
439
440 spin_lock_irq(&mapping->tree_lock);
441 entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
442 if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) ||
443 !slot_locked(mapping, slot))) {
444 spin_unlock_irq(&mapping->tree_lock);
445 return;
446 }
447 unlock_slot(mapping, slot);
448 spin_unlock_irq(&mapping->tree_lock);
449 dax_wake_mapping_entry_waiter(mapping, index, entry, false);
450}
451
452/* 454/*
453 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree 455 * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
454 * entry to get unlocked before deleting it. 456 * entry to get unlocked before deleting it.
@@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
500 /* This will replace locked radix tree entry with a hole page */ 502 /* This will replace locked radix tree entry with a hole page */
501 page = find_or_create_page(mapping, vmf->pgoff, 503 page = find_or_create_page(mapping, vmf->pgoff,
502 vmf->gfp_mask | __GFP_ZERO); 504 vmf->gfp_mask | __GFP_ZERO);
503 if (!page) { 505 if (!page)
504 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
505 return VM_FAULT_OOM; 506 return VM_FAULT_OOM;
506 }
507 vmf->page = page; 507 vmf->page = page;
508 return VM_FAULT_LOCKED; 508 return VM_FAULT_LOCKED;
509} 509}
@@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
615 return new_entry; 615 return new_entry;
616} 616}
617 617
618static inline unsigned long
619pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
620{
621 unsigned long address;
622
623 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
624 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
625 return address;
626}
627
628/* Walk all mappings of a given index of a file and writeprotect them */
629static void dax_mapping_entry_mkclean(struct address_space *mapping,
630 pgoff_t index, unsigned long pfn)
631{
632 struct vm_area_struct *vma;
633 pte_t *ptep;
634 pte_t pte;
635 spinlock_t *ptl;
636 bool changed;
637
638 i_mmap_lock_read(mapping);
639 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
640 unsigned long address;
641
642 cond_resched();
643
644 if (!(vma->vm_flags & VM_SHARED))
645 continue;
646
647 address = pgoff_address(index, vma);
648 changed = false;
649 if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
650 continue;
651 if (pfn != pte_pfn(*ptep))
652 goto unlock;
653 if (!pte_dirty(*ptep) && !pte_write(*ptep))
654 goto unlock;
655
656 flush_cache_page(vma, address, pfn);
657 pte = ptep_clear_flush(vma, address, ptep);
658 pte = pte_wrprotect(pte);
659 pte = pte_mkclean(pte);
660 set_pte_at(vma->vm_mm, address, ptep, pte);
661 changed = true;
662unlock:
663 pte_unmap_unlock(ptep, ptl);
664
665 if (changed)
666 mmu_notifier_invalidate_page(vma->vm_mm, address);
667 }
668 i_mmap_unlock_read(mapping);
669}
670
618static int dax_writeback_one(struct block_device *bdev, 671static int dax_writeback_one(struct block_device *bdev,
619 struct address_space *mapping, pgoff_t index, void *entry) 672 struct address_space *mapping, pgoff_t index, void *entry)
620{ 673{
621 struct radix_tree_root *page_tree = &mapping->page_tree; 674 struct radix_tree_root *page_tree = &mapping->page_tree;
622 struct radix_tree_node *node;
623 struct blk_dax_ctl dax; 675 struct blk_dax_ctl dax;
624 void **slot; 676 void *entry2, **slot;
625 int ret = 0; 677 int ret = 0;
626 678
627 spin_lock_irq(&mapping->tree_lock);
628 /* 679 /*
629 * Regular page slots are stabilized by the page lock even 680 * A page got tagged dirty in DAX mapping? Something is seriously
630 * without the tree itself locked. These unlocked entries 681 * wrong.
631 * need verification under the tree lock.
632 */ 682 */
633 if (!__radix_tree_lookup(page_tree, index, &node, &slot)) 683 if (WARN_ON(!radix_tree_exceptional_entry(entry)))
634 goto unlock; 684 return -EIO;
635 if (*slot != entry)
636 goto unlock;
637
638 /* another fsync thread may have already written back this entry */
639 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
640 goto unlock;
641 685
686 spin_lock_irq(&mapping->tree_lock);
687 entry2 = get_unlocked_mapping_entry(mapping, index, &slot);
688 /* Entry got punched out / reallocated? */
689 if (!entry2 || !radix_tree_exceptional_entry(entry2))
690 goto put_unlocked;
691 /*
692 * Entry got reallocated elsewhere? No need to writeback. We have to
693 * compare sectors as we must not bail out due to difference in lockbit
694 * or entry type.
695 */
696 if (dax_radix_sector(entry2) != dax_radix_sector(entry))
697 goto put_unlocked;
642 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 698 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
643 dax_is_zero_entry(entry))) { 699 dax_is_zero_entry(entry))) {
644 ret = -EIO; 700 ret = -EIO;
645 goto unlock; 701 goto put_unlocked;
646 } 702 }
647 703
704 /* Another fsync thread may have already written back this entry */
705 if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
706 goto put_unlocked;
707 /* Lock the entry to serialize with page faults */
708 entry = lock_slot(mapping, slot);
709 /*
710 * We can clear the tag now but we have to be careful so that concurrent
711 * dax_writeback_one() calls for the same index cannot finish before we
712 * actually flush the caches. This is achieved as the calls will look
713 * at the entry only under tree_lock and once they do that they will
714 * see the entry locked and wait for it to unlock.
715 */
716 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
717 spin_unlock_irq(&mapping->tree_lock);
718
648 /* 719 /*
649 * Even if dax_writeback_mapping_range() was given a wbc->range_start 720 * Even if dax_writeback_mapping_range() was given a wbc->range_start
650 * in the middle of a PMD, the 'index' we are given will be aligned to 721 * in the middle of a PMD, the 'index' we are given will be aligned to
@@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev,
654 */ 725 */
655 dax.sector = dax_radix_sector(entry); 726 dax.sector = dax_radix_sector(entry);
656 dax.size = PAGE_SIZE << dax_radix_order(entry); 727 dax.size = PAGE_SIZE << dax_radix_order(entry);
657 spin_unlock_irq(&mapping->tree_lock);
658 728
659 /* 729 /*
660 * We cannot hold tree_lock while calling dax_map_atomic() because it 730 * We cannot hold tree_lock while calling dax_map_atomic() because it
661 * eventually calls cond_resched(). 731 * eventually calls cond_resched().
662 */ 732 */
663 ret = dax_map_atomic(bdev, &dax); 733 ret = dax_map_atomic(bdev, &dax);
664 if (ret < 0) 734 if (ret < 0) {
735 put_locked_mapping_entry(mapping, index, entry);
665 return ret; 736 return ret;
737 }
666 738
667 if (WARN_ON_ONCE(ret < dax.size)) { 739 if (WARN_ON_ONCE(ret < dax.size)) {
668 ret = -EIO; 740 ret = -EIO;
669 goto unmap; 741 goto unmap;
670 } 742 }
671 743
744 dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
672 wb_cache_pmem(dax.addr, dax.size); 745 wb_cache_pmem(dax.addr, dax.size);
673 746 /*
747 * After we have flushed the cache, we can clear the dirty tag. There
748 * cannot be new dirty data in the pfn after the flush has completed as
749 * the pfn mappings are writeprotected and fault waits for mapping
750 * entry lock.
751 */
674 spin_lock_irq(&mapping->tree_lock); 752 spin_lock_irq(&mapping->tree_lock);
675 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); 753 radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
676 spin_unlock_irq(&mapping->tree_lock); 754 spin_unlock_irq(&mapping->tree_lock);
677 unmap: 755 unmap:
678 dax_unmap_atomic(bdev, &dax); 756 dax_unmap_atomic(bdev, &dax);
757 put_locked_mapping_entry(mapping, index, entry);
679 return ret; 758 return ret;
680 759
681 unlock: 760 put_unlocked:
761 put_unlocked_mapping_entry(mapping, index, entry2);
682 spin_unlock_irq(&mapping->tree_lock); 762 spin_unlock_irq(&mapping->tree_lock);
683 return ret; 763 return ret;
684} 764}
@@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping,
738 struct block_device *bdev, sector_t sector, size_t size, 818 struct block_device *bdev, sector_t sector, size_t size,
739 void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) 819 void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
740{ 820{
741 unsigned long vaddr = (unsigned long)vmf->virtual_address; 821 unsigned long vaddr = vmf->address;
742 struct blk_dax_ctl dax = { 822 struct blk_dax_ctl dax = {
743 .sector = sector, 823 .sector = sector,
744 .size = size, 824 .size = size,
@@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
767{ 847{
768 struct file *file = vma->vm_file; 848 struct file *file = vma->vm_file;
769 struct address_space *mapping = file->f_mapping; 849 struct address_space *mapping = file->f_mapping;
770 void *entry; 850 void *entry, **slot;
771 pgoff_t index = vmf->pgoff; 851 pgoff_t index = vmf->pgoff;
772 852
773 spin_lock_irq(&mapping->tree_lock); 853 spin_lock_irq(&mapping->tree_lock);
774 entry = get_unlocked_mapping_entry(mapping, index, NULL); 854 entry = get_unlocked_mapping_entry(mapping, index, &slot);
775 if (!entry || !radix_tree_exceptional_entry(entry)) 855 if (!entry || !radix_tree_exceptional_entry(entry)) {
776 goto out; 856 if (entry)
857 put_unlocked_mapping_entry(mapping, index, entry);
858 spin_unlock_irq(&mapping->tree_lock);
859 return VM_FAULT_NOPAGE;
860 }
777 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); 861 radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
778 put_unlocked_mapping_entry(mapping, index, entry); 862 entry = lock_slot(mapping, slot);
779out:
780 spin_unlock_irq(&mapping->tree_lock); 863 spin_unlock_irq(&mapping->tree_lock);
864 /*
865 * If we race with somebody updating the PTE and finish_mkwrite_fault()
866 * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry
867 * the fault in either case.
868 */
869 finish_mkwrite_fault(vmf);
870 put_locked_mapping_entry(mapping, index, entry);
781 return VM_FAULT_NOPAGE; 871 return VM_FAULT_NOPAGE;
782} 872}
783EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); 873EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
948{ 1038{
949 struct address_space *mapping = vma->vm_file->f_mapping; 1039 struct address_space *mapping = vma->vm_file->f_mapping;
950 struct inode *inode = mapping->host; 1040 struct inode *inode = mapping->host;
951 unsigned long vaddr = (unsigned long)vmf->virtual_address; 1041 unsigned long vaddr = vmf->address;
952 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1042 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
953 sector_t sector; 1043 sector_t sector;
954 struct iomap iomap = { 0 }; 1044 struct iomap iomap = { 0 };
955 unsigned flags = IOMAP_FAULT; 1045 unsigned flags = IOMAP_FAULT;
956 int error, major = 0; 1046 int error, major = 0;
957 int locked_status = 0; 1047 int vmf_ret = 0;
958 void *entry; 1048 void *entry;
959 1049
960 /* 1050 /*
@@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1007 1097
1008 if (error) 1098 if (error)
1009 goto finish_iomap; 1099 goto finish_iomap;
1010 if (!radix_tree_exceptional_entry(entry)) { 1100
1011 vmf->page = entry; 1101 __SetPageUptodate(vmf->cow_page);
1012 locked_status = VM_FAULT_LOCKED; 1102 vmf_ret = finish_fault(vmf);
1013 } else { 1103 if (!vmf_ret)
1014 vmf->entry = entry; 1104 vmf_ret = VM_FAULT_DONE_COW;
1015 locked_status = VM_FAULT_DAX_LOCKED;
1016 }
1017 goto finish_iomap; 1105 goto finish_iomap;
1018 } 1106 }
1019 1107
@@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1030 case IOMAP_UNWRITTEN: 1118 case IOMAP_UNWRITTEN:
1031 case IOMAP_HOLE: 1119 case IOMAP_HOLE:
1032 if (!(vmf->flags & FAULT_FLAG_WRITE)) { 1120 if (!(vmf->flags & FAULT_FLAG_WRITE)) {
1033 locked_status = dax_load_hole(mapping, entry, vmf); 1121 vmf_ret = dax_load_hole(mapping, entry, vmf);
1034 break; 1122 break;
1035 } 1123 }
1036 /*FALLTHRU*/ 1124 /*FALLTHRU*/
@@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1042 1130
1043 finish_iomap: 1131 finish_iomap:
1044 if (ops->iomap_end) { 1132 if (ops->iomap_end) {
1045 if (error) { 1133 if (error || (vmf_ret & VM_FAULT_ERROR)) {
1046 /* keep previous error */ 1134 /* keep previous error */
1047 ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, 1135 ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
1048 &iomap); 1136 &iomap);
@@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1052 } 1140 }
1053 } 1141 }
1054 unlock_entry: 1142 unlock_entry:
1055 if (!locked_status || error) 1143 if (vmf_ret != VM_FAULT_LOCKED || error)
1056 put_locked_mapping_entry(mapping, vmf->pgoff, entry); 1144 put_locked_mapping_entry(mapping, vmf->pgoff, entry);
1057 out: 1145 out:
1058 if (error == -ENOMEM) 1146 if (error == -ENOMEM)
@@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
1060 /* -EBUSY is fine, somebody else faulted on the same PTE */ 1148 /* -EBUSY is fine, somebody else faulted on the same PTE */
1061 if (error < 0 && error != -EBUSY) 1149 if (error < 0 && error != -EBUSY)
1062 return VM_FAULT_SIGBUS | major; 1150 return VM_FAULT_SIGBUS | major;
1063 if (locked_status) { 1151 if (vmf_ret) {
1064 WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ 1152 WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
1065 return locked_status; 1153 return vmf_ret;
1066 } 1154 }
1067 return VM_FAULT_NOPAGE | major; 1155 return VM_FAULT_NOPAGE | major;
1068} 1156}
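
Beyond the vmf->address conversion, the fs/dax.c hunks rework dax_writeback_one() so that a flushed entry really ends up clean: the radix tree entry is looked up, revalidated and locked under tree_lock, PAGECACHE_TAG_TOWRITE is cleared up front, every user mapping of the pfn is write-protected and cleaned via dax_mapping_entry_mkclean(), the cache is flushed, and only then is PAGECACHE_TAG_DIRTY dropped. A heavily condensed sketch of that ordering; entry locking, validation and error paths are elided, and the helpers are the static functions from the hunks above, so this is not standalone code:

    #include <linux/fs.h>
    #include <linux/pmem.h>
    #include <linux/radix-tree.h>

    static void example_dax_flush_order(struct address_space *mapping,
                                        pgoff_t index, unsigned long pfn,
                                        void *kaddr, size_t size)
    {
            /* 1. with the entry held locked, TOWRITE can be cleared early;
             *    concurrent writeback calls wait on the locked entry. */
            spin_lock_irq(&mapping->tree_lock);
            radix_tree_tag_clear(&mapping->page_tree, index,
                                 PAGECACHE_TAG_TOWRITE);
            spin_unlock_irq(&mapping->tree_lock);

            /* 2. write-protect every mapping of this pfn ... */
            dax_mapping_entry_mkclean(mapping, index, pfn);
            /* 3. ... so nothing can redirty it while caches are flushed */
            wb_cache_pmem(kaddr, size);

            /* 4. only now is it safe to drop the dirty tag */
            spin_lock_irq(&mapping->tree_lock);
            radix_tree_tag_clear(&mapping->page_tree, index,
                                 PAGECACHE_TAG_DIRTY);
            spin_unlock_irq(&mapping->tree_lock);
    }
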
diff --git a/fs/exec.c b/fs/exec.c
index 88b5e1efdbd6..8112eacf10f3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
209 * doing the exec and bprm->mm is the new process's mm. 209 * doing the exec and bprm->mm is the new process's mm.
210 */ 210 */
211 ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, 211 ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags,
212 &page, NULL); 212 &page, NULL, NULL);
213 if (ret <= 0) 213 if (ret <= 0)
214 return NULL; 214 return NULL;
215 215
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 85959d8324df..d96e2f30084b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,9 +257,9 @@ out:
257 * fatal_signal_pending()s, and the mmap_sem must be released before 257 * fatal_signal_pending()s, and the mmap_sem must be released before
258 * returning it. 258 * returning it.
259 */ 259 */
260int handle_userfault(struct fault_env *fe, unsigned long reason) 260int handle_userfault(struct vm_fault *vmf, unsigned long reason)
261{ 261{
262 struct mm_struct *mm = fe->vma->vm_mm; 262 struct mm_struct *mm = vmf->vma->vm_mm;
263 struct userfaultfd_ctx *ctx; 263 struct userfaultfd_ctx *ctx;
264 struct userfaultfd_wait_queue uwq; 264 struct userfaultfd_wait_queue uwq;
265 int ret; 265 int ret;
@@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
268 BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 268 BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
269 269
270 ret = VM_FAULT_SIGBUS; 270 ret = VM_FAULT_SIGBUS;
271 ctx = fe->vma->vm_userfaultfd_ctx.ctx; 271 ctx = vmf->vma->vm_userfaultfd_ctx.ctx;
272 if (!ctx) 272 if (!ctx)
273 goto out; 273 goto out;
274 274
@@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
301 * without first stopping userland access to the memory. For 301 * without first stopping userland access to the memory. For
302 * VM_UFFD_MISSING userfaults this is enough for now. 302 * VM_UFFD_MISSING userfaults this is enough for now.
303 */ 303 */
304 if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { 304 if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
305 /* 305 /*
306 * Validate the invariant that nowait must allow retry 306 * Validate the invariant that nowait must allow retry
307 * to be sure not to return SIGBUS erroneously on 307 * to be sure not to return SIGBUS erroneously on
308 * nowait invocations. 308 * nowait invocations.
309 */ 309 */
310 BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); 310 BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
311#ifdef CONFIG_DEBUG_VM 311#ifdef CONFIG_DEBUG_VM
312 if (printk_ratelimit()) { 312 if (printk_ratelimit()) {
313 printk(KERN_WARNING 313 printk(KERN_WARNING
314 "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); 314 "FAULT_FLAG_ALLOW_RETRY missing %x\n",
315 vmf->flags);
315 dump_stack(); 316 dump_stack();
316 } 317 }
317#endif 318#endif
@@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
323 * and wait. 324 * and wait.
324 */ 325 */
325 ret = VM_FAULT_RETRY; 326 ret = VM_FAULT_RETRY;
326 if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) 327 if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
327 goto out; 328 goto out;
328 329
329 /* take the reference before dropping the mmap_sem */ 330 /* take the reference before dropping the mmap_sem */
@@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
331 332
332 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 333 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
333 uwq.wq.private = current; 334 uwq.wq.private = current;
334 uwq.msg = userfault_msg(fe->address, fe->flags, reason); 335 uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
335 uwq.ctx = ctx; 336 uwq.ctx = ctx;
336 337
337 return_to_userland = 338 return_to_userland =
338 (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == 339 (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
339 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); 340 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
340 341
341 spin_lock(&ctx->fault_pending_wqh.lock); 342 spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason)
353 TASK_KILLABLE); 354 TASK_KILLABLE);
354 spin_unlock(&ctx->fault_pending_wqh.lock); 355 spin_unlock(&ctx->fault_pending_wqh.lock);
355 356
356 must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); 357 must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
358 reason);
357 up_read(&mm->mmap_sem); 359 up_read(&mm->mmap_sem);
358 360
359 if (likely(must_wait && !ACCESS_ONCE(ctx->released) && 361 if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0afade8bd3d7..f97bcfe79472 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
46 46
47#ifdef CONFIG_FS_DAX 47#ifdef CONFIG_FS_DAX
48struct page *read_dax_sector(struct block_device *bdev, sector_t n); 48struct page *read_dax_sector(struct block_device *bdev, sector_t n);
49void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
50int __dax_zero_page_range(struct block_device *bdev, sector_t sector, 49int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
51 unsigned int offset, unsigned int length); 50 unsigned int offset, unsigned int length);
52#else 51#else
@@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
55{ 54{
56 return ERR_PTR(-ENXIO); 55 return ERR_PTR(-ENXIO);
57} 56}
58/* Shouldn't ever be called when dax is disabled. */
59static inline void dax_unlock_mapping_entry(struct address_space *mapping,
60 pgoff_t index)
61{
62 BUG();
63}
64static inline int __dax_zero_page_range(struct block_device *bdev, 57static inline int __dax_zero_page_range(struct block_device *bdev,
65 sector_t sector, unsigned int offset, unsigned int length) 58 sector_t sector, unsigned int offset, unsigned int length)
66{ 59{
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 08528afdf58b..10c5a17b1f51 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -243,29 +243,33 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg
243 ops->unmap_sg(dev, sg, nents, dir, attrs); 243 ops->unmap_sg(dev, sg, nents, dir, attrs);
244} 244}
245 245
246static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, 246static inline dma_addr_t dma_map_page_attrs(struct device *dev,
247 size_t offset, size_t size, 247 struct page *page,
248 enum dma_data_direction dir) 248 size_t offset, size_t size,
249 enum dma_data_direction dir,
250 unsigned long attrs)
249{ 251{
250 struct dma_map_ops *ops = get_dma_ops(dev); 252 struct dma_map_ops *ops = get_dma_ops(dev);
251 dma_addr_t addr; 253 dma_addr_t addr;
252 254
253 kmemcheck_mark_initialized(page_address(page) + offset, size); 255 kmemcheck_mark_initialized(page_address(page) + offset, size);
254 BUG_ON(!valid_dma_direction(dir)); 256 BUG_ON(!valid_dma_direction(dir));
255 addr = ops->map_page(dev, page, offset, size, dir, 0); 257 addr = ops->map_page(dev, page, offset, size, dir, attrs);
256 debug_dma_map_page(dev, page, offset, size, dir, addr, false); 258 debug_dma_map_page(dev, page, offset, size, dir, addr, false);
257 259
258 return addr; 260 return addr;
259} 261}
260 262
261static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, 263static inline void dma_unmap_page_attrs(struct device *dev,
262 size_t size, enum dma_data_direction dir) 264 dma_addr_t addr, size_t size,
265 enum dma_data_direction dir,
266 unsigned long attrs)
263{ 267{
264 struct dma_map_ops *ops = get_dma_ops(dev); 268 struct dma_map_ops *ops = get_dma_ops(dev);
265 269
266 BUG_ON(!valid_dma_direction(dir)); 270 BUG_ON(!valid_dma_direction(dir));
267 if (ops->unmap_page) 271 if (ops->unmap_page)
268 ops->unmap_page(dev, addr, size, dir, 0); 272 ops->unmap_page(dev, addr, size, dir, attrs);
269 debug_dma_unmap_page(dev, addr, size, dir, false); 273 debug_dma_unmap_page(dev, addr, size, dir, false);
270} 274}
271 275
@@ -385,6 +389,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
385#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) 389#define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0)
386#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) 390#define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0)
387#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) 391#define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0)
392#define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0)
393#define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0)
388 394
389extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, 395extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
390 void *cpu_addr, dma_addr_t dma_addr, size_t size); 396 void *cpu_addr, dma_addr_t dma_addr, size_t size);
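dma_map_page() and dma_unmap_page() are now thin macro wrappers that pass attrs=0 to the *_attrs variants added above, so callers that need DMA attributes (the networking case this series targets) can supply them directly. A minimal sketch, assuming a driver-owned dev and page and the existing DMA_ATTR_SKIP_CPU_SYNC attribute:

	/* Map for device writes but skip the CPU sync; the driver would
	 * dma_sync_*() only the region it actually consumes. */
	dma_addr_t addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE,
					     DMA_FROM_DEVICE,
					     DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(dev, addr))
		return -ENOMEM;

	/* ... hardware fills the buffer ... */

	dma_unmap_page_attrs(dev, addr, PAGE_SIZE, DMA_FROM_DEVICE,
			     DMA_ATTR_SKIP_CPU_SYNC);

Existing dma_map_page()/dma_unmap_page() callers keep their old behaviour, since the macros forward a zero attrs value.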
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f8041f9de31e..4175dca4ac39 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold);
506extern void free_hot_cold_page_list(struct list_head *list, bool cold); 506extern void free_hot_cold_page_list(struct list_head *list, bool cold);
507 507
508struct page_frag_cache; 508struct page_frag_cache;
509extern void __page_frag_drain(struct page *page, unsigned int order,
510 unsigned int count);
509extern void *__alloc_page_frag(struct page_frag_cache *nc, 511extern void *__alloc_page_frag(struct page_frag_cache *nc,
510 unsigned int fragsz, gfp_t gfp_mask); 512 unsigned int fragsz, gfp_t gfp_mask);
511extern void __free_page_frag(void *addr); 513extern void __free_page_frag(void *addr);
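__page_frag_drain() joins the page-fragment allocator declared just below it; it looks intended for callers that hold a known number of outstanding fragment references and want to drop them in one call. For orientation, a minimal (hypothetical) use of the existing allocator pair:

	static struct page_frag_cache nc;	/* zero-initialised fragment cache */

	void *frag = __alloc_page_frag(&nc, 256, GFP_ATOMIC);
	if (frag)
		__free_page_frag(frag);		/* releases the fragment's page reference */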
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1f782aa1d8e6..97e478d6b690 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,12 +1,12 @@
1#ifndef _LINUX_HUGE_MM_H 1#ifndef _LINUX_HUGE_MM_H
2#define _LINUX_HUGE_MM_H 2#define _LINUX_HUGE_MM_H
3 3
4extern int do_huge_pmd_anonymous_page(struct fault_env *fe); 4extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf);
5extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 5extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
7 struct vm_area_struct *vma); 7 struct vm_area_struct *vma);
8extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); 8extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd);
9extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); 9extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd);
10extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 10extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
11 unsigned long addr, 11 unsigned long addr,
12 pmd_t *pmd, 12 pmd_t *pmd,
@@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page)
142 return 1; 142 return 1;
143} 143}
144 144
145extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); 145extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
146 146
147extern struct page *huge_zero_page; 147extern struct page *huge_zero_page;
148 148
@@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
212 return NULL; 212 return NULL;
213} 213}
214 214
215static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) 215static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd)
216{ 216{
217 return 0; 217 return 0;
218} 218}
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 083d61e92706..3c01b89aed67 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -18,12 +18,11 @@
18#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
19 19
20/* 20/*
21 * We want shallower trees and thus more bits covered at each layer. 8 21 * Using 6 bits at each layer allows us to allocate 7 layers out of each page.
22 * bits gives us large enough first layer for most use cases and maximum 22 * 8 bits only gave us 3 layers out of every pair of pages, which is less
23 * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and 23 * efficient except for trees with a largest element between 192-255 inclusive.
24 * 1k on 32bit.
25 */ 24 */
26#define IDR_BITS 8 25#define IDR_BITS 6
27#define IDR_SIZE (1 << IDR_BITS) 26#define IDR_SIZE (1 << IDR_BITS)
28#define IDR_MASK ((1 << IDR_BITS)-1) 27#define IDR_MASK ((1 << IDR_BITS)-1)
29 28
@@ -56,6 +55,32 @@ struct idr {
56#define DEFINE_IDR(name) struct idr name = IDR_INIT(name) 55#define DEFINE_IDR(name) struct idr name = IDR_INIT(name)
57 56
58/** 57/**
58 * idr_get_cursor - Return the current position of the cyclic allocator
59 * @idr: idr handle
60 *
61 * The value returned is the value that will be next returned from
62 * idr_alloc_cyclic() if it is free (otherwise the search will start from
63 * this position).
64 */
65static inline unsigned int idr_get_cursor(struct idr *idr)
66{
67 return READ_ONCE(idr->cur);
68}
69
70/**
71 * idr_set_cursor - Set the current position of the cyclic allocator
72 * @idr: idr handle
73 * @val: new position
74 *
75 * The next call to idr_alloc_cyclic() will return @val if it is free
76 * (otherwise the search will start from this position).
77 */
78static inline void idr_set_cursor(struct idr *idr, unsigned int val)
79{
80 WRITE_ONCE(idr->cur, val);
81}
82
83/**
59 * DOC: idr sync 84 * DOC: idr sync
60 * idr synchronization (stolen from radix-tree.h) 85 * idr synchronization (stolen from radix-tree.h)
61 * 86 *
@@ -195,6 +220,11 @@ static inline int ida_get_new(struct ida *ida, int *p_id)
195 return ida_get_new_above(ida, 0, p_id); 220 return ida_get_new_above(ida, 0, p_id);
196} 221}
197 222
223static inline bool ida_is_empty(struct ida *ida)
224{
225 return idr_is_empty(&ida->idr);
226}
227
198void __init idr_init_cache(void); 228void __init idr_init_cache(void);
199 229
200#endif /* __IDR_H__ */ 230#endif /* __IDR_H__ */
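Both cursor helpers are defined right above in terms of the idr's internal cyclic position, so a (hypothetical) checkpoint/restore user can preserve allocation order across a teardown. A minimal sketch:

	static DEFINE_IDR(conn_ids);		/* hypothetical cyclic ID space */

	/* checkpoint: remember where idr_alloc_cyclic() would continue */
	unsigned int cursor = idr_get_cursor(&conn_ids);

	/* ... tear down and later rebuild the idr ... */

	/* restore: make the next cyclic allocation resume from the saved spot */
	idr_set_cursor(&conn_ids, cursor);

ida_is_empty() follows the same pattern as the idr_is_empty() it wraps.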
diff --git a/include/linux/kdb.h b/include/linux/kdb.h
index 410decacff8f..68bd88223417 100644
--- a/include/linux/kdb.h
+++ b/include/linux/kdb.h
@@ -77,7 +77,6 @@ extern int kdb_poll_idx;
77 * number whenever the kernel debugger is entered. 77 * number whenever the kernel debugger is entered.
78 */ 78 */
79extern int kdb_initial_cpu; 79extern int kdb_initial_cpu;
80extern atomic_t kdb_event;
81 80
82/* Types and messages used for dynamically added kdb shell commands */ 81/* Types and messages used for dynamically added kdb shell commands */
83 82
@@ -162,6 +161,7 @@ enum kdb_msgsrc {
162}; 161};
163 162
164extern int kdb_trap_printk; 163extern int kdb_trap_printk;
164extern int kdb_printf_cpu;
165extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, 165extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt,
166 va_list args); 166 va_list args);
167extern __printf(1, 2) int kdb_printf(const char *, ...); 167extern __printf(1, 2) int kdb_printf(const char *, ...);
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 406c33dcae13..d7437777baaa 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -259,12 +259,6 @@ phys_addr_t paddr_vmcoreinfo_note(void);
259 vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) 259 vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)
260#define VMCOREINFO_CONFIG(name) \ 260#define VMCOREINFO_CONFIG(name) \
261 vmcoreinfo_append_str("CONFIG_%s=y\n", #name) 261 vmcoreinfo_append_str("CONFIG_%s=y\n", #name)
262#define VMCOREINFO_PAGE_OFFSET(value) \
263 vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value)
264#define VMCOREINFO_VMALLOC_START(value) \
265 vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value)
266#define VMCOREINFO_VMEMMAP_START(value) \
267 vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value)
268 262
269extern struct kimage *kexec_image; 263extern struct kimage *kexec_image;
270extern struct kimage *kexec_crash_image; 264extern struct kimage *kexec_crash_image;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0b5b2e4df14e..4424784ac374 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -292,36 +292,23 @@ extern pgprot_t protection_map[16];
292 * pgoff should be used in favour of virtual_address, if possible. 292 * pgoff should be used in favour of virtual_address, if possible.
293 */ 293 */
294struct vm_fault { 294struct vm_fault {
295 struct vm_area_struct *vma; /* Target VMA */
295 unsigned int flags; /* FAULT_FLAG_xxx flags */ 296 unsigned int flags; /* FAULT_FLAG_xxx flags */
296 gfp_t gfp_mask; /* gfp mask to be used for allocations */ 297 gfp_t gfp_mask; /* gfp mask to be used for allocations */
297 pgoff_t pgoff; /* Logical page offset based on vma */ 298 pgoff_t pgoff; /* Logical page offset based on vma */
298 void __user *virtual_address; /* Faulting virtual address */ 299 unsigned long address; /* Faulting virtual address */
300 pmd_t *pmd; /* Pointer to pmd entry matching
301 * the 'address' */
302 pte_t orig_pte; /* Value of PTE at the time of fault */
299 303
300 struct page *cow_page; /* Handler may choose to COW */ 304 struct page *cow_page; /* Page handler may use for COW fault */
305 struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */
301 struct page *page; /* ->fault handlers should return a 306 struct page *page; /* ->fault handlers should return a
302 * page here, unless VM_FAULT_NOPAGE 307 * page here, unless VM_FAULT_NOPAGE
303 * is set (which is also implied by 308 * is set (which is also implied by
304 * VM_FAULT_ERROR). 309 * VM_FAULT_ERROR).
305 */ 310 */
306 void *entry; /* ->fault handler can alternatively 311 /* These three entries are valid only while holding ptl lock */
307 * return locked DAX entry. In that
308 * case handler should return
309 * VM_FAULT_DAX_LOCKED and fill in
310 * entry here.
311 */
312};
313
314/*
315 * Page fault context: passes though page fault handler instead of endless list
316 * of function arguments.
317 */
318struct fault_env {
319 struct vm_area_struct *vma; /* Target VMA */
320 unsigned long address; /* Faulting virtual address */
321 unsigned int flags; /* FAULT_FLAG_xxx flags */
322 pmd_t *pmd; /* Pointer to pmd entry matching
323 * the 'address'
324 */
325 pte_t *pte; /* Pointer to pte entry matching 312 pte_t *pte; /* Pointer to pte entry matching
326 * the 'address'. NULL if the page 313 * the 'address'. NULL if the page
327 * table hasn't been allocated. 314 * table hasn't been allocated.
@@ -351,7 +338,7 @@ struct vm_operations_struct {
351 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); 338 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
352 int (*pmd_fault)(struct vm_area_struct *, unsigned long address, 339 int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
353 pmd_t *, unsigned int flags); 340 pmd_t *, unsigned int flags);
354 void (*map_pages)(struct fault_env *fe, 341 void (*map_pages)(struct vm_fault *vmf,
355 pgoff_t start_pgoff, pgoff_t end_pgoff); 342 pgoff_t start_pgoff, pgoff_t end_pgoff);
356 343
357 /* notification that a previously read-only page is about to become 344 /* notification that a previously read-only page is about to become
@@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
625 return pte; 612 return pte;
626} 613}
627 614
628int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, 615int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
629 struct page *page); 616 struct page *page);
617int finish_fault(struct vm_fault *vmf);
618int finish_mkwrite_fault(struct vm_fault *vmf);
630#endif 619#endif
631 620
632/* 621/*
@@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
1110#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ 1099#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
1111#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ 1100#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
1112#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ 1101#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
1113#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ 1102#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
1114 1103
1115#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ 1104#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
1116 1105
@@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
1221 struct vm_area_struct *vma); 1210 struct vm_area_struct *vma);
1222void unmap_mapping_range(struct address_space *mapping, 1211void unmap_mapping_range(struct address_space *mapping,
1223 loff_t const holebegin, loff_t const holelen, int even_cows); 1212 loff_t const holebegin, loff_t const holelen, int even_cows);
1213int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
1214 spinlock_t **ptlp);
1224int follow_pfn(struct vm_area_struct *vma, unsigned long address, 1215int follow_pfn(struct vm_area_struct *vma, unsigned long address,
1225 unsigned long *pfn); 1216 unsigned long *pfn);
1226int follow_phys(struct vm_area_struct *vma, unsigned long address, 1217int follow_phys(struct vm_area_struct *vma, unsigned long address,
@@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1276long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, 1267long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
1277 unsigned long start, unsigned long nr_pages, 1268 unsigned long start, unsigned long nr_pages,
1278 unsigned int gup_flags, struct page **pages, 1269 unsigned int gup_flags, struct page **pages,
1279 struct vm_area_struct **vmas); 1270 struct vm_area_struct **vmas, int *locked);
1280long get_user_pages(unsigned long start, unsigned long nr_pages, 1271long get_user_pages(unsigned long start, unsigned long nr_pages,
1281 unsigned int gup_flags, struct page **pages, 1272 unsigned int gup_flags, struct page **pages,
1282 struct vm_area_struct **vmas); 1273 struct vm_area_struct **vmas);
1283long get_user_pages_locked(unsigned long start, unsigned long nr_pages, 1274long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
1284 unsigned int gup_flags, struct page **pages, int *locked); 1275 unsigned int gup_flags, struct page **pages, int *locked);
1285long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
1286 unsigned long start, unsigned long nr_pages,
1287 struct page **pages, unsigned int gup_flags);
1288long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 1276long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
1289 struct page **pages, unsigned int gup_flags); 1277 struct page **pages, unsigned int gup_flags);
1290int get_user_pages_fast(unsigned long start, int nr_pages, int write, 1278int get_user_pages_fast(unsigned long start, int nr_pages, int write,
@@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *);
2099 2087
2100/* generic vm_area_ops exported for stackable file systems */ 2088/* generic vm_area_ops exported for stackable file systems */
2101extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 2089extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
2102extern void filemap_map_pages(struct fault_env *fe, 2090extern void filemap_map_pages(struct vm_fault *vmf,
2103 pgoff_t start_pgoff, pgoff_t end_pgoff); 2091 pgoff_t start_pgoff, pgoff_t end_pgoff);
2104extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2092extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2105 2093
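With the old fault_env folded into struct vm_fault, a ->fault handler reads the faulting context from vmf (plus the vma argument the prototype above still passes separately) and hands a referenced page back through vmf->page. A minimal sketch, with my_lookup_page() standing in for whatever a real driver or filesystem would do:

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page;

	/* vmf->pgoff is the offset within the mapping, vmf->address the
	 * faulting user virtual address. */
	page = my_lookup_page(vma->vm_private_data, vmf->pgoff);	/* hypothetical helper */
	if (!page)
		return VM_FAULT_SIGBUS;

	get_page(page);		/* handler returns with a reference held */
	vmf->page = page;
	return 0;
}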
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a78c35cff1ae..aacca824a6ae 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -7,6 +7,23 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <asm/irq.h> 8#include <asm/irq.h>
9 9
10/*
11 * The run state of the lockup detectors is controlled by the content of the
12 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
13 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
14 *
15 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
16 * are variables that are only used as an 'interface' between the parameters
17 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
18 * 'watchdog_thresh' variable is handled differently because its value is not
19 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
20 * is equal to zero.
21 */
22#define NMI_WATCHDOG_ENABLED_BIT 0
23#define SOFT_WATCHDOG_ENABLED_BIT 1
24#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
25#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
26
10/** 27/**
11 * touch_nmi_watchdog - restart NMI watchdog timeout. 28 * touch_nmi_watchdog - restart NMI watchdog timeout.
12 * 29 *
@@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled;
91extern int soft_watchdog_enabled; 108extern int soft_watchdog_enabled;
92extern int watchdog_user_enabled; 109extern int watchdog_user_enabled;
93extern int watchdog_thresh; 110extern int watchdog_thresh;
111extern unsigned long watchdog_enabled;
94extern unsigned long *watchdog_cpumask_bits; 112extern unsigned long *watchdog_cpumask_bits;
113#ifdef CONFIG_SMP
95extern int sysctl_softlockup_all_cpu_backtrace; 114extern int sysctl_softlockup_all_cpu_backtrace;
96extern int sysctl_hardlockup_all_cpu_backtrace; 115extern int sysctl_hardlockup_all_cpu_backtrace;
116#else
117#define sysctl_softlockup_all_cpu_backtrace 0
118#define sysctl_hardlockup_all_cpu_backtrace 0
119#endif
120extern bool is_hardlockup(void);
97struct ctl_table; 121struct ctl_table;
98extern int proc_watchdog(struct ctl_table *, int , 122extern int proc_watchdog(struct ctl_table *, int ,
99 void __user *, size_t *, loff_t *); 123 void __user *, size_t *, loff_t *);
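The two *_ENABLED masks are meant to be tested against the newly exported 'watchdog_enabled' word; a one-glance sketch of how a consumer distinguishes the two detectors:

	if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
		pr_info("hard lockup detector is running\n");
	if (watchdog_enabled & SOFT_WATCHDOG_ENABLED)
		pr_info("soft lockup detector is running\n");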
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 744486057e9e..5dea8f6440e4 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -80,23 +80,25 @@ static inline bool radix_tree_is_internal_node(void *ptr)
80#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ 80#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
81 RADIX_TREE_MAP_SHIFT)) 81 RADIX_TREE_MAP_SHIFT))
82 82
83/*
84 * @count is the count of every non-NULL element in the ->slots array
85 * whether that is an exceptional entry, a retry entry, a user pointer,
86 * a sibling entry or a pointer to the next level of the tree.
87 * @exceptional is the count of every element in ->slots which is
88 * either radix_tree_exceptional_entry() or is a sibling entry for an
89 * exceptional entry.
90 */
83struct radix_tree_node { 91struct radix_tree_node {
84 unsigned char shift; /* Bits remaining in each slot */ 92 unsigned char shift; /* Bits remaining in each slot */
85 unsigned char offset; /* Slot offset in parent */ 93 unsigned char offset; /* Slot offset in parent */
86 unsigned char count; /* Total entry count */ 94 unsigned char count; /* Total entry count */
87 unsigned char exceptional; /* Exceptional entry count */ 95 unsigned char exceptional; /* Exceptional entry count */
96 struct radix_tree_node *parent; /* Used when ascending tree */
97 void *private_data; /* For tree user */
88 union { 98 union {
89 struct { 99 struct list_head private_list; /* For tree user */
90 /* Used when ascending tree */ 100 struct rcu_head rcu_head; /* Used when freeing node */
91 struct radix_tree_node *parent;
92 /* For tree user */
93 void *private_data;
94 };
95 /* Used when freeing node */
96 struct rcu_head rcu_head;
97 }; 101 };
98 /* For tree user */
99 struct list_head private_list;
100 void __rcu *slots[RADIX_TREE_MAP_SIZE]; 102 void __rcu *slots[RADIX_TREE_MAP_SIZE];
101 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; 103 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
102}; 104};
@@ -127,6 +129,41 @@ static inline bool radix_tree_empty(struct radix_tree_root *root)
127} 129}
128 130
129/** 131/**
132 * struct radix_tree_iter - radix tree iterator state
133 *
134 * @index: index of current slot
135 * @next_index: one beyond the last index for this chunk
136 * @tags: bit-mask for tag-iterating
137 * @node: node that contains current slot
138 * @shift: shift for the node that holds our slots
139 *
140 * This radix tree iterator works in terms of "chunks" of slots. A chunk is a
141 * subinterval of slots contained within one radix tree leaf node. It is
142 * described by a pointer to its first slot and a struct radix_tree_iter
143 * which holds the chunk's position in the tree and its size. For tagged
144 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
145 * radix tree tag.
146 */
147struct radix_tree_iter {
148 unsigned long index;
149 unsigned long next_index;
150 unsigned long tags;
151 struct radix_tree_node *node;
152#ifdef CONFIG_RADIX_TREE_MULTIORDER
153 unsigned int shift;
154#endif
155};
156
157static inline unsigned int iter_shift(const struct radix_tree_iter *iter)
158{
159#ifdef CONFIG_RADIX_TREE_MULTIORDER
160 return iter->shift;
161#else
162 return 0;
163#endif
164}
165
166/**
130 * Radix-tree synchronization 167 * Radix-tree synchronization
131 * 168 *
132 * The radix-tree API requires that users provide all synchronisation (with 169 * The radix-tree API requires that users provide all synchronisation (with
@@ -264,6 +301,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
264 struct radix_tree_node *node, 301 struct radix_tree_node *node,
265 void **slot, void *item, 302 void **slot, void *item,
266 radix_tree_update_node_t update_node, void *private); 303 radix_tree_update_node_t update_node, void *private);
304void radix_tree_iter_replace(struct radix_tree_root *,
305 const struct radix_tree_iter *, void **slot, void *item);
267void radix_tree_replace_slot(struct radix_tree_root *root, 306void radix_tree_replace_slot(struct radix_tree_root *root,
268 void **slot, void *item); 307 void **slot, void *item);
269void __radix_tree_delete_node(struct radix_tree_root *root, 308void __radix_tree_delete_node(struct radix_tree_root *root,
@@ -289,6 +328,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
289 unsigned long index, unsigned int tag); 328 unsigned long index, unsigned int tag);
290int radix_tree_tag_get(struct radix_tree_root *root, 329int radix_tree_tag_get(struct radix_tree_root *root,
291 unsigned long index, unsigned int tag); 330 unsigned long index, unsigned int tag);
331void radix_tree_iter_tag_set(struct radix_tree_root *root,
332 const struct radix_tree_iter *iter, unsigned int tag);
292unsigned int 333unsigned int
293radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, 334radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
294 unsigned long first_index, unsigned int max_items, 335 unsigned long first_index, unsigned int max_items,
@@ -297,50 +338,18 @@ unsigned int
297radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, 338radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
298 unsigned long first_index, unsigned int max_items, 339 unsigned long first_index, unsigned int max_items,
299 unsigned int tag); 340 unsigned int tag);
300unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
301 unsigned long *first_indexp, unsigned long last_index,
302 unsigned long nr_to_tag,
303 unsigned int fromtag, unsigned int totag);
304int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); 341int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
305unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item);
306 342
307static inline void radix_tree_preload_end(void) 343static inline void radix_tree_preload_end(void)
308{ 344{
309 preempt_enable(); 345 preempt_enable();
310} 346}
311 347
312/** 348int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t);
313 * struct radix_tree_iter - radix tree iterator state 349int radix_tree_split(struct radix_tree_root *, unsigned long index,
314 * 350 unsigned new_order);
315 * @index: index of current slot 351int radix_tree_join(struct radix_tree_root *, unsigned long index,
316 * @next_index: one beyond the last index for this chunk 352 unsigned new_order, void *);
317 * @tags: bit-mask for tag-iterating
318 * @shift: shift for the node that holds our slots
319 *
320 * This radix tree iterator works in terms of "chunks" of slots. A chunk is a
321 * subinterval of slots contained within one radix tree leaf node. It is
322 * described by a pointer to its first slot and a struct radix_tree_iter
323 * which holds the chunk's position in the tree and its size. For tagged
324 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
325 * radix tree tag.
326 */
327struct radix_tree_iter {
328 unsigned long index;
329 unsigned long next_index;
330 unsigned long tags;
331#ifdef CONFIG_RADIX_TREE_MULTIORDER
332 unsigned int shift;
333#endif
334};
335
336static inline unsigned int iter_shift(struct radix_tree_iter *iter)
337{
338#ifdef CONFIG_RADIX_TREE_MULTIORDER
339 return iter->shift;
340#else
341 return 0;
342#endif
343}
344 353
345#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ 354#define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */
346#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ 355#define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */
@@ -409,20 +418,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
409} 418}
410 419
411/** 420/**
412 * radix_tree_iter_next - resume iterating when the chunk may be invalid 421 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
413 * @iter: iterator state 422 * @slot: pointer to current slot
423 * @iter: iterator state
424 * Returns: New slot pointer
414 * 425 *
415 * If the iterator needs to release then reacquire a lock, the chunk may 426 * If the iterator needs to release then reacquire a lock, the chunk may
416 * have been invalidated by an insertion or deletion. Call this function 427 * have been invalidated by an insertion or deletion. Call this function
417 * to continue the iteration from the next index. 428 * before releasing the lock to continue the iteration from the next index.
418 */ 429 */
419static inline __must_check 430void **__must_check radix_tree_iter_resume(void **slot,
420void **radix_tree_iter_next(struct radix_tree_iter *iter) 431 struct radix_tree_iter *iter);
421{
422 iter->next_index = __radix_tree_iter_add(iter, 1);
423 iter->tags = 0;
424 return NULL;
425}
426 432
427/** 433/**
428 * radix_tree_chunk_size - get current chunk size 434 * radix_tree_chunk_size - get current chunk size
@@ -436,10 +442,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter)
436 return (iter->next_index - iter->index) >> iter_shift(iter); 442 return (iter->next_index - iter->index) >> iter_shift(iter);
437} 443}
438 444
439static inline struct radix_tree_node *entry_to_node(void *ptr) 445#ifdef CONFIG_RADIX_TREE_MULTIORDER
446void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
447 unsigned flags);
448#else
449/* Can't happen without sibling entries, but the compiler can't tell that */
450static inline void ** __radix_tree_next_slot(void **slot,
451 struct radix_tree_iter *iter, unsigned flags)
440{ 452{
441 return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); 453 return slot;
442} 454}
455#endif
443 456
444/** 457/**
445 * radix_tree_next_slot - find next slot in chunk 458 * radix_tree_next_slot - find next slot in chunk
@@ -453,7 +466,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr)
453 * For tagged lookup it also eats @iter->tags. 466 * For tagged lookup it also eats @iter->tags.
454 * 467 *
455 * There are several cases where 'slot' can be passed in as NULL to this 468 * There are several cases where 'slot' can be passed in as NULL to this
456 * function. These cases result from the use of radix_tree_iter_next() or 469 * function. These cases result from the use of radix_tree_iter_resume() or
457 * radix_tree_iter_retry(). In these cases we don't end up dereferencing 470 * radix_tree_iter_retry(). In these cases we don't end up dereferencing
458 * 'slot' because either: 471 * 'slot' because either:
459 * a) we are doing tagged iteration and iter->tags has been set to 0, or 472 * a) we are doing tagged iteration and iter->tags has been set to 0, or
@@ -464,51 +477,31 @@ static __always_inline void **
464radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) 477radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
465{ 478{
466 if (flags & RADIX_TREE_ITER_TAGGED) { 479 if (flags & RADIX_TREE_ITER_TAGGED) {
467 void *canon = slot;
468
469 iter->tags >>= 1; 480 iter->tags >>= 1;
470 if (unlikely(!iter->tags)) 481 if (unlikely(!iter->tags))
471 return NULL; 482 return NULL;
472 while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
473 radix_tree_is_internal_node(slot[1])) {
474 if (entry_to_node(slot[1]) == canon) {
475 iter->tags >>= 1;
476 iter->index = __radix_tree_iter_add(iter, 1);
477 slot++;
478 continue;
479 }
480 iter->next_index = __radix_tree_iter_add(iter, 1);
481 return NULL;
482 }
483 if (likely(iter->tags & 1ul)) { 483 if (likely(iter->tags & 1ul)) {
484 iter->index = __radix_tree_iter_add(iter, 1); 484 iter->index = __radix_tree_iter_add(iter, 1);
485 return slot + 1; 485 slot++;
486 goto found;
486 } 487 }
487 if (!(flags & RADIX_TREE_ITER_CONTIG)) { 488 if (!(flags & RADIX_TREE_ITER_CONTIG)) {
488 unsigned offset = __ffs(iter->tags); 489 unsigned offset = __ffs(iter->tags);
489 490
490 iter->tags >>= offset; 491 iter->tags >>= offset++;
491 iter->index = __radix_tree_iter_add(iter, offset + 1); 492 iter->index = __radix_tree_iter_add(iter, offset);
492 return slot + offset + 1; 493 slot += offset;
494 goto found;
493 } 495 }
494 } else { 496 } else {
495 long count = radix_tree_chunk_size(iter); 497 long count = radix_tree_chunk_size(iter);
496 void *canon = slot;
497 498
498 while (--count > 0) { 499 while (--count > 0) {
499 slot++; 500 slot++;
500 iter->index = __radix_tree_iter_add(iter, 1); 501 iter->index = __radix_tree_iter_add(iter, 1);
501 502
502 if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) &&
503 radix_tree_is_internal_node(*slot)) {
504 if (entry_to_node(*slot) == canon)
505 continue;
506 iter->next_index = iter->index;
507 break;
508 }
509
510 if (likely(*slot)) 503 if (likely(*slot))
511 return slot; 504 goto found;
512 if (flags & RADIX_TREE_ITER_CONTIG) { 505 if (flags & RADIX_TREE_ITER_CONTIG) {
513 /* forbid switching to the next chunk */ 506 /* forbid switching to the next chunk */
514 iter->next_index = 0; 507 iter->next_index = 0;
@@ -517,6 +510,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
517 } 510 }
518 } 511 }
519 return NULL; 512 return NULL;
513
514 found:
515 if (unlikely(radix_tree_is_internal_node(*slot)))
516 return __radix_tree_next_slot(slot, iter, flags);
517 return slot;
520} 518}
521 519
522/** 520/**
@@ -567,6 +565,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
567 slot || (slot = radix_tree_next_chunk(root, iter, \ 565 slot || (slot = radix_tree_next_chunk(root, iter, \
568 RADIX_TREE_ITER_TAGGED | tag)) ; \ 566 RADIX_TREE_ITER_TAGGED | tag)) ; \
569 slot = radix_tree_next_slot(slot, iter, \ 567 slot = radix_tree_next_slot(slot, iter, \
570 RADIX_TREE_ITER_TAGGED)) 568 RADIX_TREE_ITER_TAGGED | tag))
571 569
572#endif /* _LINUX_RADIX_TREE_H */ 570#endif /* _LINUX_RADIX_TREE_H */
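radix_tree_iter_resume() replaces radix_tree_iter_next() and, per the updated kernel-doc, must be called before the lock protecting the tree is dropped rather than after. A sketch of the intended pattern inside a slot walk; 'root', 'tree_lock' and process() are assumptions standing in for the caller's own tree, lock and per-item work:

	void **slot;
	struct radix_tree_iter iter;

	spin_lock(&tree_lock);
	radix_tree_for_each_slot(slot, &root, &iter, 0) {
		process(radix_tree_deref_slot_protected(slot, &tree_lock));

		if (need_resched()) {
			/* invalidate the current chunk *before* unlocking */
			slot = radix_tree_iter_resume(slot, &iter);
			spin_unlock(&tree_lock);
			cond_resched();
			spin_lock(&tree_lock);
		}
	}
	spin_unlock(&tree_lock);

The iterator fix in the last hunk matters for the same pattern: radix_tree_for_each_tagged() now passes the tag through to radix_tree_next_slot() instead of silently dropping it.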
diff --git a/include/linux/signal.h b/include/linux/signal.h
index b63f63eaa39c..5308304993be 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set)
97 } 97 }
98} 98}
99 99
100static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
101{
102 switch (_NSIG_WORDS) {
103 case 4:
104 return (set1->sig[3] == set2->sig[3]) &&
105 (set1->sig[2] == set2->sig[2]) &&
106 (set1->sig[1] == set2->sig[1]) &&
107 (set1->sig[0] == set2->sig[0]);
108 case 2:
109 return (set1->sig[1] == set2->sig[1]) &&
110 (set1->sig[0] == set2->sig[0]);
111 case 1:
112 return set1->sig[0] == set2->sig[0];
113 }
114 return 0;
115}
116
100#define sigmask(sig) (1UL << ((sig) - 1)) 117#define sigmask(sig) (1UL << ((sig) - 1))
101 118
102#ifndef __HAVE_ARCH_SIG_SETOPS 119#ifndef __HAVE_ARCH_SIG_SETOPS
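sigequalsets() gives callers a word-wise equality test to pair with the existing sigisemptyset(); the obvious (sketched) use is skipping work when a requested mask is already installed, with 'newset' assumed to be a validated sigset_t pointer:

	/* Sketch: nothing to do if the requested mask matches the current one. */
	if (sigequalsets(&current->blocked, newset))
		return 0;
	__set_current_blocked(newset);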
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index dd66a952e8cd..11b92b047a1e 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -27,7 +27,7 @@
27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) 27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) 28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
29 29
30extern int handle_userfault(struct fault_env *fe, unsigned long reason); 30extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
31 31
32extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 32extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
33 unsigned long src_start, unsigned long len); 33 unsigned long src_start, unsigned long len);
@@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
55#else /* CONFIG_USERFAULTFD */ 55#else /* CONFIG_USERFAULTFD */
56 56
57/* mm helpers */ 57/* mm helpers */
58static inline int handle_userfault(struct fault_env *fe, unsigned long reason) 58static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason)
59{ 59{
60 return VM_FAULT_SIGBUS; 60 return VM_FAULT_SIGBUS;
61} 61}
diff --git a/ipc/msg.c b/ipc/msg.c
index 32e9bd837cde..e3e52ce01123 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -763,7 +763,10 @@ static inline int convert_mode(long *msgtyp, int msgflg)
763 if (*msgtyp == 0) 763 if (*msgtyp == 0)
764 return SEARCH_ANY; 764 return SEARCH_ANY;
765 if (*msgtyp < 0) { 765 if (*msgtyp < 0) {
766 *msgtyp = -*msgtyp; 766 if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */
767 *msgtyp = LONG_MAX;
768 else
769 *msgtyp = -*msgtyp;
767 return SEARCH_LESSEQUAL; 770 return SEARCH_LESSEQUAL;
768 } 771 }
769 if (msgflg & MSG_EXCEPT) 772 if (msgflg & MSG_EXCEPT)
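The new guard exists because negating LONG_MIN is undefined behaviour (its magnitude is not representable in a long); clamping to LONG_MAX is harmless here since SEARCH_LESSEQUAL only needs an upper bound and no message type can exceed LONG_MAX anyway. A small user-space sketch of the same arithmetic:

#include <limits.h>

/* msgtyp < 0 requests "any message whose type is <= |msgtyp|". */
static long msgtyp_bound(long msgtyp)
{
	if (msgtyp == LONG_MIN)		/* -LONG_MIN would overflow */
		return LONG_MAX;
	return -msgtyp;
}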
diff --git a/ipc/sem.c b/ipc/sem.c
index 10b94bc59d4a..e08b94851922 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -11,6 +11,7 @@
11 * (c) 2001 Red Hat Inc 11 * (c) 2001 Red Hat Inc
12 * Lockless wakeup 12 * Lockless wakeup
13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com> 13 * (c) 2003 Manfred Spraul <manfred@colorfullife.com>
14 * (c) 2016 Davidlohr Bueso <dave@stgolabs.net>
14 * Further wakeup optimizations, documentation 15 * Further wakeup optimizations, documentation
15 * (c) 2010 Manfred Spraul <manfred@colorfullife.com> 16 * (c) 2010 Manfred Spraul <manfred@colorfullife.com>
16 * 17 *
@@ -53,15 +54,11 @@
53 * Semaphores are actively given to waiting tasks (necessary for FIFO). 54 * Semaphores are actively given to waiting tasks (necessary for FIFO).
54 * (see update_queue()) 55 * (see update_queue())
55 * - To improve the scalability, the actual wake-up calls are performed after 56 * - To improve the scalability, the actual wake-up calls are performed after
56 * dropping all locks. (see wake_up_sem_queue_prepare(), 57 * dropping all locks. (see wake_up_sem_queue_prepare())
57 * wake_up_sem_queue_do())
58 * - All work is done by the waker, the woken up task does not have to do 58 * - All work is done by the waker, the woken up task does not have to do
59 * anything - not even acquiring a lock or dropping a refcount. 59 * anything - not even acquiring a lock or dropping a refcount.
60 * - A woken up task may not even touch the semaphore array anymore, it may 60 * - A woken up task may not even touch the semaphore array anymore, it may
61 * have been destroyed already by a semctl(RMID). 61 * have been destroyed already by a semctl(RMID).
62 * - The synchronizations between wake-ups due to a timeout/signal and a
63 * wake-up due to a completed semaphore operation is achieved by using an
64 * intermediate state (IN_WAKEUP).
65 * - UNDO values are stored in an array (one per process and per 62 * - UNDO values are stored in an array (one per process and per
66 * semaphore array, lazily allocated). For backwards compatibility, multiple 63 * semaphore array, lazily allocated). For backwards compatibility, multiple
67 * modes for the UNDO variables are supported (per process, per thread) 64 * modes for the UNDO variables are supported (per process, per thread)
@@ -118,7 +115,8 @@ struct sem_queue {
118 struct sembuf *sops; /* array of pending operations */ 115 struct sembuf *sops; /* array of pending operations */
119 struct sembuf *blocking; /* the operation that blocked */ 116 struct sembuf *blocking; /* the operation that blocked */
120 int nsops; /* number of operations */ 117 int nsops; /* number of operations */
121 int alter; /* does *sops alter the array? */ 118 bool alter; /* does *sops alter the array? */
119 bool dupsop; /* sops on more than one sem_num */
122}; 120};
123 121
124/* Each task has a list of undo requests. They are executed automatically 122/* Each task has a list of undo requests. They are executed automatically
@@ -416,29 +414,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum)
416 * 414 *
417 * The caller holds the RCU read lock. 415 * The caller holds the RCU read lock.
418 */ 416 */
419static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns,
420 int id, struct sembuf *sops, int nsops, int *locknum)
421{
422 struct kern_ipc_perm *ipcp;
423 struct sem_array *sma;
424
425 ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
426 if (IS_ERR(ipcp))
427 return ERR_CAST(ipcp);
428
429 sma = container_of(ipcp, struct sem_array, sem_perm);
430 *locknum = sem_lock(sma, sops, nsops);
431
432 /* ipc_rmid() may have already freed the ID while sem_lock
433 * was spinning: verify that the structure is still valid
434 */
435 if (ipc_valid_object(ipcp))
436 return container_of(ipcp, struct sem_array, sem_perm);
437
438 sem_unlock(sma, *locknum);
439 return ERR_PTR(-EINVAL);
440}
441
442static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) 417static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id)
443{ 418{
444 struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); 419 struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id);
@@ -471,40 +446,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s)
471 ipc_rmid(&sem_ids(ns), &s->sem_perm); 446 ipc_rmid(&sem_ids(ns), &s->sem_perm);
472} 447}
473 448
474/*
475 * Lockless wakeup algorithm:
476 * Without the check/retry algorithm a lockless wakeup is possible:
477 * - queue.status is initialized to -EINTR before blocking.
478 * - wakeup is performed by
479 * * unlinking the queue entry from the pending list
480 * * setting queue.status to IN_WAKEUP
481 * This is the notification for the blocked thread that a
482 * result value is imminent.
483 * * call wake_up_process
484 * * set queue.status to the final value.
485 * - the previously blocked thread checks queue.status:
486 * * if it's IN_WAKEUP, then it must wait until the value changes
487 * * if it's not -EINTR, then the operation was completed by
488 * update_queue. semtimedop can return queue.status without
489 * performing any operation on the sem array.
490 * * otherwise it must acquire the spinlock and check what's up.
491 *
492 * The two-stage algorithm is necessary to protect against the following
493 * races:
494 * - if queue.status is set after wake_up_process, then the woken up idle
495 * thread could race forward and try (and fail) to acquire sma->lock
496 * before update_queue had a chance to set queue.status
497 * - if queue.status is written before wake_up_process and if the
498 * blocked process is woken up by a signal between writing
499 * queue.status and the wake_up_process, then the woken up
500 * process could return from semtimedop and die by calling
501 * sys_exit before wake_up_process is called. Then wake_up_process
502 * will oops, because the task structure is already invalid.
503 * (yes, this happened on s390 with sysv msg).
504 *
505 */
506#define IN_WAKEUP 1
507
508/** 449/**
509 * newary - Create a new semaphore set 450 * newary - Create a new semaphore set
510 * @ns: namespace 451 * @ns: namespace
@@ -624,15 +565,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg)
624} 565}
625 566
626/** 567/**
627 * perform_atomic_semop - Perform (if possible) a semaphore operation 568 * perform_atomic_semop[_slow] - Attempt to perform semaphore
569 * operations on a given array.
628 * @sma: semaphore array 570 * @sma: semaphore array
629 * @q: struct sem_queue that describes the operation 571 * @q: struct sem_queue that describes the operation
630 * 572 *
573 * Caller blocking behaviour is as follows, based on the value

574 * indicated by the semaphore operation (sem_op):
575 *
576 * (1) >0 never blocks.
577 * (2) 0 (wait-for-zero operation): semval is non-zero.
578 * (3) <0 attempting to decrement semval to a value smaller than zero.
579 *
631 * Returns 0 if the operation was possible. 580 * Returns 0 if the operation was possible.
632 * Returns 1 if the operation is impossible, the caller must sleep. 581 * Returns 1 if the operation is impossible, the caller must sleep.
633 * Negative values are error codes. 582 * Returns <0 for error codes.
634 */ 583 */
635static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) 584static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q)
636{ 585{
637 int result, sem_op, nsops, pid; 586 int result, sem_op, nsops, pid;
638 struct sembuf *sop; 587 struct sembuf *sop;
@@ -703,51 +652,84 @@ undo:
703 return result; 652 return result;
704} 653}
705 654
706/** wake_up_sem_queue_prepare(q, error): Prepare wake-up 655static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q)
707 * @q: queue entry that must be signaled
708 * @error: Error value for the signal
709 *
710 * Prepare the wake-up of the queue entry q.
711 */
712static void wake_up_sem_queue_prepare(struct list_head *pt,
713 struct sem_queue *q, int error)
714{ 656{
715 if (list_empty(pt)) { 657 int result, sem_op, nsops;
716 /* 658 struct sembuf *sop;
717 * Hold preempt off so that we don't get preempted and have the 659 struct sem *curr;
718 * wakee busy-wait until we're scheduled back on. 660 struct sembuf *sops;
719 */ 661 struct sem_undo *un;
720 preempt_disable(); 662
663 sops = q->sops;
664 nsops = q->nsops;
665 un = q->undo;
666
667 if (unlikely(q->dupsop))
668 return perform_atomic_semop_slow(sma, q);
669
670 /*
671 * We scan the semaphore set twice, first to ensure that the entire
672 * operation can succeed, therefore avoiding any pointless writes
673 * to shared memory and having to undo such changes in order to block
674 * until the operations can go through.
675 */
676 for (sop = sops; sop < sops + nsops; sop++) {
677 curr = sma->sem_base + sop->sem_num;
678 sem_op = sop->sem_op;
679 result = curr->semval;
680
681 if (!sem_op && result)
682 goto would_block; /* wait-for-zero */
683
684 result += sem_op;
685 if (result < 0)
686 goto would_block;
687
688 if (result > SEMVMX)
689 return -ERANGE;
690
691 if (sop->sem_flg & SEM_UNDO) {
692 int undo = un->semadj[sop->sem_num] - sem_op;
693
694 /* Exceeding the undo range is an error. */
695 if (undo < (-SEMAEM - 1) || undo > SEMAEM)
696 return -ERANGE;
697 }
698 }
699
700 for (sop = sops; sop < sops + nsops; sop++) {
701 curr = sma->sem_base + sop->sem_num;
702 sem_op = sop->sem_op;
703 result = curr->semval;
704
705 if (sop->sem_flg & SEM_UNDO) {
706 int undo = un->semadj[sop->sem_num] - sem_op;
707
708 un->semadj[sop->sem_num] = undo;
709 }
710 curr->semval += sem_op;
711 curr->sempid = q->pid;
721 } 712 }
722 q->status = IN_WAKEUP;
723 q->pid = error;
724 713
725 list_add_tail(&q->list, pt); 714 return 0;
715
716would_block:
717 q->blocking = sop;
718 return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1;
726} 719}
727 720
728/** 721static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error,
729 * wake_up_sem_queue_do - do the actual wake-up 722 struct wake_q_head *wake_q)
730 * @pt: list of tasks to be woken up
731 *
732 * Do the actual wake-up.
733 * The function is called without any locks held, thus the semaphore array
734 * could be destroyed already and the tasks can disappear as soon as the
735 * status is set to the actual return code.
736 */
737static void wake_up_sem_queue_do(struct list_head *pt)
738{ 723{
739 struct sem_queue *q, *t; 724 wake_q_add(wake_q, q->sleeper);
740 int did_something; 725 /*
741 726 * Rely on the above implicit barrier, such that we can
742 did_something = !list_empty(pt); 727 * ensure that we hold reference to the task before setting
743 list_for_each_entry_safe(q, t, pt, list) { 728 * q->status. Otherwise we could race with do_exit if the
744 wake_up_process(q->sleeper); 729 * task is awoken by an external event before calling
745 /* q can disappear immediately after writing q->status. */ 730 * wake_up_process().
746 smp_wmb(); 731 */
747 q->status = q->pid; 732 WRITE_ONCE(q->status, error);
748 }
749 if (did_something)
750 preempt_enable();
751} 733}
752 734
753static void unlink_queue(struct sem_array *sma, struct sem_queue *q) 735static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
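The rewritten wake_up_sem_queue_prepare() only queues the sleeper on a wake_q_head and publishes the result with WRITE_ONCE(); the wake-ups themselves happen after every ipc lock is dropped, which is what lets the old IN_WAKEUP handshake be deleted. A sketch of the caller-side pattern this implies, using the locking helpers from elsewhere in ipc/sem.c and the generic wake_q initialiser (error handling omitted):

	struct wake_q_head wake_q = WAKE_Q_HEAD_INITIALIZER(wake_q);
	int locknum;

	rcu_read_lock();
	locknum = sem_lock(sma, sops, nsops);
	/* ... perform the semaphore operations; completed waiters are only
	 * queued on wake_q here, never woken while the lock is held ... */
	do_smart_update(sma, sops, nsops, 1, &wake_q);
	sem_unlock(sma, locknum);
	rcu_read_unlock();

	wake_up_q(&wake_q);	/* safe: no ipc locks held at this point */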
@@ -767,7 +749,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q)
767 * modified the array. 749 * modified the array.
768 * Note that wait-for-zero operations are handled without restart. 750 * Note that wait-for-zero operations are handled without restart.
769 */ 751 */
770static int check_restart(struct sem_array *sma, struct sem_queue *q) 752static inline int check_restart(struct sem_array *sma, struct sem_queue *q)
771{ 753{
772 /* pending complex alter operations are too difficult to analyse */ 754 /* pending complex alter operations are too difficult to analyse */
773 if (!list_empty(&sma->pending_alter)) 755 if (!list_empty(&sma->pending_alter))
@@ -795,21 +777,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q)
795 * wake_const_ops - wake up non-alter tasks 777 * wake_const_ops - wake up non-alter tasks
796 * @sma: semaphore array. 778 * @sma: semaphore array.
797 * @semnum: semaphore that was modified. 779 * @semnum: semaphore that was modified.
798 * @pt: list head for the tasks that must be woken up. 780 * @wake_q: lockless wake-queue head.
799 * 781 *
800 * wake_const_ops must be called after a semaphore in a semaphore array 782 * wake_const_ops must be called after a semaphore in a semaphore array
801 * was set to 0. If complex const operations are pending, wake_const_ops must 783 * was set to 0. If complex const operations are pending, wake_const_ops must
802 * be called with semnum = -1, as well as with the number of each modified 784 * be called with semnum = -1, as well as with the number of each modified
803 * semaphore. 785 * semaphore.
804 * The tasks that must be woken up are added to @pt. The return code 786 * The tasks that must be woken up are added to @wake_q. The return code
805 * is stored in q->pid. 787 * is stored in q->pid.
806 * The function returns 1 if at least one operation was completed successfully. 788 * The function returns 1 if at least one operation was completed successfully.
807 */ 789 */
808static int wake_const_ops(struct sem_array *sma, int semnum, 790static int wake_const_ops(struct sem_array *sma, int semnum,
809 struct list_head *pt) 791 struct wake_q_head *wake_q)
810{ 792{
811 struct sem_queue *q; 793 struct sem_queue *q, *tmp;
812 struct list_head *walk;
813 struct list_head *pending_list; 794 struct list_head *pending_list;
814 int semop_completed = 0; 795 int semop_completed = 0;
815 796
@@ -818,25 +799,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
818 else 799 else
819 pending_list = &sma->sem_base[semnum].pending_const; 800 pending_list = &sma->sem_base[semnum].pending_const;
820 801
821 walk = pending_list->next; 802 list_for_each_entry_safe(q, tmp, pending_list, list) {
822 while (walk != pending_list) { 803 int error = perform_atomic_semop(sma, q);
823 int error;
824
825 q = container_of(walk, struct sem_queue, list);
826 walk = walk->next;
827
828 error = perform_atomic_semop(sma, q);
829
830 if (error <= 0) {
831 /* operation completed, remove from queue & wakeup */
832 804
833 unlink_queue(sma, q); 805 if (error > 0)
806 continue;
807 /* operation completed, remove from queue & wakeup */
808 unlink_queue(sma, q);
834 809
835 wake_up_sem_queue_prepare(pt, q, error); 810 wake_up_sem_queue_prepare(q, error, wake_q);
836 if (error == 0) 811 if (error == 0)
837 semop_completed = 1; 812 semop_completed = 1;
838 }
839 } 813 }
814
840 return semop_completed; 815 return semop_completed;
841} 816}
842 817
@@ -845,14 +820,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum,
845 * @sma: semaphore array 820 * @sma: semaphore array
846 * @sops: operations that were performed 821 * @sops: operations that were performed
847 * @nsops: number of operations 822 * @nsops: number of operations
848 * @pt: list head of the tasks that must be woken up. 823 * @wake_q: lockless wake-queue head
849 * 824 *
850 * Checks all required queues for wait-for-zero operations, based 825 * Checks all required queues for wait-for-zero operations, based
851 * on the actual changes that were performed on the semaphore array. 826 * on the actual changes that were performed on the semaphore array.
852 * The function returns 1 if at least one operation was completed successfully. 827 * The function returns 1 if at least one operation was completed successfully.
853 */ 828 */
854static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, 829static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
855 int nsops, struct list_head *pt) 830 int nsops, struct wake_q_head *wake_q)
856{ 831{
857 int i; 832 int i;
858 int semop_completed = 0; 833 int semop_completed = 0;
@@ -865,7 +840,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
865 840
866 if (sma->sem_base[num].semval == 0) { 841 if (sma->sem_base[num].semval == 0) {
867 got_zero = 1; 842 got_zero = 1;
868 semop_completed |= wake_const_ops(sma, num, pt); 843 semop_completed |= wake_const_ops(sma, num, wake_q);
869 } 844 }
870 } 845 }
871 } else { 846 } else {
@@ -876,7 +851,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
876 for (i = 0; i < sma->sem_nsems; i++) { 851 for (i = 0; i < sma->sem_nsems; i++) {
877 if (sma->sem_base[i].semval == 0) { 852 if (sma->sem_base[i].semval == 0) {
878 got_zero = 1; 853 got_zero = 1;
879 semop_completed |= wake_const_ops(sma, i, pt); 854 semop_completed |= wake_const_ops(sma, i, wake_q);
880 } 855 }
881 } 856 }
882 } 857 }
@@ -885,7 +860,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
885 * then check the global queue, too. 860 * then check the global queue, too.
886 */ 861 */
887 if (got_zero) 862 if (got_zero)
888 semop_completed |= wake_const_ops(sma, -1, pt); 863 semop_completed |= wake_const_ops(sma, -1, wake_q);
889 864
890 return semop_completed; 865 return semop_completed;
891} 866}
@@ -895,22 +870,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops,
895 * update_queue - look for tasks that can be completed. 870 * update_queue - look for tasks that can be completed.
896 * @sma: semaphore array. 871 * @sma: semaphore array.
897 * @semnum: semaphore that was modified. 872 * @semnum: semaphore that was modified.
898 * @pt: list head for the tasks that must be woken up. 873 * @wake_q: lockless wake-queue head.
899 * 874 *
900 * update_queue must be called after a semaphore in a semaphore array 875 * update_queue must be called after a semaphore in a semaphore array
901 * was modified. If multiple semaphores were modified, update_queue must 876 * was modified. If multiple semaphores were modified, update_queue must
902 * be called with semnum = -1, as well as with the number of each modified 877 * be called with semnum = -1, as well as with the number of each modified
903 * semaphore. 878 * semaphore.
904 * The tasks that must be woken up are added to @pt. The return code 879 * The tasks that must be woken up are added to @wake_q. The return code
905 * is stored in q->pid. 880 * is stored in q->pid.
906 * The function internally checks if const operations can now succeed. 881 * The function internally checks if const operations can now succeed.
907 * 882 *
908 * The function returns 1 if at least one semop was completed successfully. 883 * The function returns 1 if at least one semop was completed successfully.
909 */ 884 */
910static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) 885static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q)
911{ 886{
912 struct sem_queue *q; 887 struct sem_queue *q, *tmp;
913 struct list_head *walk;
914 struct list_head *pending_list; 888 struct list_head *pending_list;
915 int semop_completed = 0; 889 int semop_completed = 0;
916 890
@@ -920,13 +894,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt)
920 pending_list = &sma->sem_base[semnum].pending_alter; 894 pending_list = &sma->sem_base[semnum].pending_alter;
921 895
922again: 896again:
923 walk = pending_list->next; 897 list_for_each_entry_safe(q, tmp, pending_list, list) {
924 while (walk != pending_list) {
925 int error, restart; 898 int error, restart;
926 899
927 q = container_of(walk, struct sem_queue, list);
928 walk = walk->next;
929
930 /* If we are scanning the single sop, per-semaphore list of 900 /* If we are scanning the single sop, per-semaphore list of
931 * one semaphore and that semaphore is 0, then it is not 901 * one semaphore and that semaphore is 0, then it is not
932 * necessary to scan further: simple increments 902 * necessary to scan further: simple increments
@@ -949,11 +919,11 @@ again:
949 restart = 0; 919 restart = 0;
950 } else { 920 } else {
951 semop_completed = 1; 921 semop_completed = 1;
952 do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); 922 do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q);
953 restart = check_restart(sma, q); 923 restart = check_restart(sma, q);
954 } 924 }
955 925
956 wake_up_sem_queue_prepare(pt, q, error); 926 wake_up_sem_queue_prepare(q, error, wake_q);
957 if (restart) 927 if (restart)
958 goto again; 928 goto again;
959 } 929 }
@@ -984,24 +954,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops)
984 * @sops: operations that were performed 954 * @sops: operations that were performed
985 * @nsops: number of operations 955 * @nsops: number of operations
986 * @otime: force setting otime 956 * @otime: force setting otime
987 * @pt: list head of the tasks that must be woken up. 957 * @wake_q: lockless wake-queue head
988 * 958 *
989 * do_smart_update() does the required calls to update_queue and wakeup_zero, 959 * do_smart_update() does the required calls to update_queue and wakeup_zero,
990 * based on the actual changes that were performed on the semaphore array. 960 * based on the actual changes that were performed on the semaphore array.
991 * Note that the function does not do the actual wake-up: the caller is 961 * Note that the function does not do the actual wake-up: the caller is
992 * responsible for calling wake_up_sem_queue_do(@pt). 962 * responsible for calling wake_up_q().
993 * It is safe to perform this call after dropping all locks. 963 * It is safe to perform this call after dropping all locks.
994 */ 964 */
995static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, 965static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops,
996 int otime, struct list_head *pt) 966 int otime, struct wake_q_head *wake_q)
997{ 967{
998 int i; 968 int i;
999 969
1000 otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); 970 otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q);
1001 971
1002 if (!list_empty(&sma->pending_alter)) { 972 if (!list_empty(&sma->pending_alter)) {
1003 /* semaphore array uses the global queue - just process it. */ 973 /* semaphore array uses the global queue - just process it. */
1004 otime |= update_queue(sma, -1, pt); 974 otime |= update_queue(sma, -1, wake_q);
1005 } else { 975 } else {
1006 if (!sops) { 976 if (!sops) {
1007 /* 977 /*
@@ -1009,7 +979,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1009 * known. Check all. 979 * known. Check all.
1010 */ 980 */
1011 for (i = 0; i < sma->sem_nsems; i++) 981 for (i = 0; i < sma->sem_nsems; i++)
1012 otime |= update_queue(sma, i, pt); 982 otime |= update_queue(sma, i, wake_q);
1013 } else { 983 } else {
1014 /* 984 /*
1015 * Check the semaphores that were increased: 985 * Check the semaphores that were increased:
@@ -1023,7 +993,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop
1023 for (i = 0; i < nsops; i++) { 993 for (i = 0; i < nsops; i++) {
1024 if (sops[i].sem_op > 0) { 994 if (sops[i].sem_op > 0) {
1025 otime |= update_queue(sma, 995 otime |= update_queue(sma,
1026 sops[i].sem_num, pt); 996 sops[i].sem_num, wake_q);
1027 } 997 }
1028 } 998 }
1029 } 999 }
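All the converted callers above follow the same pattern: the tasks to wake are collected on an on-stack wake_q while the semaphore lock is still held, and the actual wake-ups happen only after every lock has been dropped. A minimal sketch of that generic pattern using the lockless wake-queue helpers visible in this diff; the function name and the surrounding locking, shown only as comments, are illustrative:

#include <linux/sched.h>        /* DEFINE_WAKE_Q, wake_q_add(), wake_up_q() */

/* Illustrative only: queue a task for wakeup while a lock is held,
 * then perform the actual wakeup after all locks are dropped. */
static void sketch_deferred_wakeup(struct task_struct *tsk)
{
        DEFINE_WAKE_Q(wake_q);          /* on-stack, lockless wake queue */

        /* ... spinlock held: decide which task must run ... */
        wake_q_add(&wake_q, tsk);       /* takes a task reference, never sleeps */
        /* ... drop the spinlock / leave the RCU read-side section ... */

        wake_up_q(&wake_q);             /* wakes the queued tasks, drops the refs */
}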
@@ -1111,8 +1081,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1111 struct sem_undo *un, *tu; 1081 struct sem_undo *un, *tu;
1112 struct sem_queue *q, *tq; 1082 struct sem_queue *q, *tq;
1113 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); 1083 struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
1114 struct list_head tasks;
1115 int i; 1084 int i;
1085 DEFINE_WAKE_Q(wake_q);
1116 1086
1117 /* Free the existing undo structures for this semaphore set. */ 1087 /* Free the existing undo structures for this semaphore set. */
1118 ipc_assert_locked_object(&sma->sem_perm); 1088 ipc_assert_locked_object(&sma->sem_perm);
@@ -1126,25 +1096,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1126 } 1096 }
1127 1097
1128 /* Wake up all pending processes and let them fail with EIDRM. */ 1098 /* Wake up all pending processes and let them fail with EIDRM. */
1129 INIT_LIST_HEAD(&tasks);
1130 list_for_each_entry_safe(q, tq, &sma->pending_const, list) { 1099 list_for_each_entry_safe(q, tq, &sma->pending_const, list) {
1131 unlink_queue(sma, q); 1100 unlink_queue(sma, q);
1132 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1101 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1133 } 1102 }
1134 1103
1135 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { 1104 list_for_each_entry_safe(q, tq, &sma->pending_alter, list) {
1136 unlink_queue(sma, q); 1105 unlink_queue(sma, q);
1137 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1106 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1138 } 1107 }
1139 for (i = 0; i < sma->sem_nsems; i++) { 1108 for (i = 0; i < sma->sem_nsems; i++) {
1140 struct sem *sem = sma->sem_base + i; 1109 struct sem *sem = sma->sem_base + i;
1141 list_for_each_entry_safe(q, tq, &sem->pending_const, list) { 1110 list_for_each_entry_safe(q, tq, &sem->pending_const, list) {
1142 unlink_queue(sma, q); 1111 unlink_queue(sma, q);
1143 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1112 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1144 } 1113 }
1145 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { 1114 list_for_each_entry_safe(q, tq, &sem->pending_alter, list) {
1146 unlink_queue(sma, q); 1115 unlink_queue(sma, q);
1147 wake_up_sem_queue_prepare(&tasks, q, -EIDRM); 1116 wake_up_sem_queue_prepare(q, -EIDRM, &wake_q);
1148 } 1117 }
1149 } 1118 }
1150 1119
@@ -1153,7 +1122,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
1153 sem_unlock(sma, -1); 1122 sem_unlock(sma, -1);
1154 rcu_read_unlock(); 1123 rcu_read_unlock();
1155 1124
1156 wake_up_sem_queue_do(&tasks); 1125 wake_up_q(&wake_q);
1157 ns->used_sems -= sma->sem_nsems; 1126 ns->used_sems -= sma->sem_nsems;
1158 ipc_rcu_putref(sma, sem_rcu_free); 1127 ipc_rcu_putref(sma, sem_rcu_free);
1159} 1128}
@@ -1292,9 +1261,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1292 struct sem_undo *un; 1261 struct sem_undo *un;
1293 struct sem_array *sma; 1262 struct sem_array *sma;
1294 struct sem *curr; 1263 struct sem *curr;
1295 int err; 1264 int err, val;
1296 struct list_head tasks; 1265 DEFINE_WAKE_Q(wake_q);
1297 int val; 1266
1298#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) 1267#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN)
1299 /* big-endian 64bit */ 1268 /* big-endian 64bit */
1300 val = arg >> 32; 1269 val = arg >> 32;
@@ -1306,8 +1275,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1306 if (val > SEMVMX || val < 0) 1275 if (val > SEMVMX || val < 0)
1307 return -ERANGE; 1276 return -ERANGE;
1308 1277
1309 INIT_LIST_HEAD(&tasks);
1310
1311 rcu_read_lock(); 1278 rcu_read_lock();
1312 sma = sem_obtain_object_check(ns, semid); 1279 sma = sem_obtain_object_check(ns, semid);
1313 if (IS_ERR(sma)) { 1280 if (IS_ERR(sma)) {
@@ -1350,10 +1317,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum,
1350 curr->sempid = task_tgid_vnr(current); 1317 curr->sempid = task_tgid_vnr(current);
1351 sma->sem_ctime = get_seconds(); 1318 sma->sem_ctime = get_seconds();
1352 /* maybe some queued-up processes were waiting for this */ 1319 /* maybe some queued-up processes were waiting for this */
1353 do_smart_update(sma, NULL, 0, 0, &tasks); 1320 do_smart_update(sma, NULL, 0, 0, &wake_q);
1354 sem_unlock(sma, -1); 1321 sem_unlock(sma, -1);
1355 rcu_read_unlock(); 1322 rcu_read_unlock();
1356 wake_up_sem_queue_do(&tasks); 1323 wake_up_q(&wake_q);
1357 return 0; 1324 return 0;
1358} 1325}
1359 1326
@@ -1365,9 +1332,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1365 int err, nsems; 1332 int err, nsems;
1366 ushort fast_sem_io[SEMMSL_FAST]; 1333 ushort fast_sem_io[SEMMSL_FAST];
1367 ushort *sem_io = fast_sem_io; 1334 ushort *sem_io = fast_sem_io;
1368 struct list_head tasks; 1335 DEFINE_WAKE_Q(wake_q);
1369
1370 INIT_LIST_HEAD(&tasks);
1371 1336
1372 rcu_read_lock(); 1337 rcu_read_lock();
1373 sma = sem_obtain_object_check(ns, semid); 1338 sma = sem_obtain_object_check(ns, semid);
@@ -1478,7 +1443,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum,
1478 } 1443 }
1479 sma->sem_ctime = get_seconds(); 1444 sma->sem_ctime = get_seconds();
1480 /* maybe some queued-up processes were waiting for this */ 1445 /* maybe some queued-up processes were waiting for this */
1481 do_smart_update(sma, NULL, 0, 0, &tasks); 1446 do_smart_update(sma, NULL, 0, 0, &wake_q);
1482 err = 0; 1447 err = 0;
1483 goto out_unlock; 1448 goto out_unlock;
1484 } 1449 }
@@ -1514,7 +1479,7 @@ out_unlock:
1514 sem_unlock(sma, -1); 1479 sem_unlock(sma, -1);
1515out_rcu_wakeup: 1480out_rcu_wakeup:
1516 rcu_read_unlock(); 1481 rcu_read_unlock();
1517 wake_up_sem_queue_do(&tasks); 1482 wake_up_q(&wake_q);
1518out_free: 1483out_free:
1519 if (sem_io != fast_sem_io) 1484 if (sem_io != fast_sem_io)
1520 ipc_free(sem_io); 1485 ipc_free(sem_io);
@@ -1787,32 +1752,6 @@ out:
1787 return un; 1752 return un;
1788} 1753}
1789 1754
1790
1791/**
1792 * get_queue_result - retrieve the result code from sem_queue
1793 * @q: Pointer to queue structure
1794 *
1795 * Retrieve the return code from the pending queue. If IN_WAKEUP is found in
1796 * q->status, then we must loop until the value is replaced with the final
1797 * value: This may happen if a task is woken up by an unrelated event (e.g.
1798 * signal) and in parallel the task is woken up by another task because it got
1799 * the requested semaphores.
1800 *
1801 * The function can be called with or without holding the semaphore spinlock.
1802 */
1803static int get_queue_result(struct sem_queue *q)
1804{
1805 int error;
1806
1807 error = q->status;
1808 while (unlikely(error == IN_WAKEUP)) {
1809 cpu_relax();
1810 error = q->status;
1811 }
1812
1813 return error;
1814}
1815
1816SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, 1755SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1817 unsigned, nsops, const struct timespec __user *, timeout) 1756 unsigned, nsops, const struct timespec __user *, timeout)
1818{ 1757{
@@ -1821,11 +1760,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1821 struct sembuf fast_sops[SEMOPM_FAST]; 1760 struct sembuf fast_sops[SEMOPM_FAST];
1822 struct sembuf *sops = fast_sops, *sop; 1761 struct sembuf *sops = fast_sops, *sop;
1823 struct sem_undo *un; 1762 struct sem_undo *un;
1824 int undos = 0, alter = 0, max, locknum; 1763 int max, locknum;
1764 bool undos = false, alter = false, dupsop = false;
1825 struct sem_queue queue; 1765 struct sem_queue queue;
1826 unsigned long jiffies_left = 0; 1766 unsigned long dup = 0, jiffies_left = 0;
1827 struct ipc_namespace *ns; 1767 struct ipc_namespace *ns;
1828 struct list_head tasks;
1829 1768
1830 ns = current->nsproxy->ipc_ns; 1769 ns = current->nsproxy->ipc_ns;
1831 1770
@@ -1838,10 +1777,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1838 if (sops == NULL) 1777 if (sops == NULL)
1839 return -ENOMEM; 1778 return -ENOMEM;
1840 } 1779 }
1780
1841 if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { 1781 if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) {
1842 error = -EFAULT; 1782 error = -EFAULT;
1843 goto out_free; 1783 goto out_free;
1844 } 1784 }
1785
1845 if (timeout) { 1786 if (timeout) {
1846 struct timespec _timeout; 1787 struct timespec _timeout;
1847 if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { 1788 if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) {
@@ -1855,18 +1796,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1855 } 1796 }
1856 jiffies_left = timespec_to_jiffies(&_timeout); 1797 jiffies_left = timespec_to_jiffies(&_timeout);
1857 } 1798 }
1799
1858 max = 0; 1800 max = 0;
1859 for (sop = sops; sop < sops + nsops; sop++) { 1801 for (sop = sops; sop < sops + nsops; sop++) {
1802 unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG);
1803
1860 if (sop->sem_num >= max) 1804 if (sop->sem_num >= max)
1861 max = sop->sem_num; 1805 max = sop->sem_num;
1862 if (sop->sem_flg & SEM_UNDO) 1806 if (sop->sem_flg & SEM_UNDO)
1863 undos = 1; 1807 undos = true;
1864 if (sop->sem_op != 0) 1808 if (dup & mask) {
1865 alter = 1; 1809 /*
1810 * There was a previous alter access that appears
1811 * to have accessed the same semaphore, thus use
1812 * the dupsop logic. "appears", because the detection
1813 * can only check % BITS_PER_LONG.
1814 */
1815 dupsop = true;
1816 }
1817 if (sop->sem_op != 0) {
1818 alter = true;
1819 dup |= mask;
1820 }
1866 } 1821 }
1867 1822
1868 INIT_LIST_HEAD(&tasks);
1869
1870 if (undos) { 1823 if (undos) {
1871 /* On success, find_alloc_undo takes the rcu_read_lock */ 1824 /* On success, find_alloc_undo takes the rcu_read_lock */
1872 un = find_alloc_undo(ns, semid); 1825 un = find_alloc_undo(ns, semid);
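The dup/dupsop logic added above is only a heuristic: it records one bit per (sem_num % BITS_PER_LONG) for each altering operation, so two distinct semaphores may collide on the same bit. A collision merely forces the slower dupsop path; it can never let a real duplicate go undetected. A simplified, self-contained sketch of that heuristic follows; the helper is illustrative and not part of ipc/sem.c:

#include <linux/bitops.h>       /* BITS_PER_LONG */
#include <linux/types.h>

/* Conservative duplicate detection: false positives are acceptable,
 * false negatives are not. */
static bool sketch_may_have_dup(const unsigned short *sem_nums, int nsops)
{
        unsigned long seen = 0;
        int i;

        for (i = 0; i < nsops; i++) {
                unsigned long mask = 1UL << (sem_nums[i] % BITS_PER_LONG);

                if (seen & mask)
                        return true;    /* possibly the same semaphore twice */
                seen |= mask;
        }
        return false;
}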
@@ -1887,16 +1840,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1887 } 1840 }
1888 1841
1889 error = -EFBIG; 1842 error = -EFBIG;
1890 if (max >= sma->sem_nsems) 1843 if (max >= sma->sem_nsems) {
1891 goto out_rcu_wakeup; 1844 rcu_read_unlock();
1845 goto out_free;
1846 }
1892 1847
1893 error = -EACCES; 1848 error = -EACCES;
1894 if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) 1849 if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) {
1895 goto out_rcu_wakeup; 1850 rcu_read_unlock();
1851 goto out_free;
1852 }
1896 1853
1897 error = security_sem_semop(sma, sops, nsops, alter); 1854 error = security_sem_semop(sma, sops, nsops, alter);
1898 if (error) 1855 if (error) {
1899 goto out_rcu_wakeup; 1856 rcu_read_unlock();
1857 goto out_free;
1858 }
1900 1859
1901 error = -EIDRM; 1860 error = -EIDRM;
1902 locknum = sem_lock(sma, sops, nsops); 1861 locknum = sem_lock(sma, sops, nsops);
@@ -1925,24 +1884,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1925 queue.undo = un; 1884 queue.undo = un;
1926 queue.pid = task_tgid_vnr(current); 1885 queue.pid = task_tgid_vnr(current);
1927 queue.alter = alter; 1886 queue.alter = alter;
1887 queue.dupsop = dupsop;
1928 1888
1929 error = perform_atomic_semop(sma, &queue); 1889 error = perform_atomic_semop(sma, &queue);
1930 if (error == 0) { 1890 if (error == 0) { /* non-blocking successful path */
1931 /* If the operation was successful, then do 1891 DEFINE_WAKE_Q(wake_q);
1892
1893 /*
1894 * If the operation was successful, then do
1932 * the required updates. 1895 * the required updates.
1933 */ 1896 */
1934 if (alter) 1897 if (alter)
1935 do_smart_update(sma, sops, nsops, 1, &tasks); 1898 do_smart_update(sma, sops, nsops, 1, &wake_q);
1936 else 1899 else
1937 set_semotime(sma, sops); 1900 set_semotime(sma, sops);
1901
1902 sem_unlock(sma, locknum);
1903 rcu_read_unlock();
1904 wake_up_q(&wake_q);
1905
1906 goto out_free;
1938 } 1907 }
1939 if (error <= 0) 1908 if (error < 0) /* non-blocking error path */
1940 goto out_unlock_free; 1909 goto out_unlock_free;
1941 1910
1942 /* We need to sleep on this operation, so we put the current 1911 /*
1912 * We need to sleep on this operation, so we put the current
1943 * task into the pending queue and go to sleep. 1913 * task into the pending queue and go to sleep.
1944 */ 1914 */
1945
1946 if (nsops == 1) { 1915 if (nsops == 1) {
1947 struct sem *curr; 1916 struct sem *curr;
1948 curr = &sma->sem_base[sops->sem_num]; 1917 curr = &sma->sem_base[sops->sem_num];
@@ -1971,77 +1940,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
1971 sma->complex_count++; 1940 sma->complex_count++;
1972 } 1941 }
1973 1942
1974 queue.status = -EINTR; 1943 do {
1975 queue.sleeper = current; 1944 queue.status = -EINTR;
1945 queue.sleeper = current;
1976 1946
1977sleep_again: 1947 __set_current_state(TASK_INTERRUPTIBLE);
1978 __set_current_state(TASK_INTERRUPTIBLE); 1948 sem_unlock(sma, locknum);
1979 sem_unlock(sma, locknum); 1949 rcu_read_unlock();
1980 rcu_read_unlock();
1981
1982 if (timeout)
1983 jiffies_left = schedule_timeout(jiffies_left);
1984 else
1985 schedule();
1986 1950
1987 error = get_queue_result(&queue); 1951 if (timeout)
1952 jiffies_left = schedule_timeout(jiffies_left);
1953 else
1954 schedule();
1988 1955
1989 if (error != -EINTR) { 1956 /*
1990 /* fast path: update_queue already obtained all requested 1957 * fastpath: the semop has completed; whether it succeeded or
1991 * resources. 1958 * not is, from the syscall's point of view, irrelevant at this
1992 * Perform a smp_mb(): User space could assume that semop() 1959 * point; we're done.
1993 * is a memory barrier: Without the mb(), the cpu could 1960 *
1994 * speculatively read in user space stale data that was 1961 * We _do_ care, nonetheless, about being awoken by a signal or
1995 * overwritten by the previous owner of the semaphore. 1962 * spuriously. The queue.status is checked again in the
1963 * slowpath (aka after taking sem_lock), such that we can detect
1964 * scenarios where we were awakened externally, during the
1965 * window between wake_q_add() and wake_up_q().
1996 */ 1966 */
1997 smp_mb(); 1967 error = READ_ONCE(queue.status);
1998 1968 if (error != -EINTR) {
1999 goto out_free; 1969 /*
2000 } 1970 * User space could assume that semop() is a memory
2001 1971 * barrier: Without the mb(), the cpu could
2002 rcu_read_lock(); 1972 * speculatively read in userspace stale data that was
2003 sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); 1973 * overwritten by the previous owner of the semaphore.
2004 1974 */
2005 /* 1975 smp_mb();
2006 * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. 1976 goto out_free;
2007 */ 1977 }
2008 error = get_queue_result(&queue);
2009 1978
2010 /* 1979 rcu_read_lock();
2011 * Array removed? If yes, leave without sem_unlock(). 1980 sem_lock(sma, sops, nsops);
2012 */
2013 if (IS_ERR(sma)) {
2014 rcu_read_unlock();
2015 goto out_free;
2016 }
2017 1981
1982 if (!ipc_valid_object(&sma->sem_perm))
1983 goto out_unlock_free;
2018 1984
2019 /* 1985 error = READ_ONCE(queue.status);
2020 * If queue.status != -EINTR we are woken up by another process.
2021 * Leave without unlink_queue(), but with sem_unlock().
2022 */
2023 if (error != -EINTR)
2024 goto out_unlock_free;
2025 1986
2026 /* 1987 /*
2027 * If an interrupt occurred we have to clean up the queue 1988 * If queue.status != -EINTR we are woken up by another process.
2028 */ 1989 * Leave without unlink_queue(), but with sem_unlock().
2029 if (timeout && jiffies_left == 0) 1990 */
2030 error = -EAGAIN; 1991 if (error != -EINTR)
1992 goto out_unlock_free;
2031 1993
2032 /* 1994 /*
2033 * If the wakeup was spurious, just retry 1995 * If an interrupt occurred we have to clean up the queue.
2034 */ 1996 */
2035 if (error == -EINTR && !signal_pending(current)) 1997 if (timeout && jiffies_left == 0)
2036 goto sleep_again; 1998 error = -EAGAIN;
1999 } while (error == -EINTR && !signal_pending(current)); /* spurious */
2037 2000
2038 unlink_queue(sma, &queue); 2001 unlink_queue(sma, &queue);
2039 2002
2040out_unlock_free: 2003out_unlock_free:
2041 sem_unlock(sma, locknum); 2004 sem_unlock(sma, locknum);
2042out_rcu_wakeup:
2043 rcu_read_unlock(); 2005 rcu_read_unlock();
2044 wake_up_sem_queue_do(&tasks);
2045out_free: 2006out_free:
2046 if (sops != fast_sops) 2007 if (sops != fast_sops)
2047 kfree(sops); 2008 kfree(sops);
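The new sleep loop above relies on a small protocol between sleeper and waker: the sleeper publishes queue.status = -EINTR before dropping the lock, the waker stores the final result and then queues the task on its wake_q, and the sleeper re-reads the status with READ_ONCE() both locklessly (fast path) and again under the lock, to catch wakeups that raced with wake_up_q(). A hedged, stand-alone sketch of the sleeper side; all names are illustrative, not the ipc/sem.c ones:

#include <linux/sched.h>
#include <linux/spinlock.h>

struct sketch_waiter {
        int status;                     /* -EINTR until a waker stores a result */
};

/* Called with @lock held; returns with it released on every path. */
static int sketch_wait(struct sketch_waiter *w, spinlock_t *lock)
{
        int error;

        do {
                WRITE_ONCE(w->status, -EINTR);
                __set_current_state(TASK_INTERRUPTIBLE);
                spin_unlock(lock);

                schedule();

                /* lockless fast path: a waker may already have stored a result */
                error = READ_ONCE(w->status);
                if (error != -EINTR)
                        return error;

                spin_lock(lock);
                error = READ_ONCE(w->status);   /* re-check under the lock */
        } while (error == -EINTR && !signal_pending(current));    /* spurious? */

        spin_unlock(lock);
        return error;
}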
@@ -2102,8 +2063,8 @@ void exit_sem(struct task_struct *tsk)
2102 for (;;) { 2063 for (;;) {
2103 struct sem_array *sma; 2064 struct sem_array *sma;
2104 struct sem_undo *un; 2065 struct sem_undo *un;
2105 struct list_head tasks;
2106 int semid, i; 2066 int semid, i;
2067 DEFINE_WAKE_Q(wake_q);
2107 2068
2108 cond_resched(); 2069 cond_resched();
2109 2070
@@ -2191,11 +2152,10 @@ void exit_sem(struct task_struct *tsk)
2191 } 2152 }
2192 } 2153 }
2193 /* maybe some queued-up processes were waiting for this */ 2154 /* maybe some queued-up processes were waiting for this */
2194 INIT_LIST_HEAD(&tasks); 2155 do_smart_update(sma, NULL, 0, 1, &wake_q);
2195 do_smart_update(sma, NULL, 0, 1, &tasks);
2196 sem_unlock(sma, -1); 2156 sem_unlock(sma, -1);
2197 rcu_read_unlock(); 2157 rcu_read_unlock();
2198 wake_up_sem_queue_do(&tasks); 2158 wake_up_q(&wake_q);
2199 2159
2200 kfree_rcu(un, rcu); 2160 kfree_rcu(un, rcu);
2201 } 2161 }
diff --git a/ipc/shm.c b/ipc/shm.c
index dbac8860c721..81203e8ba013 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -89,6 +89,7 @@ void shm_init_ns(struct ipc_namespace *ns)
89static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) 89static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
90{ 90{
91 struct shmid_kernel *shp; 91 struct shmid_kernel *shp;
92
92 shp = container_of(ipcp, struct shmid_kernel, shm_perm); 93 shp = container_of(ipcp, struct shmid_kernel, shm_perm);
93 94
94 if (shp->shm_nattch) { 95 if (shp->shm_nattch) {
@@ -387,6 +388,7 @@ static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
387 struct file *file = vma->vm_file; 388 struct file *file = vma->vm_file;
388 struct shm_file_data *sfd = shm_file_data(file); 389 struct shm_file_data *sfd = shm_file_data(file);
389 int err = 0; 390 int err = 0;
391
390 if (sfd->vm_ops->set_policy) 392 if (sfd->vm_ops->set_policy)
391 err = sfd->vm_ops->set_policy(vma, new); 393 err = sfd->vm_ops->set_policy(vma, new);
392 return err; 394 return err;
@@ -417,7 +419,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
417 * In case of remap_file_pages() emulation, the file can represent 419 * In case of remap_file_pages() emulation, the file can represent
418 * removed IPC ID: propagate shm_lock() error to caller. 420 * removed IPC ID: propagate shm_lock() error to caller.
419 */ 421 */
420 ret =__shm_open(vma); 422 ret = __shm_open(vma);
421 if (ret) 423 if (ret)
422 return ret; 424 return ret;
423 425
@@ -468,6 +470,7 @@ static unsigned long shm_get_unmapped_area(struct file *file,
468 unsigned long flags) 470 unsigned long flags)
469{ 471{
470 struct shm_file_data *sfd = shm_file_data(file); 472 struct shm_file_data *sfd = shm_file_data(file);
473
471 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, 474 return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len,
472 pgoff, flags); 475 pgoff, flags);
473} 476}
@@ -766,6 +769,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
766 } else { 769 } else {
767#ifdef CONFIG_SHMEM 770#ifdef CONFIG_SHMEM
768 struct shmem_inode_info *info = SHMEM_I(inode); 771 struct shmem_inode_info *info = SHMEM_I(inode);
772
769 spin_lock_irq(&info->lock); 773 spin_lock_irq(&info->lock);
770 *rss_add += inode->i_mapping->nrpages; 774 *rss_add += inode->i_mapping->nrpages;
771 *swp_add += info->swapped; 775 *swp_add += info->swapped;
@@ -1028,6 +1032,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
1028 1032
1029 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { 1033 if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
1030 kuid_t euid = current_euid(); 1034 kuid_t euid = current_euid();
1035
1031 if (!uid_eq(euid, shp->shm_perm.uid) && 1036 if (!uid_eq(euid, shp->shm_perm.uid) &&
1032 !uid_eq(euid, shp->shm_perm.cuid)) { 1037 !uid_eq(euid, shp->shm_perm.cuid)) {
1033 err = -EPERM; 1038 err = -EPERM;
@@ -1045,6 +1050,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
1045 1050
1046 if (cmd == SHM_LOCK) { 1051 if (cmd == SHM_LOCK) {
1047 struct user_struct *user = current_user(); 1052 struct user_struct *user = current_user();
1053
1048 err = shmem_lock(shm_file, 1, user); 1054 err = shmem_lock(shm_file, 1, user);
1049 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { 1055 if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
1050 shp->shm_perm.mode |= SHM_LOCKED; 1056 shp->shm_perm.mode |= SHM_LOCKED;
@@ -1354,9 +1360,10 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
1354 vma = next; 1360 vma = next;
1355 } 1361 }
1356 1362
1357#else /* CONFIG_MMU */ 1363#else /* CONFIG_MMU */
1358 /* under NOMMU conditions, the exact address to be destroyed must be 1364 /* under NOMMU conditions, the exact address to be destroyed must be
1359 * given */ 1365 * given
1366 */
1360 if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { 1367 if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
1361 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); 1368 do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
1362 retval = 0; 1369 retval = 0;
diff --git a/kernel/Makefile b/kernel/Makefile
index eaee9de224bd..12c679f769c6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
84obj-$(CONFIG_KGDB) += debug/ 84obj-$(CONFIG_KGDB) += debug/
85obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 85obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
86obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 86obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
87obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
87obj-$(CONFIG_SECCOMP) += seccomp.o 88obj-$(CONFIG_SECCOMP) += seccomp.o
88obj-$(CONFIG_RELAY) += relay.o 89obj-$(CONFIG_RELAY) += relay.o
89obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 90obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0874e2edd275..79517e5549f1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -598,11 +598,11 @@ return_normal:
598 /* 598 /*
599 * Wait for the other CPUs to be notified and be waiting for us: 599 * Wait for the other CPUs to be notified and be waiting for us:
600 */ 600 */
601 time_left = loops_per_jiffy * HZ; 601 time_left = MSEC_PER_SEC;
602 while (kgdb_do_roundup && --time_left && 602 while (kgdb_do_roundup && --time_left &&
603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != 603 (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) !=
604 online_cpus) 604 online_cpus)
605 cpu_relax(); 605 udelay(1000);
606 if (!time_left) 606 if (!time_left)
607 pr_crit("Timed out waiting for secondary CPUs.\n"); 607 pr_crit("Timed out waiting for secondary CPUs.\n");
608 608
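The debug_core hunk above changes the CPU round-up wait from counting loops_per_jiffy * HZ iterations of cpu_relax() to counting milliseconds, so the timeout is roughly one second of wall-clock time regardless of CPU speed. A small sketch of that bounded-wait shape; the helper name and callback are assumptions, not kernel API:

#include <linux/delay.h>        /* udelay() */
#include <linux/types.h>

/* @timeout_ms must be non-zero; returns false if the wait timed out. */
static bool sketch_wait_for(bool (*done)(void), unsigned int timeout_ms)
{
        while (!done() && --timeout_ms)
                udelay(1000);           /* roughly one millisecond per iteration */

        return timeout_ms != 0;
}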
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 98c9011eac78..e74be38245ad 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -30,6 +30,7 @@
30char kdb_prompt_str[CMD_BUFLEN]; 30char kdb_prompt_str[CMD_BUFLEN];
31 31
32int kdb_trap_printk; 32int kdb_trap_printk;
33int kdb_printf_cpu = -1;
33 34
34static int kgdb_transition_check(char *buffer) 35static int kgdb_transition_check(char *buffer)
35{ 36{
@@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
554 int linecount; 555 int linecount;
555 int colcount; 556 int colcount;
556 int logging, saved_loglevel = 0; 557 int logging, saved_loglevel = 0;
557 int saved_trap_printk;
558 int got_printf_lock = 0;
559 int retlen = 0; 558 int retlen = 0;
560 int fnd, len; 559 int fnd, len;
560 int this_cpu, old_cpu;
561 char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; 561 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
562 char *moreprompt = "more> "; 562 char *moreprompt = "more> ";
563 struct console *c = console_drivers; 563 struct console *c = console_drivers;
564 static DEFINE_SPINLOCK(kdb_printf_lock);
565 unsigned long uninitialized_var(flags); 564 unsigned long uninitialized_var(flags);
566 565
567 preempt_disable();
568 saved_trap_printk = kdb_trap_printk;
569 kdb_trap_printk = 0;
570
571 /* Serialize kdb_printf if multiple cpus try to write at once. 566 /* Serialize kdb_printf if multiple cpus try to write at once.
572 * But if any cpu goes recursive in kdb, just print the output, 567 * But if any cpu goes recursive in kdb, just print the output,
573 * even if it is interleaved with any other text. 568 * even if it is interleaved with any other text.
574 */ 569 */
575 if (!KDB_STATE(PRINTF_LOCK)) { 570 local_irq_save(flags);
576 KDB_STATE_SET(PRINTF_LOCK); 571 this_cpu = smp_processor_id();
577 spin_lock_irqsave(&kdb_printf_lock, flags); 572 for (;;) {
578 got_printf_lock = 1; 573 old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu);
579 atomic_inc(&kdb_event); 574 if (old_cpu == -1 || old_cpu == this_cpu)
580 } else { 575 break;
581 __acquire(kdb_printf_lock); 576
577 cpu_relax();
582 } 578 }
583 579
584 diag = kdbgetintenv("LINES", &linecount); 580 diag = kdbgetintenv("LINES", &linecount);
@@ -847,16 +843,9 @@ kdb_print_out:
847 suspend_grep = 0; /* end of what may have been a recursive call */ 843 suspend_grep = 0; /* end of what may have been a recursive call */
848 if (logging) 844 if (logging)
849 console_loglevel = saved_loglevel; 845 console_loglevel = saved_loglevel;
850 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { 846 /* kdb_printf_cpu locked the code above. */
851 got_printf_lock = 0; 847 smp_store_release(&kdb_printf_cpu, old_cpu);
852 spin_unlock_irqrestore(&kdb_printf_lock, flags); 848 local_irq_restore(flags);
853 KDB_STATE_CLEAR(PRINTF_LOCK);
854 atomic_dec(&kdb_event);
855 } else {
856 __release(kdb_printf_lock);
857 }
858 kdb_trap_printk = saved_trap_printk;
859 preempt_enable();
860 return retlen; 849 return retlen;
861} 850}
862 851
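The kdb_printf changes above replace the old spinlock-plus-state-flag scheme with a single "owning cpu" word: a cpu acquires it by cmpxchg()-ing kdb_printf_cpu from -1 to its own id (re-entry from the same cpu falls straight through), and releases it with smp_store_release(). A condensed sketch of that acquire/release pair with illustrative names; callers are assumed to have interrupts disabled, as the real code does via local_irq_save():

#include <linux/atomic.h>       /* cmpxchg() */
#include <linux/smp.h>          /* smp_processor_id() */

static int sketch_owner_cpu = -1;

/* Returns the previous owner so a recursive caller can restore it. */
static int sketch_lock_cpu(void)
{
        int this_cpu = smp_processor_id();
        int old;

        for (;;) {
                old = cmpxchg(&sketch_owner_cpu, -1, this_cpu);
                if (old == -1 || old == this_cpu)
                        return old;     /* got it, or already ours (recursion) */
                cpu_relax();
        }
}

static void sketch_unlock_cpu(int old)
{
        /* publish the release; the next cmpxchg() acquirer observes it */
        smp_store_release(&sketch_owner_cpu, old);
}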
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2a20c0dfdafc..ca183919d302 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -60,7 +60,6 @@ int kdb_grep_trailing;
60 * Kernel debugger state flags 60 * Kernel debugger state flags
61 */ 61 */
62int kdb_flags; 62int kdb_flags;
63atomic_t kdb_event;
64 63
65/* 64/*
66 * kdb_lock protects updates to kdb_initial_cpu. Used to 65 * kdb_lock protects updates to kdb_initial_cpu. Used to
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 75014d7f4568..fc224fbcf954 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -132,7 +132,6 @@ extern int kdb_state;
132#define KDB_STATE_PAGER 0x00000400 /* pager is available */ 132#define KDB_STATE_PAGER 0x00000400 /* pager is available */
133#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching 133#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
134 * back to initial cpu */ 134 * back to initial cpu */
135#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
136#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ 135#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
137#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ 136#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
138#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been 137#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f9ec9add2164..215871bda3a2 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
301retry: 301retry:
302 /* Read the page with vaddr into memory */ 302 /* Read the page with vaddr into memory */
303 ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, 303 ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page,
304 &vma); 304 &vma, NULL);
305 if (ret <= 0) 305 if (ret <= 0)
306 return ret; 306 return ret;
307 307
@@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
1712 * essentially a kernel access to the memory. 1712 * essentially a kernel access to the memory.
1713 */ 1713 */
1714 result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, 1714 result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page,
1715 NULL); 1715 NULL, NULL);
1716 if (result < 0) 1716 if (result < 0)
1717 return result; 1717 return result;
1718 1718
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 3cbb0c879705..cc2fa35ca480 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -1,11 +1,16 @@
1#define pr_fmt(fmt) "kcov: " fmt 1#define pr_fmt(fmt) "kcov: " fmt
2 2
3#define DISABLE_BRANCH_PROFILING 3#define DISABLE_BRANCH_PROFILING
4#include <linux/atomic.h>
4#include <linux/compiler.h> 5#include <linux/compiler.h>
6#include <linux/errno.h>
7#include <linux/export.h>
5#include <linux/types.h> 8#include <linux/types.h>
6#include <linux/file.h> 9#include <linux/file.h>
7#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/init.h>
8#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/preempt.h>
9#include <linux/printk.h> 14#include <linux/printk.h>
10#include <linux/sched.h> 15#include <linux/sched.h>
11#include <linux/slab.h> 16#include <linux/slab.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 561675589511..5617cc412444 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
441 while (hole_end <= crashk_res.end) { 441 while (hole_end <= crashk_res.end) {
442 unsigned long i; 442 unsigned long i;
443 443
444 cond_resched();
445
444 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) 446 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
445 break; 447 break;
446 /* See if I overlap any of the segments */ 448 /* See if I overlap any of the segments */
@@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void)
1467#endif 1469#endif
1468 VMCOREINFO_NUMBER(PG_head_mask); 1470 VMCOREINFO_NUMBER(PG_head_mask);
1469 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); 1471 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1470#ifdef CONFIG_X86
1471 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
1472#endif
1473#ifdef CONFIG_HUGETLB_PAGE 1472#ifdef CONFIG_HUGETLB_PAGE
1474 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); 1473 VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR);
1475#endif 1474#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 577f2288d19f..a3ce35e0fa1e 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args)
1926 int r; 1926 int r;
1927 1927
1928#ifdef CONFIG_KGDB_KDB 1928#ifdef CONFIG_KGDB_KDB
1929 if (unlikely(kdb_trap_printk)) { 1929 /* Allow passing printk() to kdb but avoid recursion. */
1930 if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) {
1930 r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); 1931 r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
1931 return r; 1932 return r;
1932 } 1933 }
diff --git a/kernel/relay.c b/kernel/relay.c
index da79a109dbeb..8f18d314a96a 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan,
809{ 809{
810 struct rchan_buf *buf; 810 struct rchan_buf *buf;
811 811
812 if (!chan) 812 if (!chan || cpu >= NR_CPUS)
813 return; 813 return;
814 814
815 buf = *per_cpu_ptr(chan->buf, cpu); 815 buf = *per_cpu_ptr(chan->buf, cpu);
816 if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) 816 if (!buf || subbufs_consumed > chan->n_subbufs)
817 return; 817 return;
818 818
819 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) 819 if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
diff --git a/kernel/signal.c b/kernel/signal.c
index 29a410780aa9..ae60996fedff 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset)
2491{ 2491{
2492 struct task_struct *tsk = current; 2492 struct task_struct *tsk = current;
2493 2493
2494 /*
2495 * In case the signal mask hasn't changed, there is nothing we need
2496 * to do. The current->blocked shouldn't be modified by any other task.
2497 */
2498 if (sigequalsets(&tsk->blocked, newset))
2499 return;
2500
2494 spin_lock_irq(&tsk->sighand->siglock); 2501 spin_lock_irq(&tsk->sighand->siglock);
2495 __set_task_blocked(tsk, newset); 2502 __set_task_blocked(tsk, newset);
2496 spin_unlock_irq(&tsk->sighand->siglock); 2503 spin_unlock_irq(&tsk->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 39b3368f6de6..1475d2545b7e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void)
2389#ifdef CONFIG_COREDUMP 2389#ifdef CONFIG_COREDUMP
2390 if (suid_dumpable == SUID_DUMP_ROOT && 2390 if (suid_dumpable == SUID_DUMP_ROOT &&
2391 core_pattern[0] != '/' && core_pattern[0] != '|') { 2391 core_pattern[0] != '/' && core_pattern[0] != '|') {
2392 printk(KERN_WARNING "Unsafe core_pattern used with "\ 2392 printk(KERN_WARNING
2393 "suid_dumpable=2. Pipe handler or fully qualified "\ 2393"Unsafe core_pattern used with fs.suid_dumpable=2.\n"
2394 "core dump path required.\n"); 2394"Pipe handler or fully qualified core dump path required.\n"
2395"Set kernel.core_pattern before fs.suid_dumpable.\n"
2396 );
2395 } 2397 }
2396#endif 2398#endif
2397} 2399}
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 6eb99c17dbd8..ece4b177052b 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1354 "warning: process `%s' used the deprecated sysctl " 1354 "warning: process `%s' used the deprecated sysctl "
1355 "system call with ", current->comm); 1355 "system call with ", current->comm);
1356 for (i = 0; i < nlen; i++) 1356 for (i = 0; i < nlen; i++)
1357 printk("%d.", name[i]); 1357 printk(KERN_CONT "%d.", name[i]);
1358 printk("\n"); 1358 printk(KERN_CONT "\n");
1359 } 1359 }
1360 return; 1360 return;
1361} 1361}
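The KERN_CONT change above matters because, after the printk continuation-line rework, a printk() call without a continuation marker starts a new message; assembling one line from several calls therefore needs KERN_CONT on the follow-up pieces. A tiny illustration, closely following the hunk above (the helper name is illustrative):

#include <linux/kernel.h>       /* printk(), KERN_WARNING, KERN_CONT */

static void sketch_print_name(const int *name, int nlen)
{
        int i;

        printk(KERN_WARNING "deprecated sysctl used with ");
        for (i = 0; i < nlen; i++)
                printk(KERN_CONT "%d.", name[i]);       /* continue the same line */
        printk(KERN_CONT "\n");
}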
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9b08ca391aed..3921cf7fea8e 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
516 516
517 spin_lock_irqsave(&ptr->it_lock, flags); 517 spin_lock_irqsave(&ptr->it_lock, flags);
518 if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { 518 if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
519 if (posix_timer_event(ptr, 0) != 0) 519 if (IS_ENABLED(CONFIG_POSIX_TIMERS) &&
520 posix_timer_event(ptr, 0) != 0)
520 ptr->it_overrun++; 521 ptr->it_overrun++;
521 } 522 }
522 523
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f280ec..d4b0fa01cae3 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,32 +24,14 @@
24 24
25#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
26#include <linux/kvm_para.h> 26#include <linux/kvm_para.h>
27#include <linux/perf_event.h>
28#include <linux/kthread.h> 27#include <linux/kthread.h>
29 28
30/*
31 * The run state of the lockup detectors is controlled by the content of the
32 * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
33 * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
34 *
35 * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
36 * are variables that are only used as an 'interface' between the parameters
37 * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
38 * 'watchdog_thresh' variable is handled differently because its value is not
39 * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
40 * is equal zero.
41 */
42#define NMI_WATCHDOG_ENABLED_BIT 0
43#define SOFT_WATCHDOG_ENABLED_BIT 1
44#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
45#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
46
47static DEFINE_MUTEX(watchdog_proc_mutex); 29static DEFINE_MUTEX(watchdog_proc_mutex);
48 30
49#ifdef CONFIG_HARDLOCKUP_DETECTOR 31#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
50static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; 32unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
51#else 33#else
52static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; 34unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
53#endif 35#endif
54int __read_mostly nmi_watchdog_enabled; 36int __read_mostly nmi_watchdog_enabled;
55int __read_mostly soft_watchdog_enabled; 37int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
59#ifdef CONFIG_SMP 41#ifdef CONFIG_SMP
60int __read_mostly sysctl_softlockup_all_cpu_backtrace; 42int __read_mostly sysctl_softlockup_all_cpu_backtrace;
61int __read_mostly sysctl_hardlockup_all_cpu_backtrace; 43int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
62#else
63#define sysctl_softlockup_all_cpu_backtrace 0
64#define sysctl_hardlockup_all_cpu_backtrace 0
65#endif 44#endif
66static struct cpumask watchdog_cpumask __read_mostly; 45static struct cpumask watchdog_cpumask __read_mostly;
67unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); 46unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
100static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); 79static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
101static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); 80static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
102static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); 81static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
103#ifdef CONFIG_HARDLOCKUP_DETECTOR
104static DEFINE_PER_CPU(bool, hard_watchdog_warn);
105static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
106static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); 82static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
107static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
108#endif
109static unsigned long soft_lockup_nmi_warn; 83static unsigned long soft_lockup_nmi_warn;
110 84
111/* boot commands */
112/*
113 * Should we panic when a soft-lockup or hard-lockup occurs:
114 */
115#ifdef CONFIG_HARDLOCKUP_DETECTOR
116unsigned int __read_mostly hardlockup_panic =
117 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
118static unsigned long hardlockup_allcpu_dumped;
119/*
120 * We may not want to enable hard lockup detection by default in all cases,
121 * for example when running the kernel as a guest on a hypervisor. In these
122 * cases this function can be called to disable hard lockup detection. This
123 * function should only be executed once by the boot processor before the
124 * kernel command line parameters are parsed, because otherwise it is not
125 * possible to override this in hardlockup_panic_setup().
126 */
127void hardlockup_detector_disable(void)
128{
129 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
130}
131
132static int __init hardlockup_panic_setup(char *str)
133{
134 if (!strncmp(str, "panic", 5))
135 hardlockup_panic = 1;
136 else if (!strncmp(str, "nopanic", 7))
137 hardlockup_panic = 0;
138 else if (!strncmp(str, "0", 1))
139 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
140 else if (!strncmp(str, "1", 1))
141 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
142 return 1;
143}
144__setup("nmi_watchdog=", hardlockup_panic_setup);
145#endif
146
147unsigned int __read_mostly softlockup_panic = 85unsigned int __read_mostly softlockup_panic =
148 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; 86 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
149 87
@@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void)
264 wq_watchdog_touch(-1); 202 wq_watchdog_touch(-1);
265} 203}
266 204
267#ifdef CONFIG_HARDLOCKUP_DETECTOR
268void touch_nmi_watchdog(void)
269{
270 /*
271 * Using __raw here because some code paths have
272 * preemption enabled. If preemption is enabled
273 * then interrupts should be enabled too, in which
274 * case we shouldn't have to worry about the watchdog
275 * going off.
276 */
277 raw_cpu_write(watchdog_nmi_touch, true);
278 touch_softlockup_watchdog();
279}
280EXPORT_SYMBOL(touch_nmi_watchdog);
281
282#endif
283
284void touch_softlockup_watchdog_sync(void) 205void touch_softlockup_watchdog_sync(void)
285{ 206{
286 __this_cpu_write(softlockup_touch_sync, true); 207 __this_cpu_write(softlockup_touch_sync, true);
287 __this_cpu_write(watchdog_touch_ts, 0); 208 __this_cpu_write(watchdog_touch_ts, 0);
288} 209}
289 210
290#ifdef CONFIG_HARDLOCKUP_DETECTOR
291/* watchdog detector functions */ 211/* watchdog detector functions */
292static bool is_hardlockup(void) 212bool is_hardlockup(void)
293{ 213{
294 unsigned long hrint = __this_cpu_read(hrtimer_interrupts); 214 unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
295 215
@@ -299,7 +219,6 @@ static bool is_hardlockup(void)
299 __this_cpu_write(hrtimer_interrupts_saved, hrint); 219 __this_cpu_write(hrtimer_interrupts_saved, hrint);
300 return false; 220 return false;
301} 221}
302#endif
303 222
304static int is_softlockup(unsigned long touch_ts) 223static int is_softlockup(unsigned long touch_ts)
305{ 224{
@@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts)
313 return 0; 232 return 0;
314} 233}
315 234
316#ifdef CONFIG_HARDLOCKUP_DETECTOR
317
318static struct perf_event_attr wd_hw_attr = {
319 .type = PERF_TYPE_HARDWARE,
320 .config = PERF_COUNT_HW_CPU_CYCLES,
321 .size = sizeof(struct perf_event_attr),
322 .pinned = 1,
323 .disabled = 1,
324};
325
326/* Callback function for perf event subsystem */
327static void watchdog_overflow_callback(struct perf_event *event,
328 struct perf_sample_data *data,
329 struct pt_regs *regs)
330{
331 /* Ensure the watchdog never gets throttled */
332 event->hw.interrupts = 0;
333
334 if (__this_cpu_read(watchdog_nmi_touch) == true) {
335 __this_cpu_write(watchdog_nmi_touch, false);
336 return;
337 }
338
339 /* check for a hardlockup
340 * This is done by making sure our timer interrupt
341 * is incrementing. The timer interrupt should have
342 * fired multiple times before we overflow'd. If it hasn't
343 * then this is a good indication the cpu is stuck
344 */
345 if (is_hardlockup()) {
346 int this_cpu = smp_processor_id();
347 struct pt_regs *regs = get_irq_regs();
348
349 /* only print hardlockups once */
350 if (__this_cpu_read(hard_watchdog_warn) == true)
351 return;
352
353 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
354 print_modules();
355 print_irqtrace_events(current);
356 if (regs)
357 show_regs(regs);
358 else
359 dump_stack();
360
361 /*
362 * Perform all-CPU dump only once to avoid multiple hardlockups
363 * generating interleaving traces
364 */
365 if (sysctl_hardlockup_all_cpu_backtrace &&
366 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
367 trigger_allbutself_cpu_backtrace();
368
369 if (hardlockup_panic)
370 nmi_panic(regs, "Hard LOCKUP");
371
372 __this_cpu_write(hard_watchdog_warn, true);
373 return;
374 }
375
376 __this_cpu_write(hard_watchdog_warn, false);
377 return;
378}
379#endif /* CONFIG_HARDLOCKUP_DETECTOR */
380
381static void watchdog_interrupt_count(void) 235static void watchdog_interrupt_count(void)
382{ 236{
383 __this_cpu_inc(hrtimer_interrupts); 237 __this_cpu_inc(hrtimer_interrupts);
384} 238}
385 239
386static int watchdog_nmi_enable(unsigned int cpu); 240/*
387static void watchdog_nmi_disable(unsigned int cpu); 241 * These two functions are mostly architecture specific,
 242 * so define them as weak here.
243 */
244int __weak watchdog_nmi_enable(unsigned int cpu)
245{
246 return 0;
247}
248void __weak watchdog_nmi_disable(unsigned int cpu)
249{
250}
388 251
389static int watchdog_enable_all_cpus(void); 252static int watchdog_enable_all_cpus(void);
390static void watchdog_disable_all_cpus(void); 253static void watchdog_disable_all_cpus(void);
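The hunk above turns watchdog_nmi_enable()/watchdog_nmi_disable() into weak, do-nothing defaults; the perf-based implementation that used to live here moves to the new kernel/watchdog_hld.c, whose strong definitions override the weak ones at link time when CONFIG_HARDLOCKUP_DETECTOR is set. A condensed view of the two sides of that pattern, taken from the diff itself and trimmed to comments (the two definitions live in separate files, which is what makes the override legal):

/* kernel/watchdog.c: weak defaults, used when no hardlockup detector is built */
int __weak watchdog_nmi_enable(unsigned int cpu)
{
        return 0;
}

void __weak watchdog_nmi_disable(unsigned int cpu)
{
}

/* kernel/watchdog_hld.c (CONFIG_HARDLOCKUP_DETECTOR=y): the strong definition
 * replaces the weak one at link time. */
int watchdog_nmi_enable(unsigned int cpu)
{
        /* ... create and enable the per-cpu perf event ... */
        return 0;
}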
@@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu)
577 watchdog_nmi_disable(cpu); 440 watchdog_nmi_disable(cpu);
578} 441}
579 442
580#ifdef CONFIG_HARDLOCKUP_DETECTOR
581/*
582 * People like the simple clean cpu node info on boot.
583 * Reduce the watchdog noise by only printing messages
584 * that are different from what cpu0 displayed.
585 */
586static unsigned long cpu0_err;
587
588static int watchdog_nmi_enable(unsigned int cpu)
589{
590 struct perf_event_attr *wd_attr;
591 struct perf_event *event = per_cpu(watchdog_ev, cpu);
592
593 /* nothing to do if the hard lockup detector is disabled */
594 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
595 goto out;
596
597 /* is it already setup and enabled? */
598 if (event && event->state > PERF_EVENT_STATE_OFF)
599 goto out;
600
601 /* it is setup but not enabled */
602 if (event != NULL)
603 goto out_enable;
604
605 wd_attr = &wd_hw_attr;
606 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
607
608 /* Try to register using hardware perf events */
609 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
610
611 /* save cpu0 error for future comparision */
612 if (cpu == 0 && IS_ERR(event))
613 cpu0_err = PTR_ERR(event);
614
615 if (!IS_ERR(event)) {
616 /* only print for cpu0 or different than cpu0 */
617 if (cpu == 0 || cpu0_err)
618 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
619 goto out_save;
620 }
621
622 /*
623 * Disable the hard lockup detector if _any_ CPU fails to set up
624 * set up the hardware perf event. The watchdog() function checks
625 * the NMI_WATCHDOG_ENABLED bit periodically.
626 *
627 * The barriers are for syncing up watchdog_enabled across all the
628 * cpus, as clear_bit() does not use barriers.
629 */
630 smp_mb__before_atomic();
631 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
632 smp_mb__after_atomic();
633
634 /* skip displaying the same error again */
635 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
636 return PTR_ERR(event);
637
638 /* vary the KERN level based on the returned errno */
639 if (PTR_ERR(event) == -EOPNOTSUPP)
640 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
641 else if (PTR_ERR(event) == -ENOENT)
642 pr_warn("disabled (cpu%i): hardware events not enabled\n",
643 cpu);
644 else
645 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
646 cpu, PTR_ERR(event));
647
648 pr_info("Shutting down hard lockup detector on all cpus\n");
649
650 return PTR_ERR(event);
651
652 /* success path */
653out_save:
654 per_cpu(watchdog_ev, cpu) = event;
655out_enable:
656 perf_event_enable(per_cpu(watchdog_ev, cpu));
657out:
658 return 0;
659}
660
661static void watchdog_nmi_disable(unsigned int cpu)
662{
663 struct perf_event *event = per_cpu(watchdog_ev, cpu);
664
665 if (event) {
666 perf_event_disable(event);
667 per_cpu(watchdog_ev, cpu) = NULL;
668
669 /* should be in cleanup, but blocks oprofile */
670 perf_event_release_kernel(event);
671 }
672 if (cpu == 0) {
673 /* watchdog_nmi_enable() expects this to be zero initially. */
674 cpu0_err = 0;
675 }
676}
677
678#else
679static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
680static void watchdog_nmi_disable(unsigned int cpu) { return; }
681#endif /* CONFIG_HARDLOCKUP_DETECTOR */
682
683static struct smp_hotplug_thread watchdog_threads = { 443static struct smp_hotplug_thread watchdog_threads = {
684 .store = &softlockup_watchdog, 444 .store = &softlockup_watchdog,
685 .thread_should_run = watchdog_should_run, 445 .thread_should_run = watchdog_should_run,
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
new file mode 100644
index 000000000000..84016c8aee6b
--- /dev/null
+++ b/kernel/watchdog_hld.c
@@ -0,0 +1,227 @@
1/*
2 * Detect hard lockups on a system
3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 *
6 * Note: Most of this code is borrowed heavily from the original softlockup
7 * detector, so thanks to Ingo for the initial implementation.
8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
9 * to those contributors as well.
10 */
11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
14#include <linux/nmi.h>
15#include <linux/module.h>
16#include <asm/irq_regs.h>
17#include <linux/perf_event.h>
18
19static DEFINE_PER_CPU(bool, hard_watchdog_warn);
20static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
21static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
22
23/* boot commands */
24/*
25 * Should we panic when a soft-lockup or hard-lockup occurs:
26 */
27unsigned int __read_mostly hardlockup_panic =
28 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
29static unsigned long hardlockup_allcpu_dumped;
30/*
31 * We may not want to enable hard lockup detection by default in all cases,
32 * for example when running the kernel as a guest on a hypervisor. In these
33 * cases this function can be called to disable hard lockup detection. This
34 * function should only be executed once by the boot processor before the
35 * kernel command line parameters are parsed, because otherwise it is not
36 * possible to override this in hardlockup_panic_setup().
37 */
38void hardlockup_detector_disable(void)
39{
40 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
41}
42
43static int __init hardlockup_panic_setup(char *str)
44{
45 if (!strncmp(str, "panic", 5))
46 hardlockup_panic = 1;
47 else if (!strncmp(str, "nopanic", 7))
48 hardlockup_panic = 0;
49 else if (!strncmp(str, "0", 1))
50 watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
51 else if (!strncmp(str, "1", 1))
52 watchdog_enabled |= NMI_WATCHDOG_ENABLED;
53 return 1;
54}
55__setup("nmi_watchdog=", hardlockup_panic_setup);
56
57void touch_nmi_watchdog(void)
58{
59 /*
60 * Using __raw here because some code paths have
61 * preemption enabled. If preemption is enabled
62 * then interrupts should be enabled too, in which
63 * case we shouldn't have to worry about the watchdog
64 * going off.
65 */
66 raw_cpu_write(watchdog_nmi_touch, true);
67 touch_softlockup_watchdog();
68}
69EXPORT_SYMBOL(touch_nmi_watchdog);
70
71static struct perf_event_attr wd_hw_attr = {
72 .type = PERF_TYPE_HARDWARE,
73 .config = PERF_COUNT_HW_CPU_CYCLES,
74 .size = sizeof(struct perf_event_attr),
75 .pinned = 1,
76 .disabled = 1,
77};
78
79/* Callback function for perf event subsystem */
80static void watchdog_overflow_callback(struct perf_event *event,
81 struct perf_sample_data *data,
82 struct pt_regs *regs)
83{
84 /* Ensure the watchdog never gets throttled */
85 event->hw.interrupts = 0;
86
87 if (__this_cpu_read(watchdog_nmi_touch) == true) {
88 __this_cpu_write(watchdog_nmi_touch, false);
89 return;
90 }
91
92 /* check for a hardlockup
93 * This is done by making sure our timer interrupt
94 * is incrementing. The timer interrupt should have
95 * fired multiple times before we overflow'd. If it hasn't
96 * then this is a good indication the cpu is stuck
97 */
98 if (is_hardlockup()) {
99 int this_cpu = smp_processor_id();
100
101 /* only print hardlockups once */
102 if (__this_cpu_read(hard_watchdog_warn) == true)
103 return;
104
105 pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
106 print_modules();
107 print_irqtrace_events(current);
108 if (regs)
109 show_regs(regs);
110 else
111 dump_stack();
112
113 /*
114 * Perform all-CPU dump only once to avoid multiple hardlockups
115 * generating interleaving traces
116 */
117 if (sysctl_hardlockup_all_cpu_backtrace &&
118 !test_and_set_bit(0, &hardlockup_allcpu_dumped))
119 trigger_allbutself_cpu_backtrace();
120
121 if (hardlockup_panic)
122 nmi_panic(regs, "Hard LOCKUP");
123
124 __this_cpu_write(hard_watchdog_warn, true);
125 return;
126 }
127
128 __this_cpu_write(hard_watchdog_warn, false);
129 return;
130}
131
132/*
133 * People like the simple clean cpu node info on boot.
134 * Reduce the watchdog noise by only printing messages
135 * that are different from what cpu0 displayed.
136 */
137static unsigned long cpu0_err;
138
139int watchdog_nmi_enable(unsigned int cpu)
140{
141 struct perf_event_attr *wd_attr;
142 struct perf_event *event = per_cpu(watchdog_ev, cpu);
143
144 /* nothing to do if the hard lockup detector is disabled */
145 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
146 goto out;
147
148 /* is it already setup and enabled? */
149 if (event && event->state > PERF_EVENT_STATE_OFF)
150 goto out;
151
152 /* it is setup but not enabled */
153 if (event != NULL)
154 goto out_enable;
155
156 wd_attr = &wd_hw_attr;
157 wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
158
159 /* Try to register using hardware perf events */
160 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
161
 162	/* save cpu0 error for future comparison */
163 if (cpu == 0 && IS_ERR(event))
164 cpu0_err = PTR_ERR(event);
165
166 if (!IS_ERR(event)) {
167 /* only print for cpu0 or different than cpu0 */
168 if (cpu == 0 || cpu0_err)
169 pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
170 goto out_save;
171 }
172
173 /*
 174	 * Disable the hard lockup detector if _any_ CPU fails to set up
 175	 * the hardware perf event. The watchdog() function checks
176 * the NMI_WATCHDOG_ENABLED bit periodically.
177 *
178 * The barriers are for syncing up watchdog_enabled across all the
179 * cpus, as clear_bit() does not use barriers.
180 */
181 smp_mb__before_atomic();
182 clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
183 smp_mb__after_atomic();
184
185 /* skip displaying the same error again */
186 if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
187 return PTR_ERR(event);
188
189 /* vary the KERN level based on the returned errno */
190 if (PTR_ERR(event) == -EOPNOTSUPP)
191 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
192 else if (PTR_ERR(event) == -ENOENT)
193 pr_warn("disabled (cpu%i): hardware events not enabled\n",
194 cpu);
195 else
196 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
197 cpu, PTR_ERR(event));
198
199 pr_info("Shutting down hard lockup detector on all cpus\n");
200
201 return PTR_ERR(event);
202
203 /* success path */
204out_save:
205 per_cpu(watchdog_ev, cpu) = event;
206out_enable:
207 perf_event_enable(per_cpu(watchdog_ev, cpu));
208out:
209 return 0;
210}
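[Editorial note] The sample period passed to perf above comes from hw_nmi_get_sample_period(), an arch hook. On x86 it is, roughly, the number of CPU cycles in watchdog_thresh seconds; the sketch below reflects arch/x86/kernel/apic/hw_nmi.c at around this time and is included for orientation only.

/*
 * Roughly the x86 implementation: fire the NMI after watchdog_thresh
 * seconds worth of unhalted CPU cycles (cpu_khz is in kHz, hence * 1000).
 */
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
}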
211
212void watchdog_nmi_disable(unsigned int cpu)
213{
214 struct perf_event *event = per_cpu(watchdog_ev, cpu);
215
216 if (event) {
217 perf_event_disable(event);
218 per_cpu(watchdog_ev, cpu) = NULL;
219
220 /* should be in cleanup, but blocks oprofile */
221 perf_event_release_kernel(event);
222 }
223 if (cpu == 0) {
224 /* watchdog_nmi_enable() expects this to be zero initially. */
225 cpu0_err = 0;
226 }
227}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e6327d102184..7446097f72bd 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -194,8 +194,8 @@ config GDB_SCRIPTS
194 build directory. If you load vmlinux into gdb, the helper 194 build directory. If you load vmlinux into gdb, the helper
195 scripts will be automatically imported by gdb as well, and 195 scripts will be automatically imported by gdb as well, and
196 additional functions are available to analyze a Linux kernel 196 additional functions are available to analyze a Linux kernel
197 instance. See Documentation/gdb-kernel-debugging.txt for further 197 instance. See Documentation/dev-tools/gdb-kernel-debugging.rst
198 details. 198 for further details.
199 199
200config ENABLE_WARN_DEPRECATED 200config ENABLE_WARN_DEPRECATED
201 bool "Enable __deprecated logic" 201 bool "Enable __deprecated logic"
@@ -542,7 +542,7 @@ config DEBUG_KMEMLEAK
542 difference being that the orphan objects are not freed but 542 difference being that the orphan objects are not freed but
543 only shown in /sys/kernel/debug/kmemleak. Enabling this 543 only shown in /sys/kernel/debug/kmemleak. Enabling this
544 feature will introduce an overhead to memory 544 feature will introduce an overhead to memory
545 allocations. See Documentation/kmemleak.txt for more 545 allocations. See Documentation/dev-tools/kmemleak.rst for more
546 details. 546 details.
547 547
548 Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances 548 Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances
@@ -739,7 +739,7 @@ config KCOV
739 different machines and across reboots. If you need stable PC values, 739 different machines and across reboots. If you need stable PC values,
740 disable RANDOMIZE_BASE. 740 disable RANDOMIZE_BASE.
741 741
742 For more details, see Documentation/kcov.txt. 742 For more details, see Documentation/dev-tools/kcov.rst.
743 743
744config KCOV_INSTRUMENT_ALL 744config KCOV_INSTRUMENT_ALL
745 bool "Instrument all code by default" 745 bool "Instrument all code by default"
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan
index bc6e651df68c..a669c193b878 100644
--- a/lib/Kconfig.ubsan
+++ b/lib/Kconfig.ubsan
@@ -10,7 +10,8 @@ config UBSAN
10 This option enables undefined behaviour sanity checker 10 This option enables undefined behaviour sanity checker
11 Compile-time instrumentation is used to detect various undefined 11 Compile-time instrumentation is used to detect various undefined
12 behaviours in runtime. Various types of checks may be enabled 12 behaviours in runtime. Various types of checks may be enabled
13 via boot parameter ubsan_handle (see: Documentation/ubsan.txt). 13 via boot parameter ubsan_handle
14 (see: Documentation/dev-tools/ubsan.rst).
14 15
15config UBSAN_SANITIZE_ALL 16config UBSAN_SANITIZE_ALL
16 bool "Enable instrumentation for the entire kernel" 17 bool "Enable instrumentation for the entire kernel"
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 2e8c6f7aa56e..0019aca0f328 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -22,6 +22,7 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/cpu.h>
25#include <linux/errno.h> 26#include <linux/errno.h>
26#include <linux/init.h> 27#include <linux/init.h>
27#include <linux/kernel.h> 28#include <linux/kernel.h>
@@ -69,6 +70,11 @@ struct radix_tree_preload {
69}; 70};
70static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; 71static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
71 72
73static inline struct radix_tree_node *entry_to_node(void *ptr)
74{
75 return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE);
76}
77
72static inline void *node_to_entry(void *ptr) 78static inline void *node_to_entry(void *ptr)
73{ 79{
74 return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); 80 return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE);
@@ -191,13 +197,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
191 * Returns next bit offset, or size if nothing found. 197 * Returns next bit offset, or size if nothing found.
192 */ 198 */
193static __always_inline unsigned long 199static __always_inline unsigned long
194radix_tree_find_next_bit(const unsigned long *addr, 200radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
195 unsigned long size, unsigned long offset) 201 unsigned long offset)
196{ 202{
197 if (!__builtin_constant_p(size)) 203 const unsigned long *addr = node->tags[tag];
198 return find_next_bit(addr, size, offset);
199 204
200 if (offset < size) { 205 if (offset < RADIX_TREE_MAP_SIZE) {
201 unsigned long tmp; 206 unsigned long tmp;
202 207
203 addr += offset / BITS_PER_LONG; 208 addr += offset / BITS_PER_LONG;
@@ -205,14 +210,32 @@ radix_tree_find_next_bit(const unsigned long *addr,
205 if (tmp) 210 if (tmp)
206 return __ffs(tmp) + offset; 211 return __ffs(tmp) + offset;
207 offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); 212 offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
208 while (offset < size) { 213 while (offset < RADIX_TREE_MAP_SIZE) {
209 tmp = *++addr; 214 tmp = *++addr;
210 if (tmp) 215 if (tmp)
211 return __ffs(tmp) + offset; 216 return __ffs(tmp) + offset;
212 offset += BITS_PER_LONG; 217 offset += BITS_PER_LONG;
213 } 218 }
214 } 219 }
215 return size; 220 return RADIX_TREE_MAP_SIZE;
221}
222
223static unsigned int iter_offset(const struct radix_tree_iter *iter)
224{
225 return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK;
226}
227
228/*
229 * The maximum index which can be stored in a radix tree
230 */
231static inline unsigned long shift_maxindex(unsigned int shift)
232{
233 return (RADIX_TREE_MAP_SIZE << shift) - 1;
234}
235
236static inline unsigned long node_maxindex(struct radix_tree_node *node)
237{
238 return shift_maxindex(node->shift);
216} 239}
217 240
218#ifndef __KERNEL__ 241#ifndef __KERNEL__
@@ -220,10 +243,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
220{ 243{
221 unsigned long i; 244 unsigned long i;
222 245
223 pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", 246 pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n",
224 node, node->offset, 247 node, node->offset, index, index | node_maxindex(node),
248 node->parent,
225 node->tags[0][0], node->tags[1][0], node->tags[2][0], 249 node->tags[0][0], node->tags[1][0], node->tags[2][0],
226 node->shift, node->count, node->exceptional, node->parent); 250 node->shift, node->count, node->exceptional);
227 251
228 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { 252 for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
229 unsigned long first = index | (i << node->shift); 253 unsigned long first = index | (i << node->shift);
@@ -231,14 +255,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index)
231 void *entry = node->slots[i]; 255 void *entry = node->slots[i];
232 if (!entry) 256 if (!entry)
233 continue; 257 continue;
234 if (is_sibling_entry(node, entry)) { 258 if (entry == RADIX_TREE_RETRY) {
235 pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", 259 pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n",
236 entry, i, 260 i, first, last, node);
237 *(void **)entry_to_node(entry),
238 first, last);
239 } else if (!radix_tree_is_internal_node(entry)) { 261 } else if (!radix_tree_is_internal_node(entry)) {
240 pr_debug("radix entry %p offset %ld indices %ld-%ld\n", 262 pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n",
241 entry, i, first, last); 263 entry, i, first, last, node);
264 } else if (is_sibling_entry(node, entry)) {
265 pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n",
266 entry, i, first, last, node,
267 *(void **)entry_to_node(entry));
242 } else { 268 } else {
243 dump_node(entry_to_node(entry), first); 269 dump_node(entry_to_node(entry), first);
244 } 270 }
@@ -262,7 +288,10 @@ static void radix_tree_dump(struct radix_tree_root *root)
262 * that the caller has pinned this thread of control to the current CPU. 288 * that the caller has pinned this thread of control to the current CPU.
263 */ 289 */
264static struct radix_tree_node * 290static struct radix_tree_node *
265radix_tree_node_alloc(struct radix_tree_root *root) 291radix_tree_node_alloc(struct radix_tree_root *root,
292 struct radix_tree_node *parent,
293 unsigned int shift, unsigned int offset,
294 unsigned int count, unsigned int exceptional)
266{ 295{
267 struct radix_tree_node *ret = NULL; 296 struct radix_tree_node *ret = NULL;
268 gfp_t gfp_mask = root_gfp_mask(root); 297 gfp_t gfp_mask = root_gfp_mask(root);
@@ -307,6 +336,13 @@ radix_tree_node_alloc(struct radix_tree_root *root)
307 ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); 336 ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
308out: 337out:
309 BUG_ON(radix_tree_is_internal_node(ret)); 338 BUG_ON(radix_tree_is_internal_node(ret));
339 if (ret) {
340 ret->parent = parent;
341 ret->shift = shift;
342 ret->offset = offset;
343 ret->count = count;
344 ret->exceptional = exceptional;
345 }
310 return ret; 346 return ret;
311} 347}
312 348
@@ -314,17 +350,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head)
314{ 350{
315 struct radix_tree_node *node = 351 struct radix_tree_node *node =
316 container_of(head, struct radix_tree_node, rcu_head); 352 container_of(head, struct radix_tree_node, rcu_head);
317 int i;
318 353
319 /* 354 /*
320 * must only free zeroed nodes into the slab. radix_tree_shrink 355 * Must only free zeroed nodes into the slab. We can be left with
321 * can leave us with a non-NULL entry in the first slot, so clear 356 * non-NULL entries by radix_tree_free_nodes, so clear the entries
322 * that here to make sure. 357 * and tags here.
323 */ 358 */
324 for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) 359 memset(node->slots, 0, sizeof(node->slots));
325 tag_clear(node, i, 0); 360 memset(node->tags, 0, sizeof(node->tags));
326 361 INIT_LIST_HEAD(&node->private_list);
327 node->slots[0] = NULL;
328 362
329 kmem_cache_free(radix_tree_node_cachep, node); 363 kmem_cache_free(radix_tree_node_cachep, node);
330} 364}
@@ -344,7 +378,7 @@ radix_tree_node_free(struct radix_tree_node *node)
344 * To make use of this facility, the radix tree must be initialised without 378 * To make use of this facility, the radix tree must be initialised without
345 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). 379 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
346 */ 380 */
347static int __radix_tree_preload(gfp_t gfp_mask, int nr) 381static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
348{ 382{
349 struct radix_tree_preload *rtp; 383 struct radix_tree_preload *rtp;
350 struct radix_tree_node *node; 384 struct radix_tree_node *node;
@@ -410,6 +444,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
410} 444}
411EXPORT_SYMBOL(radix_tree_maybe_preload); 445EXPORT_SYMBOL(radix_tree_maybe_preload);
412 446
447#ifdef CONFIG_RADIX_TREE_MULTIORDER
448/*
449 * Preload with enough objects to ensure that we can split a single entry
 450 * of order @old_order into many entries of order @new_order
451 */
452int radix_tree_split_preload(unsigned int old_order, unsigned int new_order,
453 gfp_t gfp_mask)
454{
455 unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT);
456 unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
457 (new_order / RADIX_TREE_MAP_SHIFT);
458 unsigned nr = 0;
459
460 WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
461 BUG_ON(new_order >= old_order);
462
463 while (layers--)
464 nr = nr * RADIX_TREE_MAP_SIZE + 1;
465 return __radix_tree_preload(gfp_mask, top * nr);
466}
467#endif
468
413/* 469/*
414 * The same as function above, but preload number of nodes required to insert 470 * The same as function above, but preload number of nodes required to insert
415 * (1 << order) continuous naturally-aligned elements. 471 * (1 << order) continuous naturally-aligned elements.
@@ -455,19 +511,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
455 return __radix_tree_preload(gfp_mask, nr_nodes); 511 return __radix_tree_preload(gfp_mask, nr_nodes);
456} 512}
457 513
458/*
459 * The maximum index which can be stored in a radix tree
460 */
461static inline unsigned long shift_maxindex(unsigned int shift)
462{
463 return (RADIX_TREE_MAP_SIZE << shift) - 1;
464}
465
466static inline unsigned long node_maxindex(struct radix_tree_node *node)
467{
468 return shift_maxindex(node->shift);
469}
470
471static unsigned radix_tree_load_root(struct radix_tree_root *root, 514static unsigned radix_tree_load_root(struct radix_tree_root *root,
472 struct radix_tree_node **nodep, unsigned long *maxindex) 515 struct radix_tree_node **nodep, unsigned long *maxindex)
473{ 516{
@@ -505,8 +548,8 @@ static int radix_tree_extend(struct radix_tree_root *root,
505 goto out; 548 goto out;
506 549
507 do { 550 do {
508 struct radix_tree_node *node = radix_tree_node_alloc(root); 551 struct radix_tree_node *node = radix_tree_node_alloc(root,
509 552 NULL, shift, 0, 1, 0);
510 if (!node) 553 if (!node)
511 return -ENOMEM; 554 return -ENOMEM;
512 555
@@ -517,16 +560,11 @@ static int radix_tree_extend(struct radix_tree_root *root,
517 } 560 }
518 561
519 BUG_ON(shift > BITS_PER_LONG); 562 BUG_ON(shift > BITS_PER_LONG);
520 node->shift = shift;
521 node->offset = 0;
522 node->count = 1;
523 node->parent = NULL;
524 if (radix_tree_is_internal_node(slot)) { 563 if (radix_tree_is_internal_node(slot)) {
525 entry_to_node(slot)->parent = node; 564 entry_to_node(slot)->parent = node;
526 } else { 565 } else if (radix_tree_exceptional_entry(slot)) {
527 /* Moving an exceptional root->rnode to a node */ 566 /* Moving an exceptional root->rnode to a node */
528 if (radix_tree_exceptional_entry(slot)) 567 node->exceptional = 1;
529 node->exceptional = 1;
530 } 568 }
531 node->slots[0] = slot; 569 node->slots[0] = slot;
532 slot = node_to_entry(node); 570 slot = node_to_entry(node);
@@ -665,26 +703,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
665 shift = radix_tree_load_root(root, &child, &maxindex); 703 shift = radix_tree_load_root(root, &child, &maxindex);
666 704
667 /* Make sure the tree is high enough. */ 705 /* Make sure the tree is high enough. */
706 if (order > 0 && max == ((1UL << order) - 1))
707 max++;
668 if (max > maxindex) { 708 if (max > maxindex) {
669 int error = radix_tree_extend(root, max, shift); 709 int error = radix_tree_extend(root, max, shift);
670 if (error < 0) 710 if (error < 0)
671 return error; 711 return error;
672 shift = error; 712 shift = error;
673 child = root->rnode; 713 child = root->rnode;
674 if (order == shift)
675 shift += RADIX_TREE_MAP_SHIFT;
676 } 714 }
677 715
678 while (shift > order) { 716 while (shift > order) {
679 shift -= RADIX_TREE_MAP_SHIFT; 717 shift -= RADIX_TREE_MAP_SHIFT;
680 if (child == NULL) { 718 if (child == NULL) {
681 /* Have to add a child node. */ 719 /* Have to add a child node. */
682 child = radix_tree_node_alloc(root); 720 child = radix_tree_node_alloc(root, node, shift,
721 offset, 0, 0);
683 if (!child) 722 if (!child)
684 return -ENOMEM; 723 return -ENOMEM;
685 child->shift = shift;
686 child->offset = offset;
687 child->parent = node;
688 rcu_assign_pointer(*slot, node_to_entry(child)); 724 rcu_assign_pointer(*slot, node_to_entry(child));
689 if (node) 725 if (node)
690 node->count++; 726 node->count++;
@@ -697,31 +733,125 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
697 slot = &node->slots[offset]; 733 slot = &node->slots[offset];
698 } 734 }
699 735
736 if (nodep)
737 *nodep = node;
738 if (slotp)
739 *slotp = slot;
740 return 0;
741}
742
700#ifdef CONFIG_RADIX_TREE_MULTIORDER 743#ifdef CONFIG_RADIX_TREE_MULTIORDER
701 /* Insert pointers to the canonical entry */ 744/*
702 if (order > shift) { 745 * Free any nodes below this node. The tree is presumed to not need
703 unsigned i, n = 1 << (order - shift); 746 * shrinking, and any user data in the tree is presumed to not need a
747 * destructor called on it. If we need to add a destructor, we can
748 * add that functionality later. Note that we may not clear tags or
749 * slots from the tree as an RCU walker may still have a pointer into
750 * this subtree. We could replace the entries with RADIX_TREE_RETRY,
751 * but we'll still have to clear those in rcu_free.
752 */
753static void radix_tree_free_nodes(struct radix_tree_node *node)
754{
755 unsigned offset = 0;
756 struct radix_tree_node *child = entry_to_node(node);
757
758 for (;;) {
759 void *entry = child->slots[offset];
760 if (radix_tree_is_internal_node(entry) &&
761 !is_sibling_entry(child, entry)) {
762 child = entry_to_node(entry);
763 offset = 0;
764 continue;
765 }
766 offset++;
767 while (offset == RADIX_TREE_MAP_SIZE) {
768 struct radix_tree_node *old = child;
769 offset = child->offset + 1;
770 child = child->parent;
771 radix_tree_node_free(old);
772 if (old == entry_to_node(node))
773 return;
774 }
775 }
776}
777
778static inline int insert_entries(struct radix_tree_node *node, void **slot,
779 void *item, unsigned order, bool replace)
780{
781 struct radix_tree_node *child;
782 unsigned i, n, tag, offset, tags = 0;
783
784 if (node) {
785 if (order > node->shift)
786 n = 1 << (order - node->shift);
787 else
788 n = 1;
789 offset = get_slot_offset(node, slot);
790 } else {
791 n = 1;
792 offset = 0;
793 }
794
795 if (n > 1) {
704 offset = offset & ~(n - 1); 796 offset = offset & ~(n - 1);
705 slot = &node->slots[offset]; 797 slot = &node->slots[offset];
706 child = node_to_entry(slot); 798 }
707 for (i = 0; i < n; i++) { 799 child = node_to_entry(slot);
708 if (slot[i]) 800
801 for (i = 0; i < n; i++) {
802 if (slot[i]) {
803 if (replace) {
804 node->count--;
805 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
806 if (tag_get(node, tag, offset + i))
807 tags |= 1 << tag;
808 } else
709 return -EEXIST; 809 return -EEXIST;
710 } 810 }
811 }
711 812
712 for (i = 1; i < n; i++) { 813 for (i = 0; i < n; i++) {
814 struct radix_tree_node *old = slot[i];
815 if (i) {
713 rcu_assign_pointer(slot[i], child); 816 rcu_assign_pointer(slot[i], child);
714 node->count++; 817 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
818 if (tags & (1 << tag))
819 tag_clear(node, tag, offset + i);
820 } else {
821 rcu_assign_pointer(slot[i], item);
822 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
823 if (tags & (1 << tag))
824 tag_set(node, tag, offset);
715 } 825 }
826 if (radix_tree_is_internal_node(old) &&
827 !is_sibling_entry(node, old) &&
828 (old != RADIX_TREE_RETRY))
829 radix_tree_free_nodes(old);
830 if (radix_tree_exceptional_entry(old))
831 node->exceptional--;
716 } 832 }
717#endif 833 if (node) {
718 834 node->count += n;
719 if (nodep) 835 if (radix_tree_exceptional_entry(item))
720 *nodep = node; 836 node->exceptional += n;
721 if (slotp) 837 }
722 *slotp = slot; 838 return n;
723 return 0; 839}
840#else
841static inline int insert_entries(struct radix_tree_node *node, void **slot,
842 void *item, unsigned order, bool replace)
843{
844 if (*slot)
845 return -EEXIST;
846 rcu_assign_pointer(*slot, item);
847 if (node) {
848 node->count++;
849 if (radix_tree_exceptional_entry(item))
850 node->exceptional++;
851 }
852 return 1;
724} 853}
854#endif
725 855
726/** 856/**
727 * __radix_tree_insert - insert into a radix tree 857 * __radix_tree_insert - insert into a radix tree
@@ -744,15 +874,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index,
744 error = __radix_tree_create(root, index, order, &node, &slot); 874 error = __radix_tree_create(root, index, order, &node, &slot);
745 if (error) 875 if (error)
746 return error; 876 return error;
747 if (*slot != NULL) 877
748 return -EEXIST; 878 error = insert_entries(node, slot, item, order, false);
749 rcu_assign_pointer(*slot, item); 879 if (error < 0)
880 return error;
750 881
751 if (node) { 882 if (node) {
752 unsigned offset = get_slot_offset(node, slot); 883 unsigned offset = get_slot_offset(node, slot);
753 node->count++;
754 if (radix_tree_exceptional_entry(item))
755 node->exceptional++;
756 BUG_ON(tag_get(node, 0, offset)); 884 BUG_ON(tag_get(node, 0, offset));
757 BUG_ON(tag_get(node, 1, offset)); 885 BUG_ON(tag_get(node, 1, offset));
758 BUG_ON(tag_get(node, 2, offset)); 886 BUG_ON(tag_get(node, 2, offset));
@@ -850,6 +978,24 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
850} 978}
851EXPORT_SYMBOL(radix_tree_lookup); 979EXPORT_SYMBOL(radix_tree_lookup);
852 980
981static inline int slot_count(struct radix_tree_node *node,
982 void **slot)
983{
984 int n = 1;
985#ifdef CONFIG_RADIX_TREE_MULTIORDER
986 void *ptr = node_to_entry(slot);
987 unsigned offset = get_slot_offset(node, slot);
988 int i;
989
990 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
991 if (node->slots[offset + i] != ptr)
992 break;
993 n++;
994 }
995#endif
996 return n;
997}
998
853static void replace_slot(struct radix_tree_root *root, 999static void replace_slot(struct radix_tree_root *root,
854 struct radix_tree_node *node, 1000 struct radix_tree_node *node,
855 void **slot, void *item, 1001 void **slot, void *item,
@@ -868,12 +1014,35 @@ static void replace_slot(struct radix_tree_root *root,
868 1014
869 if (node) { 1015 if (node) {
870 node->count += count; 1016 node->count += count;
871 node->exceptional += exceptional; 1017 if (exceptional) {
1018 exceptional *= slot_count(node, slot);
1019 node->exceptional += exceptional;
1020 }
872 } 1021 }
873 1022
874 rcu_assign_pointer(*slot, item); 1023 rcu_assign_pointer(*slot, item);
875} 1024}
876 1025
1026static inline void delete_sibling_entries(struct radix_tree_node *node,
1027 void **slot)
1028{
1029#ifdef CONFIG_RADIX_TREE_MULTIORDER
1030 bool exceptional = radix_tree_exceptional_entry(*slot);
1031 void *ptr = node_to_entry(slot);
1032 unsigned offset = get_slot_offset(node, slot);
1033 int i;
1034
1035 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
1036 if (node->slots[offset + i] != ptr)
1037 break;
1038 node->slots[offset + i] = NULL;
1039 node->count--;
1040 if (exceptional)
1041 node->exceptional--;
1042 }
1043#endif
1044}
1045
877/** 1046/**
878 * __radix_tree_replace - replace item in a slot 1047 * __radix_tree_replace - replace item in a slot
879 * @root: radix tree root 1048 * @root: radix tree root
@@ -891,6 +1060,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
891 void **slot, void *item, 1060 void **slot, void *item,
892 radix_tree_update_node_t update_node, void *private) 1061 radix_tree_update_node_t update_node, void *private)
893{ 1062{
1063 if (!item)
1064 delete_sibling_entries(node, slot);
894 /* 1065 /*
895 * This function supports replacing exceptional entries and 1066 * This function supports replacing exceptional entries and
896 * deleting entries, but that needs accounting against the 1067 * deleting entries, but that needs accounting against the
@@ -921,7 +1092,8 @@ void __radix_tree_replace(struct radix_tree_root *root,
921 * NOTE: This cannot be used to switch between non-entries (empty slots), 1092 * NOTE: This cannot be used to switch between non-entries (empty slots),
922 * regular entries, and exceptional entries, as that requires accounting 1093 * regular entries, and exceptional entries, as that requires accounting
923 * inside the radix tree node. When switching from one type of entry or 1094 * inside the radix tree node. When switching from one type of entry or
924 * deleting, use __radix_tree_lookup() and __radix_tree_replace(). 1095 * deleting, use __radix_tree_lookup() and __radix_tree_replace() or
1096 * radix_tree_iter_replace().
925 */ 1097 */
926void radix_tree_replace_slot(struct radix_tree_root *root, 1098void radix_tree_replace_slot(struct radix_tree_root *root,
927 void **slot, void *item) 1099 void **slot, void *item)
@@ -930,6 +1102,164 @@ void radix_tree_replace_slot(struct radix_tree_root *root,
930} 1102}
931 1103
932/** 1104/**
1105 * radix_tree_iter_replace - replace item in a slot
1106 * @root: radix tree root
1107 * @slot: pointer to slot
1108 * @item: new item to store in the slot.
1109 *
1110 * For use with radix_tree_split() and radix_tree_for_each_slot().
1111 * Caller must hold tree write locked across split and replacement.
1112 */
1113void radix_tree_iter_replace(struct radix_tree_root *root,
1114 const struct radix_tree_iter *iter, void **slot, void *item)
1115{
1116 __radix_tree_replace(root, iter->node, slot, item, NULL, NULL);
1117}
1118
1119#ifdef CONFIG_RADIX_TREE_MULTIORDER
1120/**
1121 * radix_tree_join - replace multiple entries with one multiorder entry
1122 * @root: radix tree root
1123 * @index: an index inside the new entry
1124 * @order: order of the new entry
1125 * @item: new entry
1126 *
1127 * Call this function to replace several entries with one larger entry.
1128 * The existing entries are presumed to not need freeing as a result of
1129 * this call.
1130 *
1131 * The replacement entry will have all the tags set on it that were set
1132 * on any of the entries it is replacing.
1133 */
1134int radix_tree_join(struct radix_tree_root *root, unsigned long index,
1135 unsigned order, void *item)
1136{
1137 struct radix_tree_node *node;
1138 void **slot;
1139 int error;
1140
1141 BUG_ON(radix_tree_is_internal_node(item));
1142
1143 error = __radix_tree_create(root, index, order, &node, &slot);
1144 if (!error)
1145 error = insert_entries(node, slot, item, order, true);
1146 if (error > 0)
1147 error = 0;
1148
1149 return error;
1150}
1151
1152/**
1153 * radix_tree_split - Split an entry into smaller entries
1154 * @root: radix tree root
1155 * @index: An index within the large entry
1156 * @order: Order of new entries
1157 *
1158 * Call this function as the first step in replacing a multiorder entry
1159 * with several entries of lower order. After this function returns,
1160 * loop over the relevant portion of the tree using radix_tree_for_each_slot()
1161 * and call radix_tree_iter_replace() to set up each new entry.
1162 *
1163 * The tags from this entry are replicated to all the new entries.
1164 *
1165 * The radix tree should be locked against modification during the entire
1166 * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which
1167 * should prompt RCU walkers to restart the lookup from the root.
1168 */
1169int radix_tree_split(struct radix_tree_root *root, unsigned long index,
1170 unsigned order)
1171{
1172 struct radix_tree_node *parent, *node, *child;
1173 void **slot;
1174 unsigned int offset, end;
1175 unsigned n, tag, tags = 0;
1176
1177 if (!__radix_tree_lookup(root, index, &parent, &slot))
1178 return -ENOENT;
1179 if (!parent)
1180 return -ENOENT;
1181
1182 offset = get_slot_offset(parent, slot);
1183
1184 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1185 if (tag_get(parent, tag, offset))
1186 tags |= 1 << tag;
1187
1188 for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) {
1189 if (!is_sibling_entry(parent, parent->slots[end]))
1190 break;
1191 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1192 if (tags & (1 << tag))
1193 tag_set(parent, tag, end);
1194 /* rcu_assign_pointer ensures tags are set before RETRY */
1195 rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY);
1196 }
1197 rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY);
1198 parent->exceptional -= (end - offset);
1199
1200 if (order == parent->shift)
1201 return 0;
1202 if (order > parent->shift) {
1203 while (offset < end)
1204 offset += insert_entries(parent, &parent->slots[offset],
1205 RADIX_TREE_RETRY, order, true);
1206 return 0;
1207 }
1208
1209 node = parent;
1210
1211 for (;;) {
1212 if (node->shift > order) {
1213 child = radix_tree_node_alloc(root, node,
1214 node->shift - RADIX_TREE_MAP_SHIFT,
1215 offset, 0, 0);
1216 if (!child)
1217 goto nomem;
1218 if (node != parent) {
1219 node->count++;
1220 node->slots[offset] = node_to_entry(child);
1221 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1222 if (tags & (1 << tag))
1223 tag_set(node, tag, offset);
1224 }
1225
1226 node = child;
1227 offset = 0;
1228 continue;
1229 }
1230
1231 n = insert_entries(node, &node->slots[offset],
1232 RADIX_TREE_RETRY, order, false);
1233 BUG_ON(n > RADIX_TREE_MAP_SIZE);
1234
1235 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1236 if (tags & (1 << tag))
1237 tag_set(node, tag, offset);
1238 offset += n;
1239
1240 while (offset == RADIX_TREE_MAP_SIZE) {
1241 if (node == parent)
1242 break;
1243 offset = node->offset;
1244 child = node;
1245 node = node->parent;
1246 rcu_assign_pointer(node->slots[offset],
1247 node_to_entry(child));
1248 offset++;
1249 }
1250 if ((node == parent) && (offset == end))
1251 return 0;
1252 }
1253
1254 nomem:
1255 /* Shouldn't happen; did user forget to preload? */
1256 /* TODO: free all the allocated nodes */
1257 WARN_ON(1);
1258 return -ENOMEM;
1259}
1260#endif
1261
1262/**
933 * radix_tree_tag_set - set a tag on a radix tree node 1263 * radix_tree_tag_set - set a tag on a radix tree node
934 * @root: radix tree root 1264 * @root: radix tree root
935 * @index: index key 1265 * @index: index key
@@ -990,6 +1320,34 @@ static void node_tag_clear(struct radix_tree_root *root,
990 root_tag_clear(root, tag); 1320 root_tag_clear(root, tag);
991} 1321}
992 1322
1323static void node_tag_set(struct radix_tree_root *root,
1324 struct radix_tree_node *node,
1325 unsigned int tag, unsigned int offset)
1326{
1327 while (node) {
1328 if (tag_get(node, tag, offset))
1329 return;
1330 tag_set(node, tag, offset);
1331 offset = node->offset;
1332 node = node->parent;
1333 }
1334
1335 if (!root_tag_get(root, tag))
1336 root_tag_set(root, tag);
1337}
1338
1339/**
1340 * radix_tree_iter_tag_set - set a tag on the current iterator entry
1341 * @root: radix tree root
1342 * @iter: iterator state
1343 * @tag: tag to set
1344 */
1345void radix_tree_iter_tag_set(struct radix_tree_root *root,
1346 const struct radix_tree_iter *iter, unsigned int tag)
1347{
1348 node_tag_set(root, iter->node, tag, iter_offset(iter));
1349}
1350
993/** 1351/**
994 * radix_tree_tag_clear - clear a tag on a radix tree node 1352 * radix_tree_tag_clear - clear a tag on a radix tree node
995 * @root: radix tree root 1353 * @root: radix tree root
@@ -1085,6 +1443,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
1085#endif 1443#endif
1086} 1444}
1087 1445
1446/* Construct iter->tags bit-mask from node->tags[tag] array */
1447static void set_iter_tags(struct radix_tree_iter *iter,
1448 struct radix_tree_node *node, unsigned offset,
1449 unsigned tag)
1450{
1451 unsigned tag_long = offset / BITS_PER_LONG;
1452 unsigned tag_bit = offset % BITS_PER_LONG;
1453
1454 iter->tags = node->tags[tag][tag_long] >> tag_bit;
1455
1456 /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
1457 if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
1458 /* Pick tags from next element */
1459 if (tag_bit)
1460 iter->tags |= node->tags[tag][tag_long + 1] <<
1461 (BITS_PER_LONG - tag_bit);
1462 /* Clip chunk size, here only BITS_PER_LONG tags */
1463 iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
1464 }
1465}
1466
1467#ifdef CONFIG_RADIX_TREE_MULTIORDER
1468static void **skip_siblings(struct radix_tree_node **nodep,
1469 void **slot, struct radix_tree_iter *iter)
1470{
1471 void *sib = node_to_entry(slot - 1);
1472
1473 while (iter->index < iter->next_index) {
1474 *nodep = rcu_dereference_raw(*slot);
1475 if (*nodep && *nodep != sib)
1476 return slot;
1477 slot++;
1478 iter->index = __radix_tree_iter_add(iter, 1);
1479 iter->tags >>= 1;
1480 }
1481
1482 *nodep = NULL;
1483 return NULL;
1484}
1485
1486void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
1487 unsigned flags)
1488{
1489 unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
1490 struct radix_tree_node *node = rcu_dereference_raw(*slot);
1491
1492 slot = skip_siblings(&node, slot, iter);
1493
1494 while (radix_tree_is_internal_node(node)) {
1495 unsigned offset;
1496 unsigned long next_index;
1497
1498 if (node == RADIX_TREE_RETRY)
1499 return slot;
1500 node = entry_to_node(node);
1501 iter->node = node;
1502 iter->shift = node->shift;
1503
1504 if (flags & RADIX_TREE_ITER_TAGGED) {
1505 offset = radix_tree_find_next_bit(node, tag, 0);
1506 if (offset == RADIX_TREE_MAP_SIZE)
1507 return NULL;
1508 slot = &node->slots[offset];
1509 iter->index = __radix_tree_iter_add(iter, offset);
1510 set_iter_tags(iter, node, offset, tag);
1511 node = rcu_dereference_raw(*slot);
1512 } else {
1513 offset = 0;
1514 slot = &node->slots[0];
1515 for (;;) {
1516 node = rcu_dereference_raw(*slot);
1517 if (node)
1518 break;
1519 slot++;
1520 offset++;
1521 if (offset == RADIX_TREE_MAP_SIZE)
1522 return NULL;
1523 }
1524 iter->index = __radix_tree_iter_add(iter, offset);
1525 }
1526 if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0))
1527 goto none;
1528 next_index = (iter->index | shift_maxindex(iter->shift)) + 1;
1529 if (next_index < iter->next_index)
1530 iter->next_index = next_index;
1531 }
1532
1533 return slot;
1534 none:
1535 iter->next_index = 0;
1536 return NULL;
1537}
1538EXPORT_SYMBOL(__radix_tree_next_slot);
1539#else
1540static void **skip_siblings(struct radix_tree_node **nodep,
1541 void **slot, struct radix_tree_iter *iter)
1542{
1543 return slot;
1544}
1545#endif
1546
1547void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter)
1548{
1549 struct radix_tree_node *node;
1550
1551 slot++;
1552 iter->index = __radix_tree_iter_add(iter, 1);
1553 node = rcu_dereference_raw(*slot);
1554 skip_siblings(&node, slot, iter);
1555 iter->next_index = iter->index;
1556 iter->tags = 0;
1557 return NULL;
1558}
1559EXPORT_SYMBOL(radix_tree_iter_resume);
1560
1088/** 1561/**
1089 * radix_tree_next_chunk - find next chunk of slots for iteration 1562 * radix_tree_next_chunk - find next chunk of slots for iteration
1090 * 1563 *
@@ -1110,7 +1583,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1110 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. 1583 * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
1111 * 1584 *
 1112 * This condition is also used by radix_tree_next_slot() to stop 1585 * This condition is also used by radix_tree_next_slot() to stop
1113 * contiguous iterating, and forbid swithing to the next chunk. 1586 * contiguous iterating, and forbid switching to the next chunk.
1114 */ 1587 */
1115 index = iter->next_index; 1588 index = iter->next_index;
1116 if (!index && iter->index) 1589 if (!index && iter->index)
@@ -1128,6 +1601,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1128 iter->index = index; 1601 iter->index = index;
1129 iter->next_index = maxindex + 1; 1602 iter->next_index = maxindex + 1;
1130 iter->tags = 1; 1603 iter->tags = 1;
1604 iter->node = NULL;
1131 __set_iter_shift(iter, 0); 1605 __set_iter_shift(iter, 0);
1132 return (void **)&root->rnode; 1606 return (void **)&root->rnode;
1133 } 1607 }
@@ -1143,9 +1617,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1143 return NULL; 1617 return NULL;
1144 1618
1145 if (flags & RADIX_TREE_ITER_TAGGED) 1619 if (flags & RADIX_TREE_ITER_TAGGED)
1146 offset = radix_tree_find_next_bit( 1620 offset = radix_tree_find_next_bit(node, tag,
1147 node->tags[tag],
1148 RADIX_TREE_MAP_SIZE,
1149 offset + 1); 1621 offset + 1);
1150 else 1622 else
1151 while (++offset < RADIX_TREE_MAP_SIZE) { 1623 while (++offset < RADIX_TREE_MAP_SIZE) {
@@ -1165,154 +1637,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root,
1165 child = rcu_dereference_raw(node->slots[offset]); 1637 child = rcu_dereference_raw(node->slots[offset]);
1166 } 1638 }
1167 1639
1168 if ((child == NULL) || (child == RADIX_TREE_RETRY)) 1640 if (!child)
1169 goto restart; 1641 goto restart;
1642 if (child == RADIX_TREE_RETRY)
1643 break;
1170 } while (radix_tree_is_internal_node(child)); 1644 } while (radix_tree_is_internal_node(child));
1171 1645
1172 /* Update the iterator state */ 1646 /* Update the iterator state */
1173 iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); 1647 iter->index = (index &~ node_maxindex(node)) | (offset << node->shift);
1174 iter->next_index = (index | node_maxindex(node)) + 1; 1648 iter->next_index = (index | node_maxindex(node)) + 1;
1649 iter->node = node;
1175 __set_iter_shift(iter, node->shift); 1650 __set_iter_shift(iter, node->shift);
1176 1651
1177 /* Construct iter->tags bit-mask from node->tags[tag] array */ 1652 if (flags & RADIX_TREE_ITER_TAGGED)
1178 if (flags & RADIX_TREE_ITER_TAGGED) { 1653 set_iter_tags(iter, node, offset, tag);
1179 unsigned tag_long, tag_bit;
1180
1181 tag_long = offset / BITS_PER_LONG;
1182 tag_bit = offset % BITS_PER_LONG;
1183 iter->tags = node->tags[tag][tag_long] >> tag_bit;
1184 /* This never happens if RADIX_TREE_TAG_LONGS == 1 */
1185 if (tag_long < RADIX_TREE_TAG_LONGS - 1) {
1186 /* Pick tags from next element */
1187 if (tag_bit)
1188 iter->tags |= node->tags[tag][tag_long + 1] <<
1189 (BITS_PER_LONG - tag_bit);
1190 /* Clip chunk size, here only BITS_PER_LONG tags */
1191 iter->next_index = index + BITS_PER_LONG;
1192 }
1193 }
1194 1654
1195 return node->slots + offset; 1655 return node->slots + offset;
1196} 1656}
1197EXPORT_SYMBOL(radix_tree_next_chunk); 1657EXPORT_SYMBOL(radix_tree_next_chunk);
1198 1658
1199/** 1659/**
1200 * radix_tree_range_tag_if_tagged - for each item in given range set given
1201 * tag if item has another tag set
1202 * @root: radix tree root
1203 * @first_indexp: pointer to a starting index of a range to scan
1204 * @last_index: last index of a range to scan
1205 * @nr_to_tag: maximum number items to tag
1206 * @iftag: tag index to test
1207 * @settag: tag index to set if tested tag is set
1208 *
1209 * This function scans range of radix tree from first_index to last_index
1210 * (inclusive). For each item in the range if iftag is set, the function sets
1211 * also settag. The function stops either after tagging nr_to_tag items or
1212 * after reaching last_index.
1213 *
1214 * The tags must be set from the leaf level only and propagated back up the
1215 * path to the root. We must do this so that we resolve the full path before
1216 * setting any tags on intermediate nodes. If we set tags as we descend, then
1217 * we can get to the leaf node and find that the index that has the iftag
1218 * set is outside the range we are scanning. This reults in dangling tags and
1219 * can lead to problems with later tag operations (e.g. livelocks on lookups).
1220 *
1221 * The function returns the number of leaves where the tag was set and sets
1222 * *first_indexp to the first unscanned index.
1223 * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must
1224 * be prepared to handle that.
1225 */
1226unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
1227 unsigned long *first_indexp, unsigned long last_index,
1228 unsigned long nr_to_tag,
1229 unsigned int iftag, unsigned int settag)
1230{
1231 struct radix_tree_node *parent, *node, *child;
1232 unsigned long maxindex;
1233 unsigned long tagged = 0;
1234 unsigned long index = *first_indexp;
1235
1236 radix_tree_load_root(root, &child, &maxindex);
1237 last_index = min(last_index, maxindex);
1238 if (index > last_index)
1239 return 0;
1240 if (!nr_to_tag)
1241 return 0;
1242 if (!root_tag_get(root, iftag)) {
1243 *first_indexp = last_index + 1;
1244 return 0;
1245 }
1246 if (!radix_tree_is_internal_node(child)) {
1247 *first_indexp = last_index + 1;
1248 root_tag_set(root, settag);
1249 return 1;
1250 }
1251
1252 node = entry_to_node(child);
1253
1254 for (;;) {
1255 unsigned offset = radix_tree_descend(node, &child, index);
1256 if (!child)
1257 goto next;
1258 if (!tag_get(node, iftag, offset))
1259 goto next;
1260 /* Sibling slots never have tags set on them */
1261 if (radix_tree_is_internal_node(child)) {
1262 node = entry_to_node(child);
1263 continue;
1264 }
1265
1266 /* tag the leaf */
1267 tagged++;
1268 tag_set(node, settag, offset);
1269
1270 /* walk back up the path tagging interior nodes */
1271 parent = node;
1272 for (;;) {
1273 offset = parent->offset;
1274 parent = parent->parent;
1275 if (!parent)
1276 break;
1277 /* stop if we find a node with the tag already set */
1278 if (tag_get(parent, settag, offset))
1279 break;
1280 tag_set(parent, settag, offset);
1281 }
1282 next:
1283 /* Go to next entry in node */
1284 index = ((index >> node->shift) + 1) << node->shift;
1285 /* Overflow can happen when last_index is ~0UL... */
1286 if (index > last_index || !index)
1287 break;
1288 offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
1289 while (offset == 0) {
1290 /*
1291 * We've fully scanned this node. Go up. Because
1292 * last_index is guaranteed to be in the tree, what
1293 * we do below cannot wander astray.
1294 */
1295 node = node->parent;
1296 offset = (index >> node->shift) & RADIX_TREE_MAP_MASK;
1297 }
1298 if (is_sibling_entry(node, node->slots[offset]))
1299 goto next;
1300 if (tagged >= nr_to_tag)
1301 break;
1302 }
1303 /*
1304 * We need not to tag the root tag if there is no tag which is set with
1305 * settag within the range from *first_indexp to last_index.
1306 */
1307 if (tagged > 0)
1308 root_tag_set(root, settag);
1309 *first_indexp = index;
1310
1311 return tagged;
1312}
1313EXPORT_SYMBOL(radix_tree_range_tag_if_tagged);
1314
1315/**
1316 * radix_tree_gang_lookup - perform multiple lookup on a radix tree 1660 * radix_tree_gang_lookup - perform multiple lookup on a radix tree
1317 * @root: radix tree root 1661 * @root: radix tree root
1318 * @results: where the results of the lookup are placed 1662 * @results: where the results of the lookup are placed
@@ -1477,105 +1821,6 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
1477} 1821}
1478EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); 1822EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);
1479 1823
1480#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP)
1481#include <linux/sched.h> /* for cond_resched() */
1482
1483struct locate_info {
1484 unsigned long found_index;
1485 bool stop;
1486};
1487
1488/*
1489 * This linear search is at present only useful to shmem_unuse_inode().
1490 */
1491static unsigned long __locate(struct radix_tree_node *slot, void *item,
1492 unsigned long index, struct locate_info *info)
1493{
1494 unsigned long i;
1495
1496 do {
1497 unsigned int shift = slot->shift;
1498
1499 for (i = (index >> shift) & RADIX_TREE_MAP_MASK;
1500 i < RADIX_TREE_MAP_SIZE;
1501 i++, index += (1UL << shift)) {
1502 struct radix_tree_node *node =
1503 rcu_dereference_raw(slot->slots[i]);
1504 if (node == RADIX_TREE_RETRY)
1505 goto out;
1506 if (!radix_tree_is_internal_node(node)) {
1507 if (node == item) {
1508 info->found_index = index;
1509 info->stop = true;
1510 goto out;
1511 }
1512 continue;
1513 }
1514 node = entry_to_node(node);
1515 if (is_sibling_entry(slot, node))
1516 continue;
1517 slot = node;
1518 break;
1519 }
1520 } while (i < RADIX_TREE_MAP_SIZE);
1521
1522out:
1523 if ((index == 0) && (i == RADIX_TREE_MAP_SIZE))
1524 info->stop = true;
1525 return index;
1526}
1527
1528/**
1529 * radix_tree_locate_item - search through radix tree for item
1530 * @root: radix tree root
1531 * @item: item to be found
1532 *
1533 * Returns index where item was found, or -1 if not found.
1534 * Caller must hold no lock (since this time-consuming function needs
1535 * to be preemptible), and must check afterwards if item is still there.
1536 */
1537unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
1538{
1539 struct radix_tree_node *node;
1540 unsigned long max_index;
1541 unsigned long cur_index = 0;
1542 struct locate_info info = {
1543 .found_index = -1,
1544 .stop = false,
1545 };
1546
1547 do {
1548 rcu_read_lock();
1549 node = rcu_dereference_raw(root->rnode);
1550 if (!radix_tree_is_internal_node(node)) {
1551 rcu_read_unlock();
1552 if (node == item)
1553 info.found_index = 0;
1554 break;
1555 }
1556
1557 node = entry_to_node(node);
1558
1559 max_index = node_maxindex(node);
1560 if (cur_index > max_index) {
1561 rcu_read_unlock();
1562 break;
1563 }
1564
1565 cur_index = __locate(node, item, cur_index, &info);
1566 rcu_read_unlock();
1567 cond_resched();
1568 } while (!info.stop && cur_index <= max_index);
1569
1570 return info.found_index;
1571}
1572#else
1573unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item)
1574{
1575 return -1;
1576}
1577#endif /* CONFIG_SHMEM && CONFIG_SWAP */
1578
1579/** 1824/**
1580 * __radix_tree_delete_node - try to free node after clearing a slot 1825 * __radix_tree_delete_node - try to free node after clearing a slot
1581 * @root: radix tree root 1826 * @root: radix tree root
@@ -1591,20 +1836,6 @@ void __radix_tree_delete_node(struct radix_tree_root *root,
1591 delete_node(root, node, NULL, NULL); 1836 delete_node(root, node, NULL, NULL);
1592} 1837}
1593 1838
1594static inline void delete_sibling_entries(struct radix_tree_node *node,
1595 void *ptr, unsigned offset)
1596{
1597#ifdef CONFIG_RADIX_TREE_MULTIORDER
1598 int i;
1599 for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
1600 if (node->slots[offset + i] != ptr)
1601 break;
1602 node->slots[offset + i] = NULL;
1603 node->count--;
1604 }
1605#endif
1606}
1607
1608/** 1839/**
1609 * radix_tree_delete_item - delete an item from a radix tree 1840 * radix_tree_delete_item - delete an item from a radix tree
1610 * @root: radix tree root 1841 * @root: radix tree root
@@ -1644,7 +1875,6 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
1644 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) 1875 for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
1645 node_tag_clear(root, node, tag, offset); 1876 node_tag_clear(root, node, tag, offset);
1646 1877
1647 delete_sibling_entries(node, node_to_entry(slot), offset);
1648 __radix_tree_replace(root, node, slot, NULL, NULL, NULL); 1878 __radix_tree_replace(root, node, slot, NULL, NULL, NULL);
1649 1879
1650 return entry; 1880 return entry;
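[Editorial note] For orientation, the new multiorder API above composes as the radix_tree_split() kernel-doc prescribes: preload, split under the tree lock, then walk the freshly created RADIX_TREE_RETRY slots with radix_tree_for_each_slot() and fill each one via radix_tree_iter_replace(). The sketch below is illustrative only; the lock name and the small_pages array are assumptions, and error handling is trimmed. By my reading of the new radix_tree_split_preload(), an order-9 to order-0 split with RADIX_TREE_MAP_SHIFT == 6 preloads top * nr = (1 << 3) * 1 = 8 nodes.

static DEFINE_SPINLOCK(example_tree_lock);	/* assumed lock protecting the tree */

/* Illustration: replace one order-9 entry at 'index' with 512 order-0 entries. */
static int example_split(struct radix_tree_root *root, unsigned long index,
			 void **small_pages)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned long i = 0;
	int err;

	err = radix_tree_split_preload(9, 0, GFP_KERNEL);
	if (err)
		return err;

	spin_lock_irq(&example_tree_lock);
	err = radix_tree_split(root, index, 0);
	if (!err) {
		/* every slot in the old entry's range now holds RADIX_TREE_RETRY */
		radix_tree_for_each_slot(slot, root, &iter, index & ~511UL) {
			if (iter.index > (index | 511))
				break;
			radix_tree_iter_replace(root, &iter, slot,
						small_pages[i++]);
		}
	}
	spin_unlock_irq(&example_tree_lock);
	radix_tree_preload_end();
	return err;
}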
diff --git a/mm/compaction.c b/mm/compaction.c
index 223464227299..949198d01260 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
818 page_count(page) > page_mapcount(page)) 818 page_count(page) > page_mapcount(page))
819 goto isolate_fail; 819 goto isolate_fail;
820 820
821 /*
 822 * Only allow migration of anonymous pages in GFP_NOFS context
823 * because those do not depend on fs locks.
824 */
825 if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
826 goto isolate_fail;
827
821 /* If we already hold the lock, we can skip some rechecking */ 828 /* If we already hold the lock, we can skip some rechecking */
822 if (!locked) { 829 if (!locked) {
823 locked = compact_trylock_irqsave(zone_lru_lock(zone), 830 locked = compact_trylock_irqsave(zone_lru_lock(zone),
@@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1677 unsigned int alloc_flags, const struct alloc_context *ac, 1684 unsigned int alloc_flags, const struct alloc_context *ac,
1678 enum compact_priority prio) 1685 enum compact_priority prio)
1679{ 1686{
1680 int may_enter_fs = gfp_mask & __GFP_FS;
1681 int may_perform_io = gfp_mask & __GFP_IO; 1687 int may_perform_io = gfp_mask & __GFP_IO;
1682 struct zoneref *z; 1688 struct zoneref *z;
1683 struct zone *zone; 1689 struct zone *zone;
1684 enum compact_result rc = COMPACT_SKIPPED; 1690 enum compact_result rc = COMPACT_SKIPPED;
1685 1691
1686 /* Check if the GFP flags allow compaction */ 1692 /*
 1687 if (!may_enter_fs || !may_perform_io) 1693 * Check if the GFP flags allow compaction - GFP_NOIO is really
 1694 * tricky context because the migration might require IO
 1695 */
 1696 if (!may_perform_io)
1696 if (!may_perform_io)
1688 return COMPACT_SKIPPED; 1697 return COMPACT_SKIPPED;
1689 1698
1690 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); 1699 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
@@ -1751,6 +1760,7 @@ static void compact_node(int nid)
1751 .mode = MIGRATE_SYNC, 1760 .mode = MIGRATE_SYNC,
1752 .ignore_skip_hint = true, 1761 .ignore_skip_hint = true,
1753 .whole_zone = true, 1762 .whole_zone = true,
1763 .gfp_mask = GFP_KERNEL,
1754 }; 1764 };
1755 1765
1756 1766
@@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
1876 .classzone_idx = pgdat->kcompactd_classzone_idx, 1886 .classzone_idx = pgdat->kcompactd_classzone_idx,
1877 .mode = MIGRATE_SYNC_LIGHT, 1887 .mode = MIGRATE_SYNC_LIGHT,
1878 .ignore_skip_hint = true, 1888 .ignore_skip_hint = true,
1889 .gfp_mask = GFP_KERNEL,
1879 1890
1880 }; 1891 };
1881 trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, 1892 trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
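[Editorial note] Taken together, the two compaction hunks above relax the entry gate from "both __GFP_FS and __GFP_IO required" to "__GFP_IO required", and push the filesystem constraint down to per-page isolation. A compressed restatement of the resulting policy follows; the helper names are invented purely for illustration.

#include <linux/gfp.h>
#include <linux/mm.h>

/* GFP_NOIO callers still skip compaction entirely: migration may need IO. */
static bool example_may_compact(gfp_t gfp_mask)
{
	return !!(gfp_mask & __GFP_IO);
}

/* Under GFP_NOFS, only anonymous pages are isolated; they take no fs locks. */
static bool example_may_isolate(struct page *page, gfp_t gfp_mask)
{
	return (gfp_mask & __GFP_FS) || !page_mapping(page);
}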
diff --git a/mm/filemap.c b/mm/filemap.c
index b06517b7f97f..32be3c8f3a11 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2164,12 +2164,12 @@ page_not_uptodate:
2164} 2164}
2165EXPORT_SYMBOL(filemap_fault); 2165EXPORT_SYMBOL(filemap_fault);
2166 2166
2167void filemap_map_pages(struct fault_env *fe, 2167void filemap_map_pages(struct vm_fault *vmf,
2168 pgoff_t start_pgoff, pgoff_t end_pgoff) 2168 pgoff_t start_pgoff, pgoff_t end_pgoff)
2169{ 2169{
2170 struct radix_tree_iter iter; 2170 struct radix_tree_iter iter;
2171 void **slot; 2171 void **slot;
2172 struct file *file = fe->vma->vm_file; 2172 struct file *file = vmf->vma->vm_file;
2173 struct address_space *mapping = file->f_mapping; 2173 struct address_space *mapping = file->f_mapping;
2174 pgoff_t last_pgoff = start_pgoff; 2174 pgoff_t last_pgoff = start_pgoff;
2175 loff_t size; 2175 loff_t size;
@@ -2225,11 +2225,11 @@ repeat:
2225 if (file->f_ra.mmap_miss > 0) 2225 if (file->f_ra.mmap_miss > 0)
2226 file->f_ra.mmap_miss--; 2226 file->f_ra.mmap_miss--;
2227 2227
2228 fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; 2228 vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
2229 if (fe->pte) 2229 if (vmf->pte)
2230 fe->pte += iter.index - last_pgoff; 2230 vmf->pte += iter.index - last_pgoff;
2231 last_pgoff = iter.index; 2231 last_pgoff = iter.index;
2232 if (alloc_set_pte(fe, NULL, page)) 2232 if (alloc_set_pte(vmf, NULL, page))
2233 goto unlock; 2233 goto unlock;
2234 unlock_page(page); 2234 unlock_page(page);
2235 goto next; 2235 goto next;
@@ -2239,7 +2239,7 @@ skip:
2239 put_page(page); 2239 put_page(page);
2240next: 2240next:
2241 /* Huge page is mapped? No need to proceed. */ 2241 /* Huge page is mapped? No need to proceed. */
2242 if (pmd_trans_huge(*fe->pmd)) 2242 if (pmd_trans_huge(*vmf->pmd))
2243 break; 2243 break;
2244 if (iter.index == end_pgoff) 2244 if (iter.index == end_pgoff)
2245 break; 2245 break;
diff --git a/mm/gup.c b/mm/gup.c
index e50178c58b97..55315555489d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked);
865 * caller if required (just like with __get_user_pages). "FOLL_GET" 865 * caller if required (just like with __get_user_pages). "FOLL_GET"
866 * is set implicitly if "pages" is non-NULL. 866 * is set implicitly if "pages" is non-NULL.
867 */ 867 */
868__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 868static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk,
869 unsigned long start, unsigned long nr_pages, 869 struct mm_struct *mm, unsigned long start,
870 struct page **pages, unsigned int gup_flags) 870 unsigned long nr_pages, struct page **pages,
871 unsigned int gup_flags)
871{ 872{
872 long ret; 873 long ret;
873 int locked = 1; 874 int locked = 1;
@@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m
879 up_read(&mm->mmap_sem); 880 up_read(&mm->mmap_sem);
880 return ret; 881 return ret;
881} 882}
882EXPORT_SYMBOL(__get_user_pages_unlocked);
883 883
884/* 884/*
885 * get_user_pages_unlocked() is suitable to replace the form: 885 * get_user_pages_unlocked() is suitable to replace the form:
@@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
917 * only intends to ensure the pages are faulted in. 917 * only intends to ensure the pages are faulted in.
918 * @vmas: array of pointers to vmas corresponding to each page. 918 * @vmas: array of pointers to vmas corresponding to each page.
919 * Or NULL if the caller does not require them. 919 * Or NULL if the caller does not require them.
920 * @locked: pointer to lock flag indicating whether lock is held and
921 * subsequently whether VM_FAULT_RETRY functionality can be
922 * utilised. Lock must initially be held.
920 * 923 *
921 * Returns number of pages pinned. This may be fewer than the number 924 * Returns number of pages pinned. This may be fewer than the number
922 * requested. If nr_pages is 0 or negative, returns 0. If no pages 925 * requested. If nr_pages is 0 or negative, returns 0. If no pages
@@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
960long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, 963long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
961 unsigned long start, unsigned long nr_pages, 964 unsigned long start, unsigned long nr_pages,
962 unsigned int gup_flags, struct page **pages, 965 unsigned int gup_flags, struct page **pages,
963 struct vm_area_struct **vmas) 966 struct vm_area_struct **vmas, int *locked)
964{ 967{
965 return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, 968 return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
966 NULL, false, 969 locked, true,
967 gup_flags | FOLL_TOUCH | FOLL_REMOTE); 970 gup_flags | FOLL_TOUCH | FOLL_REMOTE);
968} 971}
969EXPORT_SYMBOL(get_user_pages_remote); 972EXPORT_SYMBOL(get_user_pages_remote);
@@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote);
971/* 974/*
972 * This is the same as get_user_pages_remote(), just with a 975 * This is the same as get_user_pages_remote(), just with a
973 * less-flexible calling convention where we assume that the task 976 * less-flexible calling convention where we assume that the task
974 * and mm being operated on are the current task's. We also 977 * and mm being operated on are the current task's and don't allow
975 * obviously don't pass FOLL_REMOTE in here. 978 * passing of a locked parameter. We also obviously don't pass
979 * FOLL_REMOTE in here.
976 */ 980 */
977long get_user_pages(unsigned long start, unsigned long nr_pages, 981long get_user_pages(unsigned long start, unsigned long nr_pages,
978 unsigned int gup_flags, struct page **pages, 982 unsigned int gup_flags, struct page **pages,
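The @locked parameter documented above changes the get_user_pages_remote() calling convention: callers must now check whether mmap_sem is still held on return, because gup may drop it on VM_FAULT_RETRY. A minimal, hypothetical caller against the new eight-argument signature might look like the sketch below (the helper name, FOLL_WRITE flag and single-page pin are illustrative assumptions, not taken from this patch):

/*
 * Hypothetical caller sketch: pin one page of a foreign mm with the new
 * get_user_pages_remote() signature.  If *locked was cleared, gup already
 * released mmap_sem for us.  On success the caller owns a page reference
 * and must put_page() it later.
 */
static struct page *pin_one_remote_page(struct task_struct *tsk,
					struct mm_struct *mm,
					unsigned long addr)
{
	struct page *page;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE,
				    &page, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);
	return ret == 1 ? page : NULL;
}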
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cee42cf05477..10eedbf14421 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
542} 542}
543EXPORT_SYMBOL_GPL(thp_get_unmapped_area); 543EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
544 544
545static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, 545static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
546 gfp_t gfp) 546 gfp_t gfp)
547{ 547{
548 struct vm_area_struct *vma = fe->vma; 548 struct vm_area_struct *vma = vmf->vma;
549 struct mem_cgroup *memcg; 549 struct mem_cgroup *memcg;
550 pgtable_t pgtable; 550 pgtable_t pgtable;
551 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 551 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
552 552
553 VM_BUG_ON_PAGE(!PageCompound(page), page); 553 VM_BUG_ON_PAGE(!PageCompound(page), page);
554 554
@@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
573 */ 573 */
574 __SetPageUptodate(page); 574 __SetPageUptodate(page);
575 575
576 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 576 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
577 if (unlikely(!pmd_none(*fe->pmd))) { 577 if (unlikely(!pmd_none(*vmf->pmd))) {
578 spin_unlock(fe->ptl); 578 spin_unlock(vmf->ptl);
579 mem_cgroup_cancel_charge(page, memcg, true); 579 mem_cgroup_cancel_charge(page, memcg, true);
580 put_page(page); 580 put_page(page);
581 pte_free(vma->vm_mm, pgtable); 581 pte_free(vma->vm_mm, pgtable);
@@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
586 if (userfaultfd_missing(vma)) { 586 if (userfaultfd_missing(vma)) {
587 int ret; 587 int ret;
588 588
589 spin_unlock(fe->ptl); 589 spin_unlock(vmf->ptl);
590 mem_cgroup_cancel_charge(page, memcg, true); 590 mem_cgroup_cancel_charge(page, memcg, true);
591 put_page(page); 591 put_page(page);
592 pte_free(vma->vm_mm, pgtable); 592 pte_free(vma->vm_mm, pgtable);
593 ret = handle_userfault(fe, VM_UFFD_MISSING); 593 ret = handle_userfault(vmf, VM_UFFD_MISSING);
594 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 594 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
595 return ret; 595 return ret;
596 } 596 }
@@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
600 page_add_new_anon_rmap(page, vma, haddr, true); 600 page_add_new_anon_rmap(page, vma, haddr, true);
601 mem_cgroup_commit_charge(page, memcg, false, true); 601 mem_cgroup_commit_charge(page, memcg, false, true);
602 lru_cache_add_active_or_unevictable(page, vma); 602 lru_cache_add_active_or_unevictable(page, vma);
603 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); 603 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
604 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 604 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
605 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 605 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
606 atomic_long_inc(&vma->vm_mm->nr_ptes); 606 atomic_long_inc(&vma->vm_mm->nr_ptes);
607 spin_unlock(fe->ptl); 607 spin_unlock(vmf->ptl);
608 count_vm_event(THP_FAULT_ALLOC); 608 count_vm_event(THP_FAULT_ALLOC);
609 } 609 }
610 610
@@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
651 return true; 651 return true;
652} 652}
653 653
654int do_huge_pmd_anonymous_page(struct fault_env *fe) 654int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
655{ 655{
656 struct vm_area_struct *vma = fe->vma; 656 struct vm_area_struct *vma = vmf->vma;
657 gfp_t gfp; 657 gfp_t gfp;
658 struct page *page; 658 struct page *page;
659 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 659 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
660 660
661 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) 661 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
662 return VM_FAULT_FALLBACK; 662 return VM_FAULT_FALLBACK;
@@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
664 return VM_FAULT_OOM; 664 return VM_FAULT_OOM;
665 if (unlikely(khugepaged_enter(vma, vma->vm_flags))) 665 if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
666 return VM_FAULT_OOM; 666 return VM_FAULT_OOM;
667 if (!(fe->flags & FAULT_FLAG_WRITE) && 667 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
668 !mm_forbids_zeropage(vma->vm_mm) && 668 !mm_forbids_zeropage(vma->vm_mm) &&
669 transparent_hugepage_use_zero_page()) { 669 transparent_hugepage_use_zero_page()) {
670 pgtable_t pgtable; 670 pgtable_t pgtable;
@@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
680 count_vm_event(THP_FAULT_FALLBACK); 680 count_vm_event(THP_FAULT_FALLBACK);
681 return VM_FAULT_FALLBACK; 681 return VM_FAULT_FALLBACK;
682 } 682 }
683 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 683 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
684 ret = 0; 684 ret = 0;
685 set = false; 685 set = false;
686 if (pmd_none(*fe->pmd)) { 686 if (pmd_none(*vmf->pmd)) {
687 if (userfaultfd_missing(vma)) { 687 if (userfaultfd_missing(vma)) {
688 spin_unlock(fe->ptl); 688 spin_unlock(vmf->ptl);
689 ret = handle_userfault(fe, VM_UFFD_MISSING); 689 ret = handle_userfault(vmf, VM_UFFD_MISSING);
690 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 690 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
691 } else { 691 } else {
692 set_huge_zero_page(pgtable, vma->vm_mm, vma, 692 set_huge_zero_page(pgtable, vma->vm_mm, vma,
693 haddr, fe->pmd, zero_page); 693 haddr, vmf->pmd, zero_page);
694 spin_unlock(fe->ptl); 694 spin_unlock(vmf->ptl);
695 set = true; 695 set = true;
696 } 696 }
697 } else 697 } else
698 spin_unlock(fe->ptl); 698 spin_unlock(vmf->ptl);
699 if (!set) 699 if (!set)
700 pte_free(vma->vm_mm, pgtable); 700 pte_free(vma->vm_mm, pgtable);
701 return ret; 701 return ret;
@@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe)
707 return VM_FAULT_FALLBACK; 707 return VM_FAULT_FALLBACK;
708 } 708 }
709 prep_transhuge_page(page); 709 prep_transhuge_page(page);
710 return __do_huge_pmd_anonymous_page(fe, page, gfp); 710 return __do_huge_pmd_anonymous_page(vmf, page, gfp);
711} 711}
712 712
713static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 713static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -879,30 +879,30 @@ out:
879 return ret; 879 return ret;
880} 880}
881 881
882void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) 882void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
883{ 883{
884 pmd_t entry; 884 pmd_t entry;
885 unsigned long haddr; 885 unsigned long haddr;
886 886
887 fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); 887 vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
888 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 888 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
889 goto unlock; 889 goto unlock;
890 890
891 entry = pmd_mkyoung(orig_pmd); 891 entry = pmd_mkyoung(orig_pmd);
892 haddr = fe->address & HPAGE_PMD_MASK; 892 haddr = vmf->address & HPAGE_PMD_MASK;
893 if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, 893 if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry,
894 fe->flags & FAULT_FLAG_WRITE)) 894 vmf->flags & FAULT_FLAG_WRITE))
895 update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); 895 update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
896 896
897unlock: 897unlock:
898 spin_unlock(fe->ptl); 898 spin_unlock(vmf->ptl);
899} 899}
900 900
901static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, 901static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
902 struct page *page) 902 struct page *page)
903{ 903{
904 struct vm_area_struct *vma = fe->vma; 904 struct vm_area_struct *vma = vmf->vma;
905 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 905 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
906 struct mem_cgroup *memcg; 906 struct mem_cgroup *memcg;
907 pgtable_t pgtable; 907 pgtable_t pgtable;
908 pmd_t _pmd; 908 pmd_t _pmd;
@@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
921 for (i = 0; i < HPAGE_PMD_NR; i++) { 921 for (i = 0; i < HPAGE_PMD_NR; i++) {
922 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 922 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
923 __GFP_OTHER_NODE, vma, 923 __GFP_OTHER_NODE, vma,
924 fe->address, page_to_nid(page)); 924 vmf->address, page_to_nid(page));
925 if (unlikely(!pages[i] || 925 if (unlikely(!pages[i] ||
926 mem_cgroup_try_charge(pages[i], vma->vm_mm, 926 mem_cgroup_try_charge(pages[i], vma->vm_mm,
927 GFP_KERNEL, &memcg, false))) { 927 GFP_KERNEL, &memcg, false))) {
@@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
952 mmun_end = haddr + HPAGE_PMD_SIZE; 952 mmun_end = haddr + HPAGE_PMD_SIZE;
953 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 953 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
954 954
955 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 955 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
956 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 956 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
957 goto out_free_pages; 957 goto out_free_pages;
958 VM_BUG_ON_PAGE(!PageHead(page), page); 958 VM_BUG_ON_PAGE(!PageHead(page), page);
959 959
960 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); 960 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
961 /* leave pmd empty until pte is filled */ 961 /* leave pmd empty until pte is filled */
962 962
963 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); 963 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
964 pmd_populate(vma->vm_mm, &_pmd, pgtable); 964 pmd_populate(vma->vm_mm, &_pmd, pgtable);
965 965
966 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 966 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
969 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 969 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
970 memcg = (void *)page_private(pages[i]); 970 memcg = (void *)page_private(pages[i]);
971 set_page_private(pages[i], 0); 971 set_page_private(pages[i], 0);
972 page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); 972 page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
973 mem_cgroup_commit_charge(pages[i], memcg, false, false); 973 mem_cgroup_commit_charge(pages[i], memcg, false, false);
974 lru_cache_add_active_or_unevictable(pages[i], vma); 974 lru_cache_add_active_or_unevictable(pages[i], vma);
975 fe->pte = pte_offset_map(&_pmd, haddr); 975 vmf->pte = pte_offset_map(&_pmd, haddr);
976 VM_BUG_ON(!pte_none(*fe->pte)); 976 VM_BUG_ON(!pte_none(*vmf->pte));
977 set_pte_at(vma->vm_mm, haddr, fe->pte, entry); 977 set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
978 pte_unmap(fe->pte); 978 pte_unmap(vmf->pte);
979 } 979 }
980 kfree(pages); 980 kfree(pages);
981 981
982 smp_wmb(); /* make pte visible before pmd */ 982 smp_wmb(); /* make pte visible before pmd */
983 pmd_populate(vma->vm_mm, fe->pmd, pgtable); 983 pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
984 page_remove_rmap(page, true); 984 page_remove_rmap(page, true);
985 spin_unlock(fe->ptl); 985 spin_unlock(vmf->ptl);
986 986
987 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 987 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
988 988
@@ -993,7 +993,7 @@ out:
993 return ret; 993 return ret;
994 994
995out_free_pages: 995out_free_pages:
996 spin_unlock(fe->ptl); 996 spin_unlock(vmf->ptl);
997 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 997 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
998 for (i = 0; i < HPAGE_PMD_NR; i++) { 998 for (i = 0; i < HPAGE_PMD_NR; i++) {
999 memcg = (void *)page_private(pages[i]); 999 memcg = (void *)page_private(pages[i]);
@@ -1005,23 +1005,23 @@ out_free_pages:
1005 goto out; 1005 goto out;
1006} 1006}
1007 1007
1008int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) 1008int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1009{ 1009{
1010 struct vm_area_struct *vma = fe->vma; 1010 struct vm_area_struct *vma = vmf->vma;
1011 struct page *page = NULL, *new_page; 1011 struct page *page = NULL, *new_page;
1012 struct mem_cgroup *memcg; 1012 struct mem_cgroup *memcg;
1013 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 1013 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1014 unsigned long mmun_start; /* For mmu_notifiers */ 1014 unsigned long mmun_start; /* For mmu_notifiers */
1015 unsigned long mmun_end; /* For mmu_notifiers */ 1015 unsigned long mmun_end; /* For mmu_notifiers */
1016 gfp_t huge_gfp; /* for allocation and charge */ 1016 gfp_t huge_gfp; /* for allocation and charge */
1017 int ret = 0; 1017 int ret = 0;
1018 1018
1019 fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); 1019 vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1020 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1020 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1021 if (is_huge_zero_pmd(orig_pmd)) 1021 if (is_huge_zero_pmd(orig_pmd))
1022 goto alloc; 1022 goto alloc;
1023 spin_lock(fe->ptl); 1023 spin_lock(vmf->ptl);
1024 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) 1024 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1025 goto out_unlock; 1025 goto out_unlock;
1026 1026
1027 page = pmd_page(orig_pmd); 1027 page = pmd_page(orig_pmd);
@@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
1034 pmd_t entry; 1034 pmd_t entry;
1035 entry = pmd_mkyoung(orig_pmd); 1035 entry = pmd_mkyoung(orig_pmd);
1036 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1036 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1037 if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) 1037 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
1038 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1038 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1039 ret |= VM_FAULT_WRITE; 1039 ret |= VM_FAULT_WRITE;
1040 goto out_unlock; 1040 goto out_unlock;
1041 } 1041 }
1042 get_page(page); 1042 get_page(page);
1043 spin_unlock(fe->ptl); 1043 spin_unlock(vmf->ptl);
1044alloc: 1044alloc:
1045 if (transparent_hugepage_enabled(vma) && 1045 if (transparent_hugepage_enabled(vma) &&
1046 !transparent_hugepage_debug_cow()) { 1046 !transparent_hugepage_debug_cow()) {
@@ -1053,12 +1053,12 @@ alloc:
1053 prep_transhuge_page(new_page); 1053 prep_transhuge_page(new_page);
1054 } else { 1054 } else {
1055 if (!page) { 1055 if (!page) {
1056 split_huge_pmd(vma, fe->pmd, fe->address); 1056 split_huge_pmd(vma, vmf->pmd, vmf->address);
1057 ret |= VM_FAULT_FALLBACK; 1057 ret |= VM_FAULT_FALLBACK;
1058 } else { 1058 } else {
1059 ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); 1059 ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
1060 if (ret & VM_FAULT_OOM) { 1060 if (ret & VM_FAULT_OOM) {
1061 split_huge_pmd(vma, fe->pmd, fe->address); 1061 split_huge_pmd(vma, vmf->pmd, vmf->address);
1062 ret |= VM_FAULT_FALLBACK; 1062 ret |= VM_FAULT_FALLBACK;
1063 } 1063 }
1064 put_page(page); 1064 put_page(page);
@@ -1070,7 +1070,7 @@ alloc:
1070 if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, 1070 if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
1071 huge_gfp, &memcg, true))) { 1071 huge_gfp, &memcg, true))) {
1072 put_page(new_page); 1072 put_page(new_page);
1073 split_huge_pmd(vma, fe->pmd, fe->address); 1073 split_huge_pmd(vma, vmf->pmd, vmf->address);
1074 if (page) 1074 if (page)
1075 put_page(page); 1075 put_page(page);
1076 ret |= VM_FAULT_FALLBACK; 1076 ret |= VM_FAULT_FALLBACK;
@@ -1090,11 +1090,11 @@ alloc:
1090 mmun_end = haddr + HPAGE_PMD_SIZE; 1090 mmun_end = haddr + HPAGE_PMD_SIZE;
1091 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); 1091 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
1092 1092
1093 spin_lock(fe->ptl); 1093 spin_lock(vmf->ptl);
1094 if (page) 1094 if (page)
1095 put_page(page); 1095 put_page(page);
1096 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { 1096 if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1097 spin_unlock(fe->ptl); 1097 spin_unlock(vmf->ptl);
1098 mem_cgroup_cancel_charge(new_page, memcg, true); 1098 mem_cgroup_cancel_charge(new_page, memcg, true);
1099 put_page(new_page); 1099 put_page(new_page);
1100 goto out_mn; 1100 goto out_mn;
@@ -1102,12 +1102,12 @@ alloc:
1102 pmd_t entry; 1102 pmd_t entry;
1103 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1103 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1104 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1104 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1105 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); 1105 pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1106 page_add_new_anon_rmap(new_page, vma, haddr, true); 1106 page_add_new_anon_rmap(new_page, vma, haddr, true);
1107 mem_cgroup_commit_charge(new_page, memcg, false, true); 1107 mem_cgroup_commit_charge(new_page, memcg, false, true);
1108 lru_cache_add_active_or_unevictable(new_page, vma); 1108 lru_cache_add_active_or_unevictable(new_page, vma);
1109 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 1109 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
1110 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1110 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1111 if (!page) { 1111 if (!page) {
1112 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); 1112 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1113 } else { 1113 } else {
@@ -1117,13 +1117,13 @@ alloc:
1117 } 1117 }
1118 ret |= VM_FAULT_WRITE; 1118 ret |= VM_FAULT_WRITE;
1119 } 1119 }
1120 spin_unlock(fe->ptl); 1120 spin_unlock(vmf->ptl);
1121out_mn: 1121out_mn:
1122 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); 1122 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1123out: 1123out:
1124 return ret; 1124 return ret;
1125out_unlock: 1125out_unlock:
1126 spin_unlock(fe->ptl); 1126 spin_unlock(vmf->ptl);
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
@@ -1196,12 +1196,12 @@ out:
1196} 1196}
1197 1197
1198/* NUMA hinting page fault entry point for trans huge pmds */ 1198/* NUMA hinting page fault entry point for trans huge pmds */
1199int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) 1199int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1200{ 1200{
1201 struct vm_area_struct *vma = fe->vma; 1201 struct vm_area_struct *vma = vmf->vma;
1202 struct anon_vma *anon_vma = NULL; 1202 struct anon_vma *anon_vma = NULL;
1203 struct page *page; 1203 struct page *page;
1204 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 1204 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1205 int page_nid = -1, this_nid = numa_node_id(); 1205 int page_nid = -1, this_nid = numa_node_id();
1206 int target_nid, last_cpupid = -1; 1206 int target_nid, last_cpupid = -1;
1207 bool page_locked; 1207 bool page_locked;
@@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1209 bool was_writable; 1209 bool was_writable;
1210 int flags = 0; 1210 int flags = 0;
1211 1211
1212 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 1212 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1213 if (unlikely(!pmd_same(pmd, *fe->pmd))) 1213 if (unlikely(!pmd_same(pmd, *vmf->pmd)))
1214 goto out_unlock; 1214 goto out_unlock;
1215 1215
1216 /* 1216 /*
@@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1218 * without disrupting NUMA hinting information. Do not relock and 1218 * without disrupting NUMA hinting information. Do not relock and
1219 * check_same as the page may no longer be mapped. 1219 * check_same as the page may no longer be mapped.
1220 */ 1220 */
1221 if (unlikely(pmd_trans_migrating(*fe->pmd))) { 1221 if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
1222 page = pmd_page(*fe->pmd); 1222 page = pmd_page(*vmf->pmd);
1223 spin_unlock(fe->ptl); 1223 spin_unlock(vmf->ptl);
1224 wait_on_page_locked(page); 1224 wait_on_page_locked(page);
1225 goto out; 1225 goto out;
1226 } 1226 }
@@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1253 1253
1254 /* Migration could have started since the pmd_trans_migrating check */ 1254 /* Migration could have started since the pmd_trans_migrating check */
1255 if (!page_locked) { 1255 if (!page_locked) {
1256 spin_unlock(fe->ptl); 1256 spin_unlock(vmf->ptl);
1257 wait_on_page_locked(page); 1257 wait_on_page_locked(page);
1258 page_nid = -1; 1258 page_nid = -1;
1259 goto out; 1259 goto out;
@@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1264 * to serialises splits 1264 * to serialises splits
1265 */ 1265 */
1266 get_page(page); 1266 get_page(page);
1267 spin_unlock(fe->ptl); 1267 spin_unlock(vmf->ptl);
1268 anon_vma = page_lock_anon_vma_read(page); 1268 anon_vma = page_lock_anon_vma_read(page);
1269 1269
1270 /* Confirm the PMD did not change while page_table_lock was released */ 1270 /* Confirm the PMD did not change while page_table_lock was released */
1271 spin_lock(fe->ptl); 1271 spin_lock(vmf->ptl);
1272 if (unlikely(!pmd_same(pmd, *fe->pmd))) { 1272 if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
1273 unlock_page(page); 1273 unlock_page(page);
1274 put_page(page); 1274 put_page(page);
1275 page_nid = -1; 1275 page_nid = -1;
@@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1287 * Migrate the THP to the requested node, returns with page unlocked 1287 * Migrate the THP to the requested node, returns with page unlocked
1288 * and access rights restored. 1288 * and access rights restored.
1289 */ 1289 */
1290 spin_unlock(fe->ptl); 1290 spin_unlock(vmf->ptl);
1291 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, 1291 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1292 fe->pmd, pmd, fe->address, page, target_nid); 1292 vmf->pmd, pmd, vmf->address, page, target_nid);
1293 if (migrated) { 1293 if (migrated) {
1294 flags |= TNF_MIGRATED; 1294 flags |= TNF_MIGRATED;
1295 page_nid = target_nid; 1295 page_nid = target_nid;
@@ -1304,18 +1304,19 @@ clear_pmdnuma:
1304 pmd = pmd_mkyoung(pmd); 1304 pmd = pmd_mkyoung(pmd);
1305 if (was_writable) 1305 if (was_writable)
1306 pmd = pmd_mkwrite(pmd); 1306 pmd = pmd_mkwrite(pmd);
1307 set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); 1307 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1308 update_mmu_cache_pmd(vma, fe->address, fe->pmd); 1308 update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1309 unlock_page(page); 1309 unlock_page(page);
1310out_unlock: 1310out_unlock:
1311 spin_unlock(fe->ptl); 1311 spin_unlock(vmf->ptl);
1312 1312
1313out: 1313out:
1314 if (anon_vma) 1314 if (anon_vma)
1315 page_unlock_anon_vma_read(anon_vma); 1315 page_unlock_anon_vma_read(anon_vma);
1316 1316
1317 if (page_nid != -1) 1317 if (page_nid != -1)
1318 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); 1318 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
1319 vmf->flags);
1319 1320
1320 return 0; 1321 return 0;
1321} 1322}
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..44d68895a9b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,7 +36,7 @@
36/* Do not use these with a slab allocator */ 36/* Do not use these with a slab allocator */
37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) 37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
38 38
39int do_swap_page(struct fault_env *fe, pte_t orig_pte); 39int do_swap_page(struct vm_fault *vmf);
40 40
41void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 41void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
42 unsigned long floor, unsigned long ceiling); 42 unsigned long floor, unsigned long ceiling);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 09460955e818..e32389a97030 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
875 unsigned long address, pmd_t *pmd, 875 unsigned long address, pmd_t *pmd,
876 int referenced) 876 int referenced)
877{ 877{
878 pte_t pteval;
879 int swapped_in = 0, ret = 0; 878 int swapped_in = 0, ret = 0;
880 struct fault_env fe = { 879 struct vm_fault vmf = {
881 .vma = vma, 880 .vma = vma,
882 .address = address, 881 .address = address,
883 .flags = FAULT_FLAG_ALLOW_RETRY, 882 .flags = FAULT_FLAG_ALLOW_RETRY,
884 .pmd = pmd, 883 .pmd = pmd,
884 .pgoff = linear_page_index(vma, address),
885 }; 885 };
886 886
887 /* we only decide to swapin, if there is enough young ptes */ 887 /* we only decide to swapin, if there is enough young ptes */
@@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
889 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); 889 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
890 return false; 890 return false;
891 } 891 }
892 fe.pte = pte_offset_map(pmd, address); 892 vmf.pte = pte_offset_map(pmd, address);
893 for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; 893 for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
894 fe.pte++, fe.address += PAGE_SIZE) { 894 vmf.pte++, vmf.address += PAGE_SIZE) {
895 pteval = *fe.pte; 895 vmf.orig_pte = *vmf.pte;
896 if (!is_swap_pte(pteval)) 896 if (!is_swap_pte(vmf.orig_pte))
897 continue; 897 continue;
898 swapped_in++; 898 swapped_in++;
899 ret = do_swap_page(&fe, pteval); 899 ret = do_swap_page(&vmf);
900 900
901 /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ 901 /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
902 if (ret & VM_FAULT_RETRY) { 902 if (ret & VM_FAULT_RETRY) {
903 down_read(&mm->mmap_sem); 903 down_read(&mm->mmap_sem);
904 if (hugepage_vma_revalidate(mm, address, &fe.vma)) { 904 if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
905 /* vma is no longer available, don't continue to swapin */ 905 /* vma is no longer available, don't continue to swapin */
906 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); 906 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
907 return false; 907 return false;
@@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
915 return false; 915 return false;
916 } 916 }
917 /* pte is unmapped now, we need to map it */ 917 /* pte is unmapped now, we need to map it */
918 fe.pte = pte_offset_map(pmd, fe.address); 918 vmf.pte = pte_offset_map(pmd, vmf.address);
919 } 919 }
920 fe.pte--; 920 vmf.pte--;
921 pte_unmap(fe.pte); 921 pte_unmap(vmf.pte);
922 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); 922 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
923 return true; 923 return true;
924} 924}
@@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm,
1446 radix_tree_replace_slot(&mapping->page_tree, slot, 1446 radix_tree_replace_slot(&mapping->page_tree, slot,
1447 new_page + (index % HPAGE_PMD_NR)); 1447 new_page + (index % HPAGE_PMD_NR));
1448 1448
1449 slot = radix_tree_iter_next(&iter); 1449 slot = radix_tree_iter_resume(slot, &iter);
1450 index++; 1450 index++;
1451 continue; 1451 continue;
1452out_lru: 1452out_lru:
@@ -1546,7 +1546,6 @@ tree_unlocked:
1546 /* Put holes back where they were */ 1546 /* Put holes back where they were */
1547 radix_tree_delete(&mapping->page_tree, 1547 radix_tree_delete(&mapping->page_tree,
1548 iter.index); 1548 iter.index);
1549 slot = radix_tree_iter_next(&iter);
1550 continue; 1549 continue;
1551 } 1550 }
1552 1551
@@ -1557,11 +1556,11 @@ tree_unlocked:
1557 page_ref_unfreeze(page, 2); 1556 page_ref_unfreeze(page, 2);
1558 radix_tree_replace_slot(&mapping->page_tree, 1557 radix_tree_replace_slot(&mapping->page_tree,
1559 slot, page); 1558 slot, page);
1559 slot = radix_tree_iter_resume(slot, &iter);
1560 spin_unlock_irq(&mapping->tree_lock); 1560 spin_unlock_irq(&mapping->tree_lock);
1561 putback_lru_page(page); 1561 putback_lru_page(page);
1562 unlock_page(page); 1562 unlock_page(page);
1563 spin_lock_irq(&mapping->tree_lock); 1563 spin_lock_irq(&mapping->tree_lock);
1564 slot = radix_tree_iter_next(&iter);
1565 } 1564 }
1566 VM_BUG_ON(nr_none); 1565 VM_BUG_ON(nr_none);
1567 spin_unlock_irq(&mapping->tree_lock); 1566 spin_unlock_irq(&mapping->tree_lock);
@@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
1641 present++; 1640 present++;
1642 1641
1643 if (need_resched()) { 1642 if (need_resched()) {
1643 slot = radix_tree_iter_resume(slot, &iter);
1644 cond_resched_rcu(); 1644 cond_resched_rcu();
1645 slot = radix_tree_iter_next(&iter);
1646 } 1645 }
1647 } 1646 }
1648 rcu_read_unlock(); 1647 rcu_read_unlock();
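Besides the vm_fault conversion, the khugepaged hunks switch the shmem scans from radix_tree_iter_next() to radix_tree_iter_resume(), which is now called on the current slot before the tree lock is dropped or the CPU rescheduled, rather than afterwards. A hedged sketch of the new loop shape (example_scan and its start offset are illustrative, not part of this patch):

/*
 * Illustrative loop shape only: resume the iterator before giving up the
 * RCU read lock, matching the radix_tree_iter_resume() calls above.
 */
static void example_scan(struct address_space *mapping, pgoff_t start)
{
	struct radix_tree_iter iter;
	void **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
		/* ... examine the entry at "slot" ... */
		if (need_resched()) {
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}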
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da39de28..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
2034 * 2034 *
2035 * We do this without the lock held, so that it can sleep if it needs to. 2035 * We do this without the lock held, so that it can sleep if it needs to.
2036 */ 2036 */
2037static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, 2037static int do_page_mkwrite(struct vm_fault *vmf)
2038 unsigned long address)
2039{ 2038{
2040 struct vm_fault vmf;
2041 int ret; 2039 int ret;
2040 struct page *page = vmf->page;
2041 unsigned int old_flags = vmf->flags;
2042 2042
2043 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2043 vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2044 vmf.pgoff = page->index;
2045 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
2046 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2047 vmf.page = page;
2048 vmf.cow_page = NULL;
2049 2044
2050 ret = vma->vm_ops->page_mkwrite(vma, &vmf); 2045 ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf);
2046 /* Restore original flags so that caller is not surprised */
2047 vmf->flags = old_flags;
2051 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2048 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2052 return ret; 2049 return ret;
2053 if (unlikely(!(ret & VM_FAULT_LOCKED))) { 2050 if (unlikely(!(ret & VM_FAULT_LOCKED))) {
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2063} 2060}
2064 2061
2065/* 2062/*
2063 * Handle dirtying of a page in shared file mapping on a write fault.
2064 *
2065 * The function expects the page to be locked and unlocks it.
2066 */
2067static void fault_dirty_shared_page(struct vm_area_struct *vma,
2068 struct page *page)
2069{
2070 struct address_space *mapping;
2071 bool dirtied;
2072 bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
2073
2074 dirtied = set_page_dirty(page);
2075 VM_BUG_ON_PAGE(PageAnon(page), page);
2076 /*
2077 * Take a local copy of the address_space - page.mapping may be zeroed
2078 * by truncate after unlock_page(). The address_space itself remains
2079 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
2080 * release semantics to prevent the compiler from undoing this copying.
2081 */
2082 mapping = page_rmapping(page);
2083 unlock_page(page);
2084
2085 if ((dirtied || page_mkwrite) && mapping) {
2086 /*
2087 * Some device drivers do not set page.mapping
2088 * but still dirty their pages
2089 */
2090 balance_dirty_pages_ratelimited(mapping);
2091 }
2092
2093 if (!page_mkwrite)
2094 file_update_time(vma->vm_file);
2095}
2096
2097/*
2066 * Handle write page faults for pages that can be reused in the current vma 2098 * Handle write page faults for pages that can be reused in the current vma
2067 * 2099 *
2068 * This can happen either due to the mapping being with the VM_SHARED flag, 2100 * This can happen either due to the mapping being with the VM_SHARED flag,
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2070 * case, all we need to do here is to mark the page as writable and update 2102 * case, all we need to do here is to mark the page as writable and update
2071 * any related book-keeping. 2103 * any related book-keeping.
2072 */ 2104 */
2073static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, 2105static inline void wp_page_reuse(struct vm_fault *vmf)
2074 struct page *page, int page_mkwrite, int dirty_shared) 2106 __releases(vmf->ptl)
2075 __releases(fe->ptl)
2076{ 2107{
2077 struct vm_area_struct *vma = fe->vma; 2108 struct vm_area_struct *vma = vmf->vma;
2109 struct page *page = vmf->page;
2078 pte_t entry; 2110 pte_t entry;
2079 /* 2111 /*
2080 * Clear the pages cpupid information as the existing 2112 * Clear the pages cpupid information as the existing
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2084 if (page) 2116 if (page)
2085 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); 2117 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2086 2118
2087 flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); 2119 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2088 entry = pte_mkyoung(orig_pte); 2120 entry = pte_mkyoung(vmf->orig_pte);
2089 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2121 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2090 if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) 2122 if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
2091 update_mmu_cache(vma, fe->address, fe->pte); 2123 update_mmu_cache(vma, vmf->address, vmf->pte);
2092 pte_unmap_unlock(fe->pte, fe->ptl); 2124 pte_unmap_unlock(vmf->pte, vmf->ptl);
2093
2094 if (dirty_shared) {
2095 struct address_space *mapping;
2096 int dirtied;
2097
2098 if (!page_mkwrite)
2099 lock_page(page);
2100
2101 dirtied = set_page_dirty(page);
2102 VM_BUG_ON_PAGE(PageAnon(page), page);
2103 mapping = page->mapping;
2104 unlock_page(page);
2105 put_page(page);
2106
2107 if ((dirtied || page_mkwrite) && mapping) {
2108 /*
2109 * Some device drivers do not set page.mapping
2110 * but still dirty their pages
2111 */
2112 balance_dirty_pages_ratelimited(mapping);
2113 }
2114
2115 if (!page_mkwrite)
2116 file_update_time(vma->vm_file);
2117 }
2118
2119 return VM_FAULT_WRITE;
2120} 2125}
2121 2126
2122/* 2127/*
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2135 * held to the old page, as well as updating the rmap. 2140 * held to the old page, as well as updating the rmap.
2136 * - In any case, unlock the PTL and drop the reference we took to the old page. 2141 * - In any case, unlock the PTL and drop the reference we took to the old page.
2137 */ 2142 */
2138static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, 2143static int wp_page_copy(struct vm_fault *vmf)
2139 struct page *old_page)
2140{ 2144{
2141 struct vm_area_struct *vma = fe->vma; 2145 struct vm_area_struct *vma = vmf->vma;
2142 struct mm_struct *mm = vma->vm_mm; 2146 struct mm_struct *mm = vma->vm_mm;
2147 struct page *old_page = vmf->page;
2143 struct page *new_page = NULL; 2148 struct page *new_page = NULL;
2144 pte_t entry; 2149 pte_t entry;
2145 int page_copied = 0; 2150 int page_copied = 0;
2146 const unsigned long mmun_start = fe->address & PAGE_MASK; 2151 const unsigned long mmun_start = vmf->address & PAGE_MASK;
2147 const unsigned long mmun_end = mmun_start + PAGE_SIZE; 2152 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2148 struct mem_cgroup *memcg; 2153 struct mem_cgroup *memcg;
2149 2154
2150 if (unlikely(anon_vma_prepare(vma))) 2155 if (unlikely(anon_vma_prepare(vma)))
2151 goto oom; 2156 goto oom;
2152 2157
2153 if (is_zero_pfn(pte_pfn(orig_pte))) { 2158 if (is_zero_pfn(pte_pfn(vmf->orig_pte))) {
2154 new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); 2159 new_page = alloc_zeroed_user_highpage_movable(vma,
2160 vmf->address);
2155 if (!new_page) 2161 if (!new_page)
2156 goto oom; 2162 goto oom;
2157 } else { 2163 } else {
2158 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, 2164 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2159 fe->address); 2165 vmf->address);
2160 if (!new_page) 2166 if (!new_page)
2161 goto oom; 2167 goto oom;
2162 cow_user_page(new_page, old_page, fe->address, vma); 2168 cow_user_page(new_page, old_page, vmf->address, vma);
2163 } 2169 }
2164 2170
2165 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) 2171 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2172 /* 2178 /*
2173 * Re-check the pte - we dropped the lock 2179 * Re-check the pte - we dropped the lock
2174 */ 2180 */
2175 fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); 2181 vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
2176 if (likely(pte_same(*fe->pte, orig_pte))) { 2182 if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
2177 if (old_page) { 2183 if (old_page) {
2178 if (!PageAnon(old_page)) { 2184 if (!PageAnon(old_page)) {
2179 dec_mm_counter_fast(mm, 2185 dec_mm_counter_fast(mm,
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2183 } else { 2189 } else {
2184 inc_mm_counter_fast(mm, MM_ANONPAGES); 2190 inc_mm_counter_fast(mm, MM_ANONPAGES);
2185 } 2191 }
2186 flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); 2192 flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
2187 entry = mk_pte(new_page, vma->vm_page_prot); 2193 entry = mk_pte(new_page, vma->vm_page_prot);
2188 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2194 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2189 /* 2195 /*
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2192 * seen in the presence of one thread doing SMC and another 2198 * seen in the presence of one thread doing SMC and another
2193 * thread doing COW. 2199 * thread doing COW.
2194 */ 2200 */
2195 ptep_clear_flush_notify(vma, fe->address, fe->pte); 2201 ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
2196 page_add_new_anon_rmap(new_page, vma, fe->address, false); 2202 page_add_new_anon_rmap(new_page, vma, vmf->address, false);
2197 mem_cgroup_commit_charge(new_page, memcg, false, false); 2203 mem_cgroup_commit_charge(new_page, memcg, false, false);
2198 lru_cache_add_active_or_unevictable(new_page, vma); 2204 lru_cache_add_active_or_unevictable(new_page, vma);
2199 /* 2205 /*
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2201 * mmu page tables (such as kvm shadow page tables), we want the 2207 * mmu page tables (such as kvm shadow page tables), we want the
2202 * new page to be mapped directly into the secondary page table. 2208 * new page to be mapped directly into the secondary page table.
2203 */ 2209 */
2204 set_pte_at_notify(mm, fe->address, fe->pte, entry); 2210 set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
2205 update_mmu_cache(vma, fe->address, fe->pte); 2211 update_mmu_cache(vma, vmf->address, vmf->pte);
2206 if (old_page) { 2212 if (old_page) {
2207 /* 2213 /*
2208 * Only after switching the pte to the new page may 2214 * Only after switching the pte to the new page may
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2239 if (new_page) 2245 if (new_page)
2240 put_page(new_page); 2246 put_page(new_page);
2241 2247
2242 pte_unmap_unlock(fe->pte, fe->ptl); 2248 pte_unmap_unlock(vmf->pte, vmf->ptl);
2243 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2249 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2244 if (old_page) { 2250 if (old_page) {
2245 /* 2251 /*
@@ -2263,79 +2269,91 @@ oom:
2263 return VM_FAULT_OOM; 2269 return VM_FAULT_OOM;
2264} 2270}
2265 2271
2272/**
2273 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
2274 * writeable once the page is prepared
2275 *
2276 * @vmf: structure describing the fault
2277 *
2278 * This function handles all that is needed to finish a write page fault in a
2279 * shared mapping due to PTE being read-only once the mapped page is prepared.
2280 * It handles locking of PTE and modifying it. The function returns
2281 * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
2282 * lock.
2283 *
2284 * The function expects the page to be locked or other protection against
2285 * concurrent faults / writeback (such as DAX radix tree locks).
2286 */
2287int finish_mkwrite_fault(struct vm_fault *vmf)
2288{
2289 WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
2290 vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
2291 &vmf->ptl);
2292 /*
2293 * We might have raced with another page fault while we released the
2294 * pte_offset_map_lock.
2295 */
2296 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2297 pte_unmap_unlock(vmf->pte, vmf->ptl);
2298 return VM_FAULT_NOPAGE;
2299 }
2300 wp_page_reuse(vmf);
2301 return 0;
2302}
2303
2266/* 2304/*
2267 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED 2305 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2268 * mapping 2306 * mapping
2269 */ 2307 */
2270static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) 2308static int wp_pfn_shared(struct vm_fault *vmf)
2271{ 2309{
2272 struct vm_area_struct *vma = fe->vma; 2310 struct vm_area_struct *vma = vmf->vma;
2273 2311
2274 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { 2312 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2275 struct vm_fault vmf = {
2276 .page = NULL,
2277 .pgoff = linear_page_index(vma, fe->address),
2278 .virtual_address =
2279 (void __user *)(fe->address & PAGE_MASK),
2280 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2281 };
2282 int ret; 2313 int ret;
2283 2314
2284 pte_unmap_unlock(fe->pte, fe->ptl); 2315 pte_unmap_unlock(vmf->pte, vmf->ptl);
2285 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); 2316 vmf->flags |= FAULT_FLAG_MKWRITE;
2286 if (ret & VM_FAULT_ERROR) 2317 ret = vma->vm_ops->pfn_mkwrite(vma, vmf);
2318 if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
2287 return ret; 2319 return ret;
2288 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2320 return finish_mkwrite_fault(vmf);
2289 &fe->ptl);
2290 /*
2291 * We might have raced with another page fault while we
2292 * released the pte_offset_map_lock.
2293 */
2294 if (!pte_same(*fe->pte, orig_pte)) {
2295 pte_unmap_unlock(fe->pte, fe->ptl);
2296 return 0;
2297 }
2298 } 2321 }
2299 return wp_page_reuse(fe, orig_pte, NULL, 0, 0); 2322 wp_page_reuse(vmf);
2323 return VM_FAULT_WRITE;
2300} 2324}
2301 2325
2302static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, 2326static int wp_page_shared(struct vm_fault *vmf)
2303 struct page *old_page) 2327 __releases(vmf->ptl)
2304 __releases(fe->ptl)
2305{ 2328{
2306 struct vm_area_struct *vma = fe->vma; 2329 struct vm_area_struct *vma = vmf->vma;
2307 int page_mkwrite = 0;
2308 2330
2309 get_page(old_page); 2331 get_page(vmf->page);
2310 2332
2311 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2333 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2312 int tmp; 2334 int tmp;
2313 2335
2314 pte_unmap_unlock(fe->pte, fe->ptl); 2336 pte_unmap_unlock(vmf->pte, vmf->ptl);
2315 tmp = do_page_mkwrite(vma, old_page, fe->address); 2337 tmp = do_page_mkwrite(vmf);
2316 if (unlikely(!tmp || (tmp & 2338 if (unlikely(!tmp || (tmp &
2317 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2339 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2318 put_page(old_page); 2340 put_page(vmf->page);
2319 return tmp; 2341 return tmp;
2320 } 2342 }
2321 /* 2343 tmp = finish_mkwrite_fault(vmf);
2322 * Since we dropped the lock we need to revalidate 2344 if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2323 * the PTE as someone else may have changed it. If 2345 unlock_page(vmf->page);
2324 * they did, we just return, as we can count on the 2346 put_page(vmf->page);
2325 * MMU to tell us if they didn't also make it writable. 2347 return tmp;
2326 */
2327 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2328 &fe->ptl);
2329 if (!pte_same(*fe->pte, orig_pte)) {
2330 unlock_page(old_page);
2331 pte_unmap_unlock(fe->pte, fe->ptl);
2332 put_page(old_page);
2333 return 0;
2334 } 2348 }
2335 page_mkwrite = 1; 2349 } else {
2350 wp_page_reuse(vmf);
2351 lock_page(vmf->page);
2336 } 2352 }
2353 fault_dirty_shared_page(vma, vmf->page);
2354 put_page(vmf->page);
2337 2355
2338 return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); 2356 return VM_FAULT_WRITE;
2339} 2357}
2340 2358
2341/* 2359/*
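finish_mkwrite_fault(), introduced in the hunk above, concentrates the "re-take the PTE lock and re-check it" step that wp_pfn_shared() and wp_page_shared() previously open-coded. A hypothetical pfn_mkwrite-style path built on it would mirror the converted wp_pfn_shared() (example_pfn_mkwrite_fault is illustrative, not a kernel function):

/*
 * Hypothetical sketch: run the driver callback without the PTE lock, then
 * let finish_mkwrite_fault() relock the PTE and either reuse it or return
 * VM_FAULT_NOPAGE if it changed while the lock was dropped.
 */
static int example_pfn_mkwrite_fault(struct vm_fault *vmf)
{
	int ret;

	pte_unmap_unlock(vmf->pte, vmf->ptl);
	vmf->flags |= FAULT_FLAG_MKWRITE;
	ret = vmf->vma->vm_ops->pfn_mkwrite(vmf->vma, vmf);
	if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
		return ret;
	return finish_mkwrite_fault(vmf);
}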
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
2356 * but allow concurrent faults), with pte both mapped and locked. 2374 * but allow concurrent faults), with pte both mapped and locked.
2357 * We return with mmap_sem still held, but pte unmapped and unlocked. 2375 * We return with mmap_sem still held, but pte unmapped and unlocked.
2358 */ 2376 */
2359static int do_wp_page(struct fault_env *fe, pte_t orig_pte) 2377static int do_wp_page(struct vm_fault *vmf)
2360 __releases(fe->ptl) 2378 __releases(vmf->ptl)
2361{ 2379{
2362 struct vm_area_struct *vma = fe->vma; 2380 struct vm_area_struct *vma = vmf->vma;
2363 struct page *old_page;
2364 2381
2365 old_page = vm_normal_page(vma, fe->address, orig_pte); 2382 vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
2366 if (!old_page) { 2383 if (!vmf->page) {
2367 /* 2384 /*
2368 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a 2385 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2369 * VM_PFNMAP VMA. 2386 * VM_PFNMAP VMA.
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2373 */ 2390 */
2374 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2391 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2375 (VM_WRITE|VM_SHARED)) 2392 (VM_WRITE|VM_SHARED))
2376 return wp_pfn_shared(fe, orig_pte); 2393 return wp_pfn_shared(vmf);
2377 2394
2378 pte_unmap_unlock(fe->pte, fe->ptl); 2395 pte_unmap_unlock(vmf->pte, vmf->ptl);
2379 return wp_page_copy(fe, orig_pte, old_page); 2396 return wp_page_copy(vmf);
2380 } 2397 }
2381 2398
2382 /* 2399 /*
2383 * Take out anonymous pages first, anonymous shared vmas are 2400 * Take out anonymous pages first, anonymous shared vmas are
2384 * not dirty accountable. 2401 * not dirty accountable.
2385 */ 2402 */
2386 if (PageAnon(old_page) && !PageKsm(old_page)) { 2403 if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
2387 int total_mapcount; 2404 int total_mapcount;
2388 if (!trylock_page(old_page)) { 2405 if (!trylock_page(vmf->page)) {
2389 get_page(old_page); 2406 get_page(vmf->page);
2390 pte_unmap_unlock(fe->pte, fe->ptl); 2407 pte_unmap_unlock(vmf->pte, vmf->ptl);
2391 lock_page(old_page); 2408 lock_page(vmf->page);
2392 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, 2409 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2393 fe->address, &fe->ptl); 2410 vmf->address, &vmf->ptl);
2394 if (!pte_same(*fe->pte, orig_pte)) { 2411 if (!pte_same(*vmf->pte, vmf->orig_pte)) {
2395 unlock_page(old_page); 2412 unlock_page(vmf->page);
2396 pte_unmap_unlock(fe->pte, fe->ptl); 2413 pte_unmap_unlock(vmf->pte, vmf->ptl);
2397 put_page(old_page); 2414 put_page(vmf->page);
2398 return 0; 2415 return 0;
2399 } 2416 }
2400 put_page(old_page); 2417 put_page(vmf->page);
2401 } 2418 }
2402 if (reuse_swap_page(old_page, &total_mapcount)) { 2419 if (reuse_swap_page(vmf->page, &total_mapcount)) {
2403 if (total_mapcount == 1) { 2420 if (total_mapcount == 1) {
2404 /* 2421 /*
2405 * The page is all ours. Move it to 2422 * The page is all ours. Move it to
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2408 * Protected against the rmap code by 2425 * Protected against the rmap code by
2409 * the page lock. 2426 * the page lock.
2410 */ 2427 */
2411 page_move_anon_rmap(old_page, vma); 2428 page_move_anon_rmap(vmf->page, vma);
2412 } 2429 }
2413 unlock_page(old_page); 2430 unlock_page(vmf->page);
2414 return wp_page_reuse(fe, orig_pte, old_page, 0, 0); 2431 wp_page_reuse(vmf);
2432 return VM_FAULT_WRITE;
2415 } 2433 }
2416 unlock_page(old_page); 2434 unlock_page(vmf->page);
2417 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2435 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2418 (VM_WRITE|VM_SHARED))) { 2436 (VM_WRITE|VM_SHARED))) {
2419 return wp_page_shared(fe, orig_pte, old_page); 2437 return wp_page_shared(vmf);
2420 } 2438 }
2421 2439
2422 /* 2440 /*
2423 * Ok, we need to copy. Oh, well.. 2441 * Ok, we need to copy. Oh, well..
2424 */ 2442 */
2425 get_page(old_page); 2443 get_page(vmf->page);
2426 2444
2427 pte_unmap_unlock(fe->pte, fe->ptl); 2445 pte_unmap_unlock(vmf->pte, vmf->ptl);
2428 return wp_page_copy(fe, orig_pte, old_page); 2446 return wp_page_copy(vmf);
2429} 2447}
2430 2448
2431static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2449static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
2513 * We return with the mmap_sem locked or unlocked in the same cases 2531 * We return with the mmap_sem locked or unlocked in the same cases
2514 * as does filemap_fault(). 2532 * as does filemap_fault().
2515 */ 2533 */
2516int do_swap_page(struct fault_env *fe, pte_t orig_pte) 2534int do_swap_page(struct vm_fault *vmf)
2517{ 2535{
2518 struct vm_area_struct *vma = fe->vma; 2536 struct vm_area_struct *vma = vmf->vma;
2519 struct page *page, *swapcache; 2537 struct page *page, *swapcache;
2520 struct mem_cgroup *memcg; 2538 struct mem_cgroup *memcg;
2521 swp_entry_t entry; 2539 swp_entry_t entry;
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2524 int exclusive = 0; 2542 int exclusive = 0;
2525 int ret = 0; 2543 int ret = 0;
2526 2544
2527 if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) 2545 if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
2528 goto out; 2546 goto out;
2529 2547
2530 entry = pte_to_swp_entry(orig_pte); 2548 entry = pte_to_swp_entry(vmf->orig_pte);
2531 if (unlikely(non_swap_entry(entry))) { 2549 if (unlikely(non_swap_entry(entry))) {
2532 if (is_migration_entry(entry)) { 2550 if (is_migration_entry(entry)) {
2533 migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); 2551 migration_entry_wait(vma->vm_mm, vmf->pmd,
2552 vmf->address);
2534 } else if (is_hwpoison_entry(entry)) { 2553 } else if (is_hwpoison_entry(entry)) {
2535 ret = VM_FAULT_HWPOISON; 2554 ret = VM_FAULT_HWPOISON;
2536 } else { 2555 } else {
2537 print_bad_pte(vma, fe->address, orig_pte, NULL); 2556 print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
2538 ret = VM_FAULT_SIGBUS; 2557 ret = VM_FAULT_SIGBUS;
2539 } 2558 }
2540 goto out; 2559 goto out;
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2542 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2561 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2543 page = lookup_swap_cache(entry); 2562 page = lookup_swap_cache(entry);
2544 if (!page) { 2563 if (!page) {
2545 page = swapin_readahead(entry, 2564 page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma,
2546 GFP_HIGHUSER_MOVABLE, vma, fe->address); 2565 vmf->address);
2547 if (!page) { 2566 if (!page) {
2548 /* 2567 /*
2549 * Back out if somebody else faulted in this pte 2568 * Back out if somebody else faulted in this pte
2550 * while we released the pte lock. 2569 * while we released the pte lock.
2551 */ 2570 */
2552 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, 2571 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2553 fe->address, &fe->ptl); 2572 vmf->address, &vmf->ptl);
2554 if (likely(pte_same(*fe->pte, orig_pte))) 2573 if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
2555 ret = VM_FAULT_OOM; 2574 ret = VM_FAULT_OOM;
2556 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2575 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2557 goto unlock; 2576 goto unlock;
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2573 } 2592 }
2574 2593
2575 swapcache = page; 2594 swapcache = page;
2576 locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); 2595 locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
2577 2596
2578 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2597 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2579 if (!locked) { 2598 if (!locked) {
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2590 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2609 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2591 goto out_page; 2610 goto out_page;
2592 2611
2593 page = ksm_might_need_to_copy(page, vma, fe->address); 2612 page = ksm_might_need_to_copy(page, vma, vmf->address);
2594 if (unlikely(!page)) { 2613 if (unlikely(!page)) {
2595 ret = VM_FAULT_OOM; 2614 ret = VM_FAULT_OOM;
2596 page = swapcache; 2615 page = swapcache;
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2606 /* 2625 /*
2607 * Back out if somebody else already faulted in this pte. 2626 * Back out if somebody else already faulted in this pte.
2608 */ 2627 */
2609 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2628 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2610 &fe->ptl); 2629 &vmf->ptl);
2611 if (unlikely(!pte_same(*fe->pte, orig_pte))) 2630 if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
2612 goto out_nomap; 2631 goto out_nomap;
2613 2632
2614 if (unlikely(!PageUptodate(page))) { 2633 if (unlikely(!PageUptodate(page))) {
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2629 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2648 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2630 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); 2649 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2631 pte = mk_pte(page, vma->vm_page_prot); 2650 pte = mk_pte(page, vma->vm_page_prot);
2632 if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { 2651 if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2633 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2652 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2634 fe->flags &= ~FAULT_FLAG_WRITE; 2653 vmf->flags &= ~FAULT_FLAG_WRITE;
2635 ret |= VM_FAULT_WRITE; 2654 ret |= VM_FAULT_WRITE;
2636 exclusive = RMAP_EXCLUSIVE; 2655 exclusive = RMAP_EXCLUSIVE;
2637 } 2656 }
2638 flush_icache_page(vma, page); 2657 flush_icache_page(vma, page);
2639 if (pte_swp_soft_dirty(orig_pte)) 2658 if (pte_swp_soft_dirty(vmf->orig_pte))
2640 pte = pte_mksoft_dirty(pte); 2659 pte = pte_mksoft_dirty(pte);
2641 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); 2660 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
2661 vmf->orig_pte = pte;
2642 if (page == swapcache) { 2662 if (page == swapcache) {
2643 do_page_add_anon_rmap(page, vma, fe->address, exclusive); 2663 do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
2644 mem_cgroup_commit_charge(page, memcg, true, false); 2664 mem_cgroup_commit_charge(page, memcg, true, false);
2645 activate_page(page); 2665 activate_page(page);
2646 } else { /* ksm created a completely new copy */ 2666 } else { /* ksm created a completely new copy */
2647 page_add_new_anon_rmap(page, vma, fe->address, false); 2667 page_add_new_anon_rmap(page, vma, vmf->address, false);
2648 mem_cgroup_commit_charge(page, memcg, false, false); 2668 mem_cgroup_commit_charge(page, memcg, false, false);
2649 lru_cache_add_active_or_unevictable(page, vma); 2669 lru_cache_add_active_or_unevictable(page, vma);
2650 } 2670 }
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2667 put_page(swapcache); 2687 put_page(swapcache);
2668 } 2688 }
2669 2689
2670 if (fe->flags & FAULT_FLAG_WRITE) { 2690 if (vmf->flags & FAULT_FLAG_WRITE) {
2671 ret |= do_wp_page(fe, pte); 2691 ret |= do_wp_page(vmf);
2672 if (ret & VM_FAULT_ERROR) 2692 if (ret & VM_FAULT_ERROR)
2673 ret &= VM_FAULT_ERROR; 2693 ret &= VM_FAULT_ERROR;
2674 goto out; 2694 goto out;
2675 } 2695 }
2676 2696
2677 /* No need to invalidate - it was non-present before */ 2697 /* No need to invalidate - it was non-present before */
2678 update_mmu_cache(vma, fe->address, fe->pte); 2698 update_mmu_cache(vma, vmf->address, vmf->pte);
2679unlock: 2699unlock:
2680 pte_unmap_unlock(fe->pte, fe->ptl); 2700 pte_unmap_unlock(vmf->pte, vmf->ptl);
2681out: 2701out:
2682 return ret; 2702 return ret;
2683out_nomap: 2703out_nomap:
2684 mem_cgroup_cancel_charge(page, memcg, false); 2704 mem_cgroup_cancel_charge(page, memcg, false);
2685 pte_unmap_unlock(fe->pte, fe->ptl); 2705 pte_unmap_unlock(vmf->pte, vmf->ptl);
2686out_page: 2706out_page:
2687 unlock_page(page); 2707 unlock_page(page);
2688out_release: 2708out_release:
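
A subtle point in the hunk above: do_wp_page() now takes only the vm_fault, so once the swapped-in page's PTE has been written, the snapshot in vmf->orig_pte must be refreshed as well; otherwise the write-protect path's pte_same() check would still compare against the stale swap entry. Condensed sketch of the relevant lines:

	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
	vmf->orig_pte = pte;		/* keep the snapshot in sync */

	if (vmf->flags & FAULT_FLAG_WRITE)
		ret |= do_wp_page(vmf);	/* revalidates *vmf->pte against vmf->orig_pte */
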
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2733 * but allow concurrent faults), and pte mapped but not yet locked. 2753 * but allow concurrent faults), and pte mapped but not yet locked.
2734 * We return with mmap_sem still held, but pte unmapped and unlocked. 2754 * We return with mmap_sem still held, but pte unmapped and unlocked.
2735 */ 2755 */
2736static int do_anonymous_page(struct fault_env *fe) 2756static int do_anonymous_page(struct vm_fault *vmf)
2737{ 2757{
2738 struct vm_area_struct *vma = fe->vma; 2758 struct vm_area_struct *vma = vmf->vma;
2739 struct mem_cgroup *memcg; 2759 struct mem_cgroup *memcg;
2740 struct page *page; 2760 struct page *page;
2741 pte_t entry; 2761 pte_t entry;
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe)
2745 return VM_FAULT_SIGBUS; 2765 return VM_FAULT_SIGBUS;
2746 2766
2747 /* Check if we need to add a guard page to the stack */ 2767 /* Check if we need to add a guard page to the stack */
2748 if (check_stack_guard_page(vma, fe->address) < 0) 2768 if (check_stack_guard_page(vma, vmf->address) < 0)
2749 return VM_FAULT_SIGSEGV; 2769 return VM_FAULT_SIGSEGV;
2750 2770
2751 /* 2771 /*
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe)
2758 * 2778 *
2759 * Here we only have down_read(mmap_sem). 2779 * Here we only have down_read(mmap_sem).
2760 */ 2780 */
2761 if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) 2781 if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
2762 return VM_FAULT_OOM; 2782 return VM_FAULT_OOM;
2763 2783
2764 /* See the comment in pte_alloc_one_map() */ 2784 /* See the comment in pte_alloc_one_map() */
2765 if (unlikely(pmd_trans_unstable(fe->pmd))) 2785 if (unlikely(pmd_trans_unstable(vmf->pmd)))
2766 return 0; 2786 return 0;
2767 2787
2768 /* Use the zero-page for reads */ 2788 /* Use the zero-page for reads */
2769 if (!(fe->flags & FAULT_FLAG_WRITE) && 2789 if (!(vmf->flags & FAULT_FLAG_WRITE) &&
2770 !mm_forbids_zeropage(vma->vm_mm)) { 2790 !mm_forbids_zeropage(vma->vm_mm)) {
2771 entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), 2791 entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
2772 vma->vm_page_prot)); 2792 vma->vm_page_prot));
2773 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2793 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
2774 &fe->ptl); 2794 vmf->address, &vmf->ptl);
2775 if (!pte_none(*fe->pte)) 2795 if (!pte_none(*vmf->pte))
2776 goto unlock; 2796 goto unlock;
2777 /* Deliver the page fault to userland, check inside PT lock */ 2797 /* Deliver the page fault to userland, check inside PT lock */
2778 if (userfaultfd_missing(vma)) { 2798 if (userfaultfd_missing(vma)) {
2779 pte_unmap_unlock(fe->pte, fe->ptl); 2799 pte_unmap_unlock(vmf->pte, vmf->ptl);
2780 return handle_userfault(fe, VM_UFFD_MISSING); 2800 return handle_userfault(vmf, VM_UFFD_MISSING);
2781 } 2801 }
2782 goto setpte; 2802 goto setpte;
2783 } 2803 }
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe)
2785 /* Allocate our own private page. */ 2805 /* Allocate our own private page. */
2786 if (unlikely(anon_vma_prepare(vma))) 2806 if (unlikely(anon_vma_prepare(vma)))
2787 goto oom; 2807 goto oom;
2788 page = alloc_zeroed_user_highpage_movable(vma, fe->address); 2808 page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
2789 if (!page) 2809 if (!page)
2790 goto oom; 2810 goto oom;
2791 2811
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe)
2803 if (vma->vm_flags & VM_WRITE) 2823 if (vma->vm_flags & VM_WRITE)
2804 entry = pte_mkwrite(pte_mkdirty(entry)); 2824 entry = pte_mkwrite(pte_mkdirty(entry));
2805 2825
2806 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2826 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2807 &fe->ptl); 2827 &vmf->ptl);
2808 if (!pte_none(*fe->pte)) 2828 if (!pte_none(*vmf->pte))
2809 goto release; 2829 goto release;
2810 2830
2811 /* Deliver the page fault to userland, check inside PT lock */ 2831 /* Deliver the page fault to userland, check inside PT lock */
2812 if (userfaultfd_missing(vma)) { 2832 if (userfaultfd_missing(vma)) {
2813 pte_unmap_unlock(fe->pte, fe->ptl); 2833 pte_unmap_unlock(vmf->pte, vmf->ptl);
2814 mem_cgroup_cancel_charge(page, memcg, false); 2834 mem_cgroup_cancel_charge(page, memcg, false);
2815 put_page(page); 2835 put_page(page);
2816 return handle_userfault(fe, VM_UFFD_MISSING); 2836 return handle_userfault(vmf, VM_UFFD_MISSING);
2817 } 2837 }
2818 2838
2819 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2839 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2820 page_add_new_anon_rmap(page, vma, fe->address, false); 2840 page_add_new_anon_rmap(page, vma, vmf->address, false);
2821 mem_cgroup_commit_charge(page, memcg, false, false); 2841 mem_cgroup_commit_charge(page, memcg, false, false);
2822 lru_cache_add_active_or_unevictable(page, vma); 2842 lru_cache_add_active_or_unevictable(page, vma);
2823setpte: 2843setpte:
2824 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); 2844 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
2825 2845
2826 /* No need to invalidate - it was non-present before */ 2846 /* No need to invalidate - it was non-present before */
2827 update_mmu_cache(vma, fe->address, fe->pte); 2847 update_mmu_cache(vma, vmf->address, vmf->pte);
2828unlock: 2848unlock:
2829 pte_unmap_unlock(fe->pte, fe->ptl); 2849 pte_unmap_unlock(vmf->pte, vmf->ptl);
2830 return 0; 2850 return 0;
2831release: 2851release:
2832 mem_cgroup_cancel_charge(page, memcg, false); 2852 mem_cgroup_cancel_charge(page, memcg, false);
@@ -2843,62 +2863,50 @@ oom:
2843 * released depending on flags and vma->vm_ops->fault() return value. 2863 * released depending on flags and vma->vm_ops->fault() return value.
2844 * See filemap_fault() and __lock_page_retry(). 2864 * See filemap_fault() and __lock_page_retry().
2845 */ 2865 */
2846static int __do_fault(struct fault_env *fe, pgoff_t pgoff, 2866static int __do_fault(struct vm_fault *vmf)
2847 struct page *cow_page, struct page **page, void **entry)
2848{ 2867{
2849 struct vm_area_struct *vma = fe->vma; 2868 struct vm_area_struct *vma = vmf->vma;
2850 struct vm_fault vmf;
2851 int ret; 2869 int ret;
2852 2870
2853 vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); 2871 ret = vma->vm_ops->fault(vma, vmf);
2854 vmf.pgoff = pgoff; 2872 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
2855 vmf.flags = fe->flags; 2873 VM_FAULT_DONE_COW)))
2856 vmf.page = NULL;
2857 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2858 vmf.cow_page = cow_page;
2859
2860 ret = vma->vm_ops->fault(vma, &vmf);
2861 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
2862 return ret;
2863 if (ret & VM_FAULT_DAX_LOCKED) {
2864 *entry = vmf.entry;
2865 return ret; 2874 return ret;
2866 }
2867 2875
2868 if (unlikely(PageHWPoison(vmf.page))) { 2876 if (unlikely(PageHWPoison(vmf->page))) {
2869 if (ret & VM_FAULT_LOCKED) 2877 if (ret & VM_FAULT_LOCKED)
2870 unlock_page(vmf.page); 2878 unlock_page(vmf->page);
2871 put_page(vmf.page); 2879 put_page(vmf->page);
2880 vmf->page = NULL;
2872 return VM_FAULT_HWPOISON; 2881 return VM_FAULT_HWPOISON;
2873 } 2882 }
2874 2883
2875 if (unlikely(!(ret & VM_FAULT_LOCKED))) 2884 if (unlikely(!(ret & VM_FAULT_LOCKED)))
2876 lock_page(vmf.page); 2885 lock_page(vmf->page);
2877 else 2886 else
2878 VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); 2887 VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page);
2879 2888
2880 *page = vmf.page;
2881 return ret; 2889 return ret;
2882} 2890}
2883 2891
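
With __do_fault() now passing the caller's struct vm_fault straight to ->fault(), a minimal handler can be sketched as follows (illustrative only, not part of this patch; error handling and size checks omitted):

	static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct address_space *mapping = vma->vm_file->f_mapping;
		struct page *page;

		/* vmf->pgoff and vmf->gfp_mask are pre-filled by the fault core */
		page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask);
		if (!page)
			return VM_FAULT_OOM;

		/* hand back a locked, referenced page; __do_fault() checks VM_FAULT_LOCKED */
		vmf->page = page;
		return VM_FAULT_LOCKED;
	}
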
2884static int pte_alloc_one_map(struct fault_env *fe) 2892static int pte_alloc_one_map(struct vm_fault *vmf)
2885{ 2893{
2886 struct vm_area_struct *vma = fe->vma; 2894 struct vm_area_struct *vma = vmf->vma;
2887 2895
2888 if (!pmd_none(*fe->pmd)) 2896 if (!pmd_none(*vmf->pmd))
2889 goto map_pte; 2897 goto map_pte;
2890 if (fe->prealloc_pte) { 2898 if (vmf->prealloc_pte) {
2891 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 2899 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2892 if (unlikely(!pmd_none(*fe->pmd))) { 2900 if (unlikely(!pmd_none(*vmf->pmd))) {
2893 spin_unlock(fe->ptl); 2901 spin_unlock(vmf->ptl);
2894 goto map_pte; 2902 goto map_pte;
2895 } 2903 }
2896 2904
2897 atomic_long_inc(&vma->vm_mm->nr_ptes); 2905 atomic_long_inc(&vma->vm_mm->nr_ptes);
2898 pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); 2906 pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
2899 spin_unlock(fe->ptl); 2907 spin_unlock(vmf->ptl);
2900 fe->prealloc_pte = 0; 2908 vmf->prealloc_pte = 0;
2901 } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { 2909 } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
2902 return VM_FAULT_OOM; 2910 return VM_FAULT_OOM;
2903 } 2911 }
2904map_pte: 2912map_pte:
@@ -2913,11 +2921,11 @@ map_pte:
2913 * through an atomic read in C, which is what pmd_trans_unstable() 2921 * through an atomic read in C, which is what pmd_trans_unstable()
2914 * provides. 2922 * provides.
2915 */ 2923 */
2916 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) 2924 if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
2917 return VM_FAULT_NOPAGE; 2925 return VM_FAULT_NOPAGE;
2918 2926
2919 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, 2927 vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
2920 &fe->ptl); 2928 &vmf->ptl);
2921 return 0; 2929 return 0;
2922} 2930}
2923 2931
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
2935 return true; 2943 return true;
2936} 2944}
2937 2945
2938static void deposit_prealloc_pte(struct fault_env *fe) 2946static void deposit_prealloc_pte(struct vm_fault *vmf)
2939{ 2947{
2940 struct vm_area_struct *vma = fe->vma; 2948 struct vm_area_struct *vma = vmf->vma;
2941 2949
2942 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); 2950 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
2943 /* 2951 /*
2944 * We are going to consume the prealloc table, 2952 * We are going to consume the prealloc table,
2945 * count that as nr_ptes. 2953 * count that as nr_ptes.
2946 */ 2954 */
2947 atomic_long_inc(&vma->vm_mm->nr_ptes); 2955 atomic_long_inc(&vma->vm_mm->nr_ptes);
2948 fe->prealloc_pte = 0; 2956 vmf->prealloc_pte = 0;
2949} 2957}
2950 2958
2951static int do_set_pmd(struct fault_env *fe, struct page *page) 2959static int do_set_pmd(struct vm_fault *vmf, struct page *page)
2952{ 2960{
2953 struct vm_area_struct *vma = fe->vma; 2961 struct vm_area_struct *vma = vmf->vma;
2954 bool write = fe->flags & FAULT_FLAG_WRITE; 2962 bool write = vmf->flags & FAULT_FLAG_WRITE;
2955 unsigned long haddr = fe->address & HPAGE_PMD_MASK; 2963 unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
2956 pmd_t entry; 2964 pmd_t entry;
2957 int i, ret; 2965 int i, ret;
2958 2966
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2966 * Archs like ppc64 need additional space to store information 2974
2967 * related to pte entry. Use the preallocated table for that. 2975 * related to pte entry. Use the preallocated table for that.
2968 */ 2976 */
2969 if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { 2977 if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
2970 fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); 2978 vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
2971 if (!fe->prealloc_pte) 2979 if (!vmf->prealloc_pte)
2972 return VM_FAULT_OOM; 2980 return VM_FAULT_OOM;
2973 smp_wmb(); /* See comment in __pte_alloc() */ 2981 smp_wmb(); /* See comment in __pte_alloc() */
2974 } 2982 }
2975 2983
2976 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); 2984 vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
2977 if (unlikely(!pmd_none(*fe->pmd))) 2985 if (unlikely(!pmd_none(*vmf->pmd)))
2978 goto out; 2986 goto out;
2979 2987
2980 for (i = 0; i < HPAGE_PMD_NR; i++) 2988 for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
2990 * deposit and withdraw with pmd lock held 2998 * deposit and withdraw with pmd lock held
2991 */ 2999 */
2992 if (arch_needs_pgtable_deposit()) 3000 if (arch_needs_pgtable_deposit())
2993 deposit_prealloc_pte(fe); 3001 deposit_prealloc_pte(vmf);
2994 3002
2995 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); 3003 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
2996 3004
2997 update_mmu_cache_pmd(vma, haddr, fe->pmd); 3005 update_mmu_cache_pmd(vma, haddr, vmf->pmd);
2998 3006
2999 /* fault is handled */ 3007 /* fault is handled */
3000 ret = 0; 3008 ret = 0;
@@ -3005,13 +3013,13 @@ out:
3005 * withdraw with pmd lock held. 3013 * withdraw with pmd lock held.
3006 */ 3014 */
3007 if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) 3015 if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
3008 fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, 3016 vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
3009 fe->pmd); 3017 vmf->pmd);
3010 spin_unlock(fe->ptl); 3018 spin_unlock(vmf->ptl);
3011 return ret; 3019 return ret;
3012} 3020}
3013#else 3021#else
3014static int do_set_pmd(struct fault_env *fe, struct page *page) 3022static int do_set_pmd(struct vm_fault *vmf, struct page *page)
3015{ 3023{
3016 BUILD_BUG(); 3024 BUILD_BUG();
3017 return 0; 3025 return 0;
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
3022 * alloc_set_pte - setup new PTE entry for given page and add reverse page 3030 * alloc_set_pte - setup new PTE entry for given page and add reverse page
3023 * mapping. If needed, the function allocates page table or use pre-allocated. 3031
3024 * 3032 *
3025 * @fe: fault environment 3033 * @vmf: fault environment
3026 * @memcg: memcg to charge page (only for private mappings) 3034 * @memcg: memcg to charge page (only for private mappings)
3027 * @page: page to map 3035 * @page: page to map
3028 * 3036 *
3029 * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. 3037 * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
3038 * return.
3030 * 3039 *
3031 * Target users are page handler itself and implementations of 3040 * Target users are page handler itself and implementations of
3032 * vm_ops->map_pages. 3041 * vm_ops->map_pages.
3033 */ 3042 */
3034int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, 3043int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
3035 struct page *page) 3044 struct page *page)
3036{ 3045{
3037 struct vm_area_struct *vma = fe->vma; 3046 struct vm_area_struct *vma = vmf->vma;
3038 bool write = fe->flags & FAULT_FLAG_WRITE; 3047 bool write = vmf->flags & FAULT_FLAG_WRITE;
3039 pte_t entry; 3048 pte_t entry;
3040 int ret; 3049 int ret;
3041 3050
3042 if (pmd_none(*fe->pmd) && PageTransCompound(page) && 3051 if (pmd_none(*vmf->pmd) && PageTransCompound(page) &&
3043 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { 3052 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3044 /* THP on COW? */ 3053 /* THP on COW? */
3045 VM_BUG_ON_PAGE(memcg, page); 3054 VM_BUG_ON_PAGE(memcg, page);
3046 3055
3047 ret = do_set_pmd(fe, page); 3056 ret = do_set_pmd(vmf, page);
3048 if (ret != VM_FAULT_FALLBACK) 3057 if (ret != VM_FAULT_FALLBACK)
3049 goto fault_handled; 3058 goto fault_handled;
3050 } 3059 }
3051 3060
3052 if (!fe->pte) { 3061 if (!vmf->pte) {
3053 ret = pte_alloc_one_map(fe); 3062 ret = pte_alloc_one_map(vmf);
3054 if (ret) 3063 if (ret)
3055 goto fault_handled; 3064 goto fault_handled;
3056 } 3065 }
3057 3066
3058 /* Re-check under ptl */ 3067 /* Re-check under ptl */
3059 if (unlikely(!pte_none(*fe->pte))) { 3068 if (unlikely(!pte_none(*vmf->pte))) {
3060 ret = VM_FAULT_NOPAGE; 3069 ret = VM_FAULT_NOPAGE;
3061 goto fault_handled; 3070 goto fault_handled;
3062 } 3071 }
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
3068 /* copy-on-write page */ 3077 /* copy-on-write page */
3069 if (write && !(vma->vm_flags & VM_SHARED)) { 3078 if (write && !(vma->vm_flags & VM_SHARED)) {
3070 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 3079 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
3071 page_add_new_anon_rmap(page, vma, fe->address, false); 3080 page_add_new_anon_rmap(page, vma, vmf->address, false);
3072 mem_cgroup_commit_charge(page, memcg, false, false); 3081 mem_cgroup_commit_charge(page, memcg, false, false);
3073 lru_cache_add_active_or_unevictable(page, vma); 3082 lru_cache_add_active_or_unevictable(page, vma);
3074 } else { 3083 } else {
3075 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 3084 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
3076 page_add_file_rmap(page, false); 3085 page_add_file_rmap(page, false);
3077 } 3086 }
3078 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); 3087 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
3079 3088
3080 /* no need to invalidate: a not-present page won't be cached */ 3089 /* no need to invalidate: a not-present page won't be cached */
3081 update_mmu_cache(vma, fe->address, fe->pte); 3090 update_mmu_cache(vma, vmf->address, vmf->pte);
3082 ret = 0; 3091 ret = 0;
3083 3092
3084fault_handled: 3093fault_handled:
3085 /* preallocated pagetable is unused: free it */ 3094 /* preallocated pagetable is unused: free it */
3086 if (fe->prealloc_pte) { 3095 if (vmf->prealloc_pte) {
3087 pte_free(fe->vma->vm_mm, fe->prealloc_pte); 3096 pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
3088 fe->prealloc_pte = 0; 3097 vmf->prealloc_pte = 0;
3089 } 3098 }
3090 return ret; 3099 return ret;
3091} 3100}
3092 3101
3102
3103/**
3104 * finish_fault - finish page fault once we have prepared the page to fault
3105 *
3106 * @vmf: structure describing the fault
3107 *
3108 * This function handles all that is needed to finish a page fault once the
3109 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
3110 * given page, adds reverse page mapping, handles memcg charges and LRU
3111 * addition. The function returns 0 on success, VM_FAULT_ code in case of
3112 * error.
3113 *
3114 * The function expects the page to be locked and on success it consumes a
3115 * reference of a page being mapped (for the PTE which maps it).
3116 */
3117int finish_fault(struct vm_fault *vmf)
3118{
3119 struct page *page;
3120 int ret;
3121
3122 /* Did we COW the page? */
3123 if ((vmf->flags & FAULT_FLAG_WRITE) &&
3124 !(vmf->vma->vm_flags & VM_SHARED))
3125 page = vmf->cow_page;
3126 else
3127 page = vmf->page;
3128 ret = alloc_set_pte(vmf, vmf->memcg, page);
3129 if (vmf->pte)
3130 pte_unmap_unlock(vmf->pte, vmf->ptl);
3131 return ret;
3132}
3133
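
The kernel-doc above describes the contract; in practice the converted handlers below all follow the same shape. Condensed from do_read_fault() (declarations and the fault-around fast path trimmed):

	ret = __do_fault(vmf);			/* ->fault() fills vmf->page */
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);		/* installs the PTE, drops the PTE lock */
	unlock_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);		/* PTE not installed: drop the reference */
	return ret;
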
3093static unsigned long fault_around_bytes __read_mostly = 3134static unsigned long fault_around_bytes __read_mostly =
3094 rounddown_pow_of_two(65536); 3135 rounddown_pow_of_two(65536);
3095 3136
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs);
3154 * fault_around_pages() value (and therefore to page order). This way it's 3195 * fault_around_pages() value (and therefore to page order). This way it's
3155 * easier to guarantee that we don't cross page table boundaries. 3196 * easier to guarantee that we don't cross page table boundaries.
3156 */ 3197 */
3157static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) 3198static int do_fault_around(struct vm_fault *vmf)
3158{ 3199{
3159 unsigned long address = fe->address, nr_pages, mask; 3200 unsigned long address = vmf->address, nr_pages, mask;
3201 pgoff_t start_pgoff = vmf->pgoff;
3160 pgoff_t end_pgoff; 3202 pgoff_t end_pgoff;
3161 int off, ret = 0; 3203 int off, ret = 0;
3162 3204
3163 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; 3205 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
3164 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 3206 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
3165 3207
3166 fe->address = max(address & mask, fe->vma->vm_start); 3208 vmf->address = max(address & mask, vmf->vma->vm_start);
3167 off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 3209 off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
3168 start_pgoff -= off; 3210 start_pgoff -= off;
3169 3211
3170 /* 3212 /*
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
3172 * or fault_around_pages() from start_pgoff, depending what is nearest. 3214 * or fault_around_pages() from start_pgoff, depending what is nearest.
3173 */ 3215 */
3174 end_pgoff = start_pgoff - 3216 end_pgoff = start_pgoff -
3175 ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 3217 ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
3176 PTRS_PER_PTE - 1; 3218 PTRS_PER_PTE - 1;
3177 end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, 3219 end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1,
3178 start_pgoff + nr_pages - 1); 3220 start_pgoff + nr_pages - 1);
3179 3221
3180 if (pmd_none(*fe->pmd)) { 3222 if (pmd_none(*vmf->pmd)) {
3181 fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); 3223 vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
3182 if (!fe->prealloc_pte) 3224 vmf->address);
3225 if (!vmf->prealloc_pte)
3183 goto out; 3226 goto out;
3184 smp_wmb(); /* See comment in __pte_alloc() */ 3227 smp_wmb(); /* See comment in __pte_alloc() */
3185 } 3228 }
3186 3229
3187 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); 3230 vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
3188 3231
3189 /* Huge page is mapped? Page fault is solved */ 3232 /* Huge page is mapped? Page fault is solved */
3190 if (pmd_trans_huge(*fe->pmd)) { 3233 if (pmd_trans_huge(*vmf->pmd)) {
3191 ret = VM_FAULT_NOPAGE; 3234 ret = VM_FAULT_NOPAGE;
3192 goto out; 3235 goto out;
3193 } 3236 }
3194 3237
3195 /* ->map_pages() hasn't done anything useful. Cold page cache? */ 3238
3196 if (!fe->pte) 3239 if (!vmf->pte)
3197 goto out; 3240 goto out;
3198 3241
3199 /* check if the page fault is solved */ 3242 /* check if the page fault is solved */
3200 fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); 3243 vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3201 if (!pte_none(*fe->pte)) 3244 if (!pte_none(*vmf->pte))
3202 ret = VM_FAULT_NOPAGE; 3245 ret = VM_FAULT_NOPAGE;
3203 pte_unmap_unlock(fe->pte, fe->ptl); 3246 pte_unmap_unlock(vmf->pte, vmf->ptl);
3204out: 3247out:
3205 fe->address = address; 3248 vmf->address = address;
3206 fe->pte = NULL; 3249 vmf->pte = NULL;
3207 return ret; 3250 return ret;
3208} 3251}
3209 3252
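
A worked example of the windowing arithmetic above, assuming 4 KiB pages and the default fault_around_bytes of 65536:

	/*
	 * nr_pages = 65536 >> 12        = 16
	 * mask     = ~(16 * 4096 - 1)   = ~0xffff
	 * For a fault at 0x7f1234567890 inside a large VMA:
	 *   vmf->address = 0x7f1234567890 & ~0xffff = 0x7f1234560000
	 *   off          = (0x7890 >> 12) & 511     = 7
	 * so start_pgoff is pulled back by 7 pages and at most 16 PTEs are
	 * populated, never crossing a page-table (PTRS_PER_PTE) boundary.
	 */
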
3210static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) 3253static int do_read_fault(struct vm_fault *vmf)
3211{ 3254{
3212 struct vm_area_struct *vma = fe->vma; 3255 struct vm_area_struct *vma = vmf->vma;
3213 struct page *fault_page;
3214 int ret = 0; 3256 int ret = 0;
3215 3257
3216 /* 3258 /*
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
3219 * something). 3261 * something).
3220 */ 3262 */
3221 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { 3263 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3222 ret = do_fault_around(fe, pgoff); 3264 ret = do_fault_around(vmf);
3223 if (ret) 3265 if (ret)
3224 return ret; 3266 return ret;
3225 } 3267 }
3226 3268
3227 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); 3269 ret = __do_fault(vmf);
3228 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3270 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3229 return ret; 3271 return ret;
3230 3272
3231 ret |= alloc_set_pte(fe, NULL, fault_page); 3273 ret |= finish_fault(vmf);
3232 if (fe->pte) 3274 unlock_page(vmf->page);
3233 pte_unmap_unlock(fe->pte, fe->ptl);
3234 unlock_page(fault_page);
3235 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3275 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3236 put_page(fault_page); 3276 put_page(vmf->page);
3237 return ret; 3277 return ret;
3238} 3278}
3239 3279
3240static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) 3280static int do_cow_fault(struct vm_fault *vmf)
3241{ 3281{
3242 struct vm_area_struct *vma = fe->vma; 3282 struct vm_area_struct *vma = vmf->vma;
3243 struct page *fault_page, *new_page;
3244 void *fault_entry;
3245 struct mem_cgroup *memcg;
3246 int ret; 3283 int ret;
3247 3284
3248 if (unlikely(anon_vma_prepare(vma))) 3285 if (unlikely(anon_vma_prepare(vma)))
3249 return VM_FAULT_OOM; 3286 return VM_FAULT_OOM;
3250 3287
3251 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); 3288 vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
3252 if (!new_page) 3289 if (!vmf->cow_page)
3253 return VM_FAULT_OOM; 3290 return VM_FAULT_OOM;
3254 3291
3255 if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, 3292 if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
3256 &memcg, false)) { 3293 &vmf->memcg, false)) {
3257 put_page(new_page); 3294 put_page(vmf->cow_page);
3258 return VM_FAULT_OOM; 3295 return VM_FAULT_OOM;
3259 } 3296 }
3260 3297
3261 ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); 3298 ret = __do_fault(vmf);
3262 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3299 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3263 goto uncharge_out; 3300 goto uncharge_out;
3301 if (ret & VM_FAULT_DONE_COW)
3302 return ret;
3264 3303
3265 if (!(ret & VM_FAULT_DAX_LOCKED)) 3304 copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
3266 copy_user_highpage(new_page, fault_page, fe->address, vma); 3305 __SetPageUptodate(vmf->cow_page);
3267 __SetPageUptodate(new_page);
3268 3306
3269 ret |= alloc_set_pte(fe, memcg, new_page); 3307 ret |= finish_fault(vmf);
3270 if (fe->pte) 3308 unlock_page(vmf->page);
3271 pte_unmap_unlock(fe->pte, fe->ptl); 3309 put_page(vmf->page);
3272 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3273 unlock_page(fault_page);
3274 put_page(fault_page);
3275 } else {
3276 dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
3277 }
3278 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3310 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3279 goto uncharge_out; 3311 goto uncharge_out;
3280 return ret; 3312 return ret;
3281uncharge_out: 3313uncharge_out:
3282 mem_cgroup_cancel_charge(new_page, memcg, false); 3314 mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
3283 put_page(new_page); 3315 put_page(vmf->cow_page);
3284 return ret; 3316 return ret;
3285} 3317}
3286 3318
3287static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) 3319static int do_shared_fault(struct vm_fault *vmf)
3288{ 3320{
3289 struct vm_area_struct *vma = fe->vma; 3321 struct vm_area_struct *vma = vmf->vma;
3290 struct page *fault_page;
3291 struct address_space *mapping;
3292 int dirtied = 0;
3293 int ret, tmp; 3322 int ret, tmp;
3294 3323
3295 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); 3324 ret = __do_fault(vmf);
3296 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3325 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3297 return ret; 3326 return ret;
3298 3327
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3301 * about to become writable 3330 * about to become writable
3302 */ 3331 */
3303 if (vma->vm_ops->page_mkwrite) { 3332 if (vma->vm_ops->page_mkwrite) {
3304 unlock_page(fault_page); 3333 unlock_page(vmf->page);
3305 tmp = do_page_mkwrite(vma, fault_page, fe->address); 3334 tmp = do_page_mkwrite(vmf);
3306 if (unlikely(!tmp || 3335 if (unlikely(!tmp ||
3307 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 3336 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3308 put_page(fault_page); 3337 put_page(vmf->page);
3309 return tmp; 3338 return tmp;
3310 } 3339 }
3311 } 3340 }
3312 3341
3313 ret |= alloc_set_pte(fe, NULL, fault_page); 3342 ret |= finish_fault(vmf);
3314 if (fe->pte)
3315 pte_unmap_unlock(fe->pte, fe->ptl);
3316 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | 3343 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3317 VM_FAULT_RETRY))) { 3344 VM_FAULT_RETRY))) {
3318 unlock_page(fault_page); 3345 unlock_page(vmf->page);
3319 put_page(fault_page); 3346 put_page(vmf->page);
3320 return ret; 3347 return ret;
3321 } 3348 }
3322 3349
3323 if (set_page_dirty(fault_page)) 3350 fault_dirty_shared_page(vma, vmf->page);
3324 dirtied = 1;
3325 /*
3326 * Take a local copy of the address_space - page.mapping may be zeroed
3327 * by truncate after unlock_page(). The address_space itself remains
3328 * pinned by vma->vm_file's reference. We rely on unlock_page()'s
3329 * release semantics to prevent the compiler from undoing this copying.
3330 */
3331 mapping = page_rmapping(fault_page);
3332 unlock_page(fault_page);
3333 if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
3334 /*
3335 * Some device drivers do not set page.mapping but still
3336 * dirty their pages
3337 */
3338 balance_dirty_pages_ratelimited(mapping);
3339 }
3340
3341 if (!vma->vm_ops->page_mkwrite)
3342 file_update_time(vma->vm_file);
3343
3344 return ret; 3351 return ret;
3345} 3352}
3346 3353
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3350 * The mmap_sem may have been released depending on flags and our 3357 * The mmap_sem may have been released depending on flags and our
3351 * return value. See filemap_fault() and __lock_page_or_retry(). 3358 * return value. See filemap_fault() and __lock_page_or_retry().
3352 */ 3359 */
3353static int do_fault(struct fault_env *fe) 3360static int do_fault(struct vm_fault *vmf)
3354{ 3361{
3355 struct vm_area_struct *vma = fe->vma; 3362 struct vm_area_struct *vma = vmf->vma;
3356 pgoff_t pgoff = linear_page_index(vma, fe->address);
3357 3363
3358 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3364 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
3359 if (!vma->vm_ops->fault) 3365 if (!vma->vm_ops->fault)
3360 return VM_FAULT_SIGBUS; 3366 return VM_FAULT_SIGBUS;
3361 if (!(fe->flags & FAULT_FLAG_WRITE)) 3367 if (!(vmf->flags & FAULT_FLAG_WRITE))
3362 return do_read_fault(fe, pgoff); 3368 return do_read_fault(vmf);
3363 if (!(vma->vm_flags & VM_SHARED)) 3369 if (!(vma->vm_flags & VM_SHARED))
3364 return do_cow_fault(fe, pgoff); 3370 return do_cow_fault(vmf);
3365 return do_shared_fault(fe, pgoff); 3371 return do_shared_fault(vmf);
3366} 3372}
3367 3373
3368static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3374static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3380 return mpol_misplaced(page, vma, addr); 3386 return mpol_misplaced(page, vma, addr);
3381} 3387}
3382 3388
3383static int do_numa_page(struct fault_env *fe, pte_t pte) 3389static int do_numa_page(struct vm_fault *vmf)
3384{ 3390{
3385 struct vm_area_struct *vma = fe->vma; 3391 struct vm_area_struct *vma = vmf->vma;
3386 struct page *page = NULL; 3392 struct page *page = NULL;
3387 int page_nid = -1; 3393 int page_nid = -1;
3388 int last_cpupid; 3394 int last_cpupid;
3389 int target_nid; 3395 int target_nid;
3390 bool migrated = false; 3396 bool migrated = false;
3397 pte_t pte = vmf->orig_pte;
3391 bool was_writable = pte_write(pte); 3398 bool was_writable = pte_write(pte);
3392 int flags = 0; 3399 int flags = 0;
3393 3400
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3400 * page table entry is not accessible, so there would be no 3407 * page table entry is not accessible, so there would be no
3401 * concurrent hardware modifications to the PTE. 3408 * concurrent hardware modifications to the PTE.
3402 */ 3409 */
3403 fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); 3410 vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
3404 spin_lock(fe->ptl); 3411 spin_lock(vmf->ptl);
3405 if (unlikely(!pte_same(*fe->pte, pte))) { 3412 if (unlikely(!pte_same(*vmf->pte, pte))) {
3406 pte_unmap_unlock(fe->pte, fe->ptl); 3413 pte_unmap_unlock(vmf->pte, vmf->ptl);
3407 goto out; 3414 goto out;
3408 } 3415 }
3409 3416
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3412 pte = pte_mkyoung(pte); 3419 pte = pte_mkyoung(pte);
3413 if (was_writable) 3420 if (was_writable)
3414 pte = pte_mkwrite(pte); 3421 pte = pte_mkwrite(pte);
3415 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); 3422 set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
3416 update_mmu_cache(vma, fe->address, fe->pte); 3423 update_mmu_cache(vma, vmf->address, vmf->pte);
3417 3424
3418 page = vm_normal_page(vma, fe->address, pte); 3425 page = vm_normal_page(vma, vmf->address, pte);
3419 if (!page) { 3426 if (!page) {
3420 pte_unmap_unlock(fe->pte, fe->ptl); 3427 pte_unmap_unlock(vmf->pte, vmf->ptl);
3421 return 0; 3428 return 0;
3422 } 3429 }
3423 3430
3424 /* TODO: handle PTE-mapped THP */ 3431 /* TODO: handle PTE-mapped THP */
3425 if (PageCompound(page)) { 3432 if (PageCompound(page)) {
3426 pte_unmap_unlock(fe->pte, fe->ptl); 3433 pte_unmap_unlock(vmf->pte, vmf->ptl);
3427 return 0; 3434 return 0;
3428 } 3435 }
3429 3436
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
3447 3454
3448 last_cpupid = page_cpupid_last(page); 3455 last_cpupid = page_cpupid_last(page);
3449 page_nid = page_to_nid(page); 3456 page_nid = page_to_nid(page);
3450 target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, 3457 target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
3451 &flags); 3458 &flags);
3452 pte_unmap_unlock(fe->pte, fe->ptl); 3459 pte_unmap_unlock(vmf->pte, vmf->ptl);
3453 if (target_nid == -1) { 3460 if (target_nid == -1) {
3454 put_page(page); 3461 put_page(page);
3455 goto out; 3462 goto out;
@@ -3469,28 +3476,28 @@ out:
3469 return 0; 3476 return 0;
3470} 3477}
3471 3478
3472static int create_huge_pmd(struct fault_env *fe) 3479static int create_huge_pmd(struct vm_fault *vmf)
3473{ 3480{
3474 struct vm_area_struct *vma = fe->vma; 3481 struct vm_area_struct *vma = vmf->vma;
3475 if (vma_is_anonymous(vma)) 3482 if (vma_is_anonymous(vma))
3476 return do_huge_pmd_anonymous_page(fe); 3483 return do_huge_pmd_anonymous_page(vmf);
3477 if (vma->vm_ops->pmd_fault) 3484 if (vma->vm_ops->pmd_fault)
3478 return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, 3485 return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd,
3479 fe->flags); 3486 vmf->flags);
3480 return VM_FAULT_FALLBACK; 3487 return VM_FAULT_FALLBACK;
3481} 3488}
3482 3489
3483static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) 3490static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
3484{ 3491{
3485 if (vma_is_anonymous(fe->vma)) 3492 if (vma_is_anonymous(vmf->vma))
3486 return do_huge_pmd_wp_page(fe, orig_pmd); 3493 return do_huge_pmd_wp_page(vmf, orig_pmd);
3487 if (fe->vma->vm_ops->pmd_fault) 3494 if (vmf->vma->vm_ops->pmd_fault)
3488 return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, 3495 return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address,
3489 fe->flags); 3496 vmf->pmd, vmf->flags);
3490 3497
3491 /* COW handled on pte level: split pmd */ 3498 /* COW handled on pte level: split pmd */
3492 VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); 3499 VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma);
3493 __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); 3500 __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
3494 3501
3495 return VM_FAULT_FALLBACK; 3502 return VM_FAULT_FALLBACK;
3496} 3503}
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma)
3515 * The mmap_sem may have been released depending on flags and our return value. 3522 * The mmap_sem may have been released depending on flags and our return value.
3516 * See filemap_fault() and __lock_page_or_retry(). 3523 * See filemap_fault() and __lock_page_or_retry().
3517 */ 3524 */
3518static int handle_pte_fault(struct fault_env *fe) 3525static int handle_pte_fault(struct vm_fault *vmf)
3519{ 3526{
3520 pte_t entry; 3527 pte_t entry;
3521 3528
3522 if (unlikely(pmd_none(*fe->pmd))) { 3529 if (unlikely(pmd_none(*vmf->pmd))) {
3523 /* 3530 /*
3524 * Leave __pte_alloc() until later: because vm_ops->fault may 3531 * Leave __pte_alloc() until later: because vm_ops->fault may
3525 * want to allocate huge page, and if we expose page table 3532 * want to allocate huge page, and if we expose page table
3526 * for an instant, it will be difficult to retract from 3533 * for an instant, it will be difficult to retract from
3527 * concurrent faults and from rmap lookups. 3534 * concurrent faults and from rmap lookups.
3528 */ 3535 */
3529 fe->pte = NULL; 3536 vmf->pte = NULL;
3530 } else { 3537 } else {
3531 /* See comment in pte_alloc_one_map() */ 3538 /* See comment in pte_alloc_one_map() */
3532 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) 3539 if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd))
3533 return 0; 3540 return 0;
3534 /* 3541 /*
3535 * A regular pmd is established and it can't morph into a huge 3542 * A regular pmd is established and it can't morph into a huge
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe)
3537 * mmap_sem read mode and khugepaged takes it in write mode. 3544 * mmap_sem read mode and khugepaged takes it in write mode.
3538 * So now it's safe to run pte_offset_map(). 3545 * So now it's safe to run pte_offset_map().
3539 */ 3546 */
3540 fe->pte = pte_offset_map(fe->pmd, fe->address); 3547 vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
3541 3548 vmf->orig_pte = *vmf->pte;
3542 entry = *fe->pte;
3543 3549
3544 /* 3550 /*
3545 * some architectures can have larger ptes than wordsize, 3551 * some architectures can have larger ptes than wordsize,
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe)
3550 * ptl lock held. So here a barrier will do. 3556 * ptl lock held. So here a barrier will do.
3551 */ 3557 */
3552 barrier(); 3558 barrier();
3553 if (pte_none(entry)) { 3559 if (pte_none(vmf->orig_pte)) {
3554 pte_unmap(fe->pte); 3560 pte_unmap(vmf->pte);
3555 fe->pte = NULL; 3561 vmf->pte = NULL;
3556 } 3562 }
3557 } 3563 }
3558 3564
3559 if (!fe->pte) { 3565 if (!vmf->pte) {
3560 if (vma_is_anonymous(fe->vma)) 3566 if (vma_is_anonymous(vmf->vma))
3561 return do_anonymous_page(fe); 3567 return do_anonymous_page(vmf);
3562 else 3568 else
3563 return do_fault(fe); 3569 return do_fault(vmf);
3564 } 3570 }
3565 3571
3566 if (!pte_present(entry)) 3572 if (!pte_present(vmf->orig_pte))
3567 return do_swap_page(fe, entry); 3573 return do_swap_page(vmf);
3568 3574
3569 if (pte_protnone(entry) && vma_is_accessible(fe->vma)) 3575 if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
3570 return do_numa_page(fe, entry); 3576 return do_numa_page(vmf);
3571 3577
3572 fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); 3578 vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
3573 spin_lock(fe->ptl); 3579 spin_lock(vmf->ptl);
3574 if (unlikely(!pte_same(*fe->pte, entry))) 3580 entry = vmf->orig_pte;
3581 if (unlikely(!pte_same(*vmf->pte, entry)))
3575 goto unlock; 3582 goto unlock;
3576 if (fe->flags & FAULT_FLAG_WRITE) { 3583 if (vmf->flags & FAULT_FLAG_WRITE) {
3577 if (!pte_write(entry)) 3584 if (!pte_write(entry))
3578 return do_wp_page(fe, entry); 3585 return do_wp_page(vmf);
3579 entry = pte_mkdirty(entry); 3586 entry = pte_mkdirty(entry);
3580 } 3587 }
3581 entry = pte_mkyoung(entry); 3588 entry = pte_mkyoung(entry);
3582 if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, 3589 if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
3583 fe->flags & FAULT_FLAG_WRITE)) { 3590 vmf->flags & FAULT_FLAG_WRITE)) {
3584 update_mmu_cache(fe->vma, fe->address, fe->pte); 3591 update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
3585 } else { 3592 } else {
3586 /* 3593 /*
3587 * This is needed only for protection faults but the arch code 3594 * This is needed only for protection faults but the arch code
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe)
3589 * This still avoids useless tlb flushes for .text page faults 3596 * This still avoids useless tlb flushes for .text page faults
3590 * with threads. 3597 * with threads.
3591 */ 3598 */
3592 if (fe->flags & FAULT_FLAG_WRITE) 3599 if (vmf->flags & FAULT_FLAG_WRITE)
3593 flush_tlb_fix_spurious_fault(fe->vma, fe->address); 3600 flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
3594 } 3601 }
3595unlock: 3602unlock:
3596 pte_unmap_unlock(fe->pte, fe->ptl); 3603 pte_unmap_unlock(vmf->pte, vmf->ptl);
3597 return 0; 3604 return 0;
3598} 3605}
3599 3606
@@ -3606,10 +3613,12 @@ unlock:
3606static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, 3613static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3607 unsigned int flags) 3614 unsigned int flags)
3608{ 3615{
3609 struct fault_env fe = { 3616 struct vm_fault vmf = {
3610 .vma = vma, 3617 .vma = vma,
3611 .address = address, 3618 .address = address & PAGE_MASK,
3612 .flags = flags, 3619 .flags = flags,
3620 .pgoff = linear_page_index(vma, address),
3621 .gfp_mask = __get_fault_gfp_mask(vma),
3613 }; 3622 };
3614 struct mm_struct *mm = vma->vm_mm; 3623 struct mm_struct *mm = vma->vm_mm;
3615 pgd_t *pgd; 3624 pgd_t *pgd;
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3619 pud = pud_alloc(mm, pgd, address); 3628 pud = pud_alloc(mm, pgd, address);
3620 if (!pud) 3629 if (!pud)
3621 return VM_FAULT_OOM; 3630 return VM_FAULT_OOM;
3622 fe.pmd = pmd_alloc(mm, pud, address); 3631 vmf.pmd = pmd_alloc(mm, pud, address);
3623 if (!fe.pmd) 3632 if (!vmf.pmd)
3624 return VM_FAULT_OOM; 3633 return VM_FAULT_OOM;
3625 if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { 3634 if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
3626 int ret = create_huge_pmd(&fe); 3635 int ret = create_huge_pmd(&vmf);
3627 if (!(ret & VM_FAULT_FALLBACK)) 3636 if (!(ret & VM_FAULT_FALLBACK))
3628 return ret; 3637 return ret;
3629 } else { 3638 } else {
3630 pmd_t orig_pmd = *fe.pmd; 3639 pmd_t orig_pmd = *vmf.pmd;
3631 int ret; 3640 int ret;
3632 3641
3633 barrier(); 3642 barrier();
3634 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { 3643 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3635 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) 3644 if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
3636 return do_huge_pmd_numa_page(&fe, orig_pmd); 3645 return do_huge_pmd_numa_page(&vmf, orig_pmd);
3637 3646
3638 if ((fe.flags & FAULT_FLAG_WRITE) && 3647 if ((vmf.flags & FAULT_FLAG_WRITE) &&
3639 !pmd_write(orig_pmd)) { 3648 !pmd_write(orig_pmd)) {
3640 ret = wp_huge_pmd(&fe, orig_pmd); 3649 ret = wp_huge_pmd(&vmf, orig_pmd);
3641 if (!(ret & VM_FAULT_FALLBACK)) 3650 if (!(ret & VM_FAULT_FALLBACK))
3642 return ret; 3651 return ret;
3643 } else { 3652 } else {
3644 huge_pmd_set_accessed(&fe, orig_pmd); 3653 huge_pmd_set_accessed(&vmf, orig_pmd);
3645 return 0; 3654 return 0;
3646 } 3655 }
3647 } 3656 }
3648 } 3657 }
3649 3658
3650 return handle_pte_fault(&fe); 3659 return handle_pte_fault(&vmf);
3651} 3660}
3652 3661
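
A practical consequence of the initializer above, in sketch form (illustrative, not from the patch): ->fault() and ->map_pages() implementations can rely on the fault core having pre-computed these fields instead of deriving them locally:

	pgoff_t index    = vmf->pgoff;		/* == linear_page_index(vma, address) */
	gfp_t gfp        = vmf->gfp_mask;	/* == __get_fault_gfp_mask(vma)       */
	unsigned long va = vmf->address;	/* already masked with PAGE_MASK      */
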
3653/* 3662/*
@@ -3808,8 +3817,8 @@ out:
3808 return -EINVAL; 3817 return -EINVAL;
3809} 3818}
3810 3819
3811static inline int follow_pte(struct mm_struct *mm, unsigned long address, 3820int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp,
3812 pte_t **ptepp, spinlock_t **ptlp) 3821 spinlock_t **ptlp)
3813{ 3822{
3814 int res; 3823 int res;
3815 3824
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3919 struct page *page = NULL; 3928 struct page *page = NULL;
3920 3929
3921 ret = get_user_pages_remote(tsk, mm, addr, 1, 3930 ret = get_user_pages_remote(tsk, mm, addr, 1,
3922 gup_flags, &page, &vma); 3931 gup_flags, &page, &vma, NULL);
3923 if (ret <= 0) { 3932 if (ret <= 0) {
3924#ifndef CONFIG_HAVE_IOREMAP_PROT 3933#ifndef CONFIG_HAVE_IOREMAP_PROT
3925 break; 3934 break;
diff --git a/mm/nommu.c b/mm/nommu.c
index 27bc543128e5..210d7ec2843c 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
176} 176}
177EXPORT_SYMBOL(get_user_pages_locked); 177EXPORT_SYMBOL(get_user_pages_locked);
178 178
179long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, 179static long __get_user_pages_unlocked(struct task_struct *tsk,
180 unsigned long start, unsigned long nr_pages, 180 struct mm_struct *mm, unsigned long start,
181 struct page **pages, unsigned int gup_flags) 181 unsigned long nr_pages, struct page **pages,
182 unsigned int gup_flags)
182{ 183{
183 long ret; 184 long ret;
184 down_read(&mm->mmap_sem); 185 down_read(&mm->mmap_sem);
@@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
187 up_read(&mm->mmap_sem); 188 up_read(&mm->mmap_sem);
188 return ret; 189 return ret;
189} 190}
190EXPORT_SYMBOL(__get_user_pages_unlocked);
191 191
192long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, 192long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
193 struct page **pages, unsigned int gup_flags) 193 struct page **pages, unsigned int gup_flags)
@@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1801} 1801}
1802EXPORT_SYMBOL(filemap_fault); 1802EXPORT_SYMBOL(filemap_fault);
1803 1803
1804void filemap_map_pages(struct fault_env *fe, 1804void filemap_map_pages(struct vm_fault *vmf,
1805 pgoff_t start_pgoff, pgoff_t end_pgoff) 1805 pgoff_t start_pgoff, pgoff_t end_pgoff)
1806{ 1806{
1807 BUG(); 1807 BUG();
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 52e2f8e3b472..290e8b7d3181 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping,
2106 pgoff_t start, pgoff_t end) 2106 pgoff_t start, pgoff_t end)
2107{ 2107{
2108#define WRITEBACK_TAG_BATCH 4096 2108#define WRITEBACK_TAG_BATCH 4096
2109 unsigned long tagged; 2109 unsigned long tagged = 0;
2110 2110 struct radix_tree_iter iter;
2111 do { 2111 void **slot;
2112 spin_lock_irq(&mapping->tree_lock); 2112
2113 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, 2113 spin_lock_irq(&mapping->tree_lock);
2114 &start, end, WRITEBACK_TAG_BATCH, 2114 radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
2115 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); 2115 PAGECACHE_TAG_DIRTY) {
2116 if (iter.index > end)
2117 break;
2118 radix_tree_iter_tag_set(&mapping->page_tree, &iter,
2119 PAGECACHE_TAG_TOWRITE);
2120 tagged++;
2121 if ((tagged % WRITEBACK_TAG_BATCH) != 0)
2122 continue;
2123 slot = radix_tree_iter_resume(slot, &iter);
2116 spin_unlock_irq(&mapping->tree_lock); 2124 spin_unlock_irq(&mapping->tree_lock);
2117 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
2118 cond_resched(); 2125 cond_resched();
2119 /* We check 'start' to handle wrapping when end == ~0UL */ 2126 spin_lock_irq(&mapping->tree_lock);
2120 } while (tagged >= WRITEBACK_TAG_BATCH && start); 2127 }
2128 spin_unlock_irq(&mapping->tree_lock);
2121} 2129}
2122EXPORT_SYMBOL(tag_pages_for_writeback); 2130EXPORT_SYMBOL(tag_pages_for_writeback);
2123 2131
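
The rewrite above replaces the deleted radix_tree_range_tag_if_tagged() with an open-coded tagged walk that yields tree_lock every WRITEBACK_TAG_BATCH (4096) tagged entries. What makes that safe is radix_tree_iter_resume() (used again in the shmem.c hunks below): called before the lock is released, it makes the next loop iteration perform a fresh lookup from the following index instead of reusing chunk pointers that may have been invalidated. The idiom in isolation:

	/* inside a radix_tree_for_each_* loop, before releasing the lock */
	slot = radix_tree_iter_resume(slot, &iter);
	spin_unlock_irq(&mapping->tree_lock);
	cond_resched();
	spin_lock_irq(&mapping->tree_lock);
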
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f64e7bcb43b7..2c6d5f64feca 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc,
3925 return page; 3925 return page;
3926} 3926}
3927 3927
3928void __page_frag_drain(struct page *page, unsigned int order,
3929 unsigned int count)
3930{
3931 VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
3932
3933 if (page_ref_sub_and_test(page, count)) {
3934 if (order == 0)
3935 free_hot_cold_page(page, false);
3936 else
3937 __free_pages_ok(page, order);
3938 }
3939}
3940EXPORT_SYMBOL(__page_frag_drain);
3941
3928void *__alloc_page_frag(struct page_frag_cache *nc, 3942void *__alloc_page_frag(struct page_frag_cache *nc,
3929 unsigned int fragsz, gfp_t gfp_mask) 3943 unsigned int fragsz, gfp_t gfp_mask)
3930{ 3944{
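
A hedged usage sketch for the new export (caller-side names are hypothetical): a driver that took several extra references on a page-frag page can return them in a single call, and the page is freed only if the count drops to zero:

	__page_frag_drain(frag_page, frag_order, outstanding_refs);
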
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index be8dc8d1edb9..84d0c7eada2b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
88 ssize_t rc = 0; 88 ssize_t rc = 0;
89 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES 89 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
90 / sizeof(struct pages *); 90 / sizeof(struct pages *);
91 unsigned int flags = FOLL_REMOTE; 91 unsigned int flags = 0;
92 92
93 /* Work out address and page range required */ 93 /* Work out address and page range required */
94 if (len == 0) 94 if (len == 0)
@@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr,
100 100
101 while (!rc && nr_pages && iov_iter_count(iter)) { 101 while (!rc && nr_pages && iov_iter_count(iter)) {
102 int pages = min(nr_pages, max_pages_per_loop); 102 int pages = min(nr_pages, max_pages_per_loop);
103 int locked = 1;
103 size_t bytes; 104 size_t bytes;
104 105
105 /* 106 /*
106 * Get the pages we're interested in. We must 107 * Get the pages we're interested in. We must
107 * add FOLL_REMOTE because task/mm might not 108 * access remotely because task/mm might not
108 * current/current->mm 109 * current/current->mm
109 */ 110 */
110 pages = __get_user_pages_unlocked(task, mm, pa, pages, 111 down_read(&mm->mmap_sem);
111 process_pages, flags); 112 pages = get_user_pages_remote(task, mm, pa, pages, flags,
113 process_pages, NULL, &locked);
114 if (locked)
115 up_read(&mm->mmap_sem);
112 if (pages <= 0) 116 if (pages <= 0)
113 return -EFAULT; 117 return -EFAULT;
114 118
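
The hunk above shows the new calling convention for get_user_pages_remote(): the extra 'locked' out-parameter reports whether mmap_sem is still held on return. The pattern, compressed (placeholder variable names):

	int locked = 1;
	long pinned;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_remote(task, mm, addr, nr_pages, gup_flags,
				       pages, NULL, &locked);
	if (locked)
		up_read(&mm->mmap_sem);	/* gup clears 'locked' if it already dropped it */
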
diff --git a/mm/shmem.c b/mm/shmem.c
index abd7403aba41..54287d443806 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
661 swapped++; 661 swapped++;
662 662
663 if (need_resched()) { 663 if (need_resched()) {
664 slot = radix_tree_iter_resume(slot, &iter);
664 cond_resched_rcu(); 665 cond_resched_rcu();
665 slot = radix_tree_iter_next(&iter);
666 } 666 }
667 } 667 }
668 668
@@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode)
1049 clear_inode(inode); 1049 clear_inode(inode);
1050} 1050}
1051 1051
1052static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
1053{
1054 struct radix_tree_iter iter;
1055 void **slot;
1056 unsigned long found = -1;
1057 unsigned int checked = 0;
1058
1059 rcu_read_lock();
1060 radix_tree_for_each_slot(slot, root, &iter, 0) {
1061 if (*slot == item) {
1062 found = iter.index;
1063 break;
1064 }
1065 checked++;
1066 if ((checked % 4096) != 0)
1067 continue;
1068 slot = radix_tree_iter_resume(slot, &iter);
1069 cond_resched_rcu();
1070 }
1071
1072 rcu_read_unlock();
1073 return found;
1074}
1075
1052/* 1076/*
1053 * If swap found in inode, free it and move page from swapcache to filecache. 1077 * If swap found in inode, free it and move page from swapcache to filecache.
1054 */ 1078 */
@@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
1062 int error = 0; 1086 int error = 0;
1063 1087
1064 radswap = swp_to_radix_entry(swap); 1088 radswap = swp_to_radix_entry(swap);
1065 index = radix_tree_locate_item(&mapping->page_tree, radswap); 1089 index = find_swap_entry(&mapping->page_tree, radswap);
1066 if (index == -1) 1090 if (index == -1)
1067 return -EAGAIN; /* tell shmem_unuse we found nothing */ 1091 return -EAGAIN; /* tell shmem_unuse we found nothing */
1068 1092
@@ -2447,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping)
2447 } 2471 }
2448 2472
2449 if (need_resched()) { 2473 if (need_resched()) {
2474 slot = radix_tree_iter_resume(slot, &iter);
2450 cond_resched_rcu(); 2475 cond_resched_rcu();
2451 slot = radix_tree_iter_next(&iter);
2452 } 2476 }
2453 } 2477 }
2454 rcu_read_unlock(); 2478 rcu_read_unlock();
@@ -2517,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping)
2517 spin_unlock_irq(&mapping->tree_lock); 2541 spin_unlock_irq(&mapping->tree_lock);
2518continue_resched: 2542continue_resched:
2519 if (need_resched()) { 2543 if (need_resched()) {
2544 slot = radix_tree_iter_resume(slot, &iter);
2520 cond_resched_rcu(); 2545 cond_resched_rcu();
2521 slot = radix_tree_iter_next(&iter);
2522 } 2546 }
2523 } 2547 }
2524 rcu_read_unlock(); 2548 rcu_read_unlock();
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2d59c9be40e1..5f63f6dcaabb 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -762,16 +762,17 @@ static const struct net_proto_family rxrpc_family_ops = {
762static int __init af_rxrpc_init(void) 762static int __init af_rxrpc_init(void)
763{ 763{
764 int ret = -1; 764 int ret = -1;
765 unsigned int tmp;
765 766
766 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); 767 BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
767 768
768 get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); 769 get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch));
769 rxrpc_epoch |= RXRPC_RANDOM_EPOCH; 770 rxrpc_epoch |= RXRPC_RANDOM_EPOCH;
770 get_random_bytes(&rxrpc_client_conn_ids.cur, 771 get_random_bytes(&tmp, sizeof(tmp));
771 sizeof(rxrpc_client_conn_ids.cur)); 772 tmp &= 0x3fffffff;
772 rxrpc_client_conn_ids.cur &= 0x3fffffff; 773 if (tmp == 0)
773 if (rxrpc_client_conn_ids.cur == 0) 774 tmp = 1;
774 rxrpc_client_conn_ids.cur = 1; 775 idr_set_cursor(&rxrpc_client_conn_ids, tmp);
775 776
776 ret = -ENOMEM; 777 ret = -ENOMEM;
777 rxrpc_call_jar = kmem_cache_create( 778 rxrpc_call_jar = kmem_cache_create(
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 60ef9605167e..6cbcdcc29853 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -263,12 +263,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
263 * times the maximum number of client conns away from the current 263 * times the maximum number of client conns away from the current
264 * allocation point to try and keep the IDs concentrated. 264 * allocation point to try and keep the IDs concentrated.
265 */ 265 */
266 id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); 266 id_cursor = idr_get_cursor(&rxrpc_client_conn_ids);
267 id = conn->proto.cid >> RXRPC_CIDSHIFT; 267 id = conn->proto.cid >> RXRPC_CIDSHIFT;
268 distance = id - id_cursor; 268 distance = id - id_cursor;
269 if (distance < 0) 269 if (distance < 0)
270 distance = -distance; 270 distance = -distance;
271 limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; 271 limit = max(rxrpc_max_client_connections * 4, 1024U);
272 if (distance > limit) 272 if (distance > limit)
273 goto mark_dont_reuse; 273 goto mark_dont_reuse;
274 274
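
Between them, the two rxrpc hunks stop poking at rxrpc_client_conn_ids.cur directly and go through the new accessor pair instead. In sketch form (the idr name and values are illustrative):

	/* seed the allocation cursor once at init time ... */
	idr_set_cursor(&my_idr, initial_id);

	/* ... and read it back later without knowing the IDR's layout */
	unsigned int cur = idr_get_cursor(&my_idr);
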
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 682b73af7766..838ffa78cfda 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
881 * the execve(). 881 * the execve().
882 */ 882 */
883 if (get_user_pages_remote(current, bprm->mm, pos, 1, 883 if (get_user_pages_remote(current, bprm->mm, pos, 1,
884 FOLL_FORCE, &page, NULL) <= 0) 884 FOLL_FORCE, &page, NULL, NULL) <= 0)
885 return false; 885 return false;
886#else 886#else
887 page = bprm->page[pos / PAGE_SIZE]; 887 page = bprm->page[pos / PAGE_SIZE];
diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h
index 9e5f4846967f..beda1a884b50 100644
--- a/tools/include/asm/bug.h
+++ b/tools/include/asm/bug.h
@@ -12,6 +12,17 @@
12 unlikely(__ret_warn_on); \ 12 unlikely(__ret_warn_on); \
13}) 13})
14 14
15#define WARN_ON_ONCE(condition) ({ \
16 static int __warned; \
17 int __ret_warn_once = !!(condition); \
18 \
19 if (unlikely(__ret_warn_once && !__warned)) { \
20 __warned = true; \
21 WARN_ON(1); \
22 } \
23 unlikely(__ret_warn_once); \
24})
25
15#define WARN_ONCE(condition, format...) ({ \ 26#define WARN_ONCE(condition, format...) ({ \
16 static int __warned; \ 27 static int __warned; \
17 int __ret_warn_once = !!(condition); \ 28 int __ret_warn_once = !!(condition); \
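The WARN_ON_ONCE() added to the tools copy of asm/bug.h warns only the first time its condition is true, but still evaluates to the condition on every call, so callers can keep using it in error paths. A small usage sketch, assuming the usual -Itools/include build setup; push_item() is a hypothetical caller, not part of the patch:

#include <stdio.h>
#include <asm/bug.h>

static int push_item(int *stack, unsigned int *top, unsigned int max, int v)
{
	/* Warns on the first overflow only, but fails every overflow. */
	if (WARN_ON_ONCE(*top >= max))
		return -1;
	stack[(*top)++] = v;
	return 0;
}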
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h
index 43c1c5021e4b..eef41d500e9e 100644
--- a/tools/include/linux/bitmap.h
+++ b/tools/include/linux/bitmap.h
@@ -35,6 +35,32 @@ static inline void bitmap_zero(unsigned long *dst, int nbits)
35 } 35 }
36} 36}
37 37
38static inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
39{
40 unsigned int nlongs = BITS_TO_LONGS(nbits);
41 if (!small_const_nbits(nbits)) {
42 unsigned int len = (nlongs - 1) * sizeof(unsigned long);
43 memset(dst, 0xff, len);
44 }
45 dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
46}
47
48static inline int bitmap_empty(const unsigned long *src, unsigned nbits)
49{
50 if (small_const_nbits(nbits))
51 return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
52
53 return find_first_bit(src, nbits) == nbits;
54}
55
56static inline int bitmap_full(const unsigned long *src, unsigned int nbits)
57{
58 if (small_const_nbits(nbits))
59 return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
60
61 return find_first_zero_bit(src, nbits) == nbits;
62}
63
38static inline int bitmap_weight(const unsigned long *src, int nbits) 64static inline int bitmap_weight(const unsigned long *src, int nbits)
39{ 65{
40 if (small_const_nbits(nbits)) 66 if (small_const_nbits(nbits))
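bitmap_fill(), bitmap_empty() and bitmap_full() mirror the kernel versions: fill sets exactly nbits bits, and the emptiness/fullness checks fall back to find_first_bit()/find_first_zero_bit() for bitmaps wider than one word. A quick usage sketch, assuming the find_*_bit helpers from tools/lib are linked in as the header expects; bitmap_helpers_demo() is illustrative only:

#include <assert.h>
#include <linux/bitmap.h>

static void bitmap_helpers_demo(void)
{
	/* 70 bits: two longs on a 64-bit host */
	unsigned long map[BITS_TO_LONGS(70)] = { 0 };

	assert(bitmap_empty(map, 70));
	bitmap_fill(map, 70);		/* sets bits 0..69, leaves the rest clear */
	assert(bitmap_full(map, 70));
	assert(!bitmap_empty(map, 70));
}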
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index d08e214ec6e7..be93ab02b490 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -719,14 +719,14 @@ sub set_value {
719 719
720 if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { 720 if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") {
721 # Note if a test is something other than build, then we 721 # Note if a test is something other than build, then we
722 # will need other manditory options. 722 # will need other mandatory options.
723 if ($prvalue ne "install") { 723 if ($prvalue ne "install") {
724 # for bisect, we need to check BISECT_TYPE 724 # for bisect, we need to check BISECT_TYPE
725 if ($prvalue ne "bisect") { 725 if ($prvalue ne "bisect") {
726 $buildonly = 0; 726 $buildonly = 0;
727 } 727 }
728 } else { 728 } else {
729 # install still limits some manditory options. 729 # install still limits some mandatory options.
730 $buildonly = 2; 730 $buildonly = 2;
731 } 731 }
732 } 732 }
@@ -735,7 +735,7 @@ sub set_value {
735 if ($prvalue ne "install") { 735 if ($prvalue ne "install") {
736 $buildonly = 0; 736 $buildonly = 0;
737 } else { 737 } else {
738 # install still limits some manditory options. 738 # install still limits some mandatory options.
739 $buildonly = 2; 739 $buildonly = 2;
740 } 740 }
741 } 741 }
@@ -3989,7 +3989,7 @@ sub make_min_config {
3989 } 3989 }
3990 } 3990 }
3991 3991
3992 # Save off all the current mandidory configs 3992 # Save off all the current mandatory configs
3993 open (OUT, ">$temp_config") 3993 open (OUT, ">$temp_config")
3994 or die "Can't write to $temp_config"; 3994 or die "Can't write to $temp_config";
3995 foreach my $config (keys %keep_configs) { 3995 foreach my $config (keys %keep_configs) {
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile
index f2e07f2fd4b4..3635e4d3eca7 100644
--- a/tools/testing/radix-tree/Makefile
+++ b/tools/testing/radix-tree/Makefile
@@ -1,10 +1,14 @@
1 1
2CFLAGS += -I. -g -O2 -Wall -D_LGPL_SOURCE 2CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE
3LDFLAGS += -lpthread -lurcu 3LDFLAGS += -lpthread -lurcu
4TARGETS = main 4TARGETS = main
5OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ 5OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \
6 regression1.o regression2.o regression3.o multiorder.o \ 6 regression1.o regression2.o regression3.o multiorder.o \
7 iteration_check.o 7 iteration_check.o benchmark.o
8
9ifdef BENCHMARK
10 CFLAGS += -DBENCHMARK=1
11endif
8 12
9targets: $(TARGETS) 13targets: $(TARGETS)
10 14
@@ -14,7 +18,12 @@ main: $(OFILES)
14clean: 18clean:
15 $(RM) -f $(TARGETS) *.o radix-tree.c 19 $(RM) -f $(TARGETS) *.o radix-tree.c
16 20
17$(OFILES): *.h */*.h ../../../include/linux/radix-tree.h ../../include/linux/*.h 21find_next_bit.o: ../../lib/find_bit.c
22 $(CC) $(CFLAGS) -c -o $@ $<
23
24$(OFILES): *.h */*.h \
25 ../../include/linux/*.h \
26 ../../../include/linux/radix-tree.h
18 27
19radix-tree.c: ../../../lib/radix-tree.c 28radix-tree.c: ../../../lib/radix-tree.c
20 sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ 29 sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@
diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c
new file mode 100644
index 000000000000..215ca86c7605
--- /dev/null
+++ b/tools/testing/radix-tree/benchmark.c
@@ -0,0 +1,98 @@
1/*
2 * benchmark.c:
3 * Author: Konstantin Khlebnikov <koct9i@gmail.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#include <linux/radix-tree.h>
15#include <linux/slab.h>
16#include <linux/errno.h>
17#include <time.h>
18#include "test.h"
19
20#define NSEC_PER_SEC 1000000000L
21
22static long long benchmark_iter(struct radix_tree_root *root, bool tagged)
23{
24 volatile unsigned long sink = 0;
25 struct radix_tree_iter iter;
26 struct timespec start, finish;
27 long long nsec;
28 int l, loops = 1;
29 void **slot;
30
31#ifdef BENCHMARK
32again:
33#endif
34 clock_gettime(CLOCK_MONOTONIC, &start);
35 for (l = 0; l < loops; l++) {
36 if (tagged) {
37 radix_tree_for_each_tagged(slot, root, &iter, 0, 0)
38 sink ^= (unsigned long)slot;
39 } else {
40 radix_tree_for_each_slot(slot, root, &iter, 0)
41 sink ^= (unsigned long)slot;
42 }
43 }
44 clock_gettime(CLOCK_MONOTONIC, &finish);
45
46 nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
47 (finish.tv_nsec - start.tv_nsec);
48
49#ifdef BENCHMARK
50 if (loops == 1 && nsec * 5 < NSEC_PER_SEC) {
51 loops = NSEC_PER_SEC / nsec / 4 + 1;
52 goto again;
53 }
54#endif
55
56 nsec /= loops;
57 return nsec;
58}
59
60static void benchmark_size(unsigned long size, unsigned long step, int order)
61{
62 RADIX_TREE(tree, GFP_KERNEL);
63 long long normal, tagged;
64 unsigned long index;
65
66 for (index = 0 ; index < size ; index += step) {
67 item_insert_order(&tree, index, order);
68 radix_tree_tag_set(&tree, index, 0);
69 }
70
71 tagged = benchmark_iter(&tree, true);
72 normal = benchmark_iter(&tree, false);
73
74 printf("Size %ld, step %6ld, order %d tagged %10lld ns, normal %10lld ns\n",
75 size, step, order, tagged, normal);
76
77 item_kill_tree(&tree);
78 rcu_barrier();
79}
80
81void benchmark(void)
82{
83 unsigned long size[] = {1 << 10, 1 << 20, 0};
84 unsigned long step[] = {1, 2, 7, 15, 63, 64, 65,
85 128, 256, 512, 12345, 0};
86 int c, s;
87
88 printf("starting benchmarks\n");
89 printf("RADIX_TREE_MAP_SHIFT = %d\n", RADIX_TREE_MAP_SHIFT);
90
91 for (c = 0; size[c]; c++)
92 for (s = 0; step[s]; s++)
93 benchmark_size(size[c], step[s], 0);
94
95 for (c = 0; size[c]; c++)
96 for (s = 0; step[s]; s++)
97 benchmark_size(size[c], step[s] << 9, 9);
98}
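benchmark_iter() times a full pass over the tree with CLOCK_MONOTONIC and, when built with BENCHMARK=1, calibrates the loop count so each measurement runs for roughly a quarter of a second before reporting a per-iteration figure. The same pattern, distilled into a stand-alone helper (time_workload() and its callback are hypothetical):

#include <time.h>

#define NSEC_PER_SEC 1000000000L

static long long time_workload(void (*workload)(void))
{
	struct timespec start, finish;
	long long nsec;
	int l, loops = 1;

again:
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (l = 0; l < loops; l++)
		workload();
	clock_gettime(CLOCK_MONOTONIC, &finish);

	nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
	       (finish.tv_nsec - start.tv_nsec);
	if (nsec < 1)
		nsec = 1;	/* guard the division if the clock did not advance */

	/* Too short to trust (under 200ms)? Scale up and measure once more. */
	if (loops == 1 && nsec * 5 < NSEC_PER_SEC) {
		loops = NSEC_PER_SEC / nsec / 4 + 1;
		goto again;
	}
	return nsec / loops;
}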
diff --git a/tools/testing/radix-tree/find_next_bit.c b/tools/testing/radix-tree/find_next_bit.c
deleted file mode 100644
index d1c2178bb2d4..000000000000
--- a/tools/testing/radix-tree/find_next_bit.c
+++ /dev/null
@@ -1,57 +0,0 @@
1/* find_next_bit.c: fallback find next bit implementation
2 *
3 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/types.h>
13#include <linux/bitops.h>
14
15#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
16
17/*
18 * Find the next set bit in a memory region.
19 */
20unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
21 unsigned long offset)
22{
23 const unsigned long *p = addr + BITOP_WORD(offset);
24 unsigned long result = offset & ~(BITS_PER_LONG-1);
25 unsigned long tmp;
26
27 if (offset >= size)
28 return size;
29 size -= result;
30 offset %= BITS_PER_LONG;
31 if (offset) {
32 tmp = *(p++);
33 tmp &= (~0UL << offset);
34 if (size < BITS_PER_LONG)
35 goto found_first;
36 if (tmp)
37 goto found_middle;
38 size -= BITS_PER_LONG;
39 result += BITS_PER_LONG;
40 }
41 while (size & ~(BITS_PER_LONG-1)) {
42 if ((tmp = *(p++)))
43 goto found_middle;
44 result += BITS_PER_LONG;
45 size -= BITS_PER_LONG;
46 }
47 if (!size)
48 return result;
49 tmp = *p;
50
51found_first:
52 tmp &= (~0UL >> (BITS_PER_LONG - size));
53 if (tmp == 0UL) /* Are any bits set? */
54 return result + size; /* Nope. */
55found_middle:
56 return result + __ffs(tmp);
57}
diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c
index 9adb8e7415a6..7572b7ed930e 100644
--- a/tools/testing/radix-tree/iteration_check.c
+++ b/tools/testing/radix-tree/iteration_check.c
@@ -16,35 +16,50 @@
16#include <pthread.h> 16#include <pthread.h>
17#include "test.h" 17#include "test.h"
18 18
19#define NUM_THREADS 4 19#define NUM_THREADS 5
20#define TAG 0 20#define MAX_IDX 100
21#define TAG 0
22#define NEW_TAG 1
23
21static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; 24static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
22static pthread_t threads[NUM_THREADS]; 25static pthread_t threads[NUM_THREADS];
23RADIX_TREE(tree, GFP_KERNEL); 26static unsigned int seeds[3];
24bool test_complete; 27static RADIX_TREE(tree, GFP_KERNEL);
28static bool test_complete;
29static int max_order;
25 30
26/* relentlessly fill the tree with tagged entries */ 31/* relentlessly fill the tree with tagged entries */
27static void *add_entries_fn(void *arg) 32static void *add_entries_fn(void *arg)
28{ 33{
29 int pgoff; 34 rcu_register_thread();
30 35
31 while (!test_complete) { 36 while (!test_complete) {
32 for (pgoff = 0; pgoff < 100; pgoff++) { 37 unsigned long pgoff;
38 int order;
39
40 for (pgoff = 0; pgoff < MAX_IDX; pgoff++) {
33 pthread_mutex_lock(&tree_lock); 41 pthread_mutex_lock(&tree_lock);
34 if (item_insert(&tree, pgoff) == 0) 42 for (order = max_order; order >= 0; order--) {
35 item_tag_set(&tree, pgoff, TAG); 43 if (item_insert_order(&tree, pgoff, order)
44 == 0) {
45 item_tag_set(&tree, pgoff, TAG);
46 break;
47 }
48 }
36 pthread_mutex_unlock(&tree_lock); 49 pthread_mutex_unlock(&tree_lock);
37 } 50 }
38 } 51 }
39 52
53 rcu_unregister_thread();
54
40 return NULL; 55 return NULL;
41} 56}
42 57
43/* 58/*
44 * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find 59 * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find
45 * things that have been removed and randomly resetting our iteration to the 60 * things that have been removed and randomly resetting our iteration to the
46 * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and 61 * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
47 * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a 62 * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
48 * NULL 'slot' variable. 63 * NULL 'slot' variable.
49 */ 64 */
50static void *tagged_iteration_fn(void *arg) 65static void *tagged_iteration_fn(void *arg)
@@ -52,17 +67,12 @@ static void *tagged_iteration_fn(void *arg)
52 struct radix_tree_iter iter; 67 struct radix_tree_iter iter;
53 void **slot; 68 void **slot;
54 69
70 rcu_register_thread();
71
55 while (!test_complete) { 72 while (!test_complete) {
56 rcu_read_lock(); 73 rcu_read_lock();
57 radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { 74 radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) {
58 void *entry; 75 void *entry = radix_tree_deref_slot(slot);
59 int i;
60
61 /* busy wait to let removals happen */
62 for (i = 0; i < 1000000; i++)
63 ;
64
65 entry = radix_tree_deref_slot(slot);
66 if (unlikely(!entry)) 76 if (unlikely(!entry))
67 continue; 77 continue;
68 78
@@ -71,20 +81,26 @@ static void *tagged_iteration_fn(void *arg)
71 continue; 81 continue;
72 } 82 }
73 83
74 if (rand() % 50 == 0) 84 if (rand_r(&seeds[0]) % 50 == 0) {
75 slot = radix_tree_iter_next(&iter); 85 slot = radix_tree_iter_resume(slot, &iter);
86 rcu_read_unlock();
87 rcu_barrier();
88 rcu_read_lock();
89 }
76 } 90 }
77 rcu_read_unlock(); 91 rcu_read_unlock();
78 } 92 }
79 93
94 rcu_unregister_thread();
95
80 return NULL; 96 return NULL;
81} 97}
82 98
83/* 99/*
84 * Iterate over the entries, doing a radix_tree_iter_retry() as we find things 100 * Iterate over the entries, doing a radix_tree_iter_retry() as we find things
85 * that have been removed and randomly resetting our iteration to the next 101 * that have been removed and randomly resetting our iteration to the next
86 * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and 102 * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and
87 * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a 103 * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a
88 * NULL 'slot' variable. 104 * NULL 'slot' variable.
89 */ 105 */
90static void *untagged_iteration_fn(void *arg) 106static void *untagged_iteration_fn(void *arg)
@@ -92,17 +108,12 @@ static void *untagged_iteration_fn(void *arg)
92 struct radix_tree_iter iter; 108 struct radix_tree_iter iter;
93 void **slot; 109 void **slot;
94 110
111 rcu_register_thread();
112
95 while (!test_complete) { 113 while (!test_complete) {
96 rcu_read_lock(); 114 rcu_read_lock();
97 radix_tree_for_each_slot(slot, &tree, &iter, 0) { 115 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
98 void *entry; 116 void *entry = radix_tree_deref_slot(slot);
99 int i;
100
101 /* busy wait to let removals happen */
102 for (i = 0; i < 1000000; i++)
103 ;
104
105 entry = radix_tree_deref_slot(slot);
106 if (unlikely(!entry)) 117 if (unlikely(!entry))
107 continue; 118 continue;
108 119
@@ -111,12 +122,18 @@ static void *untagged_iteration_fn(void *arg)
111 continue; 122 continue;
112 } 123 }
113 124
114 if (rand() % 50 == 0) 125 if (rand_r(&seeds[1]) % 50 == 0) {
115 slot = radix_tree_iter_next(&iter); 126 slot = radix_tree_iter_resume(slot, &iter);
127 rcu_read_unlock();
128 rcu_barrier();
129 rcu_read_lock();
130 }
116 } 131 }
117 rcu_read_unlock(); 132 rcu_read_unlock();
118 } 133 }
119 134
135 rcu_unregister_thread();
136
120 return NULL; 137 return NULL;
121} 138}
122 139
@@ -126,47 +143,71 @@ static void *untagged_iteration_fn(void *arg)
126 */ 143 */
127static void *remove_entries_fn(void *arg) 144static void *remove_entries_fn(void *arg)
128{ 145{
146 rcu_register_thread();
147
129 while (!test_complete) { 148 while (!test_complete) {
130 int pgoff; 149 int pgoff;
131 150
132 pgoff = rand() % 100; 151 pgoff = rand_r(&seeds[2]) % MAX_IDX;
133 152
134 pthread_mutex_lock(&tree_lock); 153 pthread_mutex_lock(&tree_lock);
135 item_delete(&tree, pgoff); 154 item_delete(&tree, pgoff);
136 pthread_mutex_unlock(&tree_lock); 155 pthread_mutex_unlock(&tree_lock);
137 } 156 }
138 157
158 rcu_unregister_thread();
159
160 return NULL;
161}
162
163static void *tag_entries_fn(void *arg)
164{
165 rcu_register_thread();
166
167 while (!test_complete) {
168 tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG,
169 NEW_TAG);
170 }
171 rcu_unregister_thread();
139 return NULL; 172 return NULL;
140} 173}
141 174
142/* This is a unit test for a bug found by the syzkaller tester */ 175/* This is a unit test for a bug found by the syzkaller tester */
143void iteration_test(void) 176void iteration_test(unsigned order, unsigned test_duration)
144{ 177{
145 int i; 178 int i;
146 179
147 printf("Running iteration tests for 10 seconds\n"); 180 printf("Running %siteration tests for %d seconds\n",
181 order > 0 ? "multiorder " : "", test_duration);
148 182
149 srand(time(0)); 183 max_order = order;
150 test_complete = false; 184 test_complete = false;
151 185
186 for (i = 0; i < 3; i++)
187 seeds[i] = rand();
188
152 if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { 189 if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) {
153 perror("pthread_create"); 190 perror("create tagged iteration thread");
154 exit(1); 191 exit(1);
155 } 192 }
156 if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { 193 if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) {
157 perror("pthread_create"); 194 perror("create untagged iteration thread");
158 exit(1); 195 exit(1);
159 } 196 }
160 if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { 197 if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) {
161 perror("pthread_create"); 198 perror("create add entry thread");
162 exit(1); 199 exit(1);
163 } 200 }
164 if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { 201 if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) {
165 perror("pthread_create"); 202 perror("create remove entry thread");
203 exit(1);
204 }
205 if (pthread_create(&threads[4], NULL, tag_entries_fn, NULL)) {
206 perror("create tag entry thread");
166 exit(1); 207 exit(1);
167 } 208 }
168 209
169 sleep(10); 210 sleep(test_duration);
170 test_complete = true; 211 test_complete = true;
171 212
172 for (i = 0; i < NUM_THREADS; i++) { 213 for (i = 0; i < NUM_THREADS; i++) {
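Two conventions run through the rewritten iteration test: every worker thread registers itself with liburcu before it takes rcu_read_lock(), and random decisions use rand_r() with a per-thread seed so a failing run can be replayed from the seed main() prints. A compilable skeleton of one such worker, with the tree walk elided (worker() and its loop bound are illustrative, not from the patch):

#include <pthread.h>
#include <stdlib.h>
#include <urcu.h>

static void *worker(void *arg)
{
	unsigned int *seed = arg;	/* e.g. &seeds[thread_index] */
	int i;

	rcu_register_thread();		/* required before rcu_read_lock() on this thread */
	for (i = 0; i < 1000; i++) {
		rcu_read_lock();
		/* ... walk the shared tree ... */
		if (rand_r(seed) % 50 == 0) {
			/* ~2% of the time: drop out and let deferred frees drain */
			rcu_read_unlock();
			rcu_barrier();
			rcu_read_lock();
		}
		rcu_read_unlock();
	}
	rcu_unregister_thread();
	return NULL;
}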
diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c
index 154823737b20..d31ea7c9abec 100644
--- a/tools/testing/radix-tree/linux.c
+++ b/tools/testing/radix-tree/linux.c
@@ -1,14 +1,26 @@
1#include <stdlib.h> 1#include <stdlib.h>
2#include <string.h> 2#include <string.h>
3#include <malloc.h> 3#include <malloc.h>
4#include <pthread.h>
4#include <unistd.h> 5#include <unistd.h>
5#include <assert.h> 6#include <assert.h>
6 7
7#include <linux/mempool.h> 8#include <linux/mempool.h>
9#include <linux/poison.h>
8#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/radix-tree.h>
9#include <urcu/uatomic.h> 12#include <urcu/uatomic.h>
10 13
11int nr_allocated; 14int nr_allocated;
15int preempt_count;
16
17struct kmem_cache {
18 pthread_mutex_t lock;
19 int size;
20 int nr_objs;
21 void *objs;
22 void (*ctor)(void *);
23};
12 24
13void *mempool_alloc(mempool_t *pool, int gfp_mask) 25void *mempool_alloc(mempool_t *pool, int gfp_mask)
14{ 26{
@@ -33,19 +45,59 @@ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
33 45
34void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) 46void *kmem_cache_alloc(struct kmem_cache *cachep, int flags)
35{ 47{
36 void *ret = malloc(cachep->size); 48 struct radix_tree_node *node;
37 if (cachep->ctor) 49
38 cachep->ctor(ret); 50 if (flags & __GFP_NOWARN)
51 return NULL;
52
53 pthread_mutex_lock(&cachep->lock);
54 if (cachep->nr_objs) {
55 cachep->nr_objs--;
56 node = cachep->objs;
57 cachep->objs = node->private_data;
58 pthread_mutex_unlock(&cachep->lock);
59 node->private_data = NULL;
60 } else {
61 pthread_mutex_unlock(&cachep->lock);
62 node = malloc(cachep->size);
63 if (cachep->ctor)
64 cachep->ctor(node);
65 }
66
39 uatomic_inc(&nr_allocated); 67 uatomic_inc(&nr_allocated);
40 return ret; 68 return node;
41} 69}
42 70
43void kmem_cache_free(struct kmem_cache *cachep, void *objp) 71void kmem_cache_free(struct kmem_cache *cachep, void *objp)
44{ 72{
45 assert(objp); 73 assert(objp);
46 uatomic_dec(&nr_allocated); 74 uatomic_dec(&nr_allocated);
47 memset(objp, 0, cachep->size); 75 pthread_mutex_lock(&cachep->lock);
48 free(objp); 76 if (cachep->nr_objs > 10) {
77 memset(objp, POISON_FREE, cachep->size);
78 free(objp);
79 } else {
80 struct radix_tree_node *node = objp;
81 cachep->nr_objs++;
82 node->private_data = cachep->objs;
83 cachep->objs = node;
84 }
85 pthread_mutex_unlock(&cachep->lock);
86}
87
88void *kmalloc(size_t size, gfp_t gfp)
89{
90 void *ret = malloc(size);
91 uatomic_inc(&nr_allocated);
92 return ret;
93}
94
95void kfree(void *p)
96{
97 if (!p)
98 return;
99 uatomic_dec(&nr_allocated);
100 free(p);
49} 101}
50 102
51struct kmem_cache * 103struct kmem_cache *
@@ -54,7 +106,10 @@ kmem_cache_create(const char *name, size_t size, size_t offset,
54{ 106{
55 struct kmem_cache *ret = malloc(sizeof(*ret)); 107 struct kmem_cache *ret = malloc(sizeof(*ret));
56 108
109 pthread_mutex_init(&ret->lock, NULL);
57 ret->size = size; 110 ret->size = size;
111 ret->nr_objs = 0;
112 ret->objs = NULL;
58 ret->ctor = ctor; 113 ret->ctor = ctor;
59 return ret; 114 return ret;
60} 115}
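kmem_cache_alloc()/kmem_cache_free() in the test harness now keep up to ten recently freed radix_tree_nodes on a per-cache freelist threaded through node->private_data, poisoning anything released beyond that, which is much closer to how the kernel's slab behaves. The generic shape of that object cache, with hypothetical names and the link stored in the object's first word instead of a named field:

#include <pthread.h>
#include <stdlib.h>
#include <string.h>

#define CACHE_MAX	10
#define POISON_BYTE	0x6b		/* same value as POISON_FREE */

struct obj_cache {
	pthread_mutex_t lock;		/* initialise with pthread_mutex_init() */
	size_t size;
	int nr_objs;
	void *objs;			/* head of the freelist */
};

static void *cache_alloc(struct obj_cache *c)
{
	void *obj;

	pthread_mutex_lock(&c->lock);
	if (c->nr_objs) {
		c->nr_objs--;
		obj = c->objs;
		c->objs = *(void **)obj;	/* unlink from the freelist */
		pthread_mutex_unlock(&c->lock);
		*(void **)obj = NULL;		/* clear the reused link word */
		return obj;
	}
	pthread_mutex_unlock(&c->lock);
	return malloc(c->size);
}

static void cache_free(struct obj_cache *c, void *obj)
{
	pthread_mutex_lock(&c->lock);
	if (c->nr_objs >= CACHE_MAX) {
		memset(obj, POISON_BYTE, c->size);	/* catch use-after-free */
		free(obj);
	} else {
		*(void **)obj = c->objs;		/* link into the freelist */
		c->objs = obj;
		c->nr_objs++;
	}
	pthread_mutex_unlock(&c->lock);
}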
diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h
index 71d58427ab60..a13e9bc76eec 100644
--- a/tools/testing/radix-tree/linux/bitops.h
+++ b/tools/testing/radix-tree/linux/bitops.h
@@ -2,9 +2,14 @@
2#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ 2#define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/bitops/find.h>
6#include <linux/bitops/hweight.h>
7#include <linux/kernel.h>
5 8
6#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) 9#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
7#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) 10#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
11#define BITS_PER_BYTE 8
12#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
8 13
9/** 14/**
10 * __set_bit - Set a bit in memory 15 * __set_bit - Set a bit in memory
@@ -17,16 +22,16 @@
17 */ 22 */
18static inline void __set_bit(int nr, volatile unsigned long *addr) 23static inline void __set_bit(int nr, volatile unsigned long *addr)
19{ 24{
20 unsigned long mask = BITOP_MASK(nr); 25 unsigned long mask = BIT_MASK(nr);
21 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 26 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
22 27
23 *p |= mask; 28 *p |= mask;
24} 29}
25 30
26static inline void __clear_bit(int nr, volatile unsigned long *addr) 31static inline void __clear_bit(int nr, volatile unsigned long *addr)
27{ 32{
28 unsigned long mask = BITOP_MASK(nr); 33 unsigned long mask = BIT_MASK(nr);
29 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 34 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
30 35
31 *p &= ~mask; 36 *p &= ~mask;
32} 37}
@@ -42,8 +47,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
42 */ 47 */
43static inline void __change_bit(int nr, volatile unsigned long *addr) 48static inline void __change_bit(int nr, volatile unsigned long *addr)
44{ 49{
45 unsigned long mask = BITOP_MASK(nr); 50 unsigned long mask = BIT_MASK(nr);
46 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 51 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
47 52
48 *p ^= mask; 53 *p ^= mask;
49} 54}
@@ -59,8 +64,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
59 */ 64 */
60static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) 65static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
61{ 66{
62 unsigned long mask = BITOP_MASK(nr); 67 unsigned long mask = BIT_MASK(nr);
63 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 68 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
64 unsigned long old = *p; 69 unsigned long old = *p;
65 70
66 *p = old | mask; 71 *p = old | mask;
@@ -78,8 +83,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
78 */ 83 */
79static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 84static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
80{ 85{
81 unsigned long mask = BITOP_MASK(nr); 86 unsigned long mask = BIT_MASK(nr);
82 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 87 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
83 unsigned long old = *p; 88 unsigned long old = *p;
84 89
85 *p = old & ~mask; 90 *p = old & ~mask;
@@ -90,8 +95,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
90static inline int __test_and_change_bit(int nr, 95static inline int __test_and_change_bit(int nr,
91 volatile unsigned long *addr) 96 volatile unsigned long *addr)
92{ 97{
93 unsigned long mask = BITOP_MASK(nr); 98 unsigned long mask = BIT_MASK(nr);
94 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 99 unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
95 unsigned long old = *p; 100 unsigned long old = *p;
96 101
97 *p = old ^ mask; 102 *p = old ^ mask;
@@ -105,7 +110,7 @@ static inline int __test_and_change_bit(int nr,
105 */ 110 */
106static inline int test_bit(int nr, const volatile unsigned long *addr) 111static inline int test_bit(int nr, const volatile unsigned long *addr)
107{ 112{
108 return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); 113 return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
109} 114}
110 115
111/** 116/**
@@ -147,4 +152,9 @@ unsigned long find_next_bit(const unsigned long *addr,
147 unsigned long size, 152 unsigned long size,
148 unsigned long offset); 153 unsigned long offset);
149 154
155static inline unsigned long hweight_long(unsigned long w)
156{
157 return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
158}
159
150#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ 160#endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */
diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h
index 46a825cf2ae1..6a1bcb9d2c4a 100644
--- a/tools/testing/radix-tree/linux/bitops/non-atomic.h
+++ b/tools/testing/radix-tree/linux/bitops/non-atomic.h
@@ -3,7 +3,6 @@
3 3
4#include <asm/types.h> 4#include <asm/types.h>
5 5
6#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
7#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) 6#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
8 7
9/** 8/**
@@ -17,7 +16,7 @@
17 */ 16 */
18static inline void __set_bit(int nr, volatile unsigned long *addr) 17static inline void __set_bit(int nr, volatile unsigned long *addr)
19{ 18{
20 unsigned long mask = BITOP_MASK(nr); 19 unsigned long mask = BIT_MASK(nr);
21 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 20 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
22 21
23 *p |= mask; 22 *p |= mask;
@@ -25,7 +24,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr)
25 24
26static inline void __clear_bit(int nr, volatile unsigned long *addr) 25static inline void __clear_bit(int nr, volatile unsigned long *addr)
27{ 26{
28 unsigned long mask = BITOP_MASK(nr); 27 unsigned long mask = BIT_MASK(nr);
29 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 28 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
30 29
31 *p &= ~mask; 30 *p &= ~mask;
@@ -42,7 +41,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr)
42 */ 41 */
43static inline void __change_bit(int nr, volatile unsigned long *addr) 42static inline void __change_bit(int nr, volatile unsigned long *addr)
44{ 43{
45 unsigned long mask = BITOP_MASK(nr); 44 unsigned long mask = BIT_MASK(nr);
46 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 45 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
47 46
48 *p ^= mask; 47 *p ^= mask;
@@ -59,7 +58,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr)
59 */ 58 */
60static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) 59static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
61{ 60{
62 unsigned long mask = BITOP_MASK(nr); 61 unsigned long mask = BIT_MASK(nr);
63 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 62 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
64 unsigned long old = *p; 63 unsigned long old = *p;
65 64
@@ -78,7 +77,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
78 */ 77 */
79static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) 78static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
80{ 79{
81 unsigned long mask = BITOP_MASK(nr); 80 unsigned long mask = BIT_MASK(nr);
82 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 81 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
83 unsigned long old = *p; 82 unsigned long old = *p;
84 83
@@ -90,7 +89,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
90static inline int __test_and_change_bit(int nr, 89static inline int __test_and_change_bit(int nr,
91 volatile unsigned long *addr) 90 volatile unsigned long *addr)
92{ 91{
93 unsigned long mask = BITOP_MASK(nr); 92 unsigned long mask = BIT_MASK(nr);
94 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); 93 unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr);
95 unsigned long old = *p; 94 unsigned long old = *p;
96 95
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h
index ccbe444977df..23b8ed52f8c8 100644
--- a/tools/testing/radix-tree/linux/bug.h
+++ b/tools/testing/radix-tree/linux/bug.h
@@ -1 +1 @@
#define WARN_ON_ONCE(x) assert(x) #include "asm/bug.h"
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h
index 5201b915f631..5b09b2ce6c33 100644
--- a/tools/testing/radix-tree/linux/gfp.h
+++ b/tools/testing/radix-tree/linux/gfp.h
@@ -3,8 +3,24 @@
3 3
4#define __GFP_BITS_SHIFT 26 4#define __GFP_BITS_SHIFT 26
5#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) 5#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
6#define __GFP_WAIT 1 6
7#define __GFP_ACCOUNT 0 7#define __GFP_HIGH 0x20u
8#define __GFP_NOWARN 0 8#define __GFP_IO 0x40u
9#define __GFP_FS 0x80u
10#define __GFP_NOWARN 0x200u
11#define __GFP_ATOMIC 0x80000u
12#define __GFP_ACCOUNT 0x100000u
13#define __GFP_DIRECT_RECLAIM 0x400000u
14#define __GFP_KSWAPD_RECLAIM 0x2000000u
15
16#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM)
17
18#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM)
19#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
20
21static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
22{
23 return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
24}
9 25
10#endif 26#endif
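With real values behind the GFP flags, gfpflags_allow_blocking() finally distinguishes the two allocation contexts the test suite cares about, which lets the radix-tree code under test take its non-preloaded paths for atomic allocations. A two-line check, assuming the test suite's -I. include setup (gfp_demo() is illustrative):

#include <assert.h>
#include <linux/gfp.h>

static void gfp_demo(void)
{
	assert(gfpflags_allow_blocking(GFP_KERNEL));	/* __GFP_DIRECT_RECLAIM is set */
	assert(!gfpflags_allow_blocking(GFP_ATOMIC));	/* kswapd reclaim only */
}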
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index be98a47b4e1b..9b43b4975d83 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -8,9 +8,14 @@
8#include <limits.h> 8#include <limits.h>
9 9
10#include "../../include/linux/compiler.h" 10#include "../../include/linux/compiler.h"
11#include "../../include/linux/err.h"
11#include "../../../include/linux/kconfig.h" 12#include "../../../include/linux/kconfig.h"
12 13
14#ifdef BENCHMARK
15#define RADIX_TREE_MAP_SHIFT 6
16#else
13#define RADIX_TREE_MAP_SHIFT 3 17#define RADIX_TREE_MAP_SHIFT 3
18#endif
14 19
15#ifndef NULL 20#ifndef NULL
16#define NULL 0 21#define NULL 0
@@ -43,4 +48,17 @@ static inline int in_interrupt(void)
43{ 48{
44 return 0; 49 return 0;
45} 50}
51
52/*
53 * This looks more complex than it should be. But we need to
54 * get the type for the ~ right in round_down (it needs to be
55 * as wide as the result!), and we want to evaluate the macro
56 * arguments just once each.
57 */
58#define __round_mask(x, y) ((__typeof__(x))((y)-1))
59#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
60#define round_down(x, y) ((x) & ~__round_mask(x, y))
61
62#define xchg(ptr, x) uatomic_xchg(ptr, x)
63
46#endif /* _KERNEL_H */ 64#endif /* _KERNEL_H */
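round_up()/round_down() here only work for power-of-two alignments, and the __round_mask() cast is what keeps the mask as wide as x. A few worked values, assuming the test suite's linux/kernel.h is on the include path (rounding_demo() is illustrative):

#include <assert.h>
#include <linux/kernel.h>

static void rounding_demo(void)
{
	assert(round_up(7, 4) == 8);		/* ((7 - 1) | 3) + 1 */
	assert(round_up(8, 4) == 8);		/* already aligned: unchanged */
	assert(round_down(7, 4) == 4);		/* 7 & ~3 */
	/* the mask is cast to the type of x, so wide values are not truncated */
	assert(round_down(~0UL, 4) == ~0UL - 3);
}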
diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h
index 6210672e3baa..65c04c226965 100644
--- a/tools/testing/radix-tree/linux/preempt.h
+++ b/tools/testing/radix-tree/linux/preempt.h
@@ -1,4 +1,4 @@
1/* */ 1extern int preempt_count;
2 2
3#define preempt_disable() do { } while (0) 3#define preempt_disable() uatomic_inc(&preempt_count)
4#define preempt_enable() do { } while (0) 4#define preempt_enable() uatomic_dec(&preempt_count)
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h
index 6d5a34770fd4..e40337f41a38 100644
--- a/tools/testing/radix-tree/linux/slab.h
+++ b/tools/testing/radix-tree/linux/slab.h
@@ -7,15 +7,8 @@
7#define SLAB_PANIC 2 7#define SLAB_PANIC 2
8#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 8#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
9 9
10static inline int gfpflags_allow_blocking(gfp_t mask) 10void *kmalloc(size_t size, gfp_t);
11{ 11void kfree(void *);
12 return 1;
13}
14
15struct kmem_cache {
16 int size;
17 void (*ctor)(void *);
18};
19 12
20void *kmem_cache_alloc(struct kmem_cache *cachep, int flags); 13void *kmem_cache_alloc(struct kmem_cache *cachep, int flags);
21void kmem_cache_free(struct kmem_cache *cachep, void *objp); 14void kmem_cache_free(struct kmem_cache *cachep, void *objp);
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h
index faa0b6ff9ca8..8491d89873bb 100644
--- a/tools/testing/radix-tree/linux/types.h
+++ b/tools/testing/radix-tree/linux/types.h
@@ -6,8 +6,6 @@
6#define __rcu 6#define __rcu
7#define __read_mostly 7#define __read_mostly
8 8
9#define BITS_PER_LONG (sizeof(long) * 8)
10
11static inline void INIT_LIST_HEAD(struct list_head *list) 9static inline void INIT_LIST_HEAD(struct list_head *list)
12{ 10{
13 list->next = list; 11 list->next = list;
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c
index daa9010693e8..f7e9801a6754 100644
--- a/tools/testing/radix-tree/main.c
+++ b/tools/testing/radix-tree/main.c
@@ -67,7 +67,6 @@ void big_gang_check(bool long_run)
67 67
68 for (i = 0; i < (long_run ? 1000 : 3); i++) { 68 for (i = 0; i < (long_run ? 1000 : 3); i++) {
69 __big_gang_check(); 69 __big_gang_check();
70 srand(time(0));
71 printf("%d ", i); 70 printf("%d ", i);
72 fflush(stdout); 71 fflush(stdout);
73 } 72 }
@@ -206,8 +205,7 @@ void copy_tag_check(void)
206 } 205 }
207 206
208// printf("\ncopying tags...\n"); 207// printf("\ncopying tags...\n");
209 cur = start; 208 tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1);
210 tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1);
211 209
212// printf("checking copied tags\n"); 210// printf("checking copied tags\n");
213 assert(tagged == count); 211 assert(tagged == count);
@@ -215,16 +213,13 @@ void copy_tag_check(void)
215 213
216 /* Copy tags in several rounds */ 214 /* Copy tags in several rounds */
217// printf("\ncopying tags...\n"); 215// printf("\ncopying tags...\n");
218 cur = start; 216 tmp = rand() % (count / 10 + 2);
219 do { 217 tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2);
220 tmp = rand() % (count/10+2); 218 assert(tagged == count);
221 tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2);
222 } while (tmp == tagged);
223 219
224// printf("%lu %lu %lu\n", tagged, tmp, count); 220// printf("%lu %lu %lu\n", tagged, tmp, count);
225// printf("checking copied tags\n"); 221// printf("checking copied tags\n");
226 check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); 222 check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2);
227 assert(tagged < tmp);
228 verify_tag_consistency(&tree, 0); 223 verify_tag_consistency(&tree, 0);
229 verify_tag_consistency(&tree, 1); 224 verify_tag_consistency(&tree, 1);
230 verify_tag_consistency(&tree, 2); 225 verify_tag_consistency(&tree, 2);
@@ -240,7 +235,7 @@ static void __locate_check(struct radix_tree_root *tree, unsigned long index,
240 235
241 item_insert_order(tree, index, order); 236 item_insert_order(tree, index, order);
242 item = item_lookup(tree, index); 237 item = item_lookup(tree, index);
243 index2 = radix_tree_locate_item(tree, item); 238 index2 = find_item(tree, item);
244 if (index != index2) { 239 if (index != index2) {
245 printf("index %ld order %d inserted; found %ld\n", 240 printf("index %ld order %d inserted; found %ld\n",
246 index, order, index2); 241 index, order, index2);
@@ -274,17 +269,17 @@ static void locate_check(void)
274 index += (1UL << order)) { 269 index += (1UL << order)) {
275 __locate_check(&tree, index + offset, order); 270 __locate_check(&tree, index + offset, order);
276 } 271 }
277 if (radix_tree_locate_item(&tree, &tree) != -1) 272 if (find_item(&tree, &tree) != -1)
278 abort(); 273 abort();
279 274
280 item_kill_tree(&tree); 275 item_kill_tree(&tree);
281 } 276 }
282 } 277 }
283 278
284 if (radix_tree_locate_item(&tree, &tree) != -1) 279 if (find_item(&tree, &tree) != -1)
285 abort(); 280 abort();
286 __locate_check(&tree, -1, 0); 281 __locate_check(&tree, -1, 0);
287 if (radix_tree_locate_item(&tree, &tree) != -1) 282 if (find_item(&tree, &tree) != -1)
288 abort(); 283 abort();
289 item_kill_tree(&tree); 284 item_kill_tree(&tree);
290} 285}
@@ -293,50 +288,80 @@ static void single_thread_tests(bool long_run)
293{ 288{
294 int i; 289 int i;
295 290
296 printf("starting single_thread_tests: %d allocated\n", nr_allocated); 291 printf("starting single_thread_tests: %d allocated, preempt %d\n",
292 nr_allocated, preempt_count);
297 multiorder_checks(); 293 multiorder_checks();
298 printf("after multiorder_check: %d allocated\n", nr_allocated); 294 rcu_barrier();
295 printf("after multiorder_check: %d allocated, preempt %d\n",
296 nr_allocated, preempt_count);
299 locate_check(); 297 locate_check();
300 printf("after locate_check: %d allocated\n", nr_allocated); 298 rcu_barrier();
299 printf("after locate_check: %d allocated, preempt %d\n",
300 nr_allocated, preempt_count);
301 tag_check(); 301 tag_check();
302 printf("after tag_check: %d allocated\n", nr_allocated); 302 rcu_barrier();
303 printf("after tag_check: %d allocated, preempt %d\n",
304 nr_allocated, preempt_count);
303 gang_check(); 305 gang_check();
304 printf("after gang_check: %d allocated\n", nr_allocated); 306 rcu_barrier();
307 printf("after gang_check: %d allocated, preempt %d\n",
308 nr_allocated, preempt_count);
305 add_and_check(); 309 add_and_check();
306 printf("after add_and_check: %d allocated\n", nr_allocated); 310 rcu_barrier();
311 printf("after add_and_check: %d allocated, preempt %d\n",
312 nr_allocated, preempt_count);
307 dynamic_height_check(); 313 dynamic_height_check();
308 printf("after dynamic_height_check: %d allocated\n", nr_allocated); 314 rcu_barrier();
315 printf("after dynamic_height_check: %d allocated, preempt %d\n",
316 nr_allocated, preempt_count);
309 big_gang_check(long_run); 317 big_gang_check(long_run);
310 printf("after big_gang_check: %d allocated\n", nr_allocated); 318 rcu_barrier();
319 printf("after big_gang_check: %d allocated, preempt %d\n",
320 nr_allocated, preempt_count);
311 for (i = 0; i < (long_run ? 2000 : 3); i++) { 321 for (i = 0; i < (long_run ? 2000 : 3); i++) {
312 copy_tag_check(); 322 copy_tag_check();
313 printf("%d ", i); 323 printf("%d ", i);
314 fflush(stdout); 324 fflush(stdout);
315 } 325 }
316 printf("after copy_tag_check: %d allocated\n", nr_allocated); 326 rcu_barrier();
327 printf("after copy_tag_check: %d allocated, preempt %d\n",
328 nr_allocated, preempt_count);
317} 329}
318 330
319int main(int argc, char **argv) 331int main(int argc, char **argv)
320{ 332{
321 bool long_run = false; 333 bool long_run = false;
322 int opt; 334 int opt;
335 unsigned int seed = time(NULL);
323 336
324 while ((opt = getopt(argc, argv, "l")) != -1) { 337 while ((opt = getopt(argc, argv, "ls:")) != -1) {
325 if (opt == 'l') 338 if (opt == 'l')
326 long_run = true; 339 long_run = true;
340 else if (opt == 's')
341 seed = strtoul(optarg, NULL, 0);
327 } 342 }
328 343
344 printf("random seed %u\n", seed);
345 srand(seed);
346
329 rcu_register_thread(); 347 rcu_register_thread();
330 radix_tree_init(); 348 radix_tree_init();
331 349
332 regression1_test(); 350 regression1_test();
333 regression2_test(); 351 regression2_test();
334 regression3_test(); 352 regression3_test();
335 iteration_test(); 353 iteration_test(0, 10);
354 iteration_test(7, 20);
336 single_thread_tests(long_run); 355 single_thread_tests(long_run);
337 356
338 sleep(1); 357 /* Free any remaining preallocated nodes */
339 printf("after sleep(1): %d allocated\n", nr_allocated); 358 radix_tree_cpu_dead(0);
359
360 benchmark();
361
362 rcu_barrier();
363 printf("after rcu_barrier: %d allocated, preempt %d\n",
364 nr_allocated, preempt_count);
340 rcu_unregister_thread(); 365 rcu_unregister_thread();
341 366
342 exit(0); 367 exit(0);
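main() now prints its random seed and accepts -s to replay it, and an rcu_barrier() precedes each nr_allocated report so deferred node frees have completed before the leak counter is read. The seed handling, restated as a stand-alone pattern (a hypothetical test program, not the file above):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	unsigned int seed = time(NULL);
	int opt;

	while ((opt = getopt(argc, argv, "s:")) != -1) {
		if (opt == 's')
			seed = strtoul(optarg, NULL, 0);
	}
	printf("random seed %u\n", seed);	/* pass this back via -s to reproduce */
	srand(seed);

	/* ... randomised tests built on rand()/rand_r() go here ... */
	return 0;
}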
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c
index d1be94667a30..f79812a5e070 100644
--- a/tools/testing/radix-tree/multiorder.c
+++ b/tools/testing/radix-tree/multiorder.c
@@ -26,7 +26,6 @@ static void __multiorder_tag_test(int index, int order)
26{ 26{
27 RADIX_TREE(tree, GFP_KERNEL); 27 RADIX_TREE(tree, GFP_KERNEL);
28 int base, err, i; 28 int base, err, i;
29 unsigned long first = 0;
30 29
31 /* our canonical entry */ 30 /* our canonical entry */
32 base = index & ~((1 << order) - 1); 31 base = index & ~((1 << order) - 1);
@@ -60,7 +59,7 @@ static void __multiorder_tag_test(int index, int order)
60 assert(!radix_tree_tag_get(&tree, i, 1)); 59 assert(!radix_tree_tag_get(&tree, i, 1));
61 } 60 }
62 61
63 assert(radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 10, 0, 1) == 1); 62 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1);
64 assert(radix_tree_tag_clear(&tree, index, 0)); 63 assert(radix_tree_tag_clear(&tree, index, 0));
65 64
66 for_each_index(i, base, order) { 65 for_each_index(i, base, order) {
@@ -76,8 +75,27 @@ static void __multiorder_tag_test(int index, int order)
76 item_kill_tree(&tree); 75 item_kill_tree(&tree);
77} 76}
78 77
78static void __multiorder_tag_test2(unsigned order, unsigned long index2)
79{
80 RADIX_TREE(tree, GFP_KERNEL);
81 unsigned long index = (1 << order);
82 index2 += index;
83
84 assert(item_insert_order(&tree, 0, order) == 0);
85 assert(item_insert(&tree, index2) == 0);
86
87 assert(radix_tree_tag_set(&tree, 0, 0));
88 assert(radix_tree_tag_set(&tree, index2, 0));
89
90 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2);
91
92 item_kill_tree(&tree);
93}
94
79static void multiorder_tag_tests(void) 95static void multiorder_tag_tests(void)
80{ 96{
97 int i, j;
98
81 /* test multi-order entry for indices 0-7 with no sibling pointers */ 99 /* test multi-order entry for indices 0-7 with no sibling pointers */
82 __multiorder_tag_test(0, 3); 100 __multiorder_tag_test(0, 3);
83 __multiorder_tag_test(5, 3); 101 __multiorder_tag_test(5, 3);
@@ -117,6 +135,10 @@ static void multiorder_tag_tests(void)
117 __multiorder_tag_test(300, 8); 135 __multiorder_tag_test(300, 8);
118 136
119 __multiorder_tag_test(0x12345678UL, 8); 137 __multiorder_tag_test(0x12345678UL, 8);
138
139 for (i = 1; i < 10; i++)
140 for (j = 0; j < (10 << i); j++)
141 __multiorder_tag_test2(i, j);
120} 142}
121 143
122static void multiorder_check(unsigned long index, int order) 144static void multiorder_check(unsigned long index, int order)
@@ -125,7 +147,7 @@ static void multiorder_check(unsigned long index, int order)
125 unsigned long min = index & ~((1UL << order) - 1); 147 unsigned long min = index & ~((1UL << order) - 1);
126 unsigned long max = min + (1UL << order); 148 unsigned long max = min + (1UL << order);
127 void **slot; 149 void **slot;
128 struct item *item2 = item_create(min); 150 struct item *item2 = item_create(min, order);
129 RADIX_TREE(tree, GFP_KERNEL); 151 RADIX_TREE(tree, GFP_KERNEL);
130 152
131 printf("Multiorder index %ld, order %d\n", index, order); 153 printf("Multiorder index %ld, order %d\n", index, order);
@@ -231,11 +253,14 @@ void multiorder_iteration(void)
231 radix_tree_for_each_slot(slot, &tree, &iter, j) { 253 radix_tree_for_each_slot(slot, &tree, &iter, j) {
232 int height = order[i] / RADIX_TREE_MAP_SHIFT; 254 int height = order[i] / RADIX_TREE_MAP_SHIFT;
233 int shift = height * RADIX_TREE_MAP_SHIFT; 255 int shift = height * RADIX_TREE_MAP_SHIFT;
234 int mask = (1 << order[i]) - 1; 256 unsigned long mask = (1UL << order[i]) - 1;
257 struct item *item = *slot;
235 258
236 assert(iter.index >= (index[i] &~ mask)); 259 assert((iter.index | mask) == (index[i] | mask));
237 assert(iter.index <= (index[i] | mask));
238 assert(iter.shift == shift); 260 assert(iter.shift == shift);
261 assert(!radix_tree_is_internal_node(item));
262 assert((item->index | mask) == (index[i] | mask));
263 assert(item->order == order[i]);
239 i++; 264 i++;
240 } 265 }
241 } 266 }
@@ -248,7 +273,6 @@ void multiorder_tagged_iteration(void)
248 RADIX_TREE(tree, GFP_KERNEL); 273 RADIX_TREE(tree, GFP_KERNEL);
249 struct radix_tree_iter iter; 274 struct radix_tree_iter iter;
250 void **slot; 275 void **slot;
251 unsigned long first = 0;
252 int i, j; 276 int i, j;
253 277
254 printf("Multiorder tagged iteration test\n"); 278 printf("Multiorder tagged iteration test\n");
@@ -269,7 +293,7 @@ void multiorder_tagged_iteration(void)
269 assert(radix_tree_tag_set(&tree, tag_index[i], 1)); 293 assert(radix_tree_tag_set(&tree, tag_index[i], 1));
270 294
271 for (j = 0; j < 256; j++) { 295 for (j = 0; j < 256; j++) {
272 int mask, k; 296 int k;
273 297
274 for (i = 0; i < TAG_ENTRIES; i++) { 298 for (i = 0; i < TAG_ENTRIES; i++) {
275 for (k = i; index[k] < tag_index[i]; k++) 299 for (k = i; index[k] < tag_index[i]; k++)
@@ -279,18 +303,22 @@ void multiorder_tagged_iteration(void)
279 } 303 }
280 304
281 radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { 305 radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) {
306 unsigned long mask;
307 struct item *item = *slot;
282 for (k = i; index[k] < tag_index[i]; k++) 308 for (k = i; index[k] < tag_index[i]; k++)
283 ; 309 ;
284 mask = (1 << order[k]) - 1; 310 mask = (1UL << order[k]) - 1;
285 311
286 assert(iter.index >= (tag_index[i] &~ mask)); 312 assert((iter.index | mask) == (tag_index[i] | mask));
287 assert(iter.index <= (tag_index[i] | mask)); 313 assert(!radix_tree_is_internal_node(item));
314 assert((item->index | mask) == (tag_index[i] | mask));
315 assert(item->order == order[k]);
288 i++; 316 i++;
289 } 317 }
290 } 318 }
291 319
292 radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 320 assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) ==
293 MT_NUM_ENTRIES, 1, 2); 321 TAG_ENTRIES);
294 322
295 for (j = 0; j < 256; j++) { 323 for (j = 0; j < 256; j++) {
296 int mask, k; 324 int mask, k;
@@ -303,19 +331,21 @@ void multiorder_tagged_iteration(void)
303 } 331 }
304 332
305 radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { 333 radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) {
334 struct item *item = *slot;
306 for (k = i; index[k] < tag_index[i]; k++) 335 for (k = i; index[k] < tag_index[i]; k++)
307 ; 336 ;
308 mask = (1 << order[k]) - 1; 337 mask = (1 << order[k]) - 1;
309 338
310 assert(iter.index >= (tag_index[i] &~ mask)); 339 assert((iter.index | mask) == (tag_index[i] | mask));
311 assert(iter.index <= (tag_index[i] | mask)); 340 assert(!radix_tree_is_internal_node(item));
341 assert((item->index | mask) == (tag_index[i] | mask));
342 assert(item->order == order[k]);
312 i++; 343 i++;
313 } 344 }
314 } 345 }
315 346
316 first = 1; 347 assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0)
317 radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 348 == TAG_ENTRIES);
318 MT_NUM_ENTRIES, 1, 0);
319 i = 0; 349 i = 0;
320 radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { 350 radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) {
321 assert(iter.index == tag_index[i]); 351 assert(iter.index == tag_index[i]);
@@ -325,6 +355,261 @@ void multiorder_tagged_iteration(void)
325 item_kill_tree(&tree); 355 item_kill_tree(&tree);
326} 356}
327 357
358static void multiorder_join1(unsigned long index,
359 unsigned order1, unsigned order2)
360{
361 unsigned long loc;
362 void *item, *item2 = item_create(index + 1, order1);
363 RADIX_TREE(tree, GFP_KERNEL);
364
365 item_insert_order(&tree, index, order2);
366 item = radix_tree_lookup(&tree, index);
367 radix_tree_join(&tree, index + 1, order1, item2);
368 loc = find_item(&tree, item);
369 if (loc == -1)
370 free(item);
371 item = radix_tree_lookup(&tree, index + 1);
372 assert(item == item2);
373 item_kill_tree(&tree);
374}
375
376static void multiorder_join2(unsigned order1, unsigned order2)
377{
378 RADIX_TREE(tree, GFP_KERNEL);
379 struct radix_tree_node *node;
380 void *item1 = item_create(0, order1);
381 void *item2;
382
383 item_insert_order(&tree, 0, order2);
384 radix_tree_insert(&tree, 1 << order2, (void *)0x12UL);
385 item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
386 assert(item2 == (void *)0x12UL);
387 assert(node->exceptional == 1);
388
389 radix_tree_join(&tree, 0, order1, item1);
390 item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL);
391 assert(item2 == item1);
392 assert(node->exceptional == 0);
393 item_kill_tree(&tree);
394}
395
396/*
397 * This test revealed an accounting bug for exceptional entries at one point.
398 * Nodes were being freed back into the pool with an elevated exception count
399 * by radix_tree_join() and then radix_tree_split() was failing to zero the
400 * count of exceptional entries.
401 */
402static void multiorder_join3(unsigned int order)
403{
404 RADIX_TREE(tree, GFP_KERNEL);
405 struct radix_tree_node *node;
406 void **slot;
407 struct radix_tree_iter iter;
408 unsigned long i;
409
410 for (i = 0; i < (1 << order); i++) {
411 radix_tree_insert(&tree, i, (void *)0x12UL);
412 }
413
414 radix_tree_join(&tree, 0, order, (void *)0x16UL);
415 rcu_barrier();
416
417 radix_tree_split(&tree, 0, 0);
418
419 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
420 radix_tree_iter_replace(&tree, &iter, slot, (void *)0x12UL);
421 }
422
423 __radix_tree_lookup(&tree, 0, &node, NULL);
424 assert(node->exceptional == node->count);
425
426 item_kill_tree(&tree);
427}
428
429static void multiorder_join(void)
430{
431 int i, j, idx;
432
433 for (idx = 0; idx < 1024; idx = idx * 2 + 3) {
434 for (i = 1; i < 15; i++) {
435 for (j = 0; j < i; j++) {
436 multiorder_join1(idx, i, j);
437 }
438 }
439 }
440
441 for (i = 1; i < 15; i++) {
442 for (j = 0; j < i; j++) {
443 multiorder_join2(i, j);
444 }
445 }
446
447 for (i = 3; i < 10; i++) {
448 multiorder_join3(i);
449 }
450}
451
452static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc)
453{
454 struct radix_tree_preload *rtp = &radix_tree_preloads;
455 if (rtp->nr != 0)
456 printf("split(%u %u) remaining %u\n", old_order, new_order,
457 rtp->nr);
458 /*
459 * Can't check for equality here as some nodes may have been
460 * RCU-freed while we ran. But we should never finish with more
461 * nodes allocated since they should have all been preloaded.
462 */
463 if (nr_allocated > alloc)
464 printf("split(%u %u) allocated %u %u\n", old_order, new_order,
465 alloc, nr_allocated);
466}
467
468static void __multiorder_split(int old_order, int new_order)
469{
470 RADIX_TREE(tree, GFP_ATOMIC);
471 void **slot;
472 struct radix_tree_iter iter;
473 unsigned alloc;
474
475 radix_tree_preload(GFP_KERNEL);
476 assert(item_insert_order(&tree, 0, old_order) == 0);
477 radix_tree_preload_end();
478
479 /* Wipe out the preloaded cache or it'll confuse check_mem() */
480 radix_tree_cpu_dead(0);
481
482 radix_tree_tag_set(&tree, 0, 2);
483
484 radix_tree_split_preload(old_order, new_order, GFP_KERNEL);
485 alloc = nr_allocated;
486 radix_tree_split(&tree, 0, new_order);
487 check_mem(old_order, new_order, alloc);
488 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
489 radix_tree_iter_replace(&tree, &iter, slot,
490 item_create(iter.index, new_order));
491 }
492 radix_tree_preload_end();
493
494 item_kill_tree(&tree);
495}
496
497static void __multiorder_split2(int old_order, int new_order)
498{
499 RADIX_TREE(tree, GFP_KERNEL);
500 void **slot;
501 struct radix_tree_iter iter;
502 struct radix_tree_node *node;
503 void *item;
504
505 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
506
507 item = __radix_tree_lookup(&tree, 0, &node, NULL);
508 assert(item == (void *)0x12);
509 assert(node->exceptional > 0);
510
511 radix_tree_split(&tree, 0, new_order);
512 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
513 radix_tree_iter_replace(&tree, &iter, slot,
514 item_create(iter.index, new_order));
515 }
516
517 item = __radix_tree_lookup(&tree, 0, &node, NULL);
518 assert(item != (void *)0x12);
519 assert(node->exceptional == 0);
520
521 item_kill_tree(&tree);
522}
523
524static void __multiorder_split3(int old_order, int new_order)
525{
526 RADIX_TREE(tree, GFP_KERNEL);
527 void **slot;
528 struct radix_tree_iter iter;
529 struct radix_tree_node *node;
530 void *item;
531
532 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
533
534 item = __radix_tree_lookup(&tree, 0, &node, NULL);
535 assert(item == (void *)0x12);
536 assert(node->exceptional > 0);
537
538 radix_tree_split(&tree, 0, new_order);
539 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
540 radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16);
541 }
542
543 item = __radix_tree_lookup(&tree, 0, &node, NULL);
544 assert(item == (void *)0x16);
545 assert(node->exceptional > 0);
546
547 item_kill_tree(&tree);
548
549 __radix_tree_insert(&tree, 0, old_order, (void *)0x12);
550
551 item = __radix_tree_lookup(&tree, 0, &node, NULL);
552 assert(item == (void *)0x12);
553 assert(node->exceptional > 0);
554
555 radix_tree_split(&tree, 0, new_order);
556 radix_tree_for_each_slot(slot, &tree, &iter, 0) {
557 if (iter.index == (1 << new_order))
558 radix_tree_iter_replace(&tree, &iter, slot,
559 (void *)0x16);
560 else
561 radix_tree_iter_replace(&tree, &iter, slot, NULL);
562 }
563
564 item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL);
565 assert(item == (void *)0x16);
566 assert(node->count == node->exceptional);
567 do {
568 node = node->parent;
569 if (!node)
570 break;
571 assert(node->count == 1);
572 assert(node->exceptional == 0);
573 } while (1);
574
575 item_kill_tree(&tree);
576}
577
578static void multiorder_split(void)
579{
580 int i, j;
581
582 for (i = 3; i < 11; i++)
583 for (j = 0; j < i; j++) {
584 __multiorder_split(i, j);
585 __multiorder_split2(i, j);
586 __multiorder_split3(i, j);
587 }
588}
589
590static void multiorder_account(void)
591{
592 RADIX_TREE(tree, GFP_KERNEL);
593 struct radix_tree_node *node;
594 void **slot;
595
596 item_insert_order(&tree, 0, 5);
597
598 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
599 __radix_tree_lookup(&tree, 0, &node, NULL);
600 assert(node->count == node->exceptional * 2);
601 radix_tree_delete(&tree, 1 << 5);
602 assert(node->exceptional == 0);
603
604 __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12);
605 __radix_tree_lookup(&tree, 1 << 5, &node, &slot);
606 assert(node->count == node->exceptional * 2);
607 __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL);
608 assert(node->exceptional == 0);
609
610 item_kill_tree(&tree);
611}
612
328void multiorder_checks(void) 613void multiorder_checks(void)
329{ 614{
330 int i; 615 int i;
@@ -342,4 +627,9 @@ void multiorder_checks(void)
342 multiorder_tag_tests(); 627 multiorder_tag_tests();
343 multiorder_iteration(); 628 multiorder_iteration();
344 multiorder_tagged_iteration(); 629 multiorder_tagged_iteration();
630 multiorder_join();
631 multiorder_split();
632 multiorder_account();
633
634 radix_tree_cpu_dead(0);
345} 635}
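The multiorder tests now go through the test suite's tag_tagged_items() helper instead of the deleted radix_tree_range_tag_if_tagged(), and they assert on its return value; per the calls above it takes the tree, an optional lock, a start/end range, a batch size and the source/destination tags. A plausible core for such a helper, using only the public iterator API, is sketched below (copy_tag() is an assumption about how it might work, not the helper's actual code, and it ignores locking and batching):

#include <linux/radix-tree.h>

static unsigned long copy_tag(struct radix_tree_root *root,
			      unsigned long start, unsigned long end,
			      unsigned int iftag, unsigned int thentag)
{
	struct radix_tree_iter iter;
	void **slot;
	unsigned long tagged = 0;

	radix_tree_for_each_tagged(slot, root, &iter, start, iftag) {
		if (iter.index > end)
			break;
		radix_tree_tag_set(root, iter.index, thentag);
		tagged++;
	}
	return tagged;
}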
diff --git a/tools/testing/radix-tree/rcupdate.c b/tools/testing/radix-tree/rcupdate.c
deleted file mode 100644
index 31a2d14225d6..000000000000
--- a/tools/testing/radix-tree/rcupdate.c
+++ /dev/null
@@ -1,86 +0,0 @@
1#include <linux/rcupdate.h>
2#include <pthread.h>
3#include <stdio.h>
4#include <assert.h>
5
6static pthread_mutex_t rculock = PTHREAD_MUTEX_INITIALIZER;
7static struct rcu_head *rcuhead_global = NULL;
8static __thread int nr_rcuhead = 0;
9static __thread struct rcu_head *rcuhead = NULL;
10static __thread struct rcu_head *rcutail = NULL;
11
12static pthread_cond_t rcu_worker_cond = PTHREAD_COND_INITIALIZER;
13
14/* switch to urcu implementation when it is merged. */
15void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))
16{
17 head->func = func;
18 head->next = rcuhead;
19 rcuhead = head;
20 if (!rcutail)
21 rcutail = head;
22 nr_rcuhead++;
23 if (nr_rcuhead >= 1000) {
24 int signal = 0;
25
26 pthread_mutex_lock(&rculock);
27 if (!rcuhead_global)
28 signal = 1;
29 rcutail->next = rcuhead_global;
30 rcuhead_global = head;
31 pthread_mutex_unlock(&rculock);
32
33 nr_rcuhead = 0;
34 rcuhead = NULL;
35 rcutail = NULL;
36
37 if (signal) {
38 pthread_cond_signal(&rcu_worker_cond);
39 }
40 }
41}
42
43static void *rcu_worker(void *arg)
44{
45 struct rcu_head *r;
46
47 rcupdate_thread_init();
48
49 while (1) {
50 pthread_mutex_lock(&rculock);
51 while (!rcuhead_global) {
52 pthread_cond_wait(&rcu_worker_cond, &rculock);
53 }
54 r = rcuhead_global;
55 rcuhead_global = NULL;
56
57 pthread_mutex_unlock(&rculock);
58
59 synchronize_rcu();
60
61 while (r) {
62 struct rcu_head *tmp = r->next;
63 r->func(r);
64 r = tmp;
65 }
66 }
67
68 rcupdate_thread_exit();
69
70 return NULL;
71}
72
73static pthread_t worker_thread;
74void rcupdate_init(void)
75{
76 pthread_create(&worker_thread, NULL, rcu_worker, NULL);
77}
78
79void rcupdate_thread_init(void)
80{
81 rcu_register_thread();
82}
83void rcupdate_thread_exit(void)
84{
85 rcu_unregister_thread();
86}
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c
index 63bf347aaf33..a41325d7a170 100644
--- a/tools/testing/radix-tree/regression2.c
+++ b/tools/testing/radix-tree/regression2.c
@@ -50,6 +50,7 @@
50#include <stdio.h> 50#include <stdio.h>
51 51
52#include "regression.h" 52#include "regression.h"
53#include "test.h"
53 54
54#define PAGECACHE_TAG_DIRTY 0 55#define PAGECACHE_TAG_DIRTY 0
55#define PAGECACHE_TAG_WRITEBACK 1 56#define PAGECACHE_TAG_WRITEBACK 1
@@ -90,7 +91,7 @@ void regression2_test(void)
90 /* 1. */ 91 /* 1. */
91 start = 0; 92 start = 0;
92 end = max_slots - 2; 93 end = max_slots - 2;
93 radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1, 94 tag_tagged_items(&mt_tree, NULL, start, end, 1,
94 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); 95 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
95 96
96 /* 2. */ 97 /* 2. */
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c
index 1f06ed73d0a8..b594841fae85 100644
--- a/tools/testing/radix-tree/regression3.c
+++ b/tools/testing/radix-tree/regression3.c
@@ -5,7 +5,7 @@
5 * In following radix_tree_next_slot current chunk size becomes zero. 5 * In following radix_tree_next_slot current chunk size becomes zero.
6 * This isn't checked and it tries to dereference null pointer in slot. 6 * This isn't checked and it tries to dereference null pointer in slot.
7 * 7 *
8 * Helper radix_tree_iter_next resets slot to NULL and next_index to index + 1, 8 * Helper radix_tree_iter_resume resets slot to NULL and next_index to index + 1,
9 * for tagged iteration it also must reset cached tags in the iterator to abort 9 * for tagged iteration it also must reset cached tags in the iterator to abort
10 * the next radix_tree_next_slot and go to the slow path in radix_tree_next_chunk. 10 * the next radix_tree_next_slot and go to the slow path in radix_tree_next_chunk.
11 * 11 *
@@ -88,7 +88,7 @@ void regression3_test(void)
88 printf("slot %ld %p\n", iter.index, *slot); 88 printf("slot %ld %p\n", iter.index, *slot);
89 if (!iter.index) { 89 if (!iter.index) {
90 printf("next at %ld\n", iter.index); 90 printf("next at %ld\n", iter.index);
91 slot = radix_tree_iter_next(&iter); 91 slot = radix_tree_iter_resume(slot, &iter);
92 } 92 }
93 } 93 }
94 94
@@ -96,7 +96,7 @@ void regression3_test(void)
96 printf("contig %ld %p\n", iter.index, *slot); 96 printf("contig %ld %p\n", iter.index, *slot);
97 if (!iter.index) { 97 if (!iter.index) {
98 printf("next at %ld\n", iter.index); 98 printf("next at %ld\n", iter.index);
99 slot = radix_tree_iter_next(&iter); 99 slot = radix_tree_iter_resume(slot, &iter);
100 } 100 }
101 } 101 }
102 102
@@ -106,7 +106,7 @@ void regression3_test(void)
106 printf("tagged %ld %p\n", iter.index, *slot); 106 printf("tagged %ld %p\n", iter.index, *slot);
107 if (!iter.index) { 107 if (!iter.index) {
108 printf("next at %ld\n", iter.index); 108 printf("next at %ld\n", iter.index);
109 slot = radix_tree_iter_next(&iter); 109 slot = radix_tree_iter_resume(slot, &iter);
110 } 110 }
111 } 111 }
112 112
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c
index b0ac05741750..fd98c132207a 100644
--- a/tools/testing/radix-tree/tag_check.c
+++ b/tools/testing/radix-tree/tag_check.c
@@ -23,7 +23,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag)
23 item_tag_set(tree, index, tag); 23 item_tag_set(tree, index, tag);
24 ret = item_tag_get(tree, index, tag); 24 ret = item_tag_get(tree, index, tag);
25 assert(ret != 0); 25 assert(ret != 0);
26 ret = radix_tree_range_tag_if_tagged(tree, &first, ~0UL, 10, tag, !tag); 26 ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag);
27 assert(ret == 1); 27 assert(ret == 1);
28 ret = item_tag_get(tree, index, !tag); 28 ret = item_tag_get(tree, index, !tag);
29 assert(ret != 0); 29 assert(ret != 0);
@@ -51,6 +51,7 @@ void simple_checks(void)
51 verify_tag_consistency(&tree, 1); 51 verify_tag_consistency(&tree, 1);
52 printf("before item_kill_tree: %d allocated\n", nr_allocated); 52 printf("before item_kill_tree: %d allocated\n", nr_allocated);
53 item_kill_tree(&tree); 53 item_kill_tree(&tree);
54 rcu_barrier();
54 printf("after item_kill_tree: %d allocated\n", nr_allocated); 55 printf("after item_kill_tree: %d allocated\n", nr_allocated);
55} 56}
56 57
@@ -319,10 +320,13 @@ static void single_check(void)
319 assert(ret == 0); 320 assert(ret == 0);
320 verify_tag_consistency(&tree, 0); 321 verify_tag_consistency(&tree, 0);
321 verify_tag_consistency(&tree, 1); 322 verify_tag_consistency(&tree, 1);
322 ret = radix_tree_range_tag_if_tagged(&tree, &first, 10, 10, 0, 1); 323 ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1);
323 assert(ret == 1); 324 assert(ret == 1);
324 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); 325 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1);
325 assert(ret == 1); 326 assert(ret == 1);
327 item_tag_clear(&tree, 0, 0);
328 ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 0);
329 assert(ret == 0);
326 item_kill_tree(&tree); 330 item_kill_tree(&tree);
327} 331}
328 332
@@ -331,12 +335,16 @@ void tag_check(void)
331 single_check(); 335 single_check();
332 extend_checks(); 336 extend_checks();
333 contract_checks(); 337 contract_checks();
338 rcu_barrier();
334 printf("after extend_checks: %d allocated\n", nr_allocated); 339 printf("after extend_checks: %d allocated\n", nr_allocated);
335 __leak_check(); 340 __leak_check();
336 leak_check(); 341 leak_check();
342 rcu_barrier();
337 printf("after leak_check: %d allocated\n", nr_allocated); 343 printf("after leak_check: %d allocated\n", nr_allocated);
338 simple_checks(); 344 simple_checks();
345 rcu_barrier();
339 printf("after simple_checks: %d allocated\n", nr_allocated); 346 printf("after simple_checks: %d allocated\n", nr_allocated);
340 thrash_tags(); 347 thrash_tags();
348 rcu_barrier();
341 printf("after thrash_tags: %d allocated\n", nr_allocated); 349 printf("after thrash_tags: %d allocated\n", nr_allocated);
342} 350}
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c
index a6e8099eaf4f..e5726e373646 100644
--- a/tools/testing/radix-tree/test.c
+++ b/tools/testing/radix-tree/test.c
@@ -24,21 +24,29 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag)
24 return radix_tree_tag_get(root, index, tag); 24 return radix_tree_tag_get(root, index, tag);
25} 25}
26 26
27int __item_insert(struct radix_tree_root *root, struct item *item, 27int __item_insert(struct radix_tree_root *root, struct item *item)
28 unsigned order)
29{ 28{
30 return __radix_tree_insert(root, item->index, order, item); 29 return __radix_tree_insert(root, item->index, item->order, item);
31} 30}
32 31
33int item_insert(struct radix_tree_root *root, unsigned long index) 32int item_insert(struct radix_tree_root *root, unsigned long index)
34{ 33{
35 return __item_insert(root, item_create(index), 0); 34 return __item_insert(root, item_create(index, 0));
36} 35}
37 36
38int item_insert_order(struct radix_tree_root *root, unsigned long index, 37int item_insert_order(struct radix_tree_root *root, unsigned long index,
39 unsigned order) 38 unsigned order)
40{ 39{
41 return __item_insert(root, item_create(index), order); 40 return __item_insert(root, item_create(index, order));
41}
42
43void item_sanity(struct item *item, unsigned long index)
44{
45 unsigned long mask;
46 assert(!radix_tree_is_internal_node(item));
47 assert(item->order < BITS_PER_LONG);
48 mask = (1UL << item->order) - 1;
49 assert((item->index | mask) == (index | mask));
42} 50}
43 51
44int item_delete(struct radix_tree_root *root, unsigned long index) 52int item_delete(struct radix_tree_root *root, unsigned long index)
@@ -46,18 +54,19 @@ int item_delete(struct radix_tree_root *root, unsigned long index)
46 struct item *item = radix_tree_delete(root, index); 54 struct item *item = radix_tree_delete(root, index);
47 55
48 if (item) { 56 if (item) {
49 assert(item->index == index); 57 item_sanity(item, index);
50 free(item); 58 free(item);
51 return 1; 59 return 1;
52 } 60 }
53 return 0; 61 return 0;
54} 62}
55 63
56struct item *item_create(unsigned long index) 64struct item *item_create(unsigned long index, unsigned int order)
57{ 65{
58 struct item *ret = malloc(sizeof(*ret)); 66 struct item *ret = malloc(sizeof(*ret));
59 67
60 ret->index = index; 68 ret->index = index;
69 ret->order = order;
61 return ret; 70 return ret;
62} 71}
63 72
@@ -66,8 +75,8 @@ void item_check_present(struct radix_tree_root *root, unsigned long index)
66 struct item *item; 75 struct item *item;
67 76
68 item = radix_tree_lookup(root, index); 77 item = radix_tree_lookup(root, index);
69 assert(item != 0); 78 assert(item != NULL);
70 assert(item->index == index); 79 item_sanity(item, index);
71} 80}
72 81
73struct item *item_lookup(struct radix_tree_root *root, unsigned long index) 82struct item *item_lookup(struct radix_tree_root *root, unsigned long index)
@@ -80,7 +89,7 @@ void item_check_absent(struct radix_tree_root *root, unsigned long index)
80 struct item *item; 89 struct item *item;
81 90
82 item = radix_tree_lookup(root, index); 91 item = radix_tree_lookup(root, index);
83 assert(item == 0); 92 assert(item == NULL);
84} 93}
85 94
86/* 95/*
@@ -142,6 +151,62 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
142 assert(nfound == 0); 151 assert(nfound == 0);
143} 152}
144 153
154/* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */
155int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock,
156 unsigned long start, unsigned long end, unsigned batch,
157 unsigned iftag, unsigned thentag)
158{
159 unsigned long tagged = 0;
160 struct radix_tree_iter iter;
161 void **slot;
162
163 if (batch == 0)
164 batch = 1;
165
166 if (lock)
167 pthread_mutex_lock(lock);
168 radix_tree_for_each_tagged(slot, root, &iter, start, iftag) {
169 if (iter.index > end)
170 break;
171 radix_tree_iter_tag_set(root, &iter, thentag);
172 tagged++;
173 if ((tagged % batch) != 0)
174 continue;
175 slot = radix_tree_iter_resume(slot, &iter);
176 if (lock) {
177 pthread_mutex_unlock(lock);
178 rcu_barrier();
179 pthread_mutex_lock(lock);
180 }
181 }
182 if (lock)
183 pthread_mutex_unlock(lock);
184
185 return tagged;
186}
187
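For reference, a hedged usage sketch of tag_tagged_items(), modelled on the converted call sites in tag_check.c; the tree and the starting index belong to the caller, and passing a NULL lock keeps the single-threaded behaviour of the existing tests:

	/* Retag every tag-0 item in [first, ~0UL] with tag 1, resuming
	 * the iterator every 10 items.  The return value is the number
	 * of items that were retagged. */
	int retagged = tag_tagged_items(tree, NULL, first, ~0UL, 10, 0, 1);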
188/* Use the same pattern as find_swap_entry() in mm/shmem.c */
189unsigned long find_item(struct radix_tree_root *root, void *item)
190{
191 struct radix_tree_iter iter;
192 void **slot;
193 unsigned long found = -1;
194 unsigned long checked = 0;
195
196 radix_tree_for_each_slot(slot, root, &iter, 0) {
197 if (*slot == item) {
198 found = iter.index;
199 break;
200 }
201 checked++;
202 if ((checked % 4) != 0)
203 continue;
204 slot = radix_tree_iter_resume(slot, &iter);
205 }
206
207 return found;
208}
209
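Similarly, a hedged sketch of find_item(), which mirrors find_swap_entry(): it returns the index of the slot holding the given pointer, or -1 (all bits set) when the pointer is not in the tree. Index 5 and the stray pointer are arbitrary illustration values:

	RADIX_TREE(tree, GFP_KERNEL);

	item_insert(&tree, 5);
	assert(find_item(&tree, item_lookup(&tree, 5)) == 5);
	assert(find_item(&tree, (void *)0x42) == -1UL);
	item_kill_tree(&tree);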
145static int verify_node(struct radix_tree_node *slot, unsigned int tag, 210static int verify_node(struct radix_tree_node *slot, unsigned int tag,
146 int tagged) 211 int tagged)
147{ 212{
@@ -200,9 +265,16 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag)
200 265
201void item_kill_tree(struct radix_tree_root *root) 266void item_kill_tree(struct radix_tree_root *root)
202{ 267{
268 struct radix_tree_iter iter;
269 void **slot;
203 struct item *items[32]; 270 struct item *items[32];
204 int nfound; 271 int nfound;
205 272
273 radix_tree_for_each_slot(slot, root, &iter, 0) {
274 if (radix_tree_exceptional_entry(*slot))
275 radix_tree_delete(root, iter.index);
276 }
277
206 while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { 278 while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) {
207 int i; 279 int i;
208 280
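The new pre-pass exists because exceptional entries (values with bit 1 set, such as the (void *)0x12 used throughout the multiorder tests) are not heap-allocated struct items, so they must be deleted from the tree rather than handed to free() by the gang-lookup loop that follows. A small illustration under that assumption:

	RADIX_TREE(tree, GFP_KERNEL);
	void *entry = (void *)0x12;	/* bit 1 set: exceptional entry */

	assert(radix_tree_exceptional_entry(entry));
	radix_tree_insert(&tree, 0, entry);
	/* Correct: remove the entry.  Passing it to free() would be a
	 * bug, since 0x12 was never allocated; that is exactly what the
	 * pre-pass above prevents. */
	radix_tree_delete(&tree, 0);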
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h
index 217fb2403f09..056a23b56467 100644
--- a/tools/testing/radix-tree/test.h
+++ b/tools/testing/radix-tree/test.h
@@ -5,11 +5,11 @@
5 5
6struct item { 6struct item {
7 unsigned long index; 7 unsigned long index;
8 unsigned int order;
8}; 9};
9 10
10struct item *item_create(unsigned long index); 11struct item *item_create(unsigned long index, unsigned int order);
11int __item_insert(struct radix_tree_root *root, struct item *item, 12int __item_insert(struct radix_tree_root *root, struct item *item);
12 unsigned order);
13int item_insert(struct radix_tree_root *root, unsigned long index); 13int item_insert(struct radix_tree_root *root, unsigned long index);
14int item_insert_order(struct radix_tree_root *root, unsigned long index, 14int item_insert_order(struct radix_tree_root *root, unsigned long index,
15 unsigned order); 15 unsigned order);
@@ -25,9 +25,15 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start,
25 unsigned long nr, int chunk); 25 unsigned long nr, int chunk);
26void item_kill_tree(struct radix_tree_root *root); 26void item_kill_tree(struct radix_tree_root *root);
27 27
28int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *,
29 unsigned long start, unsigned long end, unsigned batch,
30 unsigned iftag, unsigned thentag);
31unsigned long find_item(struct radix_tree_root *, void *item);
32
28void tag_check(void); 33void tag_check(void);
29void multiorder_checks(void); 34void multiorder_checks(void);
30void iteration_test(void); 35void iteration_test(unsigned order, unsigned duration);
36void benchmark(void);
31 37
32struct item * 38struct item *
33item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); 39item_tag_set(struct radix_tree_root *root, unsigned long index, int tag);
@@ -40,7 +46,14 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag);
40extern int nr_allocated; 46extern int nr_allocated;
41 47
42/* Normally private parts of lib/radix-tree.c */ 48/* Normally private parts of lib/radix-tree.c */
49struct radix_tree_node *entry_to_node(void *ptr);
43void radix_tree_dump(struct radix_tree_root *root); 50void radix_tree_dump(struct radix_tree_root *root);
44int root_tag_get(struct radix_tree_root *root, unsigned int tag); 51int root_tag_get(struct radix_tree_root *root, unsigned int tag);
45unsigned long node_maxindex(struct radix_tree_node *); 52unsigned long node_maxindex(struct radix_tree_node *);
46unsigned long shift_maxindex(unsigned int shift); 53unsigned long shift_maxindex(unsigned int shift);
54int radix_tree_cpu_dead(unsigned int cpu);
55struct radix_tree_preload {
56 unsigned nr;
57 struct radix_tree_node *nodes;
58};
59extern struct radix_tree_preload radix_tree_preloads;
diff --git a/usr/Kconfig b/usr/Kconfig
index 572dcf7b6a44..6278f135256d 100644
--- a/usr/Kconfig
+++ b/usr/Kconfig
@@ -98,3 +98,130 @@ config RD_LZ4
98 help 98 help
99 Support loading of a LZ4 encoded initial ramdisk or cpio buffer 99 Support loading of a LZ4 encoded initial ramdisk or cpio buffer
100 If unsure, say N. 100 If unsure, say N.
101
102choice
103 prompt "Built-in initramfs compression mode"
104 depends on INITRAMFS_SOURCE!=""
105 optional
106 help
107 This option allows you to choose the algorithm with which the
108 built-in initramfs will be compressed. Several compression algorithms
109 are available, which differ in compression ratio, compression speed and
110 decompression speed. Compression speed is only relevant
111 when building a kernel. Decompression speed is relevant at
112 each boot. The memory usage during decompression may also become
113 relevant on memory-constrained systems. This is usually determined by
114 the dictionary size of the algorithm, with algorithms like XZ and LZMA
115 featuring large dictionary sizes.
116
117 High compression options are mostly useful for users who are
118 low on RAM, since it reduces the memory consumption during
119 boot.
120
121 Keep in mind that your build system needs to provide the appropriate
122 compression tool to compress the generated initramfs cpio file for
123 embedding.
124
125 If in doubt, select 'None'.
126
127config INITRAMFS_COMPRESSION_NONE
128 bool "None"
129 help
130 Do not compress the built-in initramfs at all. This may sound wasteful
131 of space, but you should be aware that the built-in initramfs will be
132 compressed at a later stage anyway along with the rest of the kernel,
133 on those architectures that support this. However, not compressing the
134 initramfs may lead to slightly higher memory consumption for a
135 short time at boot, while both the cpio image and the unpacked
136 filesystem image are present in memory simultaneously.
137
138config INITRAMFS_COMPRESSION_GZIP
139 bool "Gzip"
140 depends on RD_GZIP
141 help
142 Use the old and well-tested gzip compression algorithm. Gzip provides
143 a good balance between compression ratio and decompression speed and
144 has a reasonable compression speed. It is also more likely to be
145 supported by your build system as the gzip tool is present by default
146 on most distros.
147
148config INITRAMFS_COMPRESSION_BZIP2
149 bool "Bzip2"
150 depends on RD_BZIP2
151 help
152 Its compression ratio and speed are intermediate. Decompression speed
153 is slowest among the choices. The initramfs size is about 10% smaller
154 with bzip2, in comparison to gzip. Bzip2 uses a large amount of
155 memory. For modern kernels you will need at least 8MB of RAM for
156 booting.
157
158 If you choose this, keep in mind that you need to have the bzip2 tool
159 available to be able to compress the initramfs.
160
161config INITRAMFS_COMPRESSION_LZMA
162 bool "LZMA"
163 depends on RD_LZMA
164 help
165 This algorithm's compression ratio is the best, but it has a large
166 dictionary size, which might cause issues on memory-constrained systems.
167 Decompression speed falls between the other choices. Compression is
168 slowest. The initramfs size is about 33% smaller with LZMA in
169 comparison to gzip.
170
171 If you choose this, keep in mind that you may need to install the xz
172 or lzma tools to be able to compress the initramfs.
173
174config INITRAMFS_COMPRESSION_XZ
175 bool "XZ"
176 depends on RD_XZ
177 help
178 XZ uses the LZMA2 algorithm and has a large dictionary, which may cause
179 problems on memory-constrained systems. The initramfs size is about
180 30% smaller with XZ in comparison to gzip. Decompression speed is
181 better than that of bzip2 but worse than gzip and LZO. Compression is
182 slow.
183
184 If you choose this, keep in mind that you may need to install the xz
185 tool to be able to compress the initramfs.
186
187config INITRAMFS_COMPRESSION_LZO
188 bool "LZO"
189 depends on RD_LZO
190 help
191 Its compression ratio is the second poorest among the choices. The
192 kernel size is about 10% bigger than with gzip. Despite that, its
193 decompression speed is the second fastest and its compression speed
194 is quite fast too.
195
196 If you choose this, keep in mind that you may need to install the lzop
197 tool to be able to compress the initramfs.
198
199config INITRAMFS_COMPRESSION_LZ4
200 bool "LZ4"
201 depends on RD_LZ4
202 help
203 Its compression ratio is the poorest among the choices. The kernel
204 size is about 15% bigger than with gzip; however, its decompression speed
205 is the fastest.
206
207 If you choose this, keep in mind that most distros don't provide lz4
208 by default, which could cause a build failure.
209
210endchoice
211
212config INITRAMFS_COMPRESSION
213 string
214 default "" if INITRAMFS_COMPRESSION_NONE
215 default ".gz" if INITRAMFS_COMPRESSION_GZIP
216 default ".bz2" if INITRAMFS_COMPRESSION_BZIP2
217 default ".lzma" if INITRAMFS_COMPRESSION_LZMA
218 default ".xz" if INITRAMFS_COMPRESSION_XZ
219 default ".lzo" if INITRAMFS_COMPRESSION_LZO
220 default ".lz4" if INITRAMFS_COMPRESSION_LZ4
221 default ".gz" if RD_GZIP
222 default ".lz4" if RD_LZ4
223 default ".lzo" if RD_LZO
224 default ".xz" if RD_XZ
225 default ".lzma" if RD_LZMA
226 default ".bz2" if RD_BZIP2
227 default ""
diff --git a/usr/Makefile b/usr/Makefile
index e767f019accf..17a513268325 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -5,25 +5,7 @@
5klibcdirs:; 5klibcdirs:;
6PHONY += klibcdirs 6PHONY += klibcdirs
7 7
8 8suffix_y = $(CONFIG_INITRAMFS_COMPRESSION)
9# Bzip2
10suffix_$(CONFIG_RD_BZIP2) = .bz2
11
12# Lzma
13suffix_$(CONFIG_RD_LZMA) = .lzma
14
15# XZ
16suffix_$(CONFIG_RD_XZ) = .xz
17
18# Lzo
19suffix_$(CONFIG_RD_LZO) = .lzo
20
21# Lz4
22suffix_$(CONFIG_RD_LZ4) = .lz4
23
24# Gzip
25suffix_$(CONFIG_RD_GZIP) = .gz
26
27AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)" 9AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)"
28 10
29# Generate builtin.o based on initramfs_data.o 11# Generate builtin.o based on initramfs_data.o
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index efeceb0a222d..3815e940fbea 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work)
76 struct kvm_vcpu *vcpu = apf->vcpu; 76 struct kvm_vcpu *vcpu = apf->vcpu;
77 unsigned long addr = apf->addr; 77 unsigned long addr = apf->addr;
78 gva_t gva = apf->gva; 78 gva_t gva = apf->gva;
79 int locked = 1;
79 80
80 might_sleep(); 81 might_sleep();
81 82
82 /* 83 /*
83 * This work is run asynchromously to the task which owns 84 * This work is run asynchromously to the task which owns
84 * mm and might be done in another context, so we must 85 * mm and might be done in another context, so we must
85 * use FOLL_REMOTE. 86 * access remotely.
86 */ 87 */
87 __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, 88 down_read(&mm->mmap_sem);
88 FOLL_WRITE | FOLL_REMOTE); 89 get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
90 &locked);
91 if (locked)
92 up_read(&mm->mmap_sem);
89 93
90 kvm_async_page_present_sync(vcpu, apf); 94 kvm_async_page_present_sync(vcpu, apf);
91 95
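The hunk above follows the convention for the new locked argument: get_user_pages_remote() may drop mmap_sem itself while handling the fault and clears *locked when it does, so the caller must release the semaphore only if it still holds it. Restated as a hedged sketch using the names from the hunk:

	int locked = 1;

	down_read(&mm->mmap_sem);
	/* May drop mmap_sem internally; if so, *locked is cleared. */
	get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
			      &locked);
	if (locked)
		up_read(&mm->mmap_sem);

An unconditional up_read() here would unbalance the semaphore whenever GUP had already released it.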
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 823544c166be..de102cae7125 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1418,13 +1418,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
1418 npages = get_user_page_nowait(addr, write_fault, page); 1418 npages = get_user_page_nowait(addr, write_fault, page);
1419 up_read(&current->mm->mmap_sem); 1419 up_read(&current->mm->mmap_sem);
1420 } else { 1420 } else {
1421 unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; 1421 unsigned int flags = FOLL_HWPOISON;
1422 1422
1423 if (write_fault) 1423 if (write_fault)
1424 flags |= FOLL_WRITE; 1424 flags |= FOLL_WRITE;
1425 1425
1426 npages = __get_user_pages_unlocked(current, current->mm, addr, 1, 1426 npages = get_user_pages_unlocked(addr, 1, page, flags);
1427 page, flags);
1428 } 1427 }
1429 if (npages != 1) 1428 if (npages != 1)
1430 return npages; 1429 return npages;