author     Linus Torvalds <torvalds@linux-foundation.org>   2016-12-14 20:25:18 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-12-14 20:25:18 -0500
commit     a57cb1c1d7974c62a5c80f7869e35b492ace12cd (patch)
tree       5a42ee9a668f171143464bc86013954c1bbe94ad
parent     cf1b3341afab9d3ad02a76b3a619ea027dcf4e28 (diff)
parent     e1e14ab8411df344a17687821f8f78f0a1e73cbb (diff)
Merge branch 'akpm' (patches from Andrew)
Merge more updates from Andrew Morton:
- a few misc things
- kexec updates
- DMA-mapping updates to better support networking DMA operations
- IPC updates
- various MM changes to improve DAX fault handling
- lots of radix-tree changes, mainly to the test suite. All leading up
to reimplementing the IDA/IDR code to be a wrapper layer over the
radix-tree. However the final trigger-pulling patch is held off for
4.11.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (114 commits)
radix tree test suite: delete unused rcupdate.c
radix tree test suite: add new tag check
radix-tree: ensure counts are initialised
radix tree test suite: cache recently freed objects
radix tree test suite: add some more functionality
idr: reduce the number of bits per level from 8 to 6
rxrpc: abstract away knowledge of IDR internals
tpm: use idr_find(), not idr_find_slowpath()
idr: add ida_is_empty
radix tree test suite: check multiorder iteration
radix-tree: fix replacement for multiorder entries
radix-tree: add radix_tree_split_preload()
radix-tree: add radix_tree_split
radix-tree: add radix_tree_join
radix-tree: delete radix_tree_range_tag_if_tagged()
radix-tree: delete radix_tree_locate_item()
radix-tree: improve multiorder iterators
btrfs: fix race in btrfs_free_dummy_fs_info()
radix-tree: improve dump output
radix-tree: make radix_tree_find_next_bit more useful
...
140 files changed, 3428 insertions, 2218 deletions
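Nearly all of the architecture DMA hunks that follow apply one and the same change: the CPU cache maintenance done in the dma_map_ops map/unmap paths is skipped when the caller passes DMA_ATTR_SKIP_CPU_SYNC, so that users which recycle DMA buffers (the networking case called out above) can batch the syncs themselves via dma_sync_single_for_cpu()/for_device(). Below is a minimal sketch of that pattern, not taken from any single file in this merge; arch_cache_sync() is a hypothetical stand-in for each architecture's own helper (_dma_cache_sync, __dma_sync, dma_cache_sync, ...).

```c
#include <linux/dma-mapping.h>
#include <linux/mm.h>

/* Hypothetical stand-in for the architecture's cache-writeback helper. */
static void arch_cache_sync(phys_addr_t paddr, size_t size,
			    enum dma_data_direction dir)
{
	/* real implementations write back / invalidate the CPU cache here */
}

static dma_addr_t example_dma_map_page(struct device *dev, struct page *page,
				       unsigned long offset, size_t size,
				       enum dma_data_direction dir,
				       unsigned long attrs)
{
	phys_addr_t paddr = page_to_phys(page) + offset;

	/* New in this series: honour DMA_ATTR_SKIP_CPU_SYNC on the map path. */
	if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
		arch_cache_sync(paddr, size, dir);

	return paddr;	/* assumes a 1:1 phys<->dma mapping for brevity */
}
```

The per-architecture hunks below add exactly this guard (and the matching check on the unmap and map_sg paths) without changing behaviour for callers that do not pass the attribute.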
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 1b5f15653b1b..69e2387ca278 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must | |||
556 | not block. If it's not possible to reach a page without blocking, | 556 | not block. If it's not possible to reach a page without blocking, |
557 | filesystem should skip it. Filesystem should use do_set_pte() to setup | 557 | filesystem should skip it. Filesystem should use do_set_pte() to setup |
558 | page table entry. Pointer to entry associated with the page is passed in | 558 | page table entry. Pointer to entry associated with the page is passed in |
559 | "pte" field in fault_env structure. Pointers to entries for other offsets | 559 | "pte" field in vm_fault structure. Pointers to entries for other offsets |
560 | should be calculated relative to "pte". | 560 | should be calculated relative to "pte". |
561 | 561 | ||
562 | ->page_mkwrite() is called when a previously read-only pte is | 562 | ->page_mkwrite() is called when a previously read-only pte is |
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index cd8aad8226dd..08450a1a5b5f 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
@@ -158,7 +158,10 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page, | |||
158 | unsigned long attrs) | 158 | unsigned long attrs) |
159 | { | 159 | { |
160 | phys_addr_t paddr = page_to_phys(page) + offset; | 160 | phys_addr_t paddr = page_to_phys(page) + offset; |
161 | _dma_cache_sync(paddr, size, dir); | 161 | |
162 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
163 | _dma_cache_sync(paddr, size, dir); | ||
164 | |||
162 | return plat_phys_to_dma(dev, paddr); | 165 | return plat_phys_to_dma(dev, paddr); |
163 | } | 166 | } |
164 | 167 | ||
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 301281645d08..75055df1cda3 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -243,7 +243,8 @@ static int needs_bounce(struct device *dev, dma_addr_t dma_addr, size_t size) | |||
243 | } | 243 | } |
244 | 244 | ||
245 | static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, | 245 | static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, |
246 | enum dma_data_direction dir) | 246 | enum dma_data_direction dir, |
247 | unsigned long attrs) | ||
247 | { | 248 | { |
248 | struct dmabounce_device_info *device_info = dev->archdata.dmabounce; | 249 | struct dmabounce_device_info *device_info = dev->archdata.dmabounce; |
249 | struct safe_buffer *buf; | 250 | struct safe_buffer *buf; |
@@ -262,7 +263,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, | |||
262 | __func__, buf->ptr, virt_to_dma(dev, buf->ptr), | 263 | __func__, buf->ptr, virt_to_dma(dev, buf->ptr), |
263 | buf->safe, buf->safe_dma_addr); | 264 | buf->safe, buf->safe_dma_addr); |
264 | 265 | ||
265 | if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) { | 266 | if ((dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) && |
267 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { | ||
266 | dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n", | 268 | dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n", |
267 | __func__, ptr, buf->safe, size); | 269 | __func__, ptr, buf->safe, size); |
268 | memcpy(buf->safe, ptr, size); | 270 | memcpy(buf->safe, ptr, size); |
@@ -272,7 +274,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, | |||
272 | } | 274 | } |
273 | 275 | ||
274 | static inline void unmap_single(struct device *dev, struct safe_buffer *buf, | 276 | static inline void unmap_single(struct device *dev, struct safe_buffer *buf, |
275 | size_t size, enum dma_data_direction dir) | 277 | size_t size, enum dma_data_direction dir, |
278 | unsigned long attrs) | ||
276 | { | 279 | { |
277 | BUG_ON(buf->size != size); | 280 | BUG_ON(buf->size != size); |
278 | BUG_ON(buf->direction != dir); | 281 | BUG_ON(buf->direction != dir); |
@@ -283,7 +286,8 @@ static inline void unmap_single(struct device *dev, struct safe_buffer *buf, | |||
283 | 286 | ||
284 | DO_STATS(dev->archdata.dmabounce->bounce_count++); | 287 | DO_STATS(dev->archdata.dmabounce->bounce_count++); |
285 | 288 | ||
286 | if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) { | 289 | if ((dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) && |
290 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { | ||
287 | void *ptr = buf->ptr; | 291 | void *ptr = buf->ptr; |
288 | 292 | ||
289 | dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n", | 293 | dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n", |
@@ -334,7 +338,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, struct page *page, | |||
334 | return DMA_ERROR_CODE; | 338 | return DMA_ERROR_CODE; |
335 | } | 339 | } |
336 | 340 | ||
337 | return map_single(dev, page_address(page) + offset, size, dir); | 341 | return map_single(dev, page_address(page) + offset, size, dir, attrs); |
338 | } | 342 | } |
339 | 343 | ||
340 | /* | 344 | /* |
@@ -357,7 +361,7 @@ static void dmabounce_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t | |||
357 | return; | 361 | return; |
358 | } | 362 | } |
359 | 363 | ||
360 | unmap_single(dev, buf, size, dir); | 364 | unmap_single(dev, buf, size, dir, attrs); |
361 | } | 365 | } |
362 | 366 | ||
363 | static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr, | 367 | static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr, |
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index 58610d0df7ed..54534e5d0781 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
@@ -146,7 +146,8 @@ static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page, | |||
146 | { | 146 | { |
147 | void *cpu_addr = page_address(page) + offset; | 147 | void *cpu_addr = page_address(page) + offset; |
148 | 148 | ||
149 | dma_cache_sync(dev, cpu_addr, size, direction); | 149 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
150 | dma_cache_sync(dev, cpu_addr, size, direction); | ||
150 | return virt_to_bus(cpu_addr); | 151 | return virt_to_bus(cpu_addr); |
151 | } | 152 | } |
152 | 153 | ||
@@ -162,6 +163,10 @@ static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
162 | 163 | ||
163 | sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; | 164 | sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; |
164 | virt = sg_virt(sg); | 165 | virt = sg_virt(sg); |
166 | |||
167 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
168 | continue; | ||
169 | |||
165 | dma_cache_sync(dev, virt, sg->length, direction); | 170 | dma_cache_sync(dev, virt, sg->length, direction); |
166 | } | 171 | } |
167 | 172 | ||
diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index 53fbbb61aa86..a27a74a18fb0 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
@@ -118,6 +118,10 @@ static int bfin_dma_map_sg(struct device *dev, struct scatterlist *sg_list, | |||
118 | 118 | ||
119 | for_each_sg(sg_list, sg, nents, i) { | 119 | for_each_sg(sg_list, sg, nents, i) { |
120 | sg->dma_address = (dma_addr_t) sg_virt(sg); | 120 | sg->dma_address = (dma_addr_t) sg_virt(sg); |
121 | |||
122 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
123 | continue; | ||
124 | |||
121 | __dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction); | 125 | __dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction); |
122 | } | 126 | } |
123 | 127 | ||
@@ -143,7 +147,9 @@ static dma_addr_t bfin_dma_map_page(struct device *dev, struct page *page, | |||
143 | { | 147 | { |
144 | dma_addr_t handle = (dma_addr_t)(page_address(page) + offset); | 148 | dma_addr_t handle = (dma_addr_t)(page_address(page) + offset); |
145 | 149 | ||
146 | _dma_sync(handle, size, dir); | 150 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
151 | _dma_sync(handle, size, dir); | ||
152 | |||
147 | return handle; | 153 | return handle; |
148 | } | 154 | } |
149 | 155 | ||
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
index db4a6a301f5e..6752df32ef06 100644
--- a/arch/c6x/kernel/dma.c
+++ b/arch/c6x/kernel/dma.c
@@ -42,14 +42,17 @@ static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page, | |||
42 | { | 42 | { |
43 | dma_addr_t handle = virt_to_phys(page_address(page) + offset); | 43 | dma_addr_t handle = virt_to_phys(page_address(page) + offset); |
44 | 44 | ||
45 | c6x_dma_sync(handle, size, dir); | 45 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
46 | c6x_dma_sync(handle, size, dir); | ||
47 | |||
46 | return handle; | 48 | return handle; |
47 | } | 49 | } |
48 | 50 | ||
49 | static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle, | 51 | static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle, |
50 | size_t size, enum dma_data_direction dir, unsigned long attrs) | 52 | size_t size, enum dma_data_direction dir, unsigned long attrs) |
51 | { | 53 | { |
52 | c6x_dma_sync(handle, size, dir); | 54 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
55 | c6x_dma_sync(handle, size, dir); | ||
53 | } | 56 | } |
54 | 57 | ||
55 | static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist, | 58 | static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist, |
@@ -60,7 +63,8 @@ static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
60 | 63 | ||
61 | for_each_sg(sglist, sg, nents, i) { | 64 | for_each_sg(sglist, sg, nents, i) { |
62 | sg->dma_address = sg_phys(sg); | 65 | sg->dma_address = sg_phys(sg); |
63 | c6x_dma_sync(sg->dma_address, sg->length, dir); | 66 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
67 | c6x_dma_sync(sg->dma_address, sg->length, dir); | ||
64 | } | 68 | } |
65 | 69 | ||
66 | return nents; | 70 | return nents; |
@@ -72,9 +76,11 @@ static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
72 | struct scatterlist *sg; | 76 | struct scatterlist *sg; |
73 | int i; | 77 | int i; |
74 | 78 | ||
79 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
80 | return; | ||
81 | |||
75 | for_each_sg(sglist, sg, nents, i) | 82 | for_each_sg(sglist, sg, nents, i) |
76 | c6x_dma_sync(sg_dma_address(sg), sg->length, dir); | 83 | c6x_dma_sync(sg_dma_address(sg), sg->length, dir); |
77 | |||
78 | } | 84 | } |
79 | 85 | ||
80 | static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, | 86 | static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, |
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index 90f2e4cb33d6..187688128c65 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
@@ -109,16 +109,19 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
109 | int nents, enum dma_data_direction direction, | 109 | int nents, enum dma_data_direction direction, |
110 | unsigned long attrs) | 110 | unsigned long attrs) |
111 | { | 111 | { |
112 | int i; | ||
113 | struct scatterlist *sg; | 112 | struct scatterlist *sg; |
113 | int i; | ||
114 | |||
115 | BUG_ON(direction == DMA_NONE); | ||
116 | |||
117 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
118 | return nents; | ||
114 | 119 | ||
115 | for_each_sg(sglist, sg, nents, i) { | 120 | for_each_sg(sglist, sg, nents, i) { |
116 | frv_cache_wback_inv(sg_dma_address(sg), | 121 | frv_cache_wback_inv(sg_dma_address(sg), |
117 | sg_dma_address(sg) + sg_dma_len(sg)); | 122 | sg_dma_address(sg) + sg_dma_len(sg)); |
118 | } | 123 | } |
119 | 124 | ||
120 | BUG_ON(direction == DMA_NONE); | ||
121 | |||
122 | return nents; | 125 | return nents; |
123 | } | 126 | } |
124 | 127 | ||
@@ -127,7 +130,10 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page, | |||
127 | enum dma_data_direction direction, unsigned long attrs) | 130 | enum dma_data_direction direction, unsigned long attrs) |
128 | { | 131 | { |
129 | BUG_ON(direction == DMA_NONE); | 132 | BUG_ON(direction == DMA_NONE); |
130 | flush_dcache_page(page); | 133 | |
134 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
135 | flush_dcache_page(page); | ||
136 | |||
131 | return (dma_addr_t) page_to_phys(page) + offset; | 137 | return (dma_addr_t) page_to_phys(page) + offset; |
132 | } | 138 | } |
133 | 139 | ||
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index f585745b1abc..dba7df918144 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -40,13 +40,16 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
40 | int nents, enum dma_data_direction direction, | 40 | int nents, enum dma_data_direction direction, |
41 | unsigned long attrs) | 41 | unsigned long attrs) |
42 | { | 42 | { |
43 | struct scatterlist *sg; | ||
43 | unsigned long dampr2; | 44 | unsigned long dampr2; |
44 | void *vaddr; | 45 | void *vaddr; |
45 | int i; | 46 | int i; |
46 | struct scatterlist *sg; | ||
47 | 47 | ||
48 | BUG_ON(direction == DMA_NONE); | 48 | BUG_ON(direction == DMA_NONE); |
49 | 49 | ||
50 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
51 | return nents; | ||
52 | |||
50 | dampr2 = __get_DAMPR(2); | 53 | dampr2 = __get_DAMPR(2); |
51 | 54 | ||
52 | for_each_sg(sglist, sg, nents, i) { | 55 | for_each_sg(sglist, sg, nents, i) { |
@@ -70,7 +73,9 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page, | |||
70 | unsigned long offset, size_t size, | 73 | unsigned long offset, size_t size, |
71 | enum dma_data_direction direction, unsigned long attrs) | 74 | enum dma_data_direction direction, unsigned long attrs) |
72 | { | 75 | { |
73 | flush_dcache_page(page); | 76 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
77 | flush_dcache_page(page); | ||
78 | |||
74 | return (dma_addr_t) page_to_phys(page) + offset; | 79 | return (dma_addr_t) page_to_phys(page) + offset; |
75 | } | 80 | } |
76 | 81 | ||
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index b9017785fb71..dbc4f1003da4 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
@@ -119,6 +119,9 @@ static int hexagon_map_sg(struct device *hwdev, struct scatterlist *sg, | |||
119 | 119 | ||
120 | s->dma_length = s->length; | 120 | s->dma_length = s->length; |
121 | 121 | ||
122 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
123 | continue; | ||
124 | |||
122 | flush_dcache_range(dma_addr_to_virt(s->dma_address), | 125 | flush_dcache_range(dma_addr_to_virt(s->dma_address), |
123 | dma_addr_to_virt(s->dma_address + s->length)); | 126 | dma_addr_to_virt(s->dma_address + s->length)); |
124 | } | 127 | } |
@@ -180,7 +183,8 @@ static dma_addr_t hexagon_map_page(struct device *dev, struct page *page, | |||
180 | if (!check_addr("map_single", dev, bus, size)) | 183 | if (!check_addr("map_single", dev, bus, size)) |
181 | return bad_dma_address; | 184 | return bad_dma_address; |
182 | 185 | ||
183 | dma_sync(dma_addr_to_virt(bus), size, dir); | 186 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
187 | dma_sync(dma_addr_to_virt(bus), size, dir); | ||
184 | 188 | ||
185 | return bus; | 189 | return bus; |
186 | } | 190 | } |
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index 8cf97cbadc91..07070065a425 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
@@ -134,7 +134,9 @@ static dma_addr_t m68k_dma_map_page(struct device *dev, struct page *page, | |||
134 | { | 134 | { |
135 | dma_addr_t handle = page_to_phys(page) + offset; | 135 | dma_addr_t handle = page_to_phys(page) + offset; |
136 | 136 | ||
137 | dma_sync_single_for_device(dev, handle, size, dir); | 137 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
138 | dma_sync_single_for_device(dev, handle, size, dir); | ||
139 | |||
138 | return handle; | 140 | return handle; |
139 | } | 141 | } |
140 | 142 | ||
@@ -146,6 +148,10 @@ static int m68k_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
146 | 148 | ||
147 | for_each_sg(sglist, sg, nents, i) { | 149 | for_each_sg(sglist, sg, nents, i) { |
148 | sg->dma_address = sg_phys(sg); | 150 | sg->dma_address = sg_phys(sg); |
151 | |||
152 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
153 | continue; | ||
154 | |||
149 | dma_sync_single_for_device(dev, sg->dma_address, sg->length, | 155 | dma_sync_single_for_device(dev, sg->dma_address, sg->length, |
150 | dir); | 156 | dir); |
151 | } | 157 | } |
diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c
index 0db31e24c541..91968d92652b 100644
--- a/arch/metag/kernel/dma.c
+++ b/arch/metag/kernel/dma.c
@@ -484,8 +484,9 @@ static dma_addr_t metag_dma_map_page(struct device *dev, struct page *page, | |||
484 | unsigned long offset, size_t size, | 484 | unsigned long offset, size_t size, |
485 | enum dma_data_direction direction, unsigned long attrs) | 485 | enum dma_data_direction direction, unsigned long attrs) |
486 | { | 486 | { |
487 | dma_sync_for_device((void *)(page_to_phys(page) + offset), size, | 487 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
488 | direction); | 488 | dma_sync_for_device((void *)(page_to_phys(page) + offset), |
489 | size, direction); | ||
489 | return page_to_phys(page) + offset; | 490 | return page_to_phys(page) + offset; |
490 | } | 491 | } |
491 | 492 | ||
@@ -493,7 +494,8 @@ static void metag_dma_unmap_page(struct device *dev, dma_addr_t dma_address, | |||
493 | size_t size, enum dma_data_direction direction, | 494 | size_t size, enum dma_data_direction direction, |
494 | unsigned long attrs) | 495 | unsigned long attrs) |
495 | { | 496 | { |
496 | dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | 497 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
498 | dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | ||
497 | } | 499 | } |
498 | 500 | ||
499 | static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist, | 501 | static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist, |
@@ -507,6 +509,10 @@ static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
507 | BUG_ON(!sg_page(sg)); | 509 | BUG_ON(!sg_page(sg)); |
508 | 510 | ||
509 | sg->dma_address = sg_phys(sg); | 511 | sg->dma_address = sg_phys(sg); |
512 | |||
513 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
514 | continue; | ||
515 | |||
510 | dma_sync_for_device(sg_virt(sg), sg->length, direction); | 516 | dma_sync_for_device(sg_virt(sg), sg->length, direction); |
511 | } | 517 | } |
512 | 518 | ||
@@ -525,6 +531,10 @@ static void metag_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
525 | BUG_ON(!sg_page(sg)); | 531 | BUG_ON(!sg_page(sg)); |
526 | 532 | ||
527 | sg->dma_address = sg_phys(sg); | 533 | sg->dma_address = sg_phys(sg); |
534 | |||
535 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
536 | continue; | ||
537 | |||
528 | dma_sync_for_cpu(sg_virt(sg), sg->length, direction); | 538 | dma_sync_for_cpu(sg_virt(sg), sg->length, direction); |
529 | } | 539 | } |
530 | } | 540 | } |
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index ec04dc1e2527..818daf230eb4 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
@@ -61,6 +61,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, | |||
61 | /* FIXME this part of code is untested */ | 61 | /* FIXME this part of code is untested */ |
62 | for_each_sg(sgl, sg, nents, i) { | 62 | for_each_sg(sgl, sg, nents, i) { |
63 | sg->dma_address = sg_phys(sg); | 63 | sg->dma_address = sg_phys(sg); |
64 | |||
65 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
66 | continue; | ||
67 | |||
64 | __dma_sync(page_to_phys(sg_page(sg)) + sg->offset, | 68 | __dma_sync(page_to_phys(sg_page(sg)) + sg->offset, |
65 | sg->length, direction); | 69 | sg->length, direction); |
66 | } | 70 | } |
@@ -80,7 +84,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, | |||
80 | enum dma_data_direction direction, | 84 | enum dma_data_direction direction, |
81 | unsigned long attrs) | 85 | unsigned long attrs) |
82 | { | 86 | { |
83 | __dma_sync(page_to_phys(page) + offset, size, direction); | 87 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
88 | __dma_sync(page_to_phys(page) + offset, size, direction); | ||
84 | return page_to_phys(page) + offset; | 89 | return page_to_phys(page) + offset; |
85 | } | 90 | } |
86 | 91 | ||
@@ -95,7 +100,8 @@ static inline void dma_direct_unmap_page(struct device *dev, | |||
95 | * phys_to_virt is here because in __dma_sync_page is __virt_to_phys and | 100 | * phys_to_virt is here because in __dma_sync_page is __virt_to_phys and |
96 | * dma_address is physical address | 101 | * dma_address is physical address |
97 | */ | 102 | */ |
98 | __dma_sync(dma_address, size, direction); | 103 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
104 | __dma_sync(dma_address, size, direction); | ||
99 | } | 105 | } |
100 | 106 | ||
101 | static inline void | 107 | static inline void |
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index 1a80b6f73ab2..aab4fd681e1f 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
@@ -61,7 +61,7 @@ static int loongson_dma_map_sg(struct device *dev, struct scatterlist *sg, | |||
61 | int nents, enum dma_data_direction dir, | 61 | int nents, enum dma_data_direction dir, |
62 | unsigned long attrs) | 62 | unsigned long attrs) |
63 | { | 63 | { |
64 | int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, 0); | 64 | int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, attrs); |
65 | mb(); | 65 | mb(); |
66 | 66 | ||
67 | return r; | 67 | return r; |
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index 46d5696c4f27..a39c36af97ad 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
@@ -293,7 +293,7 @@ static inline void __dma_sync(struct page *page, | |||
293 | static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, | 293 | static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, |
294 | size_t size, enum dma_data_direction direction, unsigned long attrs) | 294 | size_t size, enum dma_data_direction direction, unsigned long attrs) |
295 | { | 295 | { |
296 | if (cpu_needs_post_dma_flush(dev)) | 296 | if (cpu_needs_post_dma_flush(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
297 | __dma_sync(dma_addr_to_page(dev, dma_addr), | 297 | __dma_sync(dma_addr_to_page(dev, dma_addr), |
298 | dma_addr & ~PAGE_MASK, size, direction); | 298 | dma_addr & ~PAGE_MASK, size, direction); |
299 | plat_post_dma_flush(dev); | 299 | plat_post_dma_flush(dev); |
@@ -307,7 +307,8 @@ static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
307 | struct scatterlist *sg; | 307 | struct scatterlist *sg; |
308 | 308 | ||
309 | for_each_sg(sglist, sg, nents, i) { | 309 | for_each_sg(sglist, sg, nents, i) { |
310 | if (!plat_device_is_coherent(dev)) | 310 | if (!plat_device_is_coherent(dev) && |
311 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
311 | __dma_sync(sg_page(sg), sg->offset, sg->length, | 312 | __dma_sync(sg_page(sg), sg->offset, sg->length, |
312 | direction); | 313 | direction); |
313 | #ifdef CONFIG_NEED_SG_DMA_LENGTH | 314 | #ifdef CONFIG_NEED_SG_DMA_LENGTH |
@@ -324,7 +325,7 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page, | |||
324 | unsigned long offset, size_t size, enum dma_data_direction direction, | 325 | unsigned long offset, size_t size, enum dma_data_direction direction, |
325 | unsigned long attrs) | 326 | unsigned long attrs) |
326 | { | 327 | { |
327 | if (!plat_device_is_coherent(dev)) | 328 | if (!plat_device_is_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
328 | __dma_sync(page, offset, size, direction); | 329 | __dma_sync(page, offset, size, direction); |
329 | 330 | ||
330 | return plat_map_dma_mem_page(dev, page) + offset; | 331 | return plat_map_dma_mem_page(dev, page) + offset; |
@@ -339,6 +340,7 @@ static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
339 | 340 | ||
340 | for_each_sg(sglist, sg, nhwentries, i) { | 341 | for_each_sg(sglist, sg, nhwentries, i) { |
341 | if (!plat_device_is_coherent(dev) && | 342 | if (!plat_device_is_coherent(dev) && |
343 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && | ||
342 | direction != DMA_TO_DEVICE) | 344 | direction != DMA_TO_DEVICE) |
343 | __dma_sync(sg_page(sg), sg->offset, sg->length, | 345 | __dma_sync(sg_page(sg), sg->offset, sg->length, |
344 | direction); | 346 | direction); |
diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c
index d800fad87896..f6a5dcf9d682 100644
--- a/arch/nios2/mm/dma-mapping.c
+++ b/arch/nios2/mm/dma-mapping.c
@@ -98,13 +98,17 @@ static int nios2_dma_map_sg(struct device *dev, struct scatterlist *sg, | |||
98 | int i; | 98 | int i; |
99 | 99 | ||
100 | for_each_sg(sg, sg, nents, i) { | 100 | for_each_sg(sg, sg, nents, i) { |
101 | void *addr; | 101 | void *addr = sg_virt(sg); |
102 | 102 | ||
103 | addr = sg_virt(sg); | 103 | if (!addr) |
104 | if (addr) { | 104 | continue; |
105 | __dma_sync_for_device(addr, sg->length, direction); | 105 | |
106 | sg->dma_address = sg_phys(sg); | 106 | sg->dma_address = sg_phys(sg); |
107 | } | 107 | |
108 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
109 | continue; | ||
110 | |||
111 | __dma_sync_for_device(addr, sg->length, direction); | ||
108 | } | 112 | } |
109 | 113 | ||
110 | return nents; | 114 | return nents; |
@@ -117,7 +121,9 @@ static dma_addr_t nios2_dma_map_page(struct device *dev, struct page *page, | |||
117 | { | 121 | { |
118 | void *addr = page_address(page) + offset; | 122 | void *addr = page_address(page) + offset; |
119 | 123 | ||
120 | __dma_sync_for_device(addr, size, direction); | 124 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
125 | __dma_sync_for_device(addr, size, direction); | ||
126 | |||
121 | return page_to_phys(page) + offset; | 127 | return page_to_phys(page) + offset; |
122 | } | 128 | } |
123 | 129 | ||
@@ -125,7 +131,8 @@ static void nios2_dma_unmap_page(struct device *dev, dma_addr_t dma_address, | |||
125 | size_t size, enum dma_data_direction direction, | 131 | size_t size, enum dma_data_direction direction, |
126 | unsigned long attrs) | 132 | unsigned long attrs) |
127 | { | 133 | { |
128 | __dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | 134 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
135 | __dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | ||
129 | } | 136 | } |
130 | 137 | ||
131 | static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg, | 138 | static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg, |
@@ -138,6 +145,9 @@ static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg, | |||
138 | if (direction == DMA_TO_DEVICE) | 145 | if (direction == DMA_TO_DEVICE) |
139 | return; | 146 | return; |
140 | 147 | ||
148 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
149 | return; | ||
150 | |||
141 | for_each_sg(sg, sg, nhwentries, i) { | 151 | for_each_sg(sg, sg, nhwentries, i) { |
142 | addr = sg_virt(sg); | 152 | addr = sg_virt(sg); |
143 | if (addr) | 153 | if (addr) |
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 140c99140649..906998bac957 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
@@ -141,6 +141,9 @@ or1k_map_page(struct device *dev, struct page *page, | |||
141 | unsigned long cl; | 141 | unsigned long cl; |
142 | dma_addr_t addr = page_to_phys(page) + offset; | 142 | dma_addr_t addr = page_to_phys(page) + offset; |
143 | 143 | ||
144 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
145 | return addr; | ||
146 | |||
144 | switch (dir) { | 147 | switch (dir) { |
145 | case DMA_TO_DEVICE: | 148 | case DMA_TO_DEVICE: |
146 | /* Flush the dcache for the requested range */ | 149 | /* Flush the dcache for the requested range */ |
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 494ff6e8c88a..b6298a85e8ae 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
@@ -459,7 +459,9 @@ static dma_addr_t pa11_dma_map_page(struct device *dev, struct page *page, | |||
459 | void *addr = page_address(page) + offset; | 459 | void *addr = page_address(page) + offset; |
460 | BUG_ON(direction == DMA_NONE); | 460 | BUG_ON(direction == DMA_NONE); |
461 | 461 | ||
462 | flush_kernel_dcache_range((unsigned long) addr, size); | 462 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
463 | flush_kernel_dcache_range((unsigned long) addr, size); | ||
464 | |||
463 | return virt_to_phys(addr); | 465 | return virt_to_phys(addr); |
464 | } | 466 | } |
465 | 467 | ||
@@ -469,8 +471,11 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, | |||
469 | { | 471 | { |
470 | BUG_ON(direction == DMA_NONE); | 472 | BUG_ON(direction == DMA_NONE); |
471 | 473 | ||
474 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
475 | return; | ||
476 | |||
472 | if (direction == DMA_TO_DEVICE) | 477 | if (direction == DMA_TO_DEVICE) |
473 | return; | 478 | return; |
474 | 479 | ||
475 | /* | 480 | /* |
476 | * For PCI_DMA_FROMDEVICE this flush is not necessary for the | 481 | * For PCI_DMA_FROMDEVICE this flush is not necessary for the |
@@ -479,7 +484,6 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, | |||
479 | */ | 484 | */ |
480 | 485 | ||
481 | flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size); | 486 | flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size); |
482 | return; | ||
483 | } | 487 | } |
484 | 488 | ||
485 | static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, | 489 | static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, |
@@ -496,6 +500,10 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
496 | 500 | ||
497 | sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr); | 501 | sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr); |
498 | sg_dma_len(sg) = sg->length; | 502 | sg_dma_len(sg) = sg->length; |
503 | |||
504 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
505 | continue; | ||
506 | |||
499 | flush_kernel_dcache_range(vaddr, sg->length); | 507 | flush_kernel_dcache_range(vaddr, sg->length); |
500 | } | 508 | } |
501 | return nents; | 509 | return nents; |
@@ -510,14 +518,16 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
510 | 518 | ||
511 | BUG_ON(direction == DMA_NONE); | 519 | BUG_ON(direction == DMA_NONE); |
512 | 520 | ||
521 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
522 | return; | ||
523 | |||
513 | if (direction == DMA_TO_DEVICE) | 524 | if (direction == DMA_TO_DEVICE) |
514 | return; | 525 | return; |
515 | 526 | ||
516 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ | 527 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ |
517 | 528 | ||
518 | for_each_sg(sglist, sg, nents, i) | 529 | for_each_sg(sglist, sg, nents, i) |
519 | flush_kernel_vmap_range(sg_virt(sg), sg->length); | 530 | flush_kernel_vmap_range(sg_virt(sg), sg->length); |
520 | return; | ||
521 | } | 531 | } |
522 | 532 | ||
523 | static void pa11_dma_sync_single_for_cpu(struct device *dev, | 533 | static void pa11_dma_sync_single_for_cpu(struct device *dev, |
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index e64a6016fba7..6877e3fa95bb 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, | |||
203 | for_each_sg(sgl, sg, nents, i) { | 203 | for_each_sg(sgl, sg, nents, i) { |
204 | sg->dma_address = sg_phys(sg) + get_dma_offset(dev); | 204 | sg->dma_address = sg_phys(sg) + get_dma_offset(dev); |
205 | sg->dma_length = sg->length; | 205 | sg->dma_length = sg->length; |
206 | |||
207 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
208 | continue; | ||
209 | |||
206 | __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); | 210 | __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); |
207 | } | 211 | } |
208 | 212 | ||
@@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, | |||
235 | unsigned long attrs) | 239 | unsigned long attrs) |
236 | { | 240 | { |
237 | BUG_ON(dir == DMA_NONE); | 241 | BUG_ON(dir == DMA_NONE); |
238 | __dma_sync_page(page, offset, size, dir); | 242 | |
243 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
244 | __dma_sync_page(page, offset, size, dir); | ||
245 | |||
239 | return page_to_phys(page) + offset + get_dma_offset(dev); | 246 | return page_to_phys(page) + offset + get_dma_offset(dev); |
240 | } | 247 | } |
241 | 248 | ||
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 06254467e4dd..3a147122bc98 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -236,7 +236,6 @@ static int | |||
236 | spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 236 | spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
237 | { | 237 | { |
238 | struct spu_context *ctx = vma->vm_file->private_data; | 238 | struct spu_context *ctx = vma->vm_file->private_data; |
239 | unsigned long address = (unsigned long)vmf->virtual_address; | ||
240 | unsigned long pfn, offset; | 239 | unsigned long pfn, offset; |
241 | 240 | ||
242 | offset = vmf->pgoff << PAGE_SHIFT; | 241 | offset = vmf->pgoff << PAGE_SHIFT; |
@@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
244 | return VM_FAULT_SIGBUS; | 243 | return VM_FAULT_SIGBUS; |
245 | 244 | ||
246 | pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n", | 245 | pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n", |
247 | address, offset); | 246 | vmf->address, offset); |
248 | 247 | ||
249 | if (spu_acquire(ctx)) | 248 | if (spu_acquire(ctx)) |
250 | return VM_FAULT_NOPAGE; | 249 | return VM_FAULT_NOPAGE; |
@@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
256 | vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); | 255 | vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); |
257 | pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; | 256 | pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; |
258 | } | 257 | } |
259 | vm_insert_pfn(vma, address, pfn); | 258 | vm_insert_pfn(vma, vmf->address, pfn); |
260 | 259 | ||
261 | spu_release(ctx); | 260 | spu_release(ctx); |
262 | 261 | ||
@@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, | |||
355 | down_read(¤t->mm->mmap_sem); | 354 | down_read(¤t->mm->mmap_sem); |
356 | } else { | 355 | } else { |
357 | area = ctx->spu->problem_phys + ps_offs; | 356 | area = ctx->spu->problem_phys + ps_offs; |
358 | vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, | 357 | vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); |
359 | (area + offset) >> PAGE_SHIFT); | ||
360 | spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); | 358 | spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); |
361 | } | 359 | } |
362 | 360 | ||
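The spufs hunk just above is the first in this merge to drop the old vmf->virtual_address pointer in favour of the new unsigned long vmf->address field; the same mechanical conversion reappears below in the x86 vdso, alpha AGP, mspec, DAX and DRM fault handlers. A hypothetical fault handler sketching the shape of the change (example_pfn_for() is an invented lookup helper, not a kernel API):

```c
#include <linux/mm.h>

/* Invented helper: stands in for whatever driver-specific pfn lookup exists. */
unsigned long example_pfn_for(struct vm_area_struct *vma, pgoff_t pgoff);

static int example_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long pfn = example_pfn_for(vma, vmf->pgoff);
	int ret;

	/* Before this series:
	 * ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
	 */
	ret = vm_insert_pfn(vma, vmf->address, pfn);

	/* -EBUSY means another thread already installed the pte; that is fine. */
	return (ret == 0 || ret == -EBUSY) ? VM_FAULT_NOPAGE : VM_FAULT_SIGBUS;
}
```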
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index eadb669a7329..47fee3b6e29c 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
@@ -18,7 +18,9 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, | |||
18 | dma_addr_t addr = page_to_phys(page) + offset; | 18 | dma_addr_t addr = page_to_phys(page) + offset; |
19 | 19 | ||
20 | WARN_ON(size == 0); | 20 | WARN_ON(size == 0); |
21 | dma_cache_sync(dev, page_address(page) + offset, size, dir); | 21 | |
22 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
23 | dma_cache_sync(dev, page_address(page) + offset, size, dir); | ||
22 | 24 | ||
23 | return addr; | 25 | return addr; |
24 | } | 26 | } |
@@ -35,7 +37,8 @@ static int nommu_map_sg(struct device *dev, struct scatterlist *sg, | |||
35 | for_each_sg(sg, s, nents, i) { | 37 | for_each_sg(sg, s, nents, i) { |
36 | BUG_ON(!sg_page(s)); | 38 | BUG_ON(!sg_page(s)); |
37 | 39 | ||
38 | dma_cache_sync(dev, sg_virt(s), s->length, dir); | 40 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
41 | dma_cache_sync(dev, sg_virt(s), s->length, dir); | ||
39 | 42 | ||
40 | s->dma_address = sg_phys(s); | 43 | s->dma_address = sg_phys(s); |
41 | s->dma_length = s->length; | 44 | s->dma_length = s->length; |
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 852a3291db96..9df997995f6b 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -415,7 +415,7 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr, | |||
415 | ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL; | 415 | ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL; |
416 | 416 | ||
417 | /* Step 1: Kick data out of streaming buffers if necessary. */ | 417 | /* Step 1: Kick data out of streaming buffers if necessary. */ |
418 | if (strbuf->strbuf_enabled) | 418 | if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
419 | strbuf_flush(strbuf, iommu, bus_addr, ctx, | 419 | strbuf_flush(strbuf, iommu, bus_addr, ctx, |
420 | npages, direction); | 420 | npages, direction); |
421 | 421 | ||
@@ -640,7 +640,7 @@ static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
640 | base = iommu->page_table + entry; | 640 | base = iommu->page_table + entry; |
641 | 641 | ||
642 | dma_handle &= IO_PAGE_MASK; | 642 | dma_handle &= IO_PAGE_MASK; |
643 | if (strbuf->strbuf_enabled) | 643 | if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
644 | strbuf_flush(strbuf, iommu, dma_handle, ctx, | 644 | strbuf_flush(strbuf, iommu, dma_handle, ctx, |
645 | npages, direction); | 645 | npages, direction); |
646 | 646 | ||
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 2344103414d1..6ffaec44931a 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -527,7 +527,7 @@ static dma_addr_t pci32_map_page(struct device *dev, struct page *page, | |||
527 | static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size, | 527 | static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size, |
528 | enum dma_data_direction dir, unsigned long attrs) | 528 | enum dma_data_direction dir, unsigned long attrs) |
529 | { | 529 | { |
530 | if (dir != PCI_DMA_TODEVICE) | 530 | if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
531 | dma_make_coherent(ba, PAGE_ALIGN(size)); | 531 | dma_make_coherent(ba, PAGE_ALIGN(size)); |
532 | } | 532 | } |
533 | 533 | ||
@@ -572,7 +572,7 @@ static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl, | |||
572 | struct scatterlist *sg; | 572 | struct scatterlist *sg; |
573 | int n; | 573 | int n; |
574 | 574 | ||
575 | if (dir != PCI_DMA_TODEVICE) { | 575 | if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { |
576 | for_each_sg(sgl, sg, nents, n) { | 576 | for_each_sg(sgl, sg, nents, n) { |
577 | dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); | 577 | dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); |
578 | } | 578 | } |
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a9973bb4a1b2..95e73c63c99d 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -42,7 +42,7 @@ static int panic_on_timeout; | |||
42 | */ | 42 | */ |
43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | 43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ |
44 | EXPORT_SYMBOL(nmi_active); | 44 | EXPORT_SYMBOL(nmi_active); |
45 | 45 | static int nmi_init_done; | |
46 | static unsigned int nmi_hz = HZ; | 46 | static unsigned int nmi_hz = HZ; |
47 | static DEFINE_PER_CPU(short, wd_enabled); | 47 | static DEFINE_PER_CPU(short, wd_enabled); |
48 | static int endflag __initdata; | 48 | static int endflag __initdata; |
@@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) | |||
153 | 153 | ||
154 | void stop_nmi_watchdog(void *unused) | 154 | void stop_nmi_watchdog(void *unused) |
155 | { | 155 | { |
156 | if (!__this_cpu_read(wd_enabled)) | ||
157 | return; | ||
156 | pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); | 158 | pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); |
157 | __this_cpu_write(wd_enabled, 0); | 159 | __this_cpu_write(wd_enabled, 0); |
158 | atomic_dec(&nmi_active); | 160 | atomic_dec(&nmi_active); |
@@ -207,6 +209,9 @@ error: | |||
207 | 209 | ||
208 | void start_nmi_watchdog(void *unused) | 210 | void start_nmi_watchdog(void *unused) |
209 | { | 211 | { |
212 | if (__this_cpu_read(wd_enabled)) | ||
213 | return; | ||
214 | |||
210 | __this_cpu_write(wd_enabled, 1); | 215 | __this_cpu_write(wd_enabled, 1); |
211 | atomic_inc(&nmi_active); | 216 | atomic_inc(&nmi_active); |
212 | 217 | ||
@@ -259,6 +264,8 @@ int __init nmi_init(void) | |||
259 | } | 264 | } |
260 | } | 265 | } |
261 | 266 | ||
267 | nmi_init_done = 1; | ||
268 | |||
262 | return err; | 269 | return err; |
263 | } | 270 | } |
264 | 271 | ||
@@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str) | |||
270 | return 0; | 277 | return 0; |
271 | } | 278 | } |
272 | __setup("nmi_watchdog=", setup_nmi_watchdog); | 279 | __setup("nmi_watchdog=", setup_nmi_watchdog); |
280 | |||
281 | /* | ||
282 | * sparc specific NMI watchdog enable function. | ||
283 | * Enables watchdog if it is not enabled already. | ||
284 | */ | ||
285 | int watchdog_nmi_enable(unsigned int cpu) | ||
286 | { | ||
287 | if (atomic_read(&nmi_active) == -1) { | ||
288 | pr_warn("NMI watchdog cannot be enabled or disabled\n"); | ||
289 | return -1; | ||
290 | } | ||
291 | |||
292 | /* | ||
293 | * watchdog thread could start even before nmi_init is called. | ||
294 | * Just Return in that case. Let nmi_init finish the init | ||
295 | * process first. | ||
296 | */ | ||
297 | if (!nmi_init_done) | ||
298 | return 0; | ||
299 | |||
300 | smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1); | ||
301 | |||
302 | return 0; | ||
303 | } | ||
304 | /* | ||
305 | * sparc specific NMI watchdog disable function. | ||
306 | * Disables watchdog if it is not disabled already. | ||
307 | */ | ||
308 | void watchdog_nmi_disable(unsigned int cpu) | ||
309 | { | ||
310 | if (atomic_read(&nmi_active) == -1) | ||
311 | pr_warn_once("NMI watchdog cannot be enabled or disabled\n"); | ||
312 | else | ||
313 | smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1); | ||
314 | } | ||
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 09bb774b39cd..24e0f8c21f2f 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
@@ -213,10 +213,12 @@ static int tile_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
213 | 213 | ||
214 | for_each_sg(sglist, sg, nents, i) { | 214 | for_each_sg(sglist, sg, nents, i) { |
215 | sg->dma_address = sg_phys(sg); | 215 | sg->dma_address = sg_phys(sg); |
216 | __dma_prep_pa_range(sg->dma_address, sg->length, direction); | ||
217 | #ifdef CONFIG_NEED_SG_DMA_LENGTH | 216 | #ifdef CONFIG_NEED_SG_DMA_LENGTH |
218 | sg->dma_length = sg->length; | 217 | sg->dma_length = sg->length; |
219 | #endif | 218 | #endif |
219 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
220 | continue; | ||
221 | __dma_prep_pa_range(sg->dma_address, sg->length, direction); | ||
220 | } | 222 | } |
221 | 223 | ||
222 | return nents; | 224 | return nents; |
@@ -232,6 +234,8 @@ static void tile_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
232 | BUG_ON(!valid_dma_direction(direction)); | 234 | BUG_ON(!valid_dma_direction(direction)); |
233 | for_each_sg(sglist, sg, nents, i) { | 235 | for_each_sg(sglist, sg, nents, i) { |
234 | sg->dma_address = sg_phys(sg); | 236 | sg->dma_address = sg_phys(sg); |
237 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
238 | continue; | ||
235 | __dma_complete_pa_range(sg->dma_address, sg->length, | 239 | __dma_complete_pa_range(sg->dma_address, sg->length, |
236 | direction); | 240 | direction); |
237 | } | 241 | } |
@@ -245,7 +249,8 @@ static dma_addr_t tile_dma_map_page(struct device *dev, struct page *page, | |||
245 | BUG_ON(!valid_dma_direction(direction)); | 249 | BUG_ON(!valid_dma_direction(direction)); |
246 | 250 | ||
247 | BUG_ON(offset + size > PAGE_SIZE); | 251 | BUG_ON(offset + size > PAGE_SIZE); |
248 | __dma_prep_page(page, offset, size, direction); | 252 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
253 | __dma_prep_page(page, offset, size, direction); | ||
249 | 254 | ||
250 | return page_to_pa(page) + offset; | 255 | return page_to_pa(page) + offset; |
251 | } | 256 | } |
@@ -256,6 +261,9 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address, | |||
256 | { | 261 | { |
257 | BUG_ON(!valid_dma_direction(direction)); | 262 | BUG_ON(!valid_dma_direction(direction)); |
258 | 263 | ||
264 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
265 | return; | ||
266 | |||
259 | __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)), | 267 | __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)), |
260 | dma_address & (PAGE_SIZE - 1), size, direction); | 268 | dma_address & (PAGE_SIZE - 1), size, direction); |
261 | } | 269 | } |
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index e739002427ed..40121d14d34d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, | |||
109 | return VM_FAULT_SIGBUS; | 109 | return VM_FAULT_SIGBUS; |
110 | 110 | ||
111 | if (sym_offset == image->sym_vvar_page) { | 111 | if (sym_offset == image->sym_vvar_page) { |
112 | ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, | 112 | ret = vm_insert_pfn(vma, vmf->address, |
113 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); | 113 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); |
114 | } else if (sym_offset == image->sym_pvclock_page) { | 114 | } else if (sym_offset == image->sym_pvclock_page) { |
115 | struct pvclock_vsyscall_time_info *pvti = | 115 | struct pvclock_vsyscall_time_info *pvti = |
@@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, | |||
117 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { | 117 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { |
118 | ret = vm_insert_pfn( | 118 | ret = vm_insert_pfn( |
119 | vma, | 119 | vma, |
120 | (unsigned long)vmf->virtual_address, | 120 | vmf->address, |
121 | __pa(pvti) >> PAGE_SHIFT); | 121 | __pa(pvti) >> PAGE_SHIFT); |
122 | } | 122 | } |
123 | } | 123 | } |
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 8c1f218926d7..307b1f4543de 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image) | |||
328 | 328 | ||
329 | void arch_crash_save_vmcoreinfo(void) | 329 | void arch_crash_save_vmcoreinfo(void) |
330 | { | 330 | { |
331 | VMCOREINFO_SYMBOL(phys_base); | 331 | VMCOREINFO_NUMBER(phys_base); |
332 | VMCOREINFO_SYMBOL(init_level4_pgt); | 332 | VMCOREINFO_SYMBOL(init_level4_pgt); |
333 | 333 | ||
334 | #ifdef CONFIG_NUMA | 334 | #ifdef CONFIG_NUMA |
@@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void) | |||
337 | #endif | 337 | #endif |
338 | vmcoreinfo_append_str("KERNELOFFSET=%lx\n", | 338 | vmcoreinfo_append_str("KERNELOFFSET=%lx\n", |
339 | kaslr_offset()); | 339 | kaslr_offset()); |
340 | VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET); | 340 | VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); |
341 | VMCOREINFO_VMALLOC_START(VMALLOC_START); | ||
342 | VMCOREINFO_VMEMMAP_START(VMEMMAP_START); | ||
343 | } | 341 | } |
344 | 342 | ||
345 | /* arch-dependent functionality related to kexec file-based syscall */ | 343 | /* arch-dependent functionality related to kexec file-based syscall */ |
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 1e68806d6695..6a16decf278f 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
@@ -189,7 +189,9 @@ static dma_addr_t xtensa_map_page(struct device *dev, struct page *page, | |||
189 | { | 189 | { |
190 | dma_addr_t dma_handle = page_to_phys(page) + offset; | 190 | dma_addr_t dma_handle = page_to_phys(page) + offset; |
191 | 191 | ||
192 | xtensa_sync_single_for_device(dev, dma_handle, size, dir); | 192 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
193 | xtensa_sync_single_for_device(dev, dma_handle, size, dir); | ||
194 | |||
193 | return dma_handle; | 195 | return dma_handle; |
194 | } | 196 | } |
195 | 197 | ||
@@ -197,7 +199,8 @@ static void xtensa_unmap_page(struct device *dev, dma_addr_t dma_handle, | |||
197 | size_t size, enum dma_data_direction dir, | 199 | size_t size, enum dma_data_direction dir, |
198 | unsigned long attrs) | 200 | unsigned long attrs) |
199 | { | 201 | { |
200 | xtensa_sync_single_for_cpu(dev, dma_handle, size, dir); | 202 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
203 | xtensa_sync_single_for_cpu(dev, dma_handle, size, dir); | ||
201 | } | 204 | } |
202 | 205 | ||
203 | static int xtensa_map_sg(struct device *dev, struct scatterlist *sg, | 206 | static int xtensa_map_sg(struct device *dev, struct scatterlist *sg, |
diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index 199b8e99f7d7..737187865269 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
@@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, | |||
19 | unsigned long pa; | 19 | unsigned long pa; |
20 | struct page *page; | 20 | struct page *page; |
21 | 21 | ||
22 | dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start | 22 | dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; |
23 | + agp->aperture.bus_base; | ||
24 | pa = agp->ops->translate(agp, dma_addr); | 23 | pa = agp->ops->translate(agp, dma_addr); |
25 | 24 | ||
26 | if (pa == (unsigned long)-EINVAL) | 25 | if (pa == (unsigned long)-EINVAL) |
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index f3f92d5fcda0..a697ca0cab1e 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
227 | * be because another thread has installed the pte first, so it | 227 | * be because another thread has installed the pte first, so it |
228 | * is no problem. | 228 | * is no problem. |
229 | */ | 229 | */ |
230 | vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); | 230 | vm_insert_pfn(vma, vmf->address, pfn); |
231 | 231 | ||
232 | return VM_FAULT_NOPAGE; | 232 | return VM_FAULT_NOPAGE; |
233 | } | 233 | } |
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 7a4869151d3b..a77262d31911 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(tpm_put_ops); | |||
84 | * | 84 | * |
85 | * The return'd chip has been tpm_try_get_ops'd and must be released via | 85 | * The return'd chip has been tpm_try_get_ops'd and must be released via |
86 | * tpm_put_ops | 86 | * tpm_put_ops |
87 | */ | 87 | */ |
88 | struct tpm_chip *tpm_chip_find_get(int chip_num) | 88 | struct tpm_chip *tpm_chip_find_get(int chip_num) |
89 | { | 89 | { |
90 | struct tpm_chip *chip, *res = NULL; | 90 | struct tpm_chip *chip, *res = NULL; |
@@ -103,7 +103,7 @@ struct tpm_chip *tpm_chip_find_get(int chip_num) | |||
103 | } | 103 | } |
104 | } while (chip_prev != chip_num); | 104 | } while (chip_prev != chip_num); |
105 | } else { | 105 | } else { |
106 | chip = idr_find_slowpath(&dev_nums_idr, chip_num); | 106 | chip = idr_find(&dev_nums_idr, chip_num); |
107 | if (chip && !tpm_try_get_ops(chip)) | 107 | if (chip && !tpm_try_get_ops(chip)) |
108 | res = chip; | 108 | res = chip; |
109 | } | 109 | } |
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 286447a83dab..26ec39ddf21f 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, | |||
328 | static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, | 328 | static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, |
329 | struct vm_fault *vmf) | 329 | struct vm_fault *vmf) |
330 | { | 330 | { |
331 | unsigned long vaddr = (unsigned long) vmf->virtual_address; | ||
332 | struct device *dev = &dax_dev->dev; | 331 | struct device *dev = &dax_dev->dev; |
333 | struct dax_region *dax_region; | 332 | struct dax_region *dax_region; |
334 | int rc = VM_FAULT_SIGBUS; | 333 | int rc = VM_FAULT_SIGBUS; |
@@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |||
353 | 352 | ||
354 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | 353 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); |
355 | 354 | ||
356 | rc = vm_insert_mixed(vma, vaddr, pfn); | 355 | rc = vm_insert_mixed(vma, vmf->address, pfn); |
357 | 356 | ||
358 | if (rc == -ENOMEM) | 357 | if (rc == -ENOMEM) |
359 | return VM_FAULT_OOM; | 358 | return VM_FAULT_OOM; |
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 768087ddb046..a293c8be232c 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
@@ -17,12 +17,11 @@ | |||
17 | static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 17 | static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
18 | { | 18 | { |
19 | struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); | 19 | struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); |
20 | unsigned long addr = (unsigned long)vmf->virtual_address; | ||
21 | unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; | 20 | unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; |
22 | int ret; | 21 | int ret; |
23 | 22 | ||
24 | pfn += (addr - vma->vm_start) >> PAGE_SHIFT; | 23 | pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
25 | ret = vm_insert_pfn(vma, addr, pfn); | 24 | ret = vm_insert_pfn(vma, vmf->address, pfn); |
26 | 25 | ||
27 | switch (ret) { | 26 | switch (ret) { |
28 | case 0: | 27 | case 0: |
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index caa4e4ca616d..bd311c77c254 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
124 | * Using vm_pgoff as a selector forces us to use this unusual | 124 | * Using vm_pgoff as a selector forces us to use this unusual |
125 | * addressing scheme. | 125 | * addressing scheme. |
126 | */ | 126 | */ |
127 | resource_size_t offset = (unsigned long)vmf->virtual_address - | 127 | resource_size_t offset = vmf->address - vma->vm_start; |
128 | vma->vm_start; | ||
129 | resource_size_t baddr = map->offset + offset; | 128 | resource_size_t baddr = map->offset + offset; |
130 | struct drm_agp_mem *agpmem; | 129 | struct drm_agp_mem *agpmem; |
131 | struct page *page; | 130 | struct page *page; |
@@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
195 | if (!map) | 194 | if (!map) |
196 | return VM_FAULT_SIGBUS; /* Nothing allocated */ | 195 | return VM_FAULT_SIGBUS; /* Nothing allocated */ |
197 | 196 | ||
198 | offset = (unsigned long)vmf->virtual_address - vma->vm_start; | 197 | offset = vmf->address - vma->vm_start; |
199 | i = (unsigned long)map->handle + offset; | 198 | i = (unsigned long)map->handle + offset; |
200 | page = vmalloc_to_page((void *)i); | 199 | page = vmalloc_to_page((void *)i); |
201 | if (!page) | 200 | if (!page) |
@@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
301 | if (!dma->pagelist) | 300 | if (!dma->pagelist) |
302 | return VM_FAULT_SIGBUS; /* Nothing allocated */ | 301 | return VM_FAULT_SIGBUS; /* Nothing allocated */ |
303 | 302 | ||
304 | offset = (unsigned long)vmf->virtual_address - vma->vm_start; /* vm_[pg]off[set] should be 0 */ | 303 | offset = vmf->address - vma->vm_start; |
304 | /* vm_[pg]off[set] should be 0 */ | ||
305 | page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */ | 305 | page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */ |
306 | page = virt_to_page((void *)dma->pagelist[page_nr]); | 306 | page = virt_to_page((void *)dma->pagelist[page_nr]); |
307 | 307 | ||
@@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
337 | if (!entry->pagelist) | 337 | if (!entry->pagelist) |
338 | return VM_FAULT_SIGBUS; /* Nothing allocated */ | 338 | return VM_FAULT_SIGBUS; /* Nothing allocated */ |
339 | 339 | ||
340 | offset = (unsigned long)vmf->virtual_address - vma->vm_start; | 340 | offset = vmf->address - vma->vm_start; |
341 | map_offset = map->offset - (unsigned long)dev->sg->virtual; | 341 | map_offset = map->offset - (unsigned long)dev->sg->virtual; |
342 | page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); | 342 | page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); |
343 | page = entry->pagelist[page_offset]; | 343 | page = entry->pagelist[page_offset]; |
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 7d066a91d778..114dddbd297b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c | |||
@@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
202 | } | 202 | } |
203 | 203 | ||
204 | /* We don't use vmf->pgoff since that has the fake offset: */ | 204 | /* We don't use vmf->pgoff since that has the fake offset: */ |
205 | pgoff = ((unsigned long)vmf->virtual_address - | 205 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
206 | vma->vm_start) >> PAGE_SHIFT; | ||
207 | 206 | ||
208 | page = pages[pgoff]; | 207 | page = pages[pgoff]; |
209 | 208 | ||
210 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 209 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
211 | page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); | 210 | page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); |
212 | 211 | ||
213 | ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); | 212 | ret = vm_insert_page(vma, vmf->address, page); |
214 | 213 | ||
215 | out: | 214 | out: |
216 | switch (ret) { | 215 | switch (ret) { |
@@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( | |||
759 | down_read(&mm->mmap_sem); | 758 | down_read(&mm->mmap_sem); |
760 | while (pinned < npages) { | 759 | while (pinned < npages) { |
761 | ret = get_user_pages_remote(task, mm, ptr, npages - pinned, | 760 | ret = get_user_pages_remote(task, mm, ptr, npages - pinned, |
762 | flags, pvec + pinned, NULL); | 761 | flags, pvec + pinned, NULL, NULL); |
763 | if (ret < 0) | 762 | if (ret < 0) |
764 | break; | 763 | break; |
765 | 764 | ||
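The other recurring change in these hunks is that get_user_pages_remote() gains a trailing int *locked argument; the converted callers simply pass NULL. As I read the new semantics, passing a real pointer lets gup drop mmap_sem while it sleeps and report that back. A hedged sketch of the opt-in style (error handling trimmed, names are placeholders):

    #include <linux/mm.h>
    #include <linux/sched.h>

    static long example_pin_pages(struct task_struct *task, struct mm_struct *mm,
                                  unsigned long start, unsigned long npages,
                                  struct page **pages)
    {
            long pinned;
            int locked = 1;

            down_read(&mm->mmap_sem);
            /*
             * The callers converted above just pass NULL for the new argument.
             * Opting in instead: gup may drop mmap_sem while it waits and
             * clears 'locked' to say so, in which case we must not drop it again.
             */
            pinned = get_user_pages_remote(task, mm, start, npages, FOLL_WRITE,
                                           pages, NULL /* vmas */, &locked);
            if (locked)
                    up_read(&mm->mmap_sem);
            return pinned;
    }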
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index ea7a18230888..57b81460fec8 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c | |||
@@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
455 | pgoff_t page_offset; | 455 | pgoff_t page_offset; |
456 | int ret; | 456 | int ret; |
457 | 457 | ||
458 | page_offset = ((unsigned long)vmf->virtual_address - | 458 | page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
459 | vma->vm_start) >> PAGE_SHIFT; | ||
460 | 459 | ||
461 | if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { | 460 | if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { |
462 | DRM_ERROR("invalid page offset\n"); | 461 | DRM_ERROR("invalid page offset\n"); |
@@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
465 | } | 464 | } |
466 | 465 | ||
467 | pfn = page_to_pfn(exynos_gem->pages[page_offset]); | 466 | pfn = page_to_pfn(exynos_gem->pages[page_offset]); |
468 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | 467 | ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); |
469 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
470 | 468 | ||
471 | out: | 469 | out: |
472 | switch (ret) { | 470 | switch (ret) { |
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 4071b2d1e8cf..8b44fa542562 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c | |||
@@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
125 | psbfb->gtt->offset; | 125 | psbfb->gtt->offset; |
126 | 126 | ||
127 | page_num = vma_pages(vma); | 127 | page_num = vma_pages(vma); |
128 | address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT); | 128 | address = vmf->address - (vmf->pgoff << PAGE_SHIFT); |
129 | 129 | ||
130 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 130 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
131 | 131 | ||
diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 6d1cb6b370b1..527c62917660 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c | |||
@@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
197 | 197 | ||
198 | /* Page relative to the VMA start - we must calculate this ourselves | 198 | /* Page relative to the VMA start - we must calculate this ourselves |
199 | because vmf->pgoff is the fake GEM offset */ | 199 | because vmf->pgoff is the fake GEM offset */ |
200 | page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start) | 200 | page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
201 | >> PAGE_SHIFT; | ||
202 | 201 | ||
203 | /* CPU view of the page, don't go via the GART for CPU writes */ | 202 | /* CPU view of the page, don't go via the GART for CPU writes */ |
204 | if (r->stolen) | 203 | if (r->stolen) |
205 | pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; | 204 | pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; |
206 | else | 205 | else |
207 | pfn = page_to_pfn(r->pages[page_offset]); | 206 | pfn = page_to_pfn(r->pages[page_offset]); |
208 | ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); | 207 | ret = vm_insert_pfn(vma, vmf->address, pfn); |
209 | 208 | ||
210 | fail: | 209 | fail: |
211 | mutex_unlock(&dev_priv->mmap_mutex); | 210 | mutex_unlock(&dev_priv->mmap_mutex); |
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d0dcaf35b429..412f3513f269 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c | |||
@@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) | |||
1796 | int ret; | 1796 | int ret; |
1797 | 1797 | ||
1798 | /* We don't use vmf->pgoff since that has the fake offset */ | 1798 | /* We don't use vmf->pgoff since that has the fake offset */ |
1799 | page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> | 1799 | page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; |
1800 | PAGE_SHIFT; | ||
1801 | 1800 | ||
1802 | trace_i915_gem_object_fault(obj, page_offset, true, write); | 1801 | trace_i915_gem_object_fault(obj, page_offset, true, write); |
1803 | 1802 | ||
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 107ddf51065e..d068af2ec3a3 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c | |||
@@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) | |||
515 | obj->userptr.ptr + pinned * PAGE_SIZE, | 515 | obj->userptr.ptr + pinned * PAGE_SIZE, |
516 | npages - pinned, | 516 | npages - pinned, |
517 | flags, | 517 | flags, |
518 | pvec + pinned, NULL); | 518 | pvec + pinned, NULL, NULL); |
519 | if (ret < 0) | 519 | if (ret < 0) |
520 | break; | 520 | break; |
521 | 521 | ||
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index cd06cfd94687..d8bc59c7e261 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c | |||
@@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
225 | } | 225 | } |
226 | 226 | ||
227 | /* We don't use vmf->pgoff since that has the fake offset: */ | 227 | /* We don't use vmf->pgoff since that has the fake offset: */ |
228 | pgoff = ((unsigned long)vmf->virtual_address - | 228 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
229 | vma->vm_start) >> PAGE_SHIFT; | ||
230 | 229 | ||
231 | pfn = page_to_pfn(pages[pgoff]); | 230 | pfn = page_to_pfn(pages[pgoff]); |
232 | 231 | ||
233 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 232 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
234 | pfn, pfn << PAGE_SHIFT); | 233 | pfn, pfn << PAGE_SHIFT); |
235 | 234 | ||
236 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | 235 | ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); |
237 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
238 | 236 | ||
239 | out_unlock: | 237 | out_unlock: |
240 | mutex_unlock(&dev->struct_mutex); | 238 | mutex_unlock(&dev->struct_mutex); |
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index d4e1e11466f8..4a90c690f09e 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c | |||
@@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj, | |||
398 | pgoff_t pgoff; | 398 | pgoff_t pgoff; |
399 | 399 | ||
400 | /* We don't use vmf->pgoff since that has the fake offset: */ | 400 | /* We don't use vmf->pgoff since that has the fake offset: */ |
401 | pgoff = ((unsigned long)vmf->virtual_address - | 401 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
402 | vma->vm_start) >> PAGE_SHIFT; | ||
403 | 402 | ||
404 | if (omap_obj->pages) { | 403 | if (omap_obj->pages) { |
405 | omap_gem_cpu_sync(obj, pgoff); | 404 | omap_gem_cpu_sync(obj, pgoff); |
@@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj, | |||
409 | pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; | 408 | pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; |
410 | } | 409 | } |
411 | 410 | ||
412 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 411 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
413 | pfn, pfn << PAGE_SHIFT); | 412 | pfn, pfn << PAGE_SHIFT); |
414 | 413 | ||
415 | return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | 414 | return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); |
416 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
417 | } | 415 | } |
418 | 416 | ||
419 | /* Special handling for the case of faulting in 2d tiled buffers */ | 417 | /* Special handling for the case of faulting in 2d tiled buffers */ |
@@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj, | |||
427 | struct page *pages[64]; /* XXX is this too much to have on stack? */ | 425 | struct page *pages[64]; /* XXX is this too much to have on stack? */ |
428 | unsigned long pfn; | 426 | unsigned long pfn; |
429 | pgoff_t pgoff, base_pgoff; | 427 | pgoff_t pgoff, base_pgoff; |
430 | void __user *vaddr; | 428 | unsigned long vaddr; |
431 | int i, ret, slots; | 429 | int i, ret, slots; |
432 | 430 | ||
433 | /* | 431 | /* |
@@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj, | |||
447 | const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); | 445 | const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); |
448 | 446 | ||
449 | /* We don't use vmf->pgoff since that has the fake offset: */ | 447 | /* We don't use vmf->pgoff since that has the fake offset: */ |
450 | pgoff = ((unsigned long)vmf->virtual_address - | 448 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
451 | vma->vm_start) >> PAGE_SHIFT; | ||
452 | 449 | ||
453 | /* | 450 | /* |
454 | * Actual address we start mapping at is rounded down to previous slot | 451 | * Actual address we start mapping at is rounded down to previous slot |
@@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj, | |||
459 | /* figure out buffer width in slots */ | 456 | /* figure out buffer width in slots */ |
460 | slots = omap_obj->width >> priv->usergart[fmt].slot_shift; | 457 | slots = omap_obj->width >> priv->usergart[fmt].slot_shift; |
461 | 458 | ||
462 | vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT); | 459 | vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT); |
463 | 460 | ||
464 | entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; | 461 | entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; |
465 | 462 | ||
@@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj, | |||
503 | 500 | ||
504 | pfn = entry->paddr >> PAGE_SHIFT; | 501 | pfn = entry->paddr >> PAGE_SHIFT; |
505 | 502 | ||
506 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 503 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
507 | pfn, pfn << PAGE_SHIFT); | 504 | pfn, pfn << PAGE_SHIFT); |
508 | 505 | ||
509 | for (i = n; i > 0; i--) { | 506 | for (i = n; i > 0; i--) { |
510 | vm_insert_mixed(vma, (unsigned long)vaddr, | 507 | vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV)); |
511 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
512 | pfn += priv->usergart[fmt].stride_pfn; | 508 | pfn += priv->usergart[fmt].stride_pfn; |
513 | vaddr += PAGE_SIZE * m; | 509 | vaddr += PAGE_SIZE * m; |
514 | } | 510 | } |
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index c08e5279eeac..7d853e6b5ff0 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c | |||
@@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
452 | if (!bo->pages) | 452 | if (!bo->pages) |
453 | return VM_FAULT_SIGBUS; | 453 | return VM_FAULT_SIGBUS; |
454 | 454 | ||
455 | offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; | 455 | offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
456 | page = bo->pages[offset]; | 456 | page = bo->pages[offset]; |
457 | 457 | ||
458 | err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); | 458 | err = vm_insert_page(vma, vmf->address, page); |
459 | switch (err) { | 459 | switch (err) { |
460 | case -EAGAIN: | 460 | case -EAGAIN: |
461 | case 0: | 461 | case 0: |
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 4748aedc933a..68ef993ab431 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c | |||
@@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
101 | struct page *page; | 101 | struct page *page; |
102 | int ret; | 102 | int ret; |
103 | int i; | 103 | int i; |
104 | unsigned long address = (unsigned long)vmf->virtual_address; | 104 | unsigned long address = vmf->address; |
105 | int retval = VM_FAULT_NOPAGE; | 105 | int retval = VM_FAULT_NOPAGE; |
106 | struct ttm_mem_type_manager *man = | 106 | struct ttm_mem_type_manager *man = |
107 | &bdev->man[bo->mem.mem_type]; | 107 | &bdev->man[bo->mem.mem_type]; |
diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 818e70712b18..3c0c4bd3f750 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c | |||
@@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
107 | unsigned int page_offset; | 107 | unsigned int page_offset; |
108 | int ret = 0; | 108 | int ret = 0; |
109 | 109 | ||
110 | page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> | 110 | page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
111 | PAGE_SHIFT; | ||
112 | 111 | ||
113 | if (!obj->pages) | 112 | if (!obj->pages) |
114 | return VM_FAULT_SIGBUS; | 113 | return VM_FAULT_SIGBUS; |
115 | 114 | ||
116 | page = obj->pages[page_offset]; | 115 | page = obj->pages[page_offset]; |
117 | ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); | 116 | ret = vm_insert_page(vma, vmf->address, page); |
118 | switch (ret) { | 117 | switch (ret) { |
119 | case -EAGAIN: | 118 | case -EAGAIN: |
120 | case 0: | 119 | case 0: |
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index f36c14729b55..477e07f0ecb6 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c | |||
@@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
54 | { | 54 | { |
55 | struct drm_vgem_gem_object *obj = vma->vm_private_data; | 55 | struct drm_vgem_gem_object *obj = vma->vm_private_data; |
56 | /* We don't use vmf->pgoff since that has the fake offset */ | 56 | /* We don't use vmf->pgoff since that has the fake offset */ |
57 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | 57 | unsigned long vaddr = vmf->address; |
58 | struct page *page; | 58 | struct page *page; |
59 | 59 | ||
60 | page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, | 60 | page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, |
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1f0fe3217f23..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c | |||
@@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, | |||
578 | */ | 578 | */ |
579 | npages = get_user_pages_remote(owning_process, owning_mm, | 579 | npages = get_user_pages_remote(owning_process, owning_mm, |
580 | user_virt, gup_num_pages, | 580 | user_virt, gup_num_pages, |
581 | flags, local_page_list, NULL); | 581 | flags, local_page_list, NULL, NULL); |
582 | up_read(&owning_mm->mmap_sem); | 582 | up_read(&owning_mm->mmap_sem); |
583 | 583 | ||
584 | if (npages < 0) | 584 | if (npages < 0) |
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 1db0af6c7f94..ba63ca57ed7e 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c | |||
@@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
439 | struct page *page; | 439 | struct page *page; |
440 | 440 | ||
441 | dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", | 441 | dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", |
442 | (unsigned long)vmf->virtual_address, | 442 | vmf->address, vma->vm_start, vma->vm_end); |
443 | vma->vm_start, vma->vm_end); | ||
444 | 443 | ||
445 | page = alloc_page(GFP_USER | __GFP_DMA32); | 444 | page = alloc_page(GFP_USER | __GFP_DMA32); |
446 | if (!page) | 445 | if (!page) |
447 | return VM_FAULT_OOM; | 446 | return VM_FAULT_OOM; |
448 | clear_user_highpage(page, (unsigned long)vmf->virtual_address); | 447 | clear_user_highpage(page, vmf->address); |
449 | vmf->page = page; | 448 | vmf->page = page; |
450 | 449 | ||
451 | return 0; | 450 | return 0; |
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 5e506c19108a..5d36dcc7f47e 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c | |||
@@ -117,13 +117,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, | |||
117 | static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 117 | static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
118 | { | 118 | { |
119 | struct cxl_context *ctx = vma->vm_file->private_data; | 119 | struct cxl_context *ctx = vma->vm_file->private_data; |
120 | unsigned long address = (unsigned long)vmf->virtual_address; | ||
121 | u64 area, offset; | 120 | u64 area, offset; |
122 | 121 | ||
123 | offset = vmf->pgoff << PAGE_SHIFT; | 122 | offset = vmf->pgoff << PAGE_SHIFT; |
124 | 123 | ||
125 | pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", | 124 | pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", |
126 | __func__, ctx->pe, address, offset); | 125 | __func__, ctx->pe, vmf->address, offset); |
127 | 126 | ||
128 | if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { | 127 | if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { |
129 | area = ctx->afu->psn_phys; | 128 | area = ctx->afu->psn_phys; |
@@ -155,7 +154,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
155 | return VM_FAULT_SIGBUS; | 154 | return VM_FAULT_SIGBUS; |
156 | } | 155 | } |
157 | 156 | ||
158 | vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT); | 157 | vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); |
159 | 158 | ||
160 | mutex_unlock(&ctx->status_mutex); | 159 | mutex_unlock(&ctx->status_mutex); |
161 | 160 | ||
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 33741ad4a74a..af2e077da4b8 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c | |||
@@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
932 | unsigned long paddr, vaddr; | 932 | unsigned long paddr, vaddr; |
933 | unsigned long expires; | 933 | unsigned long expires; |
934 | 934 | ||
935 | vaddr = (unsigned long)vmf->virtual_address; | 935 | vaddr = vmf->address; |
936 | gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", | 936 | gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", |
937 | vma, vaddr, GSEG_BASE(vaddr)); | 937 | vma, vaddr, GSEG_BASE(vaddr)); |
938 | STAT(nopfn); | 938 | STAT(nopfn); |
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index d11093dce1b9..acbc3abe2ddd 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h | |||
@@ -210,7 +210,12 @@ struct igb_tx_buffer { | |||
210 | struct igb_rx_buffer { | 210 | struct igb_rx_buffer { |
211 | dma_addr_t dma; | 211 | dma_addr_t dma; |
212 | struct page *page; | 212 | struct page *page; |
213 | unsigned int page_offset; | 213 | #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) |
214 | __u32 page_offset; | ||
215 | #else | ||
216 | __u16 page_offset; | ||
217 | #endif | ||
218 | __u16 pagecnt_bias; | ||
214 | }; | 219 | }; |
215 | 220 | ||
216 | struct igb_tx_queue_stats { | 221 | struct igb_tx_queue_stats { |
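The new pagecnt_bias field in igb_rx_buffer supports a reference-count bias: rather than bumping the page refcount once per received frame, the driver takes a large block of references up front and pays them back from a local counter, only touching the atomic page count again when the pool drains. A rough sketch of the idea with illustrative names (the real logic is in the igb_can_reuse_rx_page() hunk below):

    #include <linux/kernel.h>
    #include <linux/page_ref.h>

    struct example_rx_buf {
            struct page *page;
            __u16 pagecnt_bias;     /* references we still "own" locally */
    };

    static bool example_rx_buf_try_reuse(struct example_rx_buf *buf)
    {
            unsigned int bias = buf->pagecnt_bias--;

            /* reusable only if every outstanding reference is ours */
            if (page_ref_count(buf->page) != bias)
                    return false;

            /* pool nearly drained: restock references in one atomic op */
            if (unlikely(bias == 1)) {
                    page_ref_add(buf->page, USHRT_MAX);
                    buf->pagecnt_bias = USHRT_MAX;
            }
            return true;
    }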
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index cae24a8ccf47..a761001308dc 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c | |||
@@ -3947,11 +3947,23 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) | |||
3947 | if (!buffer_info->page) | 3947 | if (!buffer_info->page) |
3948 | continue; | 3948 | continue; |
3949 | 3949 | ||
3950 | dma_unmap_page(rx_ring->dev, | 3950 | /* Invalidate cache lines that may have been written to by |
3951 | buffer_info->dma, | 3951 | * device so that we avoid corrupting memory. |
3952 | PAGE_SIZE, | 3952 | */ |
3953 | DMA_FROM_DEVICE); | 3953 | dma_sync_single_range_for_cpu(rx_ring->dev, |
3954 | __free_page(buffer_info->page); | 3954 | buffer_info->dma, |
3955 | buffer_info->page_offset, | ||
3956 | IGB_RX_BUFSZ, | ||
3957 | DMA_FROM_DEVICE); | ||
3958 | |||
3959 | /* free resources associated with mapping */ | ||
3960 | dma_unmap_page_attrs(rx_ring->dev, | ||
3961 | buffer_info->dma, | ||
3962 | PAGE_SIZE, | ||
3963 | DMA_FROM_DEVICE, | ||
3964 | DMA_ATTR_SKIP_CPU_SYNC); | ||
3965 | __page_frag_drain(buffer_info->page, 0, | ||
3966 | buffer_info->pagecnt_bias); | ||
3955 | 3967 | ||
3956 | buffer_info->page = NULL; | 3968 | buffer_info->page = NULL; |
3957 | } | 3969 | } |
@@ -6812,12 +6824,6 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring, | |||
6812 | 6824 | ||
6813 | /* transfer page from old buffer to new buffer */ | 6825 | /* transfer page from old buffer to new buffer */ |
6814 | *new_buff = *old_buff; | 6826 | *new_buff = *old_buff; |
6815 | |||
6816 | /* sync the buffer for use by the device */ | ||
6817 | dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma, | ||
6818 | old_buff->page_offset, | ||
6819 | IGB_RX_BUFSZ, | ||
6820 | DMA_FROM_DEVICE); | ||
6821 | } | 6827 | } |
6822 | 6828 | ||
6823 | static inline bool igb_page_is_reserved(struct page *page) | 6829 | static inline bool igb_page_is_reserved(struct page *page) |
@@ -6829,13 +6835,15 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, | |||
6829 | struct page *page, | 6835 | struct page *page, |
6830 | unsigned int truesize) | 6836 | unsigned int truesize) |
6831 | { | 6837 | { |
6838 | unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--; | ||
6839 | |||
6832 | /* avoid re-using remote pages */ | 6840 | /* avoid re-using remote pages */ |
6833 | if (unlikely(igb_page_is_reserved(page))) | 6841 | if (unlikely(igb_page_is_reserved(page))) |
6834 | return false; | 6842 | return false; |
6835 | 6843 | ||
6836 | #if (PAGE_SIZE < 8192) | 6844 | #if (PAGE_SIZE < 8192) |
6837 | /* if we are only owner of page we can reuse it */ | 6845 | /* if we are only owner of page we can reuse it */ |
6838 | if (unlikely(page_count(page) != 1)) | 6846 | if (unlikely(page_ref_count(page) != pagecnt_bias)) |
6839 | return false; | 6847 | return false; |
6840 | 6848 | ||
6841 | /* flip page offset to other buffer */ | 6849 | /* flip page offset to other buffer */ |
@@ -6848,10 +6856,14 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, | |||
6848 | return false; | 6856 | return false; |
6849 | #endif | 6857 | #endif |
6850 | 6858 | ||
6851 | /* Even if we own the page, we are not allowed to use atomic_set() | 6859 | /* If we have drained the page fragment pool we need to update |
6852 | * This would break get_page_unless_zero() users. | 6860 | * the pagecnt_bias and page count so that we fully restock the |
6861 | * number of references the driver holds. | ||
6853 | */ | 6862 | */ |
6854 | page_ref_inc(page); | 6863 | if (unlikely(pagecnt_bias == 1)) { |
6864 | page_ref_add(page, USHRT_MAX); | ||
6865 | rx_buffer->pagecnt_bias = USHRT_MAX; | ||
6866 | } | ||
6855 | 6867 | ||
6856 | return true; | 6868 | return true; |
6857 | } | 6869 | } |
@@ -6903,7 +6915,6 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, | |||
6903 | return true; | 6915 | return true; |
6904 | 6916 | ||
6905 | /* this page cannot be reused so discard it */ | 6917 | /* this page cannot be reused so discard it */ |
6906 | __free_page(page); | ||
6907 | return false; | 6918 | return false; |
6908 | } | 6919 | } |
6909 | 6920 | ||
@@ -6938,6 +6949,13 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, | |||
6938 | page = rx_buffer->page; | 6949 | page = rx_buffer->page; |
6939 | prefetchw(page); | 6950 | prefetchw(page); |
6940 | 6951 | ||
6952 | /* we are reusing so sync this buffer for CPU use */ | ||
6953 | dma_sync_single_range_for_cpu(rx_ring->dev, | ||
6954 | rx_buffer->dma, | ||
6955 | rx_buffer->page_offset, | ||
6956 | size, | ||
6957 | DMA_FROM_DEVICE); | ||
6958 | |||
6941 | if (likely(!skb)) { | 6959 | if (likely(!skb)) { |
6942 | void *page_addr = page_address(page) + | 6960 | void *page_addr = page_address(page) + |
6943 | rx_buffer->page_offset; | 6961 | rx_buffer->page_offset; |
@@ -6962,21 +6980,18 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, | |||
6962 | prefetchw(skb->data); | 6980 | prefetchw(skb->data); |
6963 | } | 6981 | } |
6964 | 6982 | ||
6965 | /* we are reusing so sync this buffer for CPU use */ | ||
6966 | dma_sync_single_range_for_cpu(rx_ring->dev, | ||
6967 | rx_buffer->dma, | ||
6968 | rx_buffer->page_offset, | ||
6969 | size, | ||
6970 | DMA_FROM_DEVICE); | ||
6971 | |||
6972 | /* pull page into skb */ | 6983 | /* pull page into skb */ |
6973 | if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { | 6984 | if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { |
6974 | /* hand second half of page back to the ring */ | 6985 | /* hand second half of page back to the ring */ |
6975 | igb_reuse_rx_page(rx_ring, rx_buffer); | 6986 | igb_reuse_rx_page(rx_ring, rx_buffer); |
6976 | } else { | 6987 | } else { |
6977 | /* we are not reusing the buffer so unmap it */ | 6988 | /* We are not reusing the buffer so unmap it and free |
6978 | dma_unmap_page(rx_ring->dev, rx_buffer->dma, | 6989 | * any references we are holding to it |
6979 | PAGE_SIZE, DMA_FROM_DEVICE); | 6990 | */ |
6991 | dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, | ||
6992 | PAGE_SIZE, DMA_FROM_DEVICE, | ||
6993 | DMA_ATTR_SKIP_CPU_SYNC); | ||
6994 | __page_frag_drain(page, 0, rx_buffer->pagecnt_bias); | ||
6980 | } | 6995 | } |
6981 | 6996 | ||
6982 | /* clear contents of rx_buffer */ | 6997 | /* clear contents of rx_buffer */ |
@@ -7234,7 +7249,8 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, | |||
7234 | } | 7249 | } |
7235 | 7250 | ||
7236 | /* map page for use */ | 7251 | /* map page for use */ |
7237 | dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); | 7252 | dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, |
7253 | DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); | ||
7238 | 7254 | ||
7239 | /* if mapping failed free memory back to system since | 7255 | /* if mapping failed free memory back to system since |
7240 | * there isn't much point in holding memory we can't use | 7256 | * there isn't much point in holding memory we can't use |
@@ -7249,6 +7265,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, | |||
7249 | bi->dma = dma; | 7265 | bi->dma = dma; |
7250 | bi->page = page; | 7266 | bi->page = page; |
7251 | bi->page_offset = 0; | 7267 | bi->page_offset = 0; |
7268 | bi->pagecnt_bias = 1; | ||
7252 | 7269 | ||
7253 | return true; | 7270 | return true; |
7254 | } | 7271 | } |
@@ -7275,6 +7292,12 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) | |||
7275 | if (!igb_alloc_mapped_page(rx_ring, bi)) | 7292 | if (!igb_alloc_mapped_page(rx_ring, bi)) |
7276 | break; | 7293 | break; |
7277 | 7294 | ||
7295 | /* sync the buffer for use by the device */ | ||
7296 | dma_sync_single_range_for_device(rx_ring->dev, bi->dma, | ||
7297 | bi->page_offset, | ||
7298 | IGB_RX_BUFSZ, | ||
7299 | DMA_FROM_DEVICE); | ||
7300 | |||
7278 | /* Refresh the desc even if buffer_addrs didn't change | 7301 | /* Refresh the desc even if buffer_addrs didn't change |
7279 | * because each write-back erases this info. | 7302 | * because each write-back erases this info. |
7280 | */ | 7303 | */ |
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c index e9cef9de9ed8..c96f9b1d948a 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c | |||
@@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv, | |||
900 | 900 | ||
901 | /* bound gain by 2 bits value max, 3rd bit is sign */ | 901 | /* bound gain by 2 bits value max, 3rd bit is sign */ |
902 | data->delta_gain_code[i] = | 902 | data->delta_gain_code[i] = |
903 | min(abs(delta_g), | 903 | min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE); |
904 | (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); | ||
905 | 904 | ||
906 | if (delta_g < 0) | 905 | if (delta_g < 0) |
907 | /* | 906 | /* |
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index d5cc3070e83f..b653451843c8 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c | |||
@@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
882 | BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); | 882 | BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); |
883 | 883 | ||
884 | pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); | 884 | pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); |
885 | ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); | 885 | ret = vm_insert_pfn(vma, vmf->address, pfn); |
886 | mutex_unlock(&buffer->lock); | 886 | mutex_unlock(&buffer->lock); |
887 | if (ret) | 887 | if (ret) |
888 | return VM_FAULT_ERROR; | 888 | return VM_FAULT_ERROR; |
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 0b6d388d8aa4..697cbfbe9374 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c | |||
@@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) | |||
1014 | "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", | 1014 | "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", |
1015 | vmf->page, vmf->page->mapping, vmf->page->index, | 1015 | vmf->page, vmf->page->mapping, vmf->page->index, |
1016 | (long)vmf->page->flags, page_count(vmf->page), | 1016 | (long)vmf->page->flags, page_count(vmf->page), |
1017 | page_private(vmf->page), vmf->virtual_address); | 1017 | page_private(vmf->page), (void *)vmf->address); |
1018 | if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { | 1018 | if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { |
1019 | lock_page(vmf->page); | 1019 | lock_page(vmf->page); |
1020 | cfio->ft_flags |= VM_FAULT_LOCKED; | 1020 | cfio->ft_flags |= VM_FAULT_LOCKED; |
@@ -1025,12 +1025,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) | |||
1025 | } | 1025 | } |
1026 | 1026 | ||
1027 | if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { | 1027 | if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { |
1028 | CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); | 1028 | CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address); |
1029 | return -EFAULT; | 1029 | return -EFAULT; |
1030 | } | 1030 | } |
1031 | 1031 | ||
1032 | if (cfio->ft_flags & VM_FAULT_OOM) { | 1032 | if (cfio->ft_flags & VM_FAULT_OOM) { |
1033 | CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); | 1033 | CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address); |
1034 | return -ENOMEM; | 1034 | return -ENOMEM; |
1035 | } | 1035 | } |
1036 | 1036 | ||
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index 7abd70b2a588..3151d2a0fe59 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c | |||
@@ -905,7 +905,7 @@ static void hidg_free_inst(struct usb_function_instance *f) | |||
905 | mutex_lock(&hidg_ida_lock); | 905 | mutex_lock(&hidg_ida_lock); |
906 | 906 | ||
907 | hidg_put_minor(opts->minor); | 907 | hidg_put_minor(opts->minor); |
908 | if (idr_is_empty(&hidg_ida.idr)) | 908 | if (ida_is_empty(&hidg_ida)) |
909 | ghid_cleanup(); | 909 | ghid_cleanup(); |
910 | 910 | ||
911 | mutex_unlock(&hidg_ida_lock); | 911 | mutex_unlock(&hidg_ida_lock); |
@@ -931,7 +931,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) | |||
931 | 931 | ||
932 | mutex_lock(&hidg_ida_lock); | 932 | mutex_lock(&hidg_ida_lock); |
933 | 933 | ||
934 | if (idr_is_empty(&hidg_ida.idr)) { | 934 | if (ida_is_empty(&hidg_ida)) { |
935 | status = ghid_setup(NULL, HIDG_MINORS); | 935 | status = ghid_setup(NULL, HIDG_MINORS); |
936 | if (status) { | 936 | if (status) { |
937 | ret = ERR_PTR(status); | 937 | ret = ERR_PTR(status); |
@@ -944,7 +944,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) | |||
944 | if (opts->minor < 0) { | 944 | if (opts->minor < 0) { |
945 | ret = ERR_PTR(opts->minor); | 945 | ret = ERR_PTR(opts->minor); |
946 | kfree(opts); | 946 | kfree(opts); |
947 | if (idr_is_empty(&hidg_ida.idr)) | 947 | if (ida_is_empty(&hidg_ida)) |
948 | ghid_cleanup(); | 948 | ghid_cleanup(); |
949 | goto unlock; | 949 | goto unlock; |
950 | } | 950 | } |
diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 0de36cda6e41..8054da9276dd 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c | |||
@@ -1265,7 +1265,7 @@ static void gprinter_free_inst(struct usb_function_instance *f) | |||
1265 | mutex_lock(&printer_ida_lock); | 1265 | mutex_lock(&printer_ida_lock); |
1266 | 1266 | ||
1267 | gprinter_put_minor(opts->minor); | 1267 | gprinter_put_minor(opts->minor); |
1268 | if (idr_is_empty(&printer_ida.idr)) | 1268 | if (ida_is_empty(&printer_ida)) |
1269 | gprinter_cleanup(); | 1269 | gprinter_cleanup(); |
1270 | 1270 | ||
1271 | mutex_unlock(&printer_ida_lock); | 1271 | mutex_unlock(&printer_ida_lock); |
@@ -1289,7 +1289,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) | |||
1289 | 1289 | ||
1290 | mutex_lock(&printer_ida_lock); | 1290 | mutex_lock(&printer_ida_lock); |
1291 | 1291 | ||
1292 | if (idr_is_empty(&printer_ida.idr)) { | 1292 | if (ida_is_empty(&printer_ida)) { |
1293 | status = gprinter_setup(PRINTER_MINORS); | 1293 | status = gprinter_setup(PRINTER_MINORS); |
1294 | if (status) { | 1294 | if (status) { |
1295 | ret = ERR_PTR(status); | 1295 | ret = ERR_PTR(status); |
@@ -1302,7 +1302,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) | |||
1302 | if (opts->minor < 0) { | 1302 | if (opts->minor < 0) { |
1303 | ret = ERR_PTR(opts->minor); | 1303 | ret = ERR_PTR(opts->minor); |
1304 | kfree(opts); | 1304 | kfree(opts); |
1305 | if (idr_is_empty(&printer_ida.idr)) | 1305 | if (ida_is_empty(&printer_ida)) |
1306 | gprinter_cleanup(); | 1306 | gprinter_cleanup(); |
1307 | goto unlock; | 1307 | goto unlock; |
1308 | } | 1308 | } |
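Both gadget functions above previously poked at the IDA's embedded idr to ask whether any minors were still allocated; the new ida_is_empty() helper makes that a supported API instead. A minimal sketch of the last-user-cleans-up pattern, with illustrative names:

    #include <linux/idr.h>
    #include <linux/mutex.h>

    static DEFINE_IDA(example_ida);
    static DEFINE_MUTEX(example_lock);

    static void example_put_minor(int minor)
    {
            mutex_lock(&example_lock);
            ida_simple_remove(&example_ida, minor);
            if (ida_is_empty(&example_ida))         /* no more &xxx_ida.idr poking */
                    example_cleanup();              /* hypothetical: last user tears down */
            mutex_unlock(&example_lock);
    }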
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9815e45c23c4..f3726ba12aa6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c | |||
@@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, | |||
362 | 362 | ||
363 | down_read(&mm->mmap_sem); | 363 | down_read(&mm->mmap_sem); |
364 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, | 364 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, |
365 | NULL); | 365 | NULL, NULL); |
366 | up_read(&mm->mmap_sem); | 366 | up_read(&mm->mmap_sem); |
367 | } | 367 | } |
368 | 368 | ||
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 702040fe2001..6e3306f4a525 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c | |||
@@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
602 | { | 602 | { |
603 | printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", | 603 | printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", |
604 | vma, vma->vm_start, vma->vm_end, | 604 | vma, vma->vm_start, vma->vm_end, |
605 | vmf->pgoff, vmf->virtual_address); | 605 | vmf->pgoff, (void *)vmf->address); |
606 | 606 | ||
607 | return VM_FAULT_SIGBUS; | 607 | return VM_FAULT_SIGBUS; |
608 | } | 608 | } |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 180f910339f4..3b713b6fcc26 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
@@ -202,12 +202,12 @@ static struct ratelimit_state printk_limits[] = { | |||
202 | void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) | 202 | void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) |
203 | { | 203 | { |
204 | struct super_block *sb = fs_info->sb; | 204 | struct super_block *sb = fs_info->sb; |
205 | char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1]; | 205 | char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; |
206 | struct va_format vaf; | 206 | struct va_format vaf; |
207 | va_list args; | 207 | va_list args; |
208 | const char *type = NULL; | ||
209 | int kern_level; | 208 | int kern_level; |
210 | struct ratelimit_state *ratelimit; | 209 | const char *type = logtypes[4]; |
210 | struct ratelimit_state *ratelimit = &printk_limits[4]; | ||
211 | 211 | ||
212 | va_start(args, fmt); | 212 | va_start(args, fmt); |
213 | 213 | ||
@@ -223,12 +223,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) | |||
223 | fmt += size; | 223 | fmt += size; |
224 | } | 224 | } |
225 | 225 | ||
226 | if (!type) { | ||
227 | *lvl = '\0'; | ||
228 | type = logtypes[4]; | ||
229 | ratelimit = &printk_limits[4]; | ||
230 | } | ||
231 | |||
232 | vaf.fmt = fmt; | 226 | vaf.fmt = fmt; |
233 | vaf.va = &args; | 227 | vaf.va = &args; |
234 | 228 | ||
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index bf62ad919a95..00ee006a8aa2 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c | |||
@@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) | |||
162 | slot = radix_tree_iter_retry(&iter); | 162 | slot = radix_tree_iter_retry(&iter); |
163 | continue; | 163 | continue; |
164 | } | 164 | } |
165 | slot = radix_tree_iter_resume(slot, &iter); | ||
165 | spin_unlock(&fs_info->buffer_lock); | 166 | spin_unlock(&fs_info->buffer_lock); |
166 | free_extent_buffer_stale(eb); | 167 | free_extent_buffer_stale(eb); |
167 | spin_lock(&fs_info->buffer_lock); | 168 | spin_lock(&fs_info->buffer_lock); |
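The btrfs_free_dummy_fs_info() fix above is a user of the reworked radix-tree iterators in this series: once the loop drops buffer_lock, it must call radix_tree_iter_resume() so the next iteration re-walks the tree instead of trusting a slot pointer that may have gone stale. A hedged sketch of the general lock-dropping iteration pattern (tree, lock and the sleeping work are illustrative):

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>

    static void example_drain_tree(struct radix_tree_root *tree, spinlock_t *lock)
    {
            struct radix_tree_iter iter;
            void **slot;

            spin_lock(lock);
            radix_tree_for_each_slot(slot, tree, &iter, 0) {
                    void *item = radix_tree_deref_slot(slot);

                    if (radix_tree_deref_retry(item)) {
                            slot = radix_tree_iter_retry(&iter);
                            continue;
                    }
                    /* about to drop the lock: make the iterator re-walk the
                     * tree past this index on the next pass */
                    slot = radix_tree_iter_resume(slot, &iter);
                    spin_unlock(lock);
                    example_process_item(item);     /* hypothetical, may sleep */
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }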
diff --git a/fs/dax.c b/fs/dax.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/vmstat.h> | 31 | #include <linux/vmstat.h> |
32 | #include <linux/pfn_t.h> | 32 | #include <linux/pfn_t.h> |
33 | #include <linux/sizes.h> | 33 | #include <linux/sizes.h> |
34 | #include <linux/mmu_notifier.h> | ||
34 | #include <linux/iomap.h> | 35 | #include <linux/iomap.h> |
35 | #include "internal.h" | 36 | #include "internal.h" |
36 | 37 | ||
@@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, | |||
240 | } | 241 | } |
241 | } | 242 | } |
242 | 243 | ||
244 | static void dax_unlock_mapping_entry(struct address_space *mapping, | ||
245 | pgoff_t index) | ||
246 | { | ||
247 | void *entry, **slot; | ||
248 | |||
249 | spin_lock_irq(&mapping->tree_lock); | ||
250 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); | ||
251 | if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || | ||
252 | !slot_locked(mapping, slot))) { | ||
253 | spin_unlock_irq(&mapping->tree_lock); | ||
254 | return; | ||
255 | } | ||
256 | unlock_slot(mapping, slot); | ||
257 | spin_unlock_irq(&mapping->tree_lock); | ||
258 | dax_wake_mapping_entry_waiter(mapping, index, entry, false); | ||
259 | } | ||
260 | |||
243 | static void put_locked_mapping_entry(struct address_space *mapping, | 261 | static void put_locked_mapping_entry(struct address_space *mapping, |
244 | pgoff_t index, void *entry) | 262 | pgoff_t index, void *entry) |
245 | { | 263 | { |
@@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, | |||
433 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | 451 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); |
434 | } | 452 | } |
435 | 453 | ||
436 | void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) | ||
437 | { | ||
438 | void *entry, **slot; | ||
439 | |||
440 | spin_lock_irq(&mapping->tree_lock); | ||
441 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); | ||
442 | if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || | ||
443 | !slot_locked(mapping, slot))) { | ||
444 | spin_unlock_irq(&mapping->tree_lock); | ||
445 | return; | ||
446 | } | ||
447 | unlock_slot(mapping, slot); | ||
448 | spin_unlock_irq(&mapping->tree_lock); | ||
449 | dax_wake_mapping_entry_waiter(mapping, index, entry, false); | ||
450 | } | ||
451 | |||
452 | /* | 454 | /* |
453 | * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree | 455 | * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree |
454 | * entry to get unlocked before deleting it. | 456 | * entry to get unlocked before deleting it. |
@@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, | |||
500 | /* This will replace locked radix tree entry with a hole page */ | 502 | /* This will replace locked radix tree entry with a hole page */ |
501 | page = find_or_create_page(mapping, vmf->pgoff, | 503 | page = find_or_create_page(mapping, vmf->pgoff, |
502 | vmf->gfp_mask | __GFP_ZERO); | 504 | vmf->gfp_mask | __GFP_ZERO); |
503 | if (!page) { | 505 | if (!page) |
504 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); | ||
505 | return VM_FAULT_OOM; | 506 | return VM_FAULT_OOM; |
506 | } | ||
507 | vmf->page = page; | 507 | vmf->page = page; |
508 | return VM_FAULT_LOCKED; | 508 | return VM_FAULT_LOCKED; |
509 | } | 509 | } |
@@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, | |||
615 | return new_entry; | 615 | return new_entry; |
616 | } | 616 | } |
617 | 617 | ||
618 | static inline unsigned long | ||
619 | pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) | ||
620 | { | ||
621 | unsigned long address; | ||
622 | |||
623 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
624 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); | ||
625 | return address; | ||
626 | } | ||
627 | |||
628 | /* Walk all mappings of a given index of a file and writeprotect them */ | ||
629 | static void dax_mapping_entry_mkclean(struct address_space *mapping, | ||
630 | pgoff_t index, unsigned long pfn) | ||
631 | { | ||
632 | struct vm_area_struct *vma; | ||
633 | pte_t *ptep; | ||
634 | pte_t pte; | ||
635 | spinlock_t *ptl; | ||
636 | bool changed; | ||
637 | |||
638 | i_mmap_lock_read(mapping); | ||
639 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { | ||
640 | unsigned long address; | ||
641 | |||
642 | cond_resched(); | ||
643 | |||
644 | if (!(vma->vm_flags & VM_SHARED)) | ||
645 | continue; | ||
646 | |||
647 | address = pgoff_address(index, vma); | ||
648 | changed = false; | ||
649 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) | ||
650 | continue; | ||
651 | if (pfn != pte_pfn(*ptep)) | ||
652 | goto unlock; | ||
653 | if (!pte_dirty(*ptep) && !pte_write(*ptep)) | ||
654 | goto unlock; | ||
655 | |||
656 | flush_cache_page(vma, address, pfn); | ||
657 | pte = ptep_clear_flush(vma, address, ptep); | ||
658 | pte = pte_wrprotect(pte); | ||
659 | pte = pte_mkclean(pte); | ||
660 | set_pte_at(vma->vm_mm, address, ptep, pte); | ||
661 | changed = true; | ||
662 | unlock: | ||
663 | pte_unmap_unlock(ptep, ptl); | ||
664 | |||
665 | if (changed) | ||
666 | mmu_notifier_invalidate_page(vma->vm_mm, address); | ||
667 | } | ||
668 | i_mmap_unlock_read(mapping); | ||
669 | } | ||
670 | |||
618 | static int dax_writeback_one(struct block_device *bdev, | 671 | static int dax_writeback_one(struct block_device *bdev, |
619 | struct address_space *mapping, pgoff_t index, void *entry) | 672 | struct address_space *mapping, pgoff_t index, void *entry) |
620 | { | 673 | { |
621 | struct radix_tree_root *page_tree = &mapping->page_tree; | 674 | struct radix_tree_root *page_tree = &mapping->page_tree; |
622 | struct radix_tree_node *node; | ||
623 | struct blk_dax_ctl dax; | 675 | struct blk_dax_ctl dax; |
624 | void **slot; | 676 | void *entry2, **slot; |
625 | int ret = 0; | 677 | int ret = 0; |
626 | 678 | ||
627 | spin_lock_irq(&mapping->tree_lock); | ||
628 | /* | 679 | /* |
629 | * Regular page slots are stabilized by the page lock even | 680 | * A page got tagged dirty in DAX mapping? Something is seriously |
630 | * without the tree itself locked. These unlocked entries | 681 | * wrong. |
631 | * need verification under the tree lock. | ||
632 | */ | 682 | */ |
633 | if (!__radix_tree_lookup(page_tree, index, &node, &slot)) | 683 | if (WARN_ON(!radix_tree_exceptional_entry(entry))) |
634 | goto unlock; | 684 | return -EIO; |
635 | if (*slot != entry) | ||
636 | goto unlock; | ||
637 | |||
638 | /* another fsync thread may have already written back this entry */ | ||
639 | if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) | ||
640 | goto unlock; | ||
641 | 685 | ||
686 | spin_lock_irq(&mapping->tree_lock); | ||
687 | entry2 = get_unlocked_mapping_entry(mapping, index, &slot); | ||
688 | /* Entry got punched out / reallocated? */ | ||
689 | if (!entry2 || !radix_tree_exceptional_entry(entry2)) | ||
690 | goto put_unlocked; | ||
691 | /* | ||
692 | * Entry got reallocated elsewhere? No need to writeback. We have to | ||
693 | * compare sectors as we must not bail out due to difference in lockbit | ||
694 | * or entry type. | ||
695 | */ | ||
696 | if (dax_radix_sector(entry2) != dax_radix_sector(entry)) | ||
697 | goto put_unlocked; | ||
642 | if (WARN_ON_ONCE(dax_is_empty_entry(entry) || | 698 | if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
643 | dax_is_zero_entry(entry))) { | 699 | dax_is_zero_entry(entry))) { |
644 | ret = -EIO; | 700 | ret = -EIO; |
645 | goto unlock; | 701 | goto put_unlocked; |
646 | } | 702 | } |
647 | 703 | ||
704 | /* Another fsync thread may have already written back this entry */ | ||
705 | if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) | ||
706 | goto put_unlocked; | ||
707 | /* Lock the entry to serialize with page faults */ | ||
708 | entry = lock_slot(mapping, slot); | ||
709 | /* | ||
710 | * We can clear the tag now but we have to be careful so that concurrent | ||
711 | * dax_writeback_one() calls for the same index cannot finish before we | ||
712 | * actually flush the caches. This is achieved as the calls will look | ||
713 | * at the entry only under tree_lock and once they do that they will | ||
714 | * see the entry locked and wait for it to unlock. | ||
715 | */ | ||
716 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); | ||
717 | spin_unlock_irq(&mapping->tree_lock); | ||
718 | |||
648 | /* | 719 | /* |
649 | * Even if dax_writeback_mapping_range() was given a wbc->range_start | 720 | * Even if dax_writeback_mapping_range() was given a wbc->range_start |
650 | * in the middle of a PMD, the 'index' we are given will be aligned to | 721 | * in the middle of a PMD, the 'index' we are given will be aligned to |
@@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev, | |||
654 | */ | 725 | */ |
655 | dax.sector = dax_radix_sector(entry); | 726 | dax.sector = dax_radix_sector(entry); |
656 | dax.size = PAGE_SIZE << dax_radix_order(entry); | 727 | dax.size = PAGE_SIZE << dax_radix_order(entry); |
657 | spin_unlock_irq(&mapping->tree_lock); | ||
658 | 728 | ||
659 | /* | 729 | /* |
660 | * We cannot hold tree_lock while calling dax_map_atomic() because it | 730 | * We cannot hold tree_lock while calling dax_map_atomic() because it |
661 | * eventually calls cond_resched(). | 731 | * eventually calls cond_resched(). |
662 | */ | 732 | */ |
663 | ret = dax_map_atomic(bdev, &dax); | 733 | ret = dax_map_atomic(bdev, &dax); |
664 | if (ret < 0) | 734 | if (ret < 0) { |
735 | put_locked_mapping_entry(mapping, index, entry); | ||
665 | return ret; | 736 | return ret; |
737 | } | ||
666 | 738 | ||
667 | if (WARN_ON_ONCE(ret < dax.size)) { | 739 | if (WARN_ON_ONCE(ret < dax.size)) { |
668 | ret = -EIO; | 740 | ret = -EIO; |
669 | goto unmap; | 741 | goto unmap; |
670 | } | 742 | } |
671 | 743 | ||
744 | dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); | ||
672 | wb_cache_pmem(dax.addr, dax.size); | 745 | wb_cache_pmem(dax.addr, dax.size); |
673 | 746 | /* | |
747 | * After we have flushed the cache, we can clear the dirty tag. There | ||
748 | * cannot be new dirty data in the pfn after the flush has completed as | ||
749 | * the pfn mappings are writeprotected and fault waits for mapping | ||
750 | * entry lock. | ||
751 | */ | ||
674 | spin_lock_irq(&mapping->tree_lock); | 752 | spin_lock_irq(&mapping->tree_lock); |
675 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); | 753 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); |
676 | spin_unlock_irq(&mapping->tree_lock); | 754 | spin_unlock_irq(&mapping->tree_lock); |
677 | unmap: | 755 | unmap: |
678 | dax_unmap_atomic(bdev, &dax); | 756 | dax_unmap_atomic(bdev, &dax); |
757 | put_locked_mapping_entry(mapping, index, entry); | ||
679 | return ret; | 758 | return ret; |
680 | 759 | ||
681 | unlock: | 760 | put_unlocked: |
761 | put_unlocked_mapping_entry(mapping, index, entry2); | ||
682 | spin_unlock_irq(&mapping->tree_lock); | 762 | spin_unlock_irq(&mapping->tree_lock); |
683 | return ret; | 763 | return ret; |
684 | } | 764 | } |
@@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping, | |||
738 | struct block_device *bdev, sector_t sector, size_t size, | 818 | struct block_device *bdev, sector_t sector, size_t size, |
739 | void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) | 819 | void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) |
740 | { | 820 | { |
741 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | 821 | unsigned long vaddr = vmf->address; |
742 | struct blk_dax_ctl dax = { | 822 | struct blk_dax_ctl dax = { |
743 | .sector = sector, | 823 | .sector = sector, |
744 | .size = size, | 824 | .size = size, |
@@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
767 | { | 847 | { |
768 | struct file *file = vma->vm_file; | 848 | struct file *file = vma->vm_file; |
769 | struct address_space *mapping = file->f_mapping; | 849 | struct address_space *mapping = file->f_mapping; |
770 | void *entry; | 850 | void *entry, **slot; |
771 | pgoff_t index = vmf->pgoff; | 851 | pgoff_t index = vmf->pgoff; |
772 | 852 | ||
773 | spin_lock_irq(&mapping->tree_lock); | 853 | spin_lock_irq(&mapping->tree_lock); |
774 | entry = get_unlocked_mapping_entry(mapping, index, NULL); | 854 | entry = get_unlocked_mapping_entry(mapping, index, &slot); |
775 | if (!entry || !radix_tree_exceptional_entry(entry)) | 855 | if (!entry || !radix_tree_exceptional_entry(entry)) { |
776 | goto out; | 856 | if (entry) |
857 | put_unlocked_mapping_entry(mapping, index, entry); | ||
858 | spin_unlock_irq(&mapping->tree_lock); | ||
859 | return VM_FAULT_NOPAGE; | ||
860 | } | ||
777 | radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); | 861 | radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); |
778 | put_unlocked_mapping_entry(mapping, index, entry); | 862 | entry = lock_slot(mapping, slot); |
779 | out: | ||
780 | spin_unlock_irq(&mapping->tree_lock); | 863 | spin_unlock_irq(&mapping->tree_lock); |
864 | /* | ||
865 | * If we race with somebody updating the PTE and finish_mkwrite_fault() | ||
866 | * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry | ||
867 | * the fault in either case. | ||
868 | */ | ||
869 | finish_mkwrite_fault(vmf); | ||
870 | put_locked_mapping_entry(mapping, index, entry); | ||
781 | return VM_FAULT_NOPAGE; | 871 | return VM_FAULT_NOPAGE; |
782 | } | 872 | } |
783 | EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); | 873 | EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); |
@@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
948 | { | 1038 | { |
949 | struct address_space *mapping = vma->vm_file->f_mapping; | 1039 | struct address_space *mapping = vma->vm_file->f_mapping; |
950 | struct inode *inode = mapping->host; | 1040 | struct inode *inode = mapping->host; |
951 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | 1041 | unsigned long vaddr = vmf->address; |
952 | loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; | 1042 | loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; |
953 | sector_t sector; | 1043 | sector_t sector; |
954 | struct iomap iomap = { 0 }; | 1044 | struct iomap iomap = { 0 }; |
955 | unsigned flags = IOMAP_FAULT; | 1045 | unsigned flags = IOMAP_FAULT; |
956 | int error, major = 0; | 1046 | int error, major = 0; |
957 | int locked_status = 0; | 1047 | int vmf_ret = 0; |
958 | void *entry; | 1048 | void *entry; |
959 | 1049 | ||
960 | /* | 1050 | /* |
@@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
1007 | 1097 | ||
1008 | if (error) | 1098 | if (error) |
1009 | goto finish_iomap; | 1099 | goto finish_iomap; |
1010 | if (!radix_tree_exceptional_entry(entry)) { | 1100 | |
1011 | vmf->page = entry; | 1101 | __SetPageUptodate(vmf->cow_page); |
1012 | locked_status = VM_FAULT_LOCKED; | 1102 | vmf_ret = finish_fault(vmf); |
1013 | } else { | 1103 | if (!vmf_ret) |
1014 | vmf->entry = entry; | 1104 | vmf_ret = VM_FAULT_DONE_COW; |
1015 | locked_status = VM_FAULT_DAX_LOCKED; | ||
1016 | } | ||
1017 | goto finish_iomap; | 1105 | goto finish_iomap; |
1018 | } | 1106 | } |
1019 | 1107 | ||
@@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
1030 | case IOMAP_UNWRITTEN: | 1118 | case IOMAP_UNWRITTEN: |
1031 | case IOMAP_HOLE: | 1119 | case IOMAP_HOLE: |
1032 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { | 1120 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { |
1033 | locked_status = dax_load_hole(mapping, entry, vmf); | 1121 | vmf_ret = dax_load_hole(mapping, entry, vmf); |
1034 | break; | 1122 | break; |
1035 | } | 1123 | } |
1036 | /*FALLTHRU*/ | 1124 | /*FALLTHRU*/ |
@@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
1042 | 1130 | ||
1043 | finish_iomap: | 1131 | finish_iomap: |
1044 | if (ops->iomap_end) { | 1132 | if (ops->iomap_end) { |
1045 | if (error) { | 1133 | if (error || (vmf_ret & VM_FAULT_ERROR)) { |
1046 | /* keep previous error */ | 1134 | /* keep previous error */ |
1047 | ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, | 1135 | ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, |
1048 | &iomap); | 1136 | &iomap); |
@@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
1052 | } | 1140 | } |
1053 | } | 1141 | } |
1054 | unlock_entry: | 1142 | unlock_entry: |
1055 | if (!locked_status || error) | 1143 | if (vmf_ret != VM_FAULT_LOCKED || error) |
1056 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); | 1144 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); |
1057 | out: | 1145 | out: |
1058 | if (error == -ENOMEM) | 1146 | if (error == -ENOMEM) |
@@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
1060 | /* -EBUSY is fine, somebody else faulted on the same PTE */ | 1148 | /* -EBUSY is fine, somebody else faulted on the same PTE */ |
1061 | if (error < 0 && error != -EBUSY) | 1149 | if (error < 0 && error != -EBUSY) |
1062 | return VM_FAULT_SIGBUS | major; | 1150 | return VM_FAULT_SIGBUS | major; |
1063 | if (locked_status) { | 1151 | if (vmf_ret) { |
1064 | WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ | 1152 | WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ |
1065 | return locked_status; | 1153 | return vmf_ret; |
1066 | } | 1154 | } |
1067 | return VM_FAULT_NOPAGE | major; | 1155 | return VM_FAULT_NOPAGE | major; |
1068 | } | 1156 | } |
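dax_iomap_fault() is still meant to be called from a filesystem's ->fault handler; with this change the handler only sees a final VM_FAULT_* code (including the new VM_FAULT_DONE_COW) instead of a locked DAX entry. A sketch of such a caller, assuming the usual struct iomap_ops table, here named my_iomap_ops:

        #include <linux/dax.h>
        #include <linux/iomap.h>

        static int my_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        {
                /* A real filesystem may need journalling setup for writes first. */
                if (vmf->flags & FAULT_FLAG_WRITE)
                        file_update_time(vma->vm_file);

                return dax_iomap_fault(vma, vmf, &my_iomap_ops);
        }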
@@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, | |||
209 | * doing the exec and bprm->mm is the new process's mm. | 209 | * doing the exec and bprm->mm is the new process's mm. |
210 | */ | 210 | */ |
211 | ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, | 211 | ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, |
212 | &page, NULL); | 212 | &page, NULL, NULL); |
213 | if (ret <= 0) | 213 | if (ret <= 0) |
214 | return NULL; | 214 | return NULL; |
215 | 215 | ||
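The extra NULL is the new int *locked argument added to get_user_pages_remote() (see the prototype change in include/linux/mm.h below); passing NULL keeps the previous behaviour. A hedged sketch of a caller that does use the argument, with pin_one_remote_page() as an invented helper name:

        #include <linux/mm.h>
        #include <linux/sched.h>

        static long pin_one_remote_page(struct task_struct *tsk, struct mm_struct *mm,
                                        unsigned long addr, struct page **pagep)
        {
                int locked = 1;
                long nr;

                down_read(&mm->mmap_sem);
                nr = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE,
                                           pagep, NULL, &locked);
                /*
                 * With a non-NULL 'locked', gup may drop mmap_sem while it waits
                 * for a fault; on return *locked says whether it is still held.
                 */
                if (locked)
                        up_read(&mm->mmap_sem);
                return nr;
        }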
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..d96e2f30084b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c | |||
@@ -257,9 +257,9 @@ out: | |||
257 | * fatal_signal_pending()s, and the mmap_sem must be released before | 257 | * fatal_signal_pending()s, and the mmap_sem must be released before |
258 | * returning it. | 258 | * returning it. |
259 | */ | 259 | */ |
260 | int handle_userfault(struct fault_env *fe, unsigned long reason) | 260 | int handle_userfault(struct vm_fault *vmf, unsigned long reason) |
261 | { | 261 | { |
262 | struct mm_struct *mm = fe->vma->vm_mm; | 262 | struct mm_struct *mm = vmf->vma->vm_mm; |
263 | struct userfaultfd_ctx *ctx; | 263 | struct userfaultfd_ctx *ctx; |
264 | struct userfaultfd_wait_queue uwq; | 264 | struct userfaultfd_wait_queue uwq; |
265 | int ret; | 265 | int ret; |
@@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
268 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 268 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
269 | 269 | ||
270 | ret = VM_FAULT_SIGBUS; | 270 | ret = VM_FAULT_SIGBUS; |
271 | ctx = fe->vma->vm_userfaultfd_ctx.ctx; | 271 | ctx = vmf->vma->vm_userfaultfd_ctx.ctx; |
272 | if (!ctx) | 272 | if (!ctx) |
273 | goto out; | 273 | goto out; |
274 | 274 | ||
@@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
301 | * without first stopping userland access to the memory. For | 301 | * without first stopping userland access to the memory. For |
302 | * VM_UFFD_MISSING userfaults this is enough for now. | 302 | * VM_UFFD_MISSING userfaults this is enough for now. |
303 | */ | 303 | */ |
304 | if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { | 304 | if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { |
305 | /* | 305 | /* |
306 | * Validate the invariant that nowait must allow retry | 306 | * Validate the invariant that nowait must allow retry |
307 | * to be sure not to return SIGBUS erroneously on | 307 | * to be sure not to return SIGBUS erroneously on |
308 | * nowait invocations. | 308 | * nowait invocations. |
309 | */ | 309 | */ |
310 | BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); | 310 | BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); |
311 | #ifdef CONFIG_DEBUG_VM | 311 | #ifdef CONFIG_DEBUG_VM |
312 | if (printk_ratelimit()) { | 312 | if (printk_ratelimit()) { |
313 | printk(KERN_WARNING | 313 | printk(KERN_WARNING |
314 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); | 314 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", |
315 | vmf->flags); | ||
315 | dump_stack(); | 316 | dump_stack(); |
316 | } | 317 | } |
317 | #endif | 318 | #endif |
@@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
323 | * and wait. | 324 | * and wait. |
324 | */ | 325 | */ |
325 | ret = VM_FAULT_RETRY; | 326 | ret = VM_FAULT_RETRY; |
326 | if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) | 327 | if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) |
327 | goto out; | 328 | goto out; |
328 | 329 | ||
329 | /* take the reference before dropping the mmap_sem */ | 330 | /* take the reference before dropping the mmap_sem */ |
@@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
331 | 332 | ||
332 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); | 333 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); |
333 | uwq.wq.private = current; | 334 | uwq.wq.private = current; |
334 | uwq.msg = userfault_msg(fe->address, fe->flags, reason); | 335 | uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); |
335 | uwq.ctx = ctx; | 336 | uwq.ctx = ctx; |
336 | 337 | ||
337 | return_to_userland = | 338 | return_to_userland = |
338 | (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == | 339 | (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == |
339 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); | 340 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); |
340 | 341 | ||
341 | spin_lock(&ctx->fault_pending_wqh.lock); | 342 | spin_lock(&ctx->fault_pending_wqh.lock); |
@@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
353 | TASK_KILLABLE); | 354 | TASK_KILLABLE); |
354 | spin_unlock(&ctx->fault_pending_wqh.lock); | 355 | spin_unlock(&ctx->fault_pending_wqh.lock); |
355 | 356 | ||
356 | must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); | 357 | must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, |
358 | reason); | ||
357 | up_read(&mm->mmap_sem); | 359 | up_read(&mm->mmap_sem); |
358 | 360 | ||
359 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && | 361 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && |
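Apart from the renamed fields this is the same flow as before: the fault path builds one struct vm_fault and the core mm code hands that same structure to handle_userfault() when a registered range has no page. Roughly, as a simplified sketch of that kind of call site (not a verbatim copy of mm/memory.c):

        /* inside a pte fault handler, once no page was found at vmf->address */
        if (userfaultfd_missing(vmf->vma)) {
                pte_unmap(vmf->pte);
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }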
diff --git a/include/linux/dax.h b/include/linux/dax.h index 0afade8bd3d7..f97bcfe79472 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h | |||
@@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, | |||
46 | 46 | ||
47 | #ifdef CONFIG_FS_DAX | 47 | #ifdef CONFIG_FS_DAX |
48 | struct page *read_dax_sector(struct block_device *bdev, sector_t n); | 48 | struct page *read_dax_sector(struct block_device *bdev, sector_t n); |
49 | void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); | ||
50 | int __dax_zero_page_range(struct block_device *bdev, sector_t sector, | 49 | int __dax_zero_page_range(struct block_device *bdev, sector_t sector, |
51 | unsigned int offset, unsigned int length); | 50 | unsigned int offset, unsigned int length); |
52 | #else | 51 | #else |
@@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev, | |||
55 | { | 54 | { |
56 | return ERR_PTR(-ENXIO); | 55 | return ERR_PTR(-ENXIO); |
57 | } | 56 | } |
58 | /* Shouldn't ever be called when dax is disabled. */ | ||
59 | static inline void dax_unlock_mapping_entry(struct address_space *mapping, | ||
60 | pgoff_t index) | ||
61 | { | ||
62 | BUG(); | ||
63 | } | ||
64 | static inline int __dax_zero_page_range(struct block_device *bdev, | 57 | static inline int __dax_zero_page_range(struct block_device *bdev, |
65 | sector_t sector, unsigned int offset, unsigned int length) | 58 | sector_t sector, unsigned int offset, unsigned int length) |
66 | { | 59 | { |
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 08528afdf58b..10c5a17b1f51 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h | |||
@@ -243,29 +243,33 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg | |||
243 | ops->unmap_sg(dev, sg, nents, dir, attrs); | 243 | ops->unmap_sg(dev, sg, nents, dir, attrs); |
244 | } | 244 | } |
245 | 245 | ||
246 | static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, | 246 | static inline dma_addr_t dma_map_page_attrs(struct device *dev, |
247 | size_t offset, size_t size, | 247 | struct page *page, |
248 | enum dma_data_direction dir) | 248 | size_t offset, size_t size, |
249 | enum dma_data_direction dir, | ||
250 | unsigned long attrs) | ||
249 | { | 251 | { |
250 | struct dma_map_ops *ops = get_dma_ops(dev); | 252 | struct dma_map_ops *ops = get_dma_ops(dev); |
251 | dma_addr_t addr; | 253 | dma_addr_t addr; |
252 | 254 | ||
253 | kmemcheck_mark_initialized(page_address(page) + offset, size); | 255 | kmemcheck_mark_initialized(page_address(page) + offset, size); |
254 | BUG_ON(!valid_dma_direction(dir)); | 256 | BUG_ON(!valid_dma_direction(dir)); |
255 | addr = ops->map_page(dev, page, offset, size, dir, 0); | 257 | addr = ops->map_page(dev, page, offset, size, dir, attrs); |
256 | debug_dma_map_page(dev, page, offset, size, dir, addr, false); | 258 | debug_dma_map_page(dev, page, offset, size, dir, addr, false); |
257 | 259 | ||
258 | return addr; | 260 | return addr; |
259 | } | 261 | } |
260 | 262 | ||
261 | static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, | 263 | static inline void dma_unmap_page_attrs(struct device *dev, |
262 | size_t size, enum dma_data_direction dir) | 264 | dma_addr_t addr, size_t size, |
265 | enum dma_data_direction dir, | ||
266 | unsigned long attrs) | ||
263 | { | 267 | { |
264 | struct dma_map_ops *ops = get_dma_ops(dev); | 268 | struct dma_map_ops *ops = get_dma_ops(dev); |
265 | 269 | ||
266 | BUG_ON(!valid_dma_direction(dir)); | 270 | BUG_ON(!valid_dma_direction(dir)); |
267 | if (ops->unmap_page) | 271 | if (ops->unmap_page) |
268 | ops->unmap_page(dev, addr, size, dir, 0); | 272 | ops->unmap_page(dev, addr, size, dir, attrs); |
269 | debug_dma_unmap_page(dev, addr, size, dir, false); | 273 | debug_dma_unmap_page(dev, addr, size, dir, false); |
270 | } | 274 | } |
271 | 275 | ||
@@ -385,6 +389,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, | |||
385 | #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) | 389 | #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) |
386 | #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) | 390 | #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) |
387 | #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) | 391 | #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) |
392 | #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) | ||
393 | #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) | ||
388 | 394 | ||
389 | extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, | 395 | extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, |
390 | void *cpu_addr, dma_addr_t dma_addr, size_t size); | 396 | void *cpu_addr, dma_addr_t dma_addr, size_t size); |
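dma_map_page()/dma_unmap_page() become thin wrappers that pass attrs = 0, mirroring the existing *_attrs variants for single and sg mappings. The point of the new entry points is attributes such as DMA_ATTR_SKIP_CPU_SYNC, which lets a driver keep long-lived RX pages mapped and sync only the bytes it actually uses. A sketch under those assumptions; my_rx_map() and rx_len are placeholder names:

        #include <linux/dma-mapping.h>

        static int my_rx_map(struct device *dev, struct page *page,
                             unsigned int rx_len, dma_addr_t *dmap)
        {
                dma_addr_t dma;

                /* Map once and skip the CPU sync; sync only the used region later. */
                dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE,
                                         DMA_ATTR_SKIP_CPU_SYNC);
                if (dma_mapping_error(dev, dma))
                        return -ENOMEM;

                /* Before handing received data up: */
                dma_sync_single_range_for_cpu(dev, dma, 0, rx_len, DMA_FROM_DEVICE);

                *dmap = dma;
                return 0;
        }

When the page finally leaves the recycle pool, the driver would undo the mapping with dma_unmap_page_attrs(dev, dma, PAGE_SIZE, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC).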
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f8041f9de31e..4175dca4ac39 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold); | |||
506 | extern void free_hot_cold_page_list(struct list_head *list, bool cold); | 506 | extern void free_hot_cold_page_list(struct list_head *list, bool cold); |
507 | 507 | ||
508 | struct page_frag_cache; | 508 | struct page_frag_cache; |
509 | extern void __page_frag_drain(struct page *page, unsigned int order, | ||
510 | unsigned int count); | ||
509 | extern void *__alloc_page_frag(struct page_frag_cache *nc, | 511 | extern void *__alloc_page_frag(struct page_frag_cache *nc, |
510 | unsigned int fragsz, gfp_t gfp_mask); | 512 | unsigned int fragsz, gfp_t gfp_mask); |
511 | extern void __free_page_frag(void *addr); | 513 | extern void __free_page_frag(void *addr); |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1f782aa1d8e6..97e478d6b690 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -1,12 +1,12 @@ | |||
1 | #ifndef _LINUX_HUGE_MM_H | 1 | #ifndef _LINUX_HUGE_MM_H |
2 | #define _LINUX_HUGE_MM_H | 2 | #define _LINUX_HUGE_MM_H |
3 | 3 | ||
4 | extern int do_huge_pmd_anonymous_page(struct fault_env *fe); | 4 | extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); |
5 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 5 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
6 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 6 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
7 | struct vm_area_struct *vma); | 7 | struct vm_area_struct *vma); |
8 | extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); | 8 | extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); |
9 | extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); | 9 | extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); |
10 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | 10 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
11 | unsigned long addr, | 11 | unsigned long addr, |
12 | pmd_t *pmd, | 12 | pmd_t *pmd, |
@@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page) | |||
142 | return 1; | 142 | return 1; |
143 | } | 143 | } |
144 | 144 | ||
145 | extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); | 145 | extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); |
146 | 146 | ||
147 | extern struct page *huge_zero_page; | 147 | extern struct page *huge_zero_page; |
148 | 148 | ||
@@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, | |||
212 | return NULL; | 212 | return NULL; |
213 | } | 213 | } |
214 | 214 | ||
215 | static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) | 215 | static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) |
216 | { | 216 | { |
217 | return 0; | 217 | return 0; |
218 | } | 218 | } |
diff --git a/include/linux/idr.h b/include/linux/idr.h index 083d61e92706..3c01b89aed67 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h | |||
@@ -18,12 +18,11 @@ | |||
18 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * We want shallower trees and thus more bits covered at each layer. 8 | 21 | * Using 6 bits at each layer allows us to allocate 7 layers out of each page. |
22 | * bits gives us large enough first layer for most use cases and maximum | 22 | * 8 bits only gave us 3 layers out of every pair of pages, which is less |
23 | * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and | 23 | * efficient except for trees with a largest element between 192-255 inclusive. |
24 | * 1k on 32bit. | ||
25 | */ | 24 | */ |
26 | #define IDR_BITS 8 | 25 | #define IDR_BITS 6 |
27 | #define IDR_SIZE (1 << IDR_BITS) | 26 | #define IDR_SIZE (1 << IDR_BITS) |
28 | #define IDR_MASK ((1 << IDR_BITS)-1) | 27 | #define IDR_MASK ((1 << IDR_BITS)-1) |
29 | 28 | ||
@@ -56,6 +55,32 @@ struct idr { | |||
56 | #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) | 55 | #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) |
57 | 56 | ||
58 | /** | 57 | /** |
58 | * idr_get_cursor - Return the current position of the cyclic allocator | ||
59 | * @idr: idr handle | ||
60 | * | ||
61 | * The value returned is the value that will be next returned from | ||
62 | * idr_alloc_cyclic() if it is free (otherwise the search will start from | ||
63 | * this position). | ||
64 | */ | ||
65 | static inline unsigned int idr_get_cursor(struct idr *idr) | ||
66 | { | ||
67 | return READ_ONCE(idr->cur); | ||
68 | } | ||
69 | |||
70 | /** | ||
71 | * idr_set_cursor - Set the current position of the cyclic allocator | ||
72 | * @idr: idr handle | ||
73 | * @val: new position | ||
74 | * | ||
75 | * The next call to idr_alloc_cyclic() will return @val if it is free | ||
76 | * (otherwise the search will start from this position). | ||
77 | */ | ||
78 | static inline void idr_set_cursor(struct idr *idr, unsigned int val) | ||
79 | { | ||
80 | WRITE_ONCE(idr->cur, val); | ||
81 | } | ||
82 | |||
83 | /** | ||
59 | * DOC: idr sync | 84 | * DOC: idr sync |
60 | * idr synchronization (stolen from radix-tree.h) | 85 | * idr synchronization (stolen from radix-tree.h) |
61 | * | 86 | * |
@@ -195,6 +220,11 @@ static inline int ida_get_new(struct ida *ida, int *p_id) | |||
195 | return ida_get_new_above(ida, 0, p_id); | 220 | return ida_get_new_above(ida, 0, p_id); |
196 | } | 221 | } |
197 | 222 | ||
223 | static inline bool ida_is_empty(struct ida *ida) | ||
224 | { | ||
225 | return idr_is_empty(&ida->idr); | ||
226 | } | ||
227 | |||
198 | void __init idr_init_cache(void); | 228 | void __init idr_init_cache(void); |
199 | 229 | ||
200 | #endif /* __IDR_H__ */ | 230 | #endif /* __IDR_H__ */ |
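idr_get_cursor()/idr_set_cursor() expose the cyclic allocator's next-id hint without poking at idr internals, and ida_is_empty() mirrors the existing idr_is_empty(). A small sketch of the accessors; my_idr and the function names are made up:

        #include <linux/idr.h>

        static DEFINE_IDR(my_idr);

        static unsigned int my_save_cursor(void)
        {
                /* The id idr_alloc_cyclic() would try next, e.g. for checkpointing. */
                return idr_get_cursor(&my_idr);
        }

        static void my_restore_cursor(unsigned int saved)
        {
                idr_set_cursor(&my_idr, saved);
        }

        static bool my_ida_unused(struct ida *ida)
        {
                return ida_is_empty(ida);       /* no ids currently allocated */
        }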
diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 410decacff8f..68bd88223417 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h | |||
@@ -77,7 +77,6 @@ extern int kdb_poll_idx; | |||
77 | * number whenever the kernel debugger is entered. | 77 | * number whenever the kernel debugger is entered. |
78 | */ | 78 | */ |
79 | extern int kdb_initial_cpu; | 79 | extern int kdb_initial_cpu; |
80 | extern atomic_t kdb_event; | ||
81 | 80 | ||
82 | /* Types and messages used for dynamically added kdb shell commands */ | 81 | /* Types and messages used for dynamically added kdb shell commands */ |
83 | 82 | ||
@@ -162,6 +161,7 @@ enum kdb_msgsrc { | |||
162 | }; | 161 | }; |
163 | 162 | ||
164 | extern int kdb_trap_printk; | 163 | extern int kdb_trap_printk; |
164 | extern int kdb_printf_cpu; | ||
165 | extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, | 165 | extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, |
166 | va_list args); | 166 | va_list args); |
167 | extern __printf(1, 2) int kdb_printf(const char *, ...); | 167 | extern __printf(1, 2) int kdb_printf(const char *, ...); |
diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 406c33dcae13..d7437777baaa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h | |||
@@ -259,12 +259,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); | |||
259 | vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) | 259 | vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) |
260 | #define VMCOREINFO_CONFIG(name) \ | 260 | #define VMCOREINFO_CONFIG(name) \ |
261 | vmcoreinfo_append_str("CONFIG_%s=y\n", #name) | 261 | vmcoreinfo_append_str("CONFIG_%s=y\n", #name) |
262 | #define VMCOREINFO_PAGE_OFFSET(value) \ | ||
263 | vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value) | ||
264 | #define VMCOREINFO_VMALLOC_START(value) \ | ||
265 | vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value) | ||
266 | #define VMCOREINFO_VMEMMAP_START(value) \ | ||
267 | vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value) | ||
268 | 262 | ||
269 | extern struct kimage *kexec_image; | 263 | extern struct kimage *kexec_image; |
270 | extern struct kimage *kexec_crash_image; | 264 | extern struct kimage *kexec_crash_image; |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b5b2e4df14e..4424784ac374 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -292,36 +292,23 @@ extern pgprot_t protection_map[16]; | |||
292 | * pgoff should be used in favour of virtual_address, if possible. | 292 | * pgoff should be used in favour of virtual_address, if possible. |
293 | */ | 293 | */ |
294 | struct vm_fault { | 294 | struct vm_fault { |
295 | struct vm_area_struct *vma; /* Target VMA */ | ||
295 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | 296 | unsigned int flags; /* FAULT_FLAG_xxx flags */ |
296 | gfp_t gfp_mask; /* gfp mask to be used for allocations */ | 297 | gfp_t gfp_mask; /* gfp mask to be used for allocations */ |
297 | pgoff_t pgoff; /* Logical page offset based on vma */ | 298 | pgoff_t pgoff; /* Logical page offset based on vma */ |
298 | void __user *virtual_address; /* Faulting virtual address */ | 299 | unsigned long address; /* Faulting virtual address */ |
300 | pmd_t *pmd; /* Pointer to pmd entry matching | ||
301 | * the 'address' */ | ||
302 | pte_t orig_pte; /* Value of PTE at the time of fault */ | ||
299 | 303 | ||
300 | struct page *cow_page; /* Handler may choose to COW */ | 304 | struct page *cow_page; /* Page handler may use for COW fault */ |
305 | struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ | ||
301 | struct page *page; /* ->fault handlers should return a | 306 | struct page *page; /* ->fault handlers should return a |
302 | * page here, unless VM_FAULT_NOPAGE | 307 | * page here, unless VM_FAULT_NOPAGE |
303 | * is set (which is also implied by | 308 | * is set (which is also implied by |
304 | * VM_FAULT_ERROR). | 309 | * VM_FAULT_ERROR). |
305 | */ | 310 | */ |
306 | void *entry; /* ->fault handler can alternatively | 311 | /* These three entries are valid only while holding ptl lock */ |
307 | * return locked DAX entry. In that | ||
308 | * case handler should return | ||
309 | * VM_FAULT_DAX_LOCKED and fill in | ||
310 | * entry here. | ||
311 | */ | ||
312 | }; | ||
313 | |||
314 | /* | ||
315 | * Page fault context: passes though page fault handler instead of endless list | ||
316 | * of function arguments. | ||
317 | */ | ||
318 | struct fault_env { | ||
319 | struct vm_area_struct *vma; /* Target VMA */ | ||
320 | unsigned long address; /* Faulting virtual address */ | ||
321 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | ||
322 | pmd_t *pmd; /* Pointer to pmd entry matching | ||
323 | * the 'address' | ||
324 | */ | ||
325 | pte_t *pte; /* Pointer to pte entry matching | 312 | pte_t *pte; /* Pointer to pte entry matching |
326 | * the 'address'. NULL if the page | 313 | * the 'address'. NULL if the page |
327 | * table hasn't been allocated. | 314 | * table hasn't been allocated. |
@@ -351,7 +338,7 @@ struct vm_operations_struct { | |||
351 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); | 338 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); |
352 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, | 339 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, |
353 | pmd_t *, unsigned int flags); | 340 | pmd_t *, unsigned int flags); |
354 | void (*map_pages)(struct fault_env *fe, | 341 | void (*map_pages)(struct vm_fault *vmf, |
355 | pgoff_t start_pgoff, pgoff_t end_pgoff); | 342 | pgoff_t start_pgoff, pgoff_t end_pgoff); |
356 | 343 | ||
357 | /* notification that a previously read-only page is about to become | 344 | /* notification that a previously read-only page is about to become |
@@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
625 | return pte; | 612 | return pte; |
626 | } | 613 | } |
627 | 614 | ||
628 | int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | 615 | int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, |
629 | struct page *page); | 616 | struct page *page); |
617 | int finish_fault(struct vm_fault *vmf); | ||
618 | int finish_mkwrite_fault(struct vm_fault *vmf); | ||
630 | #endif | 619 | #endif |
631 | 620 | ||
632 | /* | 621 | /* |
@@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page) | |||
1110 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ | 1099 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
1111 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ | 1100 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ |
1112 | #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ | 1101 | #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ |
1113 | #define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ | 1102 | #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ |
1114 | 1103 | ||
1115 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ | 1104 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ |
1116 | 1105 | ||
@@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
1221 | struct vm_area_struct *vma); | 1210 | struct vm_area_struct *vma); |
1222 | void unmap_mapping_range(struct address_space *mapping, | 1211 | void unmap_mapping_range(struct address_space *mapping, |
1223 | loff_t const holebegin, loff_t const holelen, int even_cows); | 1212 | loff_t const holebegin, loff_t const holelen, int even_cows); |
1213 | int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, | ||
1214 | spinlock_t **ptlp); | ||
1224 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | 1215 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, |
1225 | unsigned long *pfn); | 1216 | unsigned long *pfn); |
1226 | int follow_phys(struct vm_area_struct *vma, unsigned long address, | 1217 | int follow_phys(struct vm_area_struct *vma, unsigned long address, |
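follow_pte() is now declared here so code outside mm/memory.c can look at a PTE under its page-table lock (on success it returns 0 with *ptepp mapped and *ptlp held). A minimal sketch of the expected lock/inspect/unlock pattern, with my_addr_to_pfn() as an invented helper:

        #include <linux/mm.h>

        static int my_addr_to_pfn(struct mm_struct *mm, unsigned long addr,
                                  unsigned long *pfn)
        {
                spinlock_t *ptl;
                pte_t *ptep;
                int ret;

                ret = follow_pte(mm, addr, &ptep, &ptl);
                if (ret)
                        return ret;             /* nothing mapped at 'addr' */

                *pfn = pte_pfn(*ptep);          /* inspect while holding ptl */
                pte_unmap_unlock(ptep, ptl);
                return 0;
        }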
@@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
1276 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, | 1267 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, |
1277 | unsigned long start, unsigned long nr_pages, | 1268 | unsigned long start, unsigned long nr_pages, |
1278 | unsigned int gup_flags, struct page **pages, | 1269 | unsigned int gup_flags, struct page **pages, |
1279 | struct vm_area_struct **vmas); | 1270 | struct vm_area_struct **vmas, int *locked); |
1280 | long get_user_pages(unsigned long start, unsigned long nr_pages, | 1271 | long get_user_pages(unsigned long start, unsigned long nr_pages, |
1281 | unsigned int gup_flags, struct page **pages, | 1272 | unsigned int gup_flags, struct page **pages, |
1282 | struct vm_area_struct **vmas); | 1273 | struct vm_area_struct **vmas); |
1283 | long get_user_pages_locked(unsigned long start, unsigned long nr_pages, | 1274 | long get_user_pages_locked(unsigned long start, unsigned long nr_pages, |
1284 | unsigned int gup_flags, struct page **pages, int *locked); | 1275 | unsigned int gup_flags, struct page **pages, int *locked); |
1285 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
1286 | unsigned long start, unsigned long nr_pages, | ||
1287 | struct page **pages, unsigned int gup_flags); | ||
1288 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, | 1276 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, |
1289 | struct page **pages, unsigned int gup_flags); | 1277 | struct page **pages, unsigned int gup_flags); |
1290 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1278 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
@@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *); | |||
2099 | 2087 | ||
2100 | /* generic vm_area_ops exported for stackable file systems */ | 2088 | /* generic vm_area_ops exported for stackable file systems */ |
2101 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); | 2089 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); |
2102 | extern void filemap_map_pages(struct fault_env *fe, | 2090 | extern void filemap_map_pages(struct vm_fault *vmf, |
2103 | pgoff_t start_pgoff, pgoff_t end_pgoff); | 2091 | pgoff_t start_pgoff, pgoff_t end_pgoff); |
2104 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | 2092 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
2105 | 2093 | ||
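For filesystems and drivers the struct consolidation is mostly a mechanical conversion: vmf->virtual_address (a void __user *) becomes vmf->address (an unsigned long), the vma/pmd/orig_pte context that used to live in fault_env now travels in the same structure, and VM_FAULT_DAX_LOCKED gives way to VM_FAULT_DONE_COW. A sketch of a converted ->fault handler; my_fault() and my_lookup_page() are placeholder names:

        #include <linux/mm.h>

        static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        {
                /* Before: unsigned long addr = (unsigned long)vmf->virtual_address; */
                unsigned long addr = vmf->address;      /* now a plain unsigned long */
                struct page *page;

                page = my_lookup_page(vma, addr);       /* hypothetical lookup helper */
                if (!page)
                        return VM_FAULT_SIGBUS;

                get_page(page);
                vmf->page = page;                       /* handlers still return the page here */
                return 0;
        }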
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a78c35cff1ae..aacca824a6ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
@@ -7,6 +7,23 @@ | |||
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | #include <asm/irq.h> | 8 | #include <asm/irq.h> |
9 | 9 | ||
10 | /* | ||
11 | * The run state of the lockup detectors is controlled by the content of the | ||
12 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
13 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
14 | * | ||
15 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
16 | * are variables that are only used as an 'interface' between the parameters | ||
17 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
18 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
19 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
20 | * is equal zero. | ||
21 | */ | ||
22 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
23 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
24 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
25 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
26 | |||
10 | /** | 27 | /** |
11 | * touch_nmi_watchdog - restart NMI watchdog timeout. | 28 | * touch_nmi_watchdog - restart NMI watchdog timeout. |
12 | * | 29 | * |
@@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled; | |||
91 | extern int soft_watchdog_enabled; | 108 | extern int soft_watchdog_enabled; |
92 | extern int watchdog_user_enabled; | 109 | extern int watchdog_user_enabled; |
93 | extern int watchdog_thresh; | 110 | extern int watchdog_thresh; |
111 | extern unsigned long watchdog_enabled; | ||
94 | extern unsigned long *watchdog_cpumask_bits; | 112 | extern unsigned long *watchdog_cpumask_bits; |
113 | #ifdef CONFIG_SMP | ||
95 | extern int sysctl_softlockup_all_cpu_backtrace; | 114 | extern int sysctl_softlockup_all_cpu_backtrace; |
96 | extern int sysctl_hardlockup_all_cpu_backtrace; | 115 | extern int sysctl_hardlockup_all_cpu_backtrace; |
116 | #else | ||
117 | #define sysctl_softlockup_all_cpu_backtrace 0 | ||
118 | #define sysctl_hardlockup_all_cpu_backtrace 0 | ||
119 | #endif | ||
120 | extern bool is_hardlockup(void); | ||
97 | struct ctl_table; | 121 | struct ctl_table; |
98 | extern int proc_watchdog(struct ctl_table *, int , | 122 | extern int proc_watchdog(struct ctl_table *, int , |
99 | void __user *, size_t *, loff_t *); | 123 | void __user *, size_t *, loff_t *); |
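With the bit layout and watchdog_enabled now visible outside kernel/watchdog.c, other code can test which detectors are active. A purely illustrative sketch:

        #include <linux/nmi.h>
        #include <linux/printk.h>

        static void report_watchdogs(void)
        {
                if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
                        pr_info("hard lockup detector enabled\n");
                if (watchdog_enabled & SOFT_WATCHDOG_ENABLED)
                        pr_info("soft lockup detector enabled\n");
        }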
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 744486057e9e..5dea8f6440e4 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h | |||
@@ -80,23 +80,25 @@ static inline bool radix_tree_is_internal_node(void *ptr) | |||
80 | #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ | 80 | #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ |
81 | RADIX_TREE_MAP_SHIFT)) | 81 | RADIX_TREE_MAP_SHIFT)) |
82 | 82 | ||
83 | /* | ||
84 | * @count is the count of every non-NULL element in the ->slots array | ||
85 | * whether that is an exceptional entry, a retry entry, a user pointer, | ||
86 | * a sibling entry or a pointer to the next level of the tree. | ||
87 | * @exceptional is the count of every element in ->slots which is | ||
88 | * either radix_tree_exceptional_entry() or is a sibling entry for an | ||
89 | * exceptional entry. | ||
90 | */ | ||
83 | struct radix_tree_node { | 91 | struct radix_tree_node { |
84 | unsigned char shift; /* Bits remaining in each slot */ | 92 | unsigned char shift; /* Bits remaining in each slot */ |
85 | unsigned char offset; /* Slot offset in parent */ | 93 | unsigned char offset; /* Slot offset in parent */ |
86 | unsigned char count; /* Total entry count */ | 94 | unsigned char count; /* Total entry count */ |
87 | unsigned char exceptional; /* Exceptional entry count */ | 95 | unsigned char exceptional; /* Exceptional entry count */ |
96 | struct radix_tree_node *parent; /* Used when ascending tree */ | ||
97 | void *private_data; /* For tree user */ | ||
88 | union { | 98 | union { |
89 | struct { | 99 | struct list_head private_list; /* For tree user */ |
90 | /* Used when ascending tree */ | 100 | struct rcu_head rcu_head; /* Used when freeing node */ |
91 | struct radix_tree_node *parent; | ||
92 | /* For tree user */ | ||
93 | void *private_data; | ||
94 | }; | ||
95 | /* Used when freeing node */ | ||
96 | struct rcu_head rcu_head; | ||
97 | }; | 101 | }; |
98 | /* For tree user */ | ||
99 | struct list_head private_list; | ||
100 | void __rcu *slots[RADIX_TREE_MAP_SIZE]; | 102 | void __rcu *slots[RADIX_TREE_MAP_SIZE]; |
101 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; | 103 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; |
102 | }; | 104 | }; |
@@ -127,6 +129,41 @@ static inline bool radix_tree_empty(struct radix_tree_root *root) | |||
127 | } | 129 | } |
128 | 130 | ||
129 | /** | 131 | /** |
132 | * struct radix_tree_iter - radix tree iterator state | ||
133 | * | ||
134 | * @index: index of current slot | ||
135 | * @next_index: one beyond the last index for this chunk | ||
136 | * @tags: bit-mask for tag-iterating | ||
137 | * @node: node that contains current slot | ||
138 | * @shift: shift for the node that holds our slots | ||
139 | * | ||
140 | * This radix tree iterator works in terms of "chunks" of slots. A chunk is a | ||
141 | * subinterval of slots contained within one radix tree leaf node. It is | ||
142 | * described by a pointer to its first slot and a struct radix_tree_iter | ||
143 | * which holds the chunk's position in the tree and its size. For tagged | ||
144 | * iteration radix_tree_iter also holds the slots' bit-mask for one chosen | ||
145 | * radix tree tag. | ||
146 | */ | ||
147 | struct radix_tree_iter { | ||
148 | unsigned long index; | ||
149 | unsigned long next_index; | ||
150 | unsigned long tags; | ||
151 | struct radix_tree_node *node; | ||
152 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
153 | unsigned int shift; | ||
154 | #endif | ||
155 | }; | ||
156 | |||
157 | static inline unsigned int iter_shift(const struct radix_tree_iter *iter) | ||
158 | { | ||
159 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
160 | return iter->shift; | ||
161 | #else | ||
162 | return 0; | ||
163 | #endif | ||
164 | } | ||
165 | |||
166 | /** | ||
130 | * Radix-tree synchronization | 167 | * Radix-tree synchronization |
131 | * | 168 | * |
132 | * The radix-tree API requires that users provide all synchronisation (with | 169 | * The radix-tree API requires that users provide all synchronisation (with |
@@ -264,6 +301,8 @@ void __radix_tree_replace(struct radix_tree_root *root, | |||
264 | struct radix_tree_node *node, | 301 | struct radix_tree_node *node, |
265 | void **slot, void *item, | 302 | void **slot, void *item, |
266 | radix_tree_update_node_t update_node, void *private); | 303 | radix_tree_update_node_t update_node, void *private); |
304 | void radix_tree_iter_replace(struct radix_tree_root *, | ||
305 | const struct radix_tree_iter *, void **slot, void *item); | ||
267 | void radix_tree_replace_slot(struct radix_tree_root *root, | 306 | void radix_tree_replace_slot(struct radix_tree_root *root, |
268 | void **slot, void *item); | 307 | void **slot, void *item); |
269 | void __radix_tree_delete_node(struct radix_tree_root *root, | 308 | void __radix_tree_delete_node(struct radix_tree_root *root, |
@@ -289,6 +328,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, | |||
289 | unsigned long index, unsigned int tag); | 328 | unsigned long index, unsigned int tag); |
290 | int radix_tree_tag_get(struct radix_tree_root *root, | 329 | int radix_tree_tag_get(struct radix_tree_root *root, |
291 | unsigned long index, unsigned int tag); | 330 | unsigned long index, unsigned int tag); |
331 | void radix_tree_iter_tag_set(struct radix_tree_root *root, | ||
332 | const struct radix_tree_iter *iter, unsigned int tag); | ||
292 | unsigned int | 333 | unsigned int |
293 | radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, | 334 | radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, |
294 | unsigned long first_index, unsigned int max_items, | 335 | unsigned long first_index, unsigned int max_items, |
@@ -297,50 +338,18 @@ unsigned int | |||
297 | radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, | 338 | radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, |
298 | unsigned long first_index, unsigned int max_items, | 339 | unsigned long first_index, unsigned int max_items, |
299 | unsigned int tag); | 340 | unsigned int tag); |
300 | unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | ||
301 | unsigned long *first_indexp, unsigned long last_index, | ||
302 | unsigned long nr_to_tag, | ||
303 | unsigned int fromtag, unsigned int totag); | ||
304 | int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); | 341 | int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); |
305 | unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); | ||
306 | 342 | ||
307 | static inline void radix_tree_preload_end(void) | 343 | static inline void radix_tree_preload_end(void) |
308 | { | 344 | { |
309 | preempt_enable(); | 345 | preempt_enable(); |
310 | } | 346 | } |
311 | 347 | ||
312 | /** | 348 | int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t); |
313 | * struct radix_tree_iter - radix tree iterator state | 349 | int radix_tree_split(struct radix_tree_root *, unsigned long index, |
314 | * | 350 | unsigned new_order); |
315 | * @index: index of current slot | 351 | int radix_tree_join(struct radix_tree_root *, unsigned long index, |
316 | * @next_index: one beyond the last index for this chunk | 352 | unsigned new_order, void *); |
317 | * @tags: bit-mask for tag-iterating | ||
318 | * @shift: shift for the node that holds our slots | ||
319 | * | ||
320 | * This radix tree iterator works in terms of "chunks" of slots. A chunk is a | ||
321 | * subinterval of slots contained within one radix tree leaf node. It is | ||
322 | * described by a pointer to its first slot and a struct radix_tree_iter | ||
323 | * which holds the chunk's position in the tree and its size. For tagged | ||
324 | * iteration radix_tree_iter also holds the slots' bit-mask for one chosen | ||
325 | * radix tree tag. | ||
326 | */ | ||
327 | struct radix_tree_iter { | ||
328 | unsigned long index; | ||
329 | unsigned long next_index; | ||
330 | unsigned long tags; | ||
331 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
332 | unsigned int shift; | ||
333 | #endif | ||
334 | }; | ||
335 | |||
336 | static inline unsigned int iter_shift(struct radix_tree_iter *iter) | ||
337 | { | ||
338 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
339 | return iter->shift; | ||
340 | #else | ||
341 | return 0; | ||
342 | #endif | ||
343 | } | ||
344 | 353 | ||
345 | #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ | 354 | #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ |
346 | #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ | 355 | #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ |
@@ -409,20 +418,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) | |||
409 | } | 418 | } |
410 | 419 | ||
411 | /** | 420 | /** |
412 | * radix_tree_iter_next - resume iterating when the chunk may be invalid | 421 | * radix_tree_iter_resume - resume iterating when the chunk may be invalid |
413 | * @iter: iterator state | 422 | * @slot: pointer to current slot |
423 | * @iter: iterator state | ||
424 | * Returns: New slot pointer | ||
414 | * | 425 | * |
415 | * If the iterator needs to release then reacquire a lock, the chunk may | 426 | * If the iterator needs to release then reacquire a lock, the chunk may |
416 | * have been invalidated by an insertion or deletion. Call this function | 427 | * have been invalidated by an insertion or deletion. Call this function |
417 | * to continue the iteration from the next index. | 428 | * before releasing the lock to continue the iteration from the next index. |
418 | */ | 429 | */ |
419 | static inline __must_check | 430 | void **__must_check radix_tree_iter_resume(void **slot, |
420 | void **radix_tree_iter_next(struct radix_tree_iter *iter) | 431 | struct radix_tree_iter *iter); |
421 | { | ||
422 | iter->next_index = __radix_tree_iter_add(iter, 1); | ||
423 | iter->tags = 0; | ||
424 | return NULL; | ||
425 | } | ||
426 | 432 | ||
427 | /** | 433 | /** |
428 | * radix_tree_chunk_size - get current chunk size | 434 | * radix_tree_chunk_size - get current chunk size |
@@ -436,10 +442,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) | |||
436 | return (iter->next_index - iter->index) >> iter_shift(iter); | 442 | return (iter->next_index - iter->index) >> iter_shift(iter); |
437 | } | 443 | } |
438 | 444 | ||
439 | static inline struct radix_tree_node *entry_to_node(void *ptr) | 445 | #ifdef CONFIG_RADIX_TREE_MULTIORDER |
446 | void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, | ||
447 | unsigned flags); | ||
448 | #else | ||
449 | /* Can't happen without sibling entries, but the compiler can't tell that */ | ||
450 | static inline void ** __radix_tree_next_slot(void **slot, | ||
451 | struct radix_tree_iter *iter, unsigned flags) | ||
440 | { | 452 | { |
441 | return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); | 453 | return slot; |
442 | } | 454 | } |
455 | #endif | ||
443 | 456 | ||
444 | /** | 457 | /** |
445 | * radix_tree_next_slot - find next slot in chunk | 458 | * radix_tree_next_slot - find next slot in chunk |
@@ -453,7 +466,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr) | |||
453 | * For tagged lookup it also eats @iter->tags. | 466 | * For tagged lookup it also eats @iter->tags. |
454 | * | 467 | * |
455 | * There are several cases where 'slot' can be passed in as NULL to this | 468 | * There are several cases where 'slot' can be passed in as NULL to this |
456 | * function. These cases result from the use of radix_tree_iter_next() or | 469 | * function. These cases result from the use of radix_tree_iter_resume() or |
457 | * radix_tree_iter_retry(). In these cases we don't end up dereferencing | 470 | * radix_tree_iter_retry(). In these cases we don't end up dereferencing |
458 | * 'slot' because either: | 471 | * 'slot' because either: |
459 | * a) we are doing tagged iteration and iter->tags has been set to 0, or | 472 | * a) we are doing tagged iteration and iter->tags has been set to 0, or |
@@ -464,51 +477,31 @@ static __always_inline void ** | |||
464 | radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) | 477 | radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) |
465 | { | 478 | { |
466 | if (flags & RADIX_TREE_ITER_TAGGED) { | 479 | if (flags & RADIX_TREE_ITER_TAGGED) { |
467 | void *canon = slot; | ||
468 | |||
469 | iter->tags >>= 1; | 480 | iter->tags >>= 1; |
470 | if (unlikely(!iter->tags)) | 481 | if (unlikely(!iter->tags)) |
471 | return NULL; | 482 | return NULL; |
472 | while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && | ||
473 | radix_tree_is_internal_node(slot[1])) { | ||
474 | if (entry_to_node(slot[1]) == canon) { | ||
475 | iter->tags >>= 1; | ||
476 | iter->index = __radix_tree_iter_add(iter, 1); | ||
477 | slot++; | ||
478 | continue; | ||
479 | } | ||
480 | iter->next_index = __radix_tree_iter_add(iter, 1); | ||
481 | return NULL; | ||
482 | } | ||
483 | if (likely(iter->tags & 1ul)) { | 483 | if (likely(iter->tags & 1ul)) { |
484 | iter->index = __radix_tree_iter_add(iter, 1); | 484 | iter->index = __radix_tree_iter_add(iter, 1); |
485 | return slot + 1; | 485 | slot++; |
486 | goto found; | ||
486 | } | 487 | } |
487 | if (!(flags & RADIX_TREE_ITER_CONTIG)) { | 488 | if (!(flags & RADIX_TREE_ITER_CONTIG)) { |
488 | unsigned offset = __ffs(iter->tags); | 489 | unsigned offset = __ffs(iter->tags); |
489 | 490 | ||
490 | iter->tags >>= offset; | 491 | iter->tags >>= offset++; |
491 | iter->index = __radix_tree_iter_add(iter, offset + 1); | 492 | iter->index = __radix_tree_iter_add(iter, offset); |
492 | return slot + offset + 1; | 493 | slot += offset; |
494 | goto found; | ||
493 | } | 495 | } |
494 | } else { | 496 | } else { |
495 | long count = radix_tree_chunk_size(iter); | 497 | long count = radix_tree_chunk_size(iter); |
496 | void *canon = slot; | ||
497 | 498 | ||
498 | while (--count > 0) { | 499 | while (--count > 0) { |
499 | slot++; | 500 | slot++; |
500 | iter->index = __radix_tree_iter_add(iter, 1); | 501 | iter->index = __radix_tree_iter_add(iter, 1); |
501 | 502 | ||
502 | if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && | ||
503 | radix_tree_is_internal_node(*slot)) { | ||
504 | if (entry_to_node(*slot) == canon) | ||
505 | continue; | ||
506 | iter->next_index = iter->index; | ||
507 | break; | ||
508 | } | ||
509 | |||
510 | if (likely(*slot)) | 503 | if (likely(*slot)) |
511 | return slot; | 504 | goto found; |
512 | if (flags & RADIX_TREE_ITER_CONTIG) { | 505 | if (flags & RADIX_TREE_ITER_CONTIG) { |
513 | /* forbid switching to the next chunk */ | 506 | /* forbid switching to the next chunk */ |
514 | iter->next_index = 0; | 507 | iter->next_index = 0; |
@@ -517,6 +510,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) | |||
517 | } | 510 | } |
518 | } | 511 | } |
519 | return NULL; | 512 | return NULL; |
513 | |||
514 | found: | ||
515 | if (unlikely(radix_tree_is_internal_node(*slot))) | ||
516 | return __radix_tree_next_slot(slot, iter, flags); | ||
517 | return slot; | ||
520 | } | 518 | } |
521 | 519 | ||
522 | /** | 520 | /** |
@@ -567,6 +565,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) | |||
567 | slot || (slot = radix_tree_next_chunk(root, iter, \ | 565 | slot || (slot = radix_tree_next_chunk(root, iter, \ |
568 | RADIX_TREE_ITER_TAGGED | tag)) ; \ | 566 | RADIX_TREE_ITER_TAGGED | tag)) ; \ |
569 | slot = radix_tree_next_slot(slot, iter, \ | 567 | slot = radix_tree_next_slot(slot, iter, \ |
570 | RADIX_TREE_ITER_TAGGED)) | 568 | RADIX_TREE_ITER_TAGGED | tag)) |
571 | 569 | ||
572 | #endif /* _LINUX_RADIX_TREE_H */ | 570 | #endif /* _LINUX_RADIX_TREE_H */ |
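The practical difference from the old radix_tree_iter_next() is the calling convention: resume the iterator first, then drop the lock, and keep using the returned slot. A hedged sketch of a tagged walk over a mapping's page_tree that periodically releases tree_lock; handle_entry() is a made-up callback:

        #include <linux/fs.h>
        #include <linux/radix-tree.h>
        #include <linux/sched.h>

        static void walk_dirty_entries(struct address_space *mapping)
        {
                struct radix_tree_iter iter;
                void **slot;

                spin_lock_irq(&mapping->tree_lock);
                radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
                                           PAGECACHE_TAG_DIRTY) {
                        handle_entry(radix_tree_deref_slot(slot));      /* hypothetical */

                        if (need_resched()) {
                                slot = radix_tree_iter_resume(slot, &iter);
                                spin_unlock_irq(&mapping->tree_lock);
                                cond_resched();
                                spin_lock_irq(&mapping->tree_lock);
                        }
                }
                spin_unlock_irq(&mapping->tree_lock);
        }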
diff --git a/include/linux/signal.h b/include/linux/signal.h index b63f63eaa39c..5308304993be 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h | |||
@@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set) | |||
97 | } | 97 | } |
98 | } | 98 | } |
99 | 99 | ||
100 | static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) | ||
101 | { | ||
102 | switch (_NSIG_WORDS) { | ||
103 | case 4: | ||
104 | return (set1->sig[3] == set2->sig[3]) && | ||
105 | (set1->sig[2] == set2->sig[2]) && | ||
106 | (set1->sig[1] == set2->sig[1]) && | ||
107 | (set1->sig[0] == set2->sig[0]); | ||
108 | case 2: | ||
109 | return (set1->sig[1] == set2->sig[1]) && | ||
110 | (set1->sig[0] == set2->sig[0]); | ||
111 | case 1: | ||
112 | return set1->sig[0] == set2->sig[0]; | ||
113 | } | ||
114 | return 0; | ||
115 | } | ||
116 | |||
100 | #define sigmask(sig) (1UL << ((sig) - 1)) | 117 | #define sigmask(sig) (1UL << ((sig) - 1)) |
101 | 118 | ||
102 | #ifndef __HAVE_ARCH_SIG_SETOPS | 119 | #ifndef __HAVE_ARCH_SIG_SETOPS |
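sigequalsets() complements the existing word-count-switch helpers such as sigisemptyset(), giving callers a cheap way to notice that a new mask equals the current one before doing any expensive retargeting. A trivial, purely illustrative sketch:

        #include <linux/sched.h>
        #include <linux/signal.h>

        static bool blocked_mask_unchanged(const struct task_struct *tsk,
                                           const sigset_t *newset)
        {
                /* True when updating tsk->blocked to *newset would be a no-op. */
                return sigequalsets(&tsk->blocked, newset);
        }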
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index dd66a952e8cd..11b92b047a1e 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h | |||
@@ -27,7 +27,7 @@ | |||
27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) | 27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) |
28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) | 28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) |
29 | 29 | ||
30 | extern int handle_userfault(struct fault_env *fe, unsigned long reason); | 30 | extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); |
31 | 31 | ||
32 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, | 32 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, |
33 | unsigned long src_start, unsigned long len); | 33 | unsigned long src_start, unsigned long len); |
@@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) | |||
55 | #else /* CONFIG_USERFAULTFD */ | 55 | #else /* CONFIG_USERFAULTFD */ |
56 | 56 | ||
57 | /* mm helpers */ | 57 | /* mm helpers */ |
58 | static inline int handle_userfault(struct fault_env *fe, unsigned long reason) | 58 | static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason) |
59 | { | 59 | { |
60 | return VM_FAULT_SIGBUS; | 60 | return VM_FAULT_SIGBUS; |
61 | } | 61 | } |
@@ -763,7 +763,10 @@ static inline int convert_mode(long *msgtyp, int msgflg) | |||
763 | if (*msgtyp == 0) | 763 | if (*msgtyp == 0) |
764 | return SEARCH_ANY; | 764 | return SEARCH_ANY; |
765 | if (*msgtyp < 0) { | 765 | if (*msgtyp < 0) { |
766 | *msgtyp = -*msgtyp; | 766 | if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ |
767 | *msgtyp = LONG_MAX; | ||
768 | else | ||
769 | *msgtyp = -*msgtyp; | ||
767 | return SEARCH_LESSEQUAL; | 770 | return SEARCH_LESSEQUAL; |
768 | } | 771 | } |
769 | if (msgflg & MSG_EXCEPT) | 772 | if (msgflg & MSG_EXCEPT) |
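The guard exists because negating LONG_MIN overflows a long (undefined behaviour); clamping to LONG_MAX preserves the "match any type up to |msgtyp|" semantics. The same pattern applies anywhere a user-supplied long is negated; as a standalone sketch:

        #include <linux/kernel.h>

        static long safe_negate(long v)
        {
                if (v == LONG_MIN)      /* -LONG_MIN does not fit in a long */
                        return LONG_MAX;
                return -v;
        }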
@@ -11,6 +11,7 @@ | |||
11 | * (c) 2001 Red Hat Inc | 11 | * (c) 2001 Red Hat Inc |
12 | * Lockless wakeup | 12 | * Lockless wakeup |
13 | * (c) 2003 Manfred Spraul <manfred@colorfullife.com> | 13 | * (c) 2003 Manfred Spraul <manfred@colorfullife.com> |
14 | * (c) 2016 Davidlohr Bueso <dave@stgolabs.net> | ||
14 | * Further wakeup optimizations, documentation | 15 | * Further wakeup optimizations, documentation |
15 | * (c) 2010 Manfred Spraul <manfred@colorfullife.com> | 16 | * (c) 2010 Manfred Spraul <manfred@colorfullife.com> |
16 | * | 17 | * |
@@ -53,15 +54,11 @@ | |||
53 | * Semaphores are actively given to waiting tasks (necessary for FIFO). | 54 | * Semaphores are actively given to waiting tasks (necessary for FIFO). |
54 | * (see update_queue()) | 55 | * (see update_queue()) |
55 | * - To improve the scalability, the actual wake-up calls are performed after | 56 | * - To improve the scalability, the actual wake-up calls are performed after |
56 | * dropping all locks. (see wake_up_sem_queue_prepare(), | 57 | * dropping all locks. (see wake_up_sem_queue_prepare()) |
57 | * wake_up_sem_queue_do()) | ||
58 | * - All work is done by the waker, the woken up task does not have to do | 58 | * - All work is done by the waker, the woken up task does not have to do |
59 | * anything - not even acquiring a lock or dropping a refcount. | 59 | * anything - not even acquiring a lock or dropping a refcount. |
60 | * - A woken up task may not even touch the semaphore array anymore, it may | 60 | * - A woken up task may not even touch the semaphore array anymore, it may |
61 | * have been destroyed already by a semctl(RMID). | 61 | * have been destroyed already by a semctl(RMID). |
62 | * - The synchronizations between wake-ups due to a timeout/signal and a | ||
63 | * wake-up due to a completed semaphore operation is achieved by using an | ||
64 | * intermediate state (IN_WAKEUP). | ||
65 | * - UNDO values are stored in an array (one per process and per | 62 | * - UNDO values are stored in an array (one per process and per |
66 | * semaphore array, lazily allocated). For backwards compatibility, multiple | 63 | * semaphore array, lazily allocated). For backwards compatibility, multiple |
67 | * modes for the UNDO variables are supported (per process, per thread) | 64 | * modes for the UNDO variables are supported (per process, per thread) |
@@ -118,7 +115,8 @@ struct sem_queue { | |||
118 | struct sembuf *sops; /* array of pending operations */ | 115 | struct sembuf *sops; /* array of pending operations */ |
119 | struct sembuf *blocking; /* the operation that blocked */ | 116 | struct sembuf *blocking; /* the operation that blocked */ |
120 | int nsops; /* number of operations */ | 117 | int nsops; /* number of operations */ |
121 | int alter; /* does *sops alter the array? */ | 118 | bool alter; /* does *sops alter the array? */ |
119 | bool dupsop; /* sops on more than one sem_num */ | ||
122 | }; | 120 | }; |
123 | 121 | ||
124 | /* Each task has a list of undo requests. They are executed automatically | 122 | /* Each task has a list of undo requests. They are executed automatically |
@@ -416,29 +414,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) | |||
416 | * | 414 | * |
417 | * The caller holds the RCU read lock. | 415 | * The caller holds the RCU read lock. |
418 | */ | 416 | */ |
419 | static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, | ||
420 | int id, struct sembuf *sops, int nsops, int *locknum) | ||
421 | { | ||
422 | struct kern_ipc_perm *ipcp; | ||
423 | struct sem_array *sma; | ||
424 | |||
425 | ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); | ||
426 | if (IS_ERR(ipcp)) | ||
427 | return ERR_CAST(ipcp); | ||
428 | |||
429 | sma = container_of(ipcp, struct sem_array, sem_perm); | ||
430 | *locknum = sem_lock(sma, sops, nsops); | ||
431 | |||
432 | /* ipc_rmid() may have already freed the ID while sem_lock | ||
433 | * was spinning: verify that the structure is still valid | ||
434 | */ | ||
435 | if (ipc_valid_object(ipcp)) | ||
436 | return container_of(ipcp, struct sem_array, sem_perm); | ||
437 | |||
438 | sem_unlock(sma, *locknum); | ||
439 | return ERR_PTR(-EINVAL); | ||
440 | } | ||
441 | |||
442 | static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) | 417 | static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) |
443 | { | 418 | { |
444 | struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); | 419 | struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); |
@@ -471,40 +446,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) | |||
471 | ipc_rmid(&sem_ids(ns), &s->sem_perm); | 446 | ipc_rmid(&sem_ids(ns), &s->sem_perm); |
472 | } | 447 | } |
473 | 448 | ||
474 | /* | ||
475 | * Lockless wakeup algorithm: | ||
476 | * Without the check/retry algorithm a lockless wakeup is possible: | ||
477 | * - queue.status is initialized to -EINTR before blocking. | ||
478 | * - wakeup is performed by | ||
479 | * * unlinking the queue entry from the pending list | ||
480 | * * setting queue.status to IN_WAKEUP | ||
481 | * This is the notification for the blocked thread that a | ||
482 | * result value is imminent. | ||
483 | * * call wake_up_process | ||
484 | * * set queue.status to the final value. | ||
485 | * - the previously blocked thread checks queue.status: | ||
486 | * * if it's IN_WAKEUP, then it must wait until the value changes | ||
487 | * * if it's not -EINTR, then the operation was completed by | ||
488 | * update_queue. semtimedop can return queue.status without | ||
489 | * performing any operation on the sem array. | ||
490 | * * otherwise it must acquire the spinlock and check what's up. | ||
491 | * | ||
492 | * The two-stage algorithm is necessary to protect against the following | ||
493 | * races: | ||
494 | * - if queue.status is set after wake_up_process, then the woken up idle | ||
495 | * thread could race forward and try (and fail) to acquire sma->lock | ||
496 | * before update_queue had a chance to set queue.status | ||
497 | * - if queue.status is written before wake_up_process and if the | ||
498 | * blocked process is woken up by a signal between writing | ||
499 | * queue.status and the wake_up_process, then the woken up | ||
500 | * process could return from semtimedop and die by calling | ||
501 | * sys_exit before wake_up_process is called. Then wake_up_process | ||
502 | * will oops, because the task structure is already invalid. | ||
503 | * (yes, this happened on s390 with sysv msg). | ||
504 | * | ||
505 | */ | ||
506 | #define IN_WAKEUP 1 | ||
507 | |||
508 | /** | 449 | /** |
509 | * newary - Create a new semaphore set | 450 | * newary - Create a new semaphore set |
510 | * @ns: namespace | 451 | * @ns: namespace |
@@ -624,15 +565,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) | |||
624 | } | 565 | } |
625 | 566 | ||
626 | /** | 567 | /** |
627 | * perform_atomic_semop - Perform (if possible) a semaphore operation | 568 | * perform_atomic_semop[_slow] - Attempt to perform semaphore |
569 | * operations on a given array. | ||
628 | * @sma: semaphore array | 570 | * @sma: semaphore array |
629 | * @q: struct sem_queue that describes the operation | 571 | * @q: struct sem_queue that describes the operation |
630 | * | 572 | * |
573 | * Caller blocking behaviour is as follows, based on the value | ||
574 | * indicated by the semaphore operation (sem_op): | ||
575 | * | ||
576 | * (1) >0 never blocks. | ||
577 | * (2) 0 (wait-for-zero operation): semval is non-zero. | ||
578 | * (3) <0 attempting to decrement semval to a value smaller than zero. | ||
579 | * | ||
631 | * Returns 0 if the operation was possible. | 580 | * Returns 0 if the operation was possible. |
632 | * Returns 1 if the operation is impossible, the caller must sleep. | 581 | * Returns 1 if the operation is impossible, the caller must sleep. |
633 | * Negative values are error codes. | 582 | * Returns <0 for error codes. |
634 | */ | 583 | */ |
635 | static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) | 584 | static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) |
636 | { | 585 | { |
637 | int result, sem_op, nsops, pid; | 586 | int result, sem_op, nsops, pid; |
638 | struct sembuf *sop; | 587 | struct sembuf *sop; |
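The three blocking cases enumerated in the new kerneldoc above are directly observable from user space. The hypothetical test program below (not part of the patch) uses IPC_NOWAIT so that the "caller must sleep" outcome shows up as an immediate EAGAIN instead of actually blocking.

    #include <errno.h>
    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>

    union semun { int val; };

    static int try(int semid, short sem_op)
    {
        struct sembuf op = { .sem_num = 0, .sem_op = sem_op,
                             .sem_flg = IPC_NOWAIT };
        int ret = semop(semid, &op, 1);

        printf("sem_op=%2d -> %s\n", sem_op,
               ret == 0 ? "completed" :
               errno == EAGAIN ? "would block" : "error");
        return ret;
    }

    int main(void)
    {
        int semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
        union semun arg = { .val = 1 };

        semctl(semid, 0, SETVAL, arg);

        try(semid, +1);    /* (1) increments never block; semval becomes 2  */
        try(semid, 0);     /* (2) wait-for-zero: semval != 0 -> would block */
        try(semid, -3);    /* (3) would drive semval below 0 -> would block */
        try(semid, -2);    /* 2 - 2 == 0, so this one completes             */

        semctl(semid, 0, IPC_RMID);
        return 0;
    }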
@@ -703,51 +652,84 @@ undo: | |||
703 | return result; | 652 | return result; |
704 | } | 653 | } |
705 | 654 | ||
706 | /** wake_up_sem_queue_prepare(q, error): Prepare wake-up | 655 | static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) |
707 | * @q: queue entry that must be signaled | ||
708 | * @error: Error value for the signal | ||
709 | * | ||
710 | * Prepare the wake-up of the queue entry q. | ||
711 | */ | ||
712 | static void wake_up_sem_queue_prepare(struct list_head *pt, | ||
713 | struct sem_queue *q, int error) | ||
714 | { | 656 | { |
715 | if (list_empty(pt)) { | 657 | int result, sem_op, nsops; |
716 | /* | 658 | struct sembuf *sop; |
717 | * Hold preempt off so that we don't get preempted and have the | 659 | struct sem *curr; |
718 | * wakee busy-wait until we're scheduled back on. | 660 | struct sembuf *sops; |
719 | */ | 661 | struct sem_undo *un; |
720 | preempt_disable(); | 662 | |
663 | sops = q->sops; | ||
664 | nsops = q->nsops; | ||
665 | un = q->undo; | ||
666 | |||
667 | if (unlikely(q->dupsop)) | ||
668 | return perform_atomic_semop_slow(sma, q); | ||
669 | |||
670 | /* | ||
671 | * We scan the semaphore set twice, first to ensure that the entire | ||
672 | * operation can succeed, therefore avoiding any pointless writes | ||
673 | * to shared memory and having to undo such changes in order to block | ||
674 | * until the operations can go through. | ||
675 | */ | ||
676 | for (sop = sops; sop < sops + nsops; sop++) { | ||
677 | curr = sma->sem_base + sop->sem_num; | ||
678 | sem_op = sop->sem_op; | ||
679 | result = curr->semval; | ||
680 | |||
681 | if (!sem_op && result) | ||
682 | goto would_block; /* wait-for-zero */ | ||
683 | |||
684 | result += sem_op; | ||
685 | if (result < 0) | ||
686 | goto would_block; | ||
687 | |||
688 | if (result > SEMVMX) | ||
689 | return -ERANGE; | ||
690 | |||
691 | if (sop->sem_flg & SEM_UNDO) { | ||
692 | int undo = un->semadj[sop->sem_num] - sem_op; | ||
693 | |||
694 | /* Exceeding the undo range is an error. */ | ||
695 | if (undo < (-SEMAEM - 1) || undo > SEMAEM) | ||
696 | return -ERANGE; | ||
697 | } | ||
698 | } | ||
699 | |||
700 | for (sop = sops; sop < sops + nsops; sop++) { | ||
701 | curr = sma->sem_base + sop->sem_num; | ||
702 | sem_op = sop->sem_op; | ||
703 | result = curr->semval; | ||
704 | |||
705 | if (sop->sem_flg & SEM_UNDO) { | ||
706 | int undo = un->semadj[sop->sem_num] - sem_op; | ||
707 | |||
708 | un->semadj[sop->sem_num] = undo; | ||
709 | } | ||
710 | curr->semval += sem_op; | ||
711 | curr->sempid = q->pid; | ||
721 | } | 712 | } |
722 | q->status = IN_WAKEUP; | ||
723 | q->pid = error; | ||
724 | 713 | ||
725 | list_add_tail(&q->list, pt); | 714 | return 0; |
715 | |||
716 | would_block: | ||
717 | q->blocking = sop; | ||
718 | return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1; | ||
726 | } | 719 | } |
727 | 720 | ||
728 | /** | 721 | static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, |
729 | * wake_up_sem_queue_do - do the actual wake-up | 722 | struct wake_q_head *wake_q) |
730 | * @pt: list of tasks to be woken up | ||
731 | * | ||
732 | * Do the actual wake-up. | ||
733 | * The function is called without any locks held, thus the semaphore array | ||
734 | * could be destroyed already and the tasks can disappear as soon as the | ||
735 | * status is set to the actual return code. | ||
736 | */ | ||
737 | static void wake_up_sem_queue_do(struct list_head *pt) | ||
738 | { | 723 | { |
739 | struct sem_queue *q, *t; | 724 | wake_q_add(wake_q, q->sleeper); |
740 | int did_something; | 725 | /* |
741 | 726 | * Rely on the above implicit barrier, such that we can | |
742 | did_something = !list_empty(pt); | 727 | * ensure that we hold reference to the task before setting |
743 | list_for_each_entry_safe(q, t, pt, list) { | 728 | * q->status. Otherwise we could race with do_exit if the |
744 | wake_up_process(q->sleeper); | 729 | * task is awoken by an external event before calling |
745 | /* q can disappear immediately after writing q->status. */ | 730 | * wake_up_process(). |
746 | smp_wmb(); | 731 | */ |
747 | q->status = q->pid; | 732 | WRITE_ONCE(q->status, error); |
748 | } | ||
749 | if (did_something) | ||
750 | preempt_enable(); | ||
751 | } | 733 | } |
752 | 734 | ||
753 | static void unlink_queue(struct sem_array *sma, struct sem_queue *q) | 735 | static void unlink_queue(struct sem_array *sma, struct sem_queue *q) |
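The replacement for the removed two-stage IN_WAKEUP protocol is the lockless wake-queue: wake_up_sem_queue_prepare() only records the sleeper (wake_q_add() also pins the task) and publishes the result with WRITE_ONCE(); the actual wake_up_process() calls happen later in wake_up_q(), after every lock has been dropped. The fragment below is only a schematic of the caller pattern used throughout this patch; it strings together functions visible in the surrounding hunks and is not buildable on its own.

    /*
     * Schematic caller pattern for the lockless wake-queue, as used by the
     * updated semtimedop()/semctl() paths in this patch.  Not a standalone
     * function; sma/sops/nsops/locknum come from the enclosing code.
     */
    DEFINE_WAKE_Q(wake_q);

    locknum = sem_lock(sma, sops, nsops);
    /* ... update semaphore values ... */
    do_smart_update(sma, sops, nsops, 1, &wake_q); /* queues sleepers, sets q->status */
    sem_unlock(sma, locknum);
    rcu_read_unlock();

    /*
     * Only here are the tasks woken.  wake_q_add() took a reference on each
     * task, so this is safe even if a woken sleeper frees its sem_queue, or
     * the whole array goes away via semctl(RMID), right after it observes
     * its status.
     */
    wake_up_q(&wake_q);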
@@ -767,7 +749,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) | |||
767 | * modified the array. | 749 | * modified the array. |
768 | * Note that wait-for-zero operations are handled without restart. | 750 | * Note that wait-for-zero operations are handled without restart. |
769 | */ | 751 | */ |
770 | static int check_restart(struct sem_array *sma, struct sem_queue *q) | 752 | static inline int check_restart(struct sem_array *sma, struct sem_queue *q) |
771 | { | 753 | { |
772 | /* pending complex alter operations are too difficult to analyse */ | 754 | /* pending complex alter operations are too difficult to analyse */ |
773 | if (!list_empty(&sma->pending_alter)) | 755 | if (!list_empty(&sma->pending_alter)) |
@@ -795,21 +777,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) | |||
795 | * wake_const_ops - wake up non-alter tasks | 777 | * wake_const_ops - wake up non-alter tasks |
796 | * @sma: semaphore array. | 778 | * @sma: semaphore array. |
797 | * @semnum: semaphore that was modified. | 779 | * @semnum: semaphore that was modified. |
798 | * @pt: list head for the tasks that must be woken up. | 780 | * @wake_q: lockless wake-queue head. |
799 | * | 781 | * |
800 | * wake_const_ops must be called after a semaphore in a semaphore array | 782 | * wake_const_ops must be called after a semaphore in a semaphore array |
801 | * was set to 0. If complex const operations are pending, wake_const_ops must | 783 | * was set to 0. If complex const operations are pending, wake_const_ops must |
802 | * be called with semnum = -1, as well as with the number of each modified | 784 | * be called with semnum = -1, as well as with the number of each modified |
803 | * semaphore. | 785 | * semaphore. |
804 | * The tasks that must be woken up are added to @pt. The return code | 786 | * The tasks that must be woken up are added to @wake_q. The return code |
805 | * is stored in q->pid. | 787 | * is stored in q->pid. |
806 | * The function returns 1 if at least one operation was completed successfully. | 788 | * The function returns 1 if at least one operation was completed successfully. |
807 | */ | 789 | */ |
808 | static int wake_const_ops(struct sem_array *sma, int semnum, | 790 | static int wake_const_ops(struct sem_array *sma, int semnum, |
809 | struct list_head *pt) | 791 | struct wake_q_head *wake_q) |
810 | { | 792 | { |
811 | struct sem_queue *q; | 793 | struct sem_queue *q, *tmp; |
812 | struct list_head *walk; | ||
813 | struct list_head *pending_list; | 794 | struct list_head *pending_list; |
814 | int semop_completed = 0; | 795 | int semop_completed = 0; |
815 | 796 | ||
@@ -818,25 +799,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum, | |||
818 | else | 799 | else |
819 | pending_list = &sma->sem_base[semnum].pending_const; | 800 | pending_list = &sma->sem_base[semnum].pending_const; |
820 | 801 | ||
821 | walk = pending_list->next; | 802 | list_for_each_entry_safe(q, tmp, pending_list, list) { |
822 | while (walk != pending_list) { | 803 | int error = perform_atomic_semop(sma, q); |
823 | int error; | ||
824 | |||
825 | q = container_of(walk, struct sem_queue, list); | ||
826 | walk = walk->next; | ||
827 | |||
828 | error = perform_atomic_semop(sma, q); | ||
829 | |||
830 | if (error <= 0) { | ||
831 | /* operation completed, remove from queue & wakeup */ | ||
832 | 804 | ||
833 | unlink_queue(sma, q); | 805 | if (error > 0) |
806 | continue; | ||
807 | /* operation completed, remove from queue & wakeup */ | ||
808 | unlink_queue(sma, q); | ||
834 | 809 | ||
835 | wake_up_sem_queue_prepare(pt, q, error); | 810 | wake_up_sem_queue_prepare(q, error, wake_q); |
836 | if (error == 0) | 811 | if (error == 0) |
837 | semop_completed = 1; | 812 | semop_completed = 1; |
838 | } | ||
839 | } | 813 | } |
814 | |||
840 | return semop_completed; | 815 | return semop_completed; |
841 | } | 816 | } |
842 | 817 | ||
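wake_const_ops() above (and update_queue() further down) now walks the pending list with list_for_each_entry_safe(), which caches the next node before the body runs, so unlink_queue() may remove the current entry mid-walk. The stand-alone program below reproduces the same idiom with a plain singly linked list; the types and names are made up for the illustration.

    #include <stdio.h>
    #include <stdlib.h>

    struct node { int val; struct node *next; };

    int main(void)
    {
        struct node *head = NULL, *n, *next;

        for (int i = 0; i < 5; i++) {       /* builds the list 4,3,2,1,0 */
            n = malloc(sizeof(*n));
            n->val = i;
            n->next = head;
            head = n;
        }

        /* Delete the even entries while walking: fetch ->next first. */
        struct node **pprev = &head;
        for (n = head; n; n = next) {
            next = n->next;                 /* cached before n may go away */
            if (n->val % 2 == 0) {
                *pprev = next;              /* unlink */
                free(n);                    /* safe: 'next' already saved  */
            } else {
                pprev = &n->next;
            }
        }

        for (n = head; n; n = n->next)
            printf("%d ", n->val);
        printf("\n");                       /* prints: 3 1 */
        return 0;
    }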
@@ -845,14 +820,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum, | |||
845 | * @sma: semaphore array | 820 | * @sma: semaphore array |
846 | * @sops: operations that were performed | 821 | * @sops: operations that were performed |
847 | * @nsops: number of operations | 822 | * @nsops: number of operations |
848 | * @pt: list head of the tasks that must be woken up. | 823 | * @wake_q: lockless wake-queue head |
849 | * | 824 | * |
850 | * Checks all required queues for wait-for-zero operations, based | 825 | * Checks all required queues for wait-for-zero operations, based |
851 | * on the actual changes that were performed on the semaphore array. | 826 | * on the actual changes that were performed on the semaphore array. |
852 | * The function returns 1 if at least one operation was completed successfully. | 827 | * The function returns 1 if at least one operation was completed successfully. |
853 | */ | 828 | */ |
854 | static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | 829 | static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, |
855 | int nsops, struct list_head *pt) | 830 | int nsops, struct wake_q_head *wake_q) |
856 | { | 831 | { |
857 | int i; | 832 | int i; |
858 | int semop_completed = 0; | 833 | int semop_completed = 0; |
@@ -865,7 +840,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
865 | 840 | ||
866 | if (sma->sem_base[num].semval == 0) { | 841 | if (sma->sem_base[num].semval == 0) { |
867 | got_zero = 1; | 842 | got_zero = 1; |
868 | semop_completed |= wake_const_ops(sma, num, pt); | 843 | semop_completed |= wake_const_ops(sma, num, wake_q); |
869 | } | 844 | } |
870 | } | 845 | } |
871 | } else { | 846 | } else { |
@@ -876,7 +851,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
876 | for (i = 0; i < sma->sem_nsems; i++) { | 851 | for (i = 0; i < sma->sem_nsems; i++) { |
877 | if (sma->sem_base[i].semval == 0) { | 852 | if (sma->sem_base[i].semval == 0) { |
878 | got_zero = 1; | 853 | got_zero = 1; |
879 | semop_completed |= wake_const_ops(sma, i, pt); | 854 | semop_completed |= wake_const_ops(sma, i, wake_q); |
880 | } | 855 | } |
881 | } | 856 | } |
882 | } | 857 | } |
@@ -885,7 +860,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
885 | * then check the global queue, too. | 860 | * then check the global queue, too. |
886 | */ | 861 | */ |
887 | if (got_zero) | 862 | if (got_zero) |
888 | semop_completed |= wake_const_ops(sma, -1, pt); | 863 | semop_completed |= wake_const_ops(sma, -1, wake_q); |
889 | 864 | ||
890 | return semop_completed; | 865 | return semop_completed; |
891 | } | 866 | } |
@@ -895,22 +870,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
895 | * update_queue - look for tasks that can be completed. | 870 | * update_queue - look for tasks that can be completed. |
896 | * @sma: semaphore array. | 871 | * @sma: semaphore array. |
897 | * @semnum: semaphore that was modified. | 872 | * @semnum: semaphore that was modified. |
898 | * @pt: list head for the tasks that must be woken up. | 873 | * @wake_q: lockless wake-queue head. |
899 | * | 874 | * |
900 | * update_queue must be called after a semaphore in a semaphore array | 875 | * update_queue must be called after a semaphore in a semaphore array |
901 | * was modified. If multiple semaphores were modified, update_queue must | 876 | * was modified. If multiple semaphores were modified, update_queue must |
902 | * be called with semnum = -1, as well as with the number of each modified | 877 | * be called with semnum = -1, as well as with the number of each modified |
903 | * semaphore. | 878 | * semaphore. |
904 | * The tasks that must be woken up are added to @pt. The return code | 879 | * The tasks that must be woken up are added to @wake_q. The return code |
905 | * is stored in q->pid. | 880 | * is stored in q->pid. |
906 | * The function internally checks if const operations can now succeed. | 881 | * The function internally checks if const operations can now succeed. |
907 | * | 882 | * |
908 | * The function returns 1 if at least one semop was completed successfully. | 883 | * The function returns 1 if at least one semop was completed successfully. |
909 | */ | 884 | */ |
910 | static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) | 885 | static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) |
911 | { | 886 | { |
912 | struct sem_queue *q; | 887 | struct sem_queue *q, *tmp; |
913 | struct list_head *walk; | ||
914 | struct list_head *pending_list; | 888 | struct list_head *pending_list; |
915 | int semop_completed = 0; | 889 | int semop_completed = 0; |
916 | 890 | ||
@@ -920,13 +894,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) | |||
920 | pending_list = &sma->sem_base[semnum].pending_alter; | 894 | pending_list = &sma->sem_base[semnum].pending_alter; |
921 | 895 | ||
922 | again: | 896 | again: |
923 | walk = pending_list->next; | 897 | list_for_each_entry_safe(q, tmp, pending_list, list) { |
924 | while (walk != pending_list) { | ||
925 | int error, restart; | 898 | int error, restart; |
926 | 899 | ||
927 | q = container_of(walk, struct sem_queue, list); | ||
928 | walk = walk->next; | ||
929 | |||
930 | /* If we are scanning the single sop, per-semaphore list of | 900 | /* If we are scanning the single sop, per-semaphore list of |
931 | * one semaphore and that semaphore is 0, then it is not | 901 | * one semaphore and that semaphore is 0, then it is not |
932 | * necessary to scan further: simple increments | 902 | * necessary to scan further: simple increments |
@@ -949,11 +919,11 @@ again: | |||
949 | restart = 0; | 919 | restart = 0; |
950 | } else { | 920 | } else { |
951 | semop_completed = 1; | 921 | semop_completed = 1; |
952 | do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); | 922 | do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); |
953 | restart = check_restart(sma, q); | 923 | restart = check_restart(sma, q); |
954 | } | 924 | } |
955 | 925 | ||
956 | wake_up_sem_queue_prepare(pt, q, error); | 926 | wake_up_sem_queue_prepare(q, error, wake_q); |
957 | if (restart) | 927 | if (restart) |
958 | goto again; | 928 | goto again; |
959 | } | 929 | } |
@@ -984,24 +954,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops) | |||
984 | * @sops: operations that were performed | 954 | * @sops: operations that were performed |
985 | * @nsops: number of operations | 955 | * @nsops: number of operations |
986 | * @otime: force setting otime | 956 | * @otime: force setting otime |
987 | * @pt: list head of the tasks that must be woken up. | 957 | * @wake_q: lockless wake-queue head |
988 | * | 958 | * |
989 | * do_smart_update() does the required calls to update_queue and wakeup_zero, | 959 | * do_smart_update() does the required calls to update_queue and wakeup_zero, |
990 | * based on the actual changes that were performed on the semaphore array. | 960 | * based on the actual changes that were performed on the semaphore array. |
991 | * Note that the function does not do the actual wake-up: the caller is | 961 | * Note that the function does not do the actual wake-up: the caller is |
992 | * responsible for calling wake_up_sem_queue_do(@pt). | 962 | * responsible for calling wake_up_q(). |
993 | * It is safe to perform this call after dropping all locks. | 963 | * It is safe to perform this call after dropping all locks. |
994 | */ | 964 | */ |
995 | static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, | 965 | static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, |
996 | int otime, struct list_head *pt) | 966 | int otime, struct wake_q_head *wake_q) |
997 | { | 967 | { |
998 | int i; | 968 | int i; |
999 | 969 | ||
1000 | otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); | 970 | otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); |
1001 | 971 | ||
1002 | if (!list_empty(&sma->pending_alter)) { | 972 | if (!list_empty(&sma->pending_alter)) { |
1003 | /* semaphore array uses the global queue - just process it. */ | 973 | /* semaphore array uses the global queue - just process it. */ |
1004 | otime |= update_queue(sma, -1, pt); | 974 | otime |= update_queue(sma, -1, wake_q); |
1005 | } else { | 975 | } else { |
1006 | if (!sops) { | 976 | if (!sops) { |
1007 | /* | 977 | /* |
@@ -1009,7 +979,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop | |||
1009 | * known. Check all. | 979 | * known. Check all. |
1010 | */ | 980 | */ |
1011 | for (i = 0; i < sma->sem_nsems; i++) | 981 | for (i = 0; i < sma->sem_nsems; i++) |
1012 | otime |= update_queue(sma, i, pt); | 982 | otime |= update_queue(sma, i, wake_q); |
1013 | } else { | 983 | } else { |
1014 | /* | 984 | /* |
1015 | * Check the semaphores that were increased: | 985 | * Check the semaphores that were increased: |
@@ -1023,7 +993,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop | |||
1023 | for (i = 0; i < nsops; i++) { | 993 | for (i = 0; i < nsops; i++) { |
1024 | if (sops[i].sem_op > 0) { | 994 | if (sops[i].sem_op > 0) { |
1025 | otime |= update_queue(sma, | 995 | otime |= update_queue(sma, |
1026 | sops[i].sem_num, pt); | 996 | sops[i].sem_num, wake_q); |
1027 | } | 997 | } |
1028 | } | 998 | } |
1029 | } | 999 | } |
@@ -1111,8 +1081,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
1111 | struct sem_undo *un, *tu; | 1081 | struct sem_undo *un, *tu; |
1112 | struct sem_queue *q, *tq; | 1082 | struct sem_queue *q, *tq; |
1113 | struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); | 1083 | struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); |
1114 | struct list_head tasks; | ||
1115 | int i; | 1084 | int i; |
1085 | DEFINE_WAKE_Q(wake_q); | ||
1116 | 1086 | ||
1117 | /* Free the existing undo structures for this semaphore set. */ | 1087 | /* Free the existing undo structures for this semaphore set. */ |
1118 | ipc_assert_locked_object(&sma->sem_perm); | 1088 | ipc_assert_locked_object(&sma->sem_perm); |
@@ -1126,25 +1096,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
1126 | } | 1096 | } |
1127 | 1097 | ||
1128 | /* Wake up all pending processes and let them fail with EIDRM. */ | 1098 | /* Wake up all pending processes and let them fail with EIDRM. */ |
1129 | INIT_LIST_HEAD(&tasks); | ||
1130 | list_for_each_entry_safe(q, tq, &sma->pending_const, list) { | 1099 | list_for_each_entry_safe(q, tq, &sma->pending_const, list) { |
1131 | unlink_queue(sma, q); | 1100 | unlink_queue(sma, q); |
1132 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1101 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
1133 | } | 1102 | } |
1134 | 1103 | ||
1135 | list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { | 1104 | list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { |
1136 | unlink_queue(sma, q); | 1105 | unlink_queue(sma, q); |
1137 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1106 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
1138 | } | 1107 | } |
1139 | for (i = 0; i < sma->sem_nsems; i++) { | 1108 | for (i = 0; i < sma->sem_nsems; i++) { |
1140 | struct sem *sem = sma->sem_base + i; | 1109 | struct sem *sem = sma->sem_base + i; |
1141 | list_for_each_entry_safe(q, tq, &sem->pending_const, list) { | 1110 | list_for_each_entry_safe(q, tq, &sem->pending_const, list) { |
1142 | unlink_queue(sma, q); | 1111 | unlink_queue(sma, q); |
1143 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1112 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
1144 | } | 1113 | } |
1145 | list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { | 1114 | list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { |
1146 | unlink_queue(sma, q); | 1115 | unlink_queue(sma, q); |
1147 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1116 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
1148 | } | 1117 | } |
1149 | } | 1118 | } |
1150 | 1119 | ||
@@ -1153,7 +1122,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
1153 | sem_unlock(sma, -1); | 1122 | sem_unlock(sma, -1); |
1154 | rcu_read_unlock(); | 1123 | rcu_read_unlock(); |
1155 | 1124 | ||
1156 | wake_up_sem_queue_do(&tasks); | 1125 | wake_up_q(&wake_q); |
1157 | ns->used_sems -= sma->sem_nsems; | 1126 | ns->used_sems -= sma->sem_nsems; |
1158 | ipc_rcu_putref(sma, sem_rcu_free); | 1127 | ipc_rcu_putref(sma, sem_rcu_free); |
1159 | } | 1128 | } |
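freeary() above marks every sleeper with -EIDRM and, only after sem_unlock()/rcu_read_unlock(), runs wake_up_q(). From user space this is observable as a blocked semop() failing with EIDRM when another process removes the set. The program below is an illustration only (error handling omitted, the sleep() is a crude way to let the child block first), not part of the patch.

    #include <errno.h>
    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
        int semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
        /* on Linux a new set starts with semval == 0, so a decrement blocks */

        pid_t pid = fork();
        if (pid == 0) {
            struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };

            if (semop(semid, &op, 1) < 0 && errno == EIDRM)
                printf("child: woken with EIDRM\n");
            _exit(0);
        }

        sleep(1);                       /* crude: let the child block first */
        semctl(semid, 0, IPC_RMID);     /* freeary() runs, the child is woken */
        waitpid(pid, NULL, 0);
        return 0;
    }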
@@ -1292,9 +1261,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
1292 | struct sem_undo *un; | 1261 | struct sem_undo *un; |
1293 | struct sem_array *sma; | 1262 | struct sem_array *sma; |
1294 | struct sem *curr; | 1263 | struct sem *curr; |
1295 | int err; | 1264 | int err, val; |
1296 | struct list_head tasks; | 1265 | DEFINE_WAKE_Q(wake_q); |
1297 | int val; | 1266 | |
1298 | #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) | 1267 | #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) |
1299 | /* big-endian 64bit */ | 1268 | /* big-endian 64bit */ |
1300 | val = arg >> 32; | 1269 | val = arg >> 32; |
@@ -1306,8 +1275,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
1306 | if (val > SEMVMX || val < 0) | 1275 | if (val > SEMVMX || val < 0) |
1307 | return -ERANGE; | 1276 | return -ERANGE; |
1308 | 1277 | ||
1309 | INIT_LIST_HEAD(&tasks); | ||
1310 | |||
1311 | rcu_read_lock(); | 1278 | rcu_read_lock(); |
1312 | sma = sem_obtain_object_check(ns, semid); | 1279 | sma = sem_obtain_object_check(ns, semid); |
1313 | if (IS_ERR(sma)) { | 1280 | if (IS_ERR(sma)) { |
@@ -1350,10 +1317,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
1350 | curr->sempid = task_tgid_vnr(current); | 1317 | curr->sempid = task_tgid_vnr(current); |
1351 | sma->sem_ctime = get_seconds(); | 1318 | sma->sem_ctime = get_seconds(); |
1352 | /* maybe some queued-up processes were waiting for this */ | 1319 | /* maybe some queued-up processes were waiting for this */ |
1353 | do_smart_update(sma, NULL, 0, 0, &tasks); | 1320 | do_smart_update(sma, NULL, 0, 0, &wake_q); |
1354 | sem_unlock(sma, -1); | 1321 | sem_unlock(sma, -1); |
1355 | rcu_read_unlock(); | 1322 | rcu_read_unlock(); |
1356 | wake_up_sem_queue_do(&tasks); | 1323 | wake_up_q(&wake_q); |
1357 | return 0; | 1324 | return 0; |
1358 | } | 1325 | } |
1359 | 1326 | ||
@@ -1365,9 +1332,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
1365 | int err, nsems; | 1332 | int err, nsems; |
1366 | ushort fast_sem_io[SEMMSL_FAST]; | 1333 | ushort fast_sem_io[SEMMSL_FAST]; |
1367 | ushort *sem_io = fast_sem_io; | 1334 | ushort *sem_io = fast_sem_io; |
1368 | struct list_head tasks; | 1335 | DEFINE_WAKE_Q(wake_q); |
1369 | |||
1370 | INIT_LIST_HEAD(&tasks); | ||
1371 | 1336 | ||
1372 | rcu_read_lock(); | 1337 | rcu_read_lock(); |
1373 | sma = sem_obtain_object_check(ns, semid); | 1338 | sma = sem_obtain_object_check(ns, semid); |
@@ -1478,7 +1443,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
1478 | } | 1443 | } |
1479 | sma->sem_ctime = get_seconds(); | 1444 | sma->sem_ctime = get_seconds(); |
1480 | /* maybe some queued-up processes were waiting for this */ | 1445 | /* maybe some queued-up processes were waiting for this */ |
1481 | do_smart_update(sma, NULL, 0, 0, &tasks); | 1446 | do_smart_update(sma, NULL, 0, 0, &wake_q); |
1482 | err = 0; | 1447 | err = 0; |
1483 | goto out_unlock; | 1448 | goto out_unlock; |
1484 | } | 1449 | } |
@@ -1514,7 +1479,7 @@ out_unlock: | |||
1514 | sem_unlock(sma, -1); | 1479 | sem_unlock(sma, -1); |
1515 | out_rcu_wakeup: | 1480 | out_rcu_wakeup: |
1516 | rcu_read_unlock(); | 1481 | rcu_read_unlock(); |
1517 | wake_up_sem_queue_do(&tasks); | 1482 | wake_up_q(&wake_q); |
1518 | out_free: | 1483 | out_free: |
1519 | if (sem_io != fast_sem_io) | 1484 | if (sem_io != fast_sem_io) |
1520 | ipc_free(sem_io); | 1485 | ipc_free(sem_io); |
@@ -1787,32 +1752,6 @@ out: | |||
1787 | return un; | 1752 | return un; |
1788 | } | 1753 | } |
1789 | 1754 | ||
1790 | |||
1791 | /** | ||
1792 | * get_queue_result - retrieve the result code from sem_queue | ||
1793 | * @q: Pointer to queue structure | ||
1794 | * | ||
1795 | * Retrieve the return code from the pending queue. If IN_WAKEUP is found in | ||
1796 | * q->status, then we must loop until the value is replaced with the final | ||
1797 | * value: This may happen if a task is woken up by an unrelated event (e.g. | ||
1798 | * signal) and in parallel the task is woken up by another task because it got | ||
1799 | * the requested semaphores. | ||
1800 | * | ||
1801 | * The function can be called with or without holding the semaphore spinlock. | ||
1802 | */ | ||
1803 | static int get_queue_result(struct sem_queue *q) | ||
1804 | { | ||
1805 | int error; | ||
1806 | |||
1807 | error = q->status; | ||
1808 | while (unlikely(error == IN_WAKEUP)) { | ||
1809 | cpu_relax(); | ||
1810 | error = q->status; | ||
1811 | } | ||
1812 | |||
1813 | return error; | ||
1814 | } | ||
1815 | |||
1816 | SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | 1755 | SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, |
1817 | unsigned, nsops, const struct timespec __user *, timeout) | 1756 | unsigned, nsops, const struct timespec __user *, timeout) |
1818 | { | 1757 | { |
@@ -1821,11 +1760,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
1821 | struct sembuf fast_sops[SEMOPM_FAST]; | 1760 | struct sembuf fast_sops[SEMOPM_FAST]; |
1822 | struct sembuf *sops = fast_sops, *sop; | 1761 | struct sembuf *sops = fast_sops, *sop; |
1823 | struct sem_undo *un; | 1762 | struct sem_undo *un; |
1824 | int undos = 0, alter = 0, max, locknum; | 1763 | int max, locknum; |
1764 | bool undos = false, alter = false, dupsop = false; | ||
1825 | struct sem_queue queue; | 1765 | struct sem_queue queue; |
1826 | unsigned long jiffies_left = 0; | 1766 | unsigned long dup = 0, jiffies_left = 0; |
1827 | struct ipc_namespace *ns; | 1767 | struct ipc_namespace *ns; |
1828 | struct list_head tasks; | ||
1829 | 1768 | ||
1830 | ns = current->nsproxy->ipc_ns; | 1769 | ns = current->nsproxy->ipc_ns; |
1831 | 1770 | ||
@@ -1838,10 +1777,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
1838 | if (sops == NULL) | 1777 | if (sops == NULL) |
1839 | return -ENOMEM; | 1778 | return -ENOMEM; |
1840 | } | 1779 | } |
1780 | |||
1841 | if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { | 1781 | if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { |
1842 | error = -EFAULT; | 1782 | error = -EFAULT; |
1843 | goto out_free; | 1783 | goto out_free; |
1844 | } | 1784 | } |
1785 | |||
1845 | if (timeout) { | 1786 | if (timeout) { |
1846 | struct timespec _timeout; | 1787 | struct timespec _timeout; |
1847 | if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { | 1788 | if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { |
@@ -1855,18 +1796,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
1855 | } | 1796 | } |
1856 | jiffies_left = timespec_to_jiffies(&_timeout); | 1797 | jiffies_left = timespec_to_jiffies(&_timeout); |
1857 | } | 1798 | } |
1799 | |||
1858 | max = 0; | 1800 | max = 0; |
1859 | for (sop = sops; sop < sops + nsops; sop++) { | 1801 | for (sop = sops; sop < sops + nsops; sop++) { |
1802 | unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); | ||
1803 | |||
1860 | if (sop->sem_num >= max) | 1804 | if (sop->sem_num >= max) |
1861 | max = sop->sem_num; | 1805 | max = sop->sem_num; |
1862 | if (sop->sem_flg & SEM_UNDO) | 1806 | if (sop->sem_flg & SEM_UNDO) |
1863 | undos = 1; | 1807 | undos = true; |
1864 | if (sop->sem_op != 0) | 1808 | if (dup & mask) { |
1865 | alter = 1; | 1809 | /* |
1810 | * There was a previous alter access that appears | ||
1811 | * to have accessed the same semaphore, thus use | ||
1812 | * the dupsop logic. "appears", because the detection | ||
1813 | * can only check % BITS_PER_LONG. | ||
1814 | */ | ||
1815 | dupsop = true; | ||
1816 | } | ||
1817 | if (sop->sem_op != 0) { | ||
1818 | alter = true; | ||
1819 | dup |= mask; | ||
1820 | } | ||
1866 | } | 1821 | } |
1867 | 1822 | ||
1868 | INIT_LIST_HEAD(&tasks); | ||
1869 | |||
1870 | if (undos) { | 1823 | if (undos) { |
1871 | /* On success, find_alloc_undo takes the rcu_read_lock */ | 1824 | /* On success, find_alloc_undo takes the rcu_read_lock */ |
1872 | un = find_alloc_undo(ns, semid); | 1825 | un = find_alloc_undo(ns, semid); |
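The new pre-scan above records every altered semaphore number in one unsigned long, keyed by sem_num % BITS_PER_LONG; a second hit on an already-set bit flags dupsop and routes the request through perform_atomic_semop_slow(). Numbers that differ by a multiple of BITS_PER_LONG therefore collide, which is the harmless false positive the comment mentions. The small program below mimics the detection outside the kernel (simplified: it records every operation, while the kernel only records altering ones).

    #include <limits.h>     /* CHAR_BIT */
    #include <stdbool.h>
    #include <stdio.h>

    #define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))

    static bool has_dup(const unsigned short *sem_nums, int n)
    {
        unsigned long dup = 0;
        bool dupsop = false;

        for (int i = 0; i < n; i++) {
            unsigned long mask = 1UL << (sem_nums[i] % BITS_PER_LONG);

            if (dup & mask)
                dupsop = true;  /* possibly the same semaphore twice */
            dup |= mask;
        }
        return dupsop;
    }

    int main(void)
    {
        unsigned short a[] = { 0, 1, 2 };               /* distinct        */
        unsigned short b[] = { 3, 7, 3 };               /* real duplicate  */
        unsigned short c[] = { 1, 1 + BITS_PER_LONG };  /* false positive  */

        printf("%d %d %d\n", has_dup(a, 3), has_dup(b, 3), has_dup(c, 2));
        /* prints: 0 1 1 */
        return 0;
    }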
@@ -1887,16 +1840,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
1887 | } | 1840 | } |
1888 | 1841 | ||
1889 | error = -EFBIG; | 1842 | error = -EFBIG; |
1890 | if (max >= sma->sem_nsems) | 1843 | if (max >= sma->sem_nsems) { |
1891 | goto out_rcu_wakeup; | 1844 | rcu_read_unlock(); |
1845 | goto out_free; | ||
1846 | } | ||
1892 | 1847 | ||
1893 | error = -EACCES; | 1848 | error = -EACCES; |
1894 | if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) | 1849 | if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { |
1895 | goto out_rcu_wakeup; | 1850 | rcu_read_unlock(); |
1851 | goto out_free; | ||
1852 | } | ||
1896 | 1853 | ||
1897 | error = security_sem_semop(sma, sops, nsops, alter); | 1854 | error = security_sem_semop(sma, sops, nsops, alter); |
1898 | if (error) | 1855 | if (error) { |
1899 | goto out_rcu_wakeup; | 1856 | rcu_read_unlock(); |
1857 | goto out_free; | ||
1858 | } | ||
1900 | 1859 | ||
1901 | error = -EIDRM; | 1860 | error = -EIDRM; |
1902 | locknum = sem_lock(sma, sops, nsops); | 1861 | locknum = sem_lock(sma, sops, nsops); |
@@ -1925,24 +1884,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
1925 | queue.undo = un; | 1884 | queue.undo = un; |
1926 | queue.pid = task_tgid_vnr(current); | 1885 | queue.pid = task_tgid_vnr(current); |
1927 | queue.alter = alter; | 1886 | queue.alter = alter; |
1887 | queue.dupsop = dupsop; | ||
1928 | 1888 | ||
1929 | error = perform_atomic_semop(sma, &queue); | 1889 | error = perform_atomic_semop(sma, &queue); |
1930 | if (error == 0) { | 1890 | if (error == 0) { /* non-blocking successful path */ |
1931 | /* If the operation was successful, then do | 1891 | DEFINE_WAKE_Q(wake_q); |
1892 | |||
1893 | /* | ||
1894 | * If the operation was successful, then do | ||
1932 | * the required updates. | 1895 | * the required updates. |
1933 | */ | 1896 | */ |
1934 | if (alter) | 1897 | if (alter) |
1935 | do_smart_update(sma, sops, nsops, 1, &tasks); | 1898 | do_smart_update(sma, sops, nsops, 1, &wake_q); |
1936 | else | 1899 | else |
1937 | set_semotime(sma, sops); | 1900 | set_semotime(sma, sops); |
1901 | |||
1902 | sem_unlock(sma, locknum); | ||
1903 | rcu_read_unlock(); | ||
1904 | wake_up_q(&wake_q); | ||
1905 | |||
1906 | goto out_free; | ||
1938 | } | 1907 | } |
1939 | if (error <= 0) | 1908 | if (error < 0) /* non-blocking error path */ |
1940 | goto out_unlock_free; | 1909 | goto out_unlock_free; |
1941 | 1910 | ||
1942 | /* We need to sleep on this operation, so we put the current | 1911 | /* |
1912 | * We need to sleep on this operation, so we put the current | ||
1943 | * task into the pending queue and go to sleep. | 1913 | * task into the pending queue and go to sleep. |
1944 | */ | 1914 | */ |
1945 | |||
1946 | if (nsops == 1) { | 1915 | if (nsops == 1) { |
1947 | struct sem *curr; | 1916 | struct sem *curr; |
1948 | curr = &sma->sem_base[sops->sem_num]; | 1917 | curr = &sma->sem_base[sops->sem_num]; |
@@ -1971,77 +1940,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
1971 | sma->complex_count++; | 1940 | sma->complex_count++; |
1972 | } | 1941 | } |
1973 | 1942 | ||
1974 | queue.status = -EINTR; | 1943 | do { |
1975 | queue.sleeper = current; | 1944 | queue.status = -EINTR; |
1945 | queue.sleeper = current; | ||
1976 | 1946 | ||
1977 | sleep_again: | 1947 | __set_current_state(TASK_INTERRUPTIBLE); |
1978 | __set_current_state(TASK_INTERRUPTIBLE); | 1948 | sem_unlock(sma, locknum); |
1979 | sem_unlock(sma, locknum); | 1949 | rcu_read_unlock(); |
1980 | rcu_read_unlock(); | ||
1981 | |||
1982 | if (timeout) | ||
1983 | jiffies_left = schedule_timeout(jiffies_left); | ||
1984 | else | ||
1985 | schedule(); | ||
1986 | 1950 | ||
1987 | error = get_queue_result(&queue); | 1951 | if (timeout) |
1952 | jiffies_left = schedule_timeout(jiffies_left); | ||
1953 | else | ||
1954 | schedule(); | ||
1988 | 1955 | ||
1989 | if (error != -EINTR) { | 1956 | /* |
1990 | /* fast path: update_queue already obtained all requested | 1957 | * fastpath: the semop has completed, either successfully or |
1992 | * resources. | 1958 | * not; from the syscall pov that is quite irrelevant to us at this |
1992 | * Perform a smp_mb(): User space could assume that semop() | 1959 | * point; we're done. |
1993 | * is a memory barrier: Without the mb(), the cpu could | 1960 | * |
1994 | * speculatively read in user space stale data that was | 1961 | * We _do_ care, nonetheless, about being awoken by a signal or |
1995 | * overwritten by the previous owner of the semaphore. | 1962 | * spuriously. The queue.status is checked again in the |
1963 | * slowpath (aka after taking sem_lock), such that we can detect | ||
1964 | * scenarios where we were awakened externally, during the | ||
1965 | * window between wake_q_add() and wake_up_q(). | ||
1996 | */ | 1966 | */ |
1997 | smp_mb(); | 1967 | error = READ_ONCE(queue.status); |
1998 | 1968 | if (error != -EINTR) { | |
1999 | goto out_free; | 1969 | /* |
2000 | } | 1970 | * User space could assume that semop() is a memory |
2001 | 1971 | * barrier: Without the mb(), the cpu could | |
2002 | rcu_read_lock(); | 1972 | * speculatively read in userspace stale data that was |
2003 | sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); | 1973 | * overwritten by the previous owner of the semaphore. |
2004 | 1974 | */ | |
2005 | /* | 1975 | smp_mb(); |
2006 | * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. | 1976 | goto out_free; |
2007 | */ | 1977 | } |
2008 | error = get_queue_result(&queue); | ||
2009 | 1978 | ||
2010 | /* | 1979 | rcu_read_lock(); |
2011 | * Array removed? If yes, leave without sem_unlock(). | 1980 | sem_lock(sma, sops, nsops); |
2012 | */ | ||
2013 | if (IS_ERR(sma)) { | ||
2014 | rcu_read_unlock(); | ||
2015 | goto out_free; | ||
2016 | } | ||
2017 | 1981 | ||
1982 | if (!ipc_valid_object(&sma->sem_perm)) | ||
1983 | goto out_unlock_free; | ||
2018 | 1984 | ||
2019 | /* | 1985 | error = READ_ONCE(queue.status); |
2020 | * If queue.status != -EINTR we are woken up by another process. | ||
2021 | * Leave without unlink_queue(), but with sem_unlock(). | ||
2022 | */ | ||
2023 | if (error != -EINTR) | ||
2024 | goto out_unlock_free; | ||
2025 | 1986 | ||
2026 | /* | 1987 | /* |
2027 | * If an interrupt occurred we have to clean up the queue | 1988 | * If queue.status != -EINTR we are woken up by another process. |
2028 | */ | 1989 | * Leave without unlink_queue(), but with sem_unlock(). |
2029 | if (timeout && jiffies_left == 0) | 1990 | */ |
2030 | error = -EAGAIN; | 1991 | if (error != -EINTR) |
1992 | goto out_unlock_free; | ||
2031 | 1993 | ||
2032 | /* | 1994 | /* |
2033 | * If the wakeup was spurious, just retry | 1995 | * If an interrupt occurred we have to clean up the queue. |
2034 | */ | 1996 | */ |
2035 | if (error == -EINTR && !signal_pending(current)) | 1997 | if (timeout && jiffies_left == 0) |
2036 | goto sleep_again; | 1998 | error = -EAGAIN; |
1999 | } while (error == -EINTR && !signal_pending(current)); /* spurious */ | ||
2037 | 2000 | ||
2038 | unlink_queue(sma, &queue); | 2001 | unlink_queue(sma, &queue); |
2039 | 2002 | ||
2040 | out_unlock_free: | 2003 | out_unlock_free: |
2041 | sem_unlock(sma, locknum); | 2004 | sem_unlock(sma, locknum); |
2042 | out_rcu_wakeup: | ||
2043 | rcu_read_unlock(); | 2005 | rcu_read_unlock(); |
2044 | wake_up_sem_queue_do(&tasks); | ||
2045 | out_free: | 2006 | out_free: |
2046 | if (sops != fast_sops) | 2007 | if (sops != fast_sops) |
2047 | kfree(sops); | 2008 | kfree(sops); |
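The rewritten sleep loop above is what produces the familiar semtimedop() results: success once the operation goes through, EAGAIN when jiffies_left reaches zero, EINTR for a signal, EIDRM if the set is removed while sleeping. Below is a user-space illustration of the timeout path; it is not part of the patch, and semtimedop() needs _GNU_SOURCE with glibc.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/sem.h>
    #include <time.h>

    int main(void)
    {
        int semid = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
        struct sembuf op = { .sem_num = 0, .sem_op = -1, .sem_flg = 0 };
        struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

        /* semval is 0, so the decrement sleeps until the 1s timeout fires */
        if (semtimedop(semid, &op, 1, &ts) < 0) {
            if (errno == EAGAIN)
                printf("timed out (jiffies_left reached 0)\n");
            else if (errno == EINTR)
                printf("interrupted by a signal\n");
            else if (errno == EIDRM)
                printf("semaphore set was removed\n");
        }

        semctl(semid, 0, IPC_RMID);
        return 0;
    }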
@@ -2102,8 +2063,8 @@ void exit_sem(struct task_struct *tsk) | |||
2102 | for (;;) { | 2063 | for (;;) { |
2103 | struct sem_array *sma; | 2064 | struct sem_array *sma; |
2104 | struct sem_undo *un; | 2065 | struct sem_undo *un; |
2105 | struct list_head tasks; | ||
2106 | int semid, i; | 2066 | int semid, i; |
2067 | DEFINE_WAKE_Q(wake_q); | ||
2107 | 2068 | ||
2108 | cond_resched(); | 2069 | cond_resched(); |
2109 | 2070 | ||
@@ -2191,11 +2152,10 @@ void exit_sem(struct task_struct *tsk) | |||
2191 | } | 2152 | } |
2192 | } | 2153 | } |
2193 | /* maybe some queued-up processes were waiting for this */ | 2154 | /* maybe some queued-up processes were waiting for this */ |
2194 | INIT_LIST_HEAD(&tasks); | 2155 | do_smart_update(sma, NULL, 0, 1, &wake_q); |
2195 | do_smart_update(sma, NULL, 0, 1, &tasks); | ||
2196 | sem_unlock(sma, -1); | 2156 | sem_unlock(sma, -1); |
2197 | rcu_read_unlock(); | 2157 | rcu_read_unlock(); |
2198 | wake_up_sem_queue_do(&tasks); | 2158 | wake_up_q(&wake_q); |
2199 | 2159 | ||
2200 | kfree_rcu(un, rcu); | 2160 | kfree_rcu(un, rcu); |
2201 | } | 2161 | } |
@@ -89,6 +89,7 @@ void shm_init_ns(struct ipc_namespace *ns) | |||
89 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | 89 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) |
90 | { | 90 | { |
91 | struct shmid_kernel *shp; | 91 | struct shmid_kernel *shp; |
92 | |||
92 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 93 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
93 | 94 | ||
94 | if (shp->shm_nattch) { | 95 | if (shp->shm_nattch) { |
@@ -387,6 +388,7 @@ static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | |||
387 | struct file *file = vma->vm_file; | 388 | struct file *file = vma->vm_file; |
388 | struct shm_file_data *sfd = shm_file_data(file); | 389 | struct shm_file_data *sfd = shm_file_data(file); |
389 | int err = 0; | 390 | int err = 0; |
391 | |||
390 | if (sfd->vm_ops->set_policy) | 392 | if (sfd->vm_ops->set_policy) |
391 | err = sfd->vm_ops->set_policy(vma, new); | 393 | err = sfd->vm_ops->set_policy(vma, new); |
392 | return err; | 394 | return err; |
@@ -417,7 +419,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma) | |||
417 | * In case of remap_file_pages() emulation, the file can represent | 419 | * In case of remap_file_pages() emulation, the file can represent |
418 | * removed IPC ID: propagate shm_lock() error to caller. | 420 | * removed IPC ID: propagate shm_lock() error to caller. |
419 | */ | 421 | */ |
420 | ret =__shm_open(vma); | 422 | ret = __shm_open(vma); |
421 | if (ret) | 423 | if (ret) |
422 | return ret; | 424 | return ret; |
423 | 425 | ||
@@ -468,6 +470,7 @@ static unsigned long shm_get_unmapped_area(struct file *file, | |||
468 | unsigned long flags) | 470 | unsigned long flags) |
469 | { | 471 | { |
470 | struct shm_file_data *sfd = shm_file_data(file); | 472 | struct shm_file_data *sfd = shm_file_data(file); |
473 | |||
471 | return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, | 474 | return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, |
472 | pgoff, flags); | 475 | pgoff, flags); |
473 | } | 476 | } |
@@ -766,6 +769,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, | |||
766 | } else { | 769 | } else { |
767 | #ifdef CONFIG_SHMEM | 770 | #ifdef CONFIG_SHMEM |
768 | struct shmem_inode_info *info = SHMEM_I(inode); | 771 | struct shmem_inode_info *info = SHMEM_I(inode); |
772 | |||
769 | spin_lock_irq(&info->lock); | 773 | spin_lock_irq(&info->lock); |
770 | *rss_add += inode->i_mapping->nrpages; | 774 | *rss_add += inode->i_mapping->nrpages; |
771 | *swp_add += info->swapped; | 775 | *swp_add += info->swapped; |
@@ -1028,6 +1032,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) | |||
1028 | 1032 | ||
1029 | if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { | 1033 | if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { |
1030 | kuid_t euid = current_euid(); | 1034 | kuid_t euid = current_euid(); |
1035 | |||
1031 | if (!uid_eq(euid, shp->shm_perm.uid) && | 1036 | if (!uid_eq(euid, shp->shm_perm.uid) && |
1032 | !uid_eq(euid, shp->shm_perm.cuid)) { | 1037 | !uid_eq(euid, shp->shm_perm.cuid)) { |
1033 | err = -EPERM; | 1038 | err = -EPERM; |
@@ -1045,6 +1050,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) | |||
1045 | 1050 | ||
1046 | if (cmd == SHM_LOCK) { | 1051 | if (cmd == SHM_LOCK) { |
1047 | struct user_struct *user = current_user(); | 1052 | struct user_struct *user = current_user(); |
1053 | |||
1048 | err = shmem_lock(shm_file, 1, user); | 1054 | err = shmem_lock(shm_file, 1, user); |
1049 | if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { | 1055 | if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { |
1050 | shp->shm_perm.mode |= SHM_LOCKED; | 1056 | shp->shm_perm.mode |= SHM_LOCKED; |
@@ -1354,9 +1360,10 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
1354 | vma = next; | 1360 | vma = next; |
1355 | } | 1361 | } |
1356 | 1362 | ||
1357 | #else /* CONFIG_MMU */ | 1363 | #else /* CONFIG_MMU */ |
1358 | /* under NOMMU conditions, the exact address to be destroyed must be | 1364 | /* under NOMMU conditions, the exact address to be destroyed must be |
1359 | * given */ | 1365 | * given |
1366 | */ | ||
1360 | if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { | 1367 | if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { |
1361 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1368 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
1362 | retval = 0; | 1369 | retval = 0; |
diff --git a/kernel/Makefile b/kernel/Makefile index eaee9de224bd..12c679f769c6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o | |||
84 | obj-$(CONFIG_KGDB) += debug/ | 84 | obj-$(CONFIG_KGDB) += debug/ |
85 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 85 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
86 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 86 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
87 | obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o | ||
87 | obj-$(CONFIG_SECCOMP) += seccomp.o | 88 | obj-$(CONFIG_SECCOMP) += seccomp.o |
88 | obj-$(CONFIG_RELAY) += relay.o | 89 | obj-$(CONFIG_RELAY) += relay.o |
89 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 90 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0874e2edd275..79517e5549f1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
@@ -598,11 +598,11 @@ return_normal: | |||
598 | /* | 598 | /* |
599 | * Wait for the other CPUs to be notified and be waiting for us: | 599 | * Wait for the other CPUs to be notified and be waiting for us: |
600 | */ | 600 | */ |
601 | time_left = loops_per_jiffy * HZ; | 601 | time_left = MSEC_PER_SEC; |
602 | while (kgdb_do_roundup && --time_left && | 602 | while (kgdb_do_roundup && --time_left && |
603 | (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != | 603 | (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != |
604 | online_cpus) | 604 | online_cpus) |
605 | cpu_relax(); | 605 | udelay(1000); |
606 | if (!time_left) | 606 | if (!time_left) |
607 | pr_crit("Timed out waiting for secondary CPUs.\n"); | 607 | pr_crit("Timed out waiting for secondary CPUs.\n"); |
608 | 608 | ||
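The kgdb roundup wait above now polls for roughly one second in 1 ms udelay() steps instead of counting loops_per_jiffy * HZ iterations of cpu_relax(), whose wall-clock length depends on CPU speed. A user-space sketch of the same bounded, fixed-granularity poll; other_cpus_ready() is a made-up stand-in for the masters/slaves_in_kgdb check.

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static bool other_cpus_ready(void)
    {
        return false;                   /* placeholder condition */
    }

    int main(void)
    {
        int time_left = 1000;           /* ~ MSEC_PER_SEC */

        while (--time_left && !other_cpus_ready()) {
            struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

            nanosleep(&ts, NULL);       /* ~ udelay(1000) */
        }

        if (!time_left)
            printf("Timed out waiting for secondary CPUs.\n");
        return 0;
    }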
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 98c9011eac78..e74be38245ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
@@ -30,6 +30,7 @@ | |||
30 | char kdb_prompt_str[CMD_BUFLEN]; | 30 | char kdb_prompt_str[CMD_BUFLEN]; |
31 | 31 | ||
32 | int kdb_trap_printk; | 32 | int kdb_trap_printk; |
33 | int kdb_printf_cpu = -1; | ||
33 | 34 | ||
34 | static int kgdb_transition_check(char *buffer) | 35 | static int kgdb_transition_check(char *buffer) |
35 | { | 36 | { |
@@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) | |||
554 | int linecount; | 555 | int linecount; |
555 | int colcount; | 556 | int colcount; |
556 | int logging, saved_loglevel = 0; | 557 | int logging, saved_loglevel = 0; |
557 | int saved_trap_printk; | ||
558 | int got_printf_lock = 0; | ||
559 | int retlen = 0; | 558 | int retlen = 0; |
560 | int fnd, len; | 559 | int fnd, len; |
560 | int this_cpu, old_cpu; | ||
561 | char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; | 561 | char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; |
562 | char *moreprompt = "more> "; | 562 | char *moreprompt = "more> "; |
563 | struct console *c = console_drivers; | 563 | struct console *c = console_drivers; |
564 | static DEFINE_SPINLOCK(kdb_printf_lock); | ||
565 | unsigned long uninitialized_var(flags); | 564 | unsigned long uninitialized_var(flags); |
566 | 565 | ||
567 | preempt_disable(); | ||
568 | saved_trap_printk = kdb_trap_printk; | ||
569 | kdb_trap_printk = 0; | ||
570 | |||
571 | /* Serialize kdb_printf if multiple cpus try to write at once. | 566 | /* Serialize kdb_printf if multiple cpus try to write at once. |
572 | * But if any cpu goes recursive in kdb, just print the output, | 567 | * But if any cpu goes recursive in kdb, just print the output, |
573 | * even if it is interleaved with any other text. | 568 | * even if it is interleaved with any other text. |
574 | */ | 569 | */ |
575 | if (!KDB_STATE(PRINTF_LOCK)) { | 570 | local_irq_save(flags); |
576 | KDB_STATE_SET(PRINTF_LOCK); | 571 | this_cpu = smp_processor_id(); |
577 | spin_lock_irqsave(&kdb_printf_lock, flags); | 572 | for (;;) { |
578 | got_printf_lock = 1; | 573 | old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); |
579 | atomic_inc(&kdb_event); | 574 | if (old_cpu == -1 || old_cpu == this_cpu) |
580 | } else { | 575 | break; |
581 | __acquire(kdb_printf_lock); | 576 | |
577 | cpu_relax(); | ||
582 | } | 578 | } |
583 | 579 | ||
584 | diag = kdbgetintenv("LINES", &linecount); | 580 | diag = kdbgetintenv("LINES", &linecount); |
@@ -847,16 +843,9 @@ kdb_print_out: | |||
847 | suspend_grep = 0; /* end of what may have been a recursive call */ | 843 | suspend_grep = 0; /* end of what may have been a recursive call */ |
848 | if (logging) | 844 | if (logging) |
849 | console_loglevel = saved_loglevel; | 845 | console_loglevel = saved_loglevel; |
850 | if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { | 846 | /* kdb_printf_cpu locked the code above. */ |
851 | got_printf_lock = 0; | 847 | smp_store_release(&kdb_printf_cpu, old_cpu); |
852 | spin_unlock_irqrestore(&kdb_printf_lock, flags); | 848 | local_irq_restore(flags); |
853 | KDB_STATE_CLEAR(PRINTF_LOCK); | ||
854 | atomic_dec(&kdb_event); | ||
855 | } else { | ||
856 | __release(kdb_printf_lock); | ||
857 | } | ||
858 | kdb_trap_printk = saved_trap_printk; | ||
859 | preempt_enable(); | ||
860 | return retlen; | 849 | return retlen; |
861 | } | 850 | } |
862 | 851 | ||
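vkdb_printf() now serializes writers with a single owner word: cmpxchg() claims kdb_printf_cpu when it still holds -1, the owning CPU falls straight through (so recursive kdb printing keeps working), and smp_store_release() restores the previous owner on the way out. Below is a user-space analogue using C11 atomics, with thread ids standing in for CPU ids; it illustrates the pattern and is not the kernel code.

    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int owner = -1;               /* ~ kdb_printf_cpu */

    static int lock_owner(int self)
    {
        for (;;) {
            int expected = -1;

            if (atomic_compare_exchange_strong(&owner, &expected, self))
                return -1;                      /* was free: outer acquisition */
            if (expected == self)
                return self;                    /* recursion: let it through   */
            sched_yield();                      /* ~ cpu_relax()               */
        }
    }

    static void unlock_owner(int old)
    {
        /* the outer level stores -1 back, a recursive level keeps ownership */
        atomic_store_explicit(&owner, old, memory_order_release);
    }

    static void *worker(void *arg)
    {
        int self = (int)(long)arg;
        int old = lock_owner(self);

        printf("thread %d inside the owner-serialized section\n", self);
        unlock_owner(old);
        return NULL;
    }

    int main(void)                              /* build: cc -pthread file.c */
    {
        pthread_t t[4];

        for (long i = 0; i < 4; i++)
            pthread_create(&t[i], NULL, worker, (void *)i);
        for (int i = 0; i < 4; i++)
            pthread_join(t[i], NULL);
        return 0;
    }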
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2a20c0dfdafc..ca183919d302 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
@@ -60,7 +60,6 @@ int kdb_grep_trailing; | |||
60 | * Kernel debugger state flags | 60 | * Kernel debugger state flags |
61 | */ | 61 | */ |
62 | int kdb_flags; | 62 | int kdb_flags; |
63 | atomic_t kdb_event; | ||
64 | 63 | ||
65 | /* | 64 | /* |
66 | * kdb_lock protects updates to kdb_initial_cpu. Used to | 65 | * kdb_lock protects updates to kdb_initial_cpu. Used to |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 75014d7f4568..fc224fbcf954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
@@ -132,7 +132,6 @@ extern int kdb_state; | |||
132 | #define KDB_STATE_PAGER 0x00000400 /* pager is available */ | 132 | #define KDB_STATE_PAGER 0x00000400 /* pager is available */ |
133 | #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching | 133 | #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching |
134 | * back to initial cpu */ | 134 | * back to initial cpu */ |
135 | #define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ | ||
136 | #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ | 135 | #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ |
137 | #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ | 136 | #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ |
138 | #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been | 137 | #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, | |||
301 | retry: | 301 | retry: |
302 | /* Read the page with vaddr into memory */ | 302 | /* Read the page with vaddr into memory */ |
303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, | 303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, |
304 | &vma); | 304 | &vma, NULL); |
305 | if (ret <= 0) | 305 | if (ret <= 0) |
306 | return ret; | 306 | return ret; |
307 | 307 | ||
@@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
1712 | * essentially a kernel access to the memory. | 1712 | * essentially a kernel access to the memory. |
1713 | */ | 1713 | */ |
1714 | result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, | 1714 | result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, |
1715 | NULL); | 1715 | NULL, NULL); |
1716 | if (result < 0) | 1716 | if (result < 0) |
1717 | return result; | 1717 | return result; |
1718 | 1718 | ||
diff --git a/kernel/kcov.c b/kernel/kcov.c index 3cbb0c879705..cc2fa35ca480 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c | |||
@@ -1,11 +1,16 @@ | |||
1 | #define pr_fmt(fmt) "kcov: " fmt | 1 | #define pr_fmt(fmt) "kcov: " fmt |
2 | 2 | ||
3 | #define DISABLE_BRANCH_PROFILING | 3 | #define DISABLE_BRANCH_PROFILING |
4 | #include <linux/atomic.h> | ||
4 | #include <linux/compiler.h> | 5 | #include <linux/compiler.h> |
6 | #include <linux/errno.h> | ||
7 | #include <linux/export.h> | ||
5 | #include <linux/types.h> | 8 | #include <linux/types.h> |
6 | #include <linux/file.h> | 9 | #include <linux/file.h> |
7 | #include <linux/fs.h> | 10 | #include <linux/fs.h> |
11 | #include <linux/init.h> | ||
8 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/preempt.h> | ||
9 | #include <linux/printk.h> | 14 | #include <linux/printk.h> |
10 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
11 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..5617cc412444 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
@@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | |||
441 | while (hole_end <= crashk_res.end) { | 441 | while (hole_end <= crashk_res.end) { |
442 | unsigned long i; | 442 | unsigned long i; |
443 | 443 | ||
444 | cond_resched(); | ||
445 | |||
444 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) | 446 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) |
445 | break; | 447 | break; |
446 | /* See if I overlap any of the segments */ | 448 | /* See if I overlap any of the segments */ |
@@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1467 | #endif | 1469 | #endif |
1468 | VMCOREINFO_NUMBER(PG_head_mask); | 1470 | VMCOREINFO_NUMBER(PG_head_mask); |
1469 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | 1471 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); |
1470 | #ifdef CONFIG_X86 | ||
1471 | VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); | ||
1472 | #endif | ||
1473 | #ifdef CONFIG_HUGETLB_PAGE | 1472 | #ifdef CONFIG_HUGETLB_PAGE |
1474 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); | 1473 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); |
1475 | #endif | 1474 | #endif |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 577f2288d19f..a3ce35e0fa1e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args) | |||
1926 | int r; | 1926 | int r; |
1927 | 1927 | ||
1928 | #ifdef CONFIG_KGDB_KDB | 1928 | #ifdef CONFIG_KGDB_KDB |
1929 | if (unlikely(kdb_trap_printk)) { | 1929 | /* Allow printk() to be passed to kdb, but avoid recursion. */ |
1930 | if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { | ||
1930 | r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); | 1931 | r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); |
1931 | return r; | 1932 | return r; |
1932 | } | 1933 | } |
diff --git a/kernel/relay.c b/kernel/relay.c index da79a109dbeb..8f18d314a96a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan, | |||
809 | { | 809 | { |
810 | struct rchan_buf *buf; | 810 | struct rchan_buf *buf; |
811 | 811 | ||
812 | if (!chan) | 812 | if (!chan || cpu >= NR_CPUS) |
813 | return; | 813 | return; |
814 | 814 | ||
815 | buf = *per_cpu_ptr(chan->buf, cpu); | 815 | buf = *per_cpu_ptr(chan->buf, cpu); |
816 | if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) | 816 | if (!buf || subbufs_consumed > chan->n_subbufs) |
817 | return; | 817 | return; |
818 | 818 | ||
819 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) | 819 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) |
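The relay_subbufs_consumed() hunk above moves the cpu range check ahead of the per_cpu_ptr() lookup, so an out-of-range cpu is rejected before it is ever used as an index. A minimal user-space sketch of that ordering fix, with illustrative names rather than the relay code itself:

#include <stddef.h>

#define NR_SLOTS 8

static int slots[NR_SLOTS];

/* Old shape: the index is used before it is range-checked. */
static int read_slot_buggy(size_t idx)
{
        int val = slots[idx];   /* out-of-bounds read if idx >= NR_SLOTS */

        if (idx >= NR_SLOTS)
                return -1;
        return val;
}

/* New shape, matching the relay.c fix: validate first, then use. */
static int read_slot_fixed(size_t idx)
{
        if (idx >= NR_SLOTS)
                return -1;
        return slots[idx];
}

int main(void)
{
        return read_slot_buggy(3) + read_slot_fixed(3);
}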
diff --git a/kernel/signal.c b/kernel/signal.c index 29a410780aa9..ae60996fedff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset) | |||
2491 | { | 2491 | { |
2492 | struct task_struct *tsk = current; | 2492 | struct task_struct *tsk = current; |
2493 | 2493 | ||
2494 | /* | ||
2495 | * In case the signal mask hasn't changed, there is nothing we need | ||
2496 | * to do. The current->blocked shouldn't be modified by other task. | ||
2497 | */ | ||
2498 | if (sigequalsets(&tsk->blocked, newset)) | ||
2499 | return; | ||
2500 | |||
2494 | spin_lock_irq(&tsk->sighand->siglock); | 2501 | spin_lock_irq(&tsk->sighand->siglock); |
2495 | __set_task_blocked(tsk, newset); | 2502 | __set_task_blocked(tsk, newset); |
2496 | spin_unlock_irq(&tsk->sighand->siglock); | 2503 | spin_unlock_irq(&tsk->sighand->siglock); |
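The __set_current_blocked() hunk above adds an early return when the new mask equals the current one, skipping the siglock round-trip; the unlocked comparison is safe because, per the added comment, current->blocked is not modified by other tasks. A rough user-space analogue of that shortcut, with hypothetical names:

#include <pthread.h>

struct blocked_state {
        pthread_mutex_t lock;
        unsigned long mask;     /* only ever written by the owning thread */
};

static void set_blocked(struct blocked_state *s, unsigned long newmask)
{
        /* Reading our own mask without the lock mirrors the kernel's
         * assumption; if nothing changed, avoid taking the lock at all. */
        if (s->mask == newmask)
                return;

        pthread_mutex_lock(&s->lock);
        s->mask = newmask;
        pthread_mutex_unlock(&s->lock);
}

int main(void)
{
        struct blocked_state s = { PTHREAD_MUTEX_INITIALIZER, 0 };

        set_blocked(&s, 0);     /* no change: returns without locking */
        set_blocked(&s, 1);     /* change: takes the lock and updates */
        return 0;
}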
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 39b3368f6de6..1475d2545b7e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void) | |||
2389 | #ifdef CONFIG_COREDUMP | 2389 | #ifdef CONFIG_COREDUMP |
2390 | if (suid_dumpable == SUID_DUMP_ROOT && | 2390 | if (suid_dumpable == SUID_DUMP_ROOT && |
2391 | core_pattern[0] != '/' && core_pattern[0] != '|') { | 2391 | core_pattern[0] != '/' && core_pattern[0] != '|') { |
2392 | printk(KERN_WARNING "Unsafe core_pattern used with "\ | 2392 | printk(KERN_WARNING |
2393 | "suid_dumpable=2. Pipe handler or fully qualified "\ | 2393 | "Unsafe core_pattern used with fs.suid_dumpable=2.\n" |
2394 | "core dump path required.\n"); | 2394 | "Pipe handler or fully qualified core dump path required.\n" |
2395 | "Set kernel.core_pattern before fs.suid_dumpable.\n" | ||
2396 | ); | ||
2395 | } | 2397 | } |
2396 | #endif | 2398 | #endif |
2397 | } | 2399 | } |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6eb99c17dbd8..ece4b177052b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
@@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen) | |||
1354 | "warning: process `%s' used the deprecated sysctl " | 1354 | "warning: process `%s' used the deprecated sysctl " |
1355 | "system call with ", current->comm); | 1355 | "system call with ", current->comm); |
1356 | for (i = 0; i < nlen; i++) | 1356 | for (i = 0; i < nlen; i++) |
1357 | printk("%d.", name[i]); | 1357 | printk(KERN_CONT "%d.", name[i]); |
1358 | printk("\n"); | 1358 | printk(KERN_CONT "\n"); |
1359 | } | 1359 | } |
1360 | return; | 1360 | return; |
1361 | } | 1361 | } |
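The deprecated_sysctl_warning() hunk marks the follow-on pieces of the message with KERN_CONT, so they are appended to the first printk() instead of being treated as separate, unleveled messages. A small kernel-style sketch of the same pattern; the function name is hypothetical, and this is a fragment for a kernel context, not a stand-alone program:

#include <linux/printk.h>

/* Print "ids: 1 2 3" as one message: only the first call carries the log
 * level, the rest are explicitly marked as continuations. */
static void print_ids(const int *id, int n)
{
        int i;

        printk(KERN_INFO "ids:");
        for (i = 0; i < n; i++)
                printk(KERN_CONT " %d", id[i]);
        printk(KERN_CONT "\n");
}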
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9b08ca391aed..3921cf7fea8e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
@@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, | |||
516 | 516 | ||
517 | spin_lock_irqsave(&ptr->it_lock, flags); | 517 | spin_lock_irqsave(&ptr->it_lock, flags); |
518 | if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { | 518 | if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { |
519 | if (posix_timer_event(ptr, 0) != 0) | 519 | if (IS_ENABLED(CONFIG_POSIX_TIMERS) && |
520 | posix_timer_event(ptr, 0) != 0) | ||
520 | ptr->it_overrun++; | 521 | ptr->it_overrun++; |
521 | } | 522 | } |
522 | 523 | ||
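The alarmtimer hunk wraps the posix_timer_event() call in IS_ENABLED(CONFIG_POSIX_TIMERS), so the call still gets type-checked but the branch is discarded at compile time when the option is off. A user-space analogue of that compile-time guard; the real IS_ENABLED() macro is more elaborate, and the names here are illustrative:

#include <stdio.h>

#define CONFIG_FEATURE_X 0              /* flip to 1 to enable the feature */
#define IS_ENABLED(option) (option)     /* stand-in for the kernel macro */

static int feature_x_event(void)
{
        printf("feature X fired\n");
        return 1;
}

int main(void)
{
        int overrun = 0;

        /* With CONFIG_FEATURE_X == 0 the whole branch is dead code, yet
         * feature_x_event() still had to compile and type-check. */
        if (IS_ENABLED(CONFIG_FEATURE_X) && feature_x_event() != 0)
                overrun++;

        return overrun;
}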
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -24,32 +24,14 @@ | |||
24 | 24 | ||
25 | #include <asm/irq_regs.h> | 25 | #include <asm/irq_regs.h> |
26 | #include <linux/kvm_para.h> | 26 | #include <linux/kvm_para.h> |
27 | #include <linux/perf_event.h> | ||
28 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
29 | 28 | ||
30 | /* | ||
31 | * The run state of the lockup detectors is controlled by the content of the | ||
32 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
33 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
34 | * | ||
35 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
36 | * are variables that are only used as an 'interface' between the parameters | ||
37 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
38 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
39 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
40 | * is equal zero. | ||
41 | */ | ||
42 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
43 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
44 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
45 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
46 | |||
47 | static DEFINE_MUTEX(watchdog_proc_mutex); | 29 | static DEFINE_MUTEX(watchdog_proc_mutex); |
48 | 30 | ||
49 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 31 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) |
50 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | 32 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; |
51 | #else | 33 | #else |
52 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | 34 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; |
53 | #endif | 35 | #endif |
54 | int __read_mostly nmi_watchdog_enabled; | 36 | int __read_mostly nmi_watchdog_enabled; |
55 | int __read_mostly soft_watchdog_enabled; | 37 | int __read_mostly soft_watchdog_enabled; |
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10; | |||
59 | #ifdef CONFIG_SMP | 41 | #ifdef CONFIG_SMP |
60 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 42 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
61 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; | 43 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; |
62 | #else | ||
63 | #define sysctl_softlockup_all_cpu_backtrace 0 | ||
64 | #define sysctl_hardlockup_all_cpu_backtrace 0 | ||
65 | #endif | 44 | #endif |
66 | static struct cpumask watchdog_cpumask __read_mostly; | 45 | static struct cpumask watchdog_cpumask __read_mostly; |
67 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | 46 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); |
@@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); | |||
100 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | 79 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); |
101 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | 80 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); |
102 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); | 81 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); |
103 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
104 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | ||
105 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | ||
106 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | 82 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); |
107 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | ||
108 | #endif | ||
109 | static unsigned long soft_lockup_nmi_warn; | 83 | static unsigned long soft_lockup_nmi_warn; |
110 | 84 | ||
111 | /* boot commands */ | ||
112 | /* | ||
113 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
114 | */ | ||
115 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
116 | unsigned int __read_mostly hardlockup_panic = | ||
117 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
118 | static unsigned long hardlockup_allcpu_dumped; | ||
119 | /* | ||
120 | * We may not want to enable hard lockup detection by default in all cases, | ||
121 | * for example when running the kernel as a guest on a hypervisor. In these | ||
122 | * cases this function can be called to disable hard lockup detection. This | ||
123 | * function should only be executed once by the boot processor before the | ||
124 | * kernel command line parameters are parsed, because otherwise it is not | ||
125 | * possible to override this in hardlockup_panic_setup(). | ||
126 | */ | ||
127 | void hardlockup_detector_disable(void) | ||
128 | { | ||
129 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
130 | } | ||
131 | |||
132 | static int __init hardlockup_panic_setup(char *str) | ||
133 | { | ||
134 | if (!strncmp(str, "panic", 5)) | ||
135 | hardlockup_panic = 1; | ||
136 | else if (!strncmp(str, "nopanic", 7)) | ||
137 | hardlockup_panic = 0; | ||
138 | else if (!strncmp(str, "0", 1)) | ||
139 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
140 | else if (!strncmp(str, "1", 1)) | ||
141 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
142 | return 1; | ||
143 | } | ||
144 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
145 | #endif | ||
146 | |||
147 | unsigned int __read_mostly softlockup_panic = | 85 | unsigned int __read_mostly softlockup_panic = |
148 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | 86 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; |
149 | 87 | ||
@@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void) | |||
264 | wq_watchdog_touch(-1); | 202 | wq_watchdog_touch(-1); |
265 | } | 203 | } |
266 | 204 | ||
267 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
268 | void touch_nmi_watchdog(void) | ||
269 | { | ||
270 | /* | ||
271 | * Using __raw here because some code paths have | ||
272 | * preemption enabled. If preemption is enabled | ||
273 | * then interrupts should be enabled too, in which | ||
274 | * case we shouldn't have to worry about the watchdog | ||
275 | * going off. | ||
276 | */ | ||
277 | raw_cpu_write(watchdog_nmi_touch, true); | ||
278 | touch_softlockup_watchdog(); | ||
279 | } | ||
280 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
281 | |||
282 | #endif | ||
283 | |||
284 | void touch_softlockup_watchdog_sync(void) | 205 | void touch_softlockup_watchdog_sync(void) |
285 | { | 206 | { |
286 | __this_cpu_write(softlockup_touch_sync, true); | 207 | __this_cpu_write(softlockup_touch_sync, true); |
287 | __this_cpu_write(watchdog_touch_ts, 0); | 208 | __this_cpu_write(watchdog_touch_ts, 0); |
288 | } | 209 | } |
289 | 210 | ||
290 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
291 | /* watchdog detector functions */ | 211 | /* watchdog detector functions */ |
292 | static bool is_hardlockup(void) | 212 | bool is_hardlockup(void) |
293 | { | 213 | { |
294 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); | 214 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
295 | 215 | ||
@@ -299,7 +219,6 @@ static bool is_hardlockup(void) | |||
299 | __this_cpu_write(hrtimer_interrupts_saved, hrint); | 219 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
300 | return false; | 220 | return false; |
301 | } | 221 | } |
302 | #endif | ||
303 | 222 | ||
304 | static int is_softlockup(unsigned long touch_ts) | 223 | static int is_softlockup(unsigned long touch_ts) |
305 | { | 224 | { |
@@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts) | |||
313 | return 0; | 232 | return 0; |
314 | } | 233 | } |
315 | 234 | ||
316 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
317 | |||
318 | static struct perf_event_attr wd_hw_attr = { | ||
319 | .type = PERF_TYPE_HARDWARE, | ||
320 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
321 | .size = sizeof(struct perf_event_attr), | ||
322 | .pinned = 1, | ||
323 | .disabled = 1, | ||
324 | }; | ||
325 | |||
326 | /* Callback function for perf event subsystem */ | ||
327 | static void watchdog_overflow_callback(struct perf_event *event, | ||
328 | struct perf_sample_data *data, | ||
329 | struct pt_regs *regs) | ||
330 | { | ||
331 | /* Ensure the watchdog never gets throttled */ | ||
332 | event->hw.interrupts = 0; | ||
333 | |||
334 | if (__this_cpu_read(watchdog_nmi_touch) == true) { | ||
335 | __this_cpu_write(watchdog_nmi_touch, false); | ||
336 | return; | ||
337 | } | ||
338 | |||
339 | /* check for a hardlockup | ||
340 | * This is done by making sure our timer interrupt | ||
341 | * is incrementing. The timer interrupt should have | ||
342 | * fired multiple times before we overflow'd. If it hasn't | ||
343 | * then this is a good indication the cpu is stuck | ||
344 | */ | ||
345 | if (is_hardlockup()) { | ||
346 | int this_cpu = smp_processor_id(); | ||
347 | struct pt_regs *regs = get_irq_regs(); | ||
348 | |||
349 | /* only print hardlockups once */ | ||
350 | if (__this_cpu_read(hard_watchdog_warn) == true) | ||
351 | return; | ||
352 | |||
353 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
354 | print_modules(); | ||
355 | print_irqtrace_events(current); | ||
356 | if (regs) | ||
357 | show_regs(regs); | ||
358 | else | ||
359 | dump_stack(); | ||
360 | |||
361 | /* | ||
362 | * Perform all-CPU dump only once to avoid multiple hardlockups | ||
363 | * generating interleaving traces | ||
364 | */ | ||
365 | if (sysctl_hardlockup_all_cpu_backtrace && | ||
366 | !test_and_set_bit(0, &hardlockup_allcpu_dumped)) | ||
367 | trigger_allbutself_cpu_backtrace(); | ||
368 | |||
369 | if (hardlockup_panic) | ||
370 | nmi_panic(regs, "Hard LOCKUP"); | ||
371 | |||
372 | __this_cpu_write(hard_watchdog_warn, true); | ||
373 | return; | ||
374 | } | ||
375 | |||
376 | __this_cpu_write(hard_watchdog_warn, false); | ||
377 | return; | ||
378 | } | ||
379 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
380 | |||
381 | static void watchdog_interrupt_count(void) | 235 | static void watchdog_interrupt_count(void) |
382 | { | 236 | { |
383 | __this_cpu_inc(hrtimer_interrupts); | 237 | __this_cpu_inc(hrtimer_interrupts); |
384 | } | 238 | } |
385 | 239 | ||
386 | static int watchdog_nmi_enable(unsigned int cpu); | 240 | /* |
387 | static void watchdog_nmi_disable(unsigned int cpu); | 241 | * These two functions are mostly architecture specific |
242 | * so they are defined as weak stubs here. | ||
243 | */ | ||
244 | int __weak watchdog_nmi_enable(unsigned int cpu) | ||
245 | { | ||
246 | return 0; | ||
247 | } | ||
248 | void __weak watchdog_nmi_disable(unsigned int cpu) | ||
249 | { | ||
250 | } | ||
388 | 251 | ||
389 | static int watchdog_enable_all_cpus(void); | 252 | static int watchdog_enable_all_cpus(void); |
390 | static void watchdog_disable_all_cpus(void); | 253 | static void watchdog_disable_all_cpus(void); |
@@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu) | |||
577 | watchdog_nmi_disable(cpu); | 440 | watchdog_nmi_disable(cpu); |
578 | } | 441 | } |
579 | 442 | ||
580 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
581 | /* | ||
582 | * People like the simple clean cpu node info on boot. | ||
583 | * Reduce the watchdog noise by only printing messages | ||
584 | * that are different from what cpu0 displayed. | ||
585 | */ | ||
586 | static unsigned long cpu0_err; | ||
587 | |||
588 | static int watchdog_nmi_enable(unsigned int cpu) | ||
589 | { | ||
590 | struct perf_event_attr *wd_attr; | ||
591 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
592 | |||
593 | /* nothing to do if the hard lockup detector is disabled */ | ||
594 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
595 | goto out; | ||
596 | |||
597 | /* is it already setup and enabled? */ | ||
598 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
599 | goto out; | ||
600 | |||
601 | /* it is setup but not enabled */ | ||
602 | if (event != NULL) | ||
603 | goto out_enable; | ||
604 | |||
605 | wd_attr = &wd_hw_attr; | ||
606 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); | ||
607 | |||
608 | /* Try to register using hardware perf events */ | ||
609 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | ||
610 | |||
611 | /* save cpu0 error for future comparision */ | ||
612 | if (cpu == 0 && IS_ERR(event)) | ||
613 | cpu0_err = PTR_ERR(event); | ||
614 | |||
615 | if (!IS_ERR(event)) { | ||
616 | /* only print for cpu0 or different than cpu0 */ | ||
617 | if (cpu == 0 || cpu0_err) | ||
618 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
619 | goto out_save; | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
624 | * set up the hardware perf event. The watchdog() function checks | ||
625 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
626 | * | ||
627 | * The barriers are for syncing up watchdog_enabled across all the | ||
628 | * cpus, as clear_bit() does not use barriers. | ||
629 | */ | ||
630 | smp_mb__before_atomic(); | ||
631 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
632 | smp_mb__after_atomic(); | ||
633 | |||
634 | /* skip displaying the same error again */ | ||
635 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
636 | return PTR_ERR(event); | ||
637 | |||
638 | /* vary the KERN level based on the returned errno */ | ||
639 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
640 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
641 | else if (PTR_ERR(event) == -ENOENT) | ||
642 | pr_warn("disabled (cpu%i): hardware events not enabled\n", | ||
643 | cpu); | ||
644 | else | ||
645 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | ||
646 | cpu, PTR_ERR(event)); | ||
647 | |||
648 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
649 | |||
650 | return PTR_ERR(event); | ||
651 | |||
652 | /* success path */ | ||
653 | out_save: | ||
654 | per_cpu(watchdog_ev, cpu) = event; | ||
655 | out_enable: | ||
656 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
657 | out: | ||
658 | return 0; | ||
659 | } | ||
660 | |||
661 | static void watchdog_nmi_disable(unsigned int cpu) | ||
662 | { | ||
663 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
664 | |||
665 | if (event) { | ||
666 | perf_event_disable(event); | ||
667 | per_cpu(watchdog_ev, cpu) = NULL; | ||
668 | |||
669 | /* should be in cleanup, but blocks oprofile */ | ||
670 | perf_event_release_kernel(event); | ||
671 | } | ||
672 | if (cpu == 0) { | ||
673 | /* watchdog_nmi_enable() expects this to be zero initially. */ | ||
674 | cpu0_err = 0; | ||
675 | } | ||
676 | } | ||
677 | |||
678 | #else | ||
679 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | ||
680 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | ||
681 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
682 | |||
683 | static struct smp_hotplug_thread watchdog_threads = { | 443 | static struct smp_hotplug_thread watchdog_threads = { |
684 | .store = &softlockup_watchdog, | 444 | .store = &softlockup_watchdog, |
685 | .thread_should_run = watchdog_should_run, | 445 | .thread_should_run = watchdog_should_run, |
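With the hardlockup code leaving kernel/watchdog.c, watchdog_nmi_enable() and watchdog_nmi_disable() become __weak no-op defaults that the new kernel/watchdog_hld.c (or an architecture) replaces with strong definitions at link time. A two-file user-space sketch of that weak-override pattern, with hypothetical names:

/* main.c -- generic code carrying the weak default, like kernel/watchdog.c */
int __attribute__((weak)) detector_enable(unsigned int cpu)
{
        return 0;                       /* default: nothing to do */
}

int main(void)
{
        return detector_enable(0);
}

/* detector.c -- optional strong override, like kernel/watchdog_hld.c:
 *
 *      int detector_enable(unsigned int cpu)
 *      {
 *              // set up the per-cpu hardware event here
 *              return 0;
 *      }
 *
 * "cc main.c" keeps the weak stub; "cc main.c detector.c" lets the strong
 * definition win at link time. */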
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c | |||
@@ -0,0 +1,227 @@ | |||
1 | /* | ||
2 | * Detect hard lockups on a system | ||
3 | * | ||
4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | ||
5 | * | ||
6 | * Note: Most of this code is borrowed heavily from the original softlockup | ||
7 | * detector, so thanks to Ingo for the initial implementation. | ||
8 | * Some chunks also taken from the old x86-specific nmi watchdog code, thanks | ||
9 | * to those contributors as well. | ||
10 | */ | ||
11 | |||
12 | #define pr_fmt(fmt) "NMI watchdog: " fmt | ||
13 | |||
14 | #include <linux/nmi.h> | ||
15 | #include <linux/module.h> | ||
16 | #include <asm/irq_regs.h> | ||
17 | #include <linux/perf_event.h> | ||
18 | |||
19 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | ||
20 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | ||
21 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | ||
22 | |||
23 | /* boot commands */ | ||
24 | /* | ||
25 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
26 | */ | ||
27 | unsigned int __read_mostly hardlockup_panic = | ||
28 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
29 | static unsigned long hardlockup_allcpu_dumped; | ||
30 | /* | ||
31 | * We may not want to enable hard lockup detection by default in all cases, | ||
32 | * for example when running the kernel as a guest on a hypervisor. In these | ||
33 | * cases this function can be called to disable hard lockup detection. This | ||
34 | * function should only be executed once by the boot processor before the | ||
35 | * kernel command line parameters are parsed, because otherwise it is not | ||
36 | * possible to override this in hardlockup_panic_setup(). | ||
37 | */ | ||
38 | void hardlockup_detector_disable(void) | ||
39 | { | ||
40 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
41 | } | ||
42 | |||
43 | static int __init hardlockup_panic_setup(char *str) | ||
44 | { | ||
45 | if (!strncmp(str, "panic", 5)) | ||
46 | hardlockup_panic = 1; | ||
47 | else if (!strncmp(str, "nopanic", 7)) | ||
48 | hardlockup_panic = 0; | ||
49 | else if (!strncmp(str, "0", 1)) | ||
50 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
51 | else if (!strncmp(str, "1", 1)) | ||
52 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
53 | return 1; | ||
54 | } | ||
55 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
56 | |||
57 | void touch_nmi_watchdog(void) | ||
58 | { | ||
59 | /* | ||
60 | * Using __raw here because some code paths have | ||
61 | * preemption enabled. If preemption is enabled | ||
62 | * then interrupts should be enabled too, in which | ||
63 | * case we shouldn't have to worry about the watchdog | ||
64 | * going off. | ||
65 | */ | ||
66 | raw_cpu_write(watchdog_nmi_touch, true); | ||
67 | touch_softlockup_watchdog(); | ||
68 | } | ||
69 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
70 | |||
71 | static struct perf_event_attr wd_hw_attr = { | ||
72 | .type = PERF_TYPE_HARDWARE, | ||
73 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
74 | .size = sizeof(struct perf_event_attr), | ||
75 | .pinned = 1, | ||
76 | .disabled = 1, | ||
77 | }; | ||
78 | |||
79 | /* Callback function for perf event subsystem */ | ||
80 | static void watchdog_overflow_callback(struct perf_event *event, | ||
81 | struct perf_sample_data *data, | ||
82 | struct pt_regs *regs) | ||
83 | { | ||
84 | /* Ensure the watchdog never gets throttled */ | ||
85 | event->hw.interrupts = 0; | ||
86 | |||
87 | if (__this_cpu_read(watchdog_nmi_touch) == true) { | ||
88 | __this_cpu_write(watchdog_nmi_touch, false); | ||
89 | return; | ||
90 | } | ||
91 | |||
92 | /* check for a hardlockup | ||
93 | * This is done by making sure our timer interrupt | ||
94 | * is incrementing. The timer interrupt should have | ||
95 | * fired multiple times before we overflowed. If it hasn't | ||
96 | * then this is a good indication the cpu is stuck | ||
97 | */ | ||
98 | if (is_hardlockup()) { | ||
99 | int this_cpu = smp_processor_id(); | ||
100 | |||
101 | /* only print hardlockups once */ | ||
102 | if (__this_cpu_read(hard_watchdog_warn) == true) | ||
103 | return; | ||
104 | |||
105 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
106 | print_modules(); | ||
107 | print_irqtrace_events(current); | ||
108 | if (regs) | ||
109 | show_regs(regs); | ||
110 | else | ||
111 | dump_stack(); | ||
112 | |||
113 | /* | ||
114 | * Perform all-CPU dump only once to avoid multiple hardlockups | ||
115 | * generating interleaving traces | ||
116 | */ | ||
117 | if (sysctl_hardlockup_all_cpu_backtrace && | ||
118 | !test_and_set_bit(0, &hardlockup_allcpu_dumped)) | ||
119 | trigger_allbutself_cpu_backtrace(); | ||
120 | |||
121 | if (hardlockup_panic) | ||
122 | nmi_panic(regs, "Hard LOCKUP"); | ||
123 | |||
124 | __this_cpu_write(hard_watchdog_warn, true); | ||
125 | return; | ||
126 | } | ||
127 | |||
128 | __this_cpu_write(hard_watchdog_warn, false); | ||
129 | return; | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * People like the simple clean cpu node info on boot. | ||
134 | * Reduce the watchdog noise by only printing messages | ||
135 | * that are different from what cpu0 displayed. | ||
136 | */ | ||
137 | static unsigned long cpu0_err; | ||
138 | |||
139 | int watchdog_nmi_enable(unsigned int cpu) | ||
140 | { | ||
141 | struct perf_event_attr *wd_attr; | ||
142 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
143 | |||
144 | /* nothing to do if the hard lockup detector is disabled */ | ||
145 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
146 | goto out; | ||
147 | |||
148 | /* is it already setup and enabled? */ | ||
149 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
150 | goto out; | ||
151 | |||
152 | /* it is setup but not enabled */ | ||
153 | if (event != NULL) | ||
154 | goto out_enable; | ||
155 | |||
156 | wd_attr = &wd_hw_attr; | ||
157 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); | ||
158 | |||
159 | /* Try to register using hardware perf events */ | ||
160 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | ||
161 | |||
162 | /* save cpu0 error for future comparison */ | ||
163 | if (cpu == 0 && IS_ERR(event)) | ||
164 | cpu0_err = PTR_ERR(event); | ||
165 | |||
166 | if (!IS_ERR(event)) { | ||
167 | /* only print for cpu0 or different than cpu0 */ | ||
168 | if (cpu == 0 || cpu0_err) | ||
169 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
170 | goto out_save; | ||
171 | } | ||
172 | |||
173 | /* | ||
174 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
175 | * the hardware perf event. The watchdog() function checks | ||
176 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
177 | * | ||
178 | * The barriers are for syncing up watchdog_enabled across all the | ||
179 | * cpus, as clear_bit() does not use barriers. | ||
180 | */ | ||
181 | smp_mb__before_atomic(); | ||
182 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
183 | smp_mb__after_atomic(); | ||
184 | |||
185 | /* skip displaying the same error again */ | ||
186 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
187 | return PTR_ERR(event); | ||
188 | |||
189 | /* vary the KERN level based on the returned errno */ | ||
190 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
191 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
192 | else if (PTR_ERR(event) == -ENOENT) | ||
193 | pr_warn("disabled (cpu%i): hardware events not enabled\n", | ||
194 | cpu); | ||
195 | else | ||
196 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | ||
197 | cpu, PTR_ERR(event)); | ||
198 | |||
199 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
200 | |||
201 | return PTR_ERR(event); | ||
202 | |||
203 | /* success path */ | ||
204 | out_save: | ||
205 | per_cpu(watchdog_ev, cpu) = event; | ||
206 | out_enable: | ||
207 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
208 | out: | ||
209 | return 0; | ||
210 | } | ||
211 | |||
212 | void watchdog_nmi_disable(unsigned int cpu) | ||
213 | { | ||
214 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
215 | |||
216 | if (event) { | ||
217 | perf_event_disable(event); | ||
218 | per_cpu(watchdog_ev, cpu) = NULL; | ||
219 | |||
220 | /* should be in cleanup, but blocks oprofile */ | ||
221 | perf_event_release_kernel(event); | ||
222 | } | ||
223 | if (cpu == 0) { | ||
224 | /* watchdog_nmi_enable() expects this to be zero initially. */ | ||
225 | cpu0_err = 0; | ||
226 | } | ||
227 | } | ||
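The detector's core test, is_hardlockup() (now shared from kernel/watchdog.c and called from watchdog_overflow_callback() above), boils down to asking whether the per-cpu hrtimer interrupt count has advanced since the previous NMI; if the timer stopped making progress, the CPU is assumed stuck. A single-threaded user-space sketch of that heuristic, with no per-cpu or NMI machinery:

#include <stdbool.h>
#include <stdio.h>

static unsigned long hrtimer_interrupts;        /* bumped by the timer path */
static unsigned long hrtimer_interrupts_saved;  /* last value the check saw */

static void timer_tick(void)
{
        hrtimer_interrupts++;           /* evidence the CPU still runs timers */
}

static bool check_hardlockup(void)
{
        unsigned long hrint = hrtimer_interrupts;

        if (hrtimer_interrupts_saved == hrint)
                return true;            /* no progress since the last check */

        hrtimer_interrupts_saved = hrint;
        return false;
}

int main(void)
{
        timer_tick();
        printf("check 1: %s\n", check_hardlockup() ? "LOCKUP" : "ok");
        /* no tick before the second check: looks like a lockup */
        printf("check 2: %s\n", check_hardlockup() ? "LOCKUP" : "ok");
        return 0;
}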
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e6327d102184..7446097f72bd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -194,8 +194,8 @@ config GDB_SCRIPTS | |||
194 | build directory. If you load vmlinux into gdb, the helper | 194 | build directory. If you load vmlinux into gdb, the helper |
195 | scripts will be automatically imported by gdb as well, and | 195 | scripts will be automatically imported by gdb as well, and |
196 | additional functions are available to analyze a Linux kernel | 196 | additional functions are available to analyze a Linux kernel |
197 | instance. See Documentation/gdb-kernel-debugging.txt for further | 197 | instance. See Documentation/dev-tools/gdb-kernel-debugging.rst |
198 | details. | 198 | for further details. |
199 | 199 | ||
200 | config ENABLE_WARN_DEPRECATED | 200 | config ENABLE_WARN_DEPRECATED |
201 | bool "Enable __deprecated logic" | 201 | bool "Enable __deprecated logic" |
@@ -542,7 +542,7 @@ config DEBUG_KMEMLEAK | |||
542 | difference being that the orphan objects are not freed but | 542 | difference being that the orphan objects are not freed but |
543 | only shown in /sys/kernel/debug/kmemleak. Enabling this | 543 | only shown in /sys/kernel/debug/kmemleak. Enabling this |
544 | feature will introduce an overhead to memory | 544 | feature will introduce an overhead to memory |
545 | allocations. See Documentation/kmemleak.txt for more | 545 | allocations. See Documentation/dev-tools/kmemleak.rst for more |
546 | details. | 546 | details. |
547 | 547 | ||
548 | Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances | 548 | Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances |
@@ -739,7 +739,7 @@ config KCOV | |||
739 | different machines and across reboots. If you need stable PC values, | 739 | different machines and across reboots. If you need stable PC values, |
740 | disable RANDOMIZE_BASE. | 740 | disable RANDOMIZE_BASE. |
741 | 741 | ||
742 | For more details, see Documentation/kcov.txt. | 742 | For more details, see Documentation/dev-tools/kcov.rst. |
743 | 743 | ||
744 | config KCOV_INSTRUMENT_ALL | 744 | config KCOV_INSTRUMENT_ALL |
745 | bool "Instrument all code by default" | 745 | bool "Instrument all code by default" |
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index bc6e651df68c..a669c193b878 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan | |||
@@ -10,7 +10,8 @@ config UBSAN | |||
10 | This option enables undefined behaviour sanity checker | 10 | This option enables undefined behaviour sanity checker |
11 | Compile-time instrumentation is used to detect various undefined | 11 | Compile-time instrumentation is used to detect various undefined |
12 | behaviours in runtime. Various types of checks may be enabled | 12 | behaviours in runtime. Various types of checks may be enabled |
13 | via boot parameter ubsan_handle (see: Documentation/ubsan.txt). | 13 | via boot parameter ubsan_handle |
14 | (see: Documentation/dev-tools/ubsan.rst). | ||
14 | 15 | ||
15 | config UBSAN_SANITIZE_ALL | 16 | config UBSAN_SANITIZE_ALL |
16 | bool "Enable instrumentation for the entire kernel" | 17 | bool "Enable instrumentation for the entire kernel" |
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 2e8c6f7aa56e..0019aca0f328 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
@@ -22,6 +22,7 @@ | |||
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/cpu.h> | ||
25 | #include <linux/errno.h> | 26 | #include <linux/errno.h> |
26 | #include <linux/init.h> | 27 | #include <linux/init.h> |
27 | #include <linux/kernel.h> | 28 | #include <linux/kernel.h> |
@@ -69,6 +70,11 @@ struct radix_tree_preload { | |||
69 | }; | 70 | }; |
70 | static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; | 71 | static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; |
71 | 72 | ||
73 | static inline struct radix_tree_node *entry_to_node(void *ptr) | ||
74 | { | ||
75 | return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); | ||
76 | } | ||
77 | |||
72 | static inline void *node_to_entry(void *ptr) | 78 | static inline void *node_to_entry(void *ptr) |
73 | { | 79 | { |
74 | return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); | 80 | return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); |
@@ -191,13 +197,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) | |||
191 | * Returns next bit offset, or size if nothing found. | 197 | * Returns next bit offset, or size if nothing found. |
192 | */ | 198 | */ |
193 | static __always_inline unsigned long | 199 | static __always_inline unsigned long |
194 | radix_tree_find_next_bit(const unsigned long *addr, | 200 | radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, |
195 | unsigned long size, unsigned long offset) | 201 | unsigned long offset) |
196 | { | 202 | { |
197 | if (!__builtin_constant_p(size)) | 203 | const unsigned long *addr = node->tags[tag]; |
198 | return find_next_bit(addr, size, offset); | ||
199 | 204 | ||
200 | if (offset < size) { | 205 | if (offset < RADIX_TREE_MAP_SIZE) { |
201 | unsigned long tmp; | 206 | unsigned long tmp; |
202 | 207 | ||
203 | addr += offset / BITS_PER_LONG; | 208 | addr += offset / BITS_PER_LONG; |
@@ -205,14 +210,32 @@ radix_tree_find_next_bit(const unsigned long *addr, | |||
205 | if (tmp) | 210 | if (tmp) |
206 | return __ffs(tmp) + offset; | 211 | return __ffs(tmp) + offset; |
207 | offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); | 212 | offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); |
208 | while (offset < size) { | 213 | while (offset < RADIX_TREE_MAP_SIZE) { |
209 | tmp = *++addr; | 214 | tmp = *++addr; |
210 | if (tmp) | 215 | if (tmp) |
211 | return __ffs(tmp) + offset; | 216 | return __ffs(tmp) + offset; |
212 | offset += BITS_PER_LONG; | 217 | offset += BITS_PER_LONG; |
213 | } | 218 | } |
214 | } | 219 | } |
215 | return size; | 220 | return RADIX_TREE_MAP_SIZE; |
221 | } | ||
222 | |||
223 | static unsigned int iter_offset(const struct radix_tree_iter *iter) | ||
224 | { | ||
225 | return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * The maximum index which can be stored in a radix tree | ||
230 | */ | ||
231 | static inline unsigned long shift_maxindex(unsigned int shift) | ||
232 | { | ||
233 | return (RADIX_TREE_MAP_SIZE << shift) - 1; | ||
234 | } | ||
235 | |||
236 | static inline unsigned long node_maxindex(struct radix_tree_node *node) | ||
237 | { | ||
238 | return shift_maxindex(node->shift); | ||
216 | } | 239 | } |
217 | 240 | ||
218 | #ifndef __KERNEL__ | 241 | #ifndef __KERNEL__ |
@@ -220,10 +243,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) | |||
220 | { | 243 | { |
221 | unsigned long i; | 244 | unsigned long i; |
222 | 245 | ||
223 | pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", | 246 | pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", |
224 | node, node->offset, | 247 | node, node->offset, index, index | node_maxindex(node), |
248 | node->parent, | ||
225 | node->tags[0][0], node->tags[1][0], node->tags[2][0], | 249 | node->tags[0][0], node->tags[1][0], node->tags[2][0], |
226 | node->shift, node->count, node->exceptional, node->parent); | 250 | node->shift, node->count, node->exceptional); |
227 | 251 | ||
228 | for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { | 252 | for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { |
229 | unsigned long first = index | (i << node->shift); | 253 | unsigned long first = index | (i << node->shift); |
@@ -231,14 +255,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) | |||
231 | void *entry = node->slots[i]; | 255 | void *entry = node->slots[i]; |
232 | if (!entry) | 256 | if (!entry) |
233 | continue; | 257 | continue; |
234 | if (is_sibling_entry(node, entry)) { | 258 | if (entry == RADIX_TREE_RETRY) { |
235 | pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", | 259 | pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", |
236 | entry, i, | 260 | i, first, last, node); |
237 | *(void **)entry_to_node(entry), | ||
238 | first, last); | ||
239 | } else if (!radix_tree_is_internal_node(entry)) { | 261 | } else if (!radix_tree_is_internal_node(entry)) { |
240 | pr_debug("radix entry %p offset %ld indices %ld-%ld\n", | 262 | pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", |
241 | entry, i, first, last); | 263 | entry, i, first, last, node); |
264 | } else if (is_sibling_entry(node, entry)) { | ||
265 | pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", | ||
266 | entry, i, first, last, node, | ||
267 | *(void **)entry_to_node(entry)); | ||
242 | } else { | 268 | } else { |
243 | dump_node(entry_to_node(entry), first); | 269 | dump_node(entry_to_node(entry), first); |
244 | } | 270 | } |
@@ -262,7 +288,10 @@ static void radix_tree_dump(struct radix_tree_root *root) | |||
262 | * that the caller has pinned this thread of control to the current CPU. | 288 | * that the caller has pinned this thread of control to the current CPU. |
263 | */ | 289 | */ |
264 | static struct radix_tree_node * | 290 | static struct radix_tree_node * |
265 | radix_tree_node_alloc(struct radix_tree_root *root) | 291 | radix_tree_node_alloc(struct radix_tree_root *root, |
292 | struct radix_tree_node *parent, | ||
293 | unsigned int shift, unsigned int offset, | ||
294 | unsigned int count, unsigned int exceptional) | ||
266 | { | 295 | { |
267 | struct radix_tree_node *ret = NULL; | 296 | struct radix_tree_node *ret = NULL; |
268 | gfp_t gfp_mask = root_gfp_mask(root); | 297 | gfp_t gfp_mask = root_gfp_mask(root); |
@@ -307,6 +336,13 @@ radix_tree_node_alloc(struct radix_tree_root *root) | |||
307 | ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); | 336 | ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); |
308 | out: | 337 | out: |
309 | BUG_ON(radix_tree_is_internal_node(ret)); | 338 | BUG_ON(radix_tree_is_internal_node(ret)); |
339 | if (ret) { | ||
340 | ret->parent = parent; | ||
341 | ret->shift = shift; | ||
342 | ret->offset = offset; | ||
343 | ret->count = count; | ||
344 | ret->exceptional = exceptional; | ||
345 | } | ||
310 | return ret; | 346 | return ret; |
311 | } | 347 | } |
312 | 348 | ||
@@ -314,17 +350,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) | |||
314 | { | 350 | { |
315 | struct radix_tree_node *node = | 351 | struct radix_tree_node *node = |
316 | container_of(head, struct radix_tree_node, rcu_head); | 352 | container_of(head, struct radix_tree_node, rcu_head); |
317 | int i; | ||
318 | 353 | ||
319 | /* | 354 | /* |
320 | * must only free zeroed nodes into the slab. radix_tree_shrink | 355 | * Must only free zeroed nodes into the slab. We can be left with |
321 | * can leave us with a non-NULL entry in the first slot, so clear | 356 | * non-NULL entries by radix_tree_free_nodes, so clear the entries |
322 | * that here to make sure. | 357 | * and tags here. |
323 | */ | 358 | */ |
324 | for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) | 359 | memset(node->slots, 0, sizeof(node->slots)); |
325 | tag_clear(node, i, 0); | 360 | memset(node->tags, 0, sizeof(node->tags)); |
326 | 361 | INIT_LIST_HEAD(&node->private_list); | |
327 | node->slots[0] = NULL; | ||
328 | 362 | ||
329 | kmem_cache_free(radix_tree_node_cachep, node); | 363 | kmem_cache_free(radix_tree_node_cachep, node); |
330 | } | 364 | } |
@@ -344,7 +378,7 @@ radix_tree_node_free(struct radix_tree_node *node) | |||
344 | * To make use of this facility, the radix tree must be initialised without | 378 | * To make use of this facility, the radix tree must be initialised without |
345 | * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). | 379 | * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). |
346 | */ | 380 | */ |
347 | static int __radix_tree_preload(gfp_t gfp_mask, int nr) | 381 | static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) |
348 | { | 382 | { |
349 | struct radix_tree_preload *rtp; | 383 | struct radix_tree_preload *rtp; |
350 | struct radix_tree_node *node; | 384 | struct radix_tree_node *node; |
@@ -410,6 +444,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) | |||
410 | } | 444 | } |
411 | EXPORT_SYMBOL(radix_tree_maybe_preload); | 445 | EXPORT_SYMBOL(radix_tree_maybe_preload); |
412 | 446 | ||
447 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
448 | /* | ||
449 | * Preload with enough objects to ensure that we can split a single entry | ||
450 | * of order @old_order into many entries of size @new_order | ||
451 | */ | ||
452 | int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, | ||
453 | gfp_t gfp_mask) | ||
454 | { | ||
455 | unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); | ||
456 | unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - | ||
457 | (new_order / RADIX_TREE_MAP_SHIFT); | ||
458 | unsigned nr = 0; | ||
459 | |||
460 | WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); | ||
461 | BUG_ON(new_order >= old_order); | ||
462 | |||
463 | while (layers--) | ||
464 | nr = nr * RADIX_TREE_MAP_SIZE + 1; | ||
465 | return __radix_tree_preload(gfp_mask, top * nr); | ||
466 | } | ||
467 | #endif | ||
468 | |||
413 | /* | 469 | /* |
414 | * The same as function above, but preload number of nodes required to insert | 470 | * The same as function above, but preload number of nodes required to insert |
415 | * (1 << order) continuous naturally-aligned elements. | 471 | * (1 << order) continuous naturally-aligned elements. |
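radix_tree_split_preload() above sizes the preload pool from the two orders: top is the number of sibling slots the old entry spans in its own node, and each intermediate layer multiplies the node count by RADIX_TREE_MAP_SIZE. A stand-alone check of that arithmetic, assuming the common RADIX_TREE_MAP_SHIFT of 6 (CONFIG_BASE_SMALL builds use a smaller value):

#include <stdio.h>

#define RADIX_TREE_MAP_SHIFT 6                  /* assumed default */
#define RADIX_TREE_MAP_SIZE  (1UL << RADIX_TREE_MAP_SHIFT)

static unsigned long split_preload_count(unsigned old_order, unsigned new_order)
{
        unsigned long top = 1UL << (old_order % RADIX_TREE_MAP_SHIFT);
        unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
                          (new_order / RADIX_TREE_MAP_SHIFT);
        unsigned long nr = 0;

        while (layers--)
                nr = nr * RADIX_TREE_MAP_SIZE + 1;
        return top * nr;
}

int main(void)
{
        /* one order-9 entry split into order-0 entries: 8 nodes to preload */
        printf("9 -> 0: %lu\n", split_preload_count(9, 0));
        /* one order-18 entry split into order-9 entries: 65 nodes */
        printf("18 -> 9: %lu\n", split_preload_count(18, 9));
        return 0;
}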
@@ -455,19 +511,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) | |||
455 | return __radix_tree_preload(gfp_mask, nr_nodes); | 511 | return __radix_tree_preload(gfp_mask, nr_nodes); |
456 | } | 512 | } |
457 | 513 | ||
458 | /* | ||
459 | * The maximum index which can be stored in a radix tree | ||
460 | */ | ||
461 | static inline unsigned long shift_maxindex(unsigned int shift) | ||
462 | { | ||
463 | return (RADIX_TREE_MAP_SIZE << shift) - 1; | ||
464 | } | ||
465 | |||
466 | static inline unsigned long node_maxindex(struct radix_tree_node *node) | ||
467 | { | ||
468 | return shift_maxindex(node->shift); | ||
469 | } | ||
470 | |||
471 | static unsigned radix_tree_load_root(struct radix_tree_root *root, | 514 | static unsigned radix_tree_load_root(struct radix_tree_root *root, |
472 | struct radix_tree_node **nodep, unsigned long *maxindex) | 515 | struct radix_tree_node **nodep, unsigned long *maxindex) |
473 | { | 516 | { |
@@ -505,8 +548,8 @@ static int radix_tree_extend(struct radix_tree_root *root, | |||
505 | goto out; | 548 | goto out; |
506 | 549 | ||
507 | do { | 550 | do { |
508 | struct radix_tree_node *node = radix_tree_node_alloc(root); | 551 | struct radix_tree_node *node = radix_tree_node_alloc(root, |
509 | 552 | NULL, shift, 0, 1, 0); | |
510 | if (!node) | 553 | if (!node) |
511 | return -ENOMEM; | 554 | return -ENOMEM; |
512 | 555 | ||
@@ -517,16 +560,11 @@ static int radix_tree_extend(struct radix_tree_root *root, | |||
517 | } | 560 | } |
518 | 561 | ||
519 | BUG_ON(shift > BITS_PER_LONG); | 562 | BUG_ON(shift > BITS_PER_LONG); |
520 | node->shift = shift; | ||
521 | node->offset = 0; | ||
522 | node->count = 1; | ||
523 | node->parent = NULL; | ||
524 | if (radix_tree_is_internal_node(slot)) { | 563 | if (radix_tree_is_internal_node(slot)) { |
525 | entry_to_node(slot)->parent = node; | 564 | entry_to_node(slot)->parent = node; |
526 | } else { | 565 | } else if (radix_tree_exceptional_entry(slot)) { |
527 | /* Moving an exceptional root->rnode to a node */ | 566 | /* Moving an exceptional root->rnode to a node */ |
528 | if (radix_tree_exceptional_entry(slot)) | 567 | node->exceptional = 1; |
529 | node->exceptional = 1; | ||
530 | } | 568 | } |
531 | node->slots[0] = slot; | 569 | node->slots[0] = slot; |
532 | slot = node_to_entry(node); | 570 | slot = node_to_entry(node); |
@@ -665,26 +703,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, | |||
665 | shift = radix_tree_load_root(root, &child, &maxindex); | 703 | shift = radix_tree_load_root(root, &child, &maxindex); |
666 | 704 | ||
667 | /* Make sure the tree is high enough. */ | 705 | /* Make sure the tree is high enough. */ |
706 | if (order > 0 && max == ((1UL << order) - 1)) | ||
707 | max++; | ||
668 | if (max > maxindex) { | 708 | if (max > maxindex) { |
669 | int error = radix_tree_extend(root, max, shift); | 709 | int error = radix_tree_extend(root, max, shift); |
670 | if (error < 0) | 710 | if (error < 0) |
671 | return error; | 711 | return error; |
672 | shift = error; | 712 | shift = error; |
673 | child = root->rnode; | 713 | child = root->rnode; |
674 | if (order == shift) | ||
675 | shift += RADIX_TREE_MAP_SHIFT; | ||
676 | } | 714 | } |
677 | 715 | ||
678 | while (shift > order) { | 716 | while (shift > order) { |
679 | shift -= RADIX_TREE_MAP_SHIFT; | 717 | shift -= RADIX_TREE_MAP_SHIFT; |
680 | if (child == NULL) { | 718 | if (child == NULL) { |
681 | /* Have to add a child node. */ | 719 | /* Have to add a child node. */ |
682 | child = radix_tree_node_alloc(root); | 720 | child = radix_tree_node_alloc(root, node, shift, |
721 | offset, 0, 0); | ||
683 | if (!child) | 722 | if (!child) |
684 | return -ENOMEM; | 723 | return -ENOMEM; |
685 | child->shift = shift; | ||
686 | child->offset = offset; | ||
687 | child->parent = node; | ||
688 | rcu_assign_pointer(*slot, node_to_entry(child)); | 724 | rcu_assign_pointer(*slot, node_to_entry(child)); |
689 | if (node) | 725 | if (node) |
690 | node->count++; | 726 | node->count++; |
@@ -697,31 +733,125 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, | |||
697 | slot = &node->slots[offset]; | 733 | slot = &node->slots[offset]; |
698 | } | 734 | } |
699 | 735 | ||
736 | if (nodep) | ||
737 | *nodep = node; | ||
738 | if (slotp) | ||
739 | *slotp = slot; | ||
740 | return 0; | ||
741 | } | ||
742 | |||
700 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | 743 | #ifdef CONFIG_RADIX_TREE_MULTIORDER |
701 | /* Insert pointers to the canonical entry */ | 744 | /* |
702 | if (order > shift) { | 745 | * Free any nodes below this node. The tree is presumed to not need |
703 | unsigned i, n = 1 << (order - shift); | 746 | * shrinking, and any user data in the tree is presumed to not need a |
747 | * destructor called on it. If we need to add a destructor, we can | ||
748 | * add that functionality later. Note that we may not clear tags or | ||
749 | * slots from the tree as an RCU walker may still have a pointer into | ||
750 | * this subtree. We could replace the entries with RADIX_TREE_RETRY, | ||
751 | * but we'll still have to clear those in rcu_free. | ||
752 | */ | ||
753 | static void radix_tree_free_nodes(struct radix_tree_node *node) | ||
754 | { | ||
755 | unsigned offset = 0; | ||
756 | struct radix_tree_node *child = entry_to_node(node); | ||
757 | |||
758 | for (;;) { | ||
759 | void *entry = child->slots[offset]; | ||
760 | if (radix_tree_is_internal_node(entry) && | ||
761 | !is_sibling_entry(child, entry)) { | ||
762 | child = entry_to_node(entry); | ||
763 | offset = 0; | ||
764 | continue; | ||
765 | } | ||
766 | offset++; | ||
767 | while (offset == RADIX_TREE_MAP_SIZE) { | ||
768 | struct radix_tree_node *old = child; | ||
769 | offset = child->offset + 1; | ||
770 | child = child->parent; | ||
771 | radix_tree_node_free(old); | ||
772 | if (old == entry_to_node(node)) | ||
773 | return; | ||
774 | } | ||
775 | } | ||
776 | } | ||
777 | |||
778 | static inline int insert_entries(struct radix_tree_node *node, void **slot, | ||
779 | void *item, unsigned order, bool replace) | ||
780 | { | ||
781 | struct radix_tree_node *child; | ||
782 | unsigned i, n, tag, offset, tags = 0; | ||
783 | |||
784 | if (node) { | ||
785 | if (order > node->shift) | ||
786 | n = 1 << (order - node->shift); | ||
787 | else | ||
788 | n = 1; | ||
789 | offset = get_slot_offset(node, slot); | ||
790 | } else { | ||
791 | n = 1; | ||
792 | offset = 0; | ||
793 | } | ||
794 | |||
795 | if (n > 1) { | ||
704 | offset = offset & ~(n - 1); | 796 | offset = offset & ~(n - 1); |
705 | slot = &node->slots[offset]; | 797 | slot = &node->slots[offset]; |
706 | child = node_to_entry(slot); | 798 | } |
707 | for (i = 0; i < n; i++) { | 799 | child = node_to_entry(slot); |
708 | if (slot[i]) | 800 | |
801 | for (i = 0; i < n; i++) { | ||
802 | if (slot[i]) { | ||
803 | if (replace) { | ||
804 | node->count--; | ||
805 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
806 | if (tag_get(node, tag, offset + i)) | ||
807 | tags |= 1 << tag; | ||
808 | } else | ||
709 | return -EEXIST; | 809 | return -EEXIST; |
710 | } | 810 | } |
811 | } | ||
711 | 812 | ||
712 | for (i = 1; i < n; i++) { | 813 | for (i = 0; i < n; i++) { |
814 | struct radix_tree_node *old = slot[i]; | ||
815 | if (i) { | ||
713 | rcu_assign_pointer(slot[i], child); | 816 | rcu_assign_pointer(slot[i], child); |
714 | node->count++; | 817 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) |
818 | if (tags & (1 << tag)) | ||
819 | tag_clear(node, tag, offset + i); | ||
820 | } else { | ||
821 | rcu_assign_pointer(slot[i], item); | ||
822 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
823 | if (tags & (1 << tag)) | ||
824 | tag_set(node, tag, offset); | ||
715 | } | 825 | } |
826 | if (radix_tree_is_internal_node(old) && | ||
827 | !is_sibling_entry(node, old) && | ||
828 | (old != RADIX_TREE_RETRY)) | ||
829 | radix_tree_free_nodes(old); | ||
830 | if (radix_tree_exceptional_entry(old)) | ||
831 | node->exceptional--; | ||
716 | } | 832 | } |
717 | #endif | 833 | if (node) { |
718 | 834 | node->count += n; | |
719 | if (nodep) | 835 | if (radix_tree_exceptional_entry(item)) |
720 | *nodep = node; | 836 | node->exceptional += n; |
721 | if (slotp) | 837 | } |
722 | *slotp = slot; | 838 | return n; |
723 | return 0; | 839 | } |
840 | #else | ||
841 | static inline int insert_entries(struct radix_tree_node *node, void **slot, | ||
842 | void *item, unsigned order, bool replace) | ||
843 | { | ||
844 | if (*slot) | ||
845 | return -EEXIST; | ||
846 | rcu_assign_pointer(*slot, item); | ||
847 | if (node) { | ||
848 | node->count++; | ||
849 | if (radix_tree_exceptional_entry(item)) | ||
850 | node->exceptional++; | ||
851 | } | ||
852 | return 1; | ||
724 | } | 853 | } |
854 | #endif | ||
725 | 855 | ||
726 | /** | 856 | /** |
727 | * __radix_tree_insert - insert into a radix tree | 857 | * __radix_tree_insert - insert into a radix tree |
@@ -744,15 +874,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, | |||
744 | error = __radix_tree_create(root, index, order, &node, &slot); | 874 | error = __radix_tree_create(root, index, order, &node, &slot); |
745 | if (error) | 875 | if (error) |
746 | return error; | 876 | return error; |
747 | if (*slot != NULL) | 877 | |
748 | return -EEXIST; | 878 | error = insert_entries(node, slot, item, order, false); |
749 | rcu_assign_pointer(*slot, item); | 879 | if (error < 0) |
880 | return error; | ||
750 | 881 | ||
751 | if (node) { | 882 | if (node) { |
752 | unsigned offset = get_slot_offset(node, slot); | 883 | unsigned offset = get_slot_offset(node, slot); |
753 | node->count++; | ||
754 | if (radix_tree_exceptional_entry(item)) | ||
755 | node->exceptional++; | ||
756 | BUG_ON(tag_get(node, 0, offset)); | 884 | BUG_ON(tag_get(node, 0, offset)); |
757 | BUG_ON(tag_get(node, 1, offset)); | 885 | BUG_ON(tag_get(node, 1, offset)); |
758 | BUG_ON(tag_get(node, 2, offset)); | 886 | BUG_ON(tag_get(node, 2, offset)); |
@@ -850,6 +978,24 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) | |||
850 | } | 978 | } |
851 | EXPORT_SYMBOL(radix_tree_lookup); | 979 | EXPORT_SYMBOL(radix_tree_lookup); |
852 | 980 | ||
981 | static inline int slot_count(struct radix_tree_node *node, | ||
982 | void **slot) | ||
983 | { | ||
984 | int n = 1; | ||
985 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
986 | void *ptr = node_to_entry(slot); | ||
987 | unsigned offset = get_slot_offset(node, slot); | ||
988 | int i; | ||
989 | |||
990 | for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { | ||
991 | if (node->slots[offset + i] != ptr) | ||
992 | break; | ||
993 | n++; | ||
994 | } | ||
995 | #endif | ||
996 | return n; | ||
997 | } | ||
998 | |||
853 | static void replace_slot(struct radix_tree_root *root, | 999 | static void replace_slot(struct radix_tree_root *root, |
854 | struct radix_tree_node *node, | 1000 | struct radix_tree_node *node, |
855 | void **slot, void *item, | 1001 | void **slot, void *item, |
@@ -868,12 +1014,35 @@ static void replace_slot(struct radix_tree_root *root, | |||
868 | 1014 | ||
869 | if (node) { | 1015 | if (node) { |
870 | node->count += count; | 1016 | node->count += count; |
871 | node->exceptional += exceptional; | 1017 | if (exceptional) { |
1018 | exceptional *= slot_count(node, slot); | ||
1019 | node->exceptional += exceptional; | ||
1020 | } | ||
872 | } | 1021 | } |
873 | 1022 | ||
874 | rcu_assign_pointer(*slot, item); | 1023 | rcu_assign_pointer(*slot, item); |
875 | } | 1024 | } |
876 | 1025 | ||
1026 | static inline void delete_sibling_entries(struct radix_tree_node *node, | ||
1027 | void **slot) | ||
1028 | { | ||
1029 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
1030 | bool exceptional = radix_tree_exceptional_entry(*slot); | ||
1031 | void *ptr = node_to_entry(slot); | ||
1032 | unsigned offset = get_slot_offset(node, slot); | ||
1033 | int i; | ||
1034 | |||
1035 | for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { | ||
1036 | if (node->slots[offset + i] != ptr) | ||
1037 | break; | ||
1038 | node->slots[offset + i] = NULL; | ||
1039 | node->count--; | ||
1040 | if (exceptional) | ||
1041 | node->exceptional--; | ||
1042 | } | ||
1043 | #endif | ||
1044 | } | ||
1045 | |||
877 | /** | 1046 | /** |
878 | * __radix_tree_replace - replace item in a slot | 1047 | * __radix_tree_replace - replace item in a slot |
879 | * @root: radix tree root | 1048 | * @root: radix tree root |
@@ -891,6 +1060,8 @@ void __radix_tree_replace(struct radix_tree_root *root, | |||
891 | void **slot, void *item, | 1060 | void **slot, void *item, |
892 | radix_tree_update_node_t update_node, void *private) | 1061 | radix_tree_update_node_t update_node, void *private) |
893 | { | 1062 | { |
1063 | if (!item) | ||
1064 | delete_sibling_entries(node, slot); | ||
894 | /* | 1065 | /* |
895 | * This function supports replacing exceptional entries and | 1066 | * This function supports replacing exceptional entries and |
896 | * deleting entries, but that needs accounting against the | 1067 | * deleting entries, but that needs accounting against the |
@@ -921,7 +1092,8 @@ void __radix_tree_replace(struct radix_tree_root *root, | |||
921 | * NOTE: This cannot be used to switch between non-entries (empty slots), | 1092 | * NOTE: This cannot be used to switch between non-entries (empty slots), |
922 | * regular entries, and exceptional entries, as that requires accounting | 1093 | * regular entries, and exceptional entries, as that requires accounting |
923 | * inside the radix tree node. When switching from one type of entry or | 1094 | * inside the radix tree node. When switching from one type of entry or |
924 | * deleting, use __radix_tree_lookup() and __radix_tree_replace(). | 1095 | * deleting, use __radix_tree_lookup() and __radix_tree_replace() or |
1096 | * radix_tree_iter_replace(). | ||
925 | */ | 1097 | */ |
926 | void radix_tree_replace_slot(struct radix_tree_root *root, | 1098 | void radix_tree_replace_slot(struct radix_tree_root *root, |
927 | void **slot, void *item) | 1099 | void **slot, void *item) |
@@ -930,6 +1102,164 @@ void radix_tree_replace_slot(struct radix_tree_root *root, | |||
930 | } | 1102 | } |
931 | 1103 | ||
932 | /** | 1104 | /** |
1105 | * radix_tree_iter_replace - replace item in a slot | ||
1106 | * @root: radix tree root | ||
1107 | * @slot: pointer to slot | ||
1108 | * @item: new item to store in the slot. | ||
1109 | * | ||
1110 | * For use with radix_tree_split() and radix_tree_for_each_slot(). | ||
1111 | * Caller must hold the tree write-locked across the split and replacement. | ||
1112 | */ | ||
1113 | void radix_tree_iter_replace(struct radix_tree_root *root, | ||
1114 | const struct radix_tree_iter *iter, void **slot, void *item) | ||
1115 | { | ||
1116 | __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); | ||
1117 | } | ||
1118 | |||
1119 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
1120 | /** | ||
1121 | * radix_tree_join - replace multiple entries with one multiorder entry | ||
1122 | * @root: radix tree root | ||
1123 | * @index: an index inside the new entry | ||
1124 | * @order: order of the new entry | ||
1125 | * @item: new entry | ||
1126 | * | ||
1127 | * Call this function to replace several entries with one larger entry. | ||
1128 | * The existing entries are presumed to not need freeing as a result of | ||
1129 | * this call. | ||
1130 | * | ||
1131 | * The replacement entry will have all the tags set on it that were set | ||
1132 | * on any of the entries it is replacing. | ||
1133 | */ | ||
1134 | int radix_tree_join(struct radix_tree_root *root, unsigned long index, | ||
1135 | unsigned order, void *item) | ||
1136 | { | ||
1137 | struct radix_tree_node *node; | ||
1138 | void **slot; | ||
1139 | int error; | ||
1140 | |||
1141 | BUG_ON(radix_tree_is_internal_node(item)); | ||
1142 | |||
1143 | error = __radix_tree_create(root, index, order, &node, &slot); | ||
1144 | if (!error) | ||
1145 | error = insert_entries(node, slot, item, order, true); | ||
1146 | if (error > 0) | ||
1147 | error = 0; | ||
1148 | |||
1149 | return error; | ||
1150 | } | ||
1151 | |||
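(Editor's sketch, not part of the diff.) A minimal illustration of how radix_tree_join() above is meant to be called. The helper name join_range() and its parameters are hypothetical; locking and preloading are left to the caller, as the kernel-doc requires.

#include <linux/radix-tree.h>

/*
 * Hypothetical sketch: collapse the entries covering an @order-aligned
 * range around @index into one multiorder entry.  The tree lock is
 * assumed held; the displaced entries are overwritten, not freed, so
 * the caller must drop any references to them separately.
 */
static int join_range(struct radix_tree_root *root, unsigned long index,
                      unsigned order, void *big_item)
{
        /* Returns 0 on success, -ENOMEM if a node allocation failed. */
        return radix_tree_join(root, index, order, big_item);
}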
1152 | /** | ||
1153 | * radix_tree_split - Split an entry into smaller entries | ||
1154 | * @root: radix tree root | ||
1155 | * @index: An index within the large entry | ||
1156 | * @order: Order of new entries | ||
1157 | * | ||
1158 | * Call this function as the first step in replacing a multiorder entry | ||
1159 | * with several entries of lower order. After this function returns, | ||
1160 | * loop over the relevant portion of the tree using radix_tree_for_each_slot() | ||
1161 | * and call radix_tree_iter_replace() to set up each new entry. | ||
1162 | * | ||
1163 | * The tags from this entry are replicated to all the new entries. | ||
1164 | * | ||
1165 | * The radix tree should be locked against modification during the entire | ||
1166 | * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which | ||
1167 | * should prompt RCU walkers to restart the lookup from the root. | ||
1168 | */ | ||
1169 | int radix_tree_split(struct radix_tree_root *root, unsigned long index, | ||
1170 | unsigned order) | ||
1171 | { | ||
1172 | struct radix_tree_node *parent, *node, *child; | ||
1173 | void **slot; | ||
1174 | unsigned int offset, end; | ||
1175 | unsigned n, tag, tags = 0; | ||
1176 | |||
1177 | if (!__radix_tree_lookup(root, index, &parent, &slot)) | ||
1178 | return -ENOENT; | ||
1179 | if (!parent) | ||
1180 | return -ENOENT; | ||
1181 | |||
1182 | offset = get_slot_offset(parent, slot); | ||
1183 | |||
1184 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
1185 | if (tag_get(parent, tag, offset)) | ||
1186 | tags |= 1 << tag; | ||
1187 | |||
1188 | for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { | ||
1189 | if (!is_sibling_entry(parent, parent->slots[end])) | ||
1190 | break; | ||
1191 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
1192 | if (tags & (1 << tag)) | ||
1193 | tag_set(parent, tag, end); | ||
1194 | /* rcu_assign_pointer ensures tags are set before RETRY */ | ||
1195 | rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); | ||
1196 | } | ||
1197 | rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); | ||
1198 | parent->exceptional -= (end - offset); | ||
1199 | |||
1200 | if (order == parent->shift) | ||
1201 | return 0; | ||
1202 | if (order > parent->shift) { | ||
1203 | while (offset < end) | ||
1204 | offset += insert_entries(parent, &parent->slots[offset], | ||
1205 | RADIX_TREE_RETRY, order, true); | ||
1206 | return 0; | ||
1207 | } | ||
1208 | |||
1209 | node = parent; | ||
1210 | |||
1211 | for (;;) { | ||
1212 | if (node->shift > order) { | ||
1213 | child = radix_tree_node_alloc(root, node, | ||
1214 | node->shift - RADIX_TREE_MAP_SHIFT, | ||
1215 | offset, 0, 0); | ||
1216 | if (!child) | ||
1217 | goto nomem; | ||
1218 | if (node != parent) { | ||
1219 | node->count++; | ||
1220 | node->slots[offset] = node_to_entry(child); | ||
1221 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
1222 | if (tags & (1 << tag)) | ||
1223 | tag_set(node, tag, offset); | ||
1224 | } | ||
1225 | |||
1226 | node = child; | ||
1227 | offset = 0; | ||
1228 | continue; | ||
1229 | } | ||
1230 | |||
1231 | n = insert_entries(node, &node->slots[offset], | ||
1232 | RADIX_TREE_RETRY, order, false); | ||
1233 | BUG_ON(n > RADIX_TREE_MAP_SIZE); | ||
1234 | |||
1235 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
1236 | if (tags & (1 << tag)) | ||
1237 | tag_set(node, tag, offset); | ||
1238 | offset += n; | ||
1239 | |||
1240 | while (offset == RADIX_TREE_MAP_SIZE) { | ||
1241 | if (node == parent) | ||
1242 | break; | ||
1243 | offset = node->offset; | ||
1244 | child = node; | ||
1245 | node = node->parent; | ||
1246 | rcu_assign_pointer(node->slots[offset], | ||
1247 | node_to_entry(child)); | ||
1248 | offset++; | ||
1249 | } | ||
1250 | if ((node == parent) && (offset == end)) | ||
1251 | return 0; | ||
1252 | } | ||
1253 | |||
1254 | nomem: | ||
1255 | /* Shouldn't happen; did user forget to preload? */ | ||
1256 | /* TODO: free all the allocated nodes */ | ||
1257 | WARN_ON(1); | ||
1258 | return -ENOMEM; | ||
1259 | } | ||
1260 | #endif | ||
1261 | |||
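(Editor's sketch, not part of the diff.) A rough illustration of the split-then-replace sequence the radix_tree_split() kernel-doc above describes. split_entry() and make_item() are hypothetical names, @index is assumed aligned to the old order, and the caller is assumed to hold the tree lock and to have preloaded beforehand (note the "did user forget to preload?" comment in the nomem path).

#include <linux/radix-tree.h>

/*
 * Hypothetical sketch: replace one order-@old_order entry at @index with
 * 1 << (old_order - new_order) entries of order @new_order.
 */
static int split_entry(struct radix_tree_root *root, unsigned long index,
                       unsigned old_order, unsigned new_order,
                       void *(*make_item)(unsigned long index))
{
        struct radix_tree_iter iter;
        void **slot;
        unsigned long end = index + (1UL << old_order);
        int err;

        err = radix_tree_split(root, index, new_order);
        if (err)
                return err;

        /*
         * Every slot that covered the old entry now holds RADIX_TREE_RETRY;
         * walk them and install the replacement entries.
         */
        radix_tree_for_each_slot(slot, root, &iter, index) {
                if (iter.index >= end)
                        break;
                radix_tree_iter_replace(root, &iter, slot,
                                        make_item(iter.index));
        }
        return 0;
}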
1262 | /** | ||
933 | * radix_tree_tag_set - set a tag on a radix tree node | 1263 | * radix_tree_tag_set - set a tag on a radix tree node |
934 | * @root: radix tree root | 1264 | * @root: radix tree root |
935 | * @index: index key | 1265 | * @index: index key |
@@ -990,6 +1320,34 @@ static void node_tag_clear(struct radix_tree_root *root, | |||
990 | root_tag_clear(root, tag); | 1320 | root_tag_clear(root, tag); |
991 | } | 1321 | } |
992 | 1322 | ||
1323 | static void node_tag_set(struct radix_tree_root *root, | ||
1324 | struct radix_tree_node *node, | ||
1325 | unsigned int tag, unsigned int offset) | ||
1326 | { | ||
1327 | while (node) { | ||
1328 | if (tag_get(node, tag, offset)) | ||
1329 | return; | ||
1330 | tag_set(node, tag, offset); | ||
1331 | offset = node->offset; | ||
1332 | node = node->parent; | ||
1333 | } | ||
1334 | |||
1335 | if (!root_tag_get(root, tag)) | ||
1336 | root_tag_set(root, tag); | ||
1337 | } | ||
1338 | |||
1339 | /** | ||
1340 | * radix_tree_iter_tag_set - set a tag on the current iterator entry | ||
1341 | * @root: radix tree root | ||
1342 | * @iter: iterator state | ||
1343 | * @tag: tag to set | ||
1344 | */ | ||
1345 | void radix_tree_iter_tag_set(struct radix_tree_root *root, | ||
1346 | const struct radix_tree_iter *iter, unsigned int tag) | ||
1347 | { | ||
1348 | node_tag_set(root, iter->node, tag, iter_offset(iter)); | ||
1349 | } | ||
1350 | |||
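(Editor's sketch, not part of the diff.) radix_tree_iter_tag_set() enables the pattern that takes over from radix_tree_range_tag_if_tagged(), which is deleted later in this diff: iterate the entries carrying one tag and set another on each as it is visited. retag_range() and the tag values are illustrative only; the caller is assumed to hold the tree lock.

#include <linux/radix-tree.h>

/*
 * Hypothetical sketch: for every entry in [start, end] that has @iftag
 * set, also set @settag.
 */
static void retag_range(struct radix_tree_root *root, unsigned long start,
                        unsigned long end, unsigned int iftag,
                        unsigned int settag)
{
        struct radix_tree_iter iter;
        void **slot;

        radix_tree_for_each_tagged(slot, root, &iter, start, iftag) {
                if (iter.index > end)
                        break;
                radix_tree_iter_tag_set(root, &iter, settag);
        }
}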
993 | /** | 1351 | /** |
994 | * radix_tree_tag_clear - clear a tag on a radix tree node | 1352 | * radix_tree_tag_clear - clear a tag on a radix tree node |
995 | * @root: radix tree root | 1353 | * @root: radix tree root |
@@ -1085,6 +1443,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, | |||
1085 | #endif | 1443 | #endif |
1086 | } | 1444 | } |
1087 | 1445 | ||
1446 | /* Construct iter->tags bit-mask from node->tags[tag] array */ | ||
1447 | static void set_iter_tags(struct radix_tree_iter *iter, | ||
1448 | struct radix_tree_node *node, unsigned offset, | ||
1449 | unsigned tag) | ||
1450 | { | ||
1451 | unsigned tag_long = offset / BITS_PER_LONG; | ||
1452 | unsigned tag_bit = offset % BITS_PER_LONG; | ||
1453 | |||
1454 | iter->tags = node->tags[tag][tag_long] >> tag_bit; | ||
1455 | |||
1456 | /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ | ||
1457 | if (tag_long < RADIX_TREE_TAG_LONGS - 1) { | ||
1458 | /* Pick tags from next element */ | ||
1459 | if (tag_bit) | ||
1460 | iter->tags |= node->tags[tag][tag_long + 1] << | ||
1461 | (BITS_PER_LONG - tag_bit); | ||
1462 | /* Clip chunk size, here only BITS_PER_LONG tags */ | ||
1463 | iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); | ||
1464 | } | ||
1465 | } | ||
1466 | |||
1467 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
1468 | static void **skip_siblings(struct radix_tree_node **nodep, | ||
1469 | void **slot, struct radix_tree_iter *iter) | ||
1470 | { | ||
1471 | void *sib = node_to_entry(slot - 1); | ||
1472 | |||
1473 | while (iter->index < iter->next_index) { | ||
1474 | *nodep = rcu_dereference_raw(*slot); | ||
1475 | if (*nodep && *nodep != sib) | ||
1476 | return slot; | ||
1477 | slot++; | ||
1478 | iter->index = __radix_tree_iter_add(iter, 1); | ||
1479 | iter->tags >>= 1; | ||
1480 | } | ||
1481 | |||
1482 | *nodep = NULL; | ||
1483 | return NULL; | ||
1484 | } | ||
1485 | |||
1486 | void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, | ||
1487 | unsigned flags) | ||
1488 | { | ||
1489 | unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; | ||
1490 | struct radix_tree_node *node = rcu_dereference_raw(*slot); | ||
1491 | |||
1492 | slot = skip_siblings(&node, slot, iter); | ||
1493 | |||
1494 | while (radix_tree_is_internal_node(node)) { | ||
1495 | unsigned offset; | ||
1496 | unsigned long next_index; | ||
1497 | |||
1498 | if (node == RADIX_TREE_RETRY) | ||
1499 | return slot; | ||
1500 | node = entry_to_node(node); | ||
1501 | iter->node = node; | ||
1502 | iter->shift = node->shift; | ||
1503 | |||
1504 | if (flags & RADIX_TREE_ITER_TAGGED) { | ||
1505 | offset = radix_tree_find_next_bit(node, tag, 0); | ||
1506 | if (offset == RADIX_TREE_MAP_SIZE) | ||
1507 | return NULL; | ||
1508 | slot = &node->slots[offset]; | ||
1509 | iter->index = __radix_tree_iter_add(iter, offset); | ||
1510 | set_iter_tags(iter, node, offset, tag); | ||
1511 | node = rcu_dereference_raw(*slot); | ||
1512 | } else { | ||
1513 | offset = 0; | ||
1514 | slot = &node->slots[0]; | ||
1515 | for (;;) { | ||
1516 | node = rcu_dereference_raw(*slot); | ||
1517 | if (node) | ||
1518 | break; | ||
1519 | slot++; | ||
1520 | offset++; | ||
1521 | if (offset == RADIX_TREE_MAP_SIZE) | ||
1522 | return NULL; | ||
1523 | } | ||
1524 | iter->index = __radix_tree_iter_add(iter, offset); | ||
1525 | } | ||
1526 | if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) | ||
1527 | goto none; | ||
1528 | next_index = (iter->index | shift_maxindex(iter->shift)) + 1; | ||
1529 | if (next_index < iter->next_index) | ||
1530 | iter->next_index = next_index; | ||
1531 | } | ||
1532 | |||
1533 | return slot; | ||
1534 | none: | ||
1535 | iter->next_index = 0; | ||
1536 | return NULL; | ||
1537 | } | ||
1538 | EXPORT_SYMBOL(__radix_tree_next_slot); | ||
1539 | #else | ||
1540 | static void **skip_siblings(struct radix_tree_node **nodep, | ||
1541 | void **slot, struct radix_tree_iter *iter) | ||
1542 | { | ||
1543 | return slot; | ||
1544 | } | ||
1545 | #endif | ||
1546 | |||
1547 | void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) | ||
1548 | { | ||
1549 | struct radix_tree_node *node; | ||
1550 | |||
1551 | slot++; | ||
1552 | iter->index = __radix_tree_iter_add(iter, 1); | ||
1553 | node = rcu_dereference_raw(*slot); | ||
1554 | skip_siblings(&node, slot, iter); | ||
1555 | iter->next_index = iter->index; | ||
1556 | iter->tags = 0; | ||
1557 | return NULL; | ||
1558 | } | ||
1559 | EXPORT_SYMBOL(radix_tree_iter_resume); | ||
1560 | |||
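(Editor's sketch, not part of the diff.) radix_tree_iter_resume() lets a caller pause in the middle of an iteration, drop locks or reschedule, and pick up cleanly at the next index on the following loop pass. A hedged sketch of the intended pattern; scan_tree() is a hypothetical caller.

#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical sketch: a long RCU-protected scan that yields periodically. */
static void scan_tree(struct radix_tree_root *root)
{
        struct radix_tree_iter iter;
        void **slot;

        rcu_read_lock();
        radix_tree_for_each_slot(slot, root, &iter, 0) {
                /* ... examine rcu_dereference_raw(*slot) here ... */

                if (need_resched()) {
                        slot = radix_tree_iter_resume(slot, &iter);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();
}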
1088 | /** | 1561 | /** |
1089 | * radix_tree_next_chunk - find next chunk of slots for iteration | 1562 | * radix_tree_next_chunk - find next chunk of slots for iteration |
1090 | * | 1563 | * |
@@ -1110,7 +1583,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
1110 | * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. | 1583 | * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. |
1111 | * | 1584 | * |
1113 | * This condition is also used by radix_tree_next_slot() to stop | 1586 | * This condition is also used by radix_tree_next_slot() to stop |
1113 | * contiguous iterating, and forbid swithing to the next chunk. | 1586 | * contiguous iterating, and forbid switching to the next chunk. |
1114 | */ | 1587 | */ |
1115 | index = iter->next_index; | 1588 | index = iter->next_index; |
1116 | if (!index && iter->index) | 1589 | if (!index && iter->index) |
@@ -1128,6 +1601,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
1128 | iter->index = index; | 1601 | iter->index = index; |
1129 | iter->next_index = maxindex + 1; | 1602 | iter->next_index = maxindex + 1; |
1130 | iter->tags = 1; | 1603 | iter->tags = 1; |
1604 | iter->node = NULL; | ||
1131 | __set_iter_shift(iter, 0); | 1605 | __set_iter_shift(iter, 0); |
1132 | return (void **)&root->rnode; | 1606 | return (void **)&root->rnode; |
1133 | } | 1607 | } |
@@ -1143,9 +1617,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
1143 | return NULL; | 1617 | return NULL; |
1144 | 1618 | ||
1145 | if (flags & RADIX_TREE_ITER_TAGGED) | 1619 | if (flags & RADIX_TREE_ITER_TAGGED) |
1146 | offset = radix_tree_find_next_bit( | 1620 | offset = radix_tree_find_next_bit(node, tag, |
1147 | node->tags[tag], | ||
1148 | RADIX_TREE_MAP_SIZE, | ||
1149 | offset + 1); | 1621 | offset + 1); |
1150 | else | 1622 | else |
1151 | while (++offset < RADIX_TREE_MAP_SIZE) { | 1623 | while (++offset < RADIX_TREE_MAP_SIZE) { |
@@ -1165,154 +1637,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
1165 | child = rcu_dereference_raw(node->slots[offset]); | 1637 | child = rcu_dereference_raw(node->slots[offset]); |
1166 | } | 1638 | } |
1167 | 1639 | ||
1168 | if ((child == NULL) || (child == RADIX_TREE_RETRY)) | 1640 | if (!child) |
1169 | goto restart; | 1641 | goto restart; |
1642 | if (child == RADIX_TREE_RETRY) | ||
1643 | break; | ||
1170 | } while (radix_tree_is_internal_node(child)); | 1644 | } while (radix_tree_is_internal_node(child)); |
1171 | 1645 | ||
1172 | /* Update the iterator state */ | 1646 | /* Update the iterator state */ |
1173 | iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); | 1647 | iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); |
1174 | iter->next_index = (index | node_maxindex(node)) + 1; | 1648 | iter->next_index = (index | node_maxindex(node)) + 1; |
1649 | iter->node = node; | ||
1175 | __set_iter_shift(iter, node->shift); | 1650 | __set_iter_shift(iter, node->shift); |
1176 | 1651 | ||
1177 | /* Construct iter->tags bit-mask from node->tags[tag] array */ | 1652 | if (flags & RADIX_TREE_ITER_TAGGED) |
1178 | if (flags & RADIX_TREE_ITER_TAGGED) { | 1653 | set_iter_tags(iter, node, offset, tag); |
1179 | unsigned tag_long, tag_bit; | ||
1180 | |||
1181 | tag_long = offset / BITS_PER_LONG; | ||
1182 | tag_bit = offset % BITS_PER_LONG; | ||
1183 | iter->tags = node->tags[tag][tag_long] >> tag_bit; | ||
1184 | /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ | ||
1185 | if (tag_long < RADIX_TREE_TAG_LONGS - 1) { | ||
1186 | /* Pick tags from next element */ | ||
1187 | if (tag_bit) | ||
1188 | iter->tags |= node->tags[tag][tag_long + 1] << | ||
1189 | (BITS_PER_LONG - tag_bit); | ||
1190 | /* Clip chunk size, here only BITS_PER_LONG tags */ | ||
1191 | iter->next_index = index + BITS_PER_LONG; | ||
1192 | } | ||
1193 | } | ||
1194 | 1654 | ||
1195 | return node->slots + offset; | 1655 | return node->slots + offset; |
1196 | } | 1656 | } |
1197 | EXPORT_SYMBOL(radix_tree_next_chunk); | 1657 | EXPORT_SYMBOL(radix_tree_next_chunk); |
1198 | 1658 | ||
1199 | /** | 1659 | /** |
1200 | * radix_tree_range_tag_if_tagged - for each item in given range set given | ||
1201 | * tag if item has another tag set | ||
1202 | * @root: radix tree root | ||
1203 | * @first_indexp: pointer to a starting index of a range to scan | ||
1204 | * @last_index: last index of a range to scan | ||
1205 | * @nr_to_tag: maximum number items to tag | ||
1206 | * @iftag: tag index to test | ||
1207 | * @settag: tag index to set if tested tag is set | ||
1208 | * | ||
1209 | * This function scans the range of the radix tree from first_index to | ||
1210 | * last_index (inclusive). For each item in the range, if iftag is set, the | ||
1211 | * function also sets settag. It stops either after tagging nr_to_tag items or | ||
1212 | * after reaching last_index. | ||
1213 | * | ||
1214 | * The tags must be set from the leaf level only and propagated back up the | ||
1215 | * path to the root. We must do this so that we resolve the full path before | ||
1216 | * setting any tags on intermediate nodes. If we set tags as we descend, then | ||
1217 | * we can get to the leaf node and find that the index that has the iftag | ||
1218 | * set is outside the range we are scanning. This results in dangling tags and | ||
1219 | * can lead to problems with later tag operations (e.g. livelocks on lookups). | ||
1220 | * | ||
1221 | * The function returns the number of leaves where the tag was set and sets | ||
1222 | * *first_indexp to the first unscanned index. | ||
1223 | * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must | ||
1224 | * be prepared to handle that. | ||
1225 | */ | ||
1226 | unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | ||
1227 | unsigned long *first_indexp, unsigned long last_index, | ||
1228 | unsigned long nr_to_tag, | ||
1229 | unsigned int iftag, unsigned int settag) | ||
1230 | { | ||
1231 | struct radix_tree_node *parent, *node, *child; | ||
1232 | unsigned long maxindex; | ||
1233 | unsigned long tagged = 0; | ||
1234 | unsigned long index = *first_indexp; | ||
1235 | |||
1236 | radix_tree_load_root(root, &child, &maxindex); | ||
1237 | last_index = min(last_index, maxindex); | ||
1238 | if (index > last_index) | ||
1239 | return 0; | ||
1240 | if (!nr_to_tag) | ||
1241 | return 0; | ||
1242 | if (!root_tag_get(root, iftag)) { | ||
1243 | *first_indexp = last_index + 1; | ||
1244 | return 0; | ||
1245 | } | ||
1246 | if (!radix_tree_is_internal_node(child)) { | ||
1247 | *first_indexp = last_index + 1; | ||
1248 | root_tag_set(root, settag); | ||
1249 | return 1; | ||
1250 | } | ||
1251 | |||
1252 | node = entry_to_node(child); | ||
1253 | |||
1254 | for (;;) { | ||
1255 | unsigned offset = radix_tree_descend(node, &child, index); | ||
1256 | if (!child) | ||
1257 | goto next; | ||
1258 | if (!tag_get(node, iftag, offset)) | ||
1259 | goto next; | ||
1260 | /* Sibling slots never have tags set on them */ | ||
1261 | if (radix_tree_is_internal_node(child)) { | ||
1262 | node = entry_to_node(child); | ||
1263 | continue; | ||
1264 | } | ||
1265 | |||
1266 | /* tag the leaf */ | ||
1267 | tagged++; | ||
1268 | tag_set(node, settag, offset); | ||
1269 | |||
1270 | /* walk back up the path tagging interior nodes */ | ||
1271 | parent = node; | ||
1272 | for (;;) { | ||
1273 | offset = parent->offset; | ||
1274 | parent = parent->parent; | ||
1275 | if (!parent) | ||
1276 | break; | ||
1277 | /* stop if we find a node with the tag already set */ | ||
1278 | if (tag_get(parent, settag, offset)) | ||
1279 | break; | ||
1280 | tag_set(parent, settag, offset); | ||
1281 | } | ||
1282 | next: | ||
1283 | /* Go to next entry in node */ | ||
1284 | index = ((index >> node->shift) + 1) << node->shift; | ||
1285 | /* Overflow can happen when last_index is ~0UL... */ | ||
1286 | if (index > last_index || !index) | ||
1287 | break; | ||
1288 | offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; | ||
1289 | while (offset == 0) { | ||
1290 | /* | ||
1291 | * We've fully scanned this node. Go up. Because | ||
1292 | * last_index is guaranteed to be in the tree, what | ||
1293 | * we do below cannot wander astray. | ||
1294 | */ | ||
1295 | node = node->parent; | ||
1296 | offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; | ||
1297 | } | ||
1298 | if (is_sibling_entry(node, node->slots[offset])) | ||
1299 | goto next; | ||
1300 | if (tagged >= nr_to_tag) | ||
1301 | break; | ||
1302 | } | ||
1303 | /* | ||
1304 | * We need not set the root tag if nothing within the range from | ||
1305 | * *first_indexp to last_index ended up tagged with settag. | ||
1306 | */ | ||
1307 | if (tagged > 0) | ||
1308 | root_tag_set(root, settag); | ||
1309 | *first_indexp = index; | ||
1310 | |||
1311 | return tagged; | ||
1312 | } | ||
1313 | EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); | ||
1314 | |||
1315 | /** | ||
1316 | * radix_tree_gang_lookup - perform multiple lookup on a radix tree | 1660 | * radix_tree_gang_lookup - perform multiple lookup on a radix tree |
1317 | * @root: radix tree root | 1661 | * @root: radix tree root |
1318 | * @results: where the results of the lookup are placed | 1662 | * @results: where the results of the lookup are placed |
@@ -1477,105 +1821,6 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, | |||
1477 | } | 1821 | } |
1478 | EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); | 1822 | EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); |
1479 | 1823 | ||
1480 | #if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) | ||
1481 | #include <linux/sched.h> /* for cond_resched() */ | ||
1482 | |||
1483 | struct locate_info { | ||
1484 | unsigned long found_index; | ||
1485 | bool stop; | ||
1486 | }; | ||
1487 | |||
1488 | /* | ||
1489 | * This linear search is at present only useful to shmem_unuse_inode(). | ||
1490 | */ | ||
1491 | static unsigned long __locate(struct radix_tree_node *slot, void *item, | ||
1492 | unsigned long index, struct locate_info *info) | ||
1493 | { | ||
1494 | unsigned long i; | ||
1495 | |||
1496 | do { | ||
1497 | unsigned int shift = slot->shift; | ||
1498 | |||
1499 | for (i = (index >> shift) & RADIX_TREE_MAP_MASK; | ||
1500 | i < RADIX_TREE_MAP_SIZE; | ||
1501 | i++, index += (1UL << shift)) { | ||
1502 | struct radix_tree_node *node = | ||
1503 | rcu_dereference_raw(slot->slots[i]); | ||
1504 | if (node == RADIX_TREE_RETRY) | ||
1505 | goto out; | ||
1506 | if (!radix_tree_is_internal_node(node)) { | ||
1507 | if (node == item) { | ||
1508 | info->found_index = index; | ||
1509 | info->stop = true; | ||
1510 | goto out; | ||
1511 | } | ||
1512 | continue; | ||
1513 | } | ||
1514 | node = entry_to_node(node); | ||
1515 | if (is_sibling_entry(slot, node)) | ||
1516 | continue; | ||
1517 | slot = node; | ||
1518 | break; | ||
1519 | } | ||
1520 | } while (i < RADIX_TREE_MAP_SIZE); | ||
1521 | |||
1522 | out: | ||
1523 | if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) | ||
1524 | info->stop = true; | ||
1525 | return index; | ||
1526 | } | ||
1527 | |||
1528 | /** | ||
1529 | * radix_tree_locate_item - search through radix tree for item | ||
1530 | * @root: radix tree root | ||
1531 | * @item: item to be found | ||
1532 | * | ||
1533 | * Returns index where item was found, or -1 if not found. | ||
1534 | * Caller must hold no lock (since this time-consuming function needs | ||
1535 | * to be preemptible), and must check afterwards if item is still there. | ||
1536 | */ | ||
1537 | unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) | ||
1538 | { | ||
1539 | struct radix_tree_node *node; | ||
1540 | unsigned long max_index; | ||
1541 | unsigned long cur_index = 0; | ||
1542 | struct locate_info info = { | ||
1543 | .found_index = -1, | ||
1544 | .stop = false, | ||
1545 | }; | ||
1546 | |||
1547 | do { | ||
1548 | rcu_read_lock(); | ||
1549 | node = rcu_dereference_raw(root->rnode); | ||
1550 | if (!radix_tree_is_internal_node(node)) { | ||
1551 | rcu_read_unlock(); | ||
1552 | if (node == item) | ||
1553 | info.found_index = 0; | ||
1554 | break; | ||
1555 | } | ||
1556 | |||
1557 | node = entry_to_node(node); | ||
1558 | |||
1559 | max_index = node_maxindex(node); | ||
1560 | if (cur_index > max_index) { | ||
1561 | rcu_read_unlock(); | ||
1562 | break; | ||
1563 | } | ||
1564 | |||
1565 | cur_index = __locate(node, item, cur_index, &info); | ||
1566 | rcu_read_unlock(); | ||
1567 | cond_resched(); | ||
1568 | } while (!info.stop && cur_index <= max_index); | ||
1569 | |||
1570 | return info.found_index; | ||
1571 | } | ||
1572 | #else | ||
1573 | unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) | ||
1574 | { | ||
1575 | return -1; | ||
1576 | } | ||
1577 | #endif /* CONFIG_SHMEM && CONFIG_SWAP */ | ||
1578 | |||
1579 | /** | 1824 | /** |
1580 | * __radix_tree_delete_node - try to free node after clearing a slot | 1825 | * __radix_tree_delete_node - try to free node after clearing a slot |
1581 | * @root: radix tree root | 1826 | * @root: radix tree root |
@@ -1591,20 +1836,6 @@ void __radix_tree_delete_node(struct radix_tree_root *root, | |||
1591 | delete_node(root, node, NULL, NULL); | 1836 | delete_node(root, node, NULL, NULL); |
1592 | } | 1837 | } |
1593 | 1838 | ||
1594 | static inline void delete_sibling_entries(struct radix_tree_node *node, | ||
1595 | void *ptr, unsigned offset) | ||
1596 | { | ||
1597 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
1598 | int i; | ||
1599 | for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { | ||
1600 | if (node->slots[offset + i] != ptr) | ||
1601 | break; | ||
1602 | node->slots[offset + i] = NULL; | ||
1603 | node->count--; | ||
1604 | } | ||
1605 | #endif | ||
1606 | } | ||
1607 | |||
1608 | /** | 1839 | /** |
1609 | * radix_tree_delete_item - delete an item from a radix tree | 1840 | * radix_tree_delete_item - delete an item from a radix tree |
1610 | * @root: radix tree root | 1841 | * @root: radix tree root |
@@ -1644,7 +1875,6 @@ void *radix_tree_delete_item(struct radix_tree_root *root, | |||
1644 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | 1875 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) |
1645 | node_tag_clear(root, node, tag, offset); | 1876 | node_tag_clear(root, node, tag, offset); |
1646 | 1877 | ||
1647 | delete_sibling_entries(node, node_to_entry(slot), offset); | ||
1648 | __radix_tree_replace(root, node, slot, NULL, NULL, NULL); | 1878 | __radix_tree_replace(root, node, slot, NULL, NULL, NULL); |
1649 | 1879 | ||
1650 | return entry; | 1880 | return entry; |
diff --git a/mm/compaction.c b/mm/compaction.c index 223464227299..949198d01260 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
818 | page_count(page) > page_mapcount(page)) | 818 | page_count(page) > page_mapcount(page)) |
819 | goto isolate_fail; | 819 | goto isolate_fail; |
820 | 820 | ||
821 | /* | ||
822 | * Only allow migration of anonymous pages in a GFP_NOFS context, | ||
823 | * because those do not depend on fs locks. | ||
824 | */ | ||
825 | if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) | ||
826 | goto isolate_fail; | ||
827 | |||
821 | /* If we already hold the lock, we can skip some rechecking */ | 828 | /* If we already hold the lock, we can skip some rechecking */ |
822 | if (!locked) { | 829 | if (!locked) { |
823 | locked = compact_trylock_irqsave(zone_lru_lock(zone), | 830 | locked = compact_trylock_irqsave(zone_lru_lock(zone), |
@@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, | |||
1677 | unsigned int alloc_flags, const struct alloc_context *ac, | 1684 | unsigned int alloc_flags, const struct alloc_context *ac, |
1678 | enum compact_priority prio) | 1685 | enum compact_priority prio) |
1679 | { | 1686 | { |
1680 | int may_enter_fs = gfp_mask & __GFP_FS; | ||
1681 | int may_perform_io = gfp_mask & __GFP_IO; | 1687 | int may_perform_io = gfp_mask & __GFP_IO; |
1682 | struct zoneref *z; | 1688 | struct zoneref *z; |
1683 | struct zone *zone; | 1689 | struct zone *zone; |
1684 | enum compact_result rc = COMPACT_SKIPPED; | 1690 | enum compact_result rc = COMPACT_SKIPPED; |
1685 | 1691 | ||
1686 | /* Check if the GFP flags allow compaction */ | 1692 | /* |
1687 | if (!may_enter_fs || !may_perform_io) | 1693 | * Check if the GFP flags allow compaction - GFP_NOIO is really |
1694 | * tricky context because the migration might require IO | ||
1695 | */ | ||
1696 | if (!may_perform_io) | ||
1688 | return COMPACT_SKIPPED; | 1697 | return COMPACT_SKIPPED; |
1689 | 1698 | ||
1690 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); | 1699 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); |
@@ -1751,6 +1760,7 @@ static void compact_node(int nid) | |||
1751 | .mode = MIGRATE_SYNC, | 1760 | .mode = MIGRATE_SYNC, |
1752 | .ignore_skip_hint = true, | 1761 | .ignore_skip_hint = true, |
1753 | .whole_zone = true, | 1762 | .whole_zone = true, |
1763 | .gfp_mask = GFP_KERNEL, | ||
1754 | }; | 1764 | }; |
1755 | 1765 | ||
1756 | 1766 | ||
@@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) | |||
1876 | .classzone_idx = pgdat->kcompactd_classzone_idx, | 1886 | .classzone_idx = pgdat->kcompactd_classzone_idx, |
1877 | .mode = MIGRATE_SYNC_LIGHT, | 1887 | .mode = MIGRATE_SYNC_LIGHT, |
1878 | .ignore_skip_hint = true, | 1888 | .ignore_skip_hint = true, |
1889 | .gfp_mask = GFP_KERNEL, | ||
1879 | 1890 | ||
1880 | }; | 1891 | }; |
1881 | trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, | 1892 | trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, |
diff --git a/mm/filemap.c b/mm/filemap.c index b06517b7f97f..32be3c8f3a11 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -2164,12 +2164,12 @@ page_not_uptodate: | |||
2164 | } | 2164 | } |
2165 | EXPORT_SYMBOL(filemap_fault); | 2165 | EXPORT_SYMBOL(filemap_fault); |
2166 | 2166 | ||
2167 | void filemap_map_pages(struct fault_env *fe, | 2167 | void filemap_map_pages(struct vm_fault *vmf, |
2168 | pgoff_t start_pgoff, pgoff_t end_pgoff) | 2168 | pgoff_t start_pgoff, pgoff_t end_pgoff) |
2169 | { | 2169 | { |
2170 | struct radix_tree_iter iter; | 2170 | struct radix_tree_iter iter; |
2171 | void **slot; | 2171 | void **slot; |
2172 | struct file *file = fe->vma->vm_file; | 2172 | struct file *file = vmf->vma->vm_file; |
2173 | struct address_space *mapping = file->f_mapping; | 2173 | struct address_space *mapping = file->f_mapping; |
2174 | pgoff_t last_pgoff = start_pgoff; | 2174 | pgoff_t last_pgoff = start_pgoff; |
2175 | loff_t size; | 2175 | loff_t size; |
@@ -2225,11 +2225,11 @@ repeat: | |||
2225 | if (file->f_ra.mmap_miss > 0) | 2225 | if (file->f_ra.mmap_miss > 0) |
2226 | file->f_ra.mmap_miss--; | 2226 | file->f_ra.mmap_miss--; |
2227 | 2227 | ||
2228 | fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; | 2228 | vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; |
2229 | if (fe->pte) | 2229 | if (vmf->pte) |
2230 | fe->pte += iter.index - last_pgoff; | 2230 | vmf->pte += iter.index - last_pgoff; |
2231 | last_pgoff = iter.index; | 2231 | last_pgoff = iter.index; |
2232 | if (alloc_set_pte(fe, NULL, page)) | 2232 | if (alloc_set_pte(vmf, NULL, page)) |
2233 | goto unlock; | 2233 | goto unlock; |
2234 | unlock_page(page); | 2234 | unlock_page(page); |
2235 | goto next; | 2235 | goto next; |
@@ -2239,7 +2239,7 @@ skip: | |||
2239 | put_page(page); | 2239 | put_page(page); |
2240 | next: | 2240 | next: |
2241 | /* Huge page is mapped? No need to proceed. */ | 2241 | /* Huge page is mapped? No need to proceed. */ |
2242 | if (pmd_trans_huge(*fe->pmd)) | 2242 | if (pmd_trans_huge(*vmf->pmd)) |
2243 | break; | 2243 | break; |
2244 | if (iter.index == end_pgoff) | 2244 | if (iter.index == end_pgoff) |
2245 | break; | 2245 | break; |
@@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); | |||
865 | * caller if required (just like with __get_user_pages). "FOLL_GET" | 865 | * caller if required (just like with __get_user_pages). "FOLL_GET" |
866 | * is set implicitly if "pages" is non-NULL. | 866 | * is set implicitly if "pages" is non-NULL. |
867 | */ | 867 | */ |
868 | __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | 868 | static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, |
869 | unsigned long start, unsigned long nr_pages, | 869 | struct mm_struct *mm, unsigned long start, |
870 | struct page **pages, unsigned int gup_flags) | 870 | unsigned long nr_pages, struct page **pages, |
871 | unsigned int gup_flags) | ||
871 | { | 872 | { |
872 | long ret; | 873 | long ret; |
873 | int locked = 1; | 874 | int locked = 1; |
@@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m | |||
879 | up_read(&mm->mmap_sem); | 880 | up_read(&mm->mmap_sem); |
880 | return ret; | 881 | return ret; |
881 | } | 882 | } |
882 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
883 | 883 | ||
884 | /* | 884 | /* |
885 | * get_user_pages_unlocked() is suitable to replace the form: | 885 | * get_user_pages_unlocked() is suitable to replace the form: |
@@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); | |||
917 | * only intends to ensure the pages are faulted in. | 917 | * only intends to ensure the pages are faulted in. |
918 | * @vmas: array of pointers to vmas corresponding to each page. | 918 | * @vmas: array of pointers to vmas corresponding to each page. |
919 | * Or NULL if the caller does not require them. | 919 | * Or NULL if the caller does not require them. |
920 | * @locked: pointer to lock flag indicating whether lock is held and | ||
921 | * subsequently whether VM_FAULT_RETRY functionality can be | ||
922 | * utilised. Lock must initially be held. | ||
920 | * | 923 | * |
921 | * Returns number of pages pinned. This may be fewer than the number | 924 | * Returns number of pages pinned. This may be fewer than the number |
922 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | 925 | * requested. If nr_pages is 0 or negative, returns 0. If no pages |
@@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); | |||
960 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, | 963 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, |
961 | unsigned long start, unsigned long nr_pages, | 964 | unsigned long start, unsigned long nr_pages, |
962 | unsigned int gup_flags, struct page **pages, | 965 | unsigned int gup_flags, struct page **pages, |
963 | struct vm_area_struct **vmas) | 966 | struct vm_area_struct **vmas, int *locked) |
964 | { | 967 | { |
965 | return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, | 968 | return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, |
966 | NULL, false, | 969 | locked, true, |
967 | gup_flags | FOLL_TOUCH | FOLL_REMOTE); | 970 | gup_flags | FOLL_TOUCH | FOLL_REMOTE); |
968 | } | 971 | } |
969 | EXPORT_SYMBOL(get_user_pages_remote); | 972 | EXPORT_SYMBOL(get_user_pages_remote); |
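(Editor's sketch, not part of the diff.) With the new @locked argument, get_user_pages_remote() may drop mmap_sem on VM_FAULT_RETRY; the caller passes a flag initialised to 1 and only unlocks if it is still set afterwards. A hedged sketch of that convention; pin_remote() and the FOLL_WRITE choice are illustrative.

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Hypothetical sketch: pin @nr_pages pages of another task's address
 * space, letting the fault path drop mmap_sem while it waits for I/O.
 */
static long pin_remote(struct task_struct *tsk, struct mm_struct *mm,
                       unsigned long start, unsigned long nr_pages,
                       struct page **pages)
{
        int locked = 1;
        long ret;

        down_read(&mm->mmap_sem);
        ret = get_user_pages_remote(tsk, mm, start, nr_pages, FOLL_WRITE,
                                    pages, NULL, &locked);
        if (locked)
                up_read(&mm->mmap_sem);
        return ret;
}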
@@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); | |||
971 | /* | 974 | /* |
972 | * This is the same as get_user_pages_remote(), just with a | 975 | * This is the same as get_user_pages_remote(), just with a |
973 | * less-flexible calling convention where we assume that the task | 976 | * less-flexible calling convention where we assume that the task |
974 | * and mm being operated on are the current task's. We also | 977 | * and mm being operated on are the current task's and don't allow |
975 | * obviously don't pass FOLL_REMOTE in here. | 978 | * passing of a locked parameter. We also obviously don't pass |
979 | * FOLL_REMOTE in here. | ||
976 | */ | 980 | */ |
977 | long get_user_pages(unsigned long start, unsigned long nr_pages, | 981 | long get_user_pages(unsigned long start, unsigned long nr_pages, |
978 | unsigned int gup_flags, struct page **pages, | 982 | unsigned int gup_flags, struct page **pages, |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cee42cf05477..10eedbf14421 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, | |||
542 | } | 542 | } |
543 | EXPORT_SYMBOL_GPL(thp_get_unmapped_area); | 543 | EXPORT_SYMBOL_GPL(thp_get_unmapped_area); |
544 | 544 | ||
545 | static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | 545 | static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, |
546 | gfp_t gfp) | 546 | gfp_t gfp) |
547 | { | 547 | { |
548 | struct vm_area_struct *vma = fe->vma; | 548 | struct vm_area_struct *vma = vmf->vma; |
549 | struct mem_cgroup *memcg; | 549 | struct mem_cgroup *memcg; |
550 | pgtable_t pgtable; | 550 | pgtable_t pgtable; |
551 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 551 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
552 | 552 | ||
553 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 553 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
554 | 554 | ||
@@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | |||
573 | */ | 573 | */ |
574 | __SetPageUptodate(page); | 574 | __SetPageUptodate(page); |
575 | 575 | ||
576 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 576 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
577 | if (unlikely(!pmd_none(*fe->pmd))) { | 577 | if (unlikely(!pmd_none(*vmf->pmd))) { |
578 | spin_unlock(fe->ptl); | 578 | spin_unlock(vmf->ptl); |
579 | mem_cgroup_cancel_charge(page, memcg, true); | 579 | mem_cgroup_cancel_charge(page, memcg, true); |
580 | put_page(page); | 580 | put_page(page); |
581 | pte_free(vma->vm_mm, pgtable); | 581 | pte_free(vma->vm_mm, pgtable); |
@@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | |||
586 | if (userfaultfd_missing(vma)) { | 586 | if (userfaultfd_missing(vma)) { |
587 | int ret; | 587 | int ret; |
588 | 588 | ||
589 | spin_unlock(fe->ptl); | 589 | spin_unlock(vmf->ptl); |
590 | mem_cgroup_cancel_charge(page, memcg, true); | 590 | mem_cgroup_cancel_charge(page, memcg, true); |
591 | put_page(page); | 591 | put_page(page); |
592 | pte_free(vma->vm_mm, pgtable); | 592 | pte_free(vma->vm_mm, pgtable); |
593 | ret = handle_userfault(fe, VM_UFFD_MISSING); | 593 | ret = handle_userfault(vmf, VM_UFFD_MISSING); |
594 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 594 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
595 | return ret; | 595 | return ret; |
596 | } | 596 | } |
@@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | |||
600 | page_add_new_anon_rmap(page, vma, haddr, true); | 600 | page_add_new_anon_rmap(page, vma, haddr, true); |
601 | mem_cgroup_commit_charge(page, memcg, false, true); | 601 | mem_cgroup_commit_charge(page, memcg, false, true); |
602 | lru_cache_add_active_or_unevictable(page, vma); | 602 | lru_cache_add_active_or_unevictable(page, vma); |
603 | pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); | 603 | pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); |
604 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | 604 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
605 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 605 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
606 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 606 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
607 | spin_unlock(fe->ptl); | 607 | spin_unlock(vmf->ptl); |
608 | count_vm_event(THP_FAULT_ALLOC); | 608 | count_vm_event(THP_FAULT_ALLOC); |
609 | } | 609 | } |
610 | 610 | ||
@@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
651 | return true; | 651 | return true; |
652 | } | 652 | } |
653 | 653 | ||
654 | int do_huge_pmd_anonymous_page(struct fault_env *fe) | 654 | int do_huge_pmd_anonymous_page(struct vm_fault *vmf) |
655 | { | 655 | { |
656 | struct vm_area_struct *vma = fe->vma; | 656 | struct vm_area_struct *vma = vmf->vma; |
657 | gfp_t gfp; | 657 | gfp_t gfp; |
658 | struct page *page; | 658 | struct page *page; |
659 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 659 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
660 | 660 | ||
661 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) | 661 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
662 | return VM_FAULT_FALLBACK; | 662 | return VM_FAULT_FALLBACK; |
@@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) | |||
664 | return VM_FAULT_OOM; | 664 | return VM_FAULT_OOM; |
665 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) | 665 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) |
666 | return VM_FAULT_OOM; | 666 | return VM_FAULT_OOM; |
667 | if (!(fe->flags & FAULT_FLAG_WRITE) && | 667 | if (!(vmf->flags & FAULT_FLAG_WRITE) && |
668 | !mm_forbids_zeropage(vma->vm_mm) && | 668 | !mm_forbids_zeropage(vma->vm_mm) && |
669 | transparent_hugepage_use_zero_page()) { | 669 | transparent_hugepage_use_zero_page()) { |
670 | pgtable_t pgtable; | 670 | pgtable_t pgtable; |
@@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) | |||
680 | count_vm_event(THP_FAULT_FALLBACK); | 680 | count_vm_event(THP_FAULT_FALLBACK); |
681 | return VM_FAULT_FALLBACK; | 681 | return VM_FAULT_FALLBACK; |
682 | } | 682 | } |
683 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 683 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
684 | ret = 0; | 684 | ret = 0; |
685 | set = false; | 685 | set = false; |
686 | if (pmd_none(*fe->pmd)) { | 686 | if (pmd_none(*vmf->pmd)) { |
687 | if (userfaultfd_missing(vma)) { | 687 | if (userfaultfd_missing(vma)) { |
688 | spin_unlock(fe->ptl); | 688 | spin_unlock(vmf->ptl); |
689 | ret = handle_userfault(fe, VM_UFFD_MISSING); | 689 | ret = handle_userfault(vmf, VM_UFFD_MISSING); |
690 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 690 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
691 | } else { | 691 | } else { |
692 | set_huge_zero_page(pgtable, vma->vm_mm, vma, | 692 | set_huge_zero_page(pgtable, vma->vm_mm, vma, |
693 | haddr, fe->pmd, zero_page); | 693 | haddr, vmf->pmd, zero_page); |
694 | spin_unlock(fe->ptl); | 694 | spin_unlock(vmf->ptl); |
695 | set = true; | 695 | set = true; |
696 | } | 696 | } |
697 | } else | 697 | } else |
698 | spin_unlock(fe->ptl); | 698 | spin_unlock(vmf->ptl); |
699 | if (!set) | 699 | if (!set) |
700 | pte_free(vma->vm_mm, pgtable); | 700 | pte_free(vma->vm_mm, pgtable); |
701 | return ret; | 701 | return ret; |
@@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) | |||
707 | return VM_FAULT_FALLBACK; | 707 | return VM_FAULT_FALLBACK; |
708 | } | 708 | } |
709 | prep_transhuge_page(page); | 709 | prep_transhuge_page(page); |
710 | return __do_huge_pmd_anonymous_page(fe, page, gfp); | 710 | return __do_huge_pmd_anonymous_page(vmf, page, gfp); |
711 | } | 711 | } |
712 | 712 | ||
713 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | 713 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
@@ -879,30 +879,30 @@ out: | |||
879 | return ret; | 879 | return ret; |
880 | } | 880 | } |
881 | 881 | ||
882 | void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) | 882 | void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) |
883 | { | 883 | { |
884 | pmd_t entry; | 884 | pmd_t entry; |
885 | unsigned long haddr; | 885 | unsigned long haddr; |
886 | 886 | ||
887 | fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); | 887 | vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); |
888 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) | 888 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) |
889 | goto unlock; | 889 | goto unlock; |
890 | 890 | ||
891 | entry = pmd_mkyoung(orig_pmd); | 891 | entry = pmd_mkyoung(orig_pmd); |
892 | haddr = fe->address & HPAGE_PMD_MASK; | 892 | haddr = vmf->address & HPAGE_PMD_MASK; |
893 | if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, | 893 | if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, |
894 | fe->flags & FAULT_FLAG_WRITE)) | 894 | vmf->flags & FAULT_FLAG_WRITE)) |
895 | update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); | 895 | update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); |
896 | 896 | ||
897 | unlock: | 897 | unlock: |
898 | spin_unlock(fe->ptl); | 898 | spin_unlock(vmf->ptl); |
899 | } | 899 | } |
900 | 900 | ||
901 | static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | 901 | static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, |
902 | struct page *page) | 902 | struct page *page) |
903 | { | 903 | { |
904 | struct vm_area_struct *vma = fe->vma; | 904 | struct vm_area_struct *vma = vmf->vma; |
905 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 905 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
906 | struct mem_cgroup *memcg; | 906 | struct mem_cgroup *memcg; |
907 | pgtable_t pgtable; | 907 | pgtable_t pgtable; |
908 | pmd_t _pmd; | 908 | pmd_t _pmd; |
@@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | |||
921 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 921 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
922 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | | 922 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | |
923 | __GFP_OTHER_NODE, vma, | 923 | __GFP_OTHER_NODE, vma, |
924 | fe->address, page_to_nid(page)); | 924 | vmf->address, page_to_nid(page)); |
925 | if (unlikely(!pages[i] || | 925 | if (unlikely(!pages[i] || |
926 | mem_cgroup_try_charge(pages[i], vma->vm_mm, | 926 | mem_cgroup_try_charge(pages[i], vma->vm_mm, |
927 | GFP_KERNEL, &memcg, false))) { | 927 | GFP_KERNEL, &memcg, false))) { |
@@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | |||
952 | mmun_end = haddr + HPAGE_PMD_SIZE; | 952 | mmun_end = haddr + HPAGE_PMD_SIZE; |
953 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | 953 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
954 | 954 | ||
955 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 955 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
956 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) | 956 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) |
957 | goto out_free_pages; | 957 | goto out_free_pages; |
958 | VM_BUG_ON_PAGE(!PageHead(page), page); | 958 | VM_BUG_ON_PAGE(!PageHead(page), page); |
959 | 959 | ||
960 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); | 960 | pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); |
961 | /* leave pmd empty until pte is filled */ | 961 | /* leave pmd empty until pte is filled */ |
962 | 962 | ||
963 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); | 963 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); |
964 | pmd_populate(vma->vm_mm, &_pmd, pgtable); | 964 | pmd_populate(vma->vm_mm, &_pmd, pgtable); |
965 | 965 | ||
966 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 966 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
@@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | |||
969 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 969 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
970 | memcg = (void *)page_private(pages[i]); | 970 | memcg = (void *)page_private(pages[i]); |
971 | set_page_private(pages[i], 0); | 971 | set_page_private(pages[i], 0); |
972 | page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); | 972 | page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); |
973 | mem_cgroup_commit_charge(pages[i], memcg, false, false); | 973 | mem_cgroup_commit_charge(pages[i], memcg, false, false); |
974 | lru_cache_add_active_or_unevictable(pages[i], vma); | 974 | lru_cache_add_active_or_unevictable(pages[i], vma); |
975 | fe->pte = pte_offset_map(&_pmd, haddr); | 975 | vmf->pte = pte_offset_map(&_pmd, haddr); |
976 | VM_BUG_ON(!pte_none(*fe->pte)); | 976 | VM_BUG_ON(!pte_none(*vmf->pte)); |
977 | set_pte_at(vma->vm_mm, haddr, fe->pte, entry); | 977 | set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); |
978 | pte_unmap(fe->pte); | 978 | pte_unmap(vmf->pte); |
979 | } | 979 | } |
980 | kfree(pages); | 980 | kfree(pages); |
981 | 981 | ||
982 | smp_wmb(); /* make pte visible before pmd */ | 982 | smp_wmb(); /* make pte visible before pmd */ |
983 | pmd_populate(vma->vm_mm, fe->pmd, pgtable); | 983 | pmd_populate(vma->vm_mm, vmf->pmd, pgtable); |
984 | page_remove_rmap(page, true); | 984 | page_remove_rmap(page, true); |
985 | spin_unlock(fe->ptl); | 985 | spin_unlock(vmf->ptl); |
986 | 986 | ||
987 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 987 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
988 | 988 | ||
@@ -993,7 +993,7 @@ out: | |||
993 | return ret; | 993 | return ret; |
994 | 994 | ||
995 | out_free_pages: | 995 | out_free_pages: |
996 | spin_unlock(fe->ptl); | 996 | spin_unlock(vmf->ptl); |
997 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 997 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
998 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 998 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
999 | memcg = (void *)page_private(pages[i]); | 999 | memcg = (void *)page_private(pages[i]); |
@@ -1005,23 +1005,23 @@ out_free_pages: | |||
1005 | goto out; | 1005 | goto out; |
1006 | } | 1006 | } |
1007 | 1007 | ||
1008 | int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) | 1008 | int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) |
1009 | { | 1009 | { |
1010 | struct vm_area_struct *vma = fe->vma; | 1010 | struct vm_area_struct *vma = vmf->vma; |
1011 | struct page *page = NULL, *new_page; | 1011 | struct page *page = NULL, *new_page; |
1012 | struct mem_cgroup *memcg; | 1012 | struct mem_cgroup *memcg; |
1013 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 1013 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
1014 | unsigned long mmun_start; /* For mmu_notifiers */ | 1014 | unsigned long mmun_start; /* For mmu_notifiers */ |
1015 | unsigned long mmun_end; /* For mmu_notifiers */ | 1015 | unsigned long mmun_end; /* For mmu_notifiers */ |
1016 | gfp_t huge_gfp; /* for allocation and charge */ | 1016 | gfp_t huge_gfp; /* for allocation and charge */ |
1017 | int ret = 0; | 1017 | int ret = 0; |
1018 | 1018 | ||
1019 | fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); | 1019 | vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); |
1020 | VM_BUG_ON_VMA(!vma->anon_vma, vma); | 1020 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
1021 | if (is_huge_zero_pmd(orig_pmd)) | 1021 | if (is_huge_zero_pmd(orig_pmd)) |
1022 | goto alloc; | 1022 | goto alloc; |
1023 | spin_lock(fe->ptl); | 1023 | spin_lock(vmf->ptl); |
1024 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) | 1024 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) |
1025 | goto out_unlock; | 1025 | goto out_unlock; |
1026 | 1026 | ||
1027 | page = pmd_page(orig_pmd); | 1027 | page = pmd_page(orig_pmd); |
@@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) | |||
1034 | pmd_t entry; | 1034 | pmd_t entry; |
1035 | entry = pmd_mkyoung(orig_pmd); | 1035 | entry = pmd_mkyoung(orig_pmd); |
1036 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1036 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1037 | if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) | 1037 | if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) |
1038 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); | 1038 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
1039 | ret |= VM_FAULT_WRITE; | 1039 | ret |= VM_FAULT_WRITE; |
1040 | goto out_unlock; | 1040 | goto out_unlock; |
1041 | } | 1041 | } |
1042 | get_page(page); | 1042 | get_page(page); |
1043 | spin_unlock(fe->ptl); | 1043 | spin_unlock(vmf->ptl); |
1044 | alloc: | 1044 | alloc: |
1045 | if (transparent_hugepage_enabled(vma) && | 1045 | if (transparent_hugepage_enabled(vma) && |
1046 | !transparent_hugepage_debug_cow()) { | 1046 | !transparent_hugepage_debug_cow()) { |
@@ -1053,12 +1053,12 @@ alloc: | |||
1053 | prep_transhuge_page(new_page); | 1053 | prep_transhuge_page(new_page); |
1054 | } else { | 1054 | } else { |
1055 | if (!page) { | 1055 | if (!page) { |
1056 | split_huge_pmd(vma, fe->pmd, fe->address); | 1056 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
1057 | ret |= VM_FAULT_FALLBACK; | 1057 | ret |= VM_FAULT_FALLBACK; |
1058 | } else { | 1058 | } else { |
1059 | ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); | 1059 | ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); |
1060 | if (ret & VM_FAULT_OOM) { | 1060 | if (ret & VM_FAULT_OOM) { |
1061 | split_huge_pmd(vma, fe->pmd, fe->address); | 1061 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
1062 | ret |= VM_FAULT_FALLBACK; | 1062 | ret |= VM_FAULT_FALLBACK; |
1063 | } | 1063 | } |
1064 | put_page(page); | 1064 | put_page(page); |
@@ -1070,7 +1070,7 @@ alloc: | |||
1070 | if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, | 1070 | if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, |
1071 | huge_gfp, &memcg, true))) { | 1071 | huge_gfp, &memcg, true))) { |
1072 | put_page(new_page); | 1072 | put_page(new_page); |
1073 | split_huge_pmd(vma, fe->pmd, fe->address); | 1073 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
1074 | if (page) | 1074 | if (page) |
1075 | put_page(page); | 1075 | put_page(page); |
1076 | ret |= VM_FAULT_FALLBACK; | 1076 | ret |= VM_FAULT_FALLBACK; |
@@ -1090,11 +1090,11 @@ alloc: | |||
1090 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1090 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1091 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | 1091 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
1092 | 1092 | ||
1093 | spin_lock(fe->ptl); | 1093 | spin_lock(vmf->ptl); |
1094 | if (page) | 1094 | if (page) |
1095 | put_page(page); | 1095 | put_page(page); |
1096 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { | 1096 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { |
1097 | spin_unlock(fe->ptl); | 1097 | spin_unlock(vmf->ptl); |
1098 | mem_cgroup_cancel_charge(new_page, memcg, true); | 1098 | mem_cgroup_cancel_charge(new_page, memcg, true); |
1099 | put_page(new_page); | 1099 | put_page(new_page); |
1100 | goto out_mn; | 1100 | goto out_mn; |
@@ -1102,12 +1102,12 @@ alloc: | |||
1102 | pmd_t entry; | 1102 | pmd_t entry; |
1103 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); | 1103 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1104 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1104 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1105 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); | 1105 | pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); |
1106 | page_add_new_anon_rmap(new_page, vma, haddr, true); | 1106 | page_add_new_anon_rmap(new_page, vma, haddr, true); |
1107 | mem_cgroup_commit_charge(new_page, memcg, false, true); | 1107 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
1108 | lru_cache_add_active_or_unevictable(new_page, vma); | 1108 | lru_cache_add_active_or_unevictable(new_page, vma); |
1109 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | 1109 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
1110 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); | 1110 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
1111 | if (!page) { | 1111 | if (!page) { |
1112 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 1112 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1113 | } else { | 1113 | } else { |
@@ -1117,13 +1117,13 @@ alloc: | |||
1117 | } | 1117 | } |
1118 | ret |= VM_FAULT_WRITE; | 1118 | ret |= VM_FAULT_WRITE; |
1119 | } | 1119 | } |
1120 | spin_unlock(fe->ptl); | 1120 | spin_unlock(vmf->ptl); |
1121 | out_mn: | 1121 | out_mn: |
1122 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 1122 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
1123 | out: | 1123 | out: |
1124 | return ret; | 1124 | return ret; |
1125 | out_unlock: | 1125 | out_unlock: |
1126 | spin_unlock(fe->ptl); | 1126 | spin_unlock(vmf->ptl); |
1127 | return ret; | 1127 | return ret; |
1128 | } | 1128 | } |
1129 | 1129 | ||
@@ -1196,12 +1196,12 @@ out: | |||
1196 | } | 1196 | } |
1197 | 1197 | ||
1198 | /* NUMA hinting page fault entry point for trans huge pmds */ | 1198 | /* NUMA hinting page fault entry point for trans huge pmds */ |
1199 | int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | 1199 | int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) |
1200 | { | 1200 | { |
1201 | struct vm_area_struct *vma = fe->vma; | 1201 | struct vm_area_struct *vma = vmf->vma; |
1202 | struct anon_vma *anon_vma = NULL; | 1202 | struct anon_vma *anon_vma = NULL; |
1203 | struct page *page; | 1203 | struct page *page; |
1204 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 1204 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
1205 | int page_nid = -1, this_nid = numa_node_id(); | 1205 | int page_nid = -1, this_nid = numa_node_id(); |
1206 | int target_nid, last_cpupid = -1; | 1206 | int target_nid, last_cpupid = -1; |
1207 | bool page_locked; | 1207 | bool page_locked; |
@@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
1209 | bool was_writable; | 1209 | bool was_writable; |
1210 | int flags = 0; | 1210 | int flags = 0; |
1211 | 1211 | ||
1212 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 1212 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
1213 | if (unlikely(!pmd_same(pmd, *fe->pmd))) | 1213 | if (unlikely(!pmd_same(pmd, *vmf->pmd))) |
1214 | goto out_unlock; | 1214 | goto out_unlock; |
1215 | 1215 | ||
1216 | /* | 1216 | /* |
@@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
1218 | * without disrupting NUMA hinting information. Do not relock and | 1218 | * without disrupting NUMA hinting information. Do not relock and |
1219 | * check_same as the page may no longer be mapped. | 1219 | * check_same as the page may no longer be mapped. |
1220 | */ | 1220 | */ |
1221 | if (unlikely(pmd_trans_migrating(*fe->pmd))) { | 1221 | if (unlikely(pmd_trans_migrating(*vmf->pmd))) { |
1222 | page = pmd_page(*fe->pmd); | 1222 | page = pmd_page(*vmf->pmd); |
1223 | spin_unlock(fe->ptl); | 1223 | spin_unlock(vmf->ptl); |
1224 | wait_on_page_locked(page); | 1224 | wait_on_page_locked(page); |
1225 | goto out; | 1225 | goto out; |
1226 | } | 1226 | } |
@@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
1253 | 1253 | ||
1254 | /* Migration could have started since the pmd_trans_migrating check */ | 1254 | /* Migration could have started since the pmd_trans_migrating check */ |
1255 | if (!page_locked) { | 1255 | if (!page_locked) { |
1256 | spin_unlock(fe->ptl); | 1256 | spin_unlock(vmf->ptl); |
1257 | wait_on_page_locked(page); | 1257 | wait_on_page_locked(page); |
1258 | page_nid = -1; | 1258 | page_nid = -1; |
1259 | goto out; | 1259 | goto out; |
@@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
1264 | * to serialise splits | 1264 | * to serialise splits |
1265 | */ | 1265 | */ |
1266 | get_page(page); | 1266 | get_page(page); |
1267 | spin_unlock(fe->ptl); | 1267 | spin_unlock(vmf->ptl); |
1268 | anon_vma = page_lock_anon_vma_read(page); | 1268 | anon_vma = page_lock_anon_vma_read(page); |
1269 | 1269 | ||
1270 | /* Confirm the PMD did not change while page_table_lock was released */ | 1270 | /* Confirm the PMD did not change while page_table_lock was released */ |
1271 | spin_lock(fe->ptl); | 1271 | spin_lock(vmf->ptl); |
1272 | if (unlikely(!pmd_same(pmd, *fe->pmd))) { | 1272 | if (unlikely(!pmd_same(pmd, *vmf->pmd))) { |
1273 | unlock_page(page); | 1273 | unlock_page(page); |
1274 | put_page(page); | 1274 | put_page(page); |
1275 | page_nid = -1; | 1275 | page_nid = -1; |
@@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
1287 | * Migrate the THP to the requested node, returns with page unlocked | 1287 | * Migrate the THP to the requested node, returns with page unlocked |
1288 | * and access rights restored. | 1288 | * and access rights restored. |
1289 | */ | 1289 | */ |
1290 | spin_unlock(fe->ptl); | 1290 | spin_unlock(vmf->ptl); |
1291 | migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, | 1291 | migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, |
1292 | fe->pmd, pmd, fe->address, page, target_nid); | 1292 | vmf->pmd, pmd, vmf->address, page, target_nid); |
1293 | if (migrated) { | 1293 | if (migrated) { |
1294 | flags |= TNF_MIGRATED; | 1294 | flags |= TNF_MIGRATED; |
1295 | page_nid = target_nid; | 1295 | page_nid = target_nid; |
@@ -1304,18 +1304,19 @@ clear_pmdnuma: | |||
1304 | pmd = pmd_mkyoung(pmd); | 1304 | pmd = pmd_mkyoung(pmd); |
1305 | if (was_writable) | 1305 | if (was_writable) |
1306 | pmd = pmd_mkwrite(pmd); | 1306 | pmd = pmd_mkwrite(pmd); |
1307 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); | 1307 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); |
1308 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); | 1308 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
1309 | unlock_page(page); | 1309 | unlock_page(page); |
1310 | out_unlock: | 1310 | out_unlock: |
1311 | spin_unlock(fe->ptl); | 1311 | spin_unlock(vmf->ptl); |
1312 | 1312 | ||
1313 | out: | 1313 | out: |
1314 | if (anon_vma) | 1314 | if (anon_vma) |
1315 | page_unlock_anon_vma_read(anon_vma); | 1315 | page_unlock_anon_vma_read(anon_vma); |
1316 | 1316 | ||
1317 | if (page_nid != -1) | 1317 | if (page_nid != -1) |
1318 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); | 1318 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, |
1319 | vmf->flags); | ||
1319 | 1320 | ||
1320 | return 0; | 1321 | return 0; |
1321 | } | 1322 | } |
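Taken together, the huge_memory.c hunks above (and the memory.c hunks below) apply one mechanical conversion: every per-fault value that used to be threaded through struct fault_env, plus the separately passed orig_pte, now lives in struct vm_fault. As a reading aid, here is a rough sketch of the struct as these hunks use it; the field order, and any member not exercised in this section, are assumptions rather than the authoritative definition in include/linux/mm.h:

	/* Sketch only: reconstructed from the accesses in this series. */
	struct vm_fault {
		struct vm_area_struct *vma;	/* target VMA */
		unsigned long address;		/* faulting virtual address */
		unsigned int flags;		/* FAULT_FLAG_xxx flags */
		pgoff_t pgoff;			/* logical page offset in the mapping */
		gfp_t gfp_mask;			/* gfp mask for allocations */
		pmd_t *pmd;			/* PMD the fault is handled under */
		pte_t *pte;			/* mapped-and-locked PTE, when taken */
		pte_t orig_pte;			/* PTE value observed at fault time */
		spinlock_t *ptl;		/* page table lock covering *pte */
		pgtable_t prealloc_pte;		/* pre-allocated PTE page, if any */
		struct page *page;		/* page the handler operates on */
		struct page *cow_page;		/* handler's COW target page, if any */
	};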
diff --git a/mm/internal.h b/mm/internal.h index 537ac9951f5f..44d68895a9b9 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -36,7 +36,7 @@ | |||
36 | /* Do not use these with a slab allocator */ | 36 | /* Do not use these with a slab allocator */ |
37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) | 37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) |
38 | 38 | ||
39 | int do_swap_page(struct fault_env *fe, pte_t orig_pte); | 39 | int do_swap_page(struct vm_fault *vmf); |
40 | 40 | ||
41 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 41 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
42 | unsigned long floor, unsigned long ceiling); | 42 | unsigned long floor, unsigned long ceiling); |
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 09460955e818..e32389a97030 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c | |||
@@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
875 | unsigned long address, pmd_t *pmd, | 875 | unsigned long address, pmd_t *pmd, |
876 | int referenced) | 876 | int referenced) |
877 | { | 877 | { |
878 | pte_t pteval; | ||
879 | int swapped_in = 0, ret = 0; | 878 | int swapped_in = 0, ret = 0; |
880 | struct fault_env fe = { | 879 | struct vm_fault vmf = { |
881 | .vma = vma, | 880 | .vma = vma, |
882 | .address = address, | 881 | .address = address, |
883 | .flags = FAULT_FLAG_ALLOW_RETRY, | 882 | .flags = FAULT_FLAG_ALLOW_RETRY, |
884 | .pmd = pmd, | 883 | .pmd = pmd, |
884 | .pgoff = linear_page_index(vma, address), | ||
885 | }; | 885 | }; |
886 | 886 | ||
887 | /* we only decide to swapin, if there is enough young ptes */ | 887 | /* we only decide to swapin, if there is enough young ptes */ |
@@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
889 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | 889 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); |
890 | return false; | 890 | return false; |
891 | } | 891 | } |
892 | fe.pte = pte_offset_map(pmd, address); | 892 | vmf.pte = pte_offset_map(pmd, address); |
893 | for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; | 893 | for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; |
894 | fe.pte++, fe.address += PAGE_SIZE) { | 894 | vmf.pte++, vmf.address += PAGE_SIZE) { |
895 | pteval = *fe.pte; | 895 | vmf.orig_pte = *vmf.pte; |
896 | if (!is_swap_pte(pteval)) | 896 | if (!is_swap_pte(vmf.orig_pte)) |
897 | continue; | 897 | continue; |
898 | swapped_in++; | 898 | swapped_in++; |
899 | ret = do_swap_page(&fe, pteval); | 899 | ret = do_swap_page(&vmf); |
900 | 900 | ||
901 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ | 901 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ |
902 | if (ret & VM_FAULT_RETRY) { | 902 | if (ret & VM_FAULT_RETRY) { |
903 | down_read(&mm->mmap_sem); | 903 | down_read(&mm->mmap_sem); |
904 | if (hugepage_vma_revalidate(mm, address, &fe.vma)) { | 904 | if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { |
905 | /* vma is no longer available, don't continue to swapin */ | 905 | /* vma is no longer available, don't continue to swapin */ |
906 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | 906 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); |
907 | return false; | 907 | return false; |
@@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
915 | return false; | 915 | return false; |
916 | } | 916 | } |
917 | /* pte is unmapped now, we need to map it */ | 917 | /* pte is unmapped now, we need to map it */ |
918 | fe.pte = pte_offset_map(pmd, fe.address); | 918 | vmf.pte = pte_offset_map(pmd, vmf.address); |
919 | } | 919 | } |
920 | fe.pte--; | 920 | vmf.pte--; |
921 | pte_unmap(fe.pte); | 921 | pte_unmap(vmf.pte); |
922 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); | 922 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); |
923 | return true; | 923 | return true; |
924 | } | 924 | } |
@@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm, | |||
1446 | radix_tree_replace_slot(&mapping->page_tree, slot, | 1446 | radix_tree_replace_slot(&mapping->page_tree, slot, |
1447 | new_page + (index % HPAGE_PMD_NR)); | 1447 | new_page + (index % HPAGE_PMD_NR)); |
1448 | 1448 | ||
1449 | slot = radix_tree_iter_next(&iter); | 1449 | slot = radix_tree_iter_resume(slot, &iter); |
1450 | index++; | 1450 | index++; |
1451 | continue; | 1451 | continue; |
1452 | out_lru: | 1452 | out_lru: |
@@ -1546,7 +1546,6 @@ tree_unlocked: | |||
1546 | /* Put holes back where they were */ | 1546 | /* Put holes back where they were */ |
1547 | radix_tree_delete(&mapping->page_tree, | 1547 | radix_tree_delete(&mapping->page_tree, |
1548 | iter.index); | 1548 | iter.index); |
1549 | slot = radix_tree_iter_next(&iter); | ||
1550 | continue; | 1549 | continue; |
1551 | } | 1550 | } |
1552 | 1551 | ||
@@ -1557,11 +1556,11 @@ tree_unlocked: | |||
1557 | page_ref_unfreeze(page, 2); | 1556 | page_ref_unfreeze(page, 2); |
1558 | radix_tree_replace_slot(&mapping->page_tree, | 1557 | radix_tree_replace_slot(&mapping->page_tree, |
1559 | slot, page); | 1558 | slot, page); |
1559 | slot = radix_tree_iter_resume(slot, &iter); | ||
1560 | spin_unlock_irq(&mapping->tree_lock); | 1560 | spin_unlock_irq(&mapping->tree_lock); |
1561 | putback_lru_page(page); | 1561 | putback_lru_page(page); |
1562 | unlock_page(page); | 1562 | unlock_page(page); |
1563 | spin_lock_irq(&mapping->tree_lock); | 1563 | spin_lock_irq(&mapping->tree_lock); |
1564 | slot = radix_tree_iter_next(&iter); | ||
1565 | } | 1564 | } |
1566 | VM_BUG_ON(nr_none); | 1565 | VM_BUG_ON(nr_none); |
1567 | spin_unlock_irq(&mapping->tree_lock); | 1566 | spin_unlock_irq(&mapping->tree_lock); |
@@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, | |||
1641 | present++; | 1640 | present++; |
1642 | 1641 | ||
1643 | if (need_resched()) { | 1642 | if (need_resched()) { |
1643 | slot = radix_tree_iter_resume(slot, &iter); | ||
1644 | cond_resched_rcu(); | 1644 | cond_resched_rcu(); |
1645 | slot = radix_tree_iter_next(&iter); | ||
1646 | } | 1645 | } |
1647 | } | 1646 | } |
1648 | rcu_read_unlock(); | 1647 | rcu_read_unlock(); |
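The khugepaged.c hunks above also switch the shmem-collapse iteration from radix_tree_iter_next(), called after the lock or RCU read section was re-entered, to radix_tree_iter_resume(), called on the still-valid slot before it is dropped. A minimal sketch of the resulting idiom, with the tree root and the per-slot work as illustrative placeholders:

	/* Placeholder names: "root" and "process_entry" are illustrative only. */
	void **slot;
	struct radix_tree_iter iter;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &root, &iter, 0) {
		process_entry(radix_tree_deref_slot(slot));

		if (need_resched()) {
			/* Fix up the iterator while "slot" is still valid... */
			slot = radix_tree_iter_resume(slot, &iter);
			/* ...so the RCU read section may be left to reschedule. */
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();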
diff --git a/mm/memory.c b/mm/memory.c index 08d8da39de28..455c3e628d52 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) | |||
2034 | * | 2034 | * |
2035 | * We do this without the lock held, so that it can sleep if it needs to. | 2035 | * We do this without the lock held, so that it can sleep if it needs to. |
2036 | */ | 2036 | */ |
2037 | static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | 2037 | static int do_page_mkwrite(struct vm_fault *vmf) |
2038 | unsigned long address) | ||
2039 | { | 2038 | { |
2040 | struct vm_fault vmf; | ||
2041 | int ret; | 2039 | int ret; |
2040 | struct page *page = vmf->page; | ||
2041 | unsigned int old_flags = vmf->flags; | ||
2042 | 2042 | ||
2043 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2043 | vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; |
2044 | vmf.pgoff = page->index; | ||
2045 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; | ||
2046 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
2047 | vmf.page = page; | ||
2048 | vmf.cow_page = NULL; | ||
2049 | 2044 | ||
2050 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); | 2045 | ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); |
2046 | /* Restore original flags so that caller is not surprised */ | ||
2047 | vmf->flags = old_flags; | ||
2051 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2048 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
2052 | return ret; | 2049 | return ret; |
2053 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { | 2050 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { |
@@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
2063 | } | 2060 | } |
2064 | 2061 | ||
2065 | /* | 2062 | /* |
2063 | * Handle dirtying of a page in shared file mapping on a write fault. | ||
2064 | * | ||
2065 | * The function expects the page to be locked and unlocks it. | ||
2066 | */ | ||
2067 | static void fault_dirty_shared_page(struct vm_area_struct *vma, | ||
2068 | struct page *page) | ||
2069 | { | ||
2070 | struct address_space *mapping; | ||
2071 | bool dirtied; | ||
2072 | bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; | ||
2073 | |||
2074 | dirtied = set_page_dirty(page); | ||
2075 | VM_BUG_ON_PAGE(PageAnon(page), page); | ||
2076 | /* | ||
2077 | * Take a local copy of the address_space - page.mapping may be zeroed | ||
2078 | * by truncate after unlock_page(). The address_space itself remains | ||
2079 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s | ||
2080 | * release semantics to prevent the compiler from undoing this copying. | ||
2081 | */ | ||
2082 | mapping = page_rmapping(page); | ||
2083 | unlock_page(page); | ||
2084 | |||
2085 | if ((dirtied || page_mkwrite) && mapping) { | ||
2086 | /* | ||
2087 | * Some device drivers do not set page.mapping | ||
2088 | * but still dirty their pages | ||
2089 | */ | ||
2090 | balance_dirty_pages_ratelimited(mapping); | ||
2091 | } | ||
2092 | |||
2093 | if (!page_mkwrite) | ||
2094 | file_update_time(vma->vm_file); | ||
2095 | } | ||
2096 | |||
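fault_dirty_shared_page() above consolidates the dirtying and writeback-throttling logic that later hunks strip out of wp_page_reuse(). A condensed sketch of the calling convention it expects, modelled on the wp_page_shared() conversion further down (the surrounding handler is hypothetical):

	/* Hypothetical caller, condensed from the wp_page_shared() hunk below. */
	static int example_shared_write_finish(struct vm_fault *vmf)
	{
		struct page *page = vmf->page;	/* caller already holds a reference */

		/* PTE update via wp_page_reuse() elided; see the hunks below. */
		lock_page(page);			 /* helper expects the page locked */
		fault_dirty_shared_page(vmf->vma, page); /* dirties, throttles, unlocks  */
		put_page(page);				 /* drop our reference */
		return VM_FAULT_WRITE;
	}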
2097 | /* | ||
2066 | * Handle write page faults for pages that can be reused in the current vma | 2098 | * Handle write page faults for pages that can be reused in the current vma |
2067 | * | 2099 | * |
2068 | * This can happen either due to the mapping being with the VM_SHARED flag, | 2100 | * This can happen either due to the mapping being with the VM_SHARED flag, |
@@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
2070 | * case, all we need to do here is to mark the page as writable and update | 2102 | * case, all we need to do here is to mark the page as writable and update |
2071 | * any related book-keeping. | 2103 | * any related book-keeping. |
2072 | */ | 2104 | */ |
2073 | static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | 2105 | static inline void wp_page_reuse(struct vm_fault *vmf) |
2074 | struct page *page, int page_mkwrite, int dirty_shared) | 2106 | __releases(vmf->ptl) |
2075 | __releases(fe->ptl) | ||
2076 | { | 2107 | { |
2077 | struct vm_area_struct *vma = fe->vma; | 2108 | struct vm_area_struct *vma = vmf->vma; |
2109 | struct page *page = vmf->page; | ||
2078 | pte_t entry; | 2110 | pte_t entry; |
2079 | /* | 2111 | /* |
2080 | * Clear the pages cpupid information as the existing | 2112 | * Clear the pages cpupid information as the existing |
@@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | |||
2084 | if (page) | 2116 | if (page) |
2085 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); | 2117 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
2086 | 2118 | ||
2087 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); | 2119 | flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); |
2088 | entry = pte_mkyoung(orig_pte); | 2120 | entry = pte_mkyoung(vmf->orig_pte); |
2089 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2121 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2090 | if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) | 2122 | if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) |
2091 | update_mmu_cache(vma, fe->address, fe->pte); | 2123 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2092 | pte_unmap_unlock(fe->pte, fe->ptl); | 2124 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2093 | |||
2094 | if (dirty_shared) { | ||
2095 | struct address_space *mapping; | ||
2096 | int dirtied; | ||
2097 | |||
2098 | if (!page_mkwrite) | ||
2099 | lock_page(page); | ||
2100 | |||
2101 | dirtied = set_page_dirty(page); | ||
2102 | VM_BUG_ON_PAGE(PageAnon(page), page); | ||
2103 | mapping = page->mapping; | ||
2104 | unlock_page(page); | ||
2105 | put_page(page); | ||
2106 | |||
2107 | if ((dirtied || page_mkwrite) && mapping) { | ||
2108 | /* | ||
2109 | * Some device drivers do not set page.mapping | ||
2110 | * but still dirty their pages | ||
2111 | */ | ||
2112 | balance_dirty_pages_ratelimited(mapping); | ||
2113 | } | ||
2114 | |||
2115 | if (!page_mkwrite) | ||
2116 | file_update_time(vma->vm_file); | ||
2117 | } | ||
2118 | |||
2119 | return VM_FAULT_WRITE; | ||
2120 | } | 2125 | } |
2121 | 2126 | ||
2122 | /* | 2127 | /* |
@@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | |||
2135 | * held to the old page, as well as updating the rmap. | 2140 | * held to the old page, as well as updating the rmap. |
2136 | * - In any case, unlock the PTL and drop the reference we took to the old page. | 2141 | * - In any case, unlock the PTL and drop the reference we took to the old page. |
2137 | */ | 2142 | */ |
2138 | static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | 2143 | static int wp_page_copy(struct vm_fault *vmf) |
2139 | struct page *old_page) | ||
2140 | { | 2144 | { |
2141 | struct vm_area_struct *vma = fe->vma; | 2145 | struct vm_area_struct *vma = vmf->vma; |
2142 | struct mm_struct *mm = vma->vm_mm; | 2146 | struct mm_struct *mm = vma->vm_mm; |
2147 | struct page *old_page = vmf->page; | ||
2143 | struct page *new_page = NULL; | 2148 | struct page *new_page = NULL; |
2144 | pte_t entry; | 2149 | pte_t entry; |
2145 | int page_copied = 0; | 2150 | int page_copied = 0; |
2146 | const unsigned long mmun_start = fe->address & PAGE_MASK; | 2151 | const unsigned long mmun_start = vmf->address & PAGE_MASK; |
2147 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; | 2152 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; |
2148 | struct mem_cgroup *memcg; | 2153 | struct mem_cgroup *memcg; |
2149 | 2154 | ||
2150 | if (unlikely(anon_vma_prepare(vma))) | 2155 | if (unlikely(anon_vma_prepare(vma))) |
2151 | goto oom; | 2156 | goto oom; |
2152 | 2157 | ||
2153 | if (is_zero_pfn(pte_pfn(orig_pte))) { | 2158 | if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { |
2154 | new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); | 2159 | new_page = alloc_zeroed_user_highpage_movable(vma, |
2160 | vmf->address); | ||
2155 | if (!new_page) | 2161 | if (!new_page) |
2156 | goto oom; | 2162 | goto oom; |
2157 | } else { | 2163 | } else { |
2158 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, | 2164 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, |
2159 | fe->address); | 2165 | vmf->address); |
2160 | if (!new_page) | 2166 | if (!new_page) |
2161 | goto oom; | 2167 | goto oom; |
2162 | cow_user_page(new_page, old_page, fe->address, vma); | 2168 | cow_user_page(new_page, old_page, vmf->address, vma); |
2163 | } | 2169 | } |
2164 | 2170 | ||
2165 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) | 2171 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) |
@@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2172 | /* | 2178 | /* |
2173 | * Re-check the pte - we dropped the lock | 2179 | * Re-check the pte - we dropped the lock |
2174 | */ | 2180 | */ |
2175 | fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); | 2181 | vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); |
2176 | if (likely(pte_same(*fe->pte, orig_pte))) { | 2182 | if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { |
2177 | if (old_page) { | 2183 | if (old_page) { |
2178 | if (!PageAnon(old_page)) { | 2184 | if (!PageAnon(old_page)) { |
2179 | dec_mm_counter_fast(mm, | 2185 | dec_mm_counter_fast(mm, |
@@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2183 | } else { | 2189 | } else { |
2184 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2190 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2185 | } | 2191 | } |
2186 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); | 2192 | flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); |
2187 | entry = mk_pte(new_page, vma->vm_page_prot); | 2193 | entry = mk_pte(new_page, vma->vm_page_prot); |
2188 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2194 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2189 | /* | 2195 | /* |
@@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2192 | * seen in the presence of one thread doing SMC and another | 2198 | * seen in the presence of one thread doing SMC and another |
2193 | * thread doing COW. | 2199 | * thread doing COW. |
2194 | */ | 2200 | */ |
2195 | ptep_clear_flush_notify(vma, fe->address, fe->pte); | 2201 | ptep_clear_flush_notify(vma, vmf->address, vmf->pte); |
2196 | page_add_new_anon_rmap(new_page, vma, fe->address, false); | 2202 | page_add_new_anon_rmap(new_page, vma, vmf->address, false); |
2197 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 2203 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
2198 | lru_cache_add_active_or_unevictable(new_page, vma); | 2204 | lru_cache_add_active_or_unevictable(new_page, vma); |
2199 | /* | 2205 | /* |
@@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2201 | * mmu page tables (such as kvm shadow page tables), we want the | 2207 | * mmu page tables (such as kvm shadow page tables), we want the |
2202 | * new page to be mapped directly into the secondary page table. | 2208 | * new page to be mapped directly into the secondary page table. |
2203 | */ | 2209 | */ |
2204 | set_pte_at_notify(mm, fe->address, fe->pte, entry); | 2210 | set_pte_at_notify(mm, vmf->address, vmf->pte, entry); |
2205 | update_mmu_cache(vma, fe->address, fe->pte); | 2211 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2206 | if (old_page) { | 2212 | if (old_page) { |
2207 | /* | 2213 | /* |
2208 | * Only after switching the pte to the new page may | 2214 | * Only after switching the pte to the new page may |
@@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
2239 | if (new_page) | 2245 | if (new_page) |
2240 | put_page(new_page); | 2246 | put_page(new_page); |
2241 | 2247 | ||
2242 | pte_unmap_unlock(fe->pte, fe->ptl); | 2248 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2243 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2249 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2244 | if (old_page) { | 2250 | if (old_page) { |
2245 | /* | 2251 | /* |
@@ -2263,79 +2269,91 @@ oom: | |||
2263 | return VM_FAULT_OOM; | 2269 | return VM_FAULT_OOM; |
2264 | } | 2270 | } |
2265 | 2271 | ||
2272 | /** | ||
2273 | * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE | ||
2274 | * writeable once the page is prepared | ||
2275 | * | ||
2276 | * @vmf: structure describing the fault | ||
2277 | * | ||
2278 | * This function handles all that is needed to finish a write page fault in a | ||
2279 | * shared mapping due to PTE being read-only once the mapped page is prepared. | ||
2280 | * It handles locking of PTE and modifying it. The function returns 0 on | ||
2281 | * success, VM_FAULT_NOPAGE when the PTE changed before we acquired the PTE | ||
2282 | * lock. | ||
2283 | * | ||
2284 | * The function expects the page to be locked or other protection against | ||
2285 | * concurrent faults / writeback (such as DAX radix tree locks). | ||
2286 | */ | ||
2287 | int finish_mkwrite_fault(struct vm_fault *vmf) | ||
2288 | { | ||
2289 | WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); | ||
2290 | vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, | ||
2291 | &vmf->ptl); | ||
2292 | /* | ||
2293 | * We might have raced with another page fault while we released the | ||
2294 | * pte_offset_map_lock. | ||
2295 | */ | ||
2296 | if (!pte_same(*vmf->pte, vmf->orig_pte)) { | ||
2297 | pte_unmap_unlock(vmf->pte, vmf->ptl); | ||
2298 | return VM_FAULT_NOPAGE; | ||
2299 | } | ||
2300 | wp_page_reuse(vmf); | ||
2301 | return 0; | ||
2302 | } | ||
2303 | |||
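The kernel-doc above states the contract; as a usage reference, here is a condensed sketch of the ->pfn_mkwrite path that the wp_pfn_shared() hunk just below converts to this helper (error handling and locking annotations trimmed; not a drop-in handler):

	/* Condensed from the wp_pfn_shared() conversion below. */
	static int example_pfn_mkwrite_path(struct vm_fault *vmf)
	{
		int ret;

		pte_unmap_unlock(vmf->pte, vmf->ptl);	/* drop the PTL before calling out */
		vmf->flags |= FAULT_FLAG_MKWRITE;
		ret = vmf->vma->vm_ops->pfn_mkwrite(vmf->vma, vmf);
		if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
			return ret;
		/* Re-takes the PTL, revalidates orig_pte, and makes the PTE writable. */
		return finish_mkwrite_fault(vmf);
	}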
2266 | /* | 2304 | /* |
2267 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | 2305 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED |
2268 | * mapping | 2306 | * mapping |
2269 | */ | 2307 | */ |
2270 | static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) | 2308 | static int wp_pfn_shared(struct vm_fault *vmf) |
2271 | { | 2309 | { |
2272 | struct vm_area_struct *vma = fe->vma; | 2310 | struct vm_area_struct *vma = vmf->vma; |
2273 | 2311 | ||
2274 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | 2312 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { |
2275 | struct vm_fault vmf = { | ||
2276 | .page = NULL, | ||
2277 | .pgoff = linear_page_index(vma, fe->address), | ||
2278 | .virtual_address = | ||
2279 | (void __user *)(fe->address & PAGE_MASK), | ||
2280 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | ||
2281 | }; | ||
2282 | int ret; | 2313 | int ret; |
2283 | 2314 | ||
2284 | pte_unmap_unlock(fe->pte, fe->ptl); | 2315 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2285 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | 2316 | vmf->flags |= FAULT_FLAG_MKWRITE; |
2286 | if (ret & VM_FAULT_ERROR) | 2317 | ret = vma->vm_ops->pfn_mkwrite(vma, vmf); |
2318 | if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) | ||
2287 | return ret; | 2319 | return ret; |
2288 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2320 | return finish_mkwrite_fault(vmf); |
2289 | &fe->ptl); | ||
2290 | /* | ||
2291 | * We might have raced with another page fault while we | ||
2292 | * released the pte_offset_map_lock. | ||
2293 | */ | ||
2294 | if (!pte_same(*fe->pte, orig_pte)) { | ||
2295 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
2296 | return 0; | ||
2297 | } | ||
2298 | } | 2321 | } |
2299 | return wp_page_reuse(fe, orig_pte, NULL, 0, 0); | 2322 | wp_page_reuse(vmf); |
2323 | return VM_FAULT_WRITE; | ||
2300 | } | 2324 | } |
2301 | 2325 | ||
2302 | static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, | 2326 | static int wp_page_shared(struct vm_fault *vmf) |
2303 | struct page *old_page) | 2327 | __releases(vmf->ptl) |
2304 | __releases(fe->ptl) | ||
2305 | { | 2328 | { |
2306 | struct vm_area_struct *vma = fe->vma; | 2329 | struct vm_area_struct *vma = vmf->vma; |
2307 | int page_mkwrite = 0; | ||
2308 | 2330 | ||
2309 | get_page(old_page); | 2331 | get_page(vmf->page); |
2310 | 2332 | ||
2311 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2333 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2312 | int tmp; | 2334 | int tmp; |
2313 | 2335 | ||
2314 | pte_unmap_unlock(fe->pte, fe->ptl); | 2336 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2315 | tmp = do_page_mkwrite(vma, old_page, fe->address); | 2337 | tmp = do_page_mkwrite(vmf); |
2316 | if (unlikely(!tmp || (tmp & | 2338 | if (unlikely(!tmp || (tmp & |
2317 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2339 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
2318 | put_page(old_page); | 2340 | put_page(vmf->page); |
2319 | return tmp; | 2341 | return tmp; |
2320 | } | 2342 | } |
2321 | /* | 2343 | tmp = finish_mkwrite_fault(vmf); |
2322 | * Since we dropped the lock we need to revalidate | 2344 | if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { |
2323 | * the PTE as someone else may have changed it. If | 2345 | unlock_page(vmf->page); |
2324 | * they did, we just return, as we can count on the | 2346 | put_page(vmf->page); |
2325 | * MMU to tell us if they didn't also make it writable. | 2347 | return tmp; |
2326 | */ | ||
2327 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | ||
2328 | &fe->ptl); | ||
2329 | if (!pte_same(*fe->pte, orig_pte)) { | ||
2330 | unlock_page(old_page); | ||
2331 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
2332 | put_page(old_page); | ||
2333 | return 0; | ||
2334 | } | 2348 | } |
2335 | page_mkwrite = 1; | 2349 | } else { |
2350 | wp_page_reuse(vmf); | ||
2351 | lock_page(vmf->page); | ||
2336 | } | 2352 | } |
2353 | fault_dirty_shared_page(vma, vmf->page); | ||
2354 | put_page(vmf->page); | ||
2337 | 2355 | ||
2338 | return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); | 2356 | return VM_FAULT_WRITE; |
2339 | } | 2357 | } |
2340 | 2358 | ||
2341 | /* | 2359 | /* |
@@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, | |||
2356 | * but allow concurrent faults), with pte both mapped and locked. | 2374 | * but allow concurrent faults), with pte both mapped and locked. |
2357 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2375 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2358 | */ | 2376 | */ |
2359 | static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | 2377 | static int do_wp_page(struct vm_fault *vmf) |
2360 | __releases(fe->ptl) | 2378 | __releases(vmf->ptl) |
2361 | { | 2379 | { |
2362 | struct vm_area_struct *vma = fe->vma; | 2380 | struct vm_area_struct *vma = vmf->vma; |
2363 | struct page *old_page; | ||
2364 | 2381 | ||
2365 | old_page = vm_normal_page(vma, fe->address, orig_pte); | 2382 | vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); |
2366 | if (!old_page) { | 2383 | if (!vmf->page) { |
2367 | /* | 2384 | /* |
2368 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | 2385 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
2369 | * VM_PFNMAP VMA. | 2386 | * VM_PFNMAP VMA. |
@@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | |||
2373 | */ | 2390 | */ |
2374 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2391 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2375 | (VM_WRITE|VM_SHARED)) | 2392 | (VM_WRITE|VM_SHARED)) |
2376 | return wp_pfn_shared(fe, orig_pte); | 2393 | return wp_pfn_shared(vmf); |
2377 | 2394 | ||
2378 | pte_unmap_unlock(fe->pte, fe->ptl); | 2395 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2379 | return wp_page_copy(fe, orig_pte, old_page); | 2396 | return wp_page_copy(vmf); |
2380 | } | 2397 | } |
2381 | 2398 | ||
2382 | /* | 2399 | /* |
2383 | * Take out anonymous pages first, anonymous shared vmas are | 2400 | * Take out anonymous pages first, anonymous shared vmas are |
2384 | * not dirty accountable. | 2401 | * not dirty accountable. |
2385 | */ | 2402 | */ |
2386 | if (PageAnon(old_page) && !PageKsm(old_page)) { | 2403 | if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { |
2387 | int total_mapcount; | 2404 | int total_mapcount; |
2388 | if (!trylock_page(old_page)) { | 2405 | if (!trylock_page(vmf->page)) { |
2389 | get_page(old_page); | 2406 | get_page(vmf->page); |
2390 | pte_unmap_unlock(fe->pte, fe->ptl); | 2407 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2391 | lock_page(old_page); | 2408 | lock_page(vmf->page); |
2392 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, | 2409 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
2393 | fe->address, &fe->ptl); | 2410 | vmf->address, &vmf->ptl); |
2394 | if (!pte_same(*fe->pte, orig_pte)) { | 2411 | if (!pte_same(*vmf->pte, vmf->orig_pte)) { |
2395 | unlock_page(old_page); | 2412 | unlock_page(vmf->page); |
2396 | pte_unmap_unlock(fe->pte, fe->ptl); | 2413 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2397 | put_page(old_page); | 2414 | put_page(vmf->page); |
2398 | return 0; | 2415 | return 0; |
2399 | } | 2416 | } |
2400 | put_page(old_page); | 2417 | put_page(vmf->page); |
2401 | } | 2418 | } |
2402 | if (reuse_swap_page(old_page, &total_mapcount)) { | 2419 | if (reuse_swap_page(vmf->page, &total_mapcount)) { |
2403 | if (total_mapcount == 1) { | 2420 | if (total_mapcount == 1) { |
2404 | /* | 2421 | /* |
2405 | * The page is all ours. Move it to | 2422 | * The page is all ours. Move it to |
@@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | |||
2408 | * Protected against the rmap code by | 2425 | * Protected against the rmap code by |
2409 | * the page lock. | 2426 | * the page lock. |
2410 | */ | 2427 | */ |
2411 | page_move_anon_rmap(old_page, vma); | 2428 | page_move_anon_rmap(vmf->page, vma); |
2412 | } | 2429 | } |
2413 | unlock_page(old_page); | 2430 | unlock_page(vmf->page); |
2414 | return wp_page_reuse(fe, orig_pte, old_page, 0, 0); | 2431 | wp_page_reuse(vmf); |
2432 | return VM_FAULT_WRITE; | ||
2415 | } | 2433 | } |
2416 | unlock_page(old_page); | 2434 | unlock_page(vmf->page); |
2417 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2435 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2418 | (VM_WRITE|VM_SHARED))) { | 2436 | (VM_WRITE|VM_SHARED))) { |
2419 | return wp_page_shared(fe, orig_pte, old_page); | 2437 | return wp_page_shared(vmf); |
2420 | } | 2438 | } |
2421 | 2439 | ||
2422 | /* | 2440 | /* |
2423 | * Ok, we need to copy. Oh, well.. | 2441 | * Ok, we need to copy. Oh, well.. |
2424 | */ | 2442 | */ |
2425 | get_page(old_page); | 2443 | get_page(vmf->page); |
2426 | 2444 | ||
2427 | pte_unmap_unlock(fe->pte, fe->ptl); | 2445 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2428 | return wp_page_copy(fe, orig_pte, old_page); | 2446 | return wp_page_copy(vmf); |
2429 | } | 2447 | } |
2430 | 2448 | ||
2431 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2449 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
@@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2513 | * We return with the mmap_sem locked or unlocked in the same cases | 2531 | * We return with the mmap_sem locked or unlocked in the same cases |
2514 | * as does filemap_fault(). | 2532 | * as does filemap_fault(). |
2515 | */ | 2533 | */ |
2516 | int do_swap_page(struct fault_env *fe, pte_t orig_pte) | 2534 | int do_swap_page(struct vm_fault *vmf) |
2517 | { | 2535 | { |
2518 | struct vm_area_struct *vma = fe->vma; | 2536 | struct vm_area_struct *vma = vmf->vma; |
2519 | struct page *page, *swapcache; | 2537 | struct page *page, *swapcache; |
2520 | struct mem_cgroup *memcg; | 2538 | struct mem_cgroup *memcg; |
2521 | swp_entry_t entry; | 2539 | swp_entry_t entry; |
@@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2524 | int exclusive = 0; | 2542 | int exclusive = 0; |
2525 | int ret = 0; | 2543 | int ret = 0; |
2526 | 2544 | ||
2527 | if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) | 2545 | if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) |
2528 | goto out; | 2546 | goto out; |
2529 | 2547 | ||
2530 | entry = pte_to_swp_entry(orig_pte); | 2548 | entry = pte_to_swp_entry(vmf->orig_pte); |
2531 | if (unlikely(non_swap_entry(entry))) { | 2549 | if (unlikely(non_swap_entry(entry))) { |
2532 | if (is_migration_entry(entry)) { | 2550 | if (is_migration_entry(entry)) { |
2533 | migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); | 2551 | migration_entry_wait(vma->vm_mm, vmf->pmd, |
2552 | vmf->address); | ||
2534 | } else if (is_hwpoison_entry(entry)) { | 2553 | } else if (is_hwpoison_entry(entry)) { |
2535 | ret = VM_FAULT_HWPOISON; | 2554 | ret = VM_FAULT_HWPOISON; |
2536 | } else { | 2555 | } else { |
2537 | print_bad_pte(vma, fe->address, orig_pte, NULL); | 2556 | print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); |
2538 | ret = VM_FAULT_SIGBUS; | 2557 | ret = VM_FAULT_SIGBUS; |
2539 | } | 2558 | } |
2540 | goto out; | 2559 | goto out; |
@@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2542 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2561 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2543 | page = lookup_swap_cache(entry); | 2562 | page = lookup_swap_cache(entry); |
2544 | if (!page) { | 2563 | if (!page) { |
2545 | page = swapin_readahead(entry, | 2564 | page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, |
2546 | GFP_HIGHUSER_MOVABLE, vma, fe->address); | 2565 | vmf->address); |
2547 | if (!page) { | 2566 | if (!page) { |
2548 | /* | 2567 | /* |
2549 | * Back out if somebody else faulted in this pte | 2568 | * Back out if somebody else faulted in this pte |
2550 | * while we released the pte lock. | 2569 | * while we released the pte lock. |
2551 | */ | 2570 | */ |
2552 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, | 2571 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
2553 | fe->address, &fe->ptl); | 2572 | vmf->address, &vmf->ptl); |
2554 | if (likely(pte_same(*fe->pte, orig_pte))) | 2573 | if (likely(pte_same(*vmf->pte, vmf->orig_pte))) |
2555 | ret = VM_FAULT_OOM; | 2574 | ret = VM_FAULT_OOM; |
2556 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2575 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2557 | goto unlock; | 2576 | goto unlock; |
@@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2573 | } | 2592 | } |
2574 | 2593 | ||
2575 | swapcache = page; | 2594 | swapcache = page; |
2576 | locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); | 2595 | locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); |
2577 | 2596 | ||
2578 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2597 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2579 | if (!locked) { | 2598 | if (!locked) { |
@@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2590 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2609 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2591 | goto out_page; | 2610 | goto out_page; |
2592 | 2611 | ||
2593 | page = ksm_might_need_to_copy(page, vma, fe->address); | 2612 | page = ksm_might_need_to_copy(page, vma, vmf->address); |
2594 | if (unlikely(!page)) { | 2613 | if (unlikely(!page)) { |
2595 | ret = VM_FAULT_OOM; | 2614 | ret = VM_FAULT_OOM; |
2596 | page = swapcache; | 2615 | page = swapcache; |
@@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2606 | /* | 2625 | /* |
2607 | * Back out if somebody else already faulted in this pte. | 2626 | * Back out if somebody else already faulted in this pte. |
2608 | */ | 2627 | */ |
2609 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2628 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
2610 | &fe->ptl); | 2629 | &vmf->ptl); |
2611 | if (unlikely(!pte_same(*fe->pte, orig_pte))) | 2630 | if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) |
2612 | goto out_nomap; | 2631 | goto out_nomap; |
2613 | 2632 | ||
2614 | if (unlikely(!PageUptodate(page))) { | 2633 | if (unlikely(!PageUptodate(page))) { |
@@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2629 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2648 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2630 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); | 2649 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); |
2631 | pte = mk_pte(page, vma->vm_page_prot); | 2650 | pte = mk_pte(page, vma->vm_page_prot); |
2632 | if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { | 2651 | if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { |
2633 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2652 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2634 | fe->flags &= ~FAULT_FLAG_WRITE; | 2653 | vmf->flags &= ~FAULT_FLAG_WRITE; |
2635 | ret |= VM_FAULT_WRITE; | 2654 | ret |= VM_FAULT_WRITE; |
2636 | exclusive = RMAP_EXCLUSIVE; | 2655 | exclusive = RMAP_EXCLUSIVE; |
2637 | } | 2656 | } |
2638 | flush_icache_page(vma, page); | 2657 | flush_icache_page(vma, page); |
2639 | if (pte_swp_soft_dirty(orig_pte)) | 2658 | if (pte_swp_soft_dirty(vmf->orig_pte)) |
2640 | pte = pte_mksoft_dirty(pte); | 2659 | pte = pte_mksoft_dirty(pte); |
2641 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); | 2660 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); |
2661 | vmf->orig_pte = pte; | ||
2642 | if (page == swapcache) { | 2662 | if (page == swapcache) { |
2643 | do_page_add_anon_rmap(page, vma, fe->address, exclusive); | 2663 | do_page_add_anon_rmap(page, vma, vmf->address, exclusive); |
2644 | mem_cgroup_commit_charge(page, memcg, true, false); | 2664 | mem_cgroup_commit_charge(page, memcg, true, false); |
2645 | activate_page(page); | 2665 | activate_page(page); |
2646 | } else { /* ksm created a completely new copy */ | 2666 | } else { /* ksm created a completely new copy */ |
2647 | page_add_new_anon_rmap(page, vma, fe->address, false); | 2667 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
2648 | mem_cgroup_commit_charge(page, memcg, false, false); | 2668 | mem_cgroup_commit_charge(page, memcg, false, false); |
2649 | lru_cache_add_active_or_unevictable(page, vma); | 2669 | lru_cache_add_active_or_unevictable(page, vma); |
2650 | } | 2670 | } |
@@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
2667 | put_page(swapcache); | 2687 | put_page(swapcache); |
2668 | } | 2688 | } |
2669 | 2689 | ||
2670 | if (fe->flags & FAULT_FLAG_WRITE) { | 2690 | if (vmf->flags & FAULT_FLAG_WRITE) { |
2671 | ret |= do_wp_page(fe, pte); | 2691 | ret |= do_wp_page(vmf); |
2672 | if (ret & VM_FAULT_ERROR) | 2692 | if (ret & VM_FAULT_ERROR) |
2673 | ret &= VM_FAULT_ERROR; | 2693 | ret &= VM_FAULT_ERROR; |
2674 | goto out; | 2694 | goto out; |
2675 | } | 2695 | } |
2676 | 2696 | ||
2677 | /* No need to invalidate - it was non-present before */ | 2697 | /* No need to invalidate - it was non-present before */ |
2678 | update_mmu_cache(vma, fe->address, fe->pte); | 2698 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2679 | unlock: | 2699 | unlock: |
2680 | pte_unmap_unlock(fe->pte, fe->ptl); | 2700 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2681 | out: | 2701 | out: |
2682 | return ret; | 2702 | return ret; |
2683 | out_nomap: | 2703 | out_nomap: |
2684 | mem_cgroup_cancel_charge(page, memcg, false); | 2704 | mem_cgroup_cancel_charge(page, memcg, false); |
2685 | pte_unmap_unlock(fe->pte, fe->ptl); | 2705 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2686 | out_page: | 2706 | out_page: |
2687 | unlock_page(page); | 2707 | unlock_page(page); |
2688 | out_release: | 2708 | out_release: |
@@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2733 | * but allow concurrent faults), and pte mapped but not yet locked. | 2753 | * but allow concurrent faults), and pte mapped but not yet locked. |
2734 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2754 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2735 | */ | 2755 | */ |
2736 | static int do_anonymous_page(struct fault_env *fe) | 2756 | static int do_anonymous_page(struct vm_fault *vmf) |
2737 | { | 2757 | { |
2738 | struct vm_area_struct *vma = fe->vma; | 2758 | struct vm_area_struct *vma = vmf->vma; |
2739 | struct mem_cgroup *memcg; | 2759 | struct mem_cgroup *memcg; |
2740 | struct page *page; | 2760 | struct page *page; |
2741 | pte_t entry; | 2761 | pte_t entry; |
@@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2745 | return VM_FAULT_SIGBUS; | 2765 | return VM_FAULT_SIGBUS; |
2746 | 2766 | ||
2747 | /* Check if we need to add a guard page to the stack */ | 2767 | /* Check if we need to add a guard page to the stack */ |
2748 | if (check_stack_guard_page(vma, fe->address) < 0) | 2768 | if (check_stack_guard_page(vma, vmf->address) < 0) |
2749 | return VM_FAULT_SIGSEGV; | 2769 | return VM_FAULT_SIGSEGV; |
2750 | 2770 | ||
2751 | /* | 2771 | /* |
@@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2758 | * | 2778 | * |
2759 | * Here we only have down_read(mmap_sem). | 2779 | * Here we only have down_read(mmap_sem). |
2760 | */ | 2780 | */ |
2761 | if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) | 2781 | if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) |
2762 | return VM_FAULT_OOM; | 2782 | return VM_FAULT_OOM; |
2763 | 2783 | ||
2764 | /* See the comment in pte_alloc_one_map() */ | 2784 | /* See the comment in pte_alloc_one_map() */ |
2765 | if (unlikely(pmd_trans_unstable(fe->pmd))) | 2785 | if (unlikely(pmd_trans_unstable(vmf->pmd))) |
2766 | return 0; | 2786 | return 0; |
2767 | 2787 | ||
2768 | /* Use the zero-page for reads */ | 2788 | /* Use the zero-page for reads */ |
2769 | if (!(fe->flags & FAULT_FLAG_WRITE) && | 2789 | if (!(vmf->flags & FAULT_FLAG_WRITE) && |
2770 | !mm_forbids_zeropage(vma->vm_mm)) { | 2790 | !mm_forbids_zeropage(vma->vm_mm)) { |
2771 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), | 2791 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), |
2772 | vma->vm_page_prot)); | 2792 | vma->vm_page_prot)); |
2773 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2793 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
2774 | &fe->ptl); | 2794 | vmf->address, &vmf->ptl); |
2775 | if (!pte_none(*fe->pte)) | 2795 | if (!pte_none(*vmf->pte)) |
2776 | goto unlock; | 2796 | goto unlock; |
2777 | /* Deliver the page fault to userland, check inside PT lock */ | 2797 | /* Deliver the page fault to userland, check inside PT lock */ |
2778 | if (userfaultfd_missing(vma)) { | 2798 | if (userfaultfd_missing(vma)) { |
2779 | pte_unmap_unlock(fe->pte, fe->ptl); | 2799 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2780 | return handle_userfault(fe, VM_UFFD_MISSING); | 2800 | return handle_userfault(vmf, VM_UFFD_MISSING); |
2781 | } | 2801 | } |
2782 | goto setpte; | 2802 | goto setpte; |
2783 | } | 2803 | } |
@@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2785 | /* Allocate our own private page. */ | 2805 | /* Allocate our own private page. */ |
2786 | if (unlikely(anon_vma_prepare(vma))) | 2806 | if (unlikely(anon_vma_prepare(vma))) |
2787 | goto oom; | 2807 | goto oom; |
2788 | page = alloc_zeroed_user_highpage_movable(vma, fe->address); | 2808 | page = alloc_zeroed_user_highpage_movable(vma, vmf->address); |
2789 | if (!page) | 2809 | if (!page) |
2790 | goto oom; | 2810 | goto oom; |
2791 | 2811 | ||
@@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe) | |||
2803 | if (vma->vm_flags & VM_WRITE) | 2823 | if (vma->vm_flags & VM_WRITE) |
2804 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2824 | entry = pte_mkwrite(pte_mkdirty(entry)); |
2805 | 2825 | ||
2806 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2826 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
2807 | &fe->ptl); | 2827 | &vmf->ptl); |
2808 | if (!pte_none(*fe->pte)) | 2828 | if (!pte_none(*vmf->pte)) |
2809 | goto release; | 2829 | goto release; |
2810 | 2830 | ||
2811 | /* Deliver the page fault to userland, check inside PT lock */ | 2831 | /* Deliver the page fault to userland, check inside PT lock */ |
2812 | if (userfaultfd_missing(vma)) { | 2832 | if (userfaultfd_missing(vma)) { |
2813 | pte_unmap_unlock(fe->pte, fe->ptl); | 2833 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2814 | mem_cgroup_cancel_charge(page, memcg, false); | 2834 | mem_cgroup_cancel_charge(page, memcg, false); |
2815 | put_page(page); | 2835 | put_page(page); |
2816 | return handle_userfault(fe, VM_UFFD_MISSING); | 2836 | return handle_userfault(vmf, VM_UFFD_MISSING); |
2817 | } | 2837 | } |
2818 | 2838 | ||
2819 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2839 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2820 | page_add_new_anon_rmap(page, vma, fe->address, false); | 2840 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
2821 | mem_cgroup_commit_charge(page, memcg, false, false); | 2841 | mem_cgroup_commit_charge(page, memcg, false, false); |
2822 | lru_cache_add_active_or_unevictable(page, vma); | 2842 | lru_cache_add_active_or_unevictable(page, vma); |
2823 | setpte: | 2843 | setpte: |
2824 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); | 2844 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); |
2825 | 2845 | ||
2826 | /* No need to invalidate - it was non-present before */ | 2846 | /* No need to invalidate - it was non-present before */ |
2827 | update_mmu_cache(vma, fe->address, fe->pte); | 2847 | update_mmu_cache(vma, vmf->address, vmf->pte); |
2828 | unlock: | 2848 | unlock: |
2829 | pte_unmap_unlock(fe->pte, fe->ptl); | 2849 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
2830 | return 0; | 2850 | return 0; |
2831 | release: | 2851 | release: |
2832 | mem_cgroup_cancel_charge(page, memcg, false); | 2852 | mem_cgroup_cancel_charge(page, memcg, false); |
@@ -2843,62 +2863,50 @@ oom: | |||
2843 | * released depending on flags and vma->vm_ops->fault() return value. | 2863 | * released depending on flags and vma->vm_ops->fault() return value. |
2844 | * See filemap_fault() and __lock_page_retry(). | 2864 | * See filemap_fault() and __lock_page_retry(). |
2845 | */ | 2865 | */ |
2846 | static int __do_fault(struct fault_env *fe, pgoff_t pgoff, | 2866 | static int __do_fault(struct vm_fault *vmf) |
2847 | struct page *cow_page, struct page **page, void **entry) | ||
2848 | { | 2867 | { |
2849 | struct vm_area_struct *vma = fe->vma; | 2868 | struct vm_area_struct *vma = vmf->vma; |
2850 | struct vm_fault vmf; | ||
2851 | int ret; | 2869 | int ret; |
2852 | 2870 | ||
2853 | vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); | 2871 | ret = vma->vm_ops->fault(vma, vmf); |
2854 | vmf.pgoff = pgoff; | 2872 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | |
2855 | vmf.flags = fe->flags; | 2873 | VM_FAULT_DONE_COW))) |
2856 | vmf.page = NULL; | ||
2857 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
2858 | vmf.cow_page = cow_page; | ||
2859 | |||
2860 | ret = vma->vm_ops->fault(vma, &vmf); | ||
2861 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | ||
2862 | return ret; | ||
2863 | if (ret & VM_FAULT_DAX_LOCKED) { | ||
2864 | *entry = vmf.entry; | ||
2865 | return ret; | 2874 | return ret; |
2866 | } | ||
2867 | 2875 | ||
2868 | if (unlikely(PageHWPoison(vmf.page))) { | 2876 | if (unlikely(PageHWPoison(vmf->page))) { |
2869 | if (ret & VM_FAULT_LOCKED) | 2877 | if (ret & VM_FAULT_LOCKED) |
2870 | unlock_page(vmf.page); | 2878 | unlock_page(vmf->page); |
2871 | put_page(vmf.page); | 2879 | put_page(vmf->page); |
2880 | vmf->page = NULL; | ||
2872 | return VM_FAULT_HWPOISON; | 2881 | return VM_FAULT_HWPOISON; |
2873 | } | 2882 | } |
2874 | 2883 | ||
2875 | if (unlikely(!(ret & VM_FAULT_LOCKED))) | 2884 | if (unlikely(!(ret & VM_FAULT_LOCKED))) |
2876 | lock_page(vmf.page); | 2885 | lock_page(vmf->page); |
2877 | else | 2886 | else |
2878 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); | 2887 | VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); |
2879 | 2888 | ||
2880 | *page = vmf.page; | ||
2881 | return ret; | 2889 | return ret; |
2882 | } | 2890 | } |
2883 | 2891 | ||
2884 | static int pte_alloc_one_map(struct fault_env *fe) | 2892 | static int pte_alloc_one_map(struct vm_fault *vmf) |
2885 | { | 2893 | { |
2886 | struct vm_area_struct *vma = fe->vma; | 2894 | struct vm_area_struct *vma = vmf->vma; |
2887 | 2895 | ||
2888 | if (!pmd_none(*fe->pmd)) | 2896 | if (!pmd_none(*vmf->pmd)) |
2889 | goto map_pte; | 2897 | goto map_pte; |
2890 | if (fe->prealloc_pte) { | 2898 | if (vmf->prealloc_pte) { |
2891 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 2899 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
2892 | if (unlikely(!pmd_none(*fe->pmd))) { | 2900 | if (unlikely(!pmd_none(*vmf->pmd))) { |
2893 | spin_unlock(fe->ptl); | 2901 | spin_unlock(vmf->ptl); |
2894 | goto map_pte; | 2902 | goto map_pte; |
2895 | } | 2903 | } |
2896 | 2904 | ||
2897 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 2905 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
2898 | pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); | 2906 | pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); |
2899 | spin_unlock(fe->ptl); | 2907 | spin_unlock(vmf->ptl); |
2900 | fe->prealloc_pte = 0; | 2908 | vmf->prealloc_pte = 0; |
2901 | } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { | 2909 | } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { |
2902 | return VM_FAULT_OOM; | 2910 | return VM_FAULT_OOM; |
2903 | } | 2911 | } |
2904 | map_pte: | 2912 | map_pte: |
@@ -2913,11 +2921,11 @@ map_pte: | |||
2913 | * through an atomic read in C, which is what pmd_trans_unstable() | 2921 | * through an atomic read in C, which is what pmd_trans_unstable() |
2914 | * provides. | 2922 | * provides. |
2915 | */ | 2923 | */ |
2916 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) | 2924 | if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) |
2917 | return VM_FAULT_NOPAGE; | 2925 | return VM_FAULT_NOPAGE; |
2918 | 2926 | ||
2919 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2927 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
2920 | &fe->ptl); | 2928 | &vmf->ptl); |
2921 | return 0; | 2929 | return 0; |
2922 | } | 2930 | } |
2923 | 2931 | ||
@@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, | |||
2935 | return true; | 2943 | return true; |
2936 | } | 2944 | } |
2937 | 2945 | ||
2938 | static void deposit_prealloc_pte(struct fault_env *fe) | 2946 | static void deposit_prealloc_pte(struct vm_fault *vmf) |
2939 | { | 2947 | { |
2940 | struct vm_area_struct *vma = fe->vma; | 2948 | struct vm_area_struct *vma = vmf->vma; |
2941 | 2949 | ||
2942 | pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); | 2950 | pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); |
2943 | /* | 2951 | /* |
2944 | * We are going to consume the prealloc table, | 2952 | * We are going to consume the prealloc table, |
2945 | * count that as nr_ptes. | 2953 | * count that as nr_ptes. |
2946 | */ | 2954 | */ |
2947 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 2955 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
2948 | fe->prealloc_pte = 0; | 2956 | vmf->prealloc_pte = 0; |
2949 | } | 2957 | } |
2950 | 2958 | ||
2951 | static int do_set_pmd(struct fault_env *fe, struct page *page) | 2959 | static int do_set_pmd(struct vm_fault *vmf, struct page *page) |
2952 | { | 2960 | { |
2953 | struct vm_area_struct *vma = fe->vma; | 2961 | struct vm_area_struct *vma = vmf->vma; |
2954 | bool write = fe->flags & FAULT_FLAG_WRITE; | 2962 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
2955 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 2963 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
2956 | pmd_t entry; | 2964 | pmd_t entry; |
2957 | int i, ret; | 2965 | int i, ret; |
2958 | 2966 | ||
@@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
2966 | * Archs like ppc64 need additional space to store information | 2974 | * Archs like ppc64 need additional space to store information |
2967 | * related to pte entry. Use the preallocated table for that. | 2975 | * related to pte entry. Use the preallocated table for that. |
2968 | */ | 2976 | */ |
2969 | if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { | 2977 | if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { |
2970 | fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); | 2978 | vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); |
2971 | if (!fe->prealloc_pte) | 2979 | if (!vmf->prealloc_pte) |
2972 | return VM_FAULT_OOM; | 2980 | return VM_FAULT_OOM; |
2973 | smp_wmb(); /* See comment in __pte_alloc() */ | 2981 | smp_wmb(); /* See comment in __pte_alloc() */ |
2974 | } | 2982 | } |
2975 | 2983 | ||
2976 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 2984 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
2977 | if (unlikely(!pmd_none(*fe->pmd))) | 2985 | if (unlikely(!pmd_none(*vmf->pmd))) |
2978 | goto out; | 2986 | goto out; |
2979 | 2987 | ||
2980 | for (i = 0; i < HPAGE_PMD_NR; i++) | 2988 | for (i = 0; i < HPAGE_PMD_NR; i++) |
@@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
2990 | * deposit and withdraw with pmd lock held | 2998 | * deposit and withdraw with pmd lock held |
2991 | */ | 2999 | */ |
2992 | if (arch_needs_pgtable_deposit()) | 3000 | if (arch_needs_pgtable_deposit()) |
2993 | deposit_prealloc_pte(fe); | 3001 | deposit_prealloc_pte(vmf); |
2994 | 3002 | ||
2995 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | 3003 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
2996 | 3004 | ||
2997 | update_mmu_cache_pmd(vma, haddr, fe->pmd); | 3005 | update_mmu_cache_pmd(vma, haddr, vmf->pmd); |
2998 | 3006 | ||
2999 | /* fault is handled */ | 3007 | /* fault is handled */ |
3000 | ret = 0; | 3008 | ret = 0; |
@@ -3005,13 +3013,13 @@ out: | |||
3005 | * withdraw with pmd lock held. | 3013 | * withdraw with pmd lock held. |
3006 | */ | 3014 | */ |
3007 | if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) | 3015 | if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) |
3008 | fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, | 3016 | vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, |
3009 | fe->pmd); | 3017 | vmf->pmd); |
3010 | spin_unlock(fe->ptl); | 3018 | spin_unlock(vmf->ptl); |
3011 | return ret; | 3019 | return ret; |
3012 | } | 3020 | } |
3013 | #else | 3021 | #else |
3014 | static int do_set_pmd(struct fault_env *fe, struct page *page) | 3022 | static int do_set_pmd(struct vm_fault *vmf, struct page *page) |
3015 | { | 3023 | { |
3016 | BUILD_BUG(); | 3024 | BUILD_BUG(); |
3017 | return 0; | 3025 | return 0; |
@@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
3022 | * alloc_set_pte - setup new PTE entry for given page and add reverse page | 3030 | * alloc_set_pte - setup new PTE entry for given page and add reverse page |
3023 | * mapping. If needed, the function allocates a page table or uses a pre-allocated one. | 3031 | * mapping. If needed, the function allocates a page table or uses a pre-allocated one. |
3024 | * | 3032 | * |
3025 | * @fe: fault environment | 3033 | * @vmf: fault environment |
3026 | * @memcg: memcg to charge page (only for private mappings) | 3034 | * @memcg: memcg to charge page (only for private mappings) |
3027 | * @page: page to map | 3035 | * @page: page to map |
3028 | * | 3036 | * |
3029 | * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. | 3037 | * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on |
3038 | * return. | ||
3030 | * | 3039 | * |
3031 | * Target users are page handler itself and implementations of | 3040 | * Target users are page handler itself and implementations of |
3032 | * vm_ops->map_pages. | 3041 | * vm_ops->map_pages. |
3033 | */ | 3042 | */ |
3034 | int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | 3043 | int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, |
3035 | struct page *page) | 3044 | struct page *page) |
3036 | { | 3045 | { |
3037 | struct vm_area_struct *vma = fe->vma; | 3046 | struct vm_area_struct *vma = vmf->vma; |
3038 | bool write = fe->flags & FAULT_FLAG_WRITE; | 3047 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
3039 | pte_t entry; | 3048 | pte_t entry; |
3040 | int ret; | 3049 | int ret; |
3041 | 3050 | ||
3042 | if (pmd_none(*fe->pmd) && PageTransCompound(page) && | 3051 | if (pmd_none(*vmf->pmd) && PageTransCompound(page) && |
3043 | IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { | 3052 | IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { |
3044 | /* THP on COW? */ | 3053 | /* THP on COW? */ |
3045 | VM_BUG_ON_PAGE(memcg, page); | 3054 | VM_BUG_ON_PAGE(memcg, page); |
3046 | 3055 | ||
3047 | ret = do_set_pmd(fe, page); | 3056 | ret = do_set_pmd(vmf, page); |
3048 | if (ret != VM_FAULT_FALLBACK) | 3057 | if (ret != VM_FAULT_FALLBACK) |
3049 | goto fault_handled; | 3058 | goto fault_handled; |
3050 | } | 3059 | } |
3051 | 3060 | ||
3052 | if (!fe->pte) { | 3061 | if (!vmf->pte) { |
3053 | ret = pte_alloc_one_map(fe); | 3062 | ret = pte_alloc_one_map(vmf); |
3054 | if (ret) | 3063 | if (ret) |
3055 | goto fault_handled; | 3064 | goto fault_handled; |
3056 | } | 3065 | } |
3057 | 3066 | ||
3058 | /* Re-check under ptl */ | 3067 | /* Re-check under ptl */ |
3059 | if (unlikely(!pte_none(*fe->pte))) { | 3068 | if (unlikely(!pte_none(*vmf->pte))) { |
3060 | ret = VM_FAULT_NOPAGE; | 3069 | ret = VM_FAULT_NOPAGE; |
3061 | goto fault_handled; | 3070 | goto fault_handled; |
3062 | } | 3071 | } |
@@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | |||
3068 | /* copy-on-write page */ | 3077 | /* copy-on-write page */ |
3069 | if (write && !(vma->vm_flags & VM_SHARED)) { | 3078 | if (write && !(vma->vm_flags & VM_SHARED)) { |
3070 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 3079 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
3071 | page_add_new_anon_rmap(page, vma, fe->address, false); | 3080 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
3072 | mem_cgroup_commit_charge(page, memcg, false, false); | 3081 | mem_cgroup_commit_charge(page, memcg, false, false); |
3073 | lru_cache_add_active_or_unevictable(page, vma); | 3082 | lru_cache_add_active_or_unevictable(page, vma); |
3074 | } else { | 3083 | } else { |
3075 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); | 3084 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); |
3076 | page_add_file_rmap(page, false); | 3085 | page_add_file_rmap(page, false); |
3077 | } | 3086 | } |
3078 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); | 3087 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); |
3079 | 3088 | ||
3080 | /* no need to invalidate: a not-present page won't be cached */ | 3089 | /* no need to invalidate: a not-present page won't be cached */ |
3081 | update_mmu_cache(vma, fe->address, fe->pte); | 3090 | update_mmu_cache(vma, vmf->address, vmf->pte); |
3082 | ret = 0; | 3091 | ret = 0; |
3083 | 3092 | ||
3084 | fault_handled: | 3093 | fault_handled: |
3085 | /* preallocated pagetable is unused: free it */ | 3094 | /* preallocated pagetable is unused: free it */ |
3086 | if (fe->prealloc_pte) { | 3095 | if (vmf->prealloc_pte) { |
3087 | pte_free(fe->vma->vm_mm, fe->prealloc_pte); | 3096 | pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); |
3088 | fe->prealloc_pte = 0; | 3097 | vmf->prealloc_pte = 0; |
3089 | } | 3098 | } |
3090 | return ret; | 3099 | return ret; |
3091 | } | 3100 | } |
3092 | 3101 | ||
3102 | |||
3103 | /** | ||
3104 | * finish_fault - finish page fault once we have prepared the page to fault | ||
3105 | * | ||
3106 | * @vmf: structure describing the fault | ||
3107 | * | ||
3108 | * This function handles all that is needed to finish a page fault once the | ||
3109 | * page to fault in is prepared. It handles locking of PTEs, inserts PTE for | ||
3110 | * given page, adds reverse page mapping, handles memcg charges and LRU | ||
3111 | * addition. The function returns 0 on success, VM_FAULT_ code in case of | ||
3112 | * error. | ||
3113 | * | ||
3114 | * The function expects the page to be locked and on success it consumes a | ||
3115 | * reference of a page being mapped (for the PTE which maps it). | ||
3116 | */ | ||
3117 | int finish_fault(struct vm_fault *vmf) | ||
3118 | { | ||
3119 | struct page *page; | ||
3120 | int ret; | ||
3121 | |||
3122 | /* Did we COW the page? */ | ||
3123 | if ((vmf->flags & FAULT_FLAG_WRITE) && | ||
3124 | !(vmf->vma->vm_flags & VM_SHARED)) | ||
3125 | page = vmf->cow_page; | ||
3126 | else | ||
3127 | page = vmf->page; | ||
3128 | ret = alloc_set_pte(vmf, vmf->memcg, page); | ||
3129 | if (vmf->pte) | ||
3130 | pte_unmap_unlock(vmf->pte, vmf->ptl); | ||
3131 | return ret; | ||
3132 | } | ||
3133 | |||
3093 | static unsigned long fault_around_bytes __read_mostly = | 3134 | static unsigned long fault_around_bytes __read_mostly = |
3094 | rounddown_pow_of_two(65536); | 3135 | rounddown_pow_of_two(65536); |
3095 | 3136 | ||
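The new finish_fault() helper folds the alloc_set_pte() plus pte_unmap_unlock() tail that do_read_fault(), do_cow_fault() and do_shared_fault() used to open-code. After the conversion the callers later in this diff reduce to roughly the shape below; this is a simplified sketch of a read-fault caller, not the exact final code.

```c
/* Sketch of a read-fault caller after the conversion (simplified). */
static int read_fault_sketch(struct vm_fault *vmf)
{
	int ret;

	ret = __do_fault(vmf);		/* ->fault fills vmf->page */
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;

	ret |= finish_fault(vmf);	/* maps vmf->page, drops ptl if taken */
	unlock_page(vmf->page);
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		put_page(vmf->page);
	return ret;
}
```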
@@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs); | |||
3154 | * fault_around_pages() value (and therefore to page order). This way it's | 3195 | * fault_around_pages() value (and therefore to page order). This way it's |
3155 | * easier to guarantee that we don't cross page table boundaries. | 3196 | * easier to guarantee that we don't cross page table boundaries. |
3156 | */ | 3197 | */ |
3157 | static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) | 3198 | static int do_fault_around(struct vm_fault *vmf) |
3158 | { | 3199 | { |
3159 | unsigned long address = fe->address, nr_pages, mask; | 3200 | unsigned long address = vmf->address, nr_pages, mask; |
3201 | pgoff_t start_pgoff = vmf->pgoff; | ||
3160 | pgoff_t end_pgoff; | 3202 | pgoff_t end_pgoff; |
3161 | int off, ret = 0; | 3203 | int off, ret = 0; |
3162 | 3204 | ||
3163 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 3205 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
3164 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 3206 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
3165 | 3207 | ||
3166 | fe->address = max(address & mask, fe->vma->vm_start); | 3208 | vmf->address = max(address & mask, vmf->vma->vm_start); |
3167 | off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 3209 | off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
3168 | start_pgoff -= off; | 3210 | start_pgoff -= off; |
3169 | 3211 | ||
3170 | /* | 3212 | /* |
@@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) | |||
3172 | * or fault_around_pages() from start_pgoff, depending what is nearest. | 3214 | * or fault_around_pages() from start_pgoff, depending what is nearest. |
3173 | */ | 3215 | */ |
3174 | end_pgoff = start_pgoff - | 3216 | end_pgoff = start_pgoff - |
3175 | ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 3217 | ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
3176 | PTRS_PER_PTE - 1; | 3218 | PTRS_PER_PTE - 1; |
3177 | end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, | 3219 | end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, |
3178 | start_pgoff + nr_pages - 1); | 3220 | start_pgoff + nr_pages - 1); |
3179 | 3221 | ||
3180 | if (pmd_none(*fe->pmd)) { | 3222 | if (pmd_none(*vmf->pmd)) { |
3181 | fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); | 3223 | vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, |
3182 | if (!fe->prealloc_pte) | 3224 | vmf->address); |
3225 | if (!vmf->prealloc_pte) | ||
3183 | goto out; | 3226 | goto out; |
3184 | smp_wmb(); /* See comment in __pte_alloc() */ | 3227 | smp_wmb(); /* See comment in __pte_alloc() */ |
3185 | } | 3228 | } |
3186 | 3229 | ||
3187 | fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); | 3230 | vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); |
3188 | 3231 | ||
3189 | /* Huge page is mapped? Page fault is solved */ | 3232 | /* Huge page is mapped? Page fault is solved */ |
3190 | if (pmd_trans_huge(*fe->pmd)) { | 3233 | if (pmd_trans_huge(*vmf->pmd)) { |
3191 | ret = VM_FAULT_NOPAGE; | 3234 | ret = VM_FAULT_NOPAGE; |
3192 | goto out; | 3235 | goto out; |
3193 | } | 3236 | } |
3194 | 3237 | ||
3195 | /* ->map_pages() hasn't done anything useful. Cold page cache? */ | 3238 | /* ->map_pages() hasn't done anything useful. Cold page cache? */ |
3196 | if (!fe->pte) | 3239 | if (!vmf->pte) |
3197 | goto out; | 3240 | goto out; |
3198 | 3241 | ||
3199 | /* check if the page fault is solved */ | 3242 | /* check if the page fault is solved */ |
3200 | fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); | 3243 | vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); |
3201 | if (!pte_none(*fe->pte)) | 3244 | if (!pte_none(*vmf->pte)) |
3202 | ret = VM_FAULT_NOPAGE; | 3245 | ret = VM_FAULT_NOPAGE; |
3203 | pte_unmap_unlock(fe->pte, fe->ptl); | 3246 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3204 | out: | 3247 | out: |
3205 | fe->address = address; | 3248 | vmf->address = address; |
3206 | fe->pte = NULL; | 3249 | vmf->pte = NULL; |
3207 | return ret; | 3250 | return ret; |
3208 | } | 3251 | } |
3209 | 3252 | ||
3210 | static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) | 3253 | static int do_read_fault(struct vm_fault *vmf) |
3211 | { | 3254 | { |
3212 | struct vm_area_struct *vma = fe->vma; | 3255 | struct vm_area_struct *vma = vmf->vma; |
3213 | struct page *fault_page; | ||
3214 | int ret = 0; | 3256 | int ret = 0; |
3215 | 3257 | ||
3216 | /* | 3258 | /* |
@@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) | |||
3219 | * something). | 3261 | * something). |
3220 | */ | 3262 | */ |
3221 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { | 3263 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
3222 | ret = do_fault_around(fe, pgoff); | 3264 | ret = do_fault_around(vmf); |
3223 | if (ret) | 3265 | if (ret) |
3224 | return ret; | 3266 | return ret; |
3225 | } | 3267 | } |
3226 | 3268 | ||
3227 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); | 3269 | ret = __do_fault(vmf); |
3228 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3270 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3229 | return ret; | 3271 | return ret; |
3230 | 3272 | ||
3231 | ret |= alloc_set_pte(fe, NULL, fault_page); | 3273 | ret |= finish_fault(vmf); |
3232 | if (fe->pte) | 3274 | unlock_page(vmf->page); |
3233 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3234 | unlock_page(fault_page); | ||
3235 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3275 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3236 | put_page(fault_page); | 3276 | put_page(vmf->page); |
3237 | return ret; | 3277 | return ret; |
3238 | } | 3278 | } |
3239 | 3279 | ||
3240 | static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) | 3280 | static int do_cow_fault(struct vm_fault *vmf) |
3241 | { | 3281 | { |
3242 | struct vm_area_struct *vma = fe->vma; | 3282 | struct vm_area_struct *vma = vmf->vma; |
3243 | struct page *fault_page, *new_page; | ||
3244 | void *fault_entry; | ||
3245 | struct mem_cgroup *memcg; | ||
3246 | int ret; | 3283 | int ret; |
3247 | 3284 | ||
3248 | if (unlikely(anon_vma_prepare(vma))) | 3285 | if (unlikely(anon_vma_prepare(vma))) |
3249 | return VM_FAULT_OOM; | 3286 | return VM_FAULT_OOM; |
3250 | 3287 | ||
3251 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); | 3288 | vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); |
3252 | if (!new_page) | 3289 | if (!vmf->cow_page) |
3253 | return VM_FAULT_OOM; | 3290 | return VM_FAULT_OOM; |
3254 | 3291 | ||
3255 | if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, | 3292 | if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, |
3256 | &memcg, false)) { | 3293 | &vmf->memcg, false)) { |
3257 | put_page(new_page); | 3294 | put_page(vmf->cow_page); |
3258 | return VM_FAULT_OOM; | 3295 | return VM_FAULT_OOM; |
3259 | } | 3296 | } |
3260 | 3297 | ||
3261 | ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); | 3298 | ret = __do_fault(vmf); |
3262 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3299 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3263 | goto uncharge_out; | 3300 | goto uncharge_out; |
3301 | if (ret & VM_FAULT_DONE_COW) | ||
3302 | return ret; | ||
3264 | 3303 | ||
3265 | if (!(ret & VM_FAULT_DAX_LOCKED)) | 3304 | copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); |
3266 | copy_user_highpage(new_page, fault_page, fe->address, vma); | 3305 | __SetPageUptodate(vmf->cow_page); |
3267 | __SetPageUptodate(new_page); | ||
3268 | 3306 | ||
3269 | ret |= alloc_set_pte(fe, memcg, new_page); | 3307 | ret |= finish_fault(vmf); |
3270 | if (fe->pte) | 3308 | unlock_page(vmf->page); |
3271 | pte_unmap_unlock(fe->pte, fe->ptl); | 3309 | put_page(vmf->page); |
3272 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | ||
3273 | unlock_page(fault_page); | ||
3274 | put_page(fault_page); | ||
3275 | } else { | ||
3276 | dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); | ||
3277 | } | ||
3278 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3310 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3279 | goto uncharge_out; | 3311 | goto uncharge_out; |
3280 | return ret; | 3312 | return ret; |
3281 | uncharge_out: | 3313 | uncharge_out: |
3282 | mem_cgroup_cancel_charge(new_page, memcg, false); | 3314 | mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); |
3283 | put_page(new_page); | 3315 | put_page(vmf->cow_page); |
3284 | return ret; | 3316 | return ret; |
3285 | } | 3317 | } |
3286 | 3318 | ||
3287 | static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | 3319 | static int do_shared_fault(struct vm_fault *vmf) |
3288 | { | 3320 | { |
3289 | struct vm_area_struct *vma = fe->vma; | 3321 | struct vm_area_struct *vma = vmf->vma; |
3290 | struct page *fault_page; | ||
3291 | struct address_space *mapping; | ||
3292 | int dirtied = 0; | ||
3293 | int ret, tmp; | 3322 | int ret, tmp; |
3294 | 3323 | ||
3295 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); | 3324 | ret = __do_fault(vmf); |
3296 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3325 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3297 | return ret; | 3326 | return ret; |
3298 | 3327 | ||
@@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | |||
3301 | * about to become writable | 3330 | * about to become writable |
3302 | */ | 3331 | */ |
3303 | if (vma->vm_ops->page_mkwrite) { | 3332 | if (vma->vm_ops->page_mkwrite) { |
3304 | unlock_page(fault_page); | 3333 | unlock_page(vmf->page); |
3305 | tmp = do_page_mkwrite(vma, fault_page, fe->address); | 3334 | tmp = do_page_mkwrite(vmf); |
3306 | if (unlikely(!tmp || | 3335 | if (unlikely(!tmp || |
3307 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 3336 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
3308 | put_page(fault_page); | 3337 | put_page(vmf->page); |
3309 | return tmp; | 3338 | return tmp; |
3310 | } | 3339 | } |
3311 | } | 3340 | } |
3312 | 3341 | ||
3313 | ret |= alloc_set_pte(fe, NULL, fault_page); | 3342 | ret |= finish_fault(vmf); |
3314 | if (fe->pte) | ||
3315 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3316 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | 3343 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | |
3317 | VM_FAULT_RETRY))) { | 3344 | VM_FAULT_RETRY))) { |
3318 | unlock_page(fault_page); | 3345 | unlock_page(vmf->page); |
3319 | put_page(fault_page); | 3346 | put_page(vmf->page); |
3320 | return ret; | 3347 | return ret; |
3321 | } | 3348 | } |
3322 | 3349 | ||
3323 | if (set_page_dirty(fault_page)) | 3350 | fault_dirty_shared_page(vma, vmf->page); |
3324 | dirtied = 1; | ||
3325 | /* | ||
3326 | * Take a local copy of the address_space - page.mapping may be zeroed | ||
3327 | * by truncate after unlock_page(). The address_space itself remains | ||
3328 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s | ||
3329 | * release semantics to prevent the compiler from undoing this copying. | ||
3330 | */ | ||
3331 | mapping = page_rmapping(fault_page); | ||
3332 | unlock_page(fault_page); | ||
3333 | if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { | ||
3334 | /* | ||
3335 | * Some device drivers do not set page.mapping but still | ||
3336 | * dirty their pages | ||
3337 | */ | ||
3338 | balance_dirty_pages_ratelimited(mapping); | ||
3339 | } | ||
3340 | |||
3341 | if (!vma->vm_ops->page_mkwrite) | ||
3342 | file_update_time(vma->vm_file); | ||
3343 | |||
3344 | return ret; | 3351 | return ret; |
3345 | } | 3352 | } |
3346 | 3353 | ||
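do_shared_fault() used to open-code the dirty accounting and writeback throttling removed above; this diff replaces it with a single call to fault_dirty_shared_page(). Based purely on the deleted lines, the helper is expected to look roughly like the sketch below; the real implementation is introduced elsewhere in this series and may differ in detail.

```c
/* Sketch reconstructed from the code removed above; not the exact helper. */
static void fault_dirty_shared_page_sketch(struct vm_area_struct *vma,
					   struct page *page)
{
	struct address_space *mapping;
	bool dirtied;
	bool page_mkwrite = vma->vm_ops->page_mkwrite != NULL;

	dirtied = set_page_dirty(page);
	/*
	 * Take a local copy of page->mapping before unlock_page(): truncate
	 * may zero it afterwards, but the address_space itself stays pinned
	 * by vma->vm_file's reference.
	 */
	mapping = page_rmapping(page);
	unlock_page(page);

	if ((dirtied || page_mkwrite) && mapping) {
		/* Some drivers dirty pages without setting page->mapping. */
		balance_dirty_pages_ratelimited(mapping);
	}

	if (!page_mkwrite)
		file_update_time(vma->vm_file);
}
```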
@@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | |||
3350 | * The mmap_sem may have been released depending on flags and our | 3357 | * The mmap_sem may have been released depending on flags and our |
3351 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3358 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3352 | */ | 3359 | */ |
3353 | static int do_fault(struct fault_env *fe) | 3360 | static int do_fault(struct vm_fault *vmf) |
3354 | { | 3361 | { |
3355 | struct vm_area_struct *vma = fe->vma; | 3362 | struct vm_area_struct *vma = vmf->vma; |
3356 | pgoff_t pgoff = linear_page_index(vma, fe->address); | ||
3357 | 3363 | ||
3358 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3364 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ |
3359 | if (!vma->vm_ops->fault) | 3365 | if (!vma->vm_ops->fault) |
3360 | return VM_FAULT_SIGBUS; | 3366 | return VM_FAULT_SIGBUS; |
3361 | if (!(fe->flags & FAULT_FLAG_WRITE)) | 3367 | if (!(vmf->flags & FAULT_FLAG_WRITE)) |
3362 | return do_read_fault(fe, pgoff); | 3368 | return do_read_fault(vmf); |
3363 | if (!(vma->vm_flags & VM_SHARED)) | 3369 | if (!(vma->vm_flags & VM_SHARED)) |
3364 | return do_cow_fault(fe, pgoff); | 3370 | return do_cow_fault(vmf); |
3365 | return do_shared_fault(fe, pgoff); | 3371 | return do_shared_fault(vmf); |
3366 | } | 3372 | } |
3367 | 3373 | ||
3368 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3374 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
@@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | |||
3380 | return mpol_misplaced(page, vma, addr); | 3386 | return mpol_misplaced(page, vma, addr); |
3381 | } | 3387 | } |
3382 | 3388 | ||
3383 | static int do_numa_page(struct fault_env *fe, pte_t pte) | 3389 | static int do_numa_page(struct vm_fault *vmf) |
3384 | { | 3390 | { |
3385 | struct vm_area_struct *vma = fe->vma; | 3391 | struct vm_area_struct *vma = vmf->vma; |
3386 | struct page *page = NULL; | 3392 | struct page *page = NULL; |
3387 | int page_nid = -1; | 3393 | int page_nid = -1; |
3388 | int last_cpupid; | 3394 | int last_cpupid; |
3389 | int target_nid; | 3395 | int target_nid; |
3390 | bool migrated = false; | 3396 | bool migrated = false; |
3397 | pte_t pte = vmf->orig_pte; | ||
3391 | bool was_writable = pte_write(pte); | 3398 | bool was_writable = pte_write(pte); |
3392 | int flags = 0; | 3399 | int flags = 0; |
3393 | 3400 | ||
@@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
3400 | * page table entry is not accessible, so there would be no | 3407 | * page table entry is not accessible, so there would be no |
3401 | * concurrent hardware modifications to the PTE. | 3408 | * concurrent hardware modifications to the PTE. |
3402 | */ | 3409 | */ |
3403 | fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); | 3410 | vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); |
3404 | spin_lock(fe->ptl); | 3411 | spin_lock(vmf->ptl); |
3405 | if (unlikely(!pte_same(*fe->pte, pte))) { | 3412 | if (unlikely(!pte_same(*vmf->pte, pte))) { |
3406 | pte_unmap_unlock(fe->pte, fe->ptl); | 3413 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3407 | goto out; | 3414 | goto out; |
3408 | } | 3415 | } |
3409 | 3416 | ||
@@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
3412 | pte = pte_mkyoung(pte); | 3419 | pte = pte_mkyoung(pte); |
3413 | if (was_writable) | 3420 | if (was_writable) |
3414 | pte = pte_mkwrite(pte); | 3421 | pte = pte_mkwrite(pte); |
3415 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); | 3422 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); |
3416 | update_mmu_cache(vma, fe->address, fe->pte); | 3423 | update_mmu_cache(vma, vmf->address, vmf->pte); |
3417 | 3424 | ||
3418 | page = vm_normal_page(vma, fe->address, pte); | 3425 | page = vm_normal_page(vma, vmf->address, pte); |
3419 | if (!page) { | 3426 | if (!page) { |
3420 | pte_unmap_unlock(fe->pte, fe->ptl); | 3427 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3421 | return 0; | 3428 | return 0; |
3422 | } | 3429 | } |
3423 | 3430 | ||
3424 | /* TODO: handle PTE-mapped THP */ | 3431 | /* TODO: handle PTE-mapped THP */ |
3425 | if (PageCompound(page)) { | 3432 | if (PageCompound(page)) { |
3426 | pte_unmap_unlock(fe->pte, fe->ptl); | 3433 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3427 | return 0; | 3434 | return 0; |
3428 | } | 3435 | } |
3429 | 3436 | ||
@@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
3447 | 3454 | ||
3448 | last_cpupid = page_cpupid_last(page); | 3455 | last_cpupid = page_cpupid_last(page); |
3449 | page_nid = page_to_nid(page); | 3456 | page_nid = page_to_nid(page); |
3450 | target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, | 3457 | target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, |
3451 | &flags); | 3458 | &flags); |
3452 | pte_unmap_unlock(fe->pte, fe->ptl); | 3459 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3453 | if (target_nid == -1) { | 3460 | if (target_nid == -1) { |
3454 | put_page(page); | 3461 | put_page(page); |
3455 | goto out; | 3462 | goto out; |
@@ -3469,28 +3476,28 @@ out: | |||
3469 | return 0; | 3476 | return 0; |
3470 | } | 3477 | } |
3471 | 3478 | ||
3472 | static int create_huge_pmd(struct fault_env *fe) | 3479 | static int create_huge_pmd(struct vm_fault *vmf) |
3473 | { | 3480 | { |
3474 | struct vm_area_struct *vma = fe->vma; | 3481 | struct vm_area_struct *vma = vmf->vma; |
3475 | if (vma_is_anonymous(vma)) | 3482 | if (vma_is_anonymous(vma)) |
3476 | return do_huge_pmd_anonymous_page(fe); | 3483 | return do_huge_pmd_anonymous_page(vmf); |
3477 | if (vma->vm_ops->pmd_fault) | 3484 | if (vma->vm_ops->pmd_fault) |
3478 | return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, | 3485 | return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, |
3479 | fe->flags); | 3486 | vmf->flags); |
3480 | return VM_FAULT_FALLBACK; | 3487 | return VM_FAULT_FALLBACK; |
3481 | } | 3488 | } |
3482 | 3489 | ||
3483 | static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) | 3490 | static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) |
3484 | { | 3491 | { |
3485 | if (vma_is_anonymous(fe->vma)) | 3492 | if (vma_is_anonymous(vmf->vma)) |
3486 | return do_huge_pmd_wp_page(fe, orig_pmd); | 3493 | return do_huge_pmd_wp_page(vmf, orig_pmd); |
3487 | if (fe->vma->vm_ops->pmd_fault) | 3494 | if (vmf->vma->vm_ops->pmd_fault) |
3488 | return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, | 3495 | return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, |
3489 | fe->flags); | 3496 | vmf->pmd, vmf->flags); |
3490 | 3497 | ||
3491 | /* COW handled on pte level: split pmd */ | 3498 | /* COW handled on pte level: split pmd */ |
3492 | VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); | 3499 | VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); |
3493 | __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); | 3500 | __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); |
3494 | 3501 | ||
3495 | return VM_FAULT_FALLBACK; | 3502 | return VM_FAULT_FALLBACK; |
3496 | } | 3503 | } |
@@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) | |||
3515 | * The mmap_sem may have been released depending on flags and our return value. | 3522 | * The mmap_sem may have been released depending on flags and our return value. |
3516 | * See filemap_fault() and __lock_page_or_retry(). | 3523 | * See filemap_fault() and __lock_page_or_retry(). |
3517 | */ | 3524 | */ |
3518 | static int handle_pte_fault(struct fault_env *fe) | 3525 | static int handle_pte_fault(struct vm_fault *vmf) |
3519 | { | 3526 | { |
3520 | pte_t entry; | 3527 | pte_t entry; |
3521 | 3528 | ||
3522 | if (unlikely(pmd_none(*fe->pmd))) { | 3529 | if (unlikely(pmd_none(*vmf->pmd))) { |
3523 | /* | 3530 | /* |
3524 | * Leave __pte_alloc() until later: because vm_ops->fault may | 3531 | * Leave __pte_alloc() until later: because vm_ops->fault may |
3525 | * want to allocate huge page, and if we expose page table | 3532 | * want to allocate huge page, and if we expose page table |
3526 | * for an instant, it will be difficult to retract from | 3533 | * for an instant, it will be difficult to retract from |
3527 | * concurrent faults and from rmap lookups. | 3534 | * concurrent faults and from rmap lookups. |
3528 | */ | 3535 | */ |
3529 | fe->pte = NULL; | 3536 | vmf->pte = NULL; |
3530 | } else { | 3537 | } else { |
3531 | /* See comment in pte_alloc_one_map() */ | 3538 | /* See comment in pte_alloc_one_map() */ |
3532 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) | 3539 | if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) |
3533 | return 0; | 3540 | return 0; |
3534 | /* | 3541 | /* |
3535 | * A regular pmd is established and it can't morph into a huge | 3542 | * A regular pmd is established and it can't morph into a huge |
@@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe) | |||
3537 | * mmap_sem read mode and khugepaged takes it in write mode. | 3544 | * mmap_sem read mode and khugepaged takes it in write mode. |
3538 | * So now it's safe to run pte_offset_map(). | 3545 | * So now it's safe to run pte_offset_map(). |
3539 | */ | 3546 | */ |
3540 | fe->pte = pte_offset_map(fe->pmd, fe->address); | 3547 | vmf->pte = pte_offset_map(vmf->pmd, vmf->address); |
3541 | 3548 | vmf->orig_pte = *vmf->pte; | |
3542 | entry = *fe->pte; | ||
3543 | 3549 | ||
3544 | /* | 3550 | /* |
3545 | * some architectures can have larger ptes than wordsize, | 3551 | * some architectures can have larger ptes than wordsize, |
@@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe) | |||
3550 | * ptl lock held. So here a barrier will do. | 3556 | * ptl lock held. So here a barrier will do. |
3551 | */ | 3557 | */ |
3552 | barrier(); | 3558 | barrier(); |
3553 | if (pte_none(entry)) { | 3559 | if (pte_none(vmf->orig_pte)) { |
3554 | pte_unmap(fe->pte); | 3560 | pte_unmap(vmf->pte); |
3555 | fe->pte = NULL; | 3561 | vmf->pte = NULL; |
3556 | } | 3562 | } |
3557 | } | 3563 | } |
3558 | 3564 | ||
3559 | if (!fe->pte) { | 3565 | if (!vmf->pte) { |
3560 | if (vma_is_anonymous(fe->vma)) | 3566 | if (vma_is_anonymous(vmf->vma)) |
3561 | return do_anonymous_page(fe); | 3567 | return do_anonymous_page(vmf); |
3562 | else | 3568 | else |
3563 | return do_fault(fe); | 3569 | return do_fault(vmf); |
3564 | } | 3570 | } |
3565 | 3571 | ||
3566 | if (!pte_present(entry)) | 3572 | if (!pte_present(vmf->orig_pte)) |
3567 | return do_swap_page(fe, entry); | 3573 | return do_swap_page(vmf); |
3568 | 3574 | ||
3569 | if (pte_protnone(entry) && vma_is_accessible(fe->vma)) | 3575 | if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) |
3570 | return do_numa_page(fe, entry); | 3576 | return do_numa_page(vmf); |
3571 | 3577 | ||
3572 | fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); | 3578 | vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); |
3573 | spin_lock(fe->ptl); | 3579 | spin_lock(vmf->ptl); |
3574 | if (unlikely(!pte_same(*fe->pte, entry))) | 3580 | entry = vmf->orig_pte; |
3581 | if (unlikely(!pte_same(*vmf->pte, entry))) | ||
3575 | goto unlock; | 3582 | goto unlock; |
3576 | if (fe->flags & FAULT_FLAG_WRITE) { | 3583 | if (vmf->flags & FAULT_FLAG_WRITE) { |
3577 | if (!pte_write(entry)) | 3584 | if (!pte_write(entry)) |
3578 | return do_wp_page(fe, entry); | 3585 | return do_wp_page(vmf); |
3579 | entry = pte_mkdirty(entry); | 3586 | entry = pte_mkdirty(entry); |
3580 | } | 3587 | } |
3581 | entry = pte_mkyoung(entry); | 3588 | entry = pte_mkyoung(entry); |
3582 | if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, | 3589 | if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, |
3583 | fe->flags & FAULT_FLAG_WRITE)) { | 3590 | vmf->flags & FAULT_FLAG_WRITE)) { |
3584 | update_mmu_cache(fe->vma, fe->address, fe->pte); | 3591 | update_mmu_cache(vmf->vma, vmf->address, vmf->pte); |
3585 | } else { | 3592 | } else { |
3586 | /* | 3593 | /* |
3587 | * This is needed only for protection faults but the arch code | 3594 | * This is needed only for protection faults but the arch code |
@@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe) | |||
3589 | * This still avoids useless tlb flushes for .text page faults | 3596 | * This still avoids useless tlb flushes for .text page faults |
3590 | * with threads. | 3597 | * with threads. |
3591 | */ | 3598 | */ |
3592 | if (fe->flags & FAULT_FLAG_WRITE) | 3599 | if (vmf->flags & FAULT_FLAG_WRITE) |
3593 | flush_tlb_fix_spurious_fault(fe->vma, fe->address); | 3600 | flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); |
3594 | } | 3601 | } |
3595 | unlock: | 3602 | unlock: |
3596 | pte_unmap_unlock(fe->pte, fe->ptl); | 3603 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3597 | return 0; | 3604 | return 0; |
3598 | } | 3605 | } |
3599 | 3606 | ||
@@ -3606,10 +3613,12 @@ unlock: | |||
3606 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | 3613 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
3607 | unsigned int flags) | 3614 | unsigned int flags) |
3608 | { | 3615 | { |
3609 | struct fault_env fe = { | 3616 | struct vm_fault vmf = { |
3610 | .vma = vma, | 3617 | .vma = vma, |
3611 | .address = address, | 3618 | .address = address & PAGE_MASK, |
3612 | .flags = flags, | 3619 | .flags = flags, |
3620 | .pgoff = linear_page_index(vma, address), | ||
3621 | .gfp_mask = __get_fault_gfp_mask(vma), | ||
3613 | }; | 3622 | }; |
3614 | struct mm_struct *mm = vma->vm_mm; | 3623 | struct mm_struct *mm = vma->vm_mm; |
3615 | pgd_t *pgd; | 3624 | pgd_t *pgd; |
@@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
3619 | pud = pud_alloc(mm, pgd, address); | 3628 | pud = pud_alloc(mm, pgd, address); |
3620 | if (!pud) | 3629 | if (!pud) |
3621 | return VM_FAULT_OOM; | 3630 | return VM_FAULT_OOM; |
3622 | fe.pmd = pmd_alloc(mm, pud, address); | 3631 | vmf.pmd = pmd_alloc(mm, pud, address); |
3623 | if (!fe.pmd) | 3632 | if (!vmf.pmd) |
3624 | return VM_FAULT_OOM; | 3633 | return VM_FAULT_OOM; |
3625 | if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { | 3634 | if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { |
3626 | int ret = create_huge_pmd(&fe); | 3635 | int ret = create_huge_pmd(&vmf); |
3627 | if (!(ret & VM_FAULT_FALLBACK)) | 3636 | if (!(ret & VM_FAULT_FALLBACK)) |
3628 | return ret; | 3637 | return ret; |
3629 | } else { | 3638 | } else { |
3630 | pmd_t orig_pmd = *fe.pmd; | 3639 | pmd_t orig_pmd = *vmf.pmd; |
3631 | int ret; | 3640 | int ret; |
3632 | 3641 | ||
3633 | barrier(); | 3642 | barrier(); |
3634 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { | 3643 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { |
3635 | if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) | 3644 | if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) |
3636 | return do_huge_pmd_numa_page(&fe, orig_pmd); | 3645 | return do_huge_pmd_numa_page(&vmf, orig_pmd); |
3637 | 3646 | ||
3638 | if ((fe.flags & FAULT_FLAG_WRITE) && | 3647 | if ((vmf.flags & FAULT_FLAG_WRITE) && |
3639 | !pmd_write(orig_pmd)) { | 3648 | !pmd_write(orig_pmd)) { |
3640 | ret = wp_huge_pmd(&fe, orig_pmd); | 3649 | ret = wp_huge_pmd(&vmf, orig_pmd); |
3641 | if (!(ret & VM_FAULT_FALLBACK)) | 3650 | if (!(ret & VM_FAULT_FALLBACK)) |
3642 | return ret; | 3651 | return ret; |
3643 | } else { | 3652 | } else { |
3644 | huge_pmd_set_accessed(&fe, orig_pmd); | 3653 | huge_pmd_set_accessed(&vmf, orig_pmd); |
3645 | return 0; | 3654 | return 0; |
3646 | } | 3655 | } |
3647 | } | 3656 | } |
3648 | } | 3657 | } |
3649 | 3658 | ||
3650 | return handle_pte_fault(&fe); | 3659 | return handle_pte_fault(&vmf); |
3651 | } | 3660 | } |
3652 | 3661 | ||
3653 | /* | 3662 | /* |
@@ -3808,8 +3817,8 @@ out: | |||
3808 | return -EINVAL; | 3817 | return -EINVAL; |
3809 | } | 3818 | } |
3810 | 3819 | ||
3811 | static inline int follow_pte(struct mm_struct *mm, unsigned long address, | 3820 | int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, |
3812 | pte_t **ptepp, spinlock_t **ptlp) | 3821 | spinlock_t **ptlp) |
3813 | { | 3822 | { |
3814 | int res; | 3823 | int res; |
3815 | 3824 | ||
@@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
3919 | struct page *page = NULL; | 3928 | struct page *page = NULL; |
3920 | 3929 | ||
3921 | ret = get_user_pages_remote(tsk, mm, addr, 1, | 3930 | ret = get_user_pages_remote(tsk, mm, addr, 1, |
3922 | gup_flags, &page, &vma); | 3931 | gup_flags, &page, &vma, NULL); |
3923 | if (ret <= 0) { | 3932 | if (ret <= 0) { |
3924 | #ifndef CONFIG_HAVE_IOREMAP_PROT | 3933 | #ifndef CONFIG_HAVE_IOREMAP_PROT |
3925 | break; | 3934 | break; |
diff --git a/mm/nommu.c b/mm/nommu.c index 27bc543128e5..210d7ec2843c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, | |||
176 | } | 176 | } |
177 | EXPORT_SYMBOL(get_user_pages_locked); | 177 | EXPORT_SYMBOL(get_user_pages_locked); |
178 | 178 | ||
179 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | 179 | static long __get_user_pages_unlocked(struct task_struct *tsk, |
180 | unsigned long start, unsigned long nr_pages, | 180 | struct mm_struct *mm, unsigned long start, |
181 | struct page **pages, unsigned int gup_flags) | 181 | unsigned long nr_pages, struct page **pages, |
182 | unsigned int gup_flags) | ||
182 | { | 183 | { |
183 | long ret; | 184 | long ret; |
184 | down_read(&mm->mmap_sem); | 185 | down_read(&mm->mmap_sem); |
@@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | |||
187 | up_read(&mm->mmap_sem); | 188 | up_read(&mm->mmap_sem); |
188 | return ret; | 189 | return ret; |
189 | } | 190 | } |
190 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
191 | 191 | ||
192 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, | 192 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, |
193 | struct page **pages, unsigned int gup_flags) | 193 | struct page **pages, unsigned int gup_flags) |
@@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1801 | } | 1801 | } |
1802 | EXPORT_SYMBOL(filemap_fault); | 1802 | EXPORT_SYMBOL(filemap_fault); |
1803 | 1803 | ||
1804 | void filemap_map_pages(struct fault_env *fe, | 1804 | void filemap_map_pages(struct vm_fault *vmf, |
1805 | pgoff_t start_pgoff, pgoff_t end_pgoff) | 1805 | pgoff_t start_pgoff, pgoff_t end_pgoff) |
1806 | { | 1806 | { |
1807 | BUG(); | 1807 | BUG(); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 52e2f8e3b472..290e8b7d3181 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping, | |||
2106 | pgoff_t start, pgoff_t end) | 2106 | pgoff_t start, pgoff_t end) |
2107 | { | 2107 | { |
2108 | #define WRITEBACK_TAG_BATCH 4096 | 2108 | #define WRITEBACK_TAG_BATCH 4096 |
2109 | unsigned long tagged; | 2109 | unsigned long tagged = 0; |
2110 | 2110 | struct radix_tree_iter iter; | |
2111 | do { | 2111 | void **slot; |
2112 | spin_lock_irq(&mapping->tree_lock); | 2112 | |
2113 | tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, | 2113 | spin_lock_irq(&mapping->tree_lock); |
2114 | &start, end, WRITEBACK_TAG_BATCH, | 2114 | radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, |
2115 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); | 2115 | PAGECACHE_TAG_DIRTY) { |
2116 | if (iter.index > end) | ||
2117 | break; | ||
2118 | radix_tree_iter_tag_set(&mapping->page_tree, &iter, | ||
2119 | PAGECACHE_TAG_TOWRITE); | ||
2120 | tagged++; | ||
2121 | if ((tagged % WRITEBACK_TAG_BATCH) != 0) | ||
2122 | continue; | ||
2123 | slot = radix_tree_iter_resume(slot, &iter); | ||
2116 | spin_unlock_irq(&mapping->tree_lock); | 2124 | spin_unlock_irq(&mapping->tree_lock); |
2117 | WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); | ||
2118 | cond_resched(); | 2125 | cond_resched(); |
2119 | /* We check 'start' to handle wrapping when end == ~0UL */ | 2126 | spin_lock_irq(&mapping->tree_lock); |
2120 | } while (tagged >= WRITEBACK_TAG_BATCH && start); | 2127 | } |
2128 | spin_unlock_irq(&mapping->tree_lock); | ||
2121 | } | 2129 | } |
2122 | EXPORT_SYMBOL(tag_pages_for_writeback); | 2130 | EXPORT_SYMBOL(tag_pages_for_writeback); |
2123 | 2131 | ||
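The rewrite above replaces radix_tree_range_tag_if_tagged() with an open-coded radix_tree_for_each_tagged() loop that drops mapping->tree_lock every WRITEBACK_TAG_BATCH entries. The key ordering detail is that radix_tree_iter_resume() is called before the lock is released, so the iterator can be restarted safely afterwards. A condensed, illustrative form of the pattern using the names from the diff:

```c
/* Condensed sketch of the batching pattern used above (illustrative). */
static void tag_towrite_sketch(struct address_space *mapping,
			       pgoff_t start, pgoff_t end)
{
	struct radix_tree_iter iter;
	unsigned long tagged = 0;
	void **slot;

	spin_lock_irq(&mapping->tree_lock);
	radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start,
				   PAGECACHE_TAG_DIRTY) {
		if (iter.index > end)
			break;
		radix_tree_iter_tag_set(&mapping->page_tree, &iter,
					PAGECACHE_TAG_TOWRITE);
		if ((++tagged % WRITEBACK_TAG_BATCH) != 0)
			continue;
		/* Park the iterator before dropping the lock to sleep. */
		slot = radix_tree_iter_resume(slot, &iter);
		spin_unlock_irq(&mapping->tree_lock);
		cond_resched();
		spin_lock_irq(&mapping->tree_lock);
	}
	spin_unlock_irq(&mapping->tree_lock);
}
```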
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f64e7bcb43b7..2c6d5f64feca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, | |||
3925 | return page; | 3925 | return page; |
3926 | } | 3926 | } |
3927 | 3927 | ||
3928 | void __page_frag_drain(struct page *page, unsigned int order, | ||
3929 | unsigned int count) | ||
3930 | { | ||
3931 | VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); | ||
3932 | |||
3933 | if (page_ref_sub_and_test(page, count)) { | ||
3934 | if (order == 0) | ||
3935 | free_hot_cold_page(page, false); | ||
3936 | else | ||
3937 | __free_pages_ok(page, order); | ||
3938 | } | ||
3939 | } | ||
3940 | EXPORT_SYMBOL(__page_frag_drain); | ||
3941 | |||
3928 | void *__alloc_page_frag(struct page_frag_cache *nc, | 3942 | void *__alloc_page_frag(struct page_frag_cache *nc, |
3929 | unsigned int fragsz, gfp_t gfp_mask) | 3943 | unsigned int fragsz, gfp_t gfp_mask) |
3930 | { | 3944 | { |
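__page_frag_drain(), added above, lets a caller that has batched up page references (a network driver recycling page fragments, for example) drop `count` references in one call and free the page when the last one goes. A hypothetical usage sketch; the pool structure, field names and teardown policy here are invented for illustration and are not part of this patch.

```c
/*
 * Hypothetical example: a driver that handed out fragments of a
 * high-order page and tracked how many references it still holds.
 */
struct frag_pool_sketch {
	struct page *page;
	unsigned int order;
	unsigned int pending;	/* references held on behalf of consumers */
};

static void frag_pool_teardown(struct frag_pool_sketch *pool)
{
	if (!pool->page)
		return;
	/* Drop all outstanding references at once; frees on the last one. */
	__page_frag_drain(pool->page, pool->order, pool->pending);
	pool->page = NULL;
	pool->pending = 0;
}
```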
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..84d0c7eada2b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
@@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
88 | ssize_t rc = 0; | 88 | ssize_t rc = 0; |
89 | unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES | 89 | unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES |
90 | / sizeof(struct pages *); | 90 | / sizeof(struct pages *); |
91 | unsigned int flags = FOLL_REMOTE; | 91 | unsigned int flags = 0; |
92 | 92 | ||
93 | /* Work out address and page range required */ | 93 | /* Work out address and page range required */ |
94 | if (len == 0) | 94 | if (len == 0) |
@@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
100 | 100 | ||
101 | while (!rc && nr_pages && iov_iter_count(iter)) { | 101 | while (!rc && nr_pages && iov_iter_count(iter)) { |
102 | int pages = min(nr_pages, max_pages_per_loop); | 102 | int pages = min(nr_pages, max_pages_per_loop); |
103 | int locked = 1; | ||
103 | size_t bytes; | 104 | size_t bytes; |
104 | 105 | ||
105 | /* | 106 | /* |
106 | * Get the pages we're interested in. We must | 107 | * Get the pages we're interested in. We must |
107 | * add FOLL_REMOTE because task/mm might not be | 108 | * access remotely because task/mm might not be |
108 | * current/current->mm | 109 | * current/current->mm |
109 | */ | 110 | */ |
110 | pages = __get_user_pages_unlocked(task, mm, pa, pages, | 111 | down_read(&mm->mmap_sem); |
111 | process_pages, flags); | 112 | pages = get_user_pages_remote(task, mm, pa, pages, flags, |
113 | process_pages, NULL, &locked); | ||
114 | if (locked) | ||
115 | up_read(&mm->mmap_sem); | ||
112 | if (pages <= 0) | 116 | if (pages <= 0) |
113 | return -EFAULT; | 117 | return -EFAULT; |
114 | 118 | ||
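process_vm_rw_single_vec() now calls get_user_pages_remote() directly instead of the no-longer-exported __get_user_pages_unlocked(). The new calling convention, visible above, is that the caller takes mmap_sem, passes a `locked` flag, and only drops the semaphore itself if the callee has not already done so. A condensed sketch of that convention:

```c
/* Sketch of the convention used above (see process_vm_access.c). */
static long pin_remote_pages_sketch(struct task_struct *task,
				    struct mm_struct *mm, unsigned long addr,
				    unsigned long nr, struct page **pages)
{
	int locked = 1;
	long got;

	down_read(&mm->mmap_sem);
	got = get_user_pages_remote(task, mm, addr, nr, 0 /* gup_flags */,
				    pages, NULL /* vmas */, &locked);
	if (locked)	/* the callee may drop mmap_sem and clear this */
		up_read(&mm->mmap_sem);
	return got;
}
```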
diff --git a/mm/shmem.c b/mm/shmem.c index abd7403aba41..54287d443806 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, | |||
661 | swapped++; | 661 | swapped++; |
662 | 662 | ||
663 | if (need_resched()) { | 663 | if (need_resched()) { |
664 | slot = radix_tree_iter_resume(slot, &iter); | ||
664 | cond_resched_rcu(); | 665 | cond_resched_rcu(); |
665 | slot = radix_tree_iter_next(&iter); | ||
666 | } | 666 | } |
667 | } | 667 | } |
668 | 668 | ||
@@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode) | |||
1049 | clear_inode(inode); | 1049 | clear_inode(inode); |
1050 | } | 1050 | } |
1051 | 1051 | ||
1052 | static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) | ||
1053 | { | ||
1054 | struct radix_tree_iter iter; | ||
1055 | void **slot; | ||
1056 | unsigned long found = -1; | ||
1057 | unsigned int checked = 0; | ||
1058 | |||
1059 | rcu_read_lock(); | ||
1060 | radix_tree_for_each_slot(slot, root, &iter, 0) { | ||
1061 | if (*slot == item) { | ||
1062 | found = iter.index; | ||
1063 | break; | ||
1064 | } | ||
1065 | checked++; | ||
1066 | if ((checked % 4096) != 0) | ||
1067 | continue; | ||
1068 | slot = radix_tree_iter_resume(slot, &iter); | ||
1069 | cond_resched_rcu(); | ||
1070 | } | ||
1071 | |||
1072 | rcu_read_unlock(); | ||
1073 | return found; | ||
1074 | } | ||
1075 | |||
1052 | /* | 1076 | /* |
1053 | * If swap found in inode, free it and move page from swapcache to filecache. | 1077 | * If swap found in inode, free it and move page from swapcache to filecache. |
1054 | */ | 1078 | */ |
@@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
1062 | int error = 0; | 1086 | int error = 0; |
1063 | 1087 | ||
1064 | radswap = swp_to_radix_entry(swap); | 1088 | radswap = swp_to_radix_entry(swap); |
1065 | index = radix_tree_locate_item(&mapping->page_tree, radswap); | 1089 | index = find_swap_entry(&mapping->page_tree, radswap); |
1066 | if (index == -1) | 1090 | if (index == -1) |
1067 | return -EAGAIN; /* tell shmem_unuse we found nothing */ | 1091 | return -EAGAIN; /* tell shmem_unuse we found nothing */ |
1068 | 1092 | ||
@@ -2447,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping) | |||
2447 | } | 2471 | } |
2448 | 2472 | ||
2449 | if (need_resched()) { | 2473 | if (need_resched()) { |
2474 | slot = radix_tree_iter_resume(slot, &iter); | ||
2450 | cond_resched_rcu(); | 2475 | cond_resched_rcu(); |
2451 | slot = radix_tree_iter_next(&iter); | ||
2452 | } | 2476 | } |
2453 | } | 2477 | } |
2454 | rcu_read_unlock(); | 2478 | rcu_read_unlock(); |
@@ -2517,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) | |||
2517 | spin_unlock_irq(&mapping->tree_lock); | 2541 | spin_unlock_irq(&mapping->tree_lock); |
2518 | continue_resched: | 2542 | continue_resched: |
2519 | if (need_resched()) { | 2543 | if (need_resched()) { |
2544 | slot = radix_tree_iter_resume(slot, &iter); | ||
2520 | cond_resched_rcu(); | 2545 | cond_resched_rcu(); |
2521 | slot = radix_tree_iter_next(&iter); | ||
2522 | } | 2546 | } |
2523 | } | 2547 | } |
2524 | rcu_read_unlock(); | 2548 | rcu_read_unlock(); |
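Three shmem loops above switch from radix_tree_iter_next() to the new radix_tree_iter_resume(), and the call now comes before cond_resched_rcu() rather than after it: the iterator must be parked before the RCU read section can be broken. The new find_swap_entry() helper, which replaces radix_tree_locate_item(), follows the same shape. An illustrative RCU-side version of the pattern:

```c
/* Illustrative RCU-side variant of the resume pattern used above. */
static void walk_mapping_sketch(struct address_space *mapping)
{
	struct radix_tree_iter iter;
	void **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, 0) {
		/* ... inspect *slot ... */

		if (need_resched()) {
			/* Park the iterator before the RCU read section ends. */
			slot = radix_tree_iter_resume(slot, &iter);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}
```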
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2d59c9be40e1..5f63f6dcaabb 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c | |||
@@ -762,16 +762,17 @@ static const struct net_proto_family rxrpc_family_ops = { | |||
762 | static int __init af_rxrpc_init(void) | 762 | static int __init af_rxrpc_init(void) |
763 | { | 763 | { |
764 | int ret = -1; | 764 | int ret = -1; |
765 | unsigned int tmp; | ||
765 | 766 | ||
766 | BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); | 767 | BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); |
767 | 768 | ||
768 | get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); | 769 | get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); |
769 | rxrpc_epoch |= RXRPC_RANDOM_EPOCH; | 770 | rxrpc_epoch |= RXRPC_RANDOM_EPOCH; |
770 | get_random_bytes(&rxrpc_client_conn_ids.cur, | 771 | get_random_bytes(&tmp, sizeof(tmp)); |
771 | sizeof(rxrpc_client_conn_ids.cur)); | 772 | tmp &= 0x3fffffff; |
772 | rxrpc_client_conn_ids.cur &= 0x3fffffff; | 773 | if (tmp == 0) |
773 | if (rxrpc_client_conn_ids.cur == 0) | 774 | tmp = 1; |
774 | rxrpc_client_conn_ids.cur = 1; | 775 | idr_set_cursor(&rxrpc_client_conn_ids, tmp); |
775 | 776 | ||
776 | ret = -ENOMEM; | 777 | ret = -ENOMEM; |
777 | rxrpc_call_jar = kmem_cache_create( | 778 | rxrpc_call_jar = kmem_cache_create( |
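The rxrpc change stops poking rxrpc_client_conn_ids.cur directly and goes through the new idr_set_cursor()/idr_get_cursor() accessors, which is what allows the IDR internals to change later in this series. A condensed sketch of the initialisation path as it reads after this patch, with unrelated setup and error handling omitted:

```c
/* Condensed from af_rxrpc_init() above; illustrative only. */
static void seed_conn_id_cursor_sketch(void)
{
	unsigned int tmp;

	get_random_bytes(&tmp, sizeof(tmp));
	tmp &= 0x3fffffff;	/* client connection IDs are masked to 30 bits */
	if (tmp == 0)
		tmp = 1;	/* never start the cursor at 0 */
	idr_set_cursor(&rxrpc_client_conn_ids, tmp);
}

/* rxrpc_may_reuse_conn() later reads it back without touching internals: */
/*	id_cursor = idr_get_cursor(&rxrpc_client_conn_ids);                 */
```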
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 60ef9605167e..6cbcdcc29853 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c | |||
@@ -263,12 +263,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) | |||
263 | * times the maximum number of client conns away from the current | 263 | * times the maximum number of client conns away from the current |
264 | * allocation point to try and keep the IDs concentrated. | 264 | * allocation point to try and keep the IDs concentrated. |
265 | */ | 265 | */ |
266 | id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); | 266 | id_cursor = idr_get_cursor(&rxrpc_client_conn_ids); |
267 | id = conn->proto.cid >> RXRPC_CIDSHIFT; | 267 | id = conn->proto.cid >> RXRPC_CIDSHIFT; |
268 | distance = id - id_cursor; | 268 | distance = id - id_cursor; |
269 | if (distance < 0) | 269 | if (distance < 0) |
270 | distance = -distance; | 270 | distance = -distance; |
271 | limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; | 271 | limit = max(rxrpc_max_client_connections * 4, 1024U); |
272 | if (distance > limit) | 272 | if (distance > limit) |
273 | goto mark_dont_reuse; | 273 | goto mark_dont_reuse; |
274 | 274 | ||
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 682b73af7766..838ffa78cfda 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c | |||
@@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, | |||
881 | * the execve(). | 881 | * the execve(). |
882 | */ | 882 | */ |
883 | if (get_user_pages_remote(current, bprm->mm, pos, 1, | 883 | if (get_user_pages_remote(current, bprm->mm, pos, 1, |
884 | FOLL_FORCE, &page, NULL) <= 0) | 884 | FOLL_FORCE, &page, NULL, NULL) <= 0) |
885 | return false; | 885 | return false; |
886 | #else | 886 | #else |
887 | page = bprm->page[pos / PAGE_SIZE]; | 887 | page = bprm->page[pos / PAGE_SIZE]; |
diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h index 9e5f4846967f..beda1a884b50 100644 --- a/tools/include/asm/bug.h +++ b/tools/include/asm/bug.h | |||
@@ -12,6 +12,17 @@ | |||
12 | unlikely(__ret_warn_on); \ | 12 | unlikely(__ret_warn_on); \ |
13 | }) | 13 | }) |
14 | 14 | ||
15 | #define WARN_ON_ONCE(condition) ({ \ | ||
16 | static int __warned; \ | ||
17 | int __ret_warn_once = !!(condition); \ | ||
18 | \ | ||
19 | if (unlikely(__ret_warn_once && !__warned)) { \ | ||
20 | __warned = true; \ | ||
21 | WARN_ON(1); \ | ||
22 | } \ | ||
23 | unlikely(__ret_warn_once); \ | ||
24 | }) | ||
25 | |||
15 | #define WARN_ONCE(condition, format...) ({ \ | 26 | #define WARN_ONCE(condition, format...) ({ \ |
16 | static int __warned; \ | 27 | static int __warned; \ |
17 | int __ret_warn_once = !!(condition); \ | 28 | int __ret_warn_once = !!(condition); \ |
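The tools copy of asm/bug.h gains WARN_ON_ONCE(), mirroring the kernel macro: the warning fires at most once per call site, but the macro still evaluates to the condition every time, so it can sit inside an if(). The small self-contained user-space demo below shows the same once-only pattern; it re-implements the macro with fprintf so it compiles without the tools headers (GCC statement expressions assumed).

```c
#include <stdio.h>

/* Stand-in for the tools/include version, using fprintf instead of WARN_ON. */
#define WARN_ON_ONCE(condition) ({				\
	static int __warned;					\
	int __ret_warn_once = !!(condition);			\
								\
	if (__ret_warn_once && !__warned) {			\
		__warned = 1;					\
		fprintf(stderr, "warning at %s:%d\n",		\
			__FILE__, __LINE__);			\
	}							\
	__ret_warn_once;					\
})

int main(void)
{
	for (int i = 0; i < 3; i++) {
		/* Warning is printed once; the branch is taken for i == 0, 1. */
		if (WARN_ON_ONCE(i < 2))
			fprintf(stderr, "handled i == %d\n", i);
	}
	return 0;
}
```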
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 43c1c5021e4b..eef41d500e9e 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h | |||
@@ -35,6 +35,32 @@ static inline void bitmap_zero(unsigned long *dst, int nbits) | |||
35 | } | 35 | } |
36 | } | 36 | } |
37 | 37 | ||
38 | static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) | ||
39 | { | ||
40 | unsigned int nlongs = BITS_TO_LONGS(nbits); | ||
41 | if (!small_const_nbits(nbits)) { | ||
42 | unsigned int len = (nlongs - 1) * sizeof(unsigned long); | ||
43 | memset(dst, 0xff, len); | ||
44 | } | ||
45 | dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); | ||
46 | } | ||
47 | |||
48 | static inline int bitmap_empty(const unsigned long *src, unsigned nbits) | ||
49 | { | ||
50 | if (small_const_nbits(nbits)) | ||
51 | return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); | ||
52 | |||
53 | return find_first_bit(src, nbits) == nbits; | ||
54 | } | ||
55 | |||
56 | static inline int bitmap_full(const unsigned long *src, unsigned int nbits) | ||
57 | { | ||
58 | if (small_const_nbits(nbits)) | ||
59 | return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); | ||
60 | |||
61 | return find_first_zero_bit(src, nbits) == nbits; | ||
62 | } | ||
63 | |||
38 | static inline int bitmap_weight(const unsigned long *src, int nbits) | 64 | static inline int bitmap_weight(const unsigned long *src, int nbits) |
39 | { | 65 | { |
40 | if (small_const_nbits(nbits)) | 66 | if (small_const_nbits(nbits)) |
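bitmap_fill(), bitmap_empty() and bitmap_full() are added to the tools bitmap.h so user-space test code such as the radix-tree suite gets the kernel semantics: fill sets exactly the first nbits (masking the last word), and empty/full inspect only the first nbits. The standalone program below illustrates those semantics with a hand-rolled single 64-bit word; it deliberately does not use the tools headers, and it assumes nbits < 64 and a 64-bit unsigned long.

```c
#include <stdio.h>

/* Semantics demo for a single-word bitmap with nbits < 64 (assumption). */
#define LAST_WORD_MASK(nbits)	(~0UL >> (64 - (nbits)))

int main(void)
{
	unsigned long map = 0;
	unsigned int nbits = 10;

	/* bitmap_empty(): no bit set within the first nbits. */
	printf("empty: %d\n", !(map & LAST_WORD_MASK(nbits)));	/* 1 */

	/* bitmap_fill(): set exactly the first nbits, like the new helper. */
	map = LAST_WORD_MASK(nbits);

	/* bitmap_full(): no zero bit within the first nbits. */
	printf("full:  %d\n", !(~map & LAST_WORD_MASK(nbits)));	/* 1 */

	printf("map:   %#lx\n", map);				/* 0x3ff */
	return 0;
}
```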
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index d08e214ec6e7..be93ab02b490 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl | |||
@@ -719,14 +719,14 @@ sub set_value { | |||
719 | 719 | ||
720 | if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { | 720 | if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { |
721 | # Note if a test is something other than build, then we | 721 | # Note if a test is something other than build, then we |
722 | # will need other manditory options. | 722 | # will need other mandatory options. |
723 | if ($prvalue ne "install") { | 723 | if ($prvalue ne "install") { |
724 | # for bisect, we need to check BISECT_TYPE | 724 | # for bisect, we need to check BISECT_TYPE |
725 | if ($prvalue ne "bisect") { | 725 | if ($prvalue ne "bisect") { |
726 | $buildonly = 0; | 726 | $buildonly = 0; |
727 | } | 727 | } |
728 | } else { | 728 | } else { |
729 | # install still limits some manditory options. | 729 | # install still limits some mandatory options. |
730 | $buildonly = 2; | 730 | $buildonly = 2; |
731 | } | 731 | } |
732 | } | 732 | } |
@@ -735,7 +735,7 @@ sub set_value { | |||
735 | if ($prvalue ne "install") { | 735 | if ($prvalue ne "install") { |
736 | $buildonly = 0; | 736 | $buildonly = 0; |
737 | } else { | 737 | } else { |
738 | # install still limits some manditory options. | 738 | # install still limits some mandatory options. |
739 | $buildonly = 2; | 739 | $buildonly = 2; |
740 | } | 740 | } |
741 | } | 741 | } |
@@ -3989,7 +3989,7 @@ sub make_min_config { | |||
3989 | } | 3989 | } |
3990 | } | 3990 | } |
3991 | 3991 | ||
3992 | # Save off all the current mandidory configs | 3992 | # Save off all the current mandatory configs |
3993 | open (OUT, ">$temp_config") | 3993 | open (OUT, ">$temp_config") |
3994 | or die "Can't write to $temp_config"; | 3994 | or die "Can't write to $temp_config"; |
3995 | foreach my $config (keys %keep_configs) { | 3995 | foreach my $config (keys %keep_configs) { |
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index f2e07f2fd4b4..3635e4d3eca7 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile | |||
@@ -1,10 +1,14 @@ | |||
1 | 1 | ||
2 | CFLAGS += -I. -g -O2 -Wall -D_LGPL_SOURCE | 2 | CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE |
3 | LDFLAGS += -lpthread -lurcu | 3 | LDFLAGS += -lpthread -lurcu |
4 | TARGETS = main | 4 | TARGETS = main |
5 | OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ | 5 | OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ |
6 | regression1.o regression2.o regression3.o multiorder.o \ | 6 | regression1.o regression2.o regression3.o multiorder.o \ |
7 | iteration_check.o | 7 | iteration_check.o benchmark.o |
8 | |||
9 | ifdef BENCHMARK | ||
10 | CFLAGS += -DBENCHMARK=1 | ||
11 | endif | ||
8 | 12 | ||
9 | targets: $(TARGETS) | 13 | targets: $(TARGETS) |
10 | 14 | ||
@@ -14,7 +18,12 @@ main: $(OFILES) | |||
14 | clean: | 18 | clean: |
15 | $(RM) -f $(TARGETS) *.o radix-tree.c | 19 | $(RM) -f $(TARGETS) *.o radix-tree.c |
16 | 20 | ||
17 | $(OFILES): *.h */*.h ../../../include/linux/radix-tree.h ../../include/linux/*.h | 21 | find_next_bit.o: ../../lib/find_bit.c |
22 | $(CC) $(CFLAGS) -c -o $@ $< | ||
23 | |||
24 | $(OFILES): *.h */*.h \ | ||
25 | ../../include/linux/*.h \ | ||
26 | ../../../include/linux/radix-tree.h | ||
18 | 27 | ||
19 | radix-tree.c: ../../../lib/radix-tree.c | 28 | radix-tree.c: ../../../lib/radix-tree.c |
20 | sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ | 29 | sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ |
diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c new file mode 100644 index 000000000000..215ca86c7605 --- /dev/null +++ b/tools/testing/radix-tree/benchmark.c | |||
@@ -0,0 +1,98 @@ | |||
1 | /* | ||
2 | * benchmark.c: | ||
3 | * Author: Konstantin Khlebnikov <koct9i@gmail.com> | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | */ | ||
14 | #include <linux/radix-tree.h> | ||
15 | #include <linux/slab.h> | ||
16 | #include <linux/errno.h> | ||
17 | #include <time.h> | ||
18 | #include "test.h" | ||
19 | |||
20 | #define NSEC_PER_SEC 1000000000L | ||
21 | |||
22 | static long long benchmark_iter(struct radix_tree_root *root, bool tagged) | ||
23 | { | ||
24 | volatile unsigned long sink = 0; | ||
25 | struct radix_tree_iter iter; | ||
26 | struct timespec start, finish; | ||
27 | long long nsec; | ||
28 | int l, loops = 1; | ||
29 | void **slot; | ||
30 | |||
31 | #ifdef BENCHMARK | ||
32 | again: | ||
33 | #endif | ||
34 | clock_gettime(CLOCK_MONOTONIC, &start); | ||
35 | for (l = 0; l < loops; l++) { | ||
36 | if (tagged) { | ||
37 | radix_tree_for_each_tagged(slot, root, &iter, 0, 0) | ||
38 | sink ^= (unsigned long)slot; | ||
39 | } else { | ||
40 | radix_tree_for_each_slot(slot, root, &iter, 0) | ||
41 | sink ^= (unsigned long)slot; | ||
42 | } | ||
43 | } | ||
44 | clock_gettime(CLOCK_MONOTONIC, &finish); | ||
45 | |||
46 | nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + | ||
47 | (finish.tv_nsec - start.tv_nsec); | ||
48 | |||
49 | #ifdef BENCHMARK | ||
50 | if (loops == 1 && nsec * 5 < NSEC_PER_SEC) { | ||
51 | loops = NSEC_PER_SEC / nsec / 4 + 1; | ||
52 | goto again; | ||
53 | } | ||
54 | #endif | ||
55 | |||
56 | nsec /= loops; | ||
57 | return nsec; | ||
58 | } | ||
59 | |||
60 | static void benchmark_size(unsigned long size, unsigned long step, int order) | ||
61 | { | ||
62 | RADIX_TREE(tree, GFP_KERNEL); | ||
63 | long long normal, tagged; | ||
64 | unsigned long index; | ||
65 | |||
66 | for (index = 0 ; index < size ; index += step) { | ||
67 | item_insert_order(&tree, index, order); | ||
68 | radix_tree_tag_set(&tree, index, 0); | ||
69 | } | ||
70 | |||
71 | tagged = benchmark_iter(&tree, true); | ||
72 | normal = benchmark_iter(&tree, false); | ||
73 | |||
74 | printf("Size %ld, step %6ld, order %d tagged %10lld ns, normal %10lld ns\n", | ||
75 | size, step, order, tagged, normal); | ||
76 | |||
77 | item_kill_tree(&tree); | ||
78 | rcu_barrier(); | ||
79 | } | ||
80 | |||
81 | void benchmark(void) | ||
82 | { | ||
83 | unsigned long size[] = {1 << 10, 1 << 20, 0}; | ||
84 | unsigned long step[] = {1, 2, 7, 15, 63, 64, 65, | ||
85 | 128, 256, 512, 12345, 0}; | ||
86 | int c, s; | ||
87 | |||
88 | printf("starting benchmarks\n"); | ||
89 | printf("RADIX_TREE_MAP_SHIFT = %d\n", RADIX_TREE_MAP_SHIFT); | ||
90 | |||
91 | for (c = 0; size[c]; c++) | ||
92 | for (s = 0; step[s]; s++) | ||
93 | benchmark_size(size[c], step[s], 0); | ||
94 | |||
95 | for (c = 0; size[c]; c++) | ||
96 | for (s = 0; step[s]; s++) | ||
97 | benchmark_size(size[c], step[s] << 9, 9); | ||
98 | } | ||
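The timing loop in benchmark_iter() above is self-calibrating: a single pass is timed first, and when the binary is built with "make BENCHMARK=1" (which also raises RADIX_TREE_MAP_SHIFT to 6 via kernel.h) a pass that finishes in under a fifth of a second is scaled up to roughly a quarter second of work before being re-measured. A minimal standalone sketch of the same pattern; the work() function is a placeholder, not part of the patch, and the nsec > 0 guard is an extra safety check added only for the sketch.

        #include <stdio.h>
        #include <time.h>

        #define NSEC_PER_SEC 1000000000L

        /* Placeholder workload standing in for one full tree iteration. */
        static void work(void)
        {
                static volatile unsigned long sink;
                unsigned long i;

                for (i = 0; i < 1000; i++)
                        sink ^= i;
        }

        static long long time_one_pass(void)
        {
                struct timespec start, finish;
                long long nsec;
                int l, loops = 1;

        again:
                clock_gettime(CLOCK_MONOTONIC, &start);
                for (l = 0; l < loops; l++)
                        work();
                clock_gettime(CLOCK_MONOTONIC, &finish);

                nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC +
                       (finish.tv_nsec - start.tv_nsec);

                /* Too quick to trust one pass: scale up to ~1/4 s and retry. */
                if (loops == 1 && nsec > 0 && nsec * 5 < NSEC_PER_SEC) {
                        loops = NSEC_PER_SEC / nsec / 4 + 1;
                        goto again;
                }

                return nsec / loops;    /* per-pass cost in nanoseconds */
        }

        int main(void)
        {
                printf("one pass: %lld ns\n", time_one_pass());
                return 0;
        }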
diff --git a/tools/testing/radix-tree/find_next_bit.c b/tools/testing/radix-tree/find_next_bit.c deleted file mode 100644 index d1c2178bb2d4..000000000000 --- a/tools/testing/radix-tree/find_next_bit.c +++ /dev/null | |||
@@ -1,57 +0,0 @@ | |||
1 | /* find_next_bit.c: fallback find next bit implementation | ||
2 | * | ||
3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | ||
4 | * Written by David Howells (dhowells@redhat.com) | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or | ||
7 | * modify it under the terms of the GNU General Public License | ||
8 | * as published by the Free Software Foundation; either version | ||
9 | * 2 of the License, or (at your option) any later version. | ||
10 | */ | ||
11 | |||
12 | #include <linux/types.h> | ||
13 | #include <linux/bitops.h> | ||
14 | |||
15 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) | ||
16 | |||
17 | /* | ||
18 | * Find the next set bit in a memory region. | ||
19 | */ | ||
20 | unsigned long find_next_bit(const unsigned long *addr, unsigned long size, | ||
21 | unsigned long offset) | ||
22 | { | ||
23 | const unsigned long *p = addr + BITOP_WORD(offset); | ||
24 | unsigned long result = offset & ~(BITS_PER_LONG-1); | ||
25 | unsigned long tmp; | ||
26 | |||
27 | if (offset >= size) | ||
28 | return size; | ||
29 | size -= result; | ||
30 | offset %= BITS_PER_LONG; | ||
31 | if (offset) { | ||
32 | tmp = *(p++); | ||
33 | tmp &= (~0UL << offset); | ||
34 | if (size < BITS_PER_LONG) | ||
35 | goto found_first; | ||
36 | if (tmp) | ||
37 | goto found_middle; | ||
38 | size -= BITS_PER_LONG; | ||
39 | result += BITS_PER_LONG; | ||
40 | } | ||
41 | while (size & ~(BITS_PER_LONG-1)) { | ||
42 | if ((tmp = *(p++))) | ||
43 | goto found_middle; | ||
44 | result += BITS_PER_LONG; | ||
45 | size -= BITS_PER_LONG; | ||
46 | } | ||
47 | if (!size) | ||
48 | return result; | ||
49 | tmp = *p; | ||
50 | |||
51 | found_first: | ||
52 | tmp &= (~0UL >> (BITS_PER_LONG - size)); | ||
53 | if (tmp == 0UL) /* Are any bits set? */ | ||
54 | return result + size; /* Nope. */ | ||
55 | found_middle: | ||
56 | return result + __ffs(tmp); | ||
57 | } | ||
diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c index 9adb8e7415a6..7572b7ed930e 100644 --- a/tools/testing/radix-tree/iteration_check.c +++ b/tools/testing/radix-tree/iteration_check.c | |||
@@ -16,35 +16,50 @@ | |||
16 | #include <pthread.h> | 16 | #include <pthread.h> |
17 | #include "test.h" | 17 | #include "test.h" |
18 | 18 | ||
19 | #define NUM_THREADS 4 | 19 | #define NUM_THREADS 5 |
20 | #define TAG 0 | 20 | #define MAX_IDX 100 |
21 | #define TAG 0 | ||
22 | #define NEW_TAG 1 | ||
23 | |||
21 | static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; | 24 | static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; |
22 | static pthread_t threads[NUM_THREADS]; | 25 | static pthread_t threads[NUM_THREADS]; |
23 | RADIX_TREE(tree, GFP_KERNEL); | 26 | static unsigned int seeds[3]; |
24 | bool test_complete; | 27 | static RADIX_TREE(tree, GFP_KERNEL); |
28 | static bool test_complete; | ||
29 | static int max_order; | ||
25 | 30 | ||
26 | /* relentlessly fill the tree with tagged entries */ | 31 | /* relentlessly fill the tree with tagged entries */ |
27 | static void *add_entries_fn(void *arg) | 32 | static void *add_entries_fn(void *arg) |
28 | { | 33 | { |
29 | int pgoff; | 34 | rcu_register_thread(); |
30 | 35 | ||
31 | while (!test_complete) { | 36 | while (!test_complete) { |
32 | for (pgoff = 0; pgoff < 100; pgoff++) { | 37 | unsigned long pgoff; |
38 | int order; | ||
39 | |||
40 | for (pgoff = 0; pgoff < MAX_IDX; pgoff++) { | ||
33 | pthread_mutex_lock(&tree_lock); | 41 | pthread_mutex_lock(&tree_lock); |
34 | if (item_insert(&tree, pgoff) == 0) | 42 | for (order = max_order; order >= 0; order--) { |
35 | item_tag_set(&tree, pgoff, TAG); | 43 | if (item_insert_order(&tree, pgoff, order) |
44 | == 0) { | ||
45 | item_tag_set(&tree, pgoff, TAG); | ||
46 | break; | ||
47 | } | ||
48 | } | ||
36 | pthread_mutex_unlock(&tree_lock); | 49 | pthread_mutex_unlock(&tree_lock); |
37 | } | 50 | } |
38 | } | 51 | } |
39 | 52 | ||
53 | rcu_unregister_thread(); | ||
54 | |||
40 | return NULL; | 55 | return NULL; |
41 | } | 56 | } |
42 | 57 | ||
43 | /* | 58 | /* |
44 | * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find | 59 | * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find |
45 | * things that have been removed and randomly resetting our iteration to the | 60 | * things that have been removed and randomly resetting our iteration to the |
46 | * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and | 61 | * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and |
47 | * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a | 62 | * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a |
48 | * NULL 'slot' variable. | 63 | * NULL 'slot' variable. |
49 | */ | 64 | */ |
50 | static void *tagged_iteration_fn(void *arg) | 65 | static void *tagged_iteration_fn(void *arg) |
@@ -52,17 +67,12 @@ static void *tagged_iteration_fn(void *arg) | |||
52 | struct radix_tree_iter iter; | 67 | struct radix_tree_iter iter; |
53 | void **slot; | 68 | void **slot; |
54 | 69 | ||
70 | rcu_register_thread(); | ||
71 | |||
55 | while (!test_complete) { | 72 | while (!test_complete) { |
56 | rcu_read_lock(); | 73 | rcu_read_lock(); |
57 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { | 74 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { |
58 | void *entry; | 75 | void *entry = radix_tree_deref_slot(slot); |
59 | int i; | ||
60 | |||
61 | /* busy wait to let removals happen */ | ||
62 | for (i = 0; i < 1000000; i++) | ||
63 | ; | ||
64 | |||
65 | entry = radix_tree_deref_slot(slot); | ||
66 | if (unlikely(!entry)) | 76 | if (unlikely(!entry)) |
67 | continue; | 77 | continue; |
68 | 78 | ||
@@ -71,20 +81,26 @@ static void *tagged_iteration_fn(void *arg) | |||
71 | continue; | 81 | continue; |
72 | } | 82 | } |
73 | 83 | ||
74 | if (rand() % 50 == 0) | 84 | if (rand_r(&seeds[0]) % 50 == 0) { |
75 | slot = radix_tree_iter_next(&iter); | 85 | slot = radix_tree_iter_resume(slot, &iter); |
86 | rcu_read_unlock(); | ||
87 | rcu_barrier(); | ||
88 | rcu_read_lock(); | ||
89 | } | ||
76 | } | 90 | } |
77 | rcu_read_unlock(); | 91 | rcu_read_unlock(); |
78 | } | 92 | } |
79 | 93 | ||
94 | rcu_unregister_thread(); | ||
95 | |||
80 | return NULL; | 96 | return NULL; |
81 | } | 97 | } |
82 | 98 | ||
83 | /* | 99 | /* |
84 | * Iterate over the entries, doing a radix_tree_iter_retry() as we find things | 100 | * Iterate over the entries, doing a radix_tree_iter_retry() as we find things |
85 | * that have been removed and randomly resetting our iteration to the next | 101 | * that have been removed and randomly resetting our iteration to the next |
86 | * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and | 102 | * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and |
87 | * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a | 103 | * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a |
88 | * NULL 'slot' variable. | 104 | * NULL 'slot' variable. |
89 | */ | 105 | */ |
90 | static void *untagged_iteration_fn(void *arg) | 106 | static void *untagged_iteration_fn(void *arg) |
@@ -92,17 +108,12 @@ static void *untagged_iteration_fn(void *arg) | |||
92 | struct radix_tree_iter iter; | 108 | struct radix_tree_iter iter; |
93 | void **slot; | 109 | void **slot; |
94 | 110 | ||
111 | rcu_register_thread(); | ||
112 | |||
95 | while (!test_complete) { | 113 | while (!test_complete) { |
96 | rcu_read_lock(); | 114 | rcu_read_lock(); |
97 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | 115 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { |
98 | void *entry; | 116 | void *entry = radix_tree_deref_slot(slot); |
99 | int i; | ||
100 | |||
101 | /* busy wait to let removals happen */ | ||
102 | for (i = 0; i < 1000000; i++) | ||
103 | ; | ||
104 | |||
105 | entry = radix_tree_deref_slot(slot); | ||
106 | if (unlikely(!entry)) | 117 | if (unlikely(!entry)) |
107 | continue; | 118 | continue; |
108 | 119 | ||
@@ -111,12 +122,18 @@ static void *untagged_iteration_fn(void *arg) | |||
111 | continue; | 122 | continue; |
112 | } | 123 | } |
113 | 124 | ||
114 | if (rand() % 50 == 0) | 125 | if (rand_r(&seeds[1]) % 50 == 0) { |
115 | slot = radix_tree_iter_next(&iter); | 126 | slot = radix_tree_iter_resume(slot, &iter); |
127 | rcu_read_unlock(); | ||
128 | rcu_barrier(); | ||
129 | rcu_read_lock(); | ||
130 | } | ||
116 | } | 131 | } |
117 | rcu_read_unlock(); | 132 | rcu_read_unlock(); |
118 | } | 133 | } |
119 | 134 | ||
135 | rcu_unregister_thread(); | ||
136 | |||
120 | return NULL; | 137 | return NULL; |
121 | } | 138 | } |
122 | 139 | ||
@@ -126,47 +143,71 @@ static void *untagged_iteration_fn(void *arg) | |||
126 | */ | 143 | */ |
127 | static void *remove_entries_fn(void *arg) | 144 | static void *remove_entries_fn(void *arg) |
128 | { | 145 | { |
146 | rcu_register_thread(); | ||
147 | |||
129 | while (!test_complete) { | 148 | while (!test_complete) { |
130 | int pgoff; | 149 | int pgoff; |
131 | 150 | ||
132 | pgoff = rand() % 100; | 151 | pgoff = rand_r(&seeds[2]) % MAX_IDX; |
133 | 152 | ||
134 | pthread_mutex_lock(&tree_lock); | 153 | pthread_mutex_lock(&tree_lock); |
135 | item_delete(&tree, pgoff); | 154 | item_delete(&tree, pgoff); |
136 | pthread_mutex_unlock(&tree_lock); | 155 | pthread_mutex_unlock(&tree_lock); |
137 | } | 156 | } |
138 | 157 | ||
158 | rcu_unregister_thread(); | ||
159 | |||
160 | return NULL; | ||
161 | } | ||
162 | |||
163 | static void *tag_entries_fn(void *arg) | ||
164 | { | ||
165 | rcu_register_thread(); | ||
166 | |||
167 | while (!test_complete) { | ||
168 | tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG, | ||
169 | NEW_TAG); | ||
170 | } | ||
171 | rcu_unregister_thread(); | ||
139 | return NULL; | 172 | return NULL; |
140 | } | 173 | } |
141 | 174 | ||
142 | /* This is a unit test for a bug found by the syzkaller tester */ | 175 | /* This is a unit test for a bug found by the syzkaller tester */ |
143 | void iteration_test(void) | 176 | void iteration_test(unsigned order, unsigned test_duration) |
144 | { | 177 | { |
145 | int i; | 178 | int i; |
146 | 179 | ||
147 | printf("Running iteration tests for 10 seconds\n"); | 180 | printf("Running %siteration tests for %d seconds\n", |
181 | order > 0 ? "multiorder " : "", test_duration); | ||
148 | 182 | ||
149 | srand(time(0)); | 183 | max_order = order; |
150 | test_complete = false; | 184 | test_complete = false; |
151 | 185 | ||
186 | for (i = 0; i < 3; i++) | ||
187 | seeds[i] = rand(); | ||
188 | |||
152 | if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { | 189 | if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { |
153 | perror("pthread_create"); | 190 | perror("create tagged iteration thread"); |
154 | exit(1); | 191 | exit(1); |
155 | } | 192 | } |
156 | if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { | 193 | if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { |
157 | perror("pthread_create"); | 194 | perror("create untagged iteration thread"); |
158 | exit(1); | 195 | exit(1); |
159 | } | 196 | } |
160 | if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { | 197 | if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { |
161 | perror("pthread_create"); | 198 | perror("create add entry thread"); |
162 | exit(1); | 199 | exit(1); |
163 | } | 200 | } |
164 | if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { | 201 | if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { |
165 | perror("pthread_create"); | 202 | perror("create remove entry thread"); |
203 | exit(1); | ||
204 | } | ||
205 | if (pthread_create(&threads[4], NULL, tag_entries_fn, NULL)) { | ||
206 | perror("create tag entry thread"); | ||
166 | exit(1); | 207 | exit(1); |
167 | } | 208 | } |
168 | 209 | ||
169 | sleep(10); | 210 | sleep(test_duration); |
170 | test_complete = true; | 211 | test_complete = true; |
171 | 212 | ||
172 | for (i = 0; i < NUM_THREADS; i++) { | 213 | for (i = 0; i < NUM_THREADS; i++) { |
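The move from radix_tree_iter_next() to radix_tree_iter_resume() above also lets the test leave the RCU read-side critical section mid-walk: resume() drops the cached chunk, so slots are looked up afresh when the iteration continues. A condensed sketch of that reader pattern, assuming this test suite's linux/*.h shims, a tree already populated with tag-0 entries, and a thread that has called rcu_register_thread(); the helper name is hypothetical.

        /* Condensed from tagged_iteration_fn() above. */
        static void paced_tagged_walk(struct radix_tree_root *root)
        {
                struct radix_tree_iter iter;
                void **slot;

                rcu_read_lock();
                radix_tree_for_each_tagged(slot, root, &iter, 0, 0) {
                        void *entry = radix_tree_deref_slot(slot);

                        if (unlikely(!entry))
                                continue;       /* slot was emptied under us */

                        if (radix_tree_deref_retry(entry)) {
                                slot = radix_tree_iter_retry(&iter);
                                continue;       /* chunk went stale; retry it */
                        }

                        /*
                         * Pause the walk: resume() forgets the cached chunk,
                         * so it is safe to drop and re-take the RCU read lock
                         * before the next slot is fetched.
                         */
                        slot = radix_tree_iter_resume(slot, &iter);
                        rcu_read_unlock();
                        rcu_read_lock();
                }
                rcu_read_unlock();
        }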
diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index 154823737b20..d31ea7c9abec 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c | |||
@@ -1,14 +1,26 @@ | |||
1 | #include <stdlib.h> | 1 | #include <stdlib.h> |
2 | #include <string.h> | 2 | #include <string.h> |
3 | #include <malloc.h> | 3 | #include <malloc.h> |
4 | #include <pthread.h> | ||
4 | #include <unistd.h> | 5 | #include <unistd.h> |
5 | #include <assert.h> | 6 | #include <assert.h> |
6 | 7 | ||
7 | #include <linux/mempool.h> | 8 | #include <linux/mempool.h> |
9 | #include <linux/poison.h> | ||
8 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
11 | #include <linux/radix-tree.h> | ||
9 | #include <urcu/uatomic.h> | 12 | #include <urcu/uatomic.h> |
10 | 13 | ||
11 | int nr_allocated; | 14 | int nr_allocated; |
15 | int preempt_count; | ||
16 | |||
17 | struct kmem_cache { | ||
18 | pthread_mutex_t lock; | ||
19 | int size; | ||
20 | int nr_objs; | ||
21 | void *objs; | ||
22 | void (*ctor)(void *); | ||
23 | }; | ||
12 | 24 | ||
13 | void *mempool_alloc(mempool_t *pool, int gfp_mask) | 25 | void *mempool_alloc(mempool_t *pool, int gfp_mask) |
14 | { | 26 | { |
@@ -33,19 +45,59 @@ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, | |||
33 | 45 | ||
34 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) | 46 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) |
35 | { | 47 | { |
36 | void *ret = malloc(cachep->size); | 48 | struct radix_tree_node *node; |
37 | if (cachep->ctor) | 49 | |
38 | cachep->ctor(ret); | 50 | if (flags & __GFP_NOWARN) |
51 | return NULL; | ||
52 | |||
53 | pthread_mutex_lock(&cachep->lock); | ||
54 | if (cachep->nr_objs) { | ||
55 | cachep->nr_objs--; | ||
56 | node = cachep->objs; | ||
57 | cachep->objs = node->private_data; | ||
58 | pthread_mutex_unlock(&cachep->lock); | ||
59 | node->private_data = NULL; | ||
60 | } else { | ||
61 | pthread_mutex_unlock(&cachep->lock); | ||
62 | node = malloc(cachep->size); | ||
63 | if (cachep->ctor) | ||
64 | cachep->ctor(node); | ||
65 | } | ||
66 | |||
39 | uatomic_inc(&nr_allocated); | 67 | uatomic_inc(&nr_allocated); |
40 | return ret; | 68 | return node; |
41 | } | 69 | } |
42 | 70 | ||
43 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) | 71 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) |
44 | { | 72 | { |
45 | assert(objp); | 73 | assert(objp); |
46 | uatomic_dec(&nr_allocated); | 74 | uatomic_dec(&nr_allocated); |
47 | memset(objp, 0, cachep->size); | 75 | pthread_mutex_lock(&cachep->lock); |
48 | free(objp); | 76 | if (cachep->nr_objs > 10) { |
77 | memset(objp, POISON_FREE, cachep->size); | ||
78 | free(objp); | ||
79 | } else { | ||
80 | struct radix_tree_node *node = objp; | ||
81 | cachep->nr_objs++; | ||
82 | node->private_data = cachep->objs; | ||
83 | cachep->objs = node; | ||
84 | } | ||
85 | pthread_mutex_unlock(&cachep->lock); | ||
86 | } | ||
87 | |||
88 | void *kmalloc(size_t size, gfp_t gfp) | ||
89 | { | ||
90 | void *ret = malloc(size); | ||
91 | uatomic_inc(&nr_allocated); | ||
92 | return ret; | ||
93 | } | ||
94 | |||
95 | void kfree(void *p) | ||
96 | { | ||
97 | if (!p) | ||
98 | return; | ||
99 | uatomic_dec(&nr_allocated); | ||
100 | free(p); | ||
49 | } | 101 | } |
50 | 102 | ||
51 | struct kmem_cache * | 103 | struct kmem_cache * |
@@ -54,7 +106,10 @@ kmem_cache_create(const char *name, size_t size, size_t offset, | |||
54 | { | 106 | { |
55 | struct kmem_cache *ret = malloc(sizeof(*ret)); | 107 | struct kmem_cache *ret = malloc(sizeof(*ret)); |
56 | 108 | ||
109 | pthread_mutex_init(&ret->lock, NULL); | ||
57 | ret->size = size; | 110 | ret->size = size; |
111 | ret->nr_objs = 0; | ||
112 | ret->objs = NULL; | ||
58 | ret->ctor = ctor; | 113 | ret->ctor = ctor; |
59 | return ret; | 114 | return ret; |
60 | } | 115 | } |
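kmem_cache_alloc()/kmem_cache_free() above now keep a small per-cache LIFO of recently freed radix_tree_nodes, threaded through node->private_data; once more than ten objects are already cached, further frees are poisoned and returned to the allocator. A minimal userspace sketch of the same idea with a hypothetical object type, locking omitted (the real code holds cachep->lock around both paths).

        #include <stdlib.h>

        /* Hypothetical cached object; the real code reuses radix_tree_node. */
        struct obj {
                struct obj *next_free;
                char payload[64];
        };

        static struct obj *free_list;
        static int nr_cached;

        static struct obj *obj_alloc(void)
        {
                struct obj *o = free_list;

                if (o) {                        /* reuse a recently freed object */
                        free_list = o->next_free;
                        nr_cached--;
                        o->next_free = NULL;
                        return o;
                }
                return malloc(sizeof(*o));      /* cache empty: hit the allocator */
        }

        static void obj_free(struct obj *o)
        {
                if (!o)
                        return;
                if (nr_cached > 10) {           /* keep the cache small */
                        free(o);
                        return;
                }
                o->next_free = free_list;       /* push onto the LIFO */
                free_list = o;
                nr_cached++;
        }

        int main(void)
        {
                struct obj *a = obj_alloc();

                obj_free(a);
                return obj_alloc() == a ? 0 : 1; /* freed object comes back first */
        }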
diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h index 71d58427ab60..a13e9bc76eec 100644 --- a/tools/testing/radix-tree/linux/bitops.h +++ b/tools/testing/radix-tree/linux/bitops.h | |||
@@ -2,9 +2,14 @@ | |||
2 | #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ | 2 | #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ |
3 | 3 | ||
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <linux/bitops/find.h> | ||
6 | #include <linux/bitops/hweight.h> | ||
7 | #include <linux/kernel.h> | ||
5 | 8 | ||
6 | #define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) | 9 | #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) |
7 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) | 10 | #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) |
11 | #define BITS_PER_BYTE 8 | ||
12 | #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) | ||
8 | 13 | ||
9 | /** | 14 | /** |
10 | * __set_bit - Set a bit in memory | 15 | * __set_bit - Set a bit in memory |
@@ -17,16 +22,16 @@ | |||
17 | */ | 22 | */ |
18 | static inline void __set_bit(int nr, volatile unsigned long *addr) | 23 | static inline void __set_bit(int nr, volatile unsigned long *addr) |
19 | { | 24 | { |
20 | unsigned long mask = BITOP_MASK(nr); | 25 | unsigned long mask = BIT_MASK(nr); |
21 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 26 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
22 | 27 | ||
23 | *p |= mask; | 28 | *p |= mask; |
24 | } | 29 | } |
25 | 30 | ||
26 | static inline void __clear_bit(int nr, volatile unsigned long *addr) | 31 | static inline void __clear_bit(int nr, volatile unsigned long *addr) |
27 | { | 32 | { |
28 | unsigned long mask = BITOP_MASK(nr); | 33 | unsigned long mask = BIT_MASK(nr); |
29 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 34 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
30 | 35 | ||
31 | *p &= ~mask; | 36 | *p &= ~mask; |
32 | } | 37 | } |
@@ -42,8 +47,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) | |||
42 | */ | 47 | */ |
43 | static inline void __change_bit(int nr, volatile unsigned long *addr) | 48 | static inline void __change_bit(int nr, volatile unsigned long *addr) |
44 | { | 49 | { |
45 | unsigned long mask = BITOP_MASK(nr); | 50 | unsigned long mask = BIT_MASK(nr); |
46 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 51 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
47 | 52 | ||
48 | *p ^= mask; | 53 | *p ^= mask; |
49 | } | 54 | } |
@@ -59,8 +64,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) | |||
59 | */ | 64 | */ |
60 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | 65 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) |
61 | { | 66 | { |
62 | unsigned long mask = BITOP_MASK(nr); | 67 | unsigned long mask = BIT_MASK(nr); |
63 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 68 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
64 | unsigned long old = *p; | 69 | unsigned long old = *p; |
65 | 70 | ||
66 | *p = old | mask; | 71 | *p = old | mask; |
@@ -78,8 +83,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | |||
78 | */ | 83 | */ |
79 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | 84 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) |
80 | { | 85 | { |
81 | unsigned long mask = BITOP_MASK(nr); | 86 | unsigned long mask = BIT_MASK(nr); |
82 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 87 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
83 | unsigned long old = *p; | 88 | unsigned long old = *p; |
84 | 89 | ||
85 | *p = old & ~mask; | 90 | *p = old & ~mask; |
@@ -90,8 +95,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | |||
90 | static inline int __test_and_change_bit(int nr, | 95 | static inline int __test_and_change_bit(int nr, |
91 | volatile unsigned long *addr) | 96 | volatile unsigned long *addr) |
92 | { | 97 | { |
93 | unsigned long mask = BITOP_MASK(nr); | 98 | unsigned long mask = BIT_MASK(nr); |
94 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 99 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
95 | unsigned long old = *p; | 100 | unsigned long old = *p; |
96 | 101 | ||
97 | *p = old ^ mask; | 102 | *p = old ^ mask; |
@@ -105,7 +110,7 @@ static inline int __test_and_change_bit(int nr, | |||
105 | */ | 110 | */ |
106 | static inline int test_bit(int nr, const volatile unsigned long *addr) | 111 | static inline int test_bit(int nr, const volatile unsigned long *addr) |
107 | { | 112 | { |
108 | return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); | 113 | return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); |
109 | } | 114 | } |
110 | 115 | ||
111 | /** | 116 | /** |
@@ -147,4 +152,9 @@ unsigned long find_next_bit(const unsigned long *addr, | |||
147 | unsigned long size, | 152 | unsigned long size, |
148 | unsigned long offset); | 153 | unsigned long offset); |
149 | 154 | ||
155 | static inline unsigned long hweight_long(unsigned long w) | ||
156 | { | ||
157 | return sizeof(w) == 4 ? hweight32(w) : hweight64(w); | ||
158 | } | ||
159 | |||
150 | #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ | 160 | #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ |
diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h index 46a825cf2ae1..6a1bcb9d2c4a 100644 --- a/tools/testing/radix-tree/linux/bitops/non-atomic.h +++ b/tools/testing/radix-tree/linux/bitops/non-atomic.h | |||
@@ -3,7 +3,6 @@ | |||
3 | 3 | ||
4 | #include <asm/types.h> | 4 | #include <asm/types.h> |
5 | 5 | ||
6 | #define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) | ||
7 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) | 6 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) |
8 | 7 | ||
9 | /** | 8 | /** |
@@ -17,7 +16,7 @@ | |||
17 | */ | 16 | */ |
18 | static inline void __set_bit(int nr, volatile unsigned long *addr) | 17 | static inline void __set_bit(int nr, volatile unsigned long *addr) |
19 | { | 18 | { |
20 | unsigned long mask = BITOP_MASK(nr); | 19 | unsigned long mask = BIT_MASK(nr); |
21 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 20 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
22 | 21 | ||
23 | *p |= mask; | 22 | *p |= mask; |
@@ -25,7 +24,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) | |||
25 | 24 | ||
26 | static inline void __clear_bit(int nr, volatile unsigned long *addr) | 25 | static inline void __clear_bit(int nr, volatile unsigned long *addr) |
27 | { | 26 | { |
28 | unsigned long mask = BITOP_MASK(nr); | 27 | unsigned long mask = BIT_MASK(nr); |
29 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 28 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
30 | 29 | ||
31 | *p &= ~mask; | 30 | *p &= ~mask; |
@@ -42,7 +41,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) | |||
42 | */ | 41 | */ |
43 | static inline void __change_bit(int nr, volatile unsigned long *addr) | 42 | static inline void __change_bit(int nr, volatile unsigned long *addr) |
44 | { | 43 | { |
45 | unsigned long mask = BITOP_MASK(nr); | 44 | unsigned long mask = BIT_MASK(nr); |
46 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 45 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
47 | 46 | ||
48 | *p ^= mask; | 47 | *p ^= mask; |
@@ -59,7 +58,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) | |||
59 | */ | 58 | */ |
60 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | 59 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) |
61 | { | 60 | { |
62 | unsigned long mask = BITOP_MASK(nr); | 61 | unsigned long mask = BIT_MASK(nr); |
63 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 62 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
64 | unsigned long old = *p; | 63 | unsigned long old = *p; |
65 | 64 | ||
@@ -78,7 +77,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | |||
78 | */ | 77 | */ |
79 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | 78 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) |
80 | { | 79 | { |
81 | unsigned long mask = BITOP_MASK(nr); | 80 | unsigned long mask = BIT_MASK(nr); |
82 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 81 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
83 | unsigned long old = *p; | 82 | unsigned long old = *p; |
84 | 83 | ||
@@ -90,7 +89,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | |||
90 | static inline int __test_and_change_bit(int nr, | 89 | static inline int __test_and_change_bit(int nr, |
91 | volatile unsigned long *addr) | 90 | volatile unsigned long *addr) |
92 | { | 91 | { |
93 | unsigned long mask = BITOP_MASK(nr); | 92 | unsigned long mask = BIT_MASK(nr); |
94 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 93 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
95 | unsigned long old = *p; | 94 | unsigned long old = *p; |
96 | 95 | ||
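The two hunks above rename the local BITOP_* helpers to the kernel's BIT_MASK()/BIT_WORD() spelling; the arithmetic is unchanged: a bit number is split into a word index and a mask within that word. A small standalone worked example using the same definitions:

        #include <stdio.h>

        #define BITS_PER_LONG   (8 * (int)sizeof(long))
        #define BIT_MASK(nr)    (1UL << ((nr) % BITS_PER_LONG))
        #define BIT_WORD(nr)    ((nr) / BITS_PER_LONG)

        int main(void)
        {
                unsigned long map[4] = { 0 };
                int nr = 70;    /* 64-bit longs: word 1, bit 6; 32-bit: word 2, bit 6 */

                map[BIT_WORD(nr)] |= BIT_MASK(nr);      /* what __set_bit() does */

                printf("bit %d -> word %d, mask %#lx, word value %#lx\n",
                       nr, (int)BIT_WORD(nr), BIT_MASK(nr), map[BIT_WORD(nr)]);
                return 0;
        }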
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h index ccbe444977df..23b8ed52f8c8 100644 --- a/tools/testing/radix-tree/linux/bug.h +++ b/tools/testing/radix-tree/linux/bug.h | |||
@@ -1 +1 @@ | |||
#define WARN_ON_ONCE(x) assert(x) | #include "asm/bug.h" | ||
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h index 5201b915f631..5b09b2ce6c33 100644 --- a/tools/testing/radix-tree/linux/gfp.h +++ b/tools/testing/radix-tree/linux/gfp.h | |||
@@ -3,8 +3,24 @@ | |||
3 | 3 | ||
4 | #define __GFP_BITS_SHIFT 26 | 4 | #define __GFP_BITS_SHIFT 26 |
5 | #define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) | 5 | #define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) |
6 | #define __GFP_WAIT 1 | 6 | |
7 | #define __GFP_ACCOUNT 0 | 7 | #define __GFP_HIGH 0x20u |
8 | #define __GFP_NOWARN 0 | 8 | #define __GFP_IO 0x40u |
9 | #define __GFP_FS 0x80u | ||
10 | #define __GFP_NOWARN 0x200u | ||
11 | #define __GFP_ATOMIC 0x80000u | ||
12 | #define __GFP_ACCOUNT 0x100000u | ||
13 | #define __GFP_DIRECT_RECLAIM 0x400000u | ||
14 | #define __GFP_KSWAPD_RECLAIM 0x2000000u | ||
15 | |||
16 | #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM) | ||
17 | |||
18 | #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) | ||
19 | #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) | ||
20 | |||
21 | static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) | ||
22 | { | ||
23 | return !!(gfp_flags & __GFP_DIRECT_RECLAIM); | ||
24 | } | ||
9 | 25 | ||
10 | #endif | 26 | #endif |
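The gfp.h shim above now mirrors the kernel's real flag bits instead of stubbing them out, so gfpflags_allow_blocking() can distinguish GFP_KERNEL (which includes __GFP_DIRECT_RECLAIM and may sleep) from GFP_ATOMIC (which does not). A standalone check of that layout, with the values copied from the hunk above:

        #include <assert.h>
        #include <stdbool.h>

        typedef unsigned int gfp_t;

        #define __GFP_HIGH              0x20u
        #define __GFP_IO                0x40u
        #define __GFP_FS                0x80u
        #define __GFP_ATOMIC            0x80000u
        #define __GFP_DIRECT_RECLAIM    0x400000u
        #define __GFP_KSWAPD_RECLAIM    0x2000000u
        #define __GFP_RECLAIM           (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
        #define GFP_ATOMIC              (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM)
        #define GFP_KERNEL              (__GFP_RECLAIM | __GFP_IO | __GFP_FS)

        static bool gfpflags_allow_blocking(gfp_t gfp_flags)
        {
                return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
        }

        int main(void)
        {
                assert(gfpflags_allow_blocking(GFP_KERNEL));    /* may block */
                assert(!gfpflags_allow_blocking(GFP_ATOMIC));   /* must not block */
                return 0;
        }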
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index be98a47b4e1b..9b43b4975d83 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h | |||
@@ -8,9 +8,14 @@ | |||
8 | #include <limits.h> | 8 | #include <limits.h> |
9 | 9 | ||
10 | #include "../../include/linux/compiler.h" | 10 | #include "../../include/linux/compiler.h" |
11 | #include "../../include/linux/err.h" | ||
11 | #include "../../../include/linux/kconfig.h" | 12 | #include "../../../include/linux/kconfig.h" |
12 | 13 | ||
14 | #ifdef BENCHMARK | ||
15 | #define RADIX_TREE_MAP_SHIFT 6 | ||
16 | #else | ||
13 | #define RADIX_TREE_MAP_SHIFT 3 | 17 | #define RADIX_TREE_MAP_SHIFT 3 |
18 | #endif | ||
14 | 19 | ||
15 | #ifndef NULL | 20 | #ifndef NULL |
16 | #define NULL 0 | 21 | #define NULL 0 |
@@ -43,4 +48,17 @@ static inline int in_interrupt(void) | |||
43 | { | 48 | { |
44 | return 0; | 49 | return 0; |
45 | } | 50 | } |
51 | |||
52 | /* | ||
53 | * This looks more complex than it should be. But we need to | ||
54 | * get the type for the ~ right in round_down (it needs to be | ||
55 | * as wide as the result!), and we want to evaluate the macro | ||
56 | * arguments just once each. | ||
57 | */ | ||
58 | #define __round_mask(x, y) ((__typeof__(x))((y)-1)) | ||
59 | #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) | ||
60 | #define round_down(x, y) ((x) & ~__round_mask(x, y)) | ||
61 | |||
62 | #define xchg(ptr, x) uatomic_xchg(ptr, x) | ||
63 | |||
46 | #endif /* _KERNEL_H */ | 64 | #endif /* _KERNEL_H */ |
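The round_up()/round_down() macros added above only work for power-of-two alignments, and the __round_mask() cast is what keeps the ~ in round_down() as wide as x, exactly as the in-tree comment says. A quick standalone check (GCC/Clang, since __typeof__ is a compiler extension):

        #include <assert.h>

        #define __round_mask(x, y) ((__typeof__(x))((y)-1))
        #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
        #define round_down(x, y) ((x) & ~__round_mask(x, y))

        int main(void)
        {
                assert(round_up(7UL, 4) == 8);
                assert(round_up(8UL, 4) == 8);          /* already aligned: unchanged */
                assert(round_down(7UL, 4) == 4);
                /* The cast widens the mask to match x, so high bits survive ~. */
                assert(round_down(0x100000007UL, 4) == 0x100000004UL);
                return 0;
        }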
diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h index 6210672e3baa..65c04c226965 100644 --- a/tools/testing/radix-tree/linux/preempt.h +++ b/tools/testing/radix-tree/linux/preempt.h | |||
@@ -1,4 +1,4 @@ | |||
1 | /* */ | 1 | extern int preempt_count; |
2 | 2 | ||
3 | #define preempt_disable() do { } while (0) | 3 | #define preempt_disable() uatomic_inc(&preempt_count) |
4 | #define preempt_enable() do { } while (0) | 4 | #define preempt_enable() uatomic_dec(&preempt_count) |
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h index 6d5a34770fd4..e40337f41a38 100644 --- a/tools/testing/radix-tree/linux/slab.h +++ b/tools/testing/radix-tree/linux/slab.h | |||
@@ -7,15 +7,8 @@ | |||
7 | #define SLAB_PANIC 2 | 7 | #define SLAB_PANIC 2 |
8 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ | 8 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ |
9 | 9 | ||
10 | static inline int gfpflags_allow_blocking(gfp_t mask) | 10 | void *kmalloc(size_t size, gfp_t); |
11 | { | 11 | void kfree(void *); |
12 | return 1; | ||
13 | } | ||
14 | |||
15 | struct kmem_cache { | ||
16 | int size; | ||
17 | void (*ctor)(void *); | ||
18 | }; | ||
19 | 12 | ||
20 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags); | 13 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags); |
21 | void kmem_cache_free(struct kmem_cache *cachep, void *objp); | 14 | void kmem_cache_free(struct kmem_cache *cachep, void *objp); |
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h index faa0b6ff9ca8..8491d89873bb 100644 --- a/tools/testing/radix-tree/linux/types.h +++ b/tools/testing/radix-tree/linux/types.h | |||
@@ -6,8 +6,6 @@ | |||
6 | #define __rcu | 6 | #define __rcu |
7 | #define __read_mostly | 7 | #define __read_mostly |
8 | 8 | ||
9 | #define BITS_PER_LONG (sizeof(long) * 8) | ||
10 | |||
11 | static inline void INIT_LIST_HEAD(struct list_head *list) | 9 | static inline void INIT_LIST_HEAD(struct list_head *list) |
12 | { | 10 | { |
13 | list->next = list; | 11 | list->next = list; |
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index daa9010693e8..f7e9801a6754 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c | |||
@@ -67,7 +67,6 @@ void big_gang_check(bool long_run) | |||
67 | 67 | ||
68 | for (i = 0; i < (long_run ? 1000 : 3); i++) { | 68 | for (i = 0; i < (long_run ? 1000 : 3); i++) { |
69 | __big_gang_check(); | 69 | __big_gang_check(); |
70 | srand(time(0)); | ||
71 | printf("%d ", i); | 70 | printf("%d ", i); |
72 | fflush(stdout); | 71 | fflush(stdout); |
73 | } | 72 | } |
@@ -206,8 +205,7 @@ void copy_tag_check(void) | |||
206 | } | 205 | } |
207 | 206 | ||
208 | // printf("\ncopying tags...\n"); | 207 | // printf("\ncopying tags...\n"); |
209 | cur = start; | 208 | tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1); |
210 | tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1); | ||
211 | 209 | ||
212 | // printf("checking copied tags\n"); | 210 | // printf("checking copied tags\n"); |
213 | assert(tagged == count); | 211 | assert(tagged == count); |
@@ -215,16 +213,13 @@ void copy_tag_check(void) | |||
215 | 213 | ||
216 | /* Copy tags in several rounds */ | 214 | /* Copy tags in several rounds */ |
217 | // printf("\ncopying tags...\n"); | 215 | // printf("\ncopying tags...\n"); |
218 | cur = start; | 216 | tmp = rand() % (count / 10 + 2); |
219 | do { | 217 | tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2); |
220 | tmp = rand() % (count/10+2); | 218 | assert(tagged == count); |
221 | tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2); | ||
222 | } while (tmp == tagged); | ||
223 | 219 | ||
224 | // printf("%lu %lu %lu\n", tagged, tmp, count); | 220 | // printf("%lu %lu %lu\n", tagged, tmp, count); |
225 | // printf("checking copied tags\n"); | 221 | // printf("checking copied tags\n"); |
226 | check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); | 222 | check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); |
227 | assert(tagged < tmp); | ||
228 | verify_tag_consistency(&tree, 0); | 223 | verify_tag_consistency(&tree, 0); |
229 | verify_tag_consistency(&tree, 1); | 224 | verify_tag_consistency(&tree, 1); |
230 | verify_tag_consistency(&tree, 2); | 225 | verify_tag_consistency(&tree, 2); |
@@ -240,7 +235,7 @@ static void __locate_check(struct radix_tree_root *tree, unsigned long index, | |||
240 | 235 | ||
241 | item_insert_order(tree, index, order); | 236 | item_insert_order(tree, index, order); |
242 | item = item_lookup(tree, index); | 237 | item = item_lookup(tree, index); |
243 | index2 = radix_tree_locate_item(tree, item); | 238 | index2 = find_item(tree, item); |
244 | if (index != index2) { | 239 | if (index != index2) { |
245 | printf("index %ld order %d inserted; found %ld\n", | 240 | printf("index %ld order %d inserted; found %ld\n", |
246 | index, order, index2); | 241 | index, order, index2); |
@@ -274,17 +269,17 @@ static void locate_check(void) | |||
274 | index += (1UL << order)) { | 269 | index += (1UL << order)) { |
275 | __locate_check(&tree, index + offset, order); | 270 | __locate_check(&tree, index + offset, order); |
276 | } | 271 | } |
277 | if (radix_tree_locate_item(&tree, &tree) != -1) | 272 | if (find_item(&tree, &tree) != -1) |
278 | abort(); | 273 | abort(); |
279 | 274 | ||
280 | item_kill_tree(&tree); | 275 | item_kill_tree(&tree); |
281 | } | 276 | } |
282 | } | 277 | } |
283 | 278 | ||
284 | if (radix_tree_locate_item(&tree, &tree) != -1) | 279 | if (find_item(&tree, &tree) != -1) |
285 | abort(); | 280 | abort(); |
286 | __locate_check(&tree, -1, 0); | 281 | __locate_check(&tree, -1, 0); |
287 | if (radix_tree_locate_item(&tree, &tree) != -1) | 282 | if (find_item(&tree, &tree) != -1) |
288 | abort(); | 283 | abort(); |
289 | item_kill_tree(&tree); | 284 | item_kill_tree(&tree); |
290 | } | 285 | } |
@@ -293,50 +288,80 @@ static void single_thread_tests(bool long_run) | |||
293 | { | 288 | { |
294 | int i; | 289 | int i; |
295 | 290 | ||
296 | printf("starting single_thread_tests: %d allocated\n", nr_allocated); | 291 | printf("starting single_thread_tests: %d allocated, preempt %d\n", |
292 | nr_allocated, preempt_count); | ||
297 | multiorder_checks(); | 293 | multiorder_checks(); |
298 | printf("after multiorder_check: %d allocated\n", nr_allocated); | 294 | rcu_barrier(); |
295 | printf("after multiorder_check: %d allocated, preempt %d\n", | ||
296 | nr_allocated, preempt_count); | ||
299 | locate_check(); | 297 | locate_check(); |
300 | printf("after locate_check: %d allocated\n", nr_allocated); | 298 | rcu_barrier(); |
299 | printf("after locate_check: %d allocated, preempt %d\n", | ||
300 | nr_allocated, preempt_count); | ||
301 | tag_check(); | 301 | tag_check(); |
302 | printf("after tag_check: %d allocated\n", nr_allocated); | 302 | rcu_barrier(); |
303 | printf("after tag_check: %d allocated, preempt %d\n", | ||
304 | nr_allocated, preempt_count); | ||
303 | gang_check(); | 305 | gang_check(); |
304 | printf("after gang_check: %d allocated\n", nr_allocated); | 306 | rcu_barrier(); |
307 | printf("after gang_check: %d allocated, preempt %d\n", | ||
308 | nr_allocated, preempt_count); | ||
305 | add_and_check(); | 309 | add_and_check(); |
306 | printf("after add_and_check: %d allocated\n", nr_allocated); | 310 | rcu_barrier(); |
311 | printf("after add_and_check: %d allocated, preempt %d\n", | ||
312 | nr_allocated, preempt_count); | ||
307 | dynamic_height_check(); | 313 | dynamic_height_check(); |
308 | printf("after dynamic_height_check: %d allocated\n", nr_allocated); | 314 | rcu_barrier(); |
315 | printf("after dynamic_height_check: %d allocated, preempt %d\n", | ||
316 | nr_allocated, preempt_count); | ||
309 | big_gang_check(long_run); | 317 | big_gang_check(long_run); |
310 | printf("after big_gang_check: %d allocated\n", nr_allocated); | 318 | rcu_barrier(); |
319 | printf("after big_gang_check: %d allocated, preempt %d\n", | ||
320 | nr_allocated, preempt_count); | ||
311 | for (i = 0; i < (long_run ? 2000 : 3); i++) { | 321 | for (i = 0; i < (long_run ? 2000 : 3); i++) { |
312 | copy_tag_check(); | 322 | copy_tag_check(); |
313 | printf("%d ", i); | 323 | printf("%d ", i); |
314 | fflush(stdout); | 324 | fflush(stdout); |
315 | } | 325 | } |
316 | printf("after copy_tag_check: %d allocated\n", nr_allocated); | 326 | rcu_barrier(); |
327 | printf("after copy_tag_check: %d allocated, preempt %d\n", | ||
328 | nr_allocated, preempt_count); | ||
317 | } | 329 | } |
318 | 330 | ||
319 | int main(int argc, char **argv) | 331 | int main(int argc, char **argv) |
320 | { | 332 | { |
321 | bool long_run = false; | 333 | bool long_run = false; |
322 | int opt; | 334 | int opt; |
335 | unsigned int seed = time(NULL); | ||
323 | 336 | ||
324 | while ((opt = getopt(argc, argv, "l")) != -1) { | 337 | while ((opt = getopt(argc, argv, "ls:")) != -1) { |
325 | if (opt == 'l') | 338 | if (opt == 'l') |
326 | long_run = true; | 339 | long_run = true; |
340 | else if (opt == 's') | ||
341 | seed = strtoul(optarg, NULL, 0); | ||
327 | } | 342 | } |
328 | 343 | ||
344 | printf("random seed %u\n", seed); | ||
345 | srand(seed); | ||
346 | |||
329 | rcu_register_thread(); | 347 | rcu_register_thread(); |
330 | radix_tree_init(); | 348 | radix_tree_init(); |
331 | 349 | ||
332 | regression1_test(); | 350 | regression1_test(); |
333 | regression2_test(); | 351 | regression2_test(); |
334 | regression3_test(); | 352 | regression3_test(); |
335 | iteration_test(); | 353 | iteration_test(0, 10); |
354 | iteration_test(7, 20); | ||
336 | single_thread_tests(long_run); | 355 | single_thread_tests(long_run); |
337 | 356 | ||
338 | sleep(1); | 357 | /* Free any remaining preallocated nodes */ |
339 | printf("after sleep(1): %d allocated\n", nr_allocated); | 358 | radix_tree_cpu_dead(0); |
359 | |||
360 | benchmark(); | ||
361 | |||
362 | rcu_barrier(); | ||
363 | printf("after rcu_barrier: %d allocated, preempt %d\n", | ||
364 | nr_allocated, preempt_count); | ||
340 | rcu_unregister_thread(); | 365 | rcu_unregister_thread(); |
341 | 366 | ||
342 | exit(0); | 367 | exit(0); |
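main() above now prints the random seed and accepts one back via -s, so a failing randomized run can be replayed exactly. A stripped-down sketch of that pattern, using plain argv handling instead of getopt for brevity:

        #include <stdio.h>
        #include <stdlib.h>
        #include <time.h>

        int main(int argc, char **argv)
        {
                unsigned int seed = (unsigned int)time(NULL);

                /* Replay a previous run by passing its printed seed back in. */
                if (argc > 1)
                        seed = (unsigned int)strtoul(argv[1], NULL, 0);

                printf("random seed %u\n", seed);
                srand(seed);

                /* ... randomized tests draw from rand()/rand_r() from here on ... */
                printf("first draw: %d\n", rand());
                return 0;
        }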
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index d1be94667a30..f79812a5e070 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c | |||
@@ -26,7 +26,6 @@ static void __multiorder_tag_test(int index, int order) | |||
26 | { | 26 | { |
27 | RADIX_TREE(tree, GFP_KERNEL); | 27 | RADIX_TREE(tree, GFP_KERNEL); |
28 | int base, err, i; | 28 | int base, err, i; |
29 | unsigned long first = 0; | ||
30 | 29 | ||
31 | /* our canonical entry */ | 30 | /* our canonical entry */ |
32 | base = index & ~((1 << order) - 1); | 31 | base = index & ~((1 << order) - 1); |
@@ -60,7 +59,7 @@ static void __multiorder_tag_test(int index, int order) | |||
60 | assert(!radix_tree_tag_get(&tree, i, 1)); | 59 | assert(!radix_tree_tag_get(&tree, i, 1)); |
61 | } | 60 | } |
62 | 61 | ||
63 | assert(radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 10, 0, 1) == 1); | 62 | assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1); |
64 | assert(radix_tree_tag_clear(&tree, index, 0)); | 63 | assert(radix_tree_tag_clear(&tree, index, 0)); |
65 | 64 | ||
66 | for_each_index(i, base, order) { | 65 | for_each_index(i, base, order) { |
@@ -76,8 +75,27 @@ static void __multiorder_tag_test(int index, int order) | |||
76 | item_kill_tree(&tree); | 75 | item_kill_tree(&tree); |
77 | } | 76 | } |
78 | 77 | ||
78 | static void __multiorder_tag_test2(unsigned order, unsigned long index2) | ||
79 | { | ||
80 | RADIX_TREE(tree, GFP_KERNEL); | ||
81 | unsigned long index = (1 << order); | ||
82 | index2 += index; | ||
83 | |||
84 | assert(item_insert_order(&tree, 0, order) == 0); | ||
85 | assert(item_insert(&tree, index2) == 0); | ||
86 | |||
87 | assert(radix_tree_tag_set(&tree, 0, 0)); | ||
88 | assert(radix_tree_tag_set(&tree, index2, 0)); | ||
89 | |||
90 | assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2); | ||
91 | |||
92 | item_kill_tree(&tree); | ||
93 | } | ||
94 | |||
79 | static void multiorder_tag_tests(void) | 95 | static void multiorder_tag_tests(void) |
80 | { | 96 | { |
97 | int i, j; | ||
98 | |||
81 | /* test multi-order entry for indices 0-7 with no sibling pointers */ | 99 | /* test multi-order entry for indices 0-7 with no sibling pointers */ |
82 | __multiorder_tag_test(0, 3); | 100 | __multiorder_tag_test(0, 3); |
83 | __multiorder_tag_test(5, 3); | 101 | __multiorder_tag_test(5, 3); |
@@ -117,6 +135,10 @@ static void multiorder_tag_tests(void) | |||
117 | __multiorder_tag_test(300, 8); | 135 | __multiorder_tag_test(300, 8); |
118 | 136 | ||
119 | __multiorder_tag_test(0x12345678UL, 8); | 137 | __multiorder_tag_test(0x12345678UL, 8); |
138 | |||
139 | for (i = 1; i < 10; i++) | ||
140 | for (j = 0; j < (10 << i); j++) | ||
141 | __multiorder_tag_test2(i, j); | ||
120 | } | 142 | } |
121 | 143 | ||
122 | static void multiorder_check(unsigned long index, int order) | 144 | static void multiorder_check(unsigned long index, int order) |
@@ -125,7 +147,7 @@ static void multiorder_check(unsigned long index, int order) | |||
125 | unsigned long min = index & ~((1UL << order) - 1); | 147 | unsigned long min = index & ~((1UL << order) - 1); |
126 | unsigned long max = min + (1UL << order); | 148 | unsigned long max = min + (1UL << order); |
127 | void **slot; | 149 | void **slot; |
128 | struct item *item2 = item_create(min); | 150 | struct item *item2 = item_create(min, order); |
129 | RADIX_TREE(tree, GFP_KERNEL); | 151 | RADIX_TREE(tree, GFP_KERNEL); |
130 | 152 | ||
131 | printf("Multiorder index %ld, order %d\n", index, order); | 153 | printf("Multiorder index %ld, order %d\n", index, order); |
@@ -231,11 +253,14 @@ void multiorder_iteration(void) | |||
231 | radix_tree_for_each_slot(slot, &tree, &iter, j) { | 253 | radix_tree_for_each_slot(slot, &tree, &iter, j) { |
232 | int height = order[i] / RADIX_TREE_MAP_SHIFT; | 254 | int height = order[i] / RADIX_TREE_MAP_SHIFT; |
233 | int shift = height * RADIX_TREE_MAP_SHIFT; | 255 | int shift = height * RADIX_TREE_MAP_SHIFT; |
234 | int mask = (1 << order[i]) - 1; | 256 | unsigned long mask = (1UL << order[i]) - 1; |
257 | struct item *item = *slot; | ||
235 | 258 | ||
236 | assert(iter.index >= (index[i] &~ mask)); | 259 | assert((iter.index | mask) == (index[i] | mask)); |
237 | assert(iter.index <= (index[i] | mask)); | ||
238 | assert(iter.shift == shift); | 260 | assert(iter.shift == shift); |
261 | assert(!radix_tree_is_internal_node(item)); | ||
262 | assert((item->index | mask) == (index[i] | mask)); | ||
263 | assert(item->order == order[i]); | ||
239 | i++; | 264 | i++; |
240 | } | 265 | } |
241 | } | 266 | } |
@@ -248,7 +273,6 @@ void multiorder_tagged_iteration(void) | |||
248 | RADIX_TREE(tree, GFP_KERNEL); | 273 | RADIX_TREE(tree, GFP_KERNEL); |
249 | struct radix_tree_iter iter; | 274 | struct radix_tree_iter iter; |
250 | void **slot; | 275 | void **slot; |
251 | unsigned long first = 0; | ||
252 | int i, j; | 276 | int i, j; |
253 | 277 | ||
254 | printf("Multiorder tagged iteration test\n"); | 278 | printf("Multiorder tagged iteration test\n"); |
@@ -269,7 +293,7 @@ void multiorder_tagged_iteration(void) | |||
269 | assert(radix_tree_tag_set(&tree, tag_index[i], 1)); | 293 | assert(radix_tree_tag_set(&tree, tag_index[i], 1)); |
270 | 294 | ||
271 | for (j = 0; j < 256; j++) { | 295 | for (j = 0; j < 256; j++) { |
272 | int mask, k; | 296 | int k; |
273 | 297 | ||
274 | for (i = 0; i < TAG_ENTRIES; i++) { | 298 | for (i = 0; i < TAG_ENTRIES; i++) { |
275 | for (k = i; index[k] < tag_index[i]; k++) | 299 | for (k = i; index[k] < tag_index[i]; k++) |
@@ -279,18 +303,22 @@ void multiorder_tagged_iteration(void) | |||
279 | } | 303 | } |
280 | 304 | ||
281 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { | 305 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { |
306 | unsigned long mask; | ||
307 | struct item *item = *slot; | ||
282 | for (k = i; index[k] < tag_index[i]; k++) | 308 | for (k = i; index[k] < tag_index[i]; k++) |
283 | ; | 309 | ; |
284 | mask = (1 << order[k]) - 1; | 310 | mask = (1UL << order[k]) - 1; |
285 | 311 | ||
286 | assert(iter.index >= (tag_index[i] &~ mask)); | 312 | assert((iter.index | mask) == (tag_index[i] | mask)); |
287 | assert(iter.index <= (tag_index[i] | mask)); | 313 | assert(!radix_tree_is_internal_node(item)); |
314 | assert((item->index | mask) == (tag_index[i] | mask)); | ||
315 | assert(item->order == order[k]); | ||
288 | i++; | 316 | i++; |
289 | } | 317 | } |
290 | } | 318 | } |
291 | 319 | ||
292 | radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, | 320 | assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) == |
293 | MT_NUM_ENTRIES, 1, 2); | 321 | TAG_ENTRIES); |
294 | 322 | ||
295 | for (j = 0; j < 256; j++) { | 323 | for (j = 0; j < 256; j++) { |
296 | int mask, k; | 324 | int mask, k; |
@@ -303,19 +331,21 @@ void multiorder_tagged_iteration(void) | |||
303 | } | 331 | } |
304 | 332 | ||
305 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { | 333 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { |
334 | struct item *item = *slot; | ||
306 | for (k = i; index[k] < tag_index[i]; k++) | 335 | for (k = i; index[k] < tag_index[i]; k++) |
307 | ; | 336 | ; |
308 | mask = (1 << order[k]) - 1; | 337 | mask = (1 << order[k]) - 1; |
309 | 338 | ||
310 | assert(iter.index >= (tag_index[i] &~ mask)); | 339 | assert((iter.index | mask) == (tag_index[i] | mask)); |
311 | assert(iter.index <= (tag_index[i] | mask)); | 340 | assert(!radix_tree_is_internal_node(item)); |
341 | assert((item->index | mask) == (tag_index[i] | mask)); | ||
342 | assert(item->order == order[k]); | ||
312 | i++; | 343 | i++; |
313 | } | 344 | } |
314 | } | 345 | } |
315 | 346 | ||
316 | first = 1; | 347 | assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0) |
317 | radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, | 348 | == TAG_ENTRIES); |
318 | MT_NUM_ENTRIES, 1, 0); | ||
319 | i = 0; | 349 | i = 0; |
320 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { | 350 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { |
321 | assert(iter.index == tag_index[i]); | 351 | assert(iter.index == tag_index[i]); |
@@ -325,6 +355,261 @@ void multiorder_tagged_iteration(void) | |||
325 | item_kill_tree(&tree); | 355 | item_kill_tree(&tree); |
326 | } | 356 | } |
327 | 357 | ||
358 | static void multiorder_join1(unsigned long index, | ||
359 | unsigned order1, unsigned order2) | ||
360 | { | ||
361 | unsigned long loc; | ||
362 | void *item, *item2 = item_create(index + 1, order1); | ||
363 | RADIX_TREE(tree, GFP_KERNEL); | ||
364 | |||
365 | item_insert_order(&tree, index, order2); | ||
366 | item = radix_tree_lookup(&tree, index); | ||
367 | radix_tree_join(&tree, index + 1, order1, item2); | ||
368 | loc = find_item(&tree, item); | ||
369 | if (loc == -1) | ||
370 | free(item); | ||
371 | item = radix_tree_lookup(&tree, index + 1); | ||
372 | assert(item == item2); | ||
373 | item_kill_tree(&tree); | ||
374 | } | ||
375 | |||
376 | static void multiorder_join2(unsigned order1, unsigned order2) | ||
377 | { | ||
378 | RADIX_TREE(tree, GFP_KERNEL); | ||
379 | struct radix_tree_node *node; | ||
380 | void *item1 = item_create(0, order1); | ||
381 | void *item2; | ||
382 | |||
383 | item_insert_order(&tree, 0, order2); | ||
384 | radix_tree_insert(&tree, 1 << order2, (void *)0x12UL); | ||
385 | item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); | ||
386 | assert(item2 == (void *)0x12UL); | ||
387 | assert(node->exceptional == 1); | ||
388 | |||
389 | radix_tree_join(&tree, 0, order1, item1); | ||
390 | item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); | ||
391 | assert(item2 == item1); | ||
392 | assert(node->exceptional == 0); | ||
393 | item_kill_tree(&tree); | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * This test revealed an accounting bug for exceptional entries at one point. | ||
398 | * Nodes were being freed back into the pool with an elevated exception count | ||
399 | * by radix_tree_join() and then radix_tree_split() was failing to zero the | ||
400 | * count of exceptional entries. | ||
401 | */ | ||
402 | static void multiorder_join3(unsigned int order) | ||
403 | { | ||
404 | RADIX_TREE(tree, GFP_KERNEL); | ||
405 | struct radix_tree_node *node; | ||
406 | void **slot; | ||
407 | struct radix_tree_iter iter; | ||
408 | unsigned long i; | ||
409 | |||
410 | for (i = 0; i < (1 << order); i++) { | ||
411 | radix_tree_insert(&tree, i, (void *)0x12UL); | ||
412 | } | ||
413 | |||
414 | radix_tree_join(&tree, 0, order, (void *)0x16UL); | ||
415 | rcu_barrier(); | ||
416 | |||
417 | radix_tree_split(&tree, 0, 0); | ||
418 | |||
419 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
420 | radix_tree_iter_replace(&tree, &iter, slot, (void *)0x12UL); | ||
421 | } | ||
422 | |||
423 | __radix_tree_lookup(&tree, 0, &node, NULL); | ||
424 | assert(node->exceptional == node->count); | ||
425 | |||
426 | item_kill_tree(&tree); | ||
427 | } | ||
428 | |||
429 | static void multiorder_join(void) | ||
430 | { | ||
431 | int i, j, idx; | ||
432 | |||
433 | for (idx = 0; idx < 1024; idx = idx * 2 + 3) { | ||
434 | for (i = 1; i < 15; i++) { | ||
435 | for (j = 0; j < i; j++) { | ||
436 | multiorder_join1(idx, i, j); | ||
437 | } | ||
438 | } | ||
439 | } | ||
440 | |||
441 | for (i = 1; i < 15; i++) { | ||
442 | for (j = 0; j < i; j++) { | ||
443 | multiorder_join2(i, j); | ||
444 | } | ||
445 | } | ||
446 | |||
447 | for (i = 3; i < 10; i++) { | ||
448 | multiorder_join3(i); | ||
449 | } | ||
450 | } | ||
451 | |||
452 | static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc) | ||
453 | { | ||
454 | struct radix_tree_preload *rtp = &radix_tree_preloads; | ||
455 | if (rtp->nr != 0) | ||
456 | printf("split(%u %u) remaining %u\n", old_order, new_order, | ||
457 | rtp->nr); | ||
458 | /* | ||
459 | * Can't check for equality here as some nodes may have been | ||
460 | * RCU-freed while we ran. But we should never finish with more | ||
461 | * nodes allocated since they should have all been preloaded. | ||
462 | */ | ||
463 | if (nr_allocated > alloc) | ||
464 | printf("split(%u %u) allocated %u %u\n", old_order, new_order, | ||
465 | alloc, nr_allocated); | ||
466 | } | ||
467 | |||
468 | static void __multiorder_split(int old_order, int new_order) | ||
469 | { | ||
470 | RADIX_TREE(tree, GFP_ATOMIC); | ||
471 | void **slot; | ||
472 | struct radix_tree_iter iter; | ||
473 | unsigned alloc; | ||
474 | |||
475 | radix_tree_preload(GFP_KERNEL); | ||
476 | assert(item_insert_order(&tree, 0, old_order) == 0); | ||
477 | radix_tree_preload_end(); | ||
478 | |||
479 | /* Wipe out the preloaded cache or it'll confuse check_mem() */ | ||
480 | radix_tree_cpu_dead(0); | ||
481 | |||
482 | radix_tree_tag_set(&tree, 0, 2); | ||
483 | |||
484 | radix_tree_split_preload(old_order, new_order, GFP_KERNEL); | ||
485 | alloc = nr_allocated; | ||
486 | radix_tree_split(&tree, 0, new_order); | ||
487 | check_mem(old_order, new_order, alloc); | ||
488 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
489 | radix_tree_iter_replace(&tree, &iter, slot, | ||
490 | item_create(iter.index, new_order)); | ||
491 | } | ||
492 | radix_tree_preload_end(); | ||
493 | |||
494 | item_kill_tree(&tree); | ||
495 | } | ||
496 | |||
497 | static void __multiorder_split2(int old_order, int new_order) | ||
498 | { | ||
499 | RADIX_TREE(tree, GFP_KERNEL); | ||
500 | void **slot; | ||
501 | struct radix_tree_iter iter; | ||
502 | struct radix_tree_node *node; | ||
503 | void *item; | ||
504 | |||
505 | __radix_tree_insert(&tree, 0, old_order, (void *)0x12); | ||
506 | |||
507 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
508 | assert(item == (void *)0x12); | ||
509 | assert(node->exceptional > 0); | ||
510 | |||
511 | radix_tree_split(&tree, 0, new_order); | ||
512 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
513 | radix_tree_iter_replace(&tree, &iter, slot, | ||
514 | item_create(iter.index, new_order)); | ||
515 | } | ||
516 | |||
517 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
518 | assert(item != (void *)0x12); | ||
519 | assert(node->exceptional == 0); | ||
520 | |||
521 | item_kill_tree(&tree); | ||
522 | } | ||
523 | |||
524 | static void __multiorder_split3(int old_order, int new_order) | ||
525 | { | ||
526 | RADIX_TREE(tree, GFP_KERNEL); | ||
527 | void **slot; | ||
528 | struct radix_tree_iter iter; | ||
529 | struct radix_tree_node *node; | ||
530 | void *item; | ||
531 | |||
532 | __radix_tree_insert(&tree, 0, old_order, (void *)0x12); | ||
533 | |||
534 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
535 | assert(item == (void *)0x12); | ||
536 | assert(node->exceptional > 0); | ||
537 | |||
538 | radix_tree_split(&tree, 0, new_order); | ||
539 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
540 | radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16); | ||
541 | } | ||
542 | |||
543 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
544 | assert(item == (void *)0x16); | ||
545 | assert(node->exceptional > 0); | ||
546 | |||
547 | item_kill_tree(&tree); | ||
548 | |||
549 | __radix_tree_insert(&tree, 0, old_order, (void *)0x12); | ||
550 | |||
551 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
552 | assert(item == (void *)0x12); | ||
553 | assert(node->exceptional > 0); | ||
554 | |||
555 | radix_tree_split(&tree, 0, new_order); | ||
556 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
557 | if (iter.index == (1 << new_order)) | ||
558 | radix_tree_iter_replace(&tree, &iter, slot, | ||
559 | (void *)0x16); | ||
560 | else | ||
561 | radix_tree_iter_replace(&tree, &iter, slot, NULL); | ||
562 | } | ||
563 | |||
564 | item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL); | ||
565 | assert(item == (void *)0x16); | ||
566 | assert(node->count == node->exceptional); | ||
567 | do { | ||
568 | node = node->parent; | ||
569 | if (!node) | ||
570 | break; | ||
571 | assert(node->count == 1); | ||
572 | assert(node->exceptional == 0); | ||
573 | } while (1); | ||
574 | |||
575 | item_kill_tree(&tree); | ||
576 | } | ||
577 | |||
578 | static void multiorder_split(void) | ||
579 | { | ||
580 | int i, j; | ||
581 | |||
582 | for (i = 3; i < 11; i++) | ||
583 | for (j = 0; j < i; j++) { | ||
584 | __multiorder_split(i, j); | ||
585 | __multiorder_split2(i, j); | ||
586 | __multiorder_split3(i, j); | ||
587 | } | ||
588 | } | ||
589 | |||
590 | static void multiorder_account(void) | ||
591 | { | ||
592 | RADIX_TREE(tree, GFP_KERNEL); | ||
593 | struct radix_tree_node *node; | ||
594 | void **slot; | ||
595 | |||
596 | item_insert_order(&tree, 0, 5); | ||
597 | |||
598 | __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); | ||
599 | __radix_tree_lookup(&tree, 0, &node, NULL); | ||
600 | assert(node->count == node->exceptional * 2); | ||
601 | radix_tree_delete(&tree, 1 << 5); | ||
602 | assert(node->exceptional == 0); | ||
603 | |||
604 | __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); | ||
605 | __radix_tree_lookup(&tree, 1 << 5, &node, &slot); | ||
606 | assert(node->count == node->exceptional * 2); | ||
607 | __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL); | ||
608 | assert(node->exceptional == 0); | ||
609 | |||
610 | item_kill_tree(&tree); | ||
611 | } | ||
612 | |||
328 | void multiorder_checks(void) | 613 | void multiorder_checks(void) |
329 | { | 614 | { |
330 | int i; | 615 | int i; |
@@ -342,4 +627,9 @@ void multiorder_checks(void) | |||
342 | multiorder_tag_tests(); | 627 | multiorder_tag_tests(); |
343 | multiorder_iteration(); | 628 | multiorder_iteration(); |
344 | multiorder_tagged_iteration(); | 629 | multiorder_tagged_iteration(); |
630 | multiorder_join(); | ||
631 | multiorder_split(); | ||
632 | multiorder_account(); | ||
633 | |||
634 | radix_tree_cpu_dead(0); | ||
345 | } | 635 | } |
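The join, split and account tests above all drive the new API the same way: preload, perform the restructuring call, then walk the affected slots with radix_tree_for_each_slot() and install replacements via radix_tree_iter_replace(). A minimal sketch of the split pattern, using only calls that already appear in __multiorder_split() above (error handling omitted, so treat it as an illustration rather than a drop-in test):

        /* Replace one order-old_order entry at index 0 with smaller entries. */
        static void split_and_replace(struct radix_tree_root *root,
                                      unsigned old_order, unsigned new_order)
        {
                struct radix_tree_iter iter;
                void **slot;

                /* Reserve enough nodes to split one order-old_order entry. */
                radix_tree_split_preload(old_order, new_order, GFP_KERNEL);

                /* Turn the single large entry at index 0 into smaller ones... */
                radix_tree_split(root, 0, new_order);

                /* ...and give each resulting slot its own item of the new order. */
                radix_tree_for_each_slot(slot, root, &iter, 0)
                        radix_tree_iter_replace(root, &iter, slot,
                                                item_create(iter.index, new_order));

                radix_tree_preload_end();
        }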
diff --git a/tools/testing/radix-tree/rcupdate.c b/tools/testing/radix-tree/rcupdate.c deleted file mode 100644 index 31a2d14225d6..000000000000 --- a/tools/testing/radix-tree/rcupdate.c +++ /dev/null | |||
@@ -1,86 +0,0 @@ | |||
1 | #include <linux/rcupdate.h> | ||
2 | #include <pthread.h> | ||
3 | #include <stdio.h> | ||
4 | #include <assert.h> | ||
5 | |||
6 | static pthread_mutex_t rculock = PTHREAD_MUTEX_INITIALIZER; | ||
7 | static struct rcu_head *rcuhead_global = NULL; | ||
8 | static __thread int nr_rcuhead = 0; | ||
9 | static __thread struct rcu_head *rcuhead = NULL; | ||
10 | static __thread struct rcu_head *rcutail = NULL; | ||
11 | |||
12 | static pthread_cond_t rcu_worker_cond = PTHREAD_COND_INITIALIZER; | ||
13 | |||
14 | /* switch to urcu implementation when it is merged. */ | ||
15 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)) | ||
16 | { | ||
17 | head->func = func; | ||
18 | head->next = rcuhead; | ||
19 | rcuhead = head; | ||
20 | if (!rcutail) | ||
21 | rcutail = head; | ||
22 | nr_rcuhead++; | ||
23 | if (nr_rcuhead >= 1000) { | ||
24 | int signal = 0; | ||
25 | |||
26 | pthread_mutex_lock(&rculock); | ||
27 | if (!rcuhead_global) | ||
28 | signal = 1; | ||
29 | rcutail->next = rcuhead_global; | ||
30 | rcuhead_global = head; | ||
31 | pthread_mutex_unlock(&rculock); | ||
32 | |||
33 | nr_rcuhead = 0; | ||
34 | rcuhead = NULL; | ||
35 | rcutail = NULL; | ||
36 | |||
37 | if (signal) { | ||
38 | pthread_cond_signal(&rcu_worker_cond); | ||
39 | } | ||
40 | } | ||
41 | } | ||
42 | |||
43 | static void *rcu_worker(void *arg) | ||
44 | { | ||
45 | struct rcu_head *r; | ||
46 | |||
47 | rcupdate_thread_init(); | ||
48 | |||
49 | while (1) { | ||
50 | pthread_mutex_lock(&rculock); | ||
51 | while (!rcuhead_global) { | ||
52 | pthread_cond_wait(&rcu_worker_cond, &rculock); | ||
53 | } | ||
54 | r = rcuhead_global; | ||
55 | rcuhead_global = NULL; | ||
56 | |||
57 | pthread_mutex_unlock(&rculock); | ||
58 | |||
59 | synchronize_rcu(); | ||
60 | |||
61 | while (r) { | ||
62 | struct rcu_head *tmp = r->next; | ||
63 | r->func(r); | ||
64 | r = tmp; | ||
65 | } | ||
66 | } | ||
67 | |||
68 | rcupdate_thread_exit(); | ||
69 | |||
70 | return NULL; | ||
71 | } | ||
72 | |||
73 | static pthread_t worker_thread; | ||
74 | void rcupdate_init(void) | ||
75 | { | ||
76 | pthread_create(&worker_thread, NULL, rcu_worker, NULL); | ||
77 | } | ||
78 | |||
79 | void rcupdate_thread_init(void) | ||
80 | { | ||
81 | rcu_register_thread(); | ||
82 | } | ||
83 | void rcupdate_thread_exit(void) | ||
84 | { | ||
85 | rcu_unregister_thread(); | ||
86 | } | ||
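The file removed above was a stop-gap: its own comment says "switch to urcu implementation when it is merged", and with that switch the test suite obtains synchronize_rcu() and friends from liburcu instead of this hand-rolled worker thread. A rough sketch of a test thread's lifecycle against liburcu (the header and flavour selection are an assumption; the patch itself only shows the deletion):

        #include <urcu.h>               /* assumed liburcu flavour, see note above */

        static void *test_thread(void *arg)
        {
                rcu_register_thread();          /* replaces rcupdate_thread_init() */

                rcu_read_lock();
                /* ... radix tree lookups run under the RCU read lock ... */
                rcu_read_unlock();

                rcu_unregister_thread();        /* replaces rcupdate_thread_exit() */
                return NULL;
        }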
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 63bf347aaf33..a41325d7a170 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <stdio.h> | 50 | #include <stdio.h> |
51 | 51 | ||
52 | #include "regression.h" | 52 | #include "regression.h" |
53 | #include "test.h" | ||
53 | 54 | ||
54 | #define PAGECACHE_TAG_DIRTY 0 | 55 | #define PAGECACHE_TAG_DIRTY 0 |
55 | #define PAGECACHE_TAG_WRITEBACK 1 | 56 | #define PAGECACHE_TAG_WRITEBACK 1 |
@@ -90,7 +91,7 @@ void regression2_test(void) | |||
90 | /* 1. */ | 91 | /* 1. */ |
91 | start = 0; | 92 | start = 0; |
92 | end = max_slots - 2; | 93 | end = max_slots - 2; |
93 | radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1, | 94 | tag_tagged_items(&mt_tree, NULL, start, end, 1, |
94 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); | 95 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); |
95 | 96 | ||
96 | /* 2. */ | 97 | /* 2. */ |
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c index 1f06ed73d0a8..b594841fae85 100644 --- a/tools/testing/radix-tree/regression3.c +++ b/tools/testing/radix-tree/regression3.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * In the following radix_tree_next_slot the current chunk size becomes zero. | 5 | * In the following radix_tree_next_slot the current chunk size becomes zero. |
6 | * This isn't checked and it tries to dereference a null pointer in slot. | 6 | * This isn't checked and it tries to dereference a null pointer in slot. |
7 | * | 7 | * |
8 | * Helper radix_tree_iter_next resets slot to NULL and next_index to index + 1, | 8 | * Helper radix_tree_iter_resume resets slot to NULL and next_index to index + 1, |
9 | * for tagged iteration it must also reset the cached tags in the iterator to abort | 9 | * for tagged iteration it must also reset the cached tags in the iterator to abort |
10 | * the next radix_tree_next_slot and go to the slow path in radix_tree_next_chunk. | 10 | * the next radix_tree_next_slot and go to the slow path in radix_tree_next_chunk. |
11 | * | 11 | * |
@@ -88,7 +88,7 @@ void regression3_test(void) | |||
88 | printf("slot %ld %p\n", iter.index, *slot); | 88 | printf("slot %ld %p\n", iter.index, *slot); |
89 | if (!iter.index) { | 89 | if (!iter.index) { |
90 | printf("next at %ld\n", iter.index); | 90 | printf("next at %ld\n", iter.index); |
91 | slot = radix_tree_iter_next(&iter); | 91 | slot = radix_tree_iter_resume(slot, &iter); |
92 | } | 92 | } |
93 | } | 93 | } |
94 | 94 | ||
@@ -96,7 +96,7 @@ void regression3_test(void) | |||
96 | printf("contig %ld %p\n", iter.index, *slot); | 96 | printf("contig %ld %p\n", iter.index, *slot); |
97 | if (!iter.index) { | 97 | if (!iter.index) { |
98 | printf("next at %ld\n", iter.index); | 98 | printf("next at %ld\n", iter.index); |
99 | slot = radix_tree_iter_next(&iter); | 99 | slot = radix_tree_iter_resume(slot, &iter); |
100 | } | 100 | } |
101 | } | 101 | } |
102 | 102 | ||
@@ -106,7 +106,7 @@ void regression3_test(void) | |||
106 | printf("tagged %ld %p\n", iter.index, *slot); | 106 | printf("tagged %ld %p\n", iter.index, *slot); |
107 | if (!iter.index) { | 107 | if (!iter.index) { |
108 | printf("next at %ld\n", iter.index); | 108 | printf("next at %ld\n", iter.index); |
109 | slot = radix_tree_iter_next(&iter); | 109 | slot = radix_tree_iter_resume(slot, &iter); |
110 | } | 110 | } |
111 | } | 111 | } |
112 | 112 | ||
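The comment rewritten at the top of this file is the whole contract of radix_tree_iter_resume(): once the tree has been modified inside the loop, the iterator's cached chunk and tags are stale, so the caller resets them and lets the next iteration take the slow path through radix_tree_next_chunk(). The shape of that pattern, distilled from the loops above (the helper name is illustrative):

        static void modify_while_iterating(struct radix_tree_root *root)
        {
                struct radix_tree_iter iter;
                void **slot;

                radix_tree_for_each_slot(slot, root, &iter, 0) {
                        /* Any modification invalidates the iterator's cached chunk. */
                        radix_tree_delete(root, iter.index);
                        /* Reset slot/next_index/tags; the next step re-walks the tree. */
                        slot = radix_tree_iter_resume(slot, &iter);
                }
        }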
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c index b0ac05741750..fd98c132207a 100644 --- a/tools/testing/radix-tree/tag_check.c +++ b/tools/testing/radix-tree/tag_check.c | |||
@@ -23,7 +23,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag) | |||
23 | item_tag_set(tree, index, tag); | 23 | item_tag_set(tree, index, tag); |
24 | ret = item_tag_get(tree, index, tag); | 24 | ret = item_tag_get(tree, index, tag); |
25 | assert(ret != 0); | 25 | assert(ret != 0); |
26 | ret = radix_tree_range_tag_if_tagged(tree, &first, ~0UL, 10, tag, !tag); | 26 | ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag); |
27 | assert(ret == 1); | 27 | assert(ret == 1); |
28 | ret = item_tag_get(tree, index, !tag); | 28 | ret = item_tag_get(tree, index, !tag); |
29 | assert(ret != 0); | 29 | assert(ret != 0); |
@@ -51,6 +51,7 @@ void simple_checks(void) | |||
51 | verify_tag_consistency(&tree, 1); | 51 | verify_tag_consistency(&tree, 1); |
52 | printf("before item_kill_tree: %d allocated\n", nr_allocated); | 52 | printf("before item_kill_tree: %d allocated\n", nr_allocated); |
53 | item_kill_tree(&tree); | 53 | item_kill_tree(&tree); |
54 | rcu_barrier(); | ||
54 | printf("after item_kill_tree: %d allocated\n", nr_allocated); | 55 | printf("after item_kill_tree: %d allocated\n", nr_allocated); |
55 | } | 56 | } |
56 | 57 | ||
@@ -319,10 +320,13 @@ static void single_check(void) | |||
319 | assert(ret == 0); | 320 | assert(ret == 0); |
320 | verify_tag_consistency(&tree, 0); | 321 | verify_tag_consistency(&tree, 0); |
321 | verify_tag_consistency(&tree, 1); | 322 | verify_tag_consistency(&tree, 1); |
322 | ret = radix_tree_range_tag_if_tagged(&tree, &first, 10, 10, 0, 1); | 323 | ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1); |
323 | assert(ret == 1); | 324 | assert(ret == 1); |
324 | ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); | 325 | ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); |
325 | assert(ret == 1); | 326 | assert(ret == 1); |
327 | item_tag_clear(&tree, 0, 0); | ||
328 | ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 0); | ||
329 | assert(ret == 0); | ||
326 | item_kill_tree(&tree); | 330 | item_kill_tree(&tree); |
327 | } | 331 | } |
328 | 332 | ||
@@ -331,12 +335,16 @@ void tag_check(void) | |||
331 | single_check(); | 335 | single_check(); |
332 | extend_checks(); | 336 | extend_checks(); |
333 | contract_checks(); | 337 | contract_checks(); |
338 | rcu_barrier(); | ||
334 | printf("after extend_checks: %d allocated\n", nr_allocated); | 339 | printf("after extend_checks: %d allocated\n", nr_allocated); |
335 | __leak_check(); | 340 | __leak_check(); |
336 | leak_check(); | 341 | leak_check(); |
342 | rcu_barrier(); | ||
337 | printf("after leak_check: %d allocated\n", nr_allocated); | 343 | printf("after leak_check: %d allocated\n", nr_allocated); |
338 | simple_checks(); | 344 | simple_checks(); |
345 | rcu_barrier(); | ||
339 | printf("after simple_checks: %d allocated\n", nr_allocated); | 346 | printf("after simple_checks: %d allocated\n", nr_allocated); |
340 | thrash_tags(); | 347 | thrash_tags(); |
348 | rcu_barrier(); | ||
341 | printf("after thrash_tags: %d allocated\n", nr_allocated); | 349 | printf("after thrash_tags: %d allocated\n", nr_allocated); |
342 | } | 350 | } |
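The rcu_barrier() calls added here exist for the same reason check_mem() in multiorder.c only checks an upper bound: radix tree nodes are freed from RCU callbacks, so nr_allocated is only trustworthy once the pending callbacks have run. A small sketch of the resulting idiom (the helper name is illustrative; the tests above simply inline it):

        static void report_allocated(const char *stage)
        {
                /* Nodes are freed via RCU; flush the callbacks before counting. */
                rcu_barrier();
                printf("after %s: %d allocated\n", stage, nr_allocated);
        }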
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index a6e8099eaf4f..e5726e373646 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c | |||
@@ -24,21 +24,29 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag) | |||
24 | return radix_tree_tag_get(root, index, tag); | 24 | return radix_tree_tag_get(root, index, tag); |
25 | } | 25 | } |
26 | 26 | ||
27 | int __item_insert(struct radix_tree_root *root, struct item *item, | 27 | int __item_insert(struct radix_tree_root *root, struct item *item) |
28 | unsigned order) | ||
29 | { | 28 | { |
30 | return __radix_tree_insert(root, item->index, order, item); | 29 | return __radix_tree_insert(root, item->index, item->order, item); |
31 | } | 30 | } |
32 | 31 | ||
33 | int item_insert(struct radix_tree_root *root, unsigned long index) | 32 | int item_insert(struct radix_tree_root *root, unsigned long index) |
34 | { | 33 | { |
35 | return __item_insert(root, item_create(index), 0); | 34 | return __item_insert(root, item_create(index, 0)); |
36 | } | 35 | } |
37 | 36 | ||
38 | int item_insert_order(struct radix_tree_root *root, unsigned long index, | 37 | int item_insert_order(struct radix_tree_root *root, unsigned long index, |
39 | unsigned order) | 38 | unsigned order) |
40 | { | 39 | { |
41 | return __item_insert(root, item_create(index), order); | 40 | return __item_insert(root, item_create(index, order)); |
41 | } | ||
42 | |||
43 | void item_sanity(struct item *item, unsigned long index) | ||
44 | { | ||
45 | unsigned long mask; | ||
46 | assert(!radix_tree_is_internal_node(item)); | ||
47 | assert(item->order < BITS_PER_LONG); | ||
48 | mask = (1UL << item->order) - 1; | ||
49 | assert((item->index | mask) == (index | mask)); | ||
42 | } | 50 | } |
43 | 51 | ||
44 | int item_delete(struct radix_tree_root *root, unsigned long index) | 52 | int item_delete(struct radix_tree_root *root, unsigned long index) |
@@ -46,18 +54,19 @@ int item_delete(struct radix_tree_root *root, unsigned long index) | |||
46 | struct item *item = radix_tree_delete(root, index); | 54 | struct item *item = radix_tree_delete(root, index); |
47 | 55 | ||
48 | if (item) { | 56 | if (item) { |
49 | assert(item->index == index); | 57 | item_sanity(item, index); |
50 | free(item); | 58 | free(item); |
51 | return 1; | 59 | return 1; |
52 | } | 60 | } |
53 | return 0; | 61 | return 0; |
54 | } | 62 | } |
55 | 63 | ||
56 | struct item *item_create(unsigned long index) | 64 | struct item *item_create(unsigned long index, unsigned int order) |
57 | { | 65 | { |
58 | struct item *ret = malloc(sizeof(*ret)); | 66 | struct item *ret = malloc(sizeof(*ret)); |
59 | 67 | ||
60 | ret->index = index; | 68 | ret->index = index; |
69 | ret->order = order; | ||
61 | return ret; | 70 | return ret; |
62 | } | 71 | } |
63 | 72 | ||
@@ -66,8 +75,8 @@ void item_check_present(struct radix_tree_root *root, unsigned long index) | |||
66 | struct item *item; | 75 | struct item *item; |
67 | 76 | ||
68 | item = radix_tree_lookup(root, index); | 77 | item = radix_tree_lookup(root, index); |
69 | assert(item != 0); | 78 | assert(item != NULL); |
70 | assert(item->index == index); | 79 | item_sanity(item, index); |
71 | } | 80 | } |
72 | 81 | ||
73 | struct item *item_lookup(struct radix_tree_root *root, unsigned long index) | 82 | struct item *item_lookup(struct radix_tree_root *root, unsigned long index) |
@@ -80,7 +89,7 @@ void item_check_absent(struct radix_tree_root *root, unsigned long index) | |||
80 | struct item *item; | 89 | struct item *item; |
81 | 90 | ||
82 | item = radix_tree_lookup(root, index); | 91 | item = radix_tree_lookup(root, index); |
83 | assert(item == 0); | 92 | assert(item == NULL); |
84 | } | 93 | } |
85 | 94 | ||
86 | /* | 95 | /* |
@@ -142,6 +151,62 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, | |||
142 | assert(nfound == 0); | 151 | assert(nfound == 0); |
143 | } | 152 | } |
144 | 153 | ||
154 | /* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */ | ||
155 | int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock, | ||
156 | unsigned long start, unsigned long end, unsigned batch, | ||
157 | unsigned iftag, unsigned thentag) | ||
158 | { | ||
159 | unsigned long tagged = 0; | ||
160 | struct radix_tree_iter iter; | ||
161 | void **slot; | ||
162 | |||
163 | if (batch == 0) | ||
164 | batch = 1; | ||
165 | |||
166 | if (lock) | ||
167 | pthread_mutex_lock(lock); | ||
168 | radix_tree_for_each_tagged(slot, root, &iter, start, iftag) { | ||
169 | if (iter.index > end) | ||
170 | break; | ||
171 | radix_tree_iter_tag_set(root, &iter, thentag); | ||
172 | tagged++; | ||
173 | if ((tagged % batch) != 0) | ||
174 | continue; | ||
175 | slot = radix_tree_iter_resume(slot, &iter); | ||
176 | if (lock) { | ||
177 | pthread_mutex_unlock(lock); | ||
178 | rcu_barrier(); | ||
179 | pthread_mutex_lock(lock); | ||
180 | } | ||
181 | } | ||
182 | if (lock) | ||
183 | pthread_mutex_unlock(lock); | ||
184 | |||
185 | return tagged; | ||
186 | } | ||
187 | |||
188 | /* Use the same pattern as find_swap_entry() in mm/shmem.c */ | ||
189 | unsigned long find_item(struct radix_tree_root *root, void *item) | ||
190 | { | ||
191 | struct radix_tree_iter iter; | ||
192 | void **slot; | ||
193 | unsigned long found = -1; | ||
194 | unsigned long checked = 0; | ||
195 | |||
196 | radix_tree_for_each_slot(slot, root, &iter, 0) { | ||
197 | if (*slot == item) { | ||
198 | found = iter.index; | ||
199 | break; | ||
200 | } | ||
201 | checked++; | ||
202 | if ((checked % 4) != 0) | ||
203 | continue; | ||
204 | slot = radix_tree_iter_resume(slot, &iter); | ||
205 | } | ||
206 | |||
207 | return found; | ||
208 | } | ||
209 | |||
145 | static int verify_node(struct radix_tree_node *slot, unsigned int tag, | 210 | static int verify_node(struct radix_tree_node *slot, unsigned int tag, |
146 | int tagged) | 211 | int tagged) |
147 | { | 212 | { |
@@ -200,9 +265,16 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) | |||
200 | 265 | ||
201 | void item_kill_tree(struct radix_tree_root *root) | 266 | void item_kill_tree(struct radix_tree_root *root) |
202 | { | 267 | { |
268 | struct radix_tree_iter iter; | ||
269 | void **slot; | ||
203 | struct item *items[32]; | 270 | struct item *items[32]; |
204 | int nfound; | 271 | int nfound; |
205 | 272 | ||
273 | radix_tree_for_each_slot(slot, root, &iter, 0) { | ||
274 | if (radix_tree_exceptional_entry(*slot)) | ||
275 | radix_tree_delete(root, iter.index); | ||
276 | } | ||
277 | |||
206 | while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { | 278 | while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { |
207 | int i; | 279 | int i; |
208 | 280 | ||
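tag_tagged_items() and find_item() above both encode the same batching idiom: touch a few entries, then radix_tree_iter_resume() so the tree can change (or a lock can be dropped) before carrying on. A sketch of a caller, modelled on the converted call sites in regression2.c and tag_check.c; the wrapper name is illustrative, the PAGECACHE_TAG_* constants come from regression2.c, and the test-suite headers are assumed to be included:

        static void retag_and_locate(struct radix_tree_root *tree,
                                     unsigned long end, struct item *needle)
        {
                unsigned long index;

                /* Retag every DIRTY entry in [0, end] as TOWRITE, resuming the
                 * iterator every 10 entries; NULL means there is no lock to drop. */
                tag_tagged_items(tree, NULL, 0, end, 10,
                                 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);

                /* shmem-style reverse lookup: index of a known pointer, or -1. */
                index = find_item(tree, needle);
                assert(index != (unsigned long)-1);
        }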
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 217fb2403f09..056a23b56467 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h | |||
@@ -5,11 +5,11 @@ | |||
5 | 5 | ||
6 | struct item { | 6 | struct item { |
7 | unsigned long index; | 7 | unsigned long index; |
8 | unsigned int order; | ||
8 | }; | 9 | }; |
9 | 10 | ||
10 | struct item *item_create(unsigned long index); | 11 | struct item *item_create(unsigned long index, unsigned int order); |
11 | int __item_insert(struct radix_tree_root *root, struct item *item, | 12 | int __item_insert(struct radix_tree_root *root, struct item *item); |
12 | unsigned order); | ||
13 | int item_insert(struct radix_tree_root *root, unsigned long index); | 13 | int item_insert(struct radix_tree_root *root, unsigned long index); |
14 | int item_insert_order(struct radix_tree_root *root, unsigned long index, | 14 | int item_insert_order(struct radix_tree_root *root, unsigned long index, |
15 | unsigned order); | 15 | unsigned order); |
@@ -25,9 +25,15 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, | |||
25 | unsigned long nr, int chunk); | 25 | unsigned long nr, int chunk); |
26 | void item_kill_tree(struct radix_tree_root *root); | 26 | void item_kill_tree(struct radix_tree_root *root); |
27 | 27 | ||
28 | int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, | ||
29 | unsigned long start, unsigned long end, unsigned batch, | ||
30 | unsigned iftag, unsigned thentag); | ||
31 | unsigned long find_item(struct radix_tree_root *, void *item); | ||
32 | |||
28 | void tag_check(void); | 33 | void tag_check(void); |
29 | void multiorder_checks(void); | 34 | void multiorder_checks(void); |
30 | void iteration_test(void); | 35 | void iteration_test(unsigned order, unsigned duration); |
36 | void benchmark(void); | ||
31 | 37 | ||
32 | struct item * | 38 | struct item * |
33 | item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); | 39 | item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); |
@@ -40,7 +46,14 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); | |||
40 | extern int nr_allocated; | 46 | extern int nr_allocated; |
41 | 47 | ||
42 | /* Normally private parts of lib/radix-tree.c */ | 48 | /* Normally private parts of lib/radix-tree.c */ |
49 | struct radix_tree_node *entry_to_node(void *ptr); | ||
43 | void radix_tree_dump(struct radix_tree_root *root); | 50 | void radix_tree_dump(struct radix_tree_root *root); |
44 | int root_tag_get(struct radix_tree_root *root, unsigned int tag); | 51 | int root_tag_get(struct radix_tree_root *root, unsigned int tag); |
45 | unsigned long node_maxindex(struct radix_tree_node *); | 52 | unsigned long node_maxindex(struct radix_tree_node *); |
46 | unsigned long shift_maxindex(unsigned int shift); | 53 | unsigned long shift_maxindex(unsigned int shift); |
54 | int radix_tree_cpu_dead(unsigned int cpu); | ||
55 | struct radix_tree_preload { | ||
56 | unsigned nr; | ||
57 | struct radix_tree_node *nodes; | ||
58 | }; | ||
59 | extern struct radix_tree_preload radix_tree_preloads; | ||
diff --git a/usr/Kconfig b/usr/Kconfig index 572dcf7b6a44..6278f135256d 100644 --- a/usr/Kconfig +++ b/usr/Kconfig | |||
@@ -98,3 +98,130 @@ config RD_LZ4 | |||
98 | help | 98 | help |
99 | Support loading of a LZ4 encoded initial ramdisk or cpio buffer | 99 | Support loading of a LZ4 encoded initial ramdisk or cpio buffer |
100 | If unsure, say N. | 100 | If unsure, say N. |
101 | |||
102 | choice | ||
103 | prompt "Built-in initramfs compression mode" | ||
104 | depends on INITRAMFS_SOURCE!="" | ||
105 | optional | ||
106 | help | ||
107 | This option allows you to decide by which algorithm the builtin | ||
108 | initramfs will be compressed. Several compression algorithms are | ||
109 | available, which differ in efficiency, compression and | ||
110 | decompression speed. Compression speed is only relevant | ||
111 | when building a kernel. Decompression speed is relevant at | ||
112 | each boot. Also the memory usage during decompression may become | ||
113 | relevant on memory-constrained systems. This is usually determined by the | ||
114 | dictionary size of the algorithm, with algorithms like XZ and LZMA | ||
115 | featuring large dictionary sizes. | ||
116 | |||
117 | High compression options are mostly useful for users who are | ||
118 | low on RAM, since they reduce the memory consumption during | ||
119 | boot. | ||
120 | |||
121 | Keep in mind that your build system needs to provide the appropriate | ||
122 | compression tool to compress the generated initramfs cpio file for | ||
123 | embedding. | ||
124 | |||
125 | If in doubt, select 'None'. | ||
126 | |||
127 | config INITRAMFS_COMPRESSION_NONE | ||
128 | bool "None" | ||
129 | help | ||
130 | Do not compress the built-in initramfs at all. This may sound wasteful | ||
131 | of space, but you should be aware that the built-in initramfs will be | ||
132 | compressed at a later stage anyway, along with the rest of the kernel, | ||
133 | on those architectures that support this. However, not compressing the | ||
134 | initramfs may lead to slightly higher memory consumption for a | ||
135 | short time at boot, while both the cpio image and the unpacked | ||
136 | filesystem image will be present in memory simultaneously. | ||
137 | |||
138 | config INITRAMFS_COMPRESSION_GZIP | ||
139 | bool "Gzip" | ||
140 | depends on RD_GZIP | ||
141 | help | ||
142 | Use the old and well tested gzip compression algorithm. Gzip provides | ||
143 | a good balance between compression ratio and decompression speed and | ||
144 | has a reasonable compression speed. It is also more likely to be | ||
145 | supported by your build system as the gzip tool is present by default | ||
146 | on most distros. | ||
147 | |||
148 | config INITRAMFS_COMPRESSION_BZIP2 | ||
149 | bool "Bzip2" | ||
150 | depends on RD_BZIP2 | ||
151 | help | ||
152 | Its compression ratio and speed are intermediate. Decompression speed | ||
153 | is slowest among the choices. The initramfs size is about 10% smaller | ||
154 | with bzip2, in comparison to gzip. Bzip2 uses a large amount of | ||
155 | memory. For modern kernels you will need at least 8MB RAM or more for | ||
156 | booting. | ||
157 | |||
158 | If you choose this, keep in mind that you need to have the bzip2 tool | ||
159 | available to be able to compress the initramfs. | ||
160 | |||
161 | config INITRAMFS_COMPRESSION_LZMA | ||
162 | bool "LZMA" | ||
163 | depends on RD_LZMA | ||
164 | help | ||
165 | This algorithm has the best compression ratio, but its large dictionary | ||
166 | size might cause issues on memory-constrained systems. | ||
167 | Decompression speed falls between the other choices. Compression is | ||
168 | slowest. The initramfs size is about 33% smaller with LZMA in | ||
169 | comparison to gzip. | ||
170 | |||
171 | If you choose this, keep in mind that you may need to install the xz | ||
172 | or lzma tools to be able to compress the initramfs. | ||
173 | |||
174 | config INITRAMFS_COMPRESSION_XZ | ||
175 | bool "XZ" | ||
176 | depends on RD_XZ | ||
177 | help | ||
178 | XZ uses the LZMA2 algorithm and has a large dictionary which may cause | ||
179 | problems on memory constrained systems. The initramfs size is about | ||
180 | 30% smaller with XZ in comparison to gzip. Decompression speed is | ||
181 | better than that of bzip2 but worse than gzip and LZO. Compression is | ||
182 | slow. | ||
183 | |||
184 | If you choose this, keep in mind that you may need to install the xz | ||
185 | tool to be able to compress the initramfs. | ||
186 | |||
187 | config INITRAMFS_COMPRESSION_LZO | ||
188 | bool "LZO" | ||
189 | depends on RD_LZO | ||
190 | help | ||
191 | Its compression ratio is the second poorest amongst the choices. The | ||
192 | kernel size is about 10% bigger than with gzip. Despite that, its | ||
193 | decompression speed is the second fastest and its compression speed | ||
194 | is quite fast too. | ||
195 | |||
196 | If you choose this, keep in mind that you may need to install the lzop | ||
197 | tool to be able to compress the initramfs. | ||
198 | |||
199 | config INITRAMFS_COMPRESSION_LZ4 | ||
200 | bool "LZ4" | ||
201 | depends on RD_LZ4 | ||
202 | help | ||
203 | Its compression ratio is the poorest amongst the choices. The kernel | ||
204 | size is about 15% bigger than with gzip; however, its decompression speed | ||
205 | is the fastest. | ||
206 | |||
207 | If you choose this, keep in mind that most distros don't provide lz4 | ||
208 | by default which could cause a build failure. | ||
209 | |||
210 | endchoice | ||
211 | |||
212 | config INITRAMFS_COMPRESSION | ||
213 | string | ||
214 | default "" if INITRAMFS_COMPRESSION_NONE | ||
215 | default ".gz" if INITRAMFS_COMPRESSION_GZIP | ||
216 | default ".bz2" if INITRAMFS_COMPRESSION_BZIP2 | ||
217 | default ".lzma" if INITRAMFS_COMPRESSION_LZMA | ||
218 | default ".xz" if INITRAMFS_COMPRESSION_XZ | ||
219 | default ".lzo" if INITRAMFS_COMPRESSION_LZO | ||
220 | default ".lz4" if INITRAMFS_COMPRESSION_LZ4 | ||
221 | default ".gz" if RD_GZIP | ||
222 | default ".lz4" if RD_LZ4 | ||
223 | default ".lzo" if RD_LZO | ||
224 | default ".xz" if RD_XZ | ||
225 | default ".lzma" if RD_LZMA | ||
226 | default ".bz2" if RD_BZIP2 | ||
227 | default "" | ||
diff --git a/usr/Makefile b/usr/Makefile index e767f019accf..17a513268325 100644 --- a/usr/Makefile +++ b/usr/Makefile | |||
@@ -5,25 +5,7 @@ | |||
5 | klibcdirs:; | 5 | klibcdirs:; |
6 | PHONY += klibcdirs | 6 | PHONY += klibcdirs |
7 | 7 | ||
8 | 8 | suffix_y = $(CONFIG_INITRAMFS_COMPRESSION) | |
9 | # Bzip2 | ||
10 | suffix_$(CONFIG_RD_BZIP2) = .bz2 | ||
11 | |||
12 | # Lzma | ||
13 | suffix_$(CONFIG_RD_LZMA) = .lzma | ||
14 | |||
15 | # XZ | ||
16 | suffix_$(CONFIG_RD_XZ) = .xz | ||
17 | |||
18 | # Lzo | ||
19 | suffix_$(CONFIG_RD_LZO) = .lzo | ||
20 | |||
21 | # Lz4 | ||
22 | suffix_$(CONFIG_RD_LZ4) = .lz4 | ||
23 | |||
24 | # Gzip | ||
25 | suffix_$(CONFIG_RD_GZIP) = .gz | ||
26 | |||
27 | AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)" | 9 | AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)" |
28 | 10 | ||
29 | # Generate builtin.o based on initramfs_data.o | 11 | # Generate builtin.o based on initramfs_data.o |
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index efeceb0a222d..3815e940fbea 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
@@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work) | |||
76 | struct kvm_vcpu *vcpu = apf->vcpu; | 76 | struct kvm_vcpu *vcpu = apf->vcpu; |
77 | unsigned long addr = apf->addr; | 77 | unsigned long addr = apf->addr; |
78 | gva_t gva = apf->gva; | 78 | gva_t gva = apf->gva; |
79 | int locked = 1; | ||
79 | 80 | ||
80 | might_sleep(); | 81 | might_sleep(); |
81 | 82 | ||
82 | /* | 83 | /* |
83 | * This work is run asynchronously to the task which owns | 84 | * This work is run asynchronously to the task which owns |
84 | * mm and might be done in another context, so we must | 85 | * mm and might be done in another context, so we must |
85 | * use FOLL_REMOTE. | 86 | * access remotely. |
86 | */ | 87 | */ |
87 | __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, | 88 | down_read(&mm->mmap_sem); |
88 | FOLL_WRITE | FOLL_REMOTE); | 89 | get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, |
90 | &locked); | ||
91 | if (locked) | ||
92 | up_read(&mm->mmap_sem); | ||
89 | 93 | ||
90 | kvm_async_page_present_sync(vcpu, apf); | 94 | kvm_async_page_present_sync(vcpu, apf); |
91 | 95 | ||
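The replacement code above follows get_user_pages_remote()'s "locked" protocol: the caller takes mmap_sem for read, the callee may drop it internally while faulting and clears *locked when it does, so the caller releases the semaphore only if it still holds it. The same calls as in the hunk above, spelled out with comments:

        int locked = 1;

        down_read(&mm->mmap_sem);
        /* May drop mmap_sem while faulting; if so, *locked becomes 0. */
        get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
                              &locked);
        /* Only unlock if get_user_pages_remote() did not already do it. */
        if (locked)
                up_read(&mm->mmap_sem);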
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 823544c166be..de102cae7125 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -1418,13 +1418,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, | |||
1418 | npages = get_user_page_nowait(addr, write_fault, page); | 1418 | npages = get_user_page_nowait(addr, write_fault, page); |
1419 | up_read(¤t->mm->mmap_sem); | 1419 | up_read(¤t->mm->mmap_sem); |
1420 | } else { | 1420 | } else { |
1421 | unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; | 1421 | unsigned int flags = FOLL_HWPOISON; |
1422 | 1422 | ||
1423 | if (write_fault) | 1423 | if (write_fault) |
1424 | flags |= FOLL_WRITE; | 1424 | flags |= FOLL_WRITE; |
1425 | 1425 | ||
1426 | npages = __get_user_pages_unlocked(current, current->mm, addr, 1, | 1426 | npages = get_user_pages_unlocked(addr, 1, page, flags); |
1427 | page, flags); | ||
1428 | } | 1427 | } |
1429 | if (npages != 1) | 1428 | if (npages != 1) |
1430 | return npages; | 1429 | return npages; |