140 files changed, 3428 insertions, 2218 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 1b5f15653b1b..69e2387ca278 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
| @@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must | |||
| 556 | not block. If it's not possible to reach a page without blocking, | 556 | not block. If it's not possible to reach a page without blocking, |
| 557 | filesystem should skip it. Filesystem should use do_set_pte() to setup | 557 | filesystem should skip it. Filesystem should use do_set_pte() to setup |
| 558 | page table entry. Pointer to entry associated with the page is passed in | 558 | page table entry. Pointer to entry associated with the page is passed in |
| 559 | "pte" field in fault_env structure. Pointers to entries for other offsets | 559 | "pte" field in vm_fault structure. Pointers to entries for other offsets |
| 560 | should be calculated relative to "pte". | 560 | should be calculated relative to "pte". |
| 561 | 561 | ||
| 562 | ->page_mkwrite() is called when a previously read-only pte is | 562 | ->page_mkwrite() is called when a previously read-only pte is |
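The documentation hunk above tracks the rename of the structure that carries fault state: the precomputed page-table pointer now lives in the "pte" field of struct vm_fault rather than the old fault_env. As a minimal sketch of the relative-offset rule the text describes, the helper below assumes vmf->pte corresponds to start_pgoff on entry to ->map_pages() and that pgoff lies within the requested range; the helper name and that baseline are illustrative, not part of the patch.

#include <linux/mm.h>

/*
 * Illustrative only: return the page-table entry pointer for "pgoff",
 * computed relative to the "pte" field of struct vm_fault as the
 * updated Locking text describes.
 */
static pte_t *pte_for_offset(struct vm_fault *vmf, pgoff_t start_pgoff,
			     pgoff_t pgoff)
{
	return vmf->pte + (pgoff - start_pgoff);
}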
diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c
index cd8aad8226dd..08450a1a5b5f 100644
--- a/arch/arc/mm/dma.c
+++ b/arch/arc/mm/dma.c
| @@ -158,7 +158,10 @@ static dma_addr_t arc_dma_map_page(struct device *dev, struct page *page, | |||
| 158 | unsigned long attrs) | 158 | unsigned long attrs) |
| 159 | { | 159 | { |
| 160 | phys_addr_t paddr = page_to_phys(page) + offset; | 160 | phys_addr_t paddr = page_to_phys(page) + offset; |
| 161 | _dma_cache_sync(paddr, size, dir); | 161 | |
| 162 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
| 163 | _dma_cache_sync(paddr, size, dir); | ||
| 164 | |||
| 162 | return plat_phys_to_dma(dev, paddr); | 165 | return plat_phys_to_dma(dev, paddr); |
| 163 | } | 166 | } |
| 164 | 167 | ||
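The arc hunk above shows the pattern repeated in most of the architecture files below: the streaming map/unmap paths now perform their cache maintenance only when DMA_ATTR_SKIP_CPU_SYNC is not set. A hedged sketch of the caller side follows, assuming the dma_map_page_attrs()/dma_unmap_page_attrs() wrappers that take an attrs argument are available; the rx_* helper names, and the idea of a recycled receive page, are illustrative.

#include <linux/dma-mapping.h>

/*
 * Sketch: map a recycled receive page without the implicit CPU sync,
 * sync only the bytes the device wrote before the CPU reads them, hand
 * the region back to the device, and finally unmap without another
 * full-buffer sync.
 */
static dma_addr_t rx_map(struct device *dev, struct page *page, size_t len)
{
	return dma_map_page_attrs(dev, page, 0, len, DMA_FROM_DEVICE,
				  DMA_ATTR_SKIP_CPU_SYNC);
}

static void rx_poll(struct device *dev, dma_addr_t addr, unsigned int used)
{
	/* CPU is about to read the part the device filled in. */
	dma_sync_single_range_for_cpu(dev, addr, 0, used, DMA_FROM_DEVICE);

	/* ... process the received data ... */

	/* Give the same region back to the device for the next frame. */
	dma_sync_single_range_for_device(dev, addr, 0, used, DMA_FROM_DEVICE);
}

static void rx_unmap(struct device *dev, dma_addr_t addr, size_t len)
{
	dma_unmap_page_attrs(dev, addr, len, DMA_FROM_DEVICE,
			     DMA_ATTR_SKIP_CPU_SYNC);
}

Without the attribute, every map and unmap would sync the whole buffer, which the explicit partial syncs above make redundant.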
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 301281645d08..75055df1cda3 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
| @@ -243,7 +243,8 @@ static int needs_bounce(struct device *dev, dma_addr_t dma_addr, size_t size) | |||
| 243 | } | 243 | } |
| 244 | 244 | ||
| 245 | static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, | 245 | static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, |
| 246 | enum dma_data_direction dir) | 246 | enum dma_data_direction dir, |
| 247 | unsigned long attrs) | ||
| 247 | { | 248 | { |
| 248 | struct dmabounce_device_info *device_info = dev->archdata.dmabounce; | 249 | struct dmabounce_device_info *device_info = dev->archdata.dmabounce; |
| 249 | struct safe_buffer *buf; | 250 | struct safe_buffer *buf; |
| @@ -262,7 +263,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, | |||
| 262 | __func__, buf->ptr, virt_to_dma(dev, buf->ptr), | 263 | __func__, buf->ptr, virt_to_dma(dev, buf->ptr), |
| 263 | buf->safe, buf->safe_dma_addr); | 264 | buf->safe, buf->safe_dma_addr); |
| 264 | 265 | ||
| 265 | if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) { | 266 | if ((dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) && |
| 267 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { | ||
| 266 | dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n", | 268 | dev_dbg(dev, "%s: copy unsafe %p to safe %p, size %d\n", |
| 267 | __func__, ptr, buf->safe, size); | 269 | __func__, ptr, buf->safe, size); |
| 268 | memcpy(buf->safe, ptr, size); | 270 | memcpy(buf->safe, ptr, size); |
| @@ -272,7 +274,8 @@ static inline dma_addr_t map_single(struct device *dev, void *ptr, size_t size, | |||
| 272 | } | 274 | } |
| 273 | 275 | ||
| 274 | static inline void unmap_single(struct device *dev, struct safe_buffer *buf, | 276 | static inline void unmap_single(struct device *dev, struct safe_buffer *buf, |
| 275 | size_t size, enum dma_data_direction dir) | 277 | size_t size, enum dma_data_direction dir, |
| 278 | unsigned long attrs) | ||
| 276 | { | 279 | { |
| 277 | BUG_ON(buf->size != size); | 280 | BUG_ON(buf->size != size); |
| 278 | BUG_ON(buf->direction != dir); | 281 | BUG_ON(buf->direction != dir); |
| @@ -283,7 +286,8 @@ static inline void unmap_single(struct device *dev, struct safe_buffer *buf, | |||
| 283 | 286 | ||
| 284 | DO_STATS(dev->archdata.dmabounce->bounce_count++); | 287 | DO_STATS(dev->archdata.dmabounce->bounce_count++); |
| 285 | 288 | ||
| 286 | if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) { | 289 | if ((dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) && |
| 290 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { | ||
| 287 | void *ptr = buf->ptr; | 291 | void *ptr = buf->ptr; |
| 288 | 292 | ||
| 289 | dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n", | 293 | dev_dbg(dev, "%s: copy back safe %p to unsafe %p size %d\n", |
| @@ -334,7 +338,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, struct page *page, | |||
| 334 | return DMA_ERROR_CODE; | 338 | return DMA_ERROR_CODE; |
| 335 | } | 339 | } |
| 336 | 340 | ||
| 337 | return map_single(dev, page_address(page) + offset, size, dir); | 341 | return map_single(dev, page_address(page) + offset, size, dir, attrs); |
| 338 | } | 342 | } |
| 339 | 343 | ||
| 340 | /* | 344 | /* |
| @@ -357,7 +361,7 @@ static void dmabounce_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t | |||
| 357 | return; | 361 | return; |
| 358 | } | 362 | } |
| 359 | 363 | ||
| 360 | unmap_single(dev, buf, size, dir); | 364 | unmap_single(dev, buf, size, dir, attrs); |
| 361 | } | 365 | } |
| 362 | 366 | ||
| 363 | static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr, | 367 | static int __dmabounce_sync_for_cpu(struct device *dev, dma_addr_t addr, |
diff --git a/arch/avr32/mm/dma-coherent.c b/arch/avr32/mm/dma-coherent.c
index 58610d0df7ed..54534e5d0781 100644
--- a/arch/avr32/mm/dma-coherent.c
+++ b/arch/avr32/mm/dma-coherent.c
| @@ -146,7 +146,8 @@ static dma_addr_t avr32_dma_map_page(struct device *dev, struct page *page, | |||
| 146 | { | 146 | { |
| 147 | void *cpu_addr = page_address(page) + offset; | 147 | void *cpu_addr = page_address(page) + offset; |
| 148 | 148 | ||
| 149 | dma_cache_sync(dev, cpu_addr, size, direction); | 149 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 150 | dma_cache_sync(dev, cpu_addr, size, direction); | ||
| 150 | return virt_to_bus(cpu_addr); | 151 | return virt_to_bus(cpu_addr); |
| 151 | } | 152 | } |
| 152 | 153 | ||
| @@ -162,6 +163,10 @@ static int avr32_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 162 | 163 | ||
| 163 | sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; | 164 | sg->dma_address = page_to_bus(sg_page(sg)) + sg->offset; |
| 164 | virt = sg_virt(sg); | 165 | virt = sg_virt(sg); |
| 166 | |||
| 167 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 168 | continue; | ||
| 169 | |||
| 165 | dma_cache_sync(dev, virt, sg->length, direction); | 170 | dma_cache_sync(dev, virt, sg->length, direction); |
| 166 | } | 171 | } |
| 167 | 172 | ||
diff --git a/arch/blackfin/kernel/dma-mapping.c b/arch/blackfin/kernel/dma-mapping.c
index 53fbbb61aa86..a27a74a18fb0 100644
--- a/arch/blackfin/kernel/dma-mapping.c
+++ b/arch/blackfin/kernel/dma-mapping.c
| @@ -118,6 +118,10 @@ static int bfin_dma_map_sg(struct device *dev, struct scatterlist *sg_list, | |||
| 118 | 118 | ||
| 119 | for_each_sg(sg_list, sg, nents, i) { | 119 | for_each_sg(sg_list, sg, nents, i) { |
| 120 | sg->dma_address = (dma_addr_t) sg_virt(sg); | 120 | sg->dma_address = (dma_addr_t) sg_virt(sg); |
| 121 | |||
| 122 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 123 | continue; | ||
| 124 | |||
| 121 | __dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction); | 125 | __dma_sync(sg_dma_address(sg), sg_dma_len(sg), direction); |
| 122 | } | 126 | } |
| 123 | 127 | ||
| @@ -143,7 +147,9 @@ static dma_addr_t bfin_dma_map_page(struct device *dev, struct page *page, | |||
| 143 | { | 147 | { |
| 144 | dma_addr_t handle = (dma_addr_t)(page_address(page) + offset); | 148 | dma_addr_t handle = (dma_addr_t)(page_address(page) + offset); |
| 145 | 149 | ||
| 146 | _dma_sync(handle, size, dir); | 150 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 151 | _dma_sync(handle, size, dir); | ||
| 152 | |||
| 147 | return handle; | 153 | return handle; |
| 148 | } | 154 | } |
| 149 | 155 | ||
diff --git a/arch/c6x/kernel/dma.c b/arch/c6x/kernel/dma.c
index db4a6a301f5e..6752df32ef06 100644
--- a/arch/c6x/kernel/dma.c
+++ b/arch/c6x/kernel/dma.c
| @@ -42,14 +42,17 @@ static dma_addr_t c6x_dma_map_page(struct device *dev, struct page *page, | |||
| 42 | { | 42 | { |
| 43 | dma_addr_t handle = virt_to_phys(page_address(page) + offset); | 43 | dma_addr_t handle = virt_to_phys(page_address(page) + offset); |
| 44 | 44 | ||
| 45 | c6x_dma_sync(handle, size, dir); | 45 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 46 | c6x_dma_sync(handle, size, dir); | ||
| 47 | |||
| 46 | return handle; | 48 | return handle; |
| 47 | } | 49 | } |
| 48 | 50 | ||
| 49 | static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle, | 51 | static void c6x_dma_unmap_page(struct device *dev, dma_addr_t handle, |
| 50 | size_t size, enum dma_data_direction dir, unsigned long attrs) | 52 | size_t size, enum dma_data_direction dir, unsigned long attrs) |
| 51 | { | 53 | { |
| 52 | c6x_dma_sync(handle, size, dir); | 54 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 55 | c6x_dma_sync(handle, size, dir); | ||
| 53 | } | 56 | } |
| 54 | 57 | ||
| 55 | static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist, | 58 | static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist, |
| @@ -60,7 +63,8 @@ static int c6x_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 60 | 63 | ||
| 61 | for_each_sg(sglist, sg, nents, i) { | 64 | for_each_sg(sglist, sg, nents, i) { |
| 62 | sg->dma_address = sg_phys(sg); | 65 | sg->dma_address = sg_phys(sg); |
| 63 | c6x_dma_sync(sg->dma_address, sg->length, dir); | 66 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 67 | c6x_dma_sync(sg->dma_address, sg->length, dir); | ||
| 64 | } | 68 | } |
| 65 | 69 | ||
| 66 | return nents; | 70 | return nents; |
| @@ -72,9 +76,11 @@ static void c6x_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
| 72 | struct scatterlist *sg; | 76 | struct scatterlist *sg; |
| 73 | int i; | 77 | int i; |
| 74 | 78 | ||
| 79 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 80 | return; | ||
| 81 | |||
| 75 | for_each_sg(sglist, sg, nents, i) | 82 | for_each_sg(sglist, sg, nents, i) |
| 76 | c6x_dma_sync(sg_dma_address(sg), sg->length, dir); | 83 | c6x_dma_sync(sg_dma_address(sg), sg->length, dir); |
| 77 | |||
| 78 | } | 84 | } |
| 79 | 85 | ||
| 80 | static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, | 86 | static void c6x_dma_sync_single_for_cpu(struct device *dev, dma_addr_t handle, |
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index 90f2e4cb33d6..187688128c65 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
| @@ -109,16 +109,19 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 109 | int nents, enum dma_data_direction direction, | 109 | int nents, enum dma_data_direction direction, |
| 110 | unsigned long attrs) | 110 | unsigned long attrs) |
| 111 | { | 111 | { |
| 112 | int i; | ||
| 113 | struct scatterlist *sg; | 112 | struct scatterlist *sg; |
| 113 | int i; | ||
| 114 | |||
| 115 | BUG_ON(direction == DMA_NONE); | ||
| 116 | |||
| 117 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 118 | return nents; | ||
| 114 | 119 | ||
| 115 | for_each_sg(sglist, sg, nents, i) { | 120 | for_each_sg(sglist, sg, nents, i) { |
| 116 | frv_cache_wback_inv(sg_dma_address(sg), | 121 | frv_cache_wback_inv(sg_dma_address(sg), |
| 117 | sg_dma_address(sg) + sg_dma_len(sg)); | 122 | sg_dma_address(sg) + sg_dma_len(sg)); |
| 118 | } | 123 | } |
| 119 | 124 | ||
| 120 | BUG_ON(direction == DMA_NONE); | ||
| 121 | |||
| 122 | return nents; | 125 | return nents; |
| 123 | } | 126 | } |
| 124 | 127 | ||
| @@ -127,7 +130,10 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page, | |||
| 127 | enum dma_data_direction direction, unsigned long attrs) | 130 | enum dma_data_direction direction, unsigned long attrs) |
| 128 | { | 131 | { |
| 129 | BUG_ON(direction == DMA_NONE); | 132 | BUG_ON(direction == DMA_NONE); |
| 130 | flush_dcache_page(page); | 133 | |
| 134 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
| 135 | flush_dcache_page(page); | ||
| 136 | |||
| 131 | return (dma_addr_t) page_to_phys(page) + offset; | 137 | return (dma_addr_t) page_to_phys(page) + offset; |
| 132 | } | 138 | } |
| 133 | 139 | ||
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index f585745b1abc..dba7df918144 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
| @@ -40,13 +40,16 @@ static int frv_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 40 | int nents, enum dma_data_direction direction, | 40 | int nents, enum dma_data_direction direction, |
| 41 | unsigned long attrs) | 41 | unsigned long attrs) |
| 42 | { | 42 | { |
| 43 | struct scatterlist *sg; | ||
| 43 | unsigned long dampr2; | 44 | unsigned long dampr2; |
| 44 | void *vaddr; | 45 | void *vaddr; |
| 45 | int i; | 46 | int i; |
| 46 | struct scatterlist *sg; | ||
| 47 | 47 | ||
| 48 | BUG_ON(direction == DMA_NONE); | 48 | BUG_ON(direction == DMA_NONE); |
| 49 | 49 | ||
| 50 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 51 | return nents; | ||
| 52 | |||
| 50 | dampr2 = __get_DAMPR(2); | 53 | dampr2 = __get_DAMPR(2); |
| 51 | 54 | ||
| 52 | for_each_sg(sglist, sg, nents, i) { | 55 | for_each_sg(sglist, sg, nents, i) { |
| @@ -70,7 +73,9 @@ static dma_addr_t frv_dma_map_page(struct device *dev, struct page *page, | |||
| 70 | unsigned long offset, size_t size, | 73 | unsigned long offset, size_t size, |
| 71 | enum dma_data_direction direction, unsigned long attrs) | 74 | enum dma_data_direction direction, unsigned long attrs) |
| 72 | { | 75 | { |
| 73 | flush_dcache_page(page); | 76 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 77 | flush_dcache_page(page); | ||
| 78 | |||
| 74 | return (dma_addr_t) page_to_phys(page) + offset; | 79 | return (dma_addr_t) page_to_phys(page) + offset; |
| 75 | } | 80 | } |
| 76 | 81 | ||
diff --git a/arch/hexagon/kernel/dma.c b/arch/hexagon/kernel/dma.c
index b9017785fb71..dbc4f1003da4 100644
--- a/arch/hexagon/kernel/dma.c
+++ b/arch/hexagon/kernel/dma.c
| @@ -119,6 +119,9 @@ static int hexagon_map_sg(struct device *hwdev, struct scatterlist *sg, | |||
| 119 | 119 | ||
| 120 | s->dma_length = s->length; | 120 | s->dma_length = s->length; |
| 121 | 121 | ||
| 122 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 123 | continue; | ||
| 124 | |||
| 122 | flush_dcache_range(dma_addr_to_virt(s->dma_address), | 125 | flush_dcache_range(dma_addr_to_virt(s->dma_address), |
| 123 | dma_addr_to_virt(s->dma_address + s->length)); | 126 | dma_addr_to_virt(s->dma_address + s->length)); |
| 124 | } | 127 | } |
| @@ -180,7 +183,8 @@ static dma_addr_t hexagon_map_page(struct device *dev, struct page *page, | |||
| 180 | if (!check_addr("map_single", dev, bus, size)) | 183 | if (!check_addr("map_single", dev, bus, size)) |
| 181 | return bad_dma_address; | 184 | return bad_dma_address; |
| 182 | 185 | ||
| 183 | dma_sync(dma_addr_to_virt(bus), size, dir); | 186 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 187 | dma_sync(dma_addr_to_virt(bus), size, dir); | ||
| 184 | 188 | ||
| 185 | return bus; | 189 | return bus; |
| 186 | } | 190 | } |
diff --git a/arch/m68k/kernel/dma.c b/arch/m68k/kernel/dma.c
index 8cf97cbadc91..07070065a425 100644
--- a/arch/m68k/kernel/dma.c
+++ b/arch/m68k/kernel/dma.c
| @@ -134,7 +134,9 @@ static dma_addr_t m68k_dma_map_page(struct device *dev, struct page *page, | |||
| 134 | { | 134 | { |
| 135 | dma_addr_t handle = page_to_phys(page) + offset; | 135 | dma_addr_t handle = page_to_phys(page) + offset; |
| 136 | 136 | ||
| 137 | dma_sync_single_for_device(dev, handle, size, dir); | 137 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 138 | dma_sync_single_for_device(dev, handle, size, dir); | ||
| 139 | |||
| 138 | return handle; | 140 | return handle; |
| 139 | } | 141 | } |
| 140 | 142 | ||
| @@ -146,6 +148,10 @@ static int m68k_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 146 | 148 | ||
| 147 | for_each_sg(sglist, sg, nents, i) { | 149 | for_each_sg(sglist, sg, nents, i) { |
| 148 | sg->dma_address = sg_phys(sg); | 150 | sg->dma_address = sg_phys(sg); |
| 151 | |||
| 152 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 153 | continue; | ||
| 154 | |||
| 149 | dma_sync_single_for_device(dev, sg->dma_address, sg->length, | 155 | dma_sync_single_for_device(dev, sg->dma_address, sg->length, |
| 150 | dir); | 156 | dir); |
| 151 | } | 157 | } |
diff --git a/arch/metag/kernel/dma.c b/arch/metag/kernel/dma.c
index 0db31e24c541..91968d92652b 100644
--- a/arch/metag/kernel/dma.c
+++ b/arch/metag/kernel/dma.c
| @@ -484,8 +484,9 @@ static dma_addr_t metag_dma_map_page(struct device *dev, struct page *page, | |||
| 484 | unsigned long offset, size_t size, | 484 | unsigned long offset, size_t size, |
| 485 | enum dma_data_direction direction, unsigned long attrs) | 485 | enum dma_data_direction direction, unsigned long attrs) |
| 486 | { | 486 | { |
| 487 | dma_sync_for_device((void *)(page_to_phys(page) + offset), size, | 487 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 488 | direction); | 488 | dma_sync_for_device((void *)(page_to_phys(page) + offset), |
| 489 | size, direction); | ||
| 489 | return page_to_phys(page) + offset; | 490 | return page_to_phys(page) + offset; |
| 490 | } | 491 | } |
| 491 | 492 | ||
| @@ -493,7 +494,8 @@ static void metag_dma_unmap_page(struct device *dev, dma_addr_t dma_address, | |||
| 493 | size_t size, enum dma_data_direction direction, | 494 | size_t size, enum dma_data_direction direction, |
| 494 | unsigned long attrs) | 495 | unsigned long attrs) |
| 495 | { | 496 | { |
| 496 | dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | 497 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 498 | dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | ||
| 497 | } | 499 | } |
| 498 | 500 | ||
| 499 | static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist, | 501 | static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist, |
| @@ -507,6 +509,10 @@ static int metag_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 507 | BUG_ON(!sg_page(sg)); | 509 | BUG_ON(!sg_page(sg)); |
| 508 | 510 | ||
| 509 | sg->dma_address = sg_phys(sg); | 511 | sg->dma_address = sg_phys(sg); |
| 512 | |||
| 513 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 514 | continue; | ||
| 515 | |||
| 510 | dma_sync_for_device(sg_virt(sg), sg->length, direction); | 516 | dma_sync_for_device(sg_virt(sg), sg->length, direction); |
| 511 | } | 517 | } |
| 512 | 518 | ||
| @@ -525,6 +531,10 @@ static void metag_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
| 525 | BUG_ON(!sg_page(sg)); | 531 | BUG_ON(!sg_page(sg)); |
| 526 | 532 | ||
| 527 | sg->dma_address = sg_phys(sg); | 533 | sg->dma_address = sg_phys(sg); |
| 534 | |||
| 535 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 536 | continue; | ||
| 537 | |||
| 528 | dma_sync_for_cpu(sg_virt(sg), sg->length, direction); | 538 | dma_sync_for_cpu(sg_virt(sg), sg->length, direction); |
| 529 | } | 539 | } |
| 530 | } | 540 | } |
diff --git a/arch/microblaze/kernel/dma.c b/arch/microblaze/kernel/dma.c
index ec04dc1e2527..818daf230eb4 100644
--- a/arch/microblaze/kernel/dma.c
+++ b/arch/microblaze/kernel/dma.c
| @@ -61,6 +61,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, | |||
| 61 | /* FIXME this part of code is untested */ | 61 | /* FIXME this part of code is untested */ |
| 62 | for_each_sg(sgl, sg, nents, i) { | 62 | for_each_sg(sgl, sg, nents, i) { |
| 63 | sg->dma_address = sg_phys(sg); | 63 | sg->dma_address = sg_phys(sg); |
| 64 | |||
| 65 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 66 | continue; | ||
| 67 | |||
| 64 | __dma_sync(page_to_phys(sg_page(sg)) + sg->offset, | 68 | __dma_sync(page_to_phys(sg_page(sg)) + sg->offset, |
| 65 | sg->length, direction); | 69 | sg->length, direction); |
| 66 | } | 70 | } |
| @@ -80,7 +84,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, | |||
| 80 | enum dma_data_direction direction, | 84 | enum dma_data_direction direction, |
| 81 | unsigned long attrs) | 85 | unsigned long attrs) |
| 82 | { | 86 | { |
| 83 | __dma_sync(page_to_phys(page) + offset, size, direction); | 87 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 88 | __dma_sync(page_to_phys(page) + offset, size, direction); | ||
| 84 | return page_to_phys(page) + offset; | 89 | return page_to_phys(page) + offset; |
| 85 | } | 90 | } |
| 86 | 91 | ||
| @@ -95,7 +100,8 @@ static inline void dma_direct_unmap_page(struct device *dev, | |||
| 95 | * phys_to_virt is here because in __dma_sync_page is __virt_to_phys and | 100 | * phys_to_virt is here because in __dma_sync_page is __virt_to_phys and |
| 96 | * dma_address is physical address | 101 | * dma_address is physical address |
| 97 | */ | 102 | */ |
| 98 | __dma_sync(dma_address, size, direction); | 103 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 104 | __dma_sync(dma_address, size, direction); | ||
| 99 | } | 105 | } |
| 100 | 106 | ||
| 101 | static inline void | 107 | static inline void |
diff --git a/arch/mips/loongson64/common/dma-swiotlb.c b/arch/mips/loongson64/common/dma-swiotlb.c
index 1a80b6f73ab2..aab4fd681e1f 100644
--- a/arch/mips/loongson64/common/dma-swiotlb.c
+++ b/arch/mips/loongson64/common/dma-swiotlb.c
| @@ -61,7 +61,7 @@ static int loongson_dma_map_sg(struct device *dev, struct scatterlist *sg, | |||
| 61 | int nents, enum dma_data_direction dir, | 61 | int nents, enum dma_data_direction dir, |
| 62 | unsigned long attrs) | 62 | unsigned long attrs) |
| 63 | { | 63 | { |
| 64 | int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, 0); | 64 | int r = swiotlb_map_sg_attrs(dev, sg, nents, dir, attrs); |
| 65 | mb(); | 65 | mb(); |
| 66 | 66 | ||
| 67 | return r; | 67 | return r; |
diff --git a/arch/mips/mm/dma-default.c b/arch/mips/mm/dma-default.c
index 46d5696c4f27..a39c36af97ad 100644
--- a/arch/mips/mm/dma-default.c
+++ b/arch/mips/mm/dma-default.c
| @@ -293,7 +293,7 @@ static inline void __dma_sync(struct page *page, | |||
| 293 | static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, | 293 | static void mips_dma_unmap_page(struct device *dev, dma_addr_t dma_addr, |
| 294 | size_t size, enum dma_data_direction direction, unsigned long attrs) | 294 | size_t size, enum dma_data_direction direction, unsigned long attrs) |
| 295 | { | 295 | { |
| 296 | if (cpu_needs_post_dma_flush(dev)) | 296 | if (cpu_needs_post_dma_flush(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 297 | __dma_sync(dma_addr_to_page(dev, dma_addr), | 297 | __dma_sync(dma_addr_to_page(dev, dma_addr), |
| 298 | dma_addr & ~PAGE_MASK, size, direction); | 298 | dma_addr & ~PAGE_MASK, size, direction); |
| 299 | plat_post_dma_flush(dev); | 299 | plat_post_dma_flush(dev); |
| @@ -307,7 +307,8 @@ static int mips_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 307 | struct scatterlist *sg; | 307 | struct scatterlist *sg; |
| 308 | 308 | ||
| 309 | for_each_sg(sglist, sg, nents, i) { | 309 | for_each_sg(sglist, sg, nents, i) { |
| 310 | if (!plat_device_is_coherent(dev)) | 310 | if (!plat_device_is_coherent(dev) && |
| 311 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
| 311 | __dma_sync(sg_page(sg), sg->offset, sg->length, | 312 | __dma_sync(sg_page(sg), sg->offset, sg->length, |
| 312 | direction); | 313 | direction); |
| 313 | #ifdef CONFIG_NEED_SG_DMA_LENGTH | 314 | #ifdef CONFIG_NEED_SG_DMA_LENGTH |
| @@ -324,7 +325,7 @@ static dma_addr_t mips_dma_map_page(struct device *dev, struct page *page, | |||
| 324 | unsigned long offset, size_t size, enum dma_data_direction direction, | 325 | unsigned long offset, size_t size, enum dma_data_direction direction, |
| 325 | unsigned long attrs) | 326 | unsigned long attrs) |
| 326 | { | 327 | { |
| 327 | if (!plat_device_is_coherent(dev)) | 328 | if (!plat_device_is_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 328 | __dma_sync(page, offset, size, direction); | 329 | __dma_sync(page, offset, size, direction); |
| 329 | 330 | ||
| 330 | return plat_map_dma_mem_page(dev, page) + offset; | 331 | return plat_map_dma_mem_page(dev, page) + offset; |
| @@ -339,6 +340,7 @@ static void mips_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
| 339 | 340 | ||
| 340 | for_each_sg(sglist, sg, nhwentries, i) { | 341 | for_each_sg(sglist, sg, nhwentries, i) { |
| 341 | if (!plat_device_is_coherent(dev) && | 342 | if (!plat_device_is_coherent(dev) && |
| 343 | !(attrs & DMA_ATTR_SKIP_CPU_SYNC) && | ||
| 342 | direction != DMA_TO_DEVICE) | 344 | direction != DMA_TO_DEVICE) |
| 343 | __dma_sync(sg_page(sg), sg->offset, sg->length, | 345 | __dma_sync(sg_page(sg), sg->offset, sg->length, |
| 344 | direction); | 346 | direction); |
diff --git a/arch/nios2/mm/dma-mapping.c b/arch/nios2/mm/dma-mapping.c
index d800fad87896..f6a5dcf9d682 100644
--- a/arch/nios2/mm/dma-mapping.c
+++ b/arch/nios2/mm/dma-mapping.c
| @@ -98,13 +98,17 @@ static int nios2_dma_map_sg(struct device *dev, struct scatterlist *sg, | |||
| 98 | int i; | 98 | int i; |
| 99 | 99 | ||
| 100 | for_each_sg(sg, sg, nents, i) { | 100 | for_each_sg(sg, sg, nents, i) { |
| 101 | void *addr; | 101 | void *addr = sg_virt(sg); |
| 102 | 102 | ||
| 103 | addr = sg_virt(sg); | 103 | if (!addr) |
| 104 | if (addr) { | 104 | continue; |
| 105 | __dma_sync_for_device(addr, sg->length, direction); | 105 | |
| 106 | sg->dma_address = sg_phys(sg); | 106 | sg->dma_address = sg_phys(sg); |
| 107 | } | 107 | |
| 108 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 109 | continue; | ||
| 110 | |||
| 111 | __dma_sync_for_device(addr, sg->length, direction); | ||
| 108 | } | 112 | } |
| 109 | 113 | ||
| 110 | return nents; | 114 | return nents; |
| @@ -117,7 +121,9 @@ static dma_addr_t nios2_dma_map_page(struct device *dev, struct page *page, | |||
| 117 | { | 121 | { |
| 118 | void *addr = page_address(page) + offset; | 122 | void *addr = page_address(page) + offset; |
| 119 | 123 | ||
| 120 | __dma_sync_for_device(addr, size, direction); | 124 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 125 | __dma_sync_for_device(addr, size, direction); | ||
| 126 | |||
| 121 | return page_to_phys(page) + offset; | 127 | return page_to_phys(page) + offset; |
| 122 | } | 128 | } |
| 123 | 129 | ||
| @@ -125,7 +131,8 @@ static void nios2_dma_unmap_page(struct device *dev, dma_addr_t dma_address, | |||
| 125 | size_t size, enum dma_data_direction direction, | 131 | size_t size, enum dma_data_direction direction, |
| 126 | unsigned long attrs) | 132 | unsigned long attrs) |
| 127 | { | 133 | { |
| 128 | __dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | 134 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 135 | __dma_sync_for_cpu(phys_to_virt(dma_address), size, direction); | ||
| 129 | } | 136 | } |
| 130 | 137 | ||
| 131 | static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg, | 138 | static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg, |
| @@ -138,6 +145,9 @@ static void nios2_dma_unmap_sg(struct device *dev, struct scatterlist *sg, | |||
| 138 | if (direction == DMA_TO_DEVICE) | 145 | if (direction == DMA_TO_DEVICE) |
| 139 | return; | 146 | return; |
| 140 | 147 | ||
| 148 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 149 | return; | ||
| 150 | |||
| 141 | for_each_sg(sg, sg, nhwentries, i) { | 151 | for_each_sg(sg, sg, nhwentries, i) { |
| 142 | addr = sg_virt(sg); | 152 | addr = sg_virt(sg); |
| 143 | if (addr) | 153 | if (addr) |
diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c
index 140c99140649..906998bac957 100644
--- a/arch/openrisc/kernel/dma.c
+++ b/arch/openrisc/kernel/dma.c
| @@ -141,6 +141,9 @@ or1k_map_page(struct device *dev, struct page *page, | |||
| 141 | unsigned long cl; | 141 | unsigned long cl; |
| 142 | dma_addr_t addr = page_to_phys(page) + offset; | 142 | dma_addr_t addr = page_to_phys(page) + offset; |
| 143 | 143 | ||
| 144 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 145 | return addr; | ||
| 146 | |||
| 144 | switch (dir) { | 147 | switch (dir) { |
| 145 | case DMA_TO_DEVICE: | 148 | case DMA_TO_DEVICE: |
| 146 | /* Flush the dcache for the requested range */ | 149 | /* Flush the dcache for the requested range */ |
diff --git a/arch/parisc/kernel/pci-dma.c b/arch/parisc/kernel/pci-dma.c
index 494ff6e8c88a..b6298a85e8ae 100644
--- a/arch/parisc/kernel/pci-dma.c
+++ b/arch/parisc/kernel/pci-dma.c
| @@ -459,7 +459,9 @@ static dma_addr_t pa11_dma_map_page(struct device *dev, struct page *page, | |||
| 459 | void *addr = page_address(page) + offset; | 459 | void *addr = page_address(page) + offset; |
| 460 | BUG_ON(direction == DMA_NONE); | 460 | BUG_ON(direction == DMA_NONE); |
| 461 | 461 | ||
| 462 | flush_kernel_dcache_range((unsigned long) addr, size); | 462 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 463 | flush_kernel_dcache_range((unsigned long) addr, size); | ||
| 464 | |||
| 463 | return virt_to_phys(addr); | 465 | return virt_to_phys(addr); |
| 464 | } | 466 | } |
| 465 | 467 | ||
| @@ -469,8 +471,11 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, | |||
| 469 | { | 471 | { |
| 470 | BUG_ON(direction == DMA_NONE); | 472 | BUG_ON(direction == DMA_NONE); |
| 471 | 473 | ||
| 474 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 475 | return; | ||
| 476 | |||
| 472 | if (direction == DMA_TO_DEVICE) | 477 | if (direction == DMA_TO_DEVICE) |
| 473 | return; | 478 | return; |
| 474 | 479 | ||
| 475 | /* | 480 | /* |
| 476 | * For PCI_DMA_FROMDEVICE this flush is not necessary for the | 481 | * For PCI_DMA_FROMDEVICE this flush is not necessary for the |
| @@ -479,7 +484,6 @@ static void pa11_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, | |||
| 479 | */ | 484 | */ |
| 480 | 485 | ||
| 481 | flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size); | 486 | flush_kernel_dcache_range((unsigned long) phys_to_virt(dma_handle), size); |
| 482 | return; | ||
| 483 | } | 487 | } |
| 484 | 488 | ||
| 485 | static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, | 489 | static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, |
| @@ -496,6 +500,10 @@ static int pa11_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 496 | 500 | ||
| 497 | sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr); | 501 | sg_dma_address(sg) = (dma_addr_t) virt_to_phys(vaddr); |
| 498 | sg_dma_len(sg) = sg->length; | 502 | sg_dma_len(sg) = sg->length; |
| 503 | |||
| 504 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 505 | continue; | ||
| 506 | |||
| 499 | flush_kernel_dcache_range(vaddr, sg->length); | 507 | flush_kernel_dcache_range(vaddr, sg->length); |
| 500 | } | 508 | } |
| 501 | return nents; | 509 | return nents; |
| @@ -510,14 +518,16 @@ static void pa11_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
| 510 | 518 | ||
| 511 | BUG_ON(direction == DMA_NONE); | 519 | BUG_ON(direction == DMA_NONE); |
| 512 | 520 | ||
| 521 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 522 | return; | ||
| 523 | |||
| 513 | if (direction == DMA_TO_DEVICE) | 524 | if (direction == DMA_TO_DEVICE) |
| 514 | return; | 525 | return; |
| 515 | 526 | ||
| 516 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ | 527 | /* once we do combining we'll need to use phys_to_virt(sg_dma_address(sglist)) */ |
| 517 | 528 | ||
| 518 | for_each_sg(sglist, sg, nents, i) | 529 | for_each_sg(sglist, sg, nents, i) |
| 519 | flush_kernel_vmap_range(sg_virt(sg), sg->length); | 530 | flush_kernel_vmap_range(sg_virt(sg), sg->length); |
| 520 | return; | ||
| 521 | } | 531 | } |
| 522 | 532 | ||
| 523 | static void pa11_dma_sync_single_for_cpu(struct device *dev, | 533 | static void pa11_dma_sync_single_for_cpu(struct device *dev, |
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index e64a6016fba7..6877e3fa95bb 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
| @@ -203,6 +203,10 @@ static int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, | |||
| 203 | for_each_sg(sgl, sg, nents, i) { | 203 | for_each_sg(sgl, sg, nents, i) { |
| 204 | sg->dma_address = sg_phys(sg) + get_dma_offset(dev); | 204 | sg->dma_address = sg_phys(sg) + get_dma_offset(dev); |
| 205 | sg->dma_length = sg->length; | 205 | sg->dma_length = sg->length; |
| 206 | |||
| 207 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 208 | continue; | ||
| 209 | |||
| 206 | __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); | 210 | __dma_sync_page(sg_page(sg), sg->offset, sg->length, direction); |
| 207 | } | 211 | } |
| 208 | 212 | ||
| @@ -235,7 +239,10 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, | |||
| 235 | unsigned long attrs) | 239 | unsigned long attrs) |
| 236 | { | 240 | { |
| 237 | BUG_ON(dir == DMA_NONE); | 241 | BUG_ON(dir == DMA_NONE); |
| 238 | __dma_sync_page(page, offset, size, dir); | 242 | |
| 243 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
| 244 | __dma_sync_page(page, offset, size, dir); | ||
| 245 | |||
| 239 | return page_to_phys(page) + offset + get_dma_offset(dev); | 246 | return page_to_phys(page) + offset + get_dma_offset(dev); |
| 240 | } | 247 | } |
| 241 | 248 | ||
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 06254467e4dd..3a147122bc98 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
| @@ -236,7 +236,6 @@ static int | |||
| 236 | spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 236 | spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| 237 | { | 237 | { |
| 238 | struct spu_context *ctx = vma->vm_file->private_data; | 238 | struct spu_context *ctx = vma->vm_file->private_data; |
| 239 | unsigned long address = (unsigned long)vmf->virtual_address; | ||
| 240 | unsigned long pfn, offset; | 239 | unsigned long pfn, offset; |
| 241 | 240 | ||
| 242 | offset = vmf->pgoff << PAGE_SHIFT; | 241 | offset = vmf->pgoff << PAGE_SHIFT; |
| @@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 244 | return VM_FAULT_SIGBUS; | 243 | return VM_FAULT_SIGBUS; |
| 245 | 244 | ||
| 246 | pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n", | 245 | pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n", |
| 247 | address, offset); | 246 | vmf->address, offset); |
| 248 | 247 | ||
| 249 | if (spu_acquire(ctx)) | 248 | if (spu_acquire(ctx)) |
| 250 | return VM_FAULT_NOPAGE; | 249 | return VM_FAULT_NOPAGE; |
| @@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 256 | vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); | 255 | vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); |
| 257 | pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; | 256 | pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; |
| 258 | } | 257 | } |
| 259 | vm_insert_pfn(vma, address, pfn); | 258 | vm_insert_pfn(vma, vmf->address, pfn); |
| 260 | 259 | ||
| 261 | spu_release(ctx); | 260 | spu_release(ctx); |
| 262 | 261 | ||
| @@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, | |||
| 355 | down_read(&current->mm->mmap_sem); | 354 | down_read(&current->mm->mmap_sem); |
| 356 | } else { | 355 | } else { |
| 357 | area = ctx->spu->problem_phys + ps_offs; | 356 | area = ctx->spu->problem_phys + ps_offs; |
| 358 | vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, | 357 | vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); |
| 359 | (area + offset) >> PAGE_SHIFT); | ||
| 360 | spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); | 358 | spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); |
| 361 | } | 359 | } |
| 362 | 360 | ||
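The spufs hunks above are one instance of a substitution made throughout this commit: fault handlers read the page-aligned fault address directly from vmf->address instead of casting the old vmf->virtual_address pointer. Below is a minimal sketch of a PFN-inserting fault handler in the new style, against the two-argument signature still used here; struct my_dev and its base_pfn field are placeholders, and error handling is simplified.

#include <linux/mm.h>

struct my_dev {			/* illustrative device state */
	unsigned long base_pfn;
};

static int my_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct my_dev *mydev = vma->vm_private_data;
	unsigned long pfn;

	/* Offset into the VMA selects the page within the device region. */
	pfn = mydev->base_pfn + ((vmf->address - vma->vm_start) >> PAGE_SHIFT);

	if (vm_insert_pfn(vma, vmf->address, pfn))
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}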
diff --git a/arch/sh/kernel/dma-nommu.c b/arch/sh/kernel/dma-nommu.c
index eadb669a7329..47fee3b6e29c 100644
--- a/arch/sh/kernel/dma-nommu.c
+++ b/arch/sh/kernel/dma-nommu.c
| @@ -18,7 +18,9 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, | |||
| 18 | dma_addr_t addr = page_to_phys(page) + offset; | 18 | dma_addr_t addr = page_to_phys(page) + offset; |
| 19 | 19 | ||
| 20 | WARN_ON(size == 0); | 20 | WARN_ON(size == 0); |
| 21 | dma_cache_sync(dev, page_address(page) + offset, size, dir); | 21 | |
| 22 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) | ||
| 23 | dma_cache_sync(dev, page_address(page) + offset, size, dir); | ||
| 22 | 24 | ||
| 23 | return addr; | 25 | return addr; |
| 24 | } | 26 | } |
| @@ -35,7 +37,8 @@ static int nommu_map_sg(struct device *dev, struct scatterlist *sg, | |||
| 35 | for_each_sg(sg, s, nents, i) { | 37 | for_each_sg(sg, s, nents, i) { |
| 36 | BUG_ON(!sg_page(s)); | 38 | BUG_ON(!sg_page(s)); |
| 37 | 39 | ||
| 38 | dma_cache_sync(dev, sg_virt(s), s->length, dir); | 40 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 41 | dma_cache_sync(dev, sg_virt(s), s->length, dir); | ||
| 39 | 42 | ||
| 40 | s->dma_address = sg_phys(s); | 43 | s->dma_address = sg_phys(s); |
| 41 | s->dma_length = s->length; | 44 | s->dma_length = s->length; |
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 852a3291db96..9df997995f6b 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
| @@ -415,7 +415,7 @@ static void dma_4u_unmap_page(struct device *dev, dma_addr_t bus_addr, | |||
| 415 | ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL; | 415 | ctx = (iopte_val(*base) & IOPTE_CONTEXT) >> 47UL; |
| 416 | 416 | ||
| 417 | /* Step 1: Kick data out of streaming buffers if necessary. */ | 417 | /* Step 1: Kick data out of streaming buffers if necessary. */ |
| 418 | if (strbuf->strbuf_enabled) | 418 | if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 419 | strbuf_flush(strbuf, iommu, bus_addr, ctx, | 419 | strbuf_flush(strbuf, iommu, bus_addr, ctx, |
| 420 | npages, direction); | 420 | npages, direction); |
| 421 | 421 | ||
| @@ -640,7 +640,7 @@ static void dma_4u_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
| 640 | base = iommu->page_table + entry; | 640 | base = iommu->page_table + entry; |
| 641 | 641 | ||
| 642 | dma_handle &= IO_PAGE_MASK; | 642 | dma_handle &= IO_PAGE_MASK; |
| 643 | if (strbuf->strbuf_enabled) | 643 | if (strbuf->strbuf_enabled && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 644 | strbuf_flush(strbuf, iommu, dma_handle, ctx, | 644 | strbuf_flush(strbuf, iommu, dma_handle, ctx, |
| 645 | npages, direction); | 645 | npages, direction); |
| 646 | 646 | ||
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index 2344103414d1..6ffaec44931a 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
| @@ -527,7 +527,7 @@ static dma_addr_t pci32_map_page(struct device *dev, struct page *page, | |||
| 527 | static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size, | 527 | static void pci32_unmap_page(struct device *dev, dma_addr_t ba, size_t size, |
| 528 | enum dma_data_direction dir, unsigned long attrs) | 528 | enum dma_data_direction dir, unsigned long attrs) |
| 529 | { | 529 | { |
| 530 | if (dir != PCI_DMA_TODEVICE) | 530 | if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 531 | dma_make_coherent(ba, PAGE_ALIGN(size)); | 531 | dma_make_coherent(ba, PAGE_ALIGN(size)); |
| 532 | } | 532 | } |
| 533 | 533 | ||
| @@ -572,7 +572,7 @@ static void pci32_unmap_sg(struct device *dev, struct scatterlist *sgl, | |||
| 572 | struct scatterlist *sg; | 572 | struct scatterlist *sg; |
| 573 | int n; | 573 | int n; |
| 574 | 574 | ||
| 575 | if (dir != PCI_DMA_TODEVICE) { | 575 | if (dir != PCI_DMA_TODEVICE && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { |
| 576 | for_each_sg(sgl, sg, nents, n) { | 576 | for_each_sg(sgl, sg, nents, n) { |
| 577 | dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); | 577 | dma_make_coherent(sg_phys(sg), PAGE_ALIGN(sg->length)); |
| 578 | } | 578 | } |
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a9973bb4a1b2..95e73c63c99d 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
| @@ -42,7 +42,7 @@ static int panic_on_timeout; | |||
| 42 | */ | 42 | */ |
| 43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ | 43 | atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ |
| 44 | EXPORT_SYMBOL(nmi_active); | 44 | EXPORT_SYMBOL(nmi_active); |
| 45 | 45 | static int nmi_init_done; | |
| 46 | static unsigned int nmi_hz = HZ; | 46 | static unsigned int nmi_hz = HZ; |
| 47 | static DEFINE_PER_CPU(short, wd_enabled); | 47 | static DEFINE_PER_CPU(short, wd_enabled); |
| 48 | static int endflag __initdata; | 48 | static int endflag __initdata; |
| @@ -153,6 +153,8 @@ static void report_broken_nmi(int cpu, int *prev_nmi_count) | |||
| 153 | 153 | ||
| 154 | void stop_nmi_watchdog(void *unused) | 154 | void stop_nmi_watchdog(void *unused) |
| 155 | { | 155 | { |
| 156 | if (!__this_cpu_read(wd_enabled)) | ||
| 157 | return; | ||
| 156 | pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); | 158 | pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable); |
| 157 | __this_cpu_write(wd_enabled, 0); | 159 | __this_cpu_write(wd_enabled, 0); |
| 158 | atomic_dec(&nmi_active); | 160 | atomic_dec(&nmi_active); |
| @@ -207,6 +209,9 @@ error: | |||
| 207 | 209 | ||
| 208 | void start_nmi_watchdog(void *unused) | 210 | void start_nmi_watchdog(void *unused) |
| 209 | { | 211 | { |
| 212 | if (__this_cpu_read(wd_enabled)) | ||
| 213 | return; | ||
| 214 | |||
| 210 | __this_cpu_write(wd_enabled, 1); | 215 | __this_cpu_write(wd_enabled, 1); |
| 211 | atomic_inc(&nmi_active); | 216 | atomic_inc(&nmi_active); |
| 212 | 217 | ||
| @@ -259,6 +264,8 @@ int __init nmi_init(void) | |||
| 259 | } | 264 | } |
| 260 | } | 265 | } |
| 261 | 266 | ||
| 267 | nmi_init_done = 1; | ||
| 268 | |||
| 262 | return err; | 269 | return err; |
| 263 | } | 270 | } |
| 264 | 271 | ||
| @@ -270,3 +277,38 @@ static int __init setup_nmi_watchdog(char *str) | |||
| 270 | return 0; | 277 | return 0; |
| 271 | } | 278 | } |
| 272 | __setup("nmi_watchdog=", setup_nmi_watchdog); | 279 | __setup("nmi_watchdog=", setup_nmi_watchdog); |
| 280 | |||
| 281 | /* | ||
| 282 | * sparc specific NMI watchdog enable function. | ||
| 283 | * Enables watchdog if it is not enabled already. | ||
| 284 | */ | ||
| 285 | int watchdog_nmi_enable(unsigned int cpu) | ||
| 286 | { | ||
| 287 | if (atomic_read(&nmi_active) == -1) { | ||
| 288 | pr_warn("NMI watchdog cannot be enabled or disabled\n"); | ||
| 289 | return -1; | ||
| 290 | } | ||
| 291 | |||
| 292 | /* | ||
| 293 | * watchdog thread could start even before nmi_init is called. | ||
| 294 | * Just Return in that case. Let nmi_init finish the init | ||
| 295 | * process first. | ||
| 296 | */ | ||
| 297 | if (!nmi_init_done) | ||
| 298 | return 0; | ||
| 299 | |||
| 300 | smp_call_function_single(cpu, start_nmi_watchdog, NULL, 1); | ||
| 301 | |||
| 302 | return 0; | ||
| 303 | } | ||
| 304 | /* | ||
| 305 | * sparc specific NMI watchdog disable function. | ||
| 306 | * Disables watchdog if it is not disabled already. | ||
| 307 | */ | ||
| 308 | void watchdog_nmi_disable(unsigned int cpu) | ||
| 309 | { | ||
| 310 | if (atomic_read(&nmi_active) == -1) | ||
| 311 | pr_warn_once("NMI watchdog cannot be enabled or disabled\n"); | ||
| 312 | else | ||
| 313 | smp_call_function_single(cpu, stop_nmi_watchdog, NULL, 1); | ||
| 314 | } | ||
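The two functions added above are the sparc hooks behind the generic hard-lockup detector. The sketch below assumes the core treats watchdog_nmi_enable()/watchdog_nmi_disable() as overridable per-CPU hooks and only shows how such hooks could be driven for every online CPU; the loop is illustrative, not the actual kernel/watchdog.c code.

#include <linux/cpu.h>

int watchdog_nmi_enable(unsigned int cpu);
void watchdog_nmi_disable(unsigned int cpu);

/* Illustrative only: enable the NMI watchdog on each online CPU. */
static void example_enable_all(void)
{
	unsigned int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		watchdog_nmi_enable(cpu);
	put_online_cpus();
}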
diff --git a/arch/tile/kernel/pci-dma.c b/arch/tile/kernel/pci-dma.c
index 09bb774b39cd..24e0f8c21f2f 100644
--- a/arch/tile/kernel/pci-dma.c
+++ b/arch/tile/kernel/pci-dma.c
| @@ -213,10 +213,12 @@ static int tile_dma_map_sg(struct device *dev, struct scatterlist *sglist, | |||
| 213 | 213 | ||
| 214 | for_each_sg(sglist, sg, nents, i) { | 214 | for_each_sg(sglist, sg, nents, i) { |
| 215 | sg->dma_address = sg_phys(sg); | 215 | sg->dma_address = sg_phys(sg); |
| 216 | __dma_prep_pa_range(sg->dma_address, sg->length, direction); | ||
| 217 | #ifdef CONFIG_NEED_SG_DMA_LENGTH | 216 | #ifdef CONFIG_NEED_SG_DMA_LENGTH |
| 218 | sg->dma_length = sg->length; | 217 | sg->dma_length = sg->length; |
| 219 | #endif | 218 | #endif |
| 219 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 220 | continue; | ||
| 221 | __dma_prep_pa_range(sg->dma_address, sg->length, direction); | ||
| 220 | } | 222 | } |
| 221 | 223 | ||
| 222 | return nents; | 224 | return nents; |
| @@ -232,6 +234,8 @@ static void tile_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, | |||
| 232 | BUG_ON(!valid_dma_direction(direction)); | 234 | BUG_ON(!valid_dma_direction(direction)); |
| 233 | for_each_sg(sglist, sg, nents, i) { | 235 | for_each_sg(sglist, sg, nents, i) { |
| 234 | sg->dma_address = sg_phys(sg); | 236 | sg->dma_address = sg_phys(sg); |
| 237 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 238 | continue; | ||
| 235 | __dma_complete_pa_range(sg->dma_address, sg->length, | 239 | __dma_complete_pa_range(sg->dma_address, sg->length, |
| 236 | direction); | 240 | direction); |
| 237 | } | 241 | } |
| @@ -245,7 +249,8 @@ static dma_addr_t tile_dma_map_page(struct device *dev, struct page *page, | |||
| 245 | BUG_ON(!valid_dma_direction(direction)); | 249 | BUG_ON(!valid_dma_direction(direction)); |
| 246 | 250 | ||
| 247 | BUG_ON(offset + size > PAGE_SIZE); | 251 | BUG_ON(offset + size > PAGE_SIZE); |
| 248 | __dma_prep_page(page, offset, size, direction); | 252 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 253 | __dma_prep_page(page, offset, size, direction); | ||
| 249 | 254 | ||
| 250 | return page_to_pa(page) + offset; | 255 | return page_to_pa(page) + offset; |
| 251 | } | 256 | } |
| @@ -256,6 +261,9 @@ static void tile_dma_unmap_page(struct device *dev, dma_addr_t dma_address, | |||
| 256 | { | 261 | { |
| 257 | BUG_ON(!valid_dma_direction(direction)); | 262 | BUG_ON(!valid_dma_direction(direction)); |
| 258 | 263 | ||
| 264 | if (attrs & DMA_ATTR_SKIP_CPU_SYNC) | ||
| 265 | return; | ||
| 266 | |||
| 259 | __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)), | 267 | __dma_complete_page(pfn_to_page(PFN_DOWN(dma_address)), |
| 260 | dma_address & (PAGE_SIZE - 1), size, direction); | 268 | dma_address & (PAGE_SIZE - 1), size, direction); |
| 261 | } | 269 | } |
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index e739002427ed..40121d14d34d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
| @@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, | |||
| 109 | return VM_FAULT_SIGBUS; | 109 | return VM_FAULT_SIGBUS; |
| 110 | 110 | ||
| 111 | if (sym_offset == image->sym_vvar_page) { | 111 | if (sym_offset == image->sym_vvar_page) { |
| 112 | ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, | 112 | ret = vm_insert_pfn(vma, vmf->address, |
| 113 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); | 113 | __pa_symbol(&__vvar_page) >> PAGE_SHIFT); |
| 114 | } else if (sym_offset == image->sym_pvclock_page) { | 114 | } else if (sym_offset == image->sym_pvclock_page) { |
| 115 | struct pvclock_vsyscall_time_info *pvti = | 115 | struct pvclock_vsyscall_time_info *pvti = |
| @@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, | |||
| 117 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { | 117 | if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { |
| 118 | ret = vm_insert_pfn( | 118 | ret = vm_insert_pfn( |
| 119 | vma, | 119 | vma, |
| 120 | (unsigned long)vmf->virtual_address, | 120 | vmf->address, |
| 121 | __pa(pvti) >> PAGE_SHIFT); | 121 | __pa(pvti) >> PAGE_SHIFT); |
| 122 | } | 122 | } |
| 123 | } | 123 | } |
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 8c1f218926d7..307b1f4543de 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
| @@ -328,7 +328,7 @@ void machine_kexec(struct kimage *image) | |||
| 328 | 328 | ||
| 329 | void arch_crash_save_vmcoreinfo(void) | 329 | void arch_crash_save_vmcoreinfo(void) |
| 330 | { | 330 | { |
| 331 | VMCOREINFO_SYMBOL(phys_base); | 331 | VMCOREINFO_NUMBER(phys_base); |
| 332 | VMCOREINFO_SYMBOL(init_level4_pgt); | 332 | VMCOREINFO_SYMBOL(init_level4_pgt); |
| 333 | 333 | ||
| 334 | #ifdef CONFIG_NUMA | 334 | #ifdef CONFIG_NUMA |
| @@ -337,9 +337,7 @@ void arch_crash_save_vmcoreinfo(void) | |||
| 337 | #endif | 337 | #endif |
| 338 | vmcoreinfo_append_str("KERNELOFFSET=%lx\n", | 338 | vmcoreinfo_append_str("KERNELOFFSET=%lx\n", |
| 339 | kaslr_offset()); | 339 | kaslr_offset()); |
| 340 | VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET); | 340 | VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); |
| 341 | VMCOREINFO_VMALLOC_START(VMALLOC_START); | ||
| 342 | VMCOREINFO_VMEMMAP_START(VMEMMAP_START); | ||
| 343 | } | 341 | } |
| 344 | 342 | ||
| 345 | /* arch-dependent functionality related to kexec file-based syscall */ | 343 | /* arch-dependent functionality related to kexec file-based syscall */ |
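The vmcoreinfo change above swaps macros rather than just symbols: VMCOREINFO_SYMBOL() records the address of a variable, while VMCOREINFO_NUMBER() records its value, which is what a dump tool needs for phys_base and KERNEL_IMAGE_SIZE once they are consumed as numbers. The shapes below are quoted from memory and are only an approximation of include/linux/kexec.h.

/* Approximate shape of the two macros: */
#define VMCOREINFO_SYMBOL(name) \
	vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name)
#define VMCOREINFO_NUMBER(name) \
	vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)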
diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c
index 1e68806d6695..6a16decf278f 100644
--- a/arch/xtensa/kernel/pci-dma.c
+++ b/arch/xtensa/kernel/pci-dma.c
| @@ -189,7 +189,9 @@ static dma_addr_t xtensa_map_page(struct device *dev, struct page *page, | |||
| 189 | { | 189 | { |
| 190 | dma_addr_t dma_handle = page_to_phys(page) + offset; | 190 | dma_addr_t dma_handle = page_to_phys(page) + offset; |
| 191 | 191 | ||
| 192 | xtensa_sync_single_for_device(dev, dma_handle, size, dir); | 192 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 193 | xtensa_sync_single_for_device(dev, dma_handle, size, dir); | ||
| 194 | |||
| 193 | return dma_handle; | 195 | return dma_handle; |
| 194 | } | 196 | } |
| 195 | 197 | ||
| @@ -197,7 +199,8 @@ static void xtensa_unmap_page(struct device *dev, dma_addr_t dma_handle, | |||
| 197 | size_t size, enum dma_data_direction dir, | 199 | size_t size, enum dma_data_direction dir, |
| 198 | unsigned long attrs) | 200 | unsigned long attrs) |
| 199 | { | 201 | { |
| 200 | xtensa_sync_single_for_cpu(dev, dma_handle, size, dir); | 202 | if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) |
| 203 | xtensa_sync_single_for_cpu(dev, dma_handle, size, dir); | ||
| 201 | } | 204 | } |
| 202 | 205 | ||
| 203 | static int xtensa_map_sg(struct device *dev, struct scatterlist *sg, | 206 | static int xtensa_map_sg(struct device *dev, struct scatterlist *sg, |
diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c
index 199b8e99f7d7..737187865269 100644
--- a/drivers/char/agp/alpha-agp.c
+++ b/drivers/char/agp/alpha-agp.c
| @@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, | |||
| 19 | unsigned long pa; | 19 | unsigned long pa; |
| 20 | struct page *page; | 20 | struct page *page; |
| 21 | 21 | ||
| 22 | dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start | 22 | dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; |
| 23 | + agp->aperture.bus_base; | ||
| 24 | pa = agp->ops->translate(agp, dma_addr); | 23 | pa = agp->ops->translate(agp, dma_addr); |
| 25 | 24 | ||
| 26 | if (pa == (unsigned long)-EINVAL) | 25 | if (pa == (unsigned long)-EINVAL) |
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index f3f92d5fcda0..a697ca0cab1e 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
| @@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 227 | * be because another thread has installed the pte first, so it | 227 | * be because another thread has installed the pte first, so it |
| 228 | * is no problem. | 228 | * is no problem. |
| 229 | */ | 229 | */ |
| 230 | vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); | 230 | vm_insert_pfn(vma, vmf->address, pfn); |
| 231 | 231 | ||
| 232 | return VM_FAULT_NOPAGE; | 232 | return VM_FAULT_NOPAGE; |
| 233 | } | 233 | } |
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index 7a4869151d3b..a77262d31911 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
| @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(tpm_put_ops); | |||
| 84 | * | 84 | * |
| 85 | * The return'd chip has been tpm_try_get_ops'd and must be released via | 85 | * The return'd chip has been tpm_try_get_ops'd and must be released via |
| 86 | * tpm_put_ops | 86 | * tpm_put_ops |
| 87 | */ | 87 | */ |
| 88 | struct tpm_chip *tpm_chip_find_get(int chip_num) | 88 | struct tpm_chip *tpm_chip_find_get(int chip_num) |
| 89 | { | 89 | { |
| 90 | struct tpm_chip *chip, *res = NULL; | 90 | struct tpm_chip *chip, *res = NULL; |
| @@ -103,7 +103,7 @@ struct tpm_chip *tpm_chip_find_get(int chip_num) | |||
| 103 | } | 103 | } |
| 104 | } while (chip_prev != chip_num); | 104 | } while (chip_prev != chip_num); |
| 105 | } else { | 105 | } else { |
| 106 | chip = idr_find_slowpath(&dev_nums_idr, chip_num); | 106 | chip = idr_find(&dev_nums_idr, chip_num); |
| 107 | if (chip && !tpm_try_get_ops(chip)) | 107 | if (chip && !tpm_try_get_ops(chip)) |
| 108 | res = chip; | 108 | res = chip; |
| 109 | } | 109 | } |
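Aside from the idr_find() conversion, the comment in this hunk states the reference contract: a chip returned by tpm_chip_find_get() already holds the ops reference and must be released with tpm_put_ops(). A minimal caller sketch follows, assuming the TPM core's internal tpm.h declarations; the command step is left abstract.

/* Hedged sketch of the get/put pairing described above. */
static int example_use_chip(int chip_num)
{
	struct tpm_chip *chip;

	chip = tpm_chip_find_get(chip_num);
	if (!chip)
		return -ENODEV;

	/* ... issue commands through chip->ops here ... */

	tpm_put_ops(chip);
	return 0;
}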
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 286447a83dab..26ec39ddf21f 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
| @@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, | |||
| 328 | static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, | 328 | static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, |
| 329 | struct vm_fault *vmf) | 329 | struct vm_fault *vmf) |
| 330 | { | 330 | { |
| 331 | unsigned long vaddr = (unsigned long) vmf->virtual_address; | ||
| 332 | struct device *dev = &dax_dev->dev; | 331 | struct device *dev = &dax_dev->dev; |
| 333 | struct dax_region *dax_region; | 332 | struct dax_region *dax_region; |
| 334 | int rc = VM_FAULT_SIGBUS; | 333 | int rc = VM_FAULT_SIGBUS; |
| @@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, | |||
| 353 | 352 | ||
| 354 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); | 353 | pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); |
| 355 | 354 | ||
| 356 | rc = vm_insert_mixed(vma, vaddr, pfn); | 355 | rc = vm_insert_mixed(vma, vmf->address, pfn); |
| 357 | 356 | ||
| 358 | if (rc == -ENOMEM) | 357 | if (rc == -ENOMEM) |
| 359 | return VM_FAULT_OOM; | 358 | return VM_FAULT_OOM; |
diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
index 768087ddb046..a293c8be232c 100644
--- a/drivers/gpu/drm/armada/armada_gem.c
+++ b/drivers/gpu/drm/armada/armada_gem.c
| @@ -17,12 +17,11 @@ | |||
| 17 | static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 17 | static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| 18 | { | 18 | { |
| 19 | struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); | 19 | struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); |
| 20 | unsigned long addr = (unsigned long)vmf->virtual_address; | ||
| 21 | unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; | 20 | unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; |
| 22 | int ret; | 21 | int ret; |
| 23 | 22 | ||
| 24 | pfn += (addr - vma->vm_start) >> PAGE_SHIFT; | 23 | pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 25 | ret = vm_insert_pfn(vma, addr, pfn); | 24 | ret = vm_insert_pfn(vma, vmf->address, pfn); |
| 26 | 25 | ||
| 27 | switch (ret) { | 26 | switch (ret) { |
| 28 | case 0: | 27 | case 0: |
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index caa4e4ca616d..bd311c77c254 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
| @@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 124 | * Using vm_pgoff as a selector forces us to use this unusual | 124 | * Using vm_pgoff as a selector forces us to use this unusual |
| 125 | * addressing scheme. | 125 | * addressing scheme. |
| 126 | */ | 126 | */ |
| 127 | resource_size_t offset = (unsigned long)vmf->virtual_address - | 127 | resource_size_t offset = vmf->address - vma->vm_start; |
| 128 | vma->vm_start; | ||
| 129 | resource_size_t baddr = map->offset + offset; | 128 | resource_size_t baddr = map->offset + offset; |
| 130 | struct drm_agp_mem *agpmem; | 129 | struct drm_agp_mem *agpmem; |
| 131 | struct page *page; | 130 | struct page *page; |
| @@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 195 | if (!map) | 194 | if (!map) |
| 196 | return VM_FAULT_SIGBUS; /* Nothing allocated */ | 195 | return VM_FAULT_SIGBUS; /* Nothing allocated */ |
| 197 | 196 | ||
| 198 | offset = (unsigned long)vmf->virtual_address - vma->vm_start; | 197 | offset = vmf->address - vma->vm_start; |
| 199 | i = (unsigned long)map->handle + offset; | 198 | i = (unsigned long)map->handle + offset; |
| 200 | page = vmalloc_to_page((void *)i); | 199 | page = vmalloc_to_page((void *)i); |
| 201 | if (!page) | 200 | if (!page) |
| @@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 301 | if (!dma->pagelist) | 300 | if (!dma->pagelist) |
| 302 | return VM_FAULT_SIGBUS; /* Nothing allocated */ | 301 | return VM_FAULT_SIGBUS; /* Nothing allocated */ |
| 303 | 302 | ||
| 304 | offset = (unsigned long)vmf->virtual_address - vma->vm_start; /* vm_[pg]off[set] should be 0 */ | 303 | offset = vmf->address - vma->vm_start; |
| 304 | /* vm_[pg]off[set] should be 0 */ | ||
| 305 | page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */ | 305 | page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */ |
| 306 | page = virt_to_page((void *)dma->pagelist[page_nr]); | 306 | page = virt_to_page((void *)dma->pagelist[page_nr]); |
| 307 | 307 | ||
| @@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 337 | if (!entry->pagelist) | 337 | if (!entry->pagelist) |
| 338 | return VM_FAULT_SIGBUS; /* Nothing allocated */ | 338 | return VM_FAULT_SIGBUS; /* Nothing allocated */ |
| 339 | 339 | ||
| 340 | offset = (unsigned long)vmf->virtual_address - vma->vm_start; | 340 | offset = vmf->address - vma->vm_start; |
| 341 | map_offset = map->offset - (unsigned long)dev->sg->virtual; | 341 | map_offset = map->offset - (unsigned long)dev->sg->virtual; |
| 342 | page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); | 342 | page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); |
| 343 | page = entry->pagelist[page_offset]; | 343 | page = entry->pagelist[page_offset]; |
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 7d066a91d778..114dddbd297b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c | |||
| @@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 202 | } | 202 | } |
| 203 | 203 | ||
| 204 | /* We don't use vmf->pgoff since that has the fake offset: */ | 204 | /* We don't use vmf->pgoff since that has the fake offset: */ |
| 205 | pgoff = ((unsigned long)vmf->virtual_address - | 205 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 206 | vma->vm_start) >> PAGE_SHIFT; | ||
| 207 | 206 | ||
| 208 | page = pages[pgoff]; | 207 | page = pages[pgoff]; |
| 209 | 208 | ||
| 210 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 209 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
| 211 | page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); | 210 | page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); |
| 212 | 211 | ||
| 213 | ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); | 212 | ret = vm_insert_page(vma, vmf->address, page); |
| 214 | 213 | ||
| 215 | out: | 214 | out: |
| 216 | switch (ret) { | 215 | switch (ret) { |
| @@ -759,7 +758,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( | |||
| 759 | down_read(&mm->mmap_sem); | 758 | down_read(&mm->mmap_sem); |
| 760 | while (pinned < npages) { | 759 | while (pinned < npages) { |
| 761 | ret = get_user_pages_remote(task, mm, ptr, npages - pinned, | 760 | ret = get_user_pages_remote(task, mm, ptr, npages - pinned, |
| 762 | flags, pvec + pinned, NULL); | 761 | flags, pvec + pinned, NULL, NULL); |
| 763 | if (ret < 0) | 762 | if (ret < 0) |
| 764 | break; | 763 | break; |
| 765 | 764 | ||
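Several callers in this diff (etnaviv here, i915's userptr worker, the RDMA umem code and vfio further down) gain a trailing NULL in their get_user_pages_remote() calls. The new final parameter appears to be an int *locked out-pointer that lets the core GUP code drop and retake mmap_sem on the caller's behalf; callers that cannot tolerate that pass NULL and keep the old behaviour. A hedged sketch of a pinning loop in the same style follows; pin_user_range() is a made-up helper, and the prototype in the comment is an assumption inferred from the call sites, not copied from the tree.

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Assumed prototype after this series:
 *   long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
 *                              unsigned long start, unsigned long nr_pages,
 *                              unsigned int gup_flags, struct page **pages,
 *                              struct vm_area_struct **vmas, int *locked);
 */
static long pin_user_range(struct task_struct *task, struct mm_struct *mm,
			   unsigned long start, unsigned long npages,
			   unsigned int gup_flags, struct page **pvec)
{
	long pinned = 0, ret = 0;

	down_read(&mm->mmap_sem);
	while (pinned < npages) {
		ret = get_user_pages_remote(task, mm,
					    start + pinned * PAGE_SIZE,
					    npages - pinned, gup_flags,
					    pvec + pinned, NULL, NULL);
		if (ret < 1)	/* error, or no forward progress */
			break;
		pinned += ret;
	}
	up_read(&mm->mmap_sem);

	return pinned ? pinned : ret;
}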
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index ea7a18230888..57b81460fec8 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c | |||
| @@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 455 | pgoff_t page_offset; | 455 | pgoff_t page_offset; |
| 456 | int ret; | 456 | int ret; |
| 457 | 457 | ||
| 458 | page_offset = ((unsigned long)vmf->virtual_address - | 458 | page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 459 | vma->vm_start) >> PAGE_SHIFT; | ||
| 460 | 459 | ||
| 461 | if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { | 460 | if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { |
| 462 | DRM_ERROR("invalid page offset\n"); | 461 | DRM_ERROR("invalid page offset\n"); |
| @@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 465 | } | 464 | } |
| 466 | 465 | ||
| 467 | pfn = page_to_pfn(exynos_gem->pages[page_offset]); | 466 | pfn = page_to_pfn(exynos_gem->pages[page_offset]); |
| 468 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | 467 | ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); |
| 469 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
| 470 | 468 | ||
| 471 | out: | 469 | out: |
| 472 | switch (ret) { | 470 | switch (ret) { |
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 4071b2d1e8cf..8b44fa542562 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c | |||
| @@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 125 | psbfb->gtt->offset; | 125 | psbfb->gtt->offset; |
| 126 | 126 | ||
| 127 | page_num = vma_pages(vma); | 127 | page_num = vma_pages(vma); |
| 128 | address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT); | 128 | address = vmf->address - (vmf->pgoff << PAGE_SHIFT); |
| 129 | 129 | ||
| 130 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | 130 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
| 131 | 131 | ||
diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 6d1cb6b370b1..527c62917660 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c | |||
| @@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 197 | 197 | ||
| 198 | /* Page relative to the VMA start - we must calculate this ourselves | 198 | /* Page relative to the VMA start - we must calculate this ourselves |
| 199 | because vmf->pgoff is the fake GEM offset */ | 199 | because vmf->pgoff is the fake GEM offset */ |
| 200 | page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start) | 200 | page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 201 | >> PAGE_SHIFT; | ||
| 202 | 201 | ||
| 203 | /* CPU view of the page, don't go via the GART for CPU writes */ | 202 | /* CPU view of the page, don't go via the GART for CPU writes */ |
| 204 | if (r->stolen) | 203 | if (r->stolen) |
| 205 | pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; | 204 | pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; |
| 206 | else | 205 | else |
| 207 | pfn = page_to_pfn(r->pages[page_offset]); | 206 | pfn = page_to_pfn(r->pages[page_offset]); |
| 208 | ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); | 207 | ret = vm_insert_pfn(vma, vmf->address, pfn); |
| 209 | 208 | ||
| 210 | fail: | 209 | fail: |
| 211 | mutex_unlock(&dev_priv->mmap_mutex); | 210 | mutex_unlock(&dev_priv->mmap_mutex); |
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d0dcaf35b429..412f3513f269 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c | |||
| @@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) | |||
| 1796 | int ret; | 1796 | int ret; |
| 1797 | 1797 | ||
| 1798 | /* We don't use vmf->pgoff since that has the fake offset */ | 1798 | /* We don't use vmf->pgoff since that has the fake offset */ |
| 1799 | page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> | 1799 | page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; |
| 1800 | PAGE_SHIFT; | ||
| 1801 | 1800 | ||
| 1802 | trace_i915_gem_object_fault(obj, page_offset, true, write); | 1801 | trace_i915_gem_object_fault(obj, page_offset, true, write); |
| 1803 | 1802 | ||
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 107ddf51065e..d068af2ec3a3 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c | |||
| @@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) | |||
| 515 | obj->userptr.ptr + pinned * PAGE_SIZE, | 515 | obj->userptr.ptr + pinned * PAGE_SIZE, |
| 516 | npages - pinned, | 516 | npages - pinned, |
| 517 | flags, | 517 | flags, |
| 518 | pvec + pinned, NULL); | 518 | pvec + pinned, NULL, NULL); |
| 519 | if (ret < 0) | 519 | if (ret < 0) |
| 520 | break; | 520 | break; |
| 521 | 521 | ||
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index cd06cfd94687..d8bc59c7e261 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c | |||
| @@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 225 | } | 225 | } |
| 226 | 226 | ||
| 227 | /* We don't use vmf->pgoff since that has the fake offset: */ | 227 | /* We don't use vmf->pgoff since that has the fake offset: */ |
| 228 | pgoff = ((unsigned long)vmf->virtual_address - | 228 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 229 | vma->vm_start) >> PAGE_SHIFT; | ||
| 230 | 229 | ||
| 231 | pfn = page_to_pfn(pages[pgoff]); | 230 | pfn = page_to_pfn(pages[pgoff]); |
| 232 | 231 | ||
| 233 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 232 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
| 234 | pfn, pfn << PAGE_SHIFT); | 233 | pfn, pfn << PAGE_SHIFT); |
| 235 | 234 | ||
| 236 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | 235 | ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); |
| 237 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
| 238 | 236 | ||
| 239 | out_unlock: | 237 | out_unlock: |
| 240 | mutex_unlock(&dev->struct_mutex); | 238 | mutex_unlock(&dev->struct_mutex); |
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index d4e1e11466f8..4a90c690f09e 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c | |||
| @@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj, | |||
| 398 | pgoff_t pgoff; | 398 | pgoff_t pgoff; |
| 399 | 399 | ||
| 400 | /* We don't use vmf->pgoff since that has the fake offset: */ | 400 | /* We don't use vmf->pgoff since that has the fake offset: */ |
| 401 | pgoff = ((unsigned long)vmf->virtual_address - | 401 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 402 | vma->vm_start) >> PAGE_SHIFT; | ||
| 403 | 402 | ||
| 404 | if (omap_obj->pages) { | 403 | if (omap_obj->pages) { |
| 405 | omap_gem_cpu_sync(obj, pgoff); | 404 | omap_gem_cpu_sync(obj, pgoff); |
| @@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj, | |||
| 409 | pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; | 408 | pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; |
| 410 | } | 409 | } |
| 411 | 410 | ||
| 412 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 411 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
| 413 | pfn, pfn << PAGE_SHIFT); | 412 | pfn, pfn << PAGE_SHIFT); |
| 414 | 413 | ||
| 415 | return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, | 414 | return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); |
| 416 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
| 417 | } | 415 | } |
| 418 | 416 | ||
| 419 | /* Special handling for the case of faulting in 2d tiled buffers */ | 417 | /* Special handling for the case of faulting in 2d tiled buffers */ |
| @@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj, | |||
| 427 | struct page *pages[64]; /* XXX is this too much to have on stack? */ | 425 | struct page *pages[64]; /* XXX is this too much to have on stack? */ |
| 428 | unsigned long pfn; | 426 | unsigned long pfn; |
| 429 | pgoff_t pgoff, base_pgoff; | 427 | pgoff_t pgoff, base_pgoff; |
| 430 | void __user *vaddr; | 428 | unsigned long vaddr; |
| 431 | int i, ret, slots; | 429 | int i, ret, slots; |
| 432 | 430 | ||
| 433 | /* | 431 | /* |
| @@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj, | |||
| 447 | const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); | 445 | const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); |
| 448 | 446 | ||
| 449 | /* We don't use vmf->pgoff since that has the fake offset: */ | 447 | /* We don't use vmf->pgoff since that has the fake offset: */ |
| 450 | pgoff = ((unsigned long)vmf->virtual_address - | 448 | pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 451 | vma->vm_start) >> PAGE_SHIFT; | ||
| 452 | 449 | ||
| 453 | /* | 450 | /* |
| 454 | * Actual address we start mapping at is rounded down to previous slot | 451 | * Actual address we start mapping at is rounded down to previous slot |
| @@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj, | |||
| 459 | /* figure out buffer width in slots */ | 456 | /* figure out buffer width in slots */ |
| 460 | slots = omap_obj->width >> priv->usergart[fmt].slot_shift; | 457 | slots = omap_obj->width >> priv->usergart[fmt].slot_shift; |
| 461 | 458 | ||
| 462 | vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT); | 459 | vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT); |
| 463 | 460 | ||
| 464 | entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; | 461 | entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; |
| 465 | 462 | ||
| @@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj, | |||
| 503 | 500 | ||
| 504 | pfn = entry->paddr >> PAGE_SHIFT; | 501 | pfn = entry->paddr >> PAGE_SHIFT; |
| 505 | 502 | ||
| 506 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 503 | VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, |
| 507 | pfn, pfn << PAGE_SHIFT); | 504 | pfn, pfn << PAGE_SHIFT); |
| 508 | 505 | ||
| 509 | for (i = n; i > 0; i--) { | 506 | for (i = n; i > 0; i--) { |
| 510 | vm_insert_mixed(vma, (unsigned long)vaddr, | 507 | vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV)); |
| 511 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
| 512 | pfn += priv->usergart[fmt].stride_pfn; | 508 | pfn += priv->usergart[fmt].stride_pfn; |
| 513 | vaddr += PAGE_SIZE * m; | 509 | vaddr += PAGE_SIZE * m; |
| 514 | } | 510 | } |
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index c08e5279eeac..7d853e6b5ff0 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c | |||
| @@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 452 | if (!bo->pages) | 452 | if (!bo->pages) |
| 453 | return VM_FAULT_SIGBUS; | 453 | return VM_FAULT_SIGBUS; |
| 454 | 454 | ||
| 455 | offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; | 455 | offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 456 | page = bo->pages[offset]; | 456 | page = bo->pages[offset]; |
| 457 | 457 | ||
| 458 | err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); | 458 | err = vm_insert_page(vma, vmf->address, page); |
| 459 | switch (err) { | 459 | switch (err) { |
| 460 | case -EAGAIN: | 460 | case -EAGAIN: |
| 461 | case 0: | 461 | case 0: |
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 4748aedc933a..68ef993ab431 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c | |||
| @@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 101 | struct page *page; | 101 | struct page *page; |
| 102 | int ret; | 102 | int ret; |
| 103 | int i; | 103 | int i; |
| 104 | unsigned long address = (unsigned long)vmf->virtual_address; | 104 | unsigned long address = vmf->address; |
| 105 | int retval = VM_FAULT_NOPAGE; | 105 | int retval = VM_FAULT_NOPAGE; |
| 106 | struct ttm_mem_type_manager *man = | 106 | struct ttm_mem_type_manager *man = |
| 107 | &bdev->man[bo->mem.mem_type]; | 107 | &bdev->man[bo->mem.mem_type]; |
diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 818e70712b18..3c0c4bd3f750 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c | |||
| @@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 107 | unsigned int page_offset; | 107 | unsigned int page_offset; |
| 108 | int ret = 0; | 108 | int ret = 0; |
| 109 | 109 | ||
| 110 | page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> | 110 | page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; |
| 111 | PAGE_SHIFT; | ||
| 112 | 111 | ||
| 113 | if (!obj->pages) | 112 | if (!obj->pages) |
| 114 | return VM_FAULT_SIGBUS; | 113 | return VM_FAULT_SIGBUS; |
| 115 | 114 | ||
| 116 | page = obj->pages[page_offset]; | 115 | page = obj->pages[page_offset]; |
| 117 | ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); | 116 | ret = vm_insert_page(vma, vmf->address, page); |
| 118 | switch (ret) { | 117 | switch (ret) { |
| 119 | case -EAGAIN: | 118 | case -EAGAIN: |
| 120 | case 0: | 119 | case 0: |
diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index f36c14729b55..477e07f0ecb6 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c | |||
| @@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 54 | { | 54 | { |
| 55 | struct drm_vgem_gem_object *obj = vma->vm_private_data; | 55 | struct drm_vgem_gem_object *obj = vma->vm_private_data; |
| 56 | /* We don't use vmf->pgoff since that has the fake offset */ | 56 | /* We don't use vmf->pgoff since that has the fake offset */ |
| 57 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | 57 | unsigned long vaddr = vmf->address; |
| 58 | struct page *page; | 58 | struct page *page; |
| 59 | 59 | ||
| 60 | page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, | 60 | page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, |
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1f0fe3217f23..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c | |||
| @@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, | |||
| 578 | */ | 578 | */ |
| 579 | npages = get_user_pages_remote(owning_process, owning_mm, | 579 | npages = get_user_pages_remote(owning_process, owning_mm, |
| 580 | user_virt, gup_num_pages, | 580 | user_virt, gup_num_pages, |
| 581 | flags, local_page_list, NULL); | 581 | flags, local_page_list, NULL, NULL); |
| 582 | up_read(&owning_mm->mmap_sem); | 582 | up_read(&owning_mm->mmap_sem); |
| 583 | 583 | ||
| 584 | if (npages < 0) | 584 | if (npages < 0) |
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 1db0af6c7f94..ba63ca57ed7e 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c | |||
| @@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 439 | struct page *page; | 439 | struct page *page; |
| 440 | 440 | ||
| 441 | dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", | 441 | dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", |
| 442 | (unsigned long)vmf->virtual_address, | 442 | vmf->address, vma->vm_start, vma->vm_end); |
| 443 | vma->vm_start, vma->vm_end); | ||
| 444 | 443 | ||
| 445 | page = alloc_page(GFP_USER | __GFP_DMA32); | 444 | page = alloc_page(GFP_USER | __GFP_DMA32); |
| 446 | if (!page) | 445 | if (!page) |
| 447 | return VM_FAULT_OOM; | 446 | return VM_FAULT_OOM; |
| 448 | clear_user_highpage(page, (unsigned long)vmf->virtual_address); | 447 | clear_user_highpage(page, vmf->address); |
| 449 | vmf->page = page; | 448 | vmf->page = page; |
| 450 | 449 | ||
| 451 | return 0; | 450 | return 0; |
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 5e506c19108a..5d36dcc7f47e 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c | |||
| @@ -117,13 +117,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, | |||
| 117 | static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | 117 | static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) |
| 118 | { | 118 | { |
| 119 | struct cxl_context *ctx = vma->vm_file->private_data; | 119 | struct cxl_context *ctx = vma->vm_file->private_data; |
| 120 | unsigned long address = (unsigned long)vmf->virtual_address; | ||
| 121 | u64 area, offset; | 120 | u64 area, offset; |
| 122 | 121 | ||
| 123 | offset = vmf->pgoff << PAGE_SHIFT; | 122 | offset = vmf->pgoff << PAGE_SHIFT; |
| 124 | 123 | ||
| 125 | pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", | 124 | pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", |
| 126 | __func__, ctx->pe, address, offset); | 125 | __func__, ctx->pe, vmf->address, offset); |
| 127 | 126 | ||
| 128 | if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { | 127 | if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { |
| 129 | area = ctx->afu->psn_phys; | 128 | area = ctx->afu->psn_phys; |
| @@ -155,7 +154,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 155 | return VM_FAULT_SIGBUS; | 154 | return VM_FAULT_SIGBUS; |
| 156 | } | 155 | } |
| 157 | 156 | ||
| 158 | vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT); | 157 | vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); |
| 159 | 158 | ||
| 160 | mutex_unlock(&ctx->status_mutex); | 159 | mutex_unlock(&ctx->status_mutex); |
| 161 | 160 | ||
diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 33741ad4a74a..af2e077da4b8 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c | |||
| @@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 932 | unsigned long paddr, vaddr; | 932 | unsigned long paddr, vaddr; |
| 933 | unsigned long expires; | 933 | unsigned long expires; |
| 934 | 934 | ||
| 935 | vaddr = (unsigned long)vmf->virtual_address; | 935 | vaddr = vmf->address; |
| 936 | gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", | 936 | gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", |
| 937 | vma, vaddr, GSEG_BASE(vaddr)); | 937 | vma, vaddr, GSEG_BASE(vaddr)); |
| 938 | STAT(nopfn); | 938 | STAT(nopfn); |
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index d11093dce1b9..acbc3abe2ddd 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h | |||
| @@ -210,7 +210,12 @@ struct igb_tx_buffer { | |||
| 210 | struct igb_rx_buffer { | 210 | struct igb_rx_buffer { |
| 211 | dma_addr_t dma; | 211 | dma_addr_t dma; |
| 212 | struct page *page; | 212 | struct page *page; |
| 213 | unsigned int page_offset; | 213 | #if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) |
| 214 | __u32 page_offset; | ||
| 215 | #else | ||
| 216 | __u16 page_offset; | ||
| 217 | #endif | ||
| 218 | __u16 pagecnt_bias; | ||
| 214 | }; | 219 | }; |
| 215 | 220 | ||
| 216 | struct igb_tx_queue_stats { | 221 | struct igb_tx_queue_stats { |
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index cae24a8ccf47..a761001308dc 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c | |||
| @@ -3947,11 +3947,23 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) | |||
| 3947 | if (!buffer_info->page) | 3947 | if (!buffer_info->page) |
| 3948 | continue; | 3948 | continue; |
| 3949 | 3949 | ||
| 3950 | dma_unmap_page(rx_ring->dev, | 3950 | /* Invalidate cache lines that may have been written to by |
| 3951 | buffer_info->dma, | 3951 | * device so that we avoid corrupting memory. |
| 3952 | PAGE_SIZE, | 3952 | */ |
| 3953 | DMA_FROM_DEVICE); | 3953 | dma_sync_single_range_for_cpu(rx_ring->dev, |
| 3954 | __free_page(buffer_info->page); | 3954 | buffer_info->dma, |
| 3955 | buffer_info->page_offset, | ||
| 3956 | IGB_RX_BUFSZ, | ||
| 3957 | DMA_FROM_DEVICE); | ||
| 3958 | |||
| 3959 | /* free resources associated with mapping */ | ||
| 3960 | dma_unmap_page_attrs(rx_ring->dev, | ||
| 3961 | buffer_info->dma, | ||
| 3962 | PAGE_SIZE, | ||
| 3963 | DMA_FROM_DEVICE, | ||
| 3964 | DMA_ATTR_SKIP_CPU_SYNC); | ||
| 3965 | __page_frag_drain(buffer_info->page, 0, | ||
| 3966 | buffer_info->pagecnt_bias); | ||
| 3955 | 3967 | ||
| 3956 | buffer_info->page = NULL; | 3968 | buffer_info->page = NULL; |
| 3957 | } | 3969 | } |
| @@ -6812,12 +6824,6 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring, | |||
| 6812 | 6824 | ||
| 6813 | /* transfer page from old buffer to new buffer */ | 6825 | /* transfer page from old buffer to new buffer */ |
| 6814 | *new_buff = *old_buff; | 6826 | *new_buff = *old_buff; |
| 6815 | |||
| 6816 | /* sync the buffer for use by the device */ | ||
| 6817 | dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma, | ||
| 6818 | old_buff->page_offset, | ||
| 6819 | IGB_RX_BUFSZ, | ||
| 6820 | DMA_FROM_DEVICE); | ||
| 6821 | } | 6827 | } |
| 6822 | 6828 | ||
| 6823 | static inline bool igb_page_is_reserved(struct page *page) | 6829 | static inline bool igb_page_is_reserved(struct page *page) |
| @@ -6829,13 +6835,15 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, | |||
| 6829 | struct page *page, | 6835 | struct page *page, |
| 6830 | unsigned int truesize) | 6836 | unsigned int truesize) |
| 6831 | { | 6837 | { |
| 6838 | unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--; | ||
| 6839 | |||
| 6832 | /* avoid re-using remote pages */ | 6840 | /* avoid re-using remote pages */ |
| 6833 | if (unlikely(igb_page_is_reserved(page))) | 6841 | if (unlikely(igb_page_is_reserved(page))) |
| 6834 | return false; | 6842 | return false; |
| 6835 | 6843 | ||
| 6836 | #if (PAGE_SIZE < 8192) | 6844 | #if (PAGE_SIZE < 8192) |
| 6837 | /* if we are only owner of page we can reuse it */ | 6845 | /* if we are only owner of page we can reuse it */ |
| 6838 | if (unlikely(page_count(page) != 1)) | 6846 | if (unlikely(page_ref_count(page) != pagecnt_bias)) |
| 6839 | return false; | 6847 | return false; |
| 6840 | 6848 | ||
| 6841 | /* flip page offset to other buffer */ | 6849 | /* flip page offset to other buffer */ |
| @@ -6848,10 +6856,14 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, | |||
| 6848 | return false; | 6856 | return false; |
| 6849 | #endif | 6857 | #endif |
| 6850 | 6858 | ||
| 6851 | /* Even if we own the page, we are not allowed to use atomic_set() | 6859 | /* If we have drained the page fragment pool we need to update |
| 6852 | * This would break get_page_unless_zero() users. | 6860 | * the pagecnt_bias and page count so that we fully restock the |
| 6861 | * number of references the driver holds. | ||
| 6853 | */ | 6862 | */ |
| 6854 | page_ref_inc(page); | 6863 | if (unlikely(pagecnt_bias == 1)) { |
| 6864 | page_ref_add(page, USHRT_MAX); | ||
| 6865 | rx_buffer->pagecnt_bias = USHRT_MAX; | ||
| 6866 | } | ||
| 6855 | 6867 | ||
| 6856 | return true; | 6868 | return true; |
| 6857 | } | 6869 | } |
| @@ -6903,7 +6915,6 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, | |||
| 6903 | return true; | 6915 | return true; |
| 6904 | 6916 | ||
| 6905 | /* this page cannot be reused so discard it */ | 6917 | /* this page cannot be reused so discard it */ |
| 6906 | __free_page(page); | ||
| 6907 | return false; | 6918 | return false; |
| 6908 | } | 6919 | } |
| 6909 | 6920 | ||
| @@ -6938,6 +6949,13 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, | |||
| 6938 | page = rx_buffer->page; | 6949 | page = rx_buffer->page; |
| 6939 | prefetchw(page); | 6950 | prefetchw(page); |
| 6940 | 6951 | ||
| 6952 | /* we are reusing so sync this buffer for CPU use */ | ||
| 6953 | dma_sync_single_range_for_cpu(rx_ring->dev, | ||
| 6954 | rx_buffer->dma, | ||
| 6955 | rx_buffer->page_offset, | ||
| 6956 | size, | ||
| 6957 | DMA_FROM_DEVICE); | ||
| 6958 | |||
| 6941 | if (likely(!skb)) { | 6959 | if (likely(!skb)) { |
| 6942 | void *page_addr = page_address(page) + | 6960 | void *page_addr = page_address(page) + |
| 6943 | rx_buffer->page_offset; | 6961 | rx_buffer->page_offset; |
| @@ -6962,21 +6980,18 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, | |||
| 6962 | prefetchw(skb->data); | 6980 | prefetchw(skb->data); |
| 6963 | } | 6981 | } |
| 6964 | 6982 | ||
| 6965 | /* we are reusing so sync this buffer for CPU use */ | ||
| 6966 | dma_sync_single_range_for_cpu(rx_ring->dev, | ||
| 6967 | rx_buffer->dma, | ||
| 6968 | rx_buffer->page_offset, | ||
| 6969 | size, | ||
| 6970 | DMA_FROM_DEVICE); | ||
| 6971 | |||
| 6972 | /* pull page into skb */ | 6983 | /* pull page into skb */ |
| 6973 | if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { | 6984 | if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { |
| 6974 | /* hand second half of page back to the ring */ | 6985 | /* hand second half of page back to the ring */ |
| 6975 | igb_reuse_rx_page(rx_ring, rx_buffer); | 6986 | igb_reuse_rx_page(rx_ring, rx_buffer); |
| 6976 | } else { | 6987 | } else { |
| 6977 | /* we are not reusing the buffer so unmap it */ | 6988 | /* We are not reusing the buffer so unmap it and free |
| 6978 | dma_unmap_page(rx_ring->dev, rx_buffer->dma, | 6989 | * any references we are holding to it |
| 6979 | PAGE_SIZE, DMA_FROM_DEVICE); | 6990 | */ |
| 6991 | dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, | ||
| 6992 | PAGE_SIZE, DMA_FROM_DEVICE, | ||
| 6993 | DMA_ATTR_SKIP_CPU_SYNC); | ||
| 6994 | __page_frag_drain(page, 0, rx_buffer->pagecnt_bias); | ||
| 6980 | } | 6995 | } |
| 6981 | 6996 | ||
| 6982 | /* clear contents of rx_buffer */ | 6997 | /* clear contents of rx_buffer */ |
| @@ -7234,7 +7249,8 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, | |||
| 7234 | } | 7249 | } |
| 7235 | 7250 | ||
| 7236 | /* map page for use */ | 7251 | /* map page for use */ |
| 7237 | dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); | 7252 | dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, |
| 7253 | DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); | ||
| 7238 | 7254 | ||
| 7239 | /* if mapping failed free memory back to system since | 7255 | /* if mapping failed free memory back to system since |
| 7240 | * there isn't much point in holding memory we can't use | 7256 | * there isn't much point in holding memory we can't use |
| @@ -7249,6 +7265,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, | |||
| 7249 | bi->dma = dma; | 7265 | bi->dma = dma; |
| 7250 | bi->page = page; | 7266 | bi->page = page; |
| 7251 | bi->page_offset = 0; | 7267 | bi->page_offset = 0; |
| 7268 | bi->pagecnt_bias = 1; | ||
| 7252 | 7269 | ||
| 7253 | return true; | 7270 | return true; |
| 7254 | } | 7271 | } |
| @@ -7275,6 +7292,12 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) | |||
| 7275 | if (!igb_alloc_mapped_page(rx_ring, bi)) | 7292 | if (!igb_alloc_mapped_page(rx_ring, bi)) |
| 7276 | break; | 7293 | break; |
| 7277 | 7294 | ||
| 7295 | /* sync the buffer for use by the device */ | ||
| 7296 | dma_sync_single_range_for_device(rx_ring->dev, bi->dma, | ||
| 7297 | bi->page_offset, | ||
| 7298 | IGB_RX_BUFSZ, | ||
| 7299 | DMA_FROM_DEVICE); | ||
| 7300 | |||
| 7278 | /* Refresh the desc even if buffer_addrs didn't change | 7301 | /* Refresh the desc even if buffer_addrs didn't change |
| 7279 | * because each write-back erases this info. | 7302 | * because each write-back erases this info. |
| 7280 | */ | 7303 | */ |
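The igb changes above all serve one pattern: the receive page is mapped once with DMA_ATTR_SKIP_CPU_SYNC so that map and unmap never touch CPU caches, and the driver instead issues targeted partial syncs, for the device when a buffer slice is handed to hardware and for the CPU only over the bytes the NIC actually wrote. A condensed sketch of that pattern follows; the rx_buffer layout, RX_BUFSZ and the helper names are stand-ins rather than the real igb structures, and the pagecnt_bias bookkeeping the driver adds on top (so it can drop all of its page references at once with __page_frag_drain()) is left out.

#include <linux/dma-mapping.h>
#include <linux/gfp.h>

#define RX_BUFSZ 2048			/* stand-in for IGB_RX_BUFSZ */

struct rx_buffer {
	dma_addr_t dma;
	struct page *page;
	unsigned int page_offset;
};

/* Map a whole page for receive without an implicit CPU sync. */
static int rx_map_page(struct device *dev, struct rx_buffer *bi)
{
	bi->page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!bi->page)
		return -ENOMEM;

	bi->dma = dma_map_page_attrs(dev, bi->page, 0, PAGE_SIZE,
				     DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC);
	if (dma_mapping_error(dev, bi->dma)) {
		__free_page(bi->page);
		return -ENOMEM;
	}
	bi->page_offset = 0;

	/* Hand just this buffer's slice to the device. */
	dma_sync_single_range_for_device(dev, bi->dma, bi->page_offset,
					 RX_BUFSZ, DMA_FROM_DEVICE);
	return 0;
}

/* Before the CPU reads a completed buffer, sync only what was written. */
static void rx_sync_for_cpu(struct device *dev, struct rx_buffer *bi,
			    unsigned int size)
{
	dma_sync_single_range_for_cpu(dev, bi->dma, bi->page_offset,
				      size, DMA_FROM_DEVICE);
}

/* Tear down without another CPU sync; the CPU sync already happened. */
static void rx_unmap_page(struct device *dev, struct rx_buffer *bi)
{
	dma_unmap_page_attrs(dev, bi->dma, PAGE_SIZE, DMA_FROM_DEVICE,
			     DMA_ATTR_SKIP_CPU_SYNC);
	__free_page(bi->page);
	bi->page = NULL;
}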
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c index e9cef9de9ed8..c96f9b1d948a 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c | |||
| @@ -900,8 +900,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv, | |||
| 900 | 900 | ||
| 901 | /* bound gain by 2 bits value max, 3rd bit is sign */ | 901 | /* bound gain by 2 bits value max, 3rd bit is sign */ |
| 902 | data->delta_gain_code[i] = | 902 | data->delta_gain_code[i] = |
| 903 | min(abs(delta_g), | 903 | min(abs(delta_g), CHAIN_NOISE_MAX_DELTA_GAIN_CODE); |
| 904 | (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); | ||
| 905 | 904 | ||
| 906 | if (delta_g < 0) | 905 | if (delta_g < 0) |
| 907 | /* | 906 | /* |
diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index d5cc3070e83f..b653451843c8 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c | |||
| @@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 882 | BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); | 882 | BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); |
| 883 | 883 | ||
| 884 | pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); | 884 | pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); |
| 885 | ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); | 885 | ret = vm_insert_pfn(vma, vmf->address, pfn); |
| 886 | mutex_unlock(&buffer->lock); | 886 | mutex_unlock(&buffer->lock); |
| 887 | if (ret) | 887 | if (ret) |
| 888 | return VM_FAULT_ERROR; | 888 | return VM_FAULT_ERROR; |
diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 0b6d388d8aa4..697cbfbe9374 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c | |||
| @@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) | |||
| 1014 | "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", | 1014 | "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", |
| 1015 | vmf->page, vmf->page->mapping, vmf->page->index, | 1015 | vmf->page, vmf->page->mapping, vmf->page->index, |
| 1016 | (long)vmf->page->flags, page_count(vmf->page), | 1016 | (long)vmf->page->flags, page_count(vmf->page), |
| 1017 | page_private(vmf->page), vmf->virtual_address); | 1017 | page_private(vmf->page), (void *)vmf->address); |
| 1018 | if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { | 1018 | if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { |
| 1019 | lock_page(vmf->page); | 1019 | lock_page(vmf->page); |
| 1020 | cfio->ft_flags |= VM_FAULT_LOCKED; | 1020 | cfio->ft_flags |= VM_FAULT_LOCKED; |
| @@ -1025,12 +1025,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) | |||
| 1025 | } | 1025 | } |
| 1026 | 1026 | ||
| 1027 | if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { | 1027 | if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { |
| 1028 | CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); | 1028 | CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address); |
| 1029 | return -EFAULT; | 1029 | return -EFAULT; |
| 1030 | } | 1030 | } |
| 1031 | 1031 | ||
| 1032 | if (cfio->ft_flags & VM_FAULT_OOM) { | 1032 | if (cfio->ft_flags & VM_FAULT_OOM) { |
| 1033 | CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); | 1033 | CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address); |
| 1034 | return -ENOMEM; | 1034 | return -ENOMEM; |
| 1035 | } | 1035 | } |
| 1036 | 1036 | ||
diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index 7abd70b2a588..3151d2a0fe59 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c | |||
| @@ -905,7 +905,7 @@ static void hidg_free_inst(struct usb_function_instance *f) | |||
| 905 | mutex_lock(&hidg_ida_lock); | 905 | mutex_lock(&hidg_ida_lock); |
| 906 | 906 | ||
| 907 | hidg_put_minor(opts->minor); | 907 | hidg_put_minor(opts->minor); |
| 908 | if (idr_is_empty(&hidg_ida.idr)) | 908 | if (ida_is_empty(&hidg_ida)) |
| 909 | ghid_cleanup(); | 909 | ghid_cleanup(); |
| 910 | 910 | ||
| 911 | mutex_unlock(&hidg_ida_lock); | 911 | mutex_unlock(&hidg_ida_lock); |
| @@ -931,7 +931,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) | |||
| 931 | 931 | ||
| 932 | mutex_lock(&hidg_ida_lock); | 932 | mutex_lock(&hidg_ida_lock); |
| 933 | 933 | ||
| 934 | if (idr_is_empty(&hidg_ida.idr)) { | 934 | if (ida_is_empty(&hidg_ida)) { |
| 935 | status = ghid_setup(NULL, HIDG_MINORS); | 935 | status = ghid_setup(NULL, HIDG_MINORS); |
| 936 | if (status) { | 936 | if (status) { |
| 937 | ret = ERR_PTR(status); | 937 | ret = ERR_PTR(status); |
| @@ -944,7 +944,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) | |||
| 944 | if (opts->minor < 0) { | 944 | if (opts->minor < 0) { |
| 945 | ret = ERR_PTR(opts->minor); | 945 | ret = ERR_PTR(opts->minor); |
| 946 | kfree(opts); | 946 | kfree(opts); |
| 947 | if (idr_is_empty(&hidg_ida.idr)) | 947 | if (ida_is_empty(&hidg_ida)) |
| 948 | ghid_cleanup(); | 948 | ghid_cleanup(); |
| 949 | goto unlock; | 949 | goto unlock; |
| 950 | } | 950 | } |
diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 0de36cda6e41..8054da9276dd 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c | |||
| @@ -1265,7 +1265,7 @@ static void gprinter_free_inst(struct usb_function_instance *f) | |||
| 1265 | mutex_lock(&printer_ida_lock); | 1265 | mutex_lock(&printer_ida_lock); |
| 1266 | 1266 | ||
| 1267 | gprinter_put_minor(opts->minor); | 1267 | gprinter_put_minor(opts->minor); |
| 1268 | if (idr_is_empty(&printer_ida.idr)) | 1268 | if (ida_is_empty(&printer_ida)) |
| 1269 | gprinter_cleanup(); | 1269 | gprinter_cleanup(); |
| 1270 | 1270 | ||
| 1271 | mutex_unlock(&printer_ida_lock); | 1271 | mutex_unlock(&printer_ida_lock); |
| @@ -1289,7 +1289,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) | |||
| 1289 | 1289 | ||
| 1290 | mutex_lock(&printer_ida_lock); | 1290 | mutex_lock(&printer_ida_lock); |
| 1291 | 1291 | ||
| 1292 | if (idr_is_empty(&printer_ida.idr)) { | 1292 | if (ida_is_empty(&printer_ida)) { |
| 1293 | status = gprinter_setup(PRINTER_MINORS); | 1293 | status = gprinter_setup(PRINTER_MINORS); |
| 1294 | if (status) { | 1294 | if (status) { |
| 1295 | ret = ERR_PTR(status); | 1295 | ret = ERR_PTR(status); |
| @@ -1302,7 +1302,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) | |||
| 1302 | if (opts->minor < 0) { | 1302 | if (opts->minor < 0) { |
| 1303 | ret = ERR_PTR(opts->minor); | 1303 | ret = ERR_PTR(opts->minor); |
| 1304 | kfree(opts); | 1304 | kfree(opts); |
| 1305 | if (idr_is_empty(&printer_ida.idr)) | 1305 | if (ida_is_empty(&printer_ida)) |
| 1306 | gprinter_cleanup(); | 1306 | gprinter_cleanup(); |
| 1307 | goto unlock; | 1307 | goto unlock; |
| 1308 | } | 1308 | } |
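Both gadget functions (f_hid above and f_printer here) use the same idiom: shared infrastructure is set up when the first minor is allocated and torn down when the last one is released, and ida_is_empty() now provides the emptiness test instead of peeking at the IDA's embedded idr. A generic sketch of that idiom follows; func_ida, the mutex and my_setup()/my_cleanup() are placeholders, not the gadget code.

#include <linux/idr.h>
#include <linux/gfp.h>
#include <linux/mutex.h>

static DEFINE_IDA(func_ida);
static DEFINE_MUTEX(func_ida_lock);

static int my_setup(void) { return 0; }	/* placeholder one-time init */
static void my_cleanup(void) { }	/* placeholder teardown */

static int func_get_minor(void)
{
	int minor = 0, err = 0;

	mutex_lock(&func_ida_lock);
	if (ida_is_empty(&func_ida)) {
		err = my_setup();
		if (err)
			goto out;
	}
	minor = ida_simple_get(&func_ida, 0, 0, GFP_KERNEL);
	if (minor < 0 && ida_is_empty(&func_ida))
		my_cleanup();
out:
	mutex_unlock(&func_ida_lock);
	return err ? err : minor;
}

static void func_put_minor(int minor)
{
	mutex_lock(&func_ida_lock);
	ida_simple_remove(&func_ida, minor);
	if (ida_is_empty(&func_ida))
		my_cleanup();
	mutex_unlock(&func_ida_lock);
}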
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9815e45c23c4..f3726ba12aa6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c | |||
| @@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, | |||
| 362 | 362 | ||
| 363 | down_read(&mm->mmap_sem); | 363 | down_read(&mm->mmap_sem); |
| 364 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, | 364 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, |
| 365 | NULL); | 365 | NULL, NULL); |
| 366 | up_read(&mm->mmap_sem); | 366 | up_read(&mm->mmap_sem); |
| 367 | } | 367 | } |
| 368 | 368 | ||
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 702040fe2001..6e3306f4a525 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c | |||
| @@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 602 | { | 602 | { |
| 603 | printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", | 603 | printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", |
| 604 | vma, vma->vm_start, vma->vm_end, | 604 | vma, vma->vm_start, vma->vm_end, |
| 605 | vmf->pgoff, vmf->virtual_address); | 605 | vmf->pgoff, (void *)vmf->address); |
| 606 | 606 | ||
| 607 | return VM_FAULT_SIGBUS; | 607 | return VM_FAULT_SIGBUS; |
| 608 | } | 608 | } |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 180f910339f4..3b713b6fcc26 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
| @@ -202,12 +202,12 @@ static struct ratelimit_state printk_limits[] = { | |||
| 202 | void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) | 202 | void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) |
| 203 | { | 203 | { |
| 204 | struct super_block *sb = fs_info->sb; | 204 | struct super_block *sb = fs_info->sb; |
| 205 | char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1]; | 205 | char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; |
| 206 | struct va_format vaf; | 206 | struct va_format vaf; |
| 207 | va_list args; | 207 | va_list args; |
| 208 | const char *type = NULL; | ||
| 209 | int kern_level; | 208 | int kern_level; |
| 210 | struct ratelimit_state *ratelimit; | 209 | const char *type = logtypes[4]; |
| 210 | struct ratelimit_state *ratelimit = &printk_limits[4]; | ||
| 211 | 211 | ||
| 212 | va_start(args, fmt); | 212 | va_start(args, fmt); |
| 213 | 213 | ||
| @@ -223,12 +223,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) | |||
| 223 | fmt += size; | 223 | fmt += size; |
| 224 | } | 224 | } |
| 225 | 225 | ||
| 226 | if (!type) { | ||
| 227 | *lvl = '\0'; | ||
| 228 | type = logtypes[4]; | ||
| 229 | ratelimit = &printk_limits[4]; | ||
| 230 | } | ||
| 231 | |||
| 232 | vaf.fmt = fmt; | 226 | vaf.fmt = fmt; |
| 233 | vaf.va = &args; | 227 | vaf.va = &args; |
| 234 | 228 | ||
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index bf62ad919a95..00ee006a8aa2 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c | |||
| @@ -162,6 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) | |||
| 162 | slot = radix_tree_iter_retry(&iter); | 162 | slot = radix_tree_iter_retry(&iter); |
| 163 | continue; | 163 | continue; |
| 164 | } | 164 | } |
| 165 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 165 | spin_unlock(&fs_info->buffer_lock); | 166 | spin_unlock(&fs_info->buffer_lock); |
| 166 | free_extent_buffer_stale(eb); | 167 | free_extent_buffer_stale(eb); |
| 167 | spin_lock(&fs_info->buffer_lock); | 168 | spin_lock(&fs_info->buffer_lock); |
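The one-line btrfs-tests change belongs to a radix-tree API update in this series: once an iteration temporarily drops the lock that protects the tree, the iterator has to be revalidated with radix_tree_iter_resume() before continuing, exactly as the hunk does around free_extent_buffer_stale(). A generic, hedged sketch of that pattern follows; the tree, lock and per-item work function are placeholders, not the btrfs code.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

/* Walk every item in 'root', doing per-item work that may sleep.
 * The spinlock only protects the tree itself, so it is dropped around
 * the work and the iterator is resumed before the next iteration. */
static void walk_all(struct radix_tree_root *root, spinlock_t *lock,
		     void (*work)(void *item))
{
	struct radix_tree_iter iter;
	void **slot;

	spin_lock(lock);
	radix_tree_for_each_slot(slot, root, &iter, 0) {
		void *item = radix_tree_deref_slot_protected(slot, lock);

		if (!item)
			continue;
		if (radix_tree_deref_retry(item)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		slot = radix_tree_iter_resume(slot, &iter);
		spin_unlock(lock);
		work(item);		/* may sleep */
		spin_lock(lock);
	}
	spin_unlock(lock);
}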
diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/vmstat.h> | 31 | #include <linux/vmstat.h> |
| 32 | #include <linux/pfn_t.h> | 32 | #include <linux/pfn_t.h> |
| 33 | #include <linux/sizes.h> | 33 | #include <linux/sizes.h> |
| 34 | #include <linux/mmu_notifier.h> | ||
| 34 | #include <linux/iomap.h> | 35 | #include <linux/iomap.h> |
| 35 | #include "internal.h" | 36 | #include "internal.h" |
| 36 | 37 | ||
| @@ -240,6 +241,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, | |||
| 240 | } | 241 | } |
| 241 | } | 242 | } |
| 242 | 243 | ||
| 244 | static void dax_unlock_mapping_entry(struct address_space *mapping, | ||
| 245 | pgoff_t index) | ||
| 246 | { | ||
| 247 | void *entry, **slot; | ||
| 248 | |||
| 249 | spin_lock_irq(&mapping->tree_lock); | ||
| 250 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); | ||
| 251 | if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || | ||
| 252 | !slot_locked(mapping, slot))) { | ||
| 253 | spin_unlock_irq(&mapping->tree_lock); | ||
| 254 | return; | ||
| 255 | } | ||
| 256 | unlock_slot(mapping, slot); | ||
| 257 | spin_unlock_irq(&mapping->tree_lock); | ||
| 258 | dax_wake_mapping_entry_waiter(mapping, index, entry, false); | ||
| 259 | } | ||
| 260 | |||
| 243 | static void put_locked_mapping_entry(struct address_space *mapping, | 261 | static void put_locked_mapping_entry(struct address_space *mapping, |
| 244 | pgoff_t index, void *entry) | 262 | pgoff_t index, void *entry) |
| 245 | { | 263 | { |
| @@ -433,22 +451,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, | |||
| 433 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | 451 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); |
| 434 | } | 452 | } |
| 435 | 453 | ||
| 436 | void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) | ||
| 437 | { | ||
| 438 | void *entry, **slot; | ||
| 439 | |||
| 440 | spin_lock_irq(&mapping->tree_lock); | ||
| 441 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); | ||
| 442 | if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || | ||
| 443 | !slot_locked(mapping, slot))) { | ||
| 444 | spin_unlock_irq(&mapping->tree_lock); | ||
| 445 | return; | ||
| 446 | } | ||
| 447 | unlock_slot(mapping, slot); | ||
| 448 | spin_unlock_irq(&mapping->tree_lock); | ||
| 449 | dax_wake_mapping_entry_waiter(mapping, index, entry, false); | ||
| 450 | } | ||
| 451 | |||
| 452 | /* | 454 | /* |
| 453 | * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree | 455 | * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree |
| 454 | * entry to get unlocked before deleting it. | 456 | * entry to get unlocked before deleting it. |
| @@ -500,10 +502,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, | |||
| 500 | /* This will replace locked radix tree entry with a hole page */ | 502 | /* This will replace locked radix tree entry with a hole page */ |
| 501 | page = find_or_create_page(mapping, vmf->pgoff, | 503 | page = find_or_create_page(mapping, vmf->pgoff, |
| 502 | vmf->gfp_mask | __GFP_ZERO); | 504 | vmf->gfp_mask | __GFP_ZERO); |
| 503 | if (!page) { | 505 | if (!page) |
| 504 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); | ||
| 505 | return VM_FAULT_OOM; | 506 | return VM_FAULT_OOM; |
| 506 | } | ||
| 507 | vmf->page = page; | 507 | vmf->page = page; |
| 508 | return VM_FAULT_LOCKED; | 508 | return VM_FAULT_LOCKED; |
| 509 | } | 509 | } |
| @@ -615,36 +615,107 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, | |||
| 615 | return new_entry; | 615 | return new_entry; |
| 616 | } | 616 | } |
| 617 | 617 | ||
| 618 | static inline unsigned long | ||
| 619 | pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) | ||
| 620 | { | ||
| 621 | unsigned long address; | ||
| 622 | |||
| 623 | address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
| 624 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); | ||
| 625 | return address; | ||
| 626 | } | ||
| 627 | |||
| 628 | /* Walk all mappings of a given index of a file and writeprotect them */ | ||
| 629 | static void dax_mapping_entry_mkclean(struct address_space *mapping, | ||
| 630 | pgoff_t index, unsigned long pfn) | ||
| 631 | { | ||
| 632 | struct vm_area_struct *vma; | ||
| 633 | pte_t *ptep; | ||
| 634 | pte_t pte; | ||
| 635 | spinlock_t *ptl; | ||
| 636 | bool changed; | ||
| 637 | |||
| 638 | i_mmap_lock_read(mapping); | ||
| 639 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { | ||
| 640 | unsigned long address; | ||
| 641 | |||
| 642 | cond_resched(); | ||
| 643 | |||
| 644 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 645 | continue; | ||
| 646 | |||
| 647 | address = pgoff_address(index, vma); | ||
| 648 | changed = false; | ||
| 649 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) | ||
| 650 | continue; | ||
| 651 | if (pfn != pte_pfn(*ptep)) | ||
| 652 | goto unlock; | ||
| 653 | if (!pte_dirty(*ptep) && !pte_write(*ptep)) | ||
| 654 | goto unlock; | ||
| 655 | |||
| 656 | flush_cache_page(vma, address, pfn); | ||
| 657 | pte = ptep_clear_flush(vma, address, ptep); | ||
| 658 | pte = pte_wrprotect(pte); | ||
| 659 | pte = pte_mkclean(pte); | ||
| 660 | set_pte_at(vma->vm_mm, address, ptep, pte); | ||
| 661 | changed = true; | ||
| 662 | unlock: | ||
| 663 | pte_unmap_unlock(ptep, ptl); | ||
| 664 | |||
| 665 | if (changed) | ||
| 666 | mmu_notifier_invalidate_page(vma->vm_mm, address); | ||
| 667 | } | ||
| 668 | i_mmap_unlock_read(mapping); | ||
| 669 | } | ||
| 670 | |||
| 618 | static int dax_writeback_one(struct block_device *bdev, | 671 | static int dax_writeback_one(struct block_device *bdev, |
| 619 | struct address_space *mapping, pgoff_t index, void *entry) | 672 | struct address_space *mapping, pgoff_t index, void *entry) |
| 620 | { | 673 | { |
| 621 | struct radix_tree_root *page_tree = &mapping->page_tree; | 674 | struct radix_tree_root *page_tree = &mapping->page_tree; |
| 622 | struct radix_tree_node *node; | ||
| 623 | struct blk_dax_ctl dax; | 675 | struct blk_dax_ctl dax; |
| 624 | void **slot; | 676 | void *entry2, **slot; |
| 625 | int ret = 0; | 677 | int ret = 0; |
| 626 | 678 | ||
| 627 | spin_lock_irq(&mapping->tree_lock); | ||
| 628 | /* | 679 | /* |
| 629 | * Regular page slots are stabilized by the page lock even | 680 | * A page got tagged dirty in DAX mapping? Something is seriously |
| 630 | * without the tree itself locked. These unlocked entries | 681 | * wrong. |
| 631 | * need verification under the tree lock. | ||
| 632 | */ | 682 | */ |
| 633 | if (!__radix_tree_lookup(page_tree, index, &node, &slot)) | 683 | if (WARN_ON(!radix_tree_exceptional_entry(entry))) |
| 634 | goto unlock; | 684 | return -EIO; |
| 635 | if (*slot != entry) | ||
| 636 | goto unlock; | ||
| 637 | |||
| 638 | /* another fsync thread may have already written back this entry */ | ||
| 639 | if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) | ||
| 640 | goto unlock; | ||
| 641 | 685 | ||
| 686 | spin_lock_irq(&mapping->tree_lock); | ||
| 687 | entry2 = get_unlocked_mapping_entry(mapping, index, &slot); | ||
| 688 | /* Entry got punched out / reallocated? */ | ||
| 689 | if (!entry2 || !radix_tree_exceptional_entry(entry2)) | ||
| 690 | goto put_unlocked; | ||
| 691 | /* | ||
| 692 | * Entry got reallocated elsewhere? No need to writeback. We have to | ||
| 693 | * compare sectors as we must not bail out due to difference in lockbit | ||
| 694 | * or entry type. | ||
| 695 | */ | ||
| 696 | if (dax_radix_sector(entry2) != dax_radix_sector(entry)) | ||
| 697 | goto put_unlocked; | ||
| 642 | if (WARN_ON_ONCE(dax_is_empty_entry(entry) || | 698 | if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
| 643 | dax_is_zero_entry(entry))) { | 699 | dax_is_zero_entry(entry))) { |
| 644 | ret = -EIO; | 700 | ret = -EIO; |
| 645 | goto unlock; | 701 | goto put_unlocked; |
| 646 | } | 702 | } |
| 647 | 703 | ||
| 704 | /* Another fsync thread may have already written back this entry */ | ||
| 705 | if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) | ||
| 706 | goto put_unlocked; | ||
| 707 | /* Lock the entry to serialize with page faults */ | ||
| 708 | entry = lock_slot(mapping, slot); | ||
| 709 | /* | ||
| 710 | * We can clear the tag now but we have to be careful so that concurrent | ||
| 711 | * dax_writeback_one() calls for the same index cannot finish before we | ||
| 712 | * actually flush the caches. This is achieved as the calls will look | ||
| 713 | * at the entry only under tree_lock and once they do that they will | ||
| 714 | * see the entry locked and wait for it to unlock. | ||
| 715 | */ | ||
| 716 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); | ||
| 717 | spin_unlock_irq(&mapping->tree_lock); | ||
| 718 | |||
| 648 | /* | 719 | /* |
| 649 | * Even if dax_writeback_mapping_range() was given a wbc->range_start | 720 | * Even if dax_writeback_mapping_range() was given a wbc->range_start |
| 650 | * in the middle of a PMD, the 'index' we are given will be aligned to | 721 | * in the middle of a PMD, the 'index' we are given will be aligned to |
| @@ -654,31 +725,40 @@ static int dax_writeback_one(struct block_device *bdev, | |||
| 654 | */ | 725 | */ |
| 655 | dax.sector = dax_radix_sector(entry); | 726 | dax.sector = dax_radix_sector(entry); |
| 656 | dax.size = PAGE_SIZE << dax_radix_order(entry); | 727 | dax.size = PAGE_SIZE << dax_radix_order(entry); |
| 657 | spin_unlock_irq(&mapping->tree_lock); | ||
| 658 | 728 | ||
| 659 | /* | 729 | /* |
| 660 | * We cannot hold tree_lock while calling dax_map_atomic() because it | 730 | * We cannot hold tree_lock while calling dax_map_atomic() because it |
| 661 | * eventually calls cond_resched(). | 731 | * eventually calls cond_resched(). |
| 662 | */ | 732 | */ |
| 663 | ret = dax_map_atomic(bdev, &dax); | 733 | ret = dax_map_atomic(bdev, &dax); |
| 664 | if (ret < 0) | 734 | if (ret < 0) { |
| 735 | put_locked_mapping_entry(mapping, index, entry); | ||
| 665 | return ret; | 736 | return ret; |
| 737 | } | ||
| 666 | 738 | ||
| 667 | if (WARN_ON_ONCE(ret < dax.size)) { | 739 | if (WARN_ON_ONCE(ret < dax.size)) { |
| 668 | ret = -EIO; | 740 | ret = -EIO; |
| 669 | goto unmap; | 741 | goto unmap; |
| 670 | } | 742 | } |
| 671 | 743 | ||
| 744 | dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); | ||
| 672 | wb_cache_pmem(dax.addr, dax.size); | 745 | wb_cache_pmem(dax.addr, dax.size); |
| 673 | 746 | /* | |
| 747 | * After we have flushed the cache, we can clear the dirty tag. There | ||
| 748 | * cannot be new dirty data in the pfn after the flush has completed as | ||
| 749 | * the pfn mappings are writeprotected and fault waits for mapping | ||
| 750 | * entry lock. | ||
| 751 | */ | ||
| 674 | spin_lock_irq(&mapping->tree_lock); | 752 | spin_lock_irq(&mapping->tree_lock); |
| 675 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); | 753 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); |
| 676 | spin_unlock_irq(&mapping->tree_lock); | 754 | spin_unlock_irq(&mapping->tree_lock); |
| 677 | unmap: | 755 | unmap: |
| 678 | dax_unmap_atomic(bdev, &dax); | 756 | dax_unmap_atomic(bdev, &dax); |
| 757 | put_locked_mapping_entry(mapping, index, entry); | ||
| 679 | return ret; | 758 | return ret; |
| 680 | 759 | ||
| 681 | unlock: | 760 | put_unlocked: |
| 761 | put_unlocked_mapping_entry(mapping, index, entry2); | ||
| 682 | spin_unlock_irq(&mapping->tree_lock); | 762 | spin_unlock_irq(&mapping->tree_lock); |
| 683 | return ret; | 763 | return ret; |
| 684 | } | 764 | } |
| @@ -738,7 +818,7 @@ static int dax_insert_mapping(struct address_space *mapping, | |||
| 738 | struct block_device *bdev, sector_t sector, size_t size, | 818 | struct block_device *bdev, sector_t sector, size_t size, |
| 739 | void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) | 819 | void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) |
| 740 | { | 820 | { |
| 741 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | 821 | unsigned long vaddr = vmf->address; |
| 742 | struct blk_dax_ctl dax = { | 822 | struct blk_dax_ctl dax = { |
| 743 | .sector = sector, | 823 | .sector = sector, |
| 744 | .size = size, | 824 | .size = size, |
| @@ -767,17 +847,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 767 | { | 847 | { |
| 768 | struct file *file = vma->vm_file; | 848 | struct file *file = vma->vm_file; |
| 769 | struct address_space *mapping = file->f_mapping; | 849 | struct address_space *mapping = file->f_mapping; |
| 770 | void *entry; | 850 | void *entry, **slot; |
| 771 | pgoff_t index = vmf->pgoff; | 851 | pgoff_t index = vmf->pgoff; |
| 772 | 852 | ||
| 773 | spin_lock_irq(&mapping->tree_lock); | 853 | spin_lock_irq(&mapping->tree_lock); |
| 774 | entry = get_unlocked_mapping_entry(mapping, index, NULL); | 854 | entry = get_unlocked_mapping_entry(mapping, index, &slot); |
| 775 | if (!entry || !radix_tree_exceptional_entry(entry)) | 855 | if (!entry || !radix_tree_exceptional_entry(entry)) { |
| 776 | goto out; | 856 | if (entry) |
| 857 | put_unlocked_mapping_entry(mapping, index, entry); | ||
| 858 | spin_unlock_irq(&mapping->tree_lock); | ||
| 859 | return VM_FAULT_NOPAGE; | ||
| 860 | } | ||
| 777 | radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); | 861 | radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); |
| 778 | put_unlocked_mapping_entry(mapping, index, entry); | 862 | entry = lock_slot(mapping, slot); |
| 779 | out: | ||
| 780 | spin_unlock_irq(&mapping->tree_lock); | 863 | spin_unlock_irq(&mapping->tree_lock); |
| 864 | /* | ||
| 865 | * If we race with somebody updating the PTE and finish_mkwrite_fault() | ||
| 866 | * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry | ||
| 867 | * the fault in either case. | ||
| 868 | */ | ||
| 869 | finish_mkwrite_fault(vmf); | ||
| 870 | put_locked_mapping_entry(mapping, index, entry); | ||
| 781 | return VM_FAULT_NOPAGE; | 871 | return VM_FAULT_NOPAGE; |
| 782 | } | 872 | } |
| 783 | EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); | 873 | EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); |
| @@ -948,13 +1038,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
| 948 | { | 1038 | { |
| 949 | struct address_space *mapping = vma->vm_file->f_mapping; | 1039 | struct address_space *mapping = vma->vm_file->f_mapping; |
| 950 | struct inode *inode = mapping->host; | 1040 | struct inode *inode = mapping->host; |
| 951 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | 1041 | unsigned long vaddr = vmf->address; |
| 952 | loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; | 1042 | loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; |
| 953 | sector_t sector; | 1043 | sector_t sector; |
| 954 | struct iomap iomap = { 0 }; | 1044 | struct iomap iomap = { 0 }; |
| 955 | unsigned flags = IOMAP_FAULT; | 1045 | unsigned flags = IOMAP_FAULT; |
| 956 | int error, major = 0; | 1046 | int error, major = 0; |
| 957 | int locked_status = 0; | 1047 | int vmf_ret = 0; |
| 958 | void *entry; | 1048 | void *entry; |
| 959 | 1049 | ||
| 960 | /* | 1050 | /* |
| @@ -1007,13 +1097,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
| 1007 | 1097 | ||
| 1008 | if (error) | 1098 | if (error) |
| 1009 | goto finish_iomap; | 1099 | goto finish_iomap; |
| 1010 | if (!radix_tree_exceptional_entry(entry)) { | 1100 | |
| 1011 | vmf->page = entry; | 1101 | __SetPageUptodate(vmf->cow_page); |
| 1012 | locked_status = VM_FAULT_LOCKED; | 1102 | vmf_ret = finish_fault(vmf); |
| 1013 | } else { | 1103 | if (!vmf_ret) |
| 1014 | vmf->entry = entry; | 1104 | vmf_ret = VM_FAULT_DONE_COW; |
| 1015 | locked_status = VM_FAULT_DAX_LOCKED; | ||
| 1016 | } | ||
| 1017 | goto finish_iomap; | 1105 | goto finish_iomap; |
| 1018 | } | 1106 | } |
| 1019 | 1107 | ||
| @@ -1030,7 +1118,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
| 1030 | case IOMAP_UNWRITTEN: | 1118 | case IOMAP_UNWRITTEN: |
| 1031 | case IOMAP_HOLE: | 1119 | case IOMAP_HOLE: |
| 1032 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { | 1120 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { |
| 1033 | locked_status = dax_load_hole(mapping, entry, vmf); | 1121 | vmf_ret = dax_load_hole(mapping, entry, vmf); |
| 1034 | break; | 1122 | break; |
| 1035 | } | 1123 | } |
| 1036 | /*FALLTHRU*/ | 1124 | /*FALLTHRU*/ |
| @@ -1042,7 +1130,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
| 1042 | 1130 | ||
| 1043 | finish_iomap: | 1131 | finish_iomap: |
| 1044 | if (ops->iomap_end) { | 1132 | if (ops->iomap_end) { |
| 1045 | if (error) { | 1133 | if (error || (vmf_ret & VM_FAULT_ERROR)) { |
| 1046 | /* keep previous error */ | 1134 | /* keep previous error */ |
| 1047 | ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, | 1135 | ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, |
| 1048 | &iomap); | 1136 | &iomap); |
| @@ -1052,7 +1140,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
| 1052 | } | 1140 | } |
| 1053 | } | 1141 | } |
| 1054 | unlock_entry: | 1142 | unlock_entry: |
| 1055 | if (!locked_status || error) | 1143 | if (vmf_ret != VM_FAULT_LOCKED || error) |
| 1056 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); | 1144 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); |
| 1057 | out: | 1145 | out: |
| 1058 | if (error == -ENOMEM) | 1146 | if (error == -ENOMEM) |
| @@ -1060,9 +1148,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
| 1060 | /* -EBUSY is fine, somebody else faulted on the same PTE */ | 1148 | /* -EBUSY is fine, somebody else faulted on the same PTE */ |
| 1061 | if (error < 0 && error != -EBUSY) | 1149 | if (error < 0 && error != -EBUSY) |
| 1062 | return VM_FAULT_SIGBUS | major; | 1150 | return VM_FAULT_SIGBUS | major; |
| 1063 | if (locked_status) { | 1151 | if (vmf_ret) { |
| 1064 | WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ | 1152 | WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ |
| 1065 | return locked_status; | 1153 | return vmf_ret; |
| 1066 | } | 1154 | } |
| 1067 | return VM_FAULT_NOPAGE | major; | 1155 | return VM_FAULT_NOPAGE | major; |
| 1068 | } | 1156 | } |
| @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, | |||
| 209 | * doing the exec and bprm->mm is the new process's mm. | 209 | * doing the exec and bprm->mm is the new process's mm. |
| 210 | */ | 210 | */ |
| 211 | ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, | 211 | ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, |
| 212 | &page, NULL); | 212 | &page, NULL, NULL); |
| 213 | if (ret <= 0) | 213 | if (ret <= 0) |
| 214 | return NULL; | 214 | return NULL; |
| 215 | 215 | ||
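get_user_pages_remote() gained a trailing int *locked argument in this series; callers such as the one above, which hold mmap_sem across the whole call, just pass NULL. A hedged sketch of the other mode, where GUP is allowed to drop mmap_sem while it sleeps (pin_remote_page and its arguments are made up for the example):

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Pin one page of another process, letting GUP drop mmap_sem if needed. */
    static int pin_remote_page(struct task_struct *tsk, struct mm_struct *mm,
    			   unsigned long addr, struct page **pagep)
    {
    	int locked = 1;
    	long ret;

    	down_read(&mm->mmap_sem);
    	ret = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE,
    				    pagep, NULL, &locked);
    	/* GUP clears 'locked' if it had to release mmap_sem internally. */
    	if (locked)
    		up_read(&mm->mmap_sem);
    	return ret == 1 ? 0 : -EFAULT;	/* caller put_page()s on success */
    }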
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..d96e2f30084b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c | |||
| @@ -257,9 +257,9 @@ out: | |||
| 257 | * fatal_signal_pending()s, and the mmap_sem must be released before | 257 | * fatal_signal_pending()s, and the mmap_sem must be released before |
| 258 | * returning it. | 258 | * returning it. |
| 259 | */ | 259 | */ |
| 260 | int handle_userfault(struct fault_env *fe, unsigned long reason) | 260 | int handle_userfault(struct vm_fault *vmf, unsigned long reason) |
| 261 | { | 261 | { |
| 262 | struct mm_struct *mm = fe->vma->vm_mm; | 262 | struct mm_struct *mm = vmf->vma->vm_mm; |
| 263 | struct userfaultfd_ctx *ctx; | 263 | struct userfaultfd_ctx *ctx; |
| 264 | struct userfaultfd_wait_queue uwq; | 264 | struct userfaultfd_wait_queue uwq; |
| 265 | int ret; | 265 | int ret; |
| @@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
| 268 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 268 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
| 269 | 269 | ||
| 270 | ret = VM_FAULT_SIGBUS; | 270 | ret = VM_FAULT_SIGBUS; |
| 271 | ctx = fe->vma->vm_userfaultfd_ctx.ctx; | 271 | ctx = vmf->vma->vm_userfaultfd_ctx.ctx; |
| 272 | if (!ctx) | 272 | if (!ctx) |
| 273 | goto out; | 273 | goto out; |
| 274 | 274 | ||
| @@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
| 301 | * without first stopping userland access to the memory. For | 301 | * without first stopping userland access to the memory. For |
| 302 | * VM_UFFD_MISSING userfaults this is enough for now. | 302 | * VM_UFFD_MISSING userfaults this is enough for now. |
| 303 | */ | 303 | */ |
| 304 | if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { | 304 | if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { |
| 305 | /* | 305 | /* |
| 306 | * Validate the invariant that nowait must allow retry | 306 | * Validate the invariant that nowait must allow retry |
| 307 | * to be sure not to return SIGBUS erroneously on | 307 | * to be sure not to return SIGBUS erroneously on |
| 308 | * nowait invocations. | 308 | * nowait invocations. |
| 309 | */ | 309 | */ |
| 310 | BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); | 310 | BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); |
| 311 | #ifdef CONFIG_DEBUG_VM | 311 | #ifdef CONFIG_DEBUG_VM |
| 312 | if (printk_ratelimit()) { | 312 | if (printk_ratelimit()) { |
| 313 | printk(KERN_WARNING | 313 | printk(KERN_WARNING |
| 314 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); | 314 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", |
| 315 | vmf->flags); | ||
| 315 | dump_stack(); | 316 | dump_stack(); |
| 316 | } | 317 | } |
| 317 | #endif | 318 | #endif |
| @@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
| 323 | * and wait. | 324 | * and wait. |
| 324 | */ | 325 | */ |
| 325 | ret = VM_FAULT_RETRY; | 326 | ret = VM_FAULT_RETRY; |
| 326 | if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) | 327 | if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) |
| 327 | goto out; | 328 | goto out; |
| 328 | 329 | ||
| 329 | /* take the reference before dropping the mmap_sem */ | 330 | /* take the reference before dropping the mmap_sem */ |
| @@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
| 331 | 332 | ||
| 332 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); | 333 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); |
| 333 | uwq.wq.private = current; | 334 | uwq.wq.private = current; |
| 334 | uwq.msg = userfault_msg(fe->address, fe->flags, reason); | 335 | uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); |
| 335 | uwq.ctx = ctx; | 336 | uwq.ctx = ctx; |
| 336 | 337 | ||
| 337 | return_to_userland = | 338 | return_to_userland = |
| 338 | (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == | 339 | (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == |
| 339 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); | 340 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); |
| 340 | 341 | ||
| 341 | spin_lock(&ctx->fault_pending_wqh.lock); | 342 | spin_lock(&ctx->fault_pending_wqh.lock); |
| @@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) | |||
| 353 | TASK_KILLABLE); | 354 | TASK_KILLABLE); |
| 354 | spin_unlock(&ctx->fault_pending_wqh.lock); | 355 | spin_unlock(&ctx->fault_pending_wqh.lock); |
| 355 | 356 | ||
| 356 | must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); | 357 | must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, |
| 358 | reason); | ||
| 357 | up_read(&mm->mmap_sem); | 359 | up_read(&mm->mmap_sem); |
| 358 | 360 | ||
| 359 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && | 361 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && |
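After the fault_env to vm_fault conversion, handle_userfault() receives the same descriptor the rest of the fault path carries around. Roughly how an anonymous-fault handler hands a missing page off to userspace after this change (a simplified sketch, not the exact mm/memory.c code):

    /* Simplified: vmf->pte is mapped and vmf->ptl is held at this point. */
    static int hand_off_to_userfaultfd(struct vm_fault *vmf)
    {
    	if (userfaultfd_missing(vmf->vma)) {
    		pte_unmap_unlock(vmf->pte, vmf->ptl);
    		/* Sleeps until userspace resolves the fault, then usually
    		 * returns VM_FAULT_RETRY. */
    		return handle_userfault(vmf, VM_UFFD_MISSING);
    	}
    	return 0;
    }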
diff --git a/include/linux/dax.h b/include/linux/dax.h index 0afade8bd3d7..f97bcfe79472 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h | |||
| @@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, | |||
| 46 | 46 | ||
| 47 | #ifdef CONFIG_FS_DAX | 47 | #ifdef CONFIG_FS_DAX |
| 48 | struct page *read_dax_sector(struct block_device *bdev, sector_t n); | 48 | struct page *read_dax_sector(struct block_device *bdev, sector_t n); |
| 49 | void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); | ||
| 50 | int __dax_zero_page_range(struct block_device *bdev, sector_t sector, | 49 | int __dax_zero_page_range(struct block_device *bdev, sector_t sector, |
| 51 | unsigned int offset, unsigned int length); | 50 | unsigned int offset, unsigned int length); |
| 52 | #else | 51 | #else |
| @@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev, | |||
| 55 | { | 54 | { |
| 56 | return ERR_PTR(-ENXIO); | 55 | return ERR_PTR(-ENXIO); |
| 57 | } | 56 | } |
| 58 | /* Shouldn't ever be called when dax is disabled. */ | ||
| 59 | static inline void dax_unlock_mapping_entry(struct address_space *mapping, | ||
| 60 | pgoff_t index) | ||
| 61 | { | ||
| 62 | BUG(); | ||
| 63 | } | ||
| 64 | static inline int __dax_zero_page_range(struct block_device *bdev, | 57 | static inline int __dax_zero_page_range(struct block_device *bdev, |
| 65 | sector_t sector, unsigned int offset, unsigned int length) | 58 | sector_t sector, unsigned int offset, unsigned int length) |
| 66 | { | 59 | { |
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 08528afdf58b..10c5a17b1f51 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h | |||
| @@ -243,29 +243,33 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg | |||
| 243 | ops->unmap_sg(dev, sg, nents, dir, attrs); | 243 | ops->unmap_sg(dev, sg, nents, dir, attrs); |
| 244 | } | 244 | } |
| 245 | 245 | ||
| 246 | static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, | 246 | static inline dma_addr_t dma_map_page_attrs(struct device *dev, |
| 247 | size_t offset, size_t size, | 247 | struct page *page, |
| 248 | enum dma_data_direction dir) | 248 | size_t offset, size_t size, |
| 249 | enum dma_data_direction dir, | ||
| 250 | unsigned long attrs) | ||
| 249 | { | 251 | { |
| 250 | struct dma_map_ops *ops = get_dma_ops(dev); | 252 | struct dma_map_ops *ops = get_dma_ops(dev); |
| 251 | dma_addr_t addr; | 253 | dma_addr_t addr; |
| 252 | 254 | ||
| 253 | kmemcheck_mark_initialized(page_address(page) + offset, size); | 255 | kmemcheck_mark_initialized(page_address(page) + offset, size); |
| 254 | BUG_ON(!valid_dma_direction(dir)); | 256 | BUG_ON(!valid_dma_direction(dir)); |
| 255 | addr = ops->map_page(dev, page, offset, size, dir, 0); | 257 | addr = ops->map_page(dev, page, offset, size, dir, attrs); |
| 256 | debug_dma_map_page(dev, page, offset, size, dir, addr, false); | 258 | debug_dma_map_page(dev, page, offset, size, dir, addr, false); |
| 257 | 259 | ||
| 258 | return addr; | 260 | return addr; |
| 259 | } | 261 | } |
| 260 | 262 | ||
| 261 | static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, | 263 | static inline void dma_unmap_page_attrs(struct device *dev, |
| 262 | size_t size, enum dma_data_direction dir) | 264 | dma_addr_t addr, size_t size, |
| 265 | enum dma_data_direction dir, | ||
| 266 | unsigned long attrs) | ||
| 263 | { | 267 | { |
| 264 | struct dma_map_ops *ops = get_dma_ops(dev); | 268 | struct dma_map_ops *ops = get_dma_ops(dev); |
| 265 | 269 | ||
| 266 | BUG_ON(!valid_dma_direction(dir)); | 270 | BUG_ON(!valid_dma_direction(dir)); |
| 267 | if (ops->unmap_page) | 271 | if (ops->unmap_page) |
| 268 | ops->unmap_page(dev, addr, size, dir, 0); | 272 | ops->unmap_page(dev, addr, size, dir, attrs); |
| 269 | debug_dma_unmap_page(dev, addr, size, dir, false); | 273 | debug_dma_unmap_page(dev, addr, size, dir, false); |
| 270 | } | 274 | } |
| 271 | 275 | ||
| @@ -385,6 +389,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, | |||
| 385 | #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) | 389 | #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) |
| 386 | #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) | 390 | #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) |
| 387 | #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) | 391 | #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) |
| 392 | #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) | ||
| 393 | #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) | ||
| 388 | 394 | ||
| 389 | extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, | 395 | extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, |
| 390 | void *cpu_addr, dma_addr_t dma_addr, size_t size); | 396 | void *cpu_addr, dma_addr_t dma_addr, size_t size); |
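The new dma_map_page_attrs()/dma_unmap_page_attrs() helpers (and the macros that make the old names the attrs == 0 case) let a driver pass DMA_ATTR_SKIP_CPU_SYNC and take over cache maintenance itself, which is the usual pattern when receive pages are recycled. A hedged sketch; the function name and the idea of syncing only the touched region are illustrative, not taken from a real driver:

    #include <linux/dma-mapping.h>

    /*
     * Map a recycled RX page without the implicit sync, then sync only the
     * region the CPU actually dirtied since the page was last given to HW.
     */
    static dma_addr_t map_recycled_rx_page(struct device *dev, struct page *page,
    					size_t touched_len)
    {
    	dma_addr_t addr;

    	addr = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE,
    				  DMA_ATTR_SKIP_CPU_SYNC);
    	if (dma_mapping_error(dev, addr))
    		return addr;

    	dma_sync_single_for_device(dev, addr, touched_len, DMA_FROM_DEVICE);
    	return addr;
    }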
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f8041f9de31e..4175dca4ac39 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
| @@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold); | |||
| 506 | extern void free_hot_cold_page_list(struct list_head *list, bool cold); | 506 | extern void free_hot_cold_page_list(struct list_head *list, bool cold); |
| 507 | 507 | ||
| 508 | struct page_frag_cache; | 508 | struct page_frag_cache; |
| 509 | extern void __page_frag_drain(struct page *page, unsigned int order, | ||
| 510 | unsigned int count); | ||
| 509 | extern void *__alloc_page_frag(struct page_frag_cache *nc, | 511 | extern void *__alloc_page_frag(struct page_frag_cache *nc, |
| 510 | unsigned int fragsz, gfp_t gfp_mask); | 512 | unsigned int fragsz, gfp_t gfp_mask); |
| 511 | extern void __free_page_frag(void *addr); | 513 | extern void __free_page_frag(void *addr); |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1f782aa1d8e6..97e478d6b690 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
| @@ -1,12 +1,12 @@ | |||
| 1 | #ifndef _LINUX_HUGE_MM_H | 1 | #ifndef _LINUX_HUGE_MM_H |
| 2 | #define _LINUX_HUGE_MM_H | 2 | #define _LINUX_HUGE_MM_H |
| 3 | 3 | ||
| 4 | extern int do_huge_pmd_anonymous_page(struct fault_env *fe); | 4 | extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); |
| 5 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 5 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
| 6 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 6 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
| 7 | struct vm_area_struct *vma); | 7 | struct vm_area_struct *vma); |
| 8 | extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); | 8 | extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); |
| 9 | extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); | 9 | extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); |
| 10 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | 10 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
| 11 | unsigned long addr, | 11 | unsigned long addr, |
| 12 | pmd_t *pmd, | 12 | pmd_t *pmd, |
| @@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page) | |||
| 142 | return 1; | 142 | return 1; |
| 143 | } | 143 | } |
| 144 | 144 | ||
| 145 | extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); | 145 | extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); |
| 146 | 146 | ||
| 147 | extern struct page *huge_zero_page; | 147 | extern struct page *huge_zero_page; |
| 148 | 148 | ||
| @@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, | |||
| 212 | return NULL; | 212 | return NULL; |
| 213 | } | 213 | } |
| 214 | 214 | ||
| 215 | static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) | 215 | static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) |
| 216 | { | 216 | { |
| 217 | return 0; | 217 | return 0; |
| 218 | } | 218 | } |
diff --git a/include/linux/idr.h b/include/linux/idr.h index 083d61e92706..3c01b89aed67 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h | |||
| @@ -18,12 +18,11 @@ | |||
| 18 | #include <linux/rcupdate.h> | 18 | #include <linux/rcupdate.h> |
| 19 | 19 | ||
| 20 | /* | 20 | /* |
| 21 | * We want shallower trees and thus more bits covered at each layer. 8 | 21 | * Using 6 bits at each layer allows us to allocate 7 layers out of each page. |
| 22 | * bits gives us large enough first layer for most use cases and maximum | 22 | * 8 bits only gave us 3 layers out of every pair of pages, which is less |
| 23 | * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and | 23 | * efficient except for trees with a largest element between 192-255 inclusive. |
| 24 | * 1k on 32bit. | ||
| 25 | */ | 24 | */ |
| 26 | #define IDR_BITS 8 | 25 | #define IDR_BITS 6 |
| 27 | #define IDR_SIZE (1 << IDR_BITS) | 26 | #define IDR_SIZE (1 << IDR_BITS) |
| 28 | #define IDR_MASK ((1 << IDR_BITS)-1) | 27 | #define IDR_MASK ((1 << IDR_BITS)-1) |
| 29 | 28 | ||
| @@ -56,6 +55,32 @@ struct idr { | |||
| 56 | #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) | 55 | #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) |
| 57 | 56 | ||
| 58 | /** | 57 | /** |
| 58 | * idr_get_cursor - Return the current position of the cyclic allocator | ||
| 59 | * @idr: idr handle | ||
| 60 | * | ||
| 61 | * The value returned is the value that will be next returned from | ||
| 62 | * idr_alloc_cyclic() if it is free (otherwise the search will start from | ||
| 63 | * this position). | ||
| 64 | */ | ||
| 65 | static inline unsigned int idr_get_cursor(struct idr *idr) | ||
| 66 | { | ||
| 67 | return READ_ONCE(idr->cur); | ||
| 68 | } | ||
| 69 | |||
| 70 | /** | ||
| 71 | * idr_set_cursor - Set the current position of the cyclic allocator | ||
| 72 | * @idr: idr handle | ||
| 73 | * @val: new position | ||
| 74 | * | ||
| 75 | * The next call to idr_alloc_cyclic() will return @val if it is free | ||
| 76 | * (otherwise the search will start from this position). | ||
| 77 | */ | ||
| 78 | static inline void idr_set_cursor(struct idr *idr, unsigned int val) | ||
| 79 | { | ||
| 80 | WRITE_ONCE(idr->cur, val); | ||
| 81 | } | ||
| 82 | |||
| 83 | /** | ||
| 59 | * DOC: idr sync | 84 | * DOC: idr sync |
| 60 | * idr synchronization (stolen from radix-tree.h) | 85 | * idr synchronization (stolen from radix-tree.h) |
| 61 | * | 86 | * |
| @@ -195,6 +220,11 @@ static inline int ida_get_new(struct ida *ida, int *p_id) | |||
| 195 | return ida_get_new_above(ida, 0, p_id); | 220 | return ida_get_new_above(ida, 0, p_id); |
| 196 | } | 221 | } |
| 197 | 222 | ||
| 223 | static inline bool ida_is_empty(struct ida *ida) | ||
| 224 | { | ||
| 225 | return idr_is_empty(&ida->idr); | ||
| 226 | } | ||
| 227 | |||
| 198 | void __init idr_init_cache(void); | 228 | void __init idr_init_cache(void); |
| 199 | 229 | ||
| 200 | #endif /* __IDR_H__ */ | 230 | #endif /* __IDR_H__ */ |
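idr_get_cursor()/idr_set_cursor() exist so the position of a cyclic allocator can be saved and restored, e.g. across checkpoint/restore. A small usage sketch; the 'my_ids' IDR and the function names are made up:

    #include <linux/idr.h>
    #include <linux/gfp.h>

    static DEFINE_IDR(my_ids);

    /* Hand out IDs in cyclic order so freed IDs are not immediately reused. */
    static int my_alloc_id(void *ptr)
    {
    	return idr_alloc_cyclic(&my_ids, ptr, 0, 0, GFP_KERNEL);
    }

    /* Save the allocator position before a checkpoint ... */
    static unsigned int my_save_cursor(void)
    {
    	return idr_get_cursor(&my_ids);
    }

    /* ... and restore it so allocation resumes where it left off. */
    static void my_restore_cursor(unsigned int pos)
    {
    	idr_set_cursor(&my_ids, pos);
    }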
diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 410decacff8f..68bd88223417 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h | |||
| @@ -77,7 +77,6 @@ extern int kdb_poll_idx; | |||
| 77 | * number whenever the kernel debugger is entered. | 77 | * number whenever the kernel debugger is entered. |
| 78 | */ | 78 | */ |
| 79 | extern int kdb_initial_cpu; | 79 | extern int kdb_initial_cpu; |
| 80 | extern atomic_t kdb_event; | ||
| 81 | 80 | ||
| 82 | /* Types and messages used for dynamically added kdb shell commands */ | 81 | /* Types and messages used for dynamically added kdb shell commands */ |
| 83 | 82 | ||
| @@ -162,6 +161,7 @@ enum kdb_msgsrc { | |||
| 162 | }; | 161 | }; |
| 163 | 162 | ||
| 164 | extern int kdb_trap_printk; | 163 | extern int kdb_trap_printk; |
| 164 | extern int kdb_printf_cpu; | ||
| 165 | extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, | 165 | extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, |
| 166 | va_list args); | 166 | va_list args); |
| 167 | extern __printf(1, 2) int kdb_printf(const char *, ...); | 167 | extern __printf(1, 2) int kdb_printf(const char *, ...); |
diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 406c33dcae13..d7437777baaa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h | |||
| @@ -259,12 +259,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); | |||
| 259 | vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) | 259 | vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) |
| 260 | #define VMCOREINFO_CONFIG(name) \ | 260 | #define VMCOREINFO_CONFIG(name) \ |
| 261 | vmcoreinfo_append_str("CONFIG_%s=y\n", #name) | 261 | vmcoreinfo_append_str("CONFIG_%s=y\n", #name) |
| 262 | #define VMCOREINFO_PAGE_OFFSET(value) \ | ||
| 263 | vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value) | ||
| 264 | #define VMCOREINFO_VMALLOC_START(value) \ | ||
| 265 | vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value) | ||
| 266 | #define VMCOREINFO_VMEMMAP_START(value) \ | ||
| 267 | vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value) | ||
| 268 | 262 | ||
| 269 | extern struct kimage *kexec_image; | 263 | extern struct kimage *kexec_image; |
| 270 | extern struct kimage *kexec_crash_image; | 264 | extern struct kimage *kexec_crash_image; |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 0b5b2e4df14e..4424784ac374 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
| @@ -292,36 +292,23 @@ extern pgprot_t protection_map[16]; | |||
| 292 | * pgoff should be used in favour of virtual_address, if possible. | 292 | * pgoff should be used in favour of virtual_address, if possible. |
| 293 | */ | 293 | */ |
| 294 | struct vm_fault { | 294 | struct vm_fault { |
| 295 | struct vm_area_struct *vma; /* Target VMA */ | ||
| 295 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | 296 | unsigned int flags; /* FAULT_FLAG_xxx flags */ |
| 296 | gfp_t gfp_mask; /* gfp mask to be used for allocations */ | 297 | gfp_t gfp_mask; /* gfp mask to be used for allocations */ |
| 297 | pgoff_t pgoff; /* Logical page offset based on vma */ | 298 | pgoff_t pgoff; /* Logical page offset based on vma */ |
| 298 | void __user *virtual_address; /* Faulting virtual address */ | 299 | unsigned long address; /* Faulting virtual address */ |
| 300 | pmd_t *pmd; /* Pointer to pmd entry matching | ||
| 301 | * the 'address' */ | ||
| 302 | pte_t orig_pte; /* Value of PTE at the time of fault */ | ||
| 299 | 303 | ||
| 300 | struct page *cow_page; /* Handler may choose to COW */ | 304 | struct page *cow_page; /* Page handler may use for COW fault */ |
| 305 | struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ | ||
| 301 | struct page *page; /* ->fault handlers should return a | 306 | struct page *page; /* ->fault handlers should return a |
| 302 | * page here, unless VM_FAULT_NOPAGE | 307 | * page here, unless VM_FAULT_NOPAGE |
| 303 | * is set (which is also implied by | 308 | * is set (which is also implied by |
| 304 | * VM_FAULT_ERROR). | 309 | * VM_FAULT_ERROR). |
| 305 | */ | 310 | */ |
| 306 | void *entry; /* ->fault handler can alternatively | 311 | /* These three entries are valid only while holding ptl lock */ |
| 307 | * return locked DAX entry. In that | ||
| 308 | * case handler should return | ||
| 309 | * VM_FAULT_DAX_LOCKED and fill in | ||
| 310 | * entry here. | ||
| 311 | */ | ||
| 312 | }; | ||
| 313 | |||
| 314 | /* | ||
| 315 | * Page fault context: passes though page fault handler instead of endless list | ||
| 316 | * of function arguments. | ||
| 317 | */ | ||
| 318 | struct fault_env { | ||
| 319 | struct vm_area_struct *vma; /* Target VMA */ | ||
| 320 | unsigned long address; /* Faulting virtual address */ | ||
| 321 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | ||
| 322 | pmd_t *pmd; /* Pointer to pmd entry matching | ||
| 323 | * the 'address' | ||
| 324 | */ | ||
| 325 | pte_t *pte; /* Pointer to pte entry matching | 312 | pte_t *pte; /* Pointer to pte entry matching |
| 326 | * the 'address'. NULL if the page | 313 | * the 'address'. NULL if the page |
| 327 | * table hasn't been allocated. | 314 | * table hasn't been allocated. |
| @@ -351,7 +338,7 @@ struct vm_operations_struct { | |||
| 351 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); | 338 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); |
| 352 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, | 339 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, |
| 353 | pmd_t *, unsigned int flags); | 340 | pmd_t *, unsigned int flags); |
| 354 | void (*map_pages)(struct fault_env *fe, | 341 | void (*map_pages)(struct vm_fault *vmf, |
| 355 | pgoff_t start_pgoff, pgoff_t end_pgoff); | 342 | pgoff_t start_pgoff, pgoff_t end_pgoff); |
| 356 | 343 | ||
| 357 | /* notification that a previously read-only page is about to become | 344 | /* notification that a previously read-only page is about to become |
| @@ -625,8 +612,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
| 625 | return pte; | 612 | return pte; |
| 626 | } | 613 | } |
| 627 | 614 | ||
| 628 | int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | 615 | int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, |
| 629 | struct page *page); | 616 | struct page *page); |
| 617 | int finish_fault(struct vm_fault *vmf); | ||
| 618 | int finish_mkwrite_fault(struct vm_fault *vmf); | ||
| 630 | #endif | 619 | #endif |
| 631 | 620 | ||
| 632 | /* | 621 | /* |
| @@ -1110,7 +1099,7 @@ static inline void clear_page_pfmemalloc(struct page *page) | |||
| 1110 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ | 1099 | #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ |
| 1111 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ | 1100 | #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ |
| 1112 | #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ | 1101 | #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ |
| 1113 | #define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ | 1102 | #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ |
| 1114 | 1103 | ||
| 1115 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ | 1104 | #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ |
| 1116 | 1105 | ||
| @@ -1221,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
| 1221 | struct vm_area_struct *vma); | 1210 | struct vm_area_struct *vma); |
| 1222 | void unmap_mapping_range(struct address_space *mapping, | 1211 | void unmap_mapping_range(struct address_space *mapping, |
| 1223 | loff_t const holebegin, loff_t const holelen, int even_cows); | 1212 | loff_t const holebegin, loff_t const holelen, int even_cows); |
| 1213 | int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, | ||
| 1214 | spinlock_t **ptlp); | ||
| 1224 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | 1215 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, |
| 1225 | unsigned long *pfn); | 1216 | unsigned long *pfn); |
| 1226 | int follow_phys(struct vm_area_struct *vma, unsigned long address, | 1217 | int follow_phys(struct vm_area_struct *vma, unsigned long address, |
| @@ -1276,15 +1267,12 @@ extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1276 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, | 1267 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, |
| 1277 | unsigned long start, unsigned long nr_pages, | 1268 | unsigned long start, unsigned long nr_pages, |
| 1278 | unsigned int gup_flags, struct page **pages, | 1269 | unsigned int gup_flags, struct page **pages, |
| 1279 | struct vm_area_struct **vmas); | 1270 | struct vm_area_struct **vmas, int *locked); |
| 1280 | long get_user_pages(unsigned long start, unsigned long nr_pages, | 1271 | long get_user_pages(unsigned long start, unsigned long nr_pages, |
| 1281 | unsigned int gup_flags, struct page **pages, | 1272 | unsigned int gup_flags, struct page **pages, |
| 1282 | struct vm_area_struct **vmas); | 1273 | struct vm_area_struct **vmas); |
| 1283 | long get_user_pages_locked(unsigned long start, unsigned long nr_pages, | 1274 | long get_user_pages_locked(unsigned long start, unsigned long nr_pages, |
| 1284 | unsigned int gup_flags, struct page **pages, int *locked); | 1275 | unsigned int gup_flags, struct page **pages, int *locked); |
| 1285 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
| 1286 | unsigned long start, unsigned long nr_pages, | ||
| 1287 | struct page **pages, unsigned int gup_flags); | ||
| 1288 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, | 1276 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, |
| 1289 | struct page **pages, unsigned int gup_flags); | 1277 | struct page **pages, unsigned int gup_flags); |
| 1290 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1278 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
| @@ -2099,7 +2087,7 @@ extern void truncate_inode_pages_final(struct address_space *); | |||
| 2099 | 2087 | ||
| 2100 | /* generic vm_area_ops exported for stackable file systems */ | 2088 | /* generic vm_area_ops exported for stackable file systems */ |
| 2101 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); | 2089 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); |
| 2102 | extern void filemap_map_pages(struct fault_env *fe, | 2090 | extern void filemap_map_pages(struct vm_fault *vmf, |
| 2103 | pgoff_t start_pgoff, pgoff_t end_pgoff); | 2091 | pgoff_t start_pgoff, pgoff_t end_pgoff); |
| 2104 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | 2092 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
| 2105 | 2093 | ||
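For drivers, the visible part of this struct vm_fault rework is that the void __user *virtual_address field is gone and vmf->address (an unsigned long, already page-aligned) replaces it. A hedged sketch of a trivial ->fault handler updated accordingly; the mydrv_* names and the vm_private_data layout are invented:

    #include <linux/mm.h>

    struct mydrv_buf {		/* hypothetical per-mapping state */
    	unsigned long base_pfn;
    };

    static int mydrv_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
    	struct mydrv_buf *buf = vma->vm_private_data;
    	unsigned long pfn;

    	/* vmf->address replaces the old (unsigned long)vmf->virtual_address. */
    	pfn = buf->base_pfn + ((vmf->address - vma->vm_start) >> PAGE_SHIFT);

    	if (vm_insert_pfn(vma, vmf->address, pfn))
    		return VM_FAULT_SIGBUS;
    	return VM_FAULT_NOPAGE;
    }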
diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a78c35cff1ae..aacca824a6ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h | |||
| @@ -7,6 +7,23 @@ | |||
| 7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
| 8 | #include <asm/irq.h> | 8 | #include <asm/irq.h> |
| 9 | 9 | ||
| 10 | /* | ||
| 11 | * The run state of the lockup detectors is controlled by the content of the | ||
| 12 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
| 13 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
| 14 | * | ||
| 15 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
| 16 | * are variables that are only used as an 'interface' between the parameters | ||
| 17 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
| 18 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
| 19 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
| 20 | * is equal zero. | ||
| 21 | */ | ||
| 22 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
| 23 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
| 24 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
| 25 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
| 26 | |||
| 10 | /** | 27 | /** |
| 11 | * touch_nmi_watchdog - restart NMI watchdog timeout. | 28 | * touch_nmi_watchdog - restart NMI watchdog timeout. |
| 12 | * | 29 | * |
| @@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled; | |||
| 91 | extern int soft_watchdog_enabled; | 108 | extern int soft_watchdog_enabled; |
| 92 | extern int watchdog_user_enabled; | 109 | extern int watchdog_user_enabled; |
| 93 | extern int watchdog_thresh; | 110 | extern int watchdog_thresh; |
| 111 | extern unsigned long watchdog_enabled; | ||
| 94 | extern unsigned long *watchdog_cpumask_bits; | 112 | extern unsigned long *watchdog_cpumask_bits; |
| 113 | #ifdef CONFIG_SMP | ||
| 95 | extern int sysctl_softlockup_all_cpu_backtrace; | 114 | extern int sysctl_softlockup_all_cpu_backtrace; |
| 96 | extern int sysctl_hardlockup_all_cpu_backtrace; | 115 | extern int sysctl_hardlockup_all_cpu_backtrace; |
| 116 | #else | ||
| 117 | #define sysctl_softlockup_all_cpu_backtrace 0 | ||
| 118 | #define sysctl_hardlockup_all_cpu_backtrace 0 | ||
| 119 | #endif | ||
| 120 | extern bool is_hardlockup(void); | ||
| 97 | struct ctl_table; | 121 | struct ctl_table; |
| 98 | extern int proc_watchdog(struct ctl_table *, int , | 122 | extern int proc_watchdog(struct ctl_table *, int , |
| 99 | void __user *, size_t *, loff_t *); | 123 | void __user *, size_t *, loff_t *); |
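Since watchdog_enabled is now documented (and exported) as a plain bitmask, kernel code can test the run state of each detector directly instead of going through the /proc 'interface' variables. A minimal illustration; the helper names are mine:

    #include <linux/nmi.h>

    /* True if the hard (NMI) lockup detector is currently running. */
    static inline bool hard_lockup_detector_active(void)
    {
    	return watchdog_enabled & NMI_WATCHDOG_ENABLED;
    }

    /* True if the soft lockup detector is currently running. */
    static inline bool soft_lockup_detector_active(void)
    {
    	return watchdog_enabled & SOFT_WATCHDOG_ENABLED;
    }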
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 744486057e9e..5dea8f6440e4 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h | |||
| @@ -80,23 +80,25 @@ static inline bool radix_tree_is_internal_node(void *ptr) | |||
| 80 | #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ | 80 | #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ |
| 81 | RADIX_TREE_MAP_SHIFT)) | 81 | RADIX_TREE_MAP_SHIFT)) |
| 82 | 82 | ||
| 83 | /* | ||
| 84 | * @count is the count of every non-NULL element in the ->slots array | ||
| 85 | * whether that is an exceptional entry, a retry entry, a user pointer, | ||
| 86 | * a sibling entry or a pointer to the next level of the tree. | ||
| 87 | * @exceptional is the count of every element in ->slots which is | ||
| 88 | * either radix_tree_exceptional_entry() or is a sibling entry for an | ||
| 89 | * exceptional entry. | ||
| 90 | */ | ||
| 83 | struct radix_tree_node { | 91 | struct radix_tree_node { |
| 84 | unsigned char shift; /* Bits remaining in each slot */ | 92 | unsigned char shift; /* Bits remaining in each slot */ |
| 85 | unsigned char offset; /* Slot offset in parent */ | 93 | unsigned char offset; /* Slot offset in parent */ |
| 86 | unsigned char count; /* Total entry count */ | 94 | unsigned char count; /* Total entry count */ |
| 87 | unsigned char exceptional; /* Exceptional entry count */ | 95 | unsigned char exceptional; /* Exceptional entry count */ |
| 96 | struct radix_tree_node *parent; /* Used when ascending tree */ | ||
| 97 | void *private_data; /* For tree user */ | ||
| 88 | union { | 98 | union { |
| 89 | struct { | 99 | struct list_head private_list; /* For tree user */ |
| 90 | /* Used when ascending tree */ | 100 | struct rcu_head rcu_head; /* Used when freeing node */ |
| 91 | struct radix_tree_node *parent; | ||
| 92 | /* For tree user */ | ||
| 93 | void *private_data; | ||
| 94 | }; | ||
| 95 | /* Used when freeing node */ | ||
| 96 | struct rcu_head rcu_head; | ||
| 97 | }; | 101 | }; |
| 98 | /* For tree user */ | ||
| 99 | struct list_head private_list; | ||
| 100 | void __rcu *slots[RADIX_TREE_MAP_SIZE]; | 102 | void __rcu *slots[RADIX_TREE_MAP_SIZE]; |
| 101 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; | 103 | unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; |
| 102 | }; | 104 | }; |
| @@ -127,6 +129,41 @@ static inline bool radix_tree_empty(struct radix_tree_root *root) | |||
| 127 | } | 129 | } |
| 128 | 130 | ||
| 129 | /** | 131 | /** |
| 132 | * struct radix_tree_iter - radix tree iterator state | ||
| 133 | * | ||
| 134 | * @index: index of current slot | ||
| 135 | * @next_index: one beyond the last index for this chunk | ||
| 136 | * @tags: bit-mask for tag-iterating | ||
| 137 | * @node: node that contains current slot | ||
| 138 | * @shift: shift for the node that holds our slots | ||
| 139 | * | ||
| 140 | * This radix tree iterator works in terms of "chunks" of slots. A chunk is a | ||
| 141 | * subinterval of slots contained within one radix tree leaf node. It is | ||
| 142 | * described by a pointer to its first slot and a struct radix_tree_iter | ||
| 143 | * which holds the chunk's position in the tree and its size. For tagged | ||
| 144 | * iteration radix_tree_iter also holds the slots' bit-mask for one chosen | ||
| 145 | * radix tree tag. | ||
| 146 | */ | ||
| 147 | struct radix_tree_iter { | ||
| 148 | unsigned long index; | ||
| 149 | unsigned long next_index; | ||
| 150 | unsigned long tags; | ||
| 151 | struct radix_tree_node *node; | ||
| 152 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 153 | unsigned int shift; | ||
| 154 | #endif | ||
| 155 | }; | ||
| 156 | |||
| 157 | static inline unsigned int iter_shift(const struct radix_tree_iter *iter) | ||
| 158 | { | ||
| 159 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 160 | return iter->shift; | ||
| 161 | #else | ||
| 162 | return 0; | ||
| 163 | #endif | ||
| 164 | } | ||
| 165 | |||
| 166 | /** | ||
| 130 | * Radix-tree synchronization | 167 | * Radix-tree synchronization |
| 131 | * | 168 | * |
| 132 | * The radix-tree API requires that users provide all synchronisation (with | 169 | * The radix-tree API requires that users provide all synchronisation (with |
| @@ -264,6 +301,8 @@ void __radix_tree_replace(struct radix_tree_root *root, | |||
| 264 | struct radix_tree_node *node, | 301 | struct radix_tree_node *node, |
| 265 | void **slot, void *item, | 302 | void **slot, void *item, |
| 266 | radix_tree_update_node_t update_node, void *private); | 303 | radix_tree_update_node_t update_node, void *private); |
| 304 | void radix_tree_iter_replace(struct radix_tree_root *, | ||
| 305 | const struct radix_tree_iter *, void **slot, void *item); | ||
| 267 | void radix_tree_replace_slot(struct radix_tree_root *root, | 306 | void radix_tree_replace_slot(struct radix_tree_root *root, |
| 268 | void **slot, void *item); | 307 | void **slot, void *item); |
| 269 | void __radix_tree_delete_node(struct radix_tree_root *root, | 308 | void __radix_tree_delete_node(struct radix_tree_root *root, |
| @@ -289,6 +328,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, | |||
| 289 | unsigned long index, unsigned int tag); | 328 | unsigned long index, unsigned int tag); |
| 290 | int radix_tree_tag_get(struct radix_tree_root *root, | 329 | int radix_tree_tag_get(struct radix_tree_root *root, |
| 291 | unsigned long index, unsigned int tag); | 330 | unsigned long index, unsigned int tag); |
| 331 | void radix_tree_iter_tag_set(struct radix_tree_root *root, | ||
| 332 | const struct radix_tree_iter *iter, unsigned int tag); | ||
| 292 | unsigned int | 333 | unsigned int |
| 293 | radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, | 334 | radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, |
| 294 | unsigned long first_index, unsigned int max_items, | 335 | unsigned long first_index, unsigned int max_items, |
| @@ -297,50 +338,18 @@ unsigned int | |||
| 297 | radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, | 338 | radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, |
| 298 | unsigned long first_index, unsigned int max_items, | 339 | unsigned long first_index, unsigned int max_items, |
| 299 | unsigned int tag); | 340 | unsigned int tag); |
| 300 | unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | ||
| 301 | unsigned long *first_indexp, unsigned long last_index, | ||
| 302 | unsigned long nr_to_tag, | ||
| 303 | unsigned int fromtag, unsigned int totag); | ||
| 304 | int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); | 341 | int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); |
| 305 | unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); | ||
| 306 | 342 | ||
| 307 | static inline void radix_tree_preload_end(void) | 343 | static inline void radix_tree_preload_end(void) |
| 308 | { | 344 | { |
| 309 | preempt_enable(); | 345 | preempt_enable(); |
| 310 | } | 346 | } |
| 311 | 347 | ||
| 312 | /** | 348 | int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t); |
| 313 | * struct radix_tree_iter - radix tree iterator state | 349 | int radix_tree_split(struct radix_tree_root *, unsigned long index, |
| 314 | * | 350 | unsigned new_order); |
| 315 | * @index: index of current slot | 351 | int radix_tree_join(struct radix_tree_root *, unsigned long index, |
| 316 | * @next_index: one beyond the last index for this chunk | 352 | unsigned new_order, void *); |
| 317 | * @tags: bit-mask for tag-iterating | ||
| 318 | * @shift: shift for the node that holds our slots | ||
| 319 | * | ||
| 320 | * This radix tree iterator works in terms of "chunks" of slots. A chunk is a | ||
| 321 | * subinterval of slots contained within one radix tree leaf node. It is | ||
| 322 | * described by a pointer to its first slot and a struct radix_tree_iter | ||
| 323 | * which holds the chunk's position in the tree and its size. For tagged | ||
| 324 | * iteration radix_tree_iter also holds the slots' bit-mask for one chosen | ||
| 325 | * radix tree tag. | ||
| 326 | */ | ||
| 327 | struct radix_tree_iter { | ||
| 328 | unsigned long index; | ||
| 329 | unsigned long next_index; | ||
| 330 | unsigned long tags; | ||
| 331 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 332 | unsigned int shift; | ||
| 333 | #endif | ||
| 334 | }; | ||
| 335 | |||
| 336 | static inline unsigned int iter_shift(struct radix_tree_iter *iter) | ||
| 337 | { | ||
| 338 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 339 | return iter->shift; | ||
| 340 | #else | ||
| 341 | return 0; | ||
| 342 | #endif | ||
| 343 | } | ||
| 344 | 353 | ||
| 345 | #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ | 354 | #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ |
| 346 | #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ | 355 | #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ |
| @@ -409,20 +418,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) | |||
| 409 | } | 418 | } |
| 410 | 419 | ||
| 411 | /** | 420 | /** |
| 412 | * radix_tree_iter_next - resume iterating when the chunk may be invalid | 421 | * radix_tree_iter_resume - resume iterating when the chunk may be invalid |
| 413 | * @iter: iterator state | 422 | * @slot: pointer to current slot |
| 423 | * @iter: iterator state | ||
| 424 | * Returns: New slot pointer | ||
| 414 | * | 425 | * |
| 415 | * If the iterator needs to release then reacquire a lock, the chunk may | 426 | * If the iterator needs to release then reacquire a lock, the chunk may |
| 416 | * have been invalidated by an insertion or deletion. Call this function | 427 | * have been invalidated by an insertion or deletion. Call this function |
| 417 | * to continue the iteration from the next index. | 428 | * before releasing the lock to continue the iteration from the next index. |
| 418 | */ | 429 | */ |
| 419 | static inline __must_check | 430 | void **__must_check radix_tree_iter_resume(void **slot, |
| 420 | void **radix_tree_iter_next(struct radix_tree_iter *iter) | 431 | struct radix_tree_iter *iter); |
| 421 | { | ||
| 422 | iter->next_index = __radix_tree_iter_add(iter, 1); | ||
| 423 | iter->tags = 0; | ||
| 424 | return NULL; | ||
| 425 | } | ||
| 426 | 432 | ||
| 427 | /** | 433 | /** |
| 428 | * radix_tree_chunk_size - get current chunk size | 434 | * radix_tree_chunk_size - get current chunk size |
| @@ -436,10 +442,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) | |||
| 436 | return (iter->next_index - iter->index) >> iter_shift(iter); | 442 | return (iter->next_index - iter->index) >> iter_shift(iter); |
| 437 | } | 443 | } |
| 438 | 444 | ||
| 439 | static inline struct radix_tree_node *entry_to_node(void *ptr) | 445 | #ifdef CONFIG_RADIX_TREE_MULTIORDER |
| 446 | void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, | ||
| 447 | unsigned flags); | ||
| 448 | #else | ||
| 449 | /* Can't happen without sibling entries, but the compiler can't tell that */ | ||
| 450 | static inline void ** __radix_tree_next_slot(void **slot, | ||
| 451 | struct radix_tree_iter *iter, unsigned flags) | ||
| 440 | { | 452 | { |
| 441 | return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); | 453 | return slot; |
| 442 | } | 454 | } |
| 455 | #endif | ||
| 443 | 456 | ||
| 444 | /** | 457 | /** |
| 445 | * radix_tree_next_slot - find next slot in chunk | 458 | * radix_tree_next_slot - find next slot in chunk |
| @@ -453,7 +466,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr) | |||
| 453 | * For tagged lookup it also eats @iter->tags. | 466 | * For tagged lookup it also eats @iter->tags. |
| 454 | * | 467 | * |
| 455 | * There are several cases where 'slot' can be passed in as NULL to this | 468 | * There are several cases where 'slot' can be passed in as NULL to this |
| 456 | * function. These cases result from the use of radix_tree_iter_next() or | 469 | * function. These cases result from the use of radix_tree_iter_resume() or |
| 457 | * radix_tree_iter_retry(). In these cases we don't end up dereferencing | 470 | * radix_tree_iter_retry(). In these cases we don't end up dereferencing |
| 458 | * 'slot' because either: | 471 | * 'slot' because either: |
| 459 | * a) we are doing tagged iteration and iter->tags has been set to 0, or | 472 | * a) we are doing tagged iteration and iter->tags has been set to 0, or |
| @@ -464,51 +477,31 @@ static __always_inline void ** | |||
| 464 | radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) | 477 | radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) |
| 465 | { | 478 | { |
| 466 | if (flags & RADIX_TREE_ITER_TAGGED) { | 479 | if (flags & RADIX_TREE_ITER_TAGGED) { |
| 467 | void *canon = slot; | ||
| 468 | |||
| 469 | iter->tags >>= 1; | 480 | iter->tags >>= 1; |
| 470 | if (unlikely(!iter->tags)) | 481 | if (unlikely(!iter->tags)) |
| 471 | return NULL; | 482 | return NULL; |
| 472 | while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && | ||
| 473 | radix_tree_is_internal_node(slot[1])) { | ||
| 474 | if (entry_to_node(slot[1]) == canon) { | ||
| 475 | iter->tags >>= 1; | ||
| 476 | iter->index = __radix_tree_iter_add(iter, 1); | ||
| 477 | slot++; | ||
| 478 | continue; | ||
| 479 | } | ||
| 480 | iter->next_index = __radix_tree_iter_add(iter, 1); | ||
| 481 | return NULL; | ||
| 482 | } | ||
| 483 | if (likely(iter->tags & 1ul)) { | 483 | if (likely(iter->tags & 1ul)) { |
| 484 | iter->index = __radix_tree_iter_add(iter, 1); | 484 | iter->index = __radix_tree_iter_add(iter, 1); |
| 485 | return slot + 1; | 485 | slot++; |
| 486 | goto found; | ||
| 486 | } | 487 | } |
| 487 | if (!(flags & RADIX_TREE_ITER_CONTIG)) { | 488 | if (!(flags & RADIX_TREE_ITER_CONTIG)) { |
| 488 | unsigned offset = __ffs(iter->tags); | 489 | unsigned offset = __ffs(iter->tags); |
| 489 | 490 | ||
| 490 | iter->tags >>= offset; | 491 | iter->tags >>= offset++; |
| 491 | iter->index = __radix_tree_iter_add(iter, offset + 1); | 492 | iter->index = __radix_tree_iter_add(iter, offset); |
| 492 | return slot + offset + 1; | 493 | slot += offset; |
| 494 | goto found; | ||
| 493 | } | 495 | } |
| 494 | } else { | 496 | } else { |
| 495 | long count = radix_tree_chunk_size(iter); | 497 | long count = radix_tree_chunk_size(iter); |
| 496 | void *canon = slot; | ||
| 497 | 498 | ||
| 498 | while (--count > 0) { | 499 | while (--count > 0) { |
| 499 | slot++; | 500 | slot++; |
| 500 | iter->index = __radix_tree_iter_add(iter, 1); | 501 | iter->index = __radix_tree_iter_add(iter, 1); |
| 501 | 502 | ||
| 502 | if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && | ||
| 503 | radix_tree_is_internal_node(*slot)) { | ||
| 504 | if (entry_to_node(*slot) == canon) | ||
| 505 | continue; | ||
| 506 | iter->next_index = iter->index; | ||
| 507 | break; | ||
| 508 | } | ||
| 509 | |||
| 510 | if (likely(*slot)) | 503 | if (likely(*slot)) |
| 511 | return slot; | 504 | goto found; |
| 512 | if (flags & RADIX_TREE_ITER_CONTIG) { | 505 | if (flags & RADIX_TREE_ITER_CONTIG) { |
| 513 | /* forbid switching to the next chunk */ | 506 | /* forbid switching to the next chunk */ |
| 514 | iter->next_index = 0; | 507 | iter->next_index = 0; |
| @@ -517,6 +510,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) | |||
| 517 | } | 510 | } |
| 518 | } | 511 | } |
| 519 | return NULL; | 512 | return NULL; |
| 513 | |||
| 514 | found: | ||
| 515 | if (unlikely(radix_tree_is_internal_node(*slot))) | ||
| 516 | return __radix_tree_next_slot(slot, iter, flags); | ||
| 517 | return slot; | ||
| 520 | } | 518 | } |
| 521 | 519 | ||
| 522 | /** | 520 | /** |
| @@ -567,6 +565,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) | |||
| 567 | slot || (slot = radix_tree_next_chunk(root, iter, \ | 565 | slot || (slot = radix_tree_next_chunk(root, iter, \ |
| 568 | RADIX_TREE_ITER_TAGGED | tag)) ; \ | 566 | RADIX_TREE_ITER_TAGGED | tag)) ; \ |
| 569 | slot = radix_tree_next_slot(slot, iter, \ | 567 | slot = radix_tree_next_slot(slot, iter, \ |
| 570 | RADIX_TREE_ITER_TAGGED)) | 568 | RADIX_TREE_ITER_TAGGED | tag)) |
| 571 | 569 | ||
| 572 | #endif /* _LINUX_RADIX_TREE_H */ | 570 | #endif /* _LINUX_RADIX_TREE_H */ |
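The renamed radix_tree_iter_resume() must, per the updated kernel-doc, be called before the lock protecting the tree is released, and its return value becomes the new slot. A hedged sketch of the pattern; the tree, the spinlock and process_entry() are placeholders:

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>

    /* Walk every slot, dropping 'lock' around work that may sleep. */
    static void walk_and_process(struct radix_tree_root *root, spinlock_t *lock)
    {
    	struct radix_tree_iter iter;
    	void **slot;

    	spin_lock(lock);
    	radix_tree_for_each_slot(slot, root, &iter, 0) {
    		void *entry = radix_tree_deref_slot(slot);

    		if (!entry)
    			continue;

    		/* Must happen before the lock is dropped, not after. */
    		slot = radix_tree_iter_resume(slot, &iter);
    		spin_unlock(lock);

    		process_entry(entry);	/* hypothetical, may sleep */

    		spin_lock(lock);
    	}
    	spin_unlock(lock);
    }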
diff --git a/include/linux/signal.h b/include/linux/signal.h index b63f63eaa39c..5308304993be 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h | |||
| @@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set) | |||
| 97 | } | 97 | } |
| 98 | } | 98 | } |
| 99 | 99 | ||
| 100 | static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) | ||
| 101 | { | ||
| 102 | switch (_NSIG_WORDS) { | ||
| 103 | case 4: | ||
| 104 | return (set1->sig[3] == set2->sig[3]) && | ||
| 105 | (set1->sig[2] == set2->sig[2]) && | ||
| 106 | (set1->sig[1] == set2->sig[1]) && | ||
| 107 | (set1->sig[0] == set2->sig[0]); | ||
| 108 | case 2: | ||
| 109 | return (set1->sig[1] == set2->sig[1]) && | ||
| 110 | (set1->sig[0] == set2->sig[0]); | ||
| 111 | case 1: | ||
| 112 | return set1->sig[0] == set2->sig[0]; | ||
| 113 | } | ||
| 114 | return 0; | ||
| 115 | } | ||
| 116 | |||
| 100 | #define sigmask(sig) (1UL << ((sig) - 1)) | 117 | #define sigmask(sig) (1UL << ((sig) - 1)) |
| 101 | 118 | ||
| 102 | #ifndef __HAVE_ARCH_SIG_SETOPS | 119 | #ifndef __HAVE_ARCH_SIG_SETOPS |
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index dd66a952e8cd..11b92b047a1e 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h | |||
| @@ -27,7 +27,7 @@ | |||
| 27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) | 27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) |
| 28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) | 28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) |
| 29 | 29 | ||
| 30 | extern int handle_userfault(struct fault_env *fe, unsigned long reason); | 30 | extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); |
| 31 | 31 | ||
| 32 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, | 32 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, |
| 33 | unsigned long src_start, unsigned long len); | 33 | unsigned long src_start, unsigned long len); |
| @@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) | |||
| 55 | #else /* CONFIG_USERFAULTFD */ | 55 | #else /* CONFIG_USERFAULTFD */ |
| 56 | 56 | ||
| 57 | /* mm helpers */ | 57 | /* mm helpers */ |
| 58 | static inline int handle_userfault(struct fault_env *fe, unsigned long reason) | 58 | static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason) |
| 59 | { | 59 | { |
| 60 | return VM_FAULT_SIGBUS; | 60 | return VM_FAULT_SIGBUS; |
| 61 | } | 61 | } |
| @@ -763,7 +763,10 @@ static inline int convert_mode(long *msgtyp, int msgflg) | |||
| 763 | if (*msgtyp == 0) | 763 | if (*msgtyp == 0) |
| 764 | return SEARCH_ANY; | 764 | return SEARCH_ANY; |
| 765 | if (*msgtyp < 0) { | 765 | if (*msgtyp < 0) { |
| 766 | *msgtyp = -*msgtyp; | 766 | if (*msgtyp == LONG_MIN) /* -LONG_MIN is undefined */ |
| 767 | *msgtyp = LONG_MAX; | ||
| 768 | else | ||
| 769 | *msgtyp = -*msgtyp; | ||
| 767 | return SEARCH_LESSEQUAL; | 770 | return SEARCH_LESSEQUAL; |
| 768 | } | 771 | } |
| 769 | if (msgflg & MSG_EXCEPT) | 772 | if (msgflg & MSG_EXCEPT) |
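The clamp above is needed because |LONG_MIN| is not representable as a long: negating it is signed overflow, which is undefined. Mapping it to LONG_MAX preserves the SEARCH_LESSEQUAL meaning ("first message whose type is <= |msgtyp|") for every representable value. From the caller's side the negative-type convention looks like this (userspace sketch, placeholder queue id and message size):

    #include <sys/types.h>
    #include <sys/msg.h>
    #include <limits.h>

    struct my_msg { long mtype; char mtext[64]; };

    /* Receive the first queued message whose type is <= 100. */
    static ssize_t recv_lesseq_100(int msqid, struct my_msg *m)
    {
    	return msgrcv(msqid, m, sizeof(m->mtext), -100, 0);
    }

    /*
     * Passing LONG_MIN as the (negative) type used to make the kernel compute
     * -LONG_MIN; after this fix it simply behaves as "any message type".
     */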
| @@ -11,6 +11,7 @@ | |||
| 11 | * (c) 2001 Red Hat Inc | 11 | * (c) 2001 Red Hat Inc |
| 12 | * Lockless wakeup | 12 | * Lockless wakeup |
| 13 | * (c) 2003 Manfred Spraul <manfred@colorfullife.com> | 13 | * (c) 2003 Manfred Spraul <manfred@colorfullife.com> |
| 14 | * (c) 2016 Davidlohr Bueso <dave@stgolabs.net> | ||
| 14 | * Further wakeup optimizations, documentation | 15 | * Further wakeup optimizations, documentation |
| 15 | * (c) 2010 Manfred Spraul <manfred@colorfullife.com> | 16 | * (c) 2010 Manfred Spraul <manfred@colorfullife.com> |
| 16 | * | 17 | * |
| @@ -53,15 +54,11 @@ | |||
| 53 | * Semaphores are actively given to waiting tasks (necessary for FIFO). | 54 | * Semaphores are actively given to waiting tasks (necessary for FIFO). |
| 54 | * (see update_queue()) | 55 | * (see update_queue()) |
| 55 | * - To improve the scalability, the actual wake-up calls are performed after | 56 | * - To improve the scalability, the actual wake-up calls are performed after |
| 56 | * dropping all locks. (see wake_up_sem_queue_prepare(), | 57 | * dropping all locks. (see wake_up_sem_queue_prepare()) |
| 57 | * wake_up_sem_queue_do()) | ||
| 58 | * - All work is done by the waker, the woken up task does not have to do | 58 | * - All work is done by the waker, the woken up task does not have to do |
| 59 | * anything - not even acquiring a lock or dropping a refcount. | 59 | * anything - not even acquiring a lock or dropping a refcount. |
| 60 | * - A woken up task may not even touch the semaphore array anymore, it may | 60 | * - A woken up task may not even touch the semaphore array anymore, it may |
| 61 | * have been destroyed already by a semctl(RMID). | 61 | * have been destroyed already by a semctl(RMID). |
| 62 | * - The synchronizations between wake-ups due to a timeout/signal and a | ||
| 63 | * wake-up due to a completed semaphore operation is achieved by using an | ||
| 64 | * intermediate state (IN_WAKEUP). | ||
| 65 | * - UNDO values are stored in an array (one per process and per | 62 | * - UNDO values are stored in an array (one per process and per |
| 66 | * semaphore array, lazily allocated). For backwards compatibility, multiple | 63 | * semaphore array, lazily allocated). For backwards compatibility, multiple |
| 67 | * modes for the UNDO variables are supported (per process, per thread) | 64 | * modes for the UNDO variables are supported (per process, per thread) |
| @@ -118,7 +115,8 @@ struct sem_queue { | |||
| 118 | struct sembuf *sops; /* array of pending operations */ | 115 | struct sembuf *sops; /* array of pending operations */ |
| 119 | struct sembuf *blocking; /* the operation that blocked */ | 116 | struct sembuf *blocking; /* the operation that blocked */ |
| 120 | int nsops; /* number of operations */ | 117 | int nsops; /* number of operations */ |
| 121 | int alter; /* does *sops alter the array? */ | 118 | bool alter; /* does *sops alter the array? */ |
| 119 | bool dupsop; /* sops on more than one sem_num */ | ||
| 122 | }; | 120 | }; |
| 123 | 121 | ||
| 124 | /* Each task has a list of undo requests. They are executed automatically | 122 | /* Each task has a list of undo requests. They are executed automatically |
| @@ -416,29 +414,6 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) | |||
| 416 | * | 414 | * |
| 417 | * The caller holds the RCU read lock. | 415 | * The caller holds the RCU read lock. |
| 418 | */ | 416 | */ |
| 419 | static inline struct sem_array *sem_obtain_lock(struct ipc_namespace *ns, | ||
| 420 | int id, struct sembuf *sops, int nsops, int *locknum) | ||
| 421 | { | ||
| 422 | struct kern_ipc_perm *ipcp; | ||
| 423 | struct sem_array *sma; | ||
| 424 | |||
| 425 | ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); | ||
| 426 | if (IS_ERR(ipcp)) | ||
| 427 | return ERR_CAST(ipcp); | ||
| 428 | |||
| 429 | sma = container_of(ipcp, struct sem_array, sem_perm); | ||
| 430 | *locknum = sem_lock(sma, sops, nsops); | ||
| 431 | |||
| 432 | /* ipc_rmid() may have already freed the ID while sem_lock | ||
| 433 | * was spinning: verify that the structure is still valid | ||
| 434 | */ | ||
| 435 | if (ipc_valid_object(ipcp)) | ||
| 436 | return container_of(ipcp, struct sem_array, sem_perm); | ||
| 437 | |||
| 438 | sem_unlock(sma, *locknum); | ||
| 439 | return ERR_PTR(-EINVAL); | ||
| 440 | } | ||
| 441 | |||
| 442 | static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) | 417 | static inline struct sem_array *sem_obtain_object(struct ipc_namespace *ns, int id) |
| 443 | { | 418 | { |
| 444 | struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); | 419 | struct kern_ipc_perm *ipcp = ipc_obtain_object_idr(&sem_ids(ns), id); |
| @@ -471,40 +446,6 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) | |||
| 471 | ipc_rmid(&sem_ids(ns), &s->sem_perm); | 446 | ipc_rmid(&sem_ids(ns), &s->sem_perm); |
| 472 | } | 447 | } |
| 473 | 448 | ||
| 474 | /* | ||
| 475 | * Lockless wakeup algorithm: | ||
| 476 | * Without the check/retry algorithm a lockless wakeup is possible: | ||
| 477 | * - queue.status is initialized to -EINTR before blocking. | ||
| 478 | * - wakeup is performed by | ||
| 479 | * * unlinking the queue entry from the pending list | ||
| 480 | * * setting queue.status to IN_WAKEUP | ||
| 481 | * This is the notification for the blocked thread that a | ||
| 482 | * result value is imminent. | ||
| 483 | * * call wake_up_process | ||
| 484 | * * set queue.status to the final value. | ||
| 485 | * - the previously blocked thread checks queue.status: | ||
| 486 | * * if it's IN_WAKEUP, then it must wait until the value changes | ||
| 487 | * * if it's not -EINTR, then the operation was completed by | ||
| 488 | * update_queue. semtimedop can return queue.status without | ||
| 489 | * performing any operation on the sem array. | ||
| 490 | * * otherwise it must acquire the spinlock and check what's up. | ||
| 491 | * | ||
| 492 | * The two-stage algorithm is necessary to protect against the following | ||
| 493 | * races: | ||
| 494 | * - if queue.status is set after wake_up_process, then the woken up idle | ||
| 495 | * thread could race forward and try (and fail) to acquire sma->lock | ||
| 496 | * before update_queue had a chance to set queue.status | ||
| 497 | * - if queue.status is written before wake_up_process and if the | ||
| 498 | * blocked process is woken up by a signal between writing | ||
| 499 | * queue.status and the wake_up_process, then the woken up | ||
| 500 | * process could return from semtimedop and die by calling | ||
| 501 | * sys_exit before wake_up_process is called. Then wake_up_process | ||
| 502 | * will oops, because the task structure is already invalid. | ||
| 503 | * (yes, this happened on s390 with sysv msg). | ||
| 504 | * | ||
| 505 | */ | ||
| 506 | #define IN_WAKEUP 1 | ||
| 507 | |||
| 508 | /** | 449 | /** |
| 509 | * newary - Create a new semaphore set | 450 | * newary - Create a new semaphore set |
| 510 | * @ns: namespace | 451 | * @ns: namespace |
| @@ -624,15 +565,23 @@ SYSCALL_DEFINE3(semget, key_t, key, int, nsems, int, semflg) | |||
| 624 | } | 565 | } |
| 625 | 566 | ||
| 626 | /** | 567 | /** |
| 627 | * perform_atomic_semop - Perform (if possible) a semaphore operation | 568 | * perform_atomic_semop[_slow] - Attempt to perform semaphore |
| 569 | * operations on a given array. | ||
| 628 | * @sma: semaphore array | 570 | * @sma: semaphore array |
| 629 | * @q: struct sem_queue that describes the operation | 571 | * @q: struct sem_queue that describes the operation |
| 630 | * | 572 | * |
| 573 | * Caller blocking behaviour is as follows, based on the value | ||
| 574 | * indicated by the semaphore operation (sem_op): | ||
| 575 | * | ||
| 576 | * (1) >0 never blocks. | ||
| 577 | * (2) 0 (wait-for-zero operation): semval is non-zero. | ||
| 578 | * (3) <0 attempting to decrement semval to a value smaller than zero. | ||
| 579 | * | ||
| 631 | * Returns 0 if the operation was possible. | 580 | * Returns 0 if the operation was possible. |
| 632 | * Returns 1 if the operation is impossible, the caller must sleep. | 581 | * Returns 1 if the operation is impossible, the caller must sleep. |
| 633 | * Negative values are error codes. | 582 | * Returns <0 for error codes. |
| 634 | */ | 583 | */ |
| 635 | static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) | 584 | static int perform_atomic_semop_slow(struct sem_array *sma, struct sem_queue *q) |
| 636 | { | 585 | { |
| 637 | int result, sem_op, nsops, pid; | 586 | int result, sem_op, nsops, pid; |
| 638 | struct sembuf *sop; | 587 | struct sembuf *sop; |
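The new kernel-doc above spells out when a single semaphore operation blocks: a positive sem_op never blocks, a zero sem_op blocks while semval is non-zero, and a negative sem_op blocks if it would drive semval below zero. A small stand-alone sketch of those rules for one operation, using an invented helper and a plain int in place of the in-kernel struct sem:

    #include <errno.h>
    #include <stdio.h>

    #define SEMVMX  32767   /* same ceiling the kernel enforces */

    /* Returns 0 if the op can proceed, 1 if the caller would have to sleep,
     * and <0 on error, mirroring the convention documented above. Sketch
     * only, not the kernel implementation. */
    static int would_block(int semval, int sem_op)
    {
            if (sem_op == 0)                        /* wait-for-zero */
                    return semval != 0;
            if (sem_op > 0)                         /* increments never block */
                    return semval + sem_op > SEMVMX ? -ERANGE : 0;
            return semval + sem_op < 0;             /* decrement below zero sleeps */
    }

    int main(void)
    {
            /* prints "0 1 0": a zero op on a zero semaphore, a blocked
             * decrement, and an increment that always goes through */
            printf("%d %d %d\n", would_block(0, 0), would_block(3, -5),
                   would_block(1, 2));
            return 0;
    }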
| @@ -703,51 +652,84 @@ undo: | |||
| 703 | return result; | 652 | return result; |
| 704 | } | 653 | } |
| 705 | 654 | ||
| 706 | /** wake_up_sem_queue_prepare(q, error): Prepare wake-up | 655 | static int perform_atomic_semop(struct sem_array *sma, struct sem_queue *q) |
| 707 | * @q: queue entry that must be signaled | ||
| 708 | * @error: Error value for the signal | ||
| 709 | * | ||
| 710 | * Prepare the wake-up of the queue entry q. | ||
| 711 | */ | ||
| 712 | static void wake_up_sem_queue_prepare(struct list_head *pt, | ||
| 713 | struct sem_queue *q, int error) | ||
| 714 | { | 656 | { |
| 715 | if (list_empty(pt)) { | 657 | int result, sem_op, nsops; |
| 716 | /* | 658 | struct sembuf *sop; |
| 717 | * Hold preempt off so that we don't get preempted and have the | 659 | struct sem *curr; |
| 718 | * wakee busy-wait until we're scheduled back on. | 660 | struct sembuf *sops; |
| 719 | */ | 661 | struct sem_undo *un; |
| 720 | preempt_disable(); | 662 | |
| 663 | sops = q->sops; | ||
| 664 | nsops = q->nsops; | ||
| 665 | un = q->undo; | ||
| 666 | |||
| 667 | if (unlikely(q->dupsop)) | ||
| 668 | return perform_atomic_semop_slow(sma, q); | ||
| 669 | |||
| 670 | /* | ||
| 671 | * We scan the semaphore set twice, first to ensure that the entire | ||
| 672 | * operation can succeed, thereby avoiding any pointless writes | ||
| 673 | * to shared memory and having to undo such changes in order to block | ||
| 674 | * until the operations can go through. | ||
| 675 | */ | ||
| 676 | for (sop = sops; sop < sops + nsops; sop++) { | ||
| 677 | curr = sma->sem_base + sop->sem_num; | ||
| 678 | sem_op = sop->sem_op; | ||
| 679 | result = curr->semval; | ||
| 680 | |||
| 681 | if (!sem_op && result) | ||
| 682 | goto would_block; /* wait-for-zero */ | ||
| 683 | |||
| 684 | result += sem_op; | ||
| 685 | if (result < 0) | ||
| 686 | goto would_block; | ||
| 687 | |||
| 688 | if (result > SEMVMX) | ||
| 689 | return -ERANGE; | ||
| 690 | |||
| 691 | if (sop->sem_flg & SEM_UNDO) { | ||
| 692 | int undo = un->semadj[sop->sem_num] - sem_op; | ||
| 693 | |||
| 694 | /* Exceeding the undo range is an error. */ | ||
| 695 | if (undo < (-SEMAEM - 1) || undo > SEMAEM) | ||
| 696 | return -ERANGE; | ||
| 697 | } | ||
| 698 | } | ||
| 699 | |||
| 700 | for (sop = sops; sop < sops + nsops; sop++) { | ||
| 701 | curr = sma->sem_base + sop->sem_num; | ||
| 702 | sem_op = sop->sem_op; | ||
| 703 | result = curr->semval; | ||
| 704 | |||
| 705 | if (sop->sem_flg & SEM_UNDO) { | ||
| 706 | int undo = un->semadj[sop->sem_num] - sem_op; | ||
| 707 | |||
| 708 | un->semadj[sop->sem_num] = undo; | ||
| 709 | } | ||
| 710 | curr->semval += sem_op; | ||
| 711 | curr->sempid = q->pid; | ||
| 721 | } | 712 | } |
| 722 | q->status = IN_WAKEUP; | ||
| 723 | q->pid = error; | ||
| 724 | 713 | ||
| 725 | list_add_tail(&q->list, pt); | 714 | return 0; |
| 715 | |||
| 716 | would_block: | ||
| 717 | q->blocking = sop; | ||
| 718 | return sop->sem_flg & IPC_NOWAIT ? -EAGAIN : 1; | ||
| 726 | } | 719 | } |
| 727 | 720 | ||
| 728 | /** | 721 | static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, |
| 729 | * wake_up_sem_queue_do - do the actual wake-up | 722 | struct wake_q_head *wake_q) |
| 730 | * @pt: list of tasks to be woken up | ||
| 731 | * | ||
| 732 | * Do the actual wake-up. | ||
| 733 | * The function is called without any locks held, thus the semaphore array | ||
| 734 | * could be destroyed already and the tasks can disappear as soon as the | ||
| 735 | * status is set to the actual return code. | ||
| 736 | */ | ||
| 737 | static void wake_up_sem_queue_do(struct list_head *pt) | ||
| 738 | { | 723 | { |
| 739 | struct sem_queue *q, *t; | 724 | wake_q_add(wake_q, q->sleeper); |
| 740 | int did_something; | 725 | /* |
| 741 | 726 | * Rely on the above implicit barrier, such that we can | |
| 742 | did_something = !list_empty(pt); | 727 | * ensure that we hold a reference to the task before setting |
| 743 | list_for_each_entry_safe(q, t, pt, list) { | 728 | * q->status. Otherwise we could race with do_exit if the |
| 744 | wake_up_process(q->sleeper); | 729 | * task is awoken by an external event before calling |
| 745 | /* q can disappear immediately after writing q->status. */ | 730 | * wake_up_process(). |
| 746 | smp_wmb(); | 731 | */ |
| 747 | q->status = q->pid; | 732 | WRITE_ONCE(q->status, error); |
| 748 | } | ||
| 749 | if (did_something) | ||
| 750 | preempt_enable(); | ||
| 751 | } | 733 | } |
| 752 | 734 | ||
| 753 | static void unlink_queue(struct sem_array *sma, struct sem_queue *q) | 735 | static void unlink_queue(struct sem_array *sma, struct sem_queue *q) |
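The rewritten fast path above scans the operation array twice: the first pass only proves that every step can succeed, the second pass applies the changes, so nothing ever has to be rolled back. A generic sketch of that validate-then-apply pattern on a plain array, with invented names and without the SEM_UNDO bookkeeping:

    #include <stdbool.h>
    #include <stddef.h>

    /* Apply all deltas or none: pass one validates, pass two mutates. */
    static bool apply_all_or_nothing(int *vals, size_t n,
                                     const int *deltas, int max)
    {
            size_t i;

            for (i = 0; i < n; i++) {
                    int next = vals[i] + deltas[i];

                    if (next < 0 || next > max)
                            return false;   /* would block or overflow: touch nothing */
            }
            for (i = 0; i < n; i++)
                    vals[i] += deltas[i];
            return true;
    }

With duplicate sem_nums the first pass would validate each op against a stale intermediate value, which is presumably why operations flagged as dupsop are routed back to perform_atomic_semop_slow() and its single-pass-with-undo structure.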
| @@ -767,7 +749,7 @@ static void unlink_queue(struct sem_array *sma, struct sem_queue *q) | |||
| 767 | * modified the array. | 749 | * modified the array. |
| 768 | * Note that wait-for-zero operations are handled without restart. | 750 | * Note that wait-for-zero operations are handled without restart. |
| 769 | */ | 751 | */ |
| 770 | static int check_restart(struct sem_array *sma, struct sem_queue *q) | 752 | static inline int check_restart(struct sem_array *sma, struct sem_queue *q) |
| 771 | { | 753 | { |
| 772 | /* pending complex alter operations are too difficult to analyse */ | 754 | /* pending complex alter operations are too difficult to analyse */ |
| 773 | if (!list_empty(&sma->pending_alter)) | 755 | if (!list_empty(&sma->pending_alter)) |
| @@ -795,21 +777,20 @@ static int check_restart(struct sem_array *sma, struct sem_queue *q) | |||
| 795 | * wake_const_ops - wake up non-alter tasks | 777 | * wake_const_ops - wake up non-alter tasks |
| 796 | * @sma: semaphore array. | 778 | * @sma: semaphore array. |
| 797 | * @semnum: semaphore that was modified. | 779 | * @semnum: semaphore that was modified. |
| 798 | * @pt: list head for the tasks that must be woken up. | 780 | * @wake_q: lockless wake-queue head. |
| 799 | * | 781 | * |
| 800 | * wake_const_ops must be called after a semaphore in a semaphore array | 782 | * wake_const_ops must be called after a semaphore in a semaphore array |
| 801 | * was set to 0. If complex const operations are pending, wake_const_ops must | 783 | * was set to 0. If complex const operations are pending, wake_const_ops must |
| 802 | * be called with semnum = -1, as well as with the number of each modified | 784 | * be called with semnum = -1, as well as with the number of each modified |
| 803 | * semaphore. | 785 | * semaphore. |
| 804 | * The tasks that must be woken up are added to @pt. The return code | 786 | * The tasks that must be woken up are added to @wake_q. The return code |
| 805 | * is stored in q->pid. | 787 | * is stored in q->pid. |
| 806 | * The function returns 1 if at least one operation was completed successfully. | 788 | * The function returns 1 if at least one operation was completed successfully. |
| 807 | */ | 789 | */ |
| 808 | static int wake_const_ops(struct sem_array *sma, int semnum, | 790 | static int wake_const_ops(struct sem_array *sma, int semnum, |
| 809 | struct list_head *pt) | 791 | struct wake_q_head *wake_q) |
| 810 | { | 792 | { |
| 811 | struct sem_queue *q; | 793 | struct sem_queue *q, *tmp; |
| 812 | struct list_head *walk; | ||
| 813 | struct list_head *pending_list; | 794 | struct list_head *pending_list; |
| 814 | int semop_completed = 0; | 795 | int semop_completed = 0; |
| 815 | 796 | ||
| @@ -818,25 +799,19 @@ static int wake_const_ops(struct sem_array *sma, int semnum, | |||
| 818 | else | 799 | else |
| 819 | pending_list = &sma->sem_base[semnum].pending_const; | 800 | pending_list = &sma->sem_base[semnum].pending_const; |
| 820 | 801 | ||
| 821 | walk = pending_list->next; | 802 | list_for_each_entry_safe(q, tmp, pending_list, list) { |
| 822 | while (walk != pending_list) { | 803 | int error = perform_atomic_semop(sma, q); |
| 823 | int error; | ||
| 824 | |||
| 825 | q = container_of(walk, struct sem_queue, list); | ||
| 826 | walk = walk->next; | ||
| 827 | |||
| 828 | error = perform_atomic_semop(sma, q); | ||
| 829 | |||
| 830 | if (error <= 0) { | ||
| 831 | /* operation completed, remove from queue & wakeup */ | ||
| 832 | 804 | ||
| 833 | unlink_queue(sma, q); | 805 | if (error > 0) |
| 806 | continue; | ||
| 807 | /* operation completed, remove from queue & wakeup */ | ||
| 808 | unlink_queue(sma, q); | ||
| 834 | 809 | ||
| 835 | wake_up_sem_queue_prepare(pt, q, error); | 810 | wake_up_sem_queue_prepare(q, error, wake_q); |
| 836 | if (error == 0) | 811 | if (error == 0) |
| 837 | semop_completed = 1; | 812 | semop_completed = 1; |
| 838 | } | ||
| 839 | } | 813 | } |
| 814 | |||
| 840 | return semop_completed; | 815 | return semop_completed; |
| 841 | } | 816 | } |
| 842 | 817 | ||
| @@ -845,14 +820,14 @@ static int wake_const_ops(struct sem_array *sma, int semnum, | |||
| 845 | * @sma: semaphore array | 820 | * @sma: semaphore array |
| 846 | * @sops: operations that were performed | 821 | * @sops: operations that were performed |
| 847 | * @nsops: number of operations | 822 | * @nsops: number of operations |
| 848 | * @pt: list head of the tasks that must be woken up. | 823 | * @wake_q: lockless wake-queue head |
| 849 | * | 824 | * |
| 850 | * Checks all required queues for wait-for-zero operations, based | 825 | * Checks all required queues for wait-for-zero operations, based |
| 851 | * on the actual changes that were performed on the semaphore array. | 826 | * on the actual changes that were performed on the semaphore array. |
| 852 | * The function returns 1 if at least one operation was completed successfully. | 827 | * The function returns 1 if at least one operation was completed successfully. |
| 853 | */ | 828 | */ |
| 854 | static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | 829 | static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, |
| 855 | int nsops, struct list_head *pt) | 830 | int nsops, struct wake_q_head *wake_q) |
| 856 | { | 831 | { |
| 857 | int i; | 832 | int i; |
| 858 | int semop_completed = 0; | 833 | int semop_completed = 0; |
| @@ -865,7 +840,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 865 | 840 | ||
| 866 | if (sma->sem_base[num].semval == 0) { | 841 | if (sma->sem_base[num].semval == 0) { |
| 867 | got_zero = 1; | 842 | got_zero = 1; |
| 868 | semop_completed |= wake_const_ops(sma, num, pt); | 843 | semop_completed |= wake_const_ops(sma, num, wake_q); |
| 869 | } | 844 | } |
| 870 | } | 845 | } |
| 871 | } else { | 846 | } else { |
| @@ -876,7 +851,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 876 | for (i = 0; i < sma->sem_nsems; i++) { | 851 | for (i = 0; i < sma->sem_nsems; i++) { |
| 877 | if (sma->sem_base[i].semval == 0) { | 852 | if (sma->sem_base[i].semval == 0) { |
| 878 | got_zero = 1; | 853 | got_zero = 1; |
| 879 | semop_completed |= wake_const_ops(sma, i, pt); | 854 | semop_completed |= wake_const_ops(sma, i, wake_q); |
| 880 | } | 855 | } |
| 881 | } | 856 | } |
| 882 | } | 857 | } |
| @@ -885,7 +860,7 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 885 | * then check the global queue, too. | 860 | * then check the global queue, too. |
| 886 | */ | 861 | */ |
| 887 | if (got_zero) | 862 | if (got_zero) |
| 888 | semop_completed |= wake_const_ops(sma, -1, pt); | 863 | semop_completed |= wake_const_ops(sma, -1, wake_q); |
| 889 | 864 | ||
| 890 | return semop_completed; | 865 | return semop_completed; |
| 891 | } | 866 | } |
| @@ -895,22 +870,21 @@ static int do_smart_wakeup_zero(struct sem_array *sma, struct sembuf *sops, | |||
| 895 | * update_queue - look for tasks that can be completed. | 870 | * update_queue - look for tasks that can be completed. |
| 896 | * @sma: semaphore array. | 871 | * @sma: semaphore array. |
| 897 | * @semnum: semaphore that was modified. | 872 | * @semnum: semaphore that was modified. |
| 898 | * @pt: list head for the tasks that must be woken up. | 873 | * @wake_q: lockless wake-queue head. |
| 899 | * | 874 | * |
| 900 | * update_queue must be called after a semaphore in a semaphore array | 875 | * update_queue must be called after a semaphore in a semaphore array |
| 901 | * was modified. If multiple semaphores were modified, update_queue must | 876 | * was modified. If multiple semaphores were modified, update_queue must |
| 902 | * be called with semnum = -1, as well as with the number of each modified | 877 | * be called with semnum = -1, as well as with the number of each modified |
| 903 | * semaphore. | 878 | * semaphore. |
| 904 | * The tasks that must be woken up are added to @pt. The return code | 879 | * The tasks that must be woken up are added to @wake_q. The return code |
| 905 | * is stored in q->pid. | 880 | * is stored in q->pid. |
| 906 | * The function internally checks if const operations can now succeed. | 881 | * The function internally checks if const operations can now succeed. |
| 907 | * | 882 | * |
| 908 | * The function returns 1 if at least one semop was completed successfully. | 883 | * The function returns 1 if at least one semop was completed successfully. |
| 909 | */ | 884 | */ |
| 910 | static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) | 885 | static int update_queue(struct sem_array *sma, int semnum, struct wake_q_head *wake_q) |
| 911 | { | 886 | { |
| 912 | struct sem_queue *q; | 887 | struct sem_queue *q, *tmp; |
| 913 | struct list_head *walk; | ||
| 914 | struct list_head *pending_list; | 888 | struct list_head *pending_list; |
| 915 | int semop_completed = 0; | 889 | int semop_completed = 0; |
| 916 | 890 | ||
| @@ -920,13 +894,9 @@ static int update_queue(struct sem_array *sma, int semnum, struct list_head *pt) | |||
| 920 | pending_list = &sma->sem_base[semnum].pending_alter; | 894 | pending_list = &sma->sem_base[semnum].pending_alter; |
| 921 | 895 | ||
| 922 | again: | 896 | again: |
| 923 | walk = pending_list->next; | 897 | list_for_each_entry_safe(q, tmp, pending_list, list) { |
| 924 | while (walk != pending_list) { | ||
| 925 | int error, restart; | 898 | int error, restart; |
| 926 | 899 | ||
| 927 | q = container_of(walk, struct sem_queue, list); | ||
| 928 | walk = walk->next; | ||
| 929 | |||
| 930 | /* If we are scanning the single sop, per-semaphore list of | 900 | /* If we are scanning the single sop, per-semaphore list of |
| 931 | * one semaphore and that semaphore is 0, then it is not | 901 | * one semaphore and that semaphore is 0, then it is not |
| 932 | * necessary to scan further: simple increments | 902 | * necessary to scan further: simple increments |
| @@ -949,11 +919,11 @@ again: | |||
| 949 | restart = 0; | 919 | restart = 0; |
| 950 | } else { | 920 | } else { |
| 951 | semop_completed = 1; | 921 | semop_completed = 1; |
| 952 | do_smart_wakeup_zero(sma, q->sops, q->nsops, pt); | 922 | do_smart_wakeup_zero(sma, q->sops, q->nsops, wake_q); |
| 953 | restart = check_restart(sma, q); | 923 | restart = check_restart(sma, q); |
| 954 | } | 924 | } |
| 955 | 925 | ||
| 956 | wake_up_sem_queue_prepare(pt, q, error); | 926 | wake_up_sem_queue_prepare(q, error, wake_q); |
| 957 | if (restart) | 927 | if (restart) |
| 958 | goto again; | 928 | goto again; |
| 959 | } | 929 | } |
| @@ -984,24 +954,24 @@ static void set_semotime(struct sem_array *sma, struct sembuf *sops) | |||
| 984 | * @sops: operations that were performed | 954 | * @sops: operations that were performed |
| 985 | * @nsops: number of operations | 955 | * @nsops: number of operations |
| 986 | * @otime: force setting otime | 956 | * @otime: force setting otime |
| 987 | * @pt: list head of the tasks that must be woken up. | 957 | * @wake_q: lockless wake-queue head |
| 988 | * | 958 | * |
| 989 | * do_smart_update() does the required calls to update_queue and wakeup_zero, | 959 | * do_smart_update() does the required calls to update_queue and wakeup_zero, |
| 990 | * based on the actual changes that were performed on the semaphore array. | 960 | * based on the actual changes that were performed on the semaphore array. |
| 991 | * Note that the function does not do the actual wake-up: the caller is | 961 | * Note that the function does not do the actual wake-up: the caller is |
| 992 | * responsible for calling wake_up_sem_queue_do(@pt). | 962 | * responsible for calling wake_up_q(). |
| 993 | * It is safe to perform this call after dropping all locks. | 963 | * It is safe to perform this call after dropping all locks. |
| 994 | */ | 964 | */ |
| 995 | static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, | 965 | static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsops, |
| 996 | int otime, struct list_head *pt) | 966 | int otime, struct wake_q_head *wake_q) |
| 997 | { | 967 | { |
| 998 | int i; | 968 | int i; |
| 999 | 969 | ||
| 1000 | otime |= do_smart_wakeup_zero(sma, sops, nsops, pt); | 970 | otime |= do_smart_wakeup_zero(sma, sops, nsops, wake_q); |
| 1001 | 971 | ||
| 1002 | if (!list_empty(&sma->pending_alter)) { | 972 | if (!list_empty(&sma->pending_alter)) { |
| 1003 | /* semaphore array uses the global queue - just process it. */ | 973 | /* semaphore array uses the global queue - just process it. */ |
| 1004 | otime |= update_queue(sma, -1, pt); | 974 | otime |= update_queue(sma, -1, wake_q); |
| 1005 | } else { | 975 | } else { |
| 1006 | if (!sops) { | 976 | if (!sops) { |
| 1007 | /* | 977 | /* |
| @@ -1009,7 +979,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop | |||
| 1009 | * known. Check all. | 979 | * known. Check all. |
| 1010 | */ | 980 | */ |
| 1011 | for (i = 0; i < sma->sem_nsems; i++) | 981 | for (i = 0; i < sma->sem_nsems; i++) |
| 1012 | otime |= update_queue(sma, i, pt); | 982 | otime |= update_queue(sma, i, wake_q); |
| 1013 | } else { | 983 | } else { |
| 1014 | /* | 984 | /* |
| 1015 | * Check the semaphores that were increased: | 985 | * Check the semaphores that were increased: |
| @@ -1023,7 +993,7 @@ static void do_smart_update(struct sem_array *sma, struct sembuf *sops, int nsop | |||
| 1023 | for (i = 0; i < nsops; i++) { | 993 | for (i = 0; i < nsops; i++) { |
| 1024 | if (sops[i].sem_op > 0) { | 994 | if (sops[i].sem_op > 0) { |
| 1025 | otime |= update_queue(sma, | 995 | otime |= update_queue(sma, |
| 1026 | sops[i].sem_num, pt); | 996 | sops[i].sem_num, wake_q); |
| 1027 | } | 997 | } |
| 1028 | } | 998 | } |
| 1029 | } | 999 | } |
| @@ -1111,8 +1081,8 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
| 1111 | struct sem_undo *un, *tu; | 1081 | struct sem_undo *un, *tu; |
| 1112 | struct sem_queue *q, *tq; | 1082 | struct sem_queue *q, *tq; |
| 1113 | struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); | 1083 | struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm); |
| 1114 | struct list_head tasks; | ||
| 1115 | int i; | 1084 | int i; |
| 1085 | DEFINE_WAKE_Q(wake_q); | ||
| 1116 | 1086 | ||
| 1117 | /* Free the existing undo structures for this semaphore set. */ | 1087 | /* Free the existing undo structures for this semaphore set. */ |
| 1118 | ipc_assert_locked_object(&sma->sem_perm); | 1088 | ipc_assert_locked_object(&sma->sem_perm); |
| @@ -1126,25 +1096,24 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
| 1126 | } | 1096 | } |
| 1127 | 1097 | ||
| 1128 | /* Wake up all pending processes and let them fail with EIDRM. */ | 1098 | /* Wake up all pending processes and let them fail with EIDRM. */ |
| 1129 | INIT_LIST_HEAD(&tasks); | ||
| 1130 | list_for_each_entry_safe(q, tq, &sma->pending_const, list) { | 1099 | list_for_each_entry_safe(q, tq, &sma->pending_const, list) { |
| 1131 | unlink_queue(sma, q); | 1100 | unlink_queue(sma, q); |
| 1132 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1101 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1133 | } | 1102 | } |
| 1134 | 1103 | ||
| 1135 | list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { | 1104 | list_for_each_entry_safe(q, tq, &sma->pending_alter, list) { |
| 1136 | unlink_queue(sma, q); | 1105 | unlink_queue(sma, q); |
| 1137 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1106 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1138 | } | 1107 | } |
| 1139 | for (i = 0; i < sma->sem_nsems; i++) { | 1108 | for (i = 0; i < sma->sem_nsems; i++) { |
| 1140 | struct sem *sem = sma->sem_base + i; | 1109 | struct sem *sem = sma->sem_base + i; |
| 1141 | list_for_each_entry_safe(q, tq, &sem->pending_const, list) { | 1110 | list_for_each_entry_safe(q, tq, &sem->pending_const, list) { |
| 1142 | unlink_queue(sma, q); | 1111 | unlink_queue(sma, q); |
| 1143 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1112 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1144 | } | 1113 | } |
| 1145 | list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { | 1114 | list_for_each_entry_safe(q, tq, &sem->pending_alter, list) { |
| 1146 | unlink_queue(sma, q); | 1115 | unlink_queue(sma, q); |
| 1147 | wake_up_sem_queue_prepare(&tasks, q, -EIDRM); | 1116 | wake_up_sem_queue_prepare(q, -EIDRM, &wake_q); |
| 1148 | } | 1117 | } |
| 1149 | } | 1118 | } |
| 1150 | 1119 | ||
| @@ -1153,7 +1122,7 @@ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | |||
| 1153 | sem_unlock(sma, -1); | 1122 | sem_unlock(sma, -1); |
| 1154 | rcu_read_unlock(); | 1123 | rcu_read_unlock(); |
| 1155 | 1124 | ||
| 1156 | wake_up_sem_queue_do(&tasks); | 1125 | wake_up_q(&wake_q); |
| 1157 | ns->used_sems -= sma->sem_nsems; | 1126 | ns->used_sems -= sma->sem_nsems; |
| 1158 | ipc_rcu_putref(sma, sem_rcu_free); | 1127 | ipc_rcu_putref(sma, sem_rcu_free); |
| 1159 | } | 1128 | } |
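freeary() and the other call sites now collect wakeups into an on-stack wake_q while the array lock is held and only deliver them with wake_up_q() after sem_unlock() and rcu_read_unlock(). A very loose user-space analogue of that "decide under the lock, wake outside it" shape, with invented names and pthread condition variables standing in for the scheduler:

    #include <pthread.h>
    #include <stddef.h>

    struct waiter {
            struct waiter *next;
            pthread_cond_t cond;
            int status;             /* plays the role of q->status */
    };

    struct wake_list {
            struct waiter *head;
    };

    /* Called with the data-structure lock held: record the result and queue
     * the waiter, but do not signal yet (compare wake_up_sem_queue_prepare). */
    static void wake_list_add(struct wake_list *wl, struct waiter *w, int status)
    {
            w->status = status;
            w->next = wl->head;
            wl->head = w;
    }

    /* Called after every lock has been dropped (compare wake_up_q). */
    static void wake_list_fire(struct wake_list *wl)
    {
            struct waiter *w;

            for (w = wl->head; w; w = w->next)
                    pthread_cond_signal(&w->cond);
    }

The kernel version additionally takes a task reference inside wake_q_add(), which is what lets the woken task vanish the moment q->status is written; this sketch does not model that lifetime detail.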
| @@ -1292,9 +1261,9 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1292 | struct sem_undo *un; | 1261 | struct sem_undo *un; |
| 1293 | struct sem_array *sma; | 1262 | struct sem_array *sma; |
| 1294 | struct sem *curr; | 1263 | struct sem *curr; |
| 1295 | int err; | 1264 | int err, val; |
| 1296 | struct list_head tasks; | 1265 | DEFINE_WAKE_Q(wake_q); |
| 1297 | int val; | 1266 | |
| 1298 | #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) | 1267 | #if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) |
| 1299 | /* big-endian 64bit */ | 1268 | /* big-endian 64bit */ |
| 1300 | val = arg >> 32; | 1269 | val = arg >> 32; |
| @@ -1306,8 +1275,6 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1306 | if (val > SEMVMX || val < 0) | 1275 | if (val > SEMVMX || val < 0) |
| 1307 | return -ERANGE; | 1276 | return -ERANGE; |
| 1308 | 1277 | ||
| 1309 | INIT_LIST_HEAD(&tasks); | ||
| 1310 | |||
| 1311 | rcu_read_lock(); | 1278 | rcu_read_lock(); |
| 1312 | sma = sem_obtain_object_check(ns, semid); | 1279 | sma = sem_obtain_object_check(ns, semid); |
| 1313 | if (IS_ERR(sma)) { | 1280 | if (IS_ERR(sma)) { |
| @@ -1350,10 +1317,10 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1350 | curr->sempid = task_tgid_vnr(current); | 1317 | curr->sempid = task_tgid_vnr(current); |
| 1351 | sma->sem_ctime = get_seconds(); | 1318 | sma->sem_ctime = get_seconds(); |
| 1352 | /* maybe some queued-up processes were waiting for this */ | 1319 | /* maybe some queued-up processes were waiting for this */ |
| 1353 | do_smart_update(sma, NULL, 0, 0, &tasks); | 1320 | do_smart_update(sma, NULL, 0, 0, &wake_q); |
| 1354 | sem_unlock(sma, -1); | 1321 | sem_unlock(sma, -1); |
| 1355 | rcu_read_unlock(); | 1322 | rcu_read_unlock(); |
| 1356 | wake_up_sem_queue_do(&tasks); | 1323 | wake_up_q(&wake_q); |
| 1357 | return 0; | 1324 | return 0; |
| 1358 | } | 1325 | } |
| 1359 | 1326 | ||
| @@ -1365,9 +1332,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1365 | int err, nsems; | 1332 | int err, nsems; |
| 1366 | ushort fast_sem_io[SEMMSL_FAST]; | 1333 | ushort fast_sem_io[SEMMSL_FAST]; |
| 1367 | ushort *sem_io = fast_sem_io; | 1334 | ushort *sem_io = fast_sem_io; |
| 1368 | struct list_head tasks; | 1335 | DEFINE_WAKE_Q(wake_q); |
| 1369 | |||
| 1370 | INIT_LIST_HEAD(&tasks); | ||
| 1371 | 1336 | ||
| 1372 | rcu_read_lock(); | 1337 | rcu_read_lock(); |
| 1373 | sma = sem_obtain_object_check(ns, semid); | 1338 | sma = sem_obtain_object_check(ns, semid); |
| @@ -1478,7 +1443,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1478 | } | 1443 | } |
| 1479 | sma->sem_ctime = get_seconds(); | 1444 | sma->sem_ctime = get_seconds(); |
| 1480 | /* maybe some queued-up processes were waiting for this */ | 1445 | /* maybe some queued-up processes were waiting for this */ |
| 1481 | do_smart_update(sma, NULL, 0, 0, &tasks); | 1446 | do_smart_update(sma, NULL, 0, 0, &wake_q); |
| 1482 | err = 0; | 1447 | err = 0; |
| 1483 | goto out_unlock; | 1448 | goto out_unlock; |
| 1484 | } | 1449 | } |
| @@ -1514,7 +1479,7 @@ out_unlock: | |||
| 1514 | sem_unlock(sma, -1); | 1479 | sem_unlock(sma, -1); |
| 1515 | out_rcu_wakeup: | 1480 | out_rcu_wakeup: |
| 1516 | rcu_read_unlock(); | 1481 | rcu_read_unlock(); |
| 1517 | wake_up_sem_queue_do(&tasks); | 1482 | wake_up_q(&wake_q); |
| 1518 | out_free: | 1483 | out_free: |
| 1519 | if (sem_io != fast_sem_io) | 1484 | if (sem_io != fast_sem_io) |
| 1520 | ipc_free(sem_io); | 1485 | ipc_free(sem_io); |
| @@ -1787,32 +1752,6 @@ out: | |||
| 1787 | return un; | 1752 | return un; |
| 1788 | } | 1753 | } |
| 1789 | 1754 | ||
| 1790 | |||
| 1791 | /** | ||
| 1792 | * get_queue_result - retrieve the result code from sem_queue | ||
| 1793 | * @q: Pointer to queue structure | ||
| 1794 | * | ||
| 1795 | * Retrieve the return code from the pending queue. If IN_WAKEUP is found in | ||
| 1796 | * q->status, then we must loop until the value is replaced with the final | ||
| 1797 | * value: This may happen if a task is woken up by an unrelated event (e.g. | ||
| 1798 | * signal) and in parallel the task is woken up by another task because it got | ||
| 1799 | * the requested semaphores. | ||
| 1800 | * | ||
| 1801 | * The function can be called with or without holding the semaphore spinlock. | ||
| 1802 | */ | ||
| 1803 | static int get_queue_result(struct sem_queue *q) | ||
| 1804 | { | ||
| 1805 | int error; | ||
| 1806 | |||
| 1807 | error = q->status; | ||
| 1808 | while (unlikely(error == IN_WAKEUP)) { | ||
| 1809 | cpu_relax(); | ||
| 1810 | error = q->status; | ||
| 1811 | } | ||
| 1812 | |||
| 1813 | return error; | ||
| 1814 | } | ||
| 1815 | |||
| 1816 | SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | 1755 | SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, |
| 1817 | unsigned, nsops, const struct timespec __user *, timeout) | 1756 | unsigned, nsops, const struct timespec __user *, timeout) |
| 1818 | { | 1757 | { |
| @@ -1821,11 +1760,11 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1821 | struct sembuf fast_sops[SEMOPM_FAST]; | 1760 | struct sembuf fast_sops[SEMOPM_FAST]; |
| 1822 | struct sembuf *sops = fast_sops, *sop; | 1761 | struct sembuf *sops = fast_sops, *sop; |
| 1823 | struct sem_undo *un; | 1762 | struct sem_undo *un; |
| 1824 | int undos = 0, alter = 0, max, locknum; | 1763 | int max, locknum; |
| 1764 | bool undos = false, alter = false, dupsop = false; | ||
| 1825 | struct sem_queue queue; | 1765 | struct sem_queue queue; |
| 1826 | unsigned long jiffies_left = 0; | 1766 | unsigned long dup = 0, jiffies_left = 0; |
| 1827 | struct ipc_namespace *ns; | 1767 | struct ipc_namespace *ns; |
| 1828 | struct list_head tasks; | ||
| 1829 | 1768 | ||
| 1830 | ns = current->nsproxy->ipc_ns; | 1769 | ns = current->nsproxy->ipc_ns; |
| 1831 | 1770 | ||
| @@ -1838,10 +1777,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1838 | if (sops == NULL) | 1777 | if (sops == NULL) |
| 1839 | return -ENOMEM; | 1778 | return -ENOMEM; |
| 1840 | } | 1779 | } |
| 1780 | |||
| 1841 | if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { | 1781 | if (copy_from_user(sops, tsops, nsops * sizeof(*tsops))) { |
| 1842 | error = -EFAULT; | 1782 | error = -EFAULT; |
| 1843 | goto out_free; | 1783 | goto out_free; |
| 1844 | } | 1784 | } |
| 1785 | |||
| 1845 | if (timeout) { | 1786 | if (timeout) { |
| 1846 | struct timespec _timeout; | 1787 | struct timespec _timeout; |
| 1847 | if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { | 1788 | if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { |
| @@ -1855,18 +1796,30 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1855 | } | 1796 | } |
| 1856 | jiffies_left = timespec_to_jiffies(&_timeout); | 1797 | jiffies_left = timespec_to_jiffies(&_timeout); |
| 1857 | } | 1798 | } |
| 1799 | |||
| 1858 | max = 0; | 1800 | max = 0; |
| 1859 | for (sop = sops; sop < sops + nsops; sop++) { | 1801 | for (sop = sops; sop < sops + nsops; sop++) { |
| 1802 | unsigned long mask = 1ULL << ((sop->sem_num) % BITS_PER_LONG); | ||
| 1803 | |||
| 1860 | if (sop->sem_num >= max) | 1804 | if (sop->sem_num >= max) |
| 1861 | max = sop->sem_num; | 1805 | max = sop->sem_num; |
| 1862 | if (sop->sem_flg & SEM_UNDO) | 1806 | if (sop->sem_flg & SEM_UNDO) |
| 1863 | undos = 1; | 1807 | undos = true; |
| 1864 | if (sop->sem_op != 0) | 1808 | if (dup & mask) { |
| 1865 | alter = 1; | 1809 | /* |
| 1810 | * There was a previous alter access that appears | ||
| 1811 | * to have accessed the same semaphore, thus use | ||
| 1812 | * the dupsop logic. "appears", because the detection | ||
| 1813 | * can only check % BITS_PER_LONG. | ||
| 1814 | */ | ||
| 1815 | dupsop = true; | ||
| 1816 | } | ||
| 1817 | if (sop->sem_op != 0) { | ||
| 1818 | alter = true; | ||
| 1819 | dup |= mask; | ||
| 1820 | } | ||
| 1866 | } | 1821 | } |
| 1867 | 1822 | ||
| 1868 | INIT_LIST_HEAD(&tasks); | ||
| 1869 | |||
| 1870 | if (undos) { | 1823 | if (undos) { |
| 1871 | /* On success, find_alloc_undo takes the rcu_read_lock */ | 1824 | /* On success, find_alloc_undo takes the rcu_read_lock */ |
| 1872 | un = find_alloc_undo(ns, semid); | 1825 | un = find_alloc_undo(ns, semid); |
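The loop above flags a possible duplicate by hashing each altering sem_num into one unsigned long bitmask, accepting false positives when two different semaphore numbers collide modulo BITS_PER_LONG. The same check in isolation, as a user-space sketch with invented names:

    #include <limits.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define BITS_PER_ULONG  (sizeof(unsigned long) * CHAR_BIT)

    /* Conservative duplicate detection: may report true for two distinct
     * semaphores whose numbers collide mod BITS_PER_ULONG, but never misses
     * a real duplicate among altering operations. */
    static bool might_have_dup(const unsigned short *sem_nums,
                               const short *sem_ops, size_t nsops)
    {
            unsigned long seen = 0;
            size_t i;

            for (i = 0; i < nsops; i++) {
                    unsigned long mask = 1UL << (sem_nums[i] % BITS_PER_ULONG);

                    if (seen & mask)
                            return true;
                    if (sem_ops[i] != 0)    /* only altering ops are recorded */
                            seen |= mask;
            }
            return false;
    }

A false positive merely sends the call through perform_atomic_semop_slow(), so correctness does not depend on the mask being exact.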
| @@ -1887,16 +1840,22 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1887 | } | 1840 | } |
| 1888 | 1841 | ||
| 1889 | error = -EFBIG; | 1842 | error = -EFBIG; |
| 1890 | if (max >= sma->sem_nsems) | 1843 | if (max >= sma->sem_nsems) { |
| 1891 | goto out_rcu_wakeup; | 1844 | rcu_read_unlock(); |
| 1845 | goto out_free; | ||
| 1846 | } | ||
| 1892 | 1847 | ||
| 1893 | error = -EACCES; | 1848 | error = -EACCES; |
| 1894 | if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) | 1849 | if (ipcperms(ns, &sma->sem_perm, alter ? S_IWUGO : S_IRUGO)) { |
| 1895 | goto out_rcu_wakeup; | 1850 | rcu_read_unlock(); |
| 1851 | goto out_free; | ||
| 1852 | } | ||
| 1896 | 1853 | ||
| 1897 | error = security_sem_semop(sma, sops, nsops, alter); | 1854 | error = security_sem_semop(sma, sops, nsops, alter); |
| 1898 | if (error) | 1855 | if (error) { |
| 1899 | goto out_rcu_wakeup; | 1856 | rcu_read_unlock(); |
| 1857 | goto out_free; | ||
| 1858 | } | ||
| 1900 | 1859 | ||
| 1901 | error = -EIDRM; | 1860 | error = -EIDRM; |
| 1902 | locknum = sem_lock(sma, sops, nsops); | 1861 | locknum = sem_lock(sma, sops, nsops); |
| @@ -1925,24 +1884,34 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1925 | queue.undo = un; | 1884 | queue.undo = un; |
| 1926 | queue.pid = task_tgid_vnr(current); | 1885 | queue.pid = task_tgid_vnr(current); |
| 1927 | queue.alter = alter; | 1886 | queue.alter = alter; |
| 1887 | queue.dupsop = dupsop; | ||
| 1928 | 1888 | ||
| 1929 | error = perform_atomic_semop(sma, &queue); | 1889 | error = perform_atomic_semop(sma, &queue); |
| 1930 | if (error == 0) { | 1890 | if (error == 0) { /* non-blocking successful path */ |
| 1931 | /* If the operation was successful, then do | 1891 | DEFINE_WAKE_Q(wake_q); |
| 1892 | |||
| 1893 | /* | ||
| 1894 | * If the operation was successful, then do | ||
| 1932 | * the required updates. | 1895 | * the required updates. |
| 1933 | */ | 1896 | */ |
| 1934 | if (alter) | 1897 | if (alter) |
| 1935 | do_smart_update(sma, sops, nsops, 1, &tasks); | 1898 | do_smart_update(sma, sops, nsops, 1, &wake_q); |
| 1936 | else | 1899 | else |
| 1937 | set_semotime(sma, sops); | 1900 | set_semotime(sma, sops); |
| 1901 | |||
| 1902 | sem_unlock(sma, locknum); | ||
| 1903 | rcu_read_unlock(); | ||
| 1904 | wake_up_q(&wake_q); | ||
| 1905 | |||
| 1906 | goto out_free; | ||
| 1938 | } | 1907 | } |
| 1939 | if (error <= 0) | 1908 | if (error < 0) /* non-blocking error path */ |
| 1940 | goto out_unlock_free; | 1909 | goto out_unlock_free; |
| 1941 | 1910 | ||
| 1942 | /* We need to sleep on this operation, so we put the current | 1911 | /* |
| 1912 | * We need to sleep on this operation, so we put the current | ||
| 1943 | * task into the pending queue and go to sleep. | 1913 | * task into the pending queue and go to sleep. |
| 1944 | */ | 1914 | */ |
| 1945 | |||
| 1946 | if (nsops == 1) { | 1915 | if (nsops == 1) { |
| 1947 | struct sem *curr; | 1916 | struct sem *curr; |
| 1948 | curr = &sma->sem_base[sops->sem_num]; | 1917 | curr = &sma->sem_base[sops->sem_num]; |
| @@ -1971,77 +1940,69 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1971 | sma->complex_count++; | 1940 | sma->complex_count++; |
| 1972 | } | 1941 | } |
| 1973 | 1942 | ||
| 1974 | queue.status = -EINTR; | 1943 | do { |
| 1975 | queue.sleeper = current; | 1944 | queue.status = -EINTR; |
| 1945 | queue.sleeper = current; | ||
| 1976 | 1946 | ||
| 1977 | sleep_again: | 1947 | __set_current_state(TASK_INTERRUPTIBLE); |
| 1978 | __set_current_state(TASK_INTERRUPTIBLE); | 1948 | sem_unlock(sma, locknum); |
| 1979 | sem_unlock(sma, locknum); | 1949 | rcu_read_unlock(); |
| 1980 | rcu_read_unlock(); | ||
| 1981 | |||
| 1982 | if (timeout) | ||
| 1983 | jiffies_left = schedule_timeout(jiffies_left); | ||
| 1984 | else | ||
| 1985 | schedule(); | ||
| 1986 | 1950 | ||
| 1987 | error = get_queue_result(&queue); | 1951 | if (timeout) |
| 1952 | jiffies_left = schedule_timeout(jiffies_left); | ||
| 1953 | else | ||
| 1954 | schedule(); | ||
| 1988 | 1955 | ||
| 1989 | if (error != -EINTR) { | 1956 | /* |
| 1990 | /* fast path: update_queue already obtained all requested | 1957 | * fastpath: the semop has completed, either successfully or |
| 1991 | * resources. | 1958 | * not, from the syscall pov, is quite irrelevant to us at this |
| 1992 | * Perform a smp_mb(): User space could assume that semop() | 1959 | * point; we're done. |
| 1993 | * is a memory barrier: Without the mb(), the cpu could | 1960 | * |
| 1994 | * speculatively read in user space stale data that was | 1961 | * We _do_ care, nonetheless, about being awoken by a signal or |
| 1995 | * overwritten by the previous owner of the semaphore. | 1962 | * spuriously. The queue.status is checked again in the |
| 1963 | * slowpath (aka after taking sem_lock), such that we can detect | ||
| 1964 | * scenarios where we were awakened externally, during the | ||
| 1965 | * window between wake_q_add() and wake_up_q(). | ||
| 1996 | */ | 1966 | */ |
| 1997 | smp_mb(); | 1967 | error = READ_ONCE(queue.status); |
| 1998 | 1968 | if (error != -EINTR) { | |
| 1999 | goto out_free; | 1969 | /* |
| 2000 | } | 1970 | * User space could assume that semop() is a memory |
| 2001 | 1971 | * barrier: Without the mb(), the cpu could | |
| 2002 | rcu_read_lock(); | 1972 | * speculatively read in userspace stale data that was |
| 2003 | sma = sem_obtain_lock(ns, semid, sops, nsops, &locknum); | 1973 | * overwritten by the previous owner of the semaphore. |
| 2004 | 1974 | */ | |
| 2005 | /* | 1975 | smp_mb(); |
| 2006 | * Wait until it's guaranteed that no wakeup_sem_queue_do() is ongoing. | 1976 | goto out_free; |
| 2007 | */ | 1977 | } |
| 2008 | error = get_queue_result(&queue); | ||
| 2009 | 1978 | ||
| 2010 | /* | 1979 | rcu_read_lock(); |
| 2011 | * Array removed? If yes, leave without sem_unlock(). | 1980 | sem_lock(sma, sops, nsops); |
| 2012 | */ | ||
| 2013 | if (IS_ERR(sma)) { | ||
| 2014 | rcu_read_unlock(); | ||
| 2015 | goto out_free; | ||
| 2016 | } | ||
| 2017 | 1981 | ||
| 1982 | if (!ipc_valid_object(&sma->sem_perm)) | ||
| 1983 | goto out_unlock_free; | ||
| 2018 | 1984 | ||
| 2019 | /* | 1985 | error = READ_ONCE(queue.status); |
| 2020 | * If queue.status != -EINTR we are woken up by another process. | ||
| 2021 | * Leave without unlink_queue(), but with sem_unlock(). | ||
| 2022 | */ | ||
| 2023 | if (error != -EINTR) | ||
| 2024 | goto out_unlock_free; | ||
| 2025 | 1986 | ||
| 2026 | /* | 1987 | /* |
| 2027 | * If an interrupt occurred we have to clean up the queue | 1988 | * If queue.status != -EINTR we are woken up by another process. |
| 2028 | */ | 1989 | * Leave without unlink_queue(), but with sem_unlock(). |
| 2029 | if (timeout && jiffies_left == 0) | 1990 | */ |
| 2030 | error = -EAGAIN; | 1991 | if (error != -EINTR) |
| 1992 | goto out_unlock_free; | ||
| 2031 | 1993 | ||
| 2032 | /* | 1994 | /* |
| 2033 | * If the wakeup was spurious, just retry | 1995 | * If an interrupt occurred we have to clean up the queue. |
| 2034 | */ | 1996 | */ |
| 2035 | if (error == -EINTR && !signal_pending(current)) | 1997 | if (timeout && jiffies_left == 0) |
| 2036 | goto sleep_again; | 1998 | error = -EAGAIN; |
| 1999 | } while (error == -EINTR && !signal_pending(current)); /* spurious */ | ||
| 2037 | 2000 | ||
| 2038 | unlink_queue(sma, &queue); | 2001 | unlink_queue(sma, &queue); |
| 2039 | 2002 | ||
| 2040 | out_unlock_free: | 2003 | out_unlock_free: |
| 2041 | sem_unlock(sma, locknum); | 2004 | sem_unlock(sma, locknum); |
| 2042 | out_rcu_wakeup: | ||
| 2043 | rcu_read_unlock(); | 2005 | rcu_read_unlock(); |
| 2044 | wake_up_sem_queue_do(&tasks); | ||
| 2045 | out_free: | 2006 | out_free: |
| 2046 | if (sops != fast_sops) | 2007 | if (sops != fast_sops) |
| 2047 | kfree(sops); | 2008 | kfree(sops); |
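semtimedop() now wraps the sleep in a do/while loop: a wakeup that finds queue.status still at -EINTR with no signal pending is treated as spurious and the task simply goes back to sleep. A rough stand-alone sketch of that shape, using C11 atomics and usleep() in place of schedule() and wake_up_q(); every name below is invented for illustration.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    #define STATUS_PENDING  (-4)    /* stands in for -EINTR in the patch */

    static atomic_int status = ATOMIC_VAR_INIT(STATUS_PENDING);

    static void *waker(void *arg)
    {
            (void)arg;
            usleep(50 * 1000);      /* the operation eventually completes */
            atomic_store_explicit(&status, 0, memory_order_release);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;
            int err;

            pthread_create(&t, NULL, waker, NULL);
            do {
                    usleep(10 * 1000);      /* "schedule()": may wake for any reason */
                    err = atomic_load_explicit(&status, memory_order_acquire);
                    /* err still PENDING means the wakeup was spurious: sleep again */
            } while (err == STATUS_PENDING);

            printf("woken with status %d\n", err);
            pthread_join(t, NULL);
            return 0;
    }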
| @@ -2102,8 +2063,8 @@ void exit_sem(struct task_struct *tsk) | |||
| 2102 | for (;;) { | 2063 | for (;;) { |
| 2103 | struct sem_array *sma; | 2064 | struct sem_array *sma; |
| 2104 | struct sem_undo *un; | 2065 | struct sem_undo *un; |
| 2105 | struct list_head tasks; | ||
| 2106 | int semid, i; | 2066 | int semid, i; |
| 2067 | DEFINE_WAKE_Q(wake_q); | ||
| 2107 | 2068 | ||
| 2108 | cond_resched(); | 2069 | cond_resched(); |
| 2109 | 2070 | ||
| @@ -2191,11 +2152,10 @@ void exit_sem(struct task_struct *tsk) | |||
| 2191 | } | 2152 | } |
| 2192 | } | 2153 | } |
| 2193 | /* maybe some queued-up processes were waiting for this */ | 2154 | /* maybe some queued-up processes were waiting for this */ |
| 2194 | INIT_LIST_HEAD(&tasks); | 2155 | do_smart_update(sma, NULL, 0, 1, &wake_q); |
| 2195 | do_smart_update(sma, NULL, 0, 1, &tasks); | ||
| 2196 | sem_unlock(sma, -1); | 2156 | sem_unlock(sma, -1); |
| 2197 | rcu_read_unlock(); | 2157 | rcu_read_unlock(); |
| 2198 | wake_up_sem_queue_do(&tasks); | 2158 | wake_up_q(&wake_q); |
| 2199 | 2159 | ||
| 2200 | kfree_rcu(un, rcu); | 2160 | kfree_rcu(un, rcu); |
| 2201 | } | 2161 | } |
| @@ -89,6 +89,7 @@ void shm_init_ns(struct ipc_namespace *ns) | |||
| 89 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) | 89 | static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) |
| 90 | { | 90 | { |
| 91 | struct shmid_kernel *shp; | 91 | struct shmid_kernel *shp; |
| 92 | |||
| 92 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); | 93 | shp = container_of(ipcp, struct shmid_kernel, shm_perm); |
| 93 | 94 | ||
| 94 | if (shp->shm_nattch) { | 95 | if (shp->shm_nattch) { |
| @@ -387,6 +388,7 @@ static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | |||
| 387 | struct file *file = vma->vm_file; | 388 | struct file *file = vma->vm_file; |
| 388 | struct shm_file_data *sfd = shm_file_data(file); | 389 | struct shm_file_data *sfd = shm_file_data(file); |
| 389 | int err = 0; | 390 | int err = 0; |
| 391 | |||
| 390 | if (sfd->vm_ops->set_policy) | 392 | if (sfd->vm_ops->set_policy) |
| 391 | err = sfd->vm_ops->set_policy(vma, new); | 393 | err = sfd->vm_ops->set_policy(vma, new); |
| 392 | return err; | 394 | return err; |
| @@ -417,7 +419,7 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 417 | * In case of remap_file_pages() emulation, the file can represent | 419 | * In case of remap_file_pages() emulation, the file can represent |
| 418 | * removed IPC ID: propagate shm_lock() error to caller. | 420 | * removed IPC ID: propagate shm_lock() error to caller. |
| 419 | */ | 421 | */ |
| 420 | ret =__shm_open(vma); | 422 | ret = __shm_open(vma); |
| 421 | if (ret) | 423 | if (ret) |
| 422 | return ret; | 424 | return ret; |
| 423 | 425 | ||
| @@ -468,6 +470,7 @@ static unsigned long shm_get_unmapped_area(struct file *file, | |||
| 468 | unsigned long flags) | 470 | unsigned long flags) |
| 469 | { | 471 | { |
| 470 | struct shm_file_data *sfd = shm_file_data(file); | 472 | struct shm_file_data *sfd = shm_file_data(file); |
| 473 | |||
| 471 | return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, | 474 | return sfd->file->f_op->get_unmapped_area(sfd->file, addr, len, |
| 472 | pgoff, flags); | 475 | pgoff, flags); |
| 473 | } | 476 | } |
| @@ -766,6 +769,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, | |||
| 766 | } else { | 769 | } else { |
| 767 | #ifdef CONFIG_SHMEM | 770 | #ifdef CONFIG_SHMEM |
| 768 | struct shmem_inode_info *info = SHMEM_I(inode); | 771 | struct shmem_inode_info *info = SHMEM_I(inode); |
| 772 | |||
| 769 | spin_lock_irq(&info->lock); | 773 | spin_lock_irq(&info->lock); |
| 770 | *rss_add += inode->i_mapping->nrpages; | 774 | *rss_add += inode->i_mapping->nrpages; |
| 771 | *swp_add += info->swapped; | 775 | *swp_add += info->swapped; |
| @@ -1028,6 +1032,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) | |||
| 1028 | 1032 | ||
| 1029 | if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { | 1033 | if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { |
| 1030 | kuid_t euid = current_euid(); | 1034 | kuid_t euid = current_euid(); |
| 1035 | |||
| 1031 | if (!uid_eq(euid, shp->shm_perm.uid) && | 1036 | if (!uid_eq(euid, shp->shm_perm.uid) && |
| 1032 | !uid_eq(euid, shp->shm_perm.cuid)) { | 1037 | !uid_eq(euid, shp->shm_perm.cuid)) { |
| 1033 | err = -EPERM; | 1038 | err = -EPERM; |
| @@ -1045,6 +1050,7 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) | |||
| 1045 | 1050 | ||
| 1046 | if (cmd == SHM_LOCK) { | 1051 | if (cmd == SHM_LOCK) { |
| 1047 | struct user_struct *user = current_user(); | 1052 | struct user_struct *user = current_user(); |
| 1053 | |||
| 1048 | err = shmem_lock(shm_file, 1, user); | 1054 | err = shmem_lock(shm_file, 1, user); |
| 1049 | if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { | 1055 | if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { |
| 1050 | shp->shm_perm.mode |= SHM_LOCKED; | 1056 | shp->shm_perm.mode |= SHM_LOCKED; |
| @@ -1354,9 +1360,10 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) | |||
| 1354 | vma = next; | 1360 | vma = next; |
| 1355 | } | 1361 | } |
| 1356 | 1362 | ||
| 1357 | #else /* CONFIG_MMU */ | 1363 | #else /* CONFIG_MMU */ |
| 1358 | /* under NOMMU conditions, the exact address to be destroyed must be | 1364 | /* under NOMMU conditions, the exact address to be destroyed must be |
| 1359 | * given */ | 1365 | * given |
| 1366 | */ | ||
| 1360 | if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { | 1367 | if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { |
| 1361 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); | 1368 | do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); |
| 1362 | retval = 0; | 1369 | retval = 0; |
diff --git a/kernel/Makefile b/kernel/Makefile index eaee9de224bd..12c679f769c6 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
| @@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o | |||
| 84 | obj-$(CONFIG_KGDB) += debug/ | 84 | obj-$(CONFIG_KGDB) += debug/ |
| 85 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o | 85 | obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o |
| 86 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o | 86 | obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o |
| 87 | obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o | ||
| 87 | obj-$(CONFIG_SECCOMP) += seccomp.o | 88 | obj-$(CONFIG_SECCOMP) += seccomp.o |
| 88 | obj-$(CONFIG_RELAY) += relay.o | 89 | obj-$(CONFIG_RELAY) += relay.o |
| 89 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 90 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0874e2edd275..79517e5549f1 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -598,11 +598,11 @@ return_normal: | |||
| 598 | /* | 598 | /* |
| 599 | * Wait for the other CPUs to be notified and be waiting for us: | 599 | * Wait for the other CPUs to be notified and be waiting for us: |
| 600 | */ | 600 | */ |
| 601 | time_left = loops_per_jiffy * HZ; | 601 | time_left = MSEC_PER_SEC; |
| 602 | while (kgdb_do_roundup && --time_left && | 602 | while (kgdb_do_roundup && --time_left && |
| 603 | (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != | 603 | (atomic_read(&masters_in_kgdb) + atomic_read(&slaves_in_kgdb)) != |
| 604 | online_cpus) | 604 | online_cpus) |
| 605 | cpu_relax(); | 605 | udelay(1000); |
| 606 | if (!time_left) | 606 | if (!time_left) |
| 607 | pr_crit("Timed out waiting for secondary CPUs.\n"); | 607 | pr_crit("Timed out waiting for secondary CPUs.\n"); |
| 608 | 608 | ||
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 98c9011eac78..e74be38245ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | char kdb_prompt_str[CMD_BUFLEN]; | 30 | char kdb_prompt_str[CMD_BUFLEN]; |
| 31 | 31 | ||
| 32 | int kdb_trap_printk; | 32 | int kdb_trap_printk; |
| 33 | int kdb_printf_cpu = -1; | ||
| 33 | 34 | ||
| 34 | static int kgdb_transition_check(char *buffer) | 35 | static int kgdb_transition_check(char *buffer) |
| 35 | { | 36 | { |
| @@ -554,31 +555,26 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) | |||
| 554 | int linecount; | 555 | int linecount; |
| 555 | int colcount; | 556 | int colcount; |
| 556 | int logging, saved_loglevel = 0; | 557 | int logging, saved_loglevel = 0; |
| 557 | int saved_trap_printk; | ||
| 558 | int got_printf_lock = 0; | ||
| 559 | int retlen = 0; | 558 | int retlen = 0; |
| 560 | int fnd, len; | 559 | int fnd, len; |
| 560 | int this_cpu, old_cpu; | ||
| 561 | char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; | 561 | char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; |
| 562 | char *moreprompt = "more> "; | 562 | char *moreprompt = "more> "; |
| 563 | struct console *c = console_drivers; | 563 | struct console *c = console_drivers; |
| 564 | static DEFINE_SPINLOCK(kdb_printf_lock); | ||
| 565 | unsigned long uninitialized_var(flags); | 564 | unsigned long uninitialized_var(flags); |
| 566 | 565 | ||
| 567 | preempt_disable(); | ||
| 568 | saved_trap_printk = kdb_trap_printk; | ||
| 569 | kdb_trap_printk = 0; | ||
| 570 | |||
| 571 | /* Serialize kdb_printf if multiple cpus try to write at once. | 566 | /* Serialize kdb_printf if multiple cpus try to write at once. |
| 572 | * But if any cpu goes recursive in kdb, just print the output, | 567 | * But if any cpu goes recursive in kdb, just print the output, |
| 573 | * even if it is interleaved with any other text. | 568 | * even if it is interleaved with any other text. |
| 574 | */ | 569 | */ |
| 575 | if (!KDB_STATE(PRINTF_LOCK)) { | 570 | local_irq_save(flags); |
| 576 | KDB_STATE_SET(PRINTF_LOCK); | 571 | this_cpu = smp_processor_id(); |
| 577 | spin_lock_irqsave(&kdb_printf_lock, flags); | 572 | for (;;) { |
| 578 | got_printf_lock = 1; | 573 | old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); |
| 579 | atomic_inc(&kdb_event); | 574 | if (old_cpu == -1 || old_cpu == this_cpu) |
| 580 | } else { | 575 | break; |
| 581 | __acquire(kdb_printf_lock); | 576 | |
| 577 | cpu_relax(); | ||
| 582 | } | 578 | } |
| 583 | 579 | ||
| 584 | diag = kdbgetintenv("LINES", &linecount); | 580 | diag = kdbgetintenv("LINES", &linecount); |
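vkdb_printf() above replaces the spinlock plus PRINTF_LOCK state flag with a bare cmpxchg on kdb_printf_cpu, letting the owning CPU re-enter and restoring the previous owner on exit with a release store. A user-space sketch of the same owner-slot idea, using C11 atomics and thread ids instead of CPU numbers; all names are invented.

    #include <sched.h>
    #include <stdatomic.h>

    static _Atomic long printf_owner = -1;  /* -1 means unowned, like kdb_printf_cpu */

    /* Acquire the slot and return the previous owner so the caller can restore
     * it on release; a recursive entry therefore unlocks back to "self", not
     * to "unowned", just as the code above stores old_cpu rather than -1. */
    long owner_slot_lock(long self)
    {
            for (;;) {
                    long old = -1;

                    if (atomic_compare_exchange_strong(&printf_owner, &old, self))
                            return -1;      /* took a free slot */
                    if (old == self)
                            return self;    /* recursive entry by the same owner */
                    sched_yield();          /* someone else holds it: keep spinning */
            }
    }

    void owner_slot_unlock(long prev)
    {
            /* release store, like smp_store_release(&kdb_printf_cpu, old_cpu) */
            atomic_store_explicit(&printf_owner, prev, memory_order_release);
    }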
| @@ -847,16 +843,9 @@ kdb_print_out: | |||
| 847 | suspend_grep = 0; /* end of what may have been a recursive call */ | 843 | suspend_grep = 0; /* end of what may have been a recursive call */ |
| 848 | if (logging) | 844 | if (logging) |
| 849 | console_loglevel = saved_loglevel; | 845 | console_loglevel = saved_loglevel; |
| 850 | if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { | 846 | /* kdb_printf_cpu locked the code above. */ |
| 851 | got_printf_lock = 0; | 847 | smp_store_release(&kdb_printf_cpu, old_cpu); |
| 852 | spin_unlock_irqrestore(&kdb_printf_lock, flags); | 848 | local_irq_restore(flags); |
| 853 | KDB_STATE_CLEAR(PRINTF_LOCK); | ||
| 854 | atomic_dec(&kdb_event); | ||
| 855 | } else { | ||
| 856 | __release(kdb_printf_lock); | ||
| 857 | } | ||
| 858 | kdb_trap_printk = saved_trap_printk; | ||
| 859 | preempt_enable(); | ||
| 860 | return retlen; | 849 | return retlen; |
| 861 | } | 850 | } |
| 862 | 851 | ||
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2a20c0dfdafc..ca183919d302 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -60,7 +60,6 @@ int kdb_grep_trailing; | |||
| 60 | * Kernel debugger state flags | 60 | * Kernel debugger state flags |
| 61 | */ | 61 | */ |
| 62 | int kdb_flags; | 62 | int kdb_flags; |
| 63 | atomic_t kdb_event; | ||
| 64 | 63 | ||
| 65 | /* | 64 | /* |
| 66 | * kdb_lock protects updates to kdb_initial_cpu. Used to | 65 | * kdb_lock protects updates to kdb_initial_cpu. Used to |
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index 75014d7f4568..fc224fbcf954 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h | |||
| @@ -132,7 +132,6 @@ extern int kdb_state; | |||
| 132 | #define KDB_STATE_PAGER 0x00000400 /* pager is available */ | 132 | #define KDB_STATE_PAGER 0x00000400 /* pager is available */ |
| 133 | #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching | 133 | #define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching |
| 134 | * back to initial cpu */ | 134 | * back to initial cpu */ |
| 135 | #define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ | ||
| 136 | #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ | 135 | #define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ |
| 137 | #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ | 136 | #define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ |
| 138 | #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been | 137 | #define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, | |||
| 301 | retry: | 301 | retry: |
| 302 | /* Read the page with vaddr into memory */ | 302 | /* Read the page with vaddr into memory */ |
| 303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, | 303 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, |
| 304 | &vma); | 304 | &vma, NULL); |
| 305 | if (ret <= 0) | 305 | if (ret <= 0) |
| 306 | return ret; | 306 | return ret; |
| 307 | 307 | ||
| @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) | |||
| 1712 | * essentially a kernel access to the memory. | 1712 | * essentially a kernel access to the memory. |
| 1713 | */ | 1713 | */ |
| 1714 | result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, | 1714 | result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, |
| 1715 | NULL); | 1715 | NULL, NULL); |
| 1716 | if (result < 0) | 1716 | if (result < 0) |
| 1717 | return result; | 1717 | return result; |
| 1718 | 1718 | ||
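Both uprobes call sites above gain a trailing NULL because get_user_pages_remote() grew an extra output parameter in this series (in mainline it is int *locked); callers that do not need it simply pass NULL and keep their old behaviour. A tiny self-contained sketch of that calling convention, with made-up names:

#include <stdio.h>

/* Hypothetical helper mirroring the API change: a trailing out-parameter is
 * only written when the caller actually passes a pointer for it. */
static int lookup(int idx, int *value, int *locked)
{
    if (locked)
        *locked = 1;            /* optional extra information */
    *value = idx * 2;
    return 0;
}

int main(void)
{
    int v;
    lookup(3, &v, NULL);        /* old-style caller: extra NULL, same result */
    printf("%d\n", v);
    return 0;
}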
diff --git a/kernel/kcov.c b/kernel/kcov.c index 3cbb0c879705..cc2fa35ca480 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c | |||
| @@ -1,11 +1,16 @@ | |||
| 1 | #define pr_fmt(fmt) "kcov: " fmt | 1 | #define pr_fmt(fmt) "kcov: " fmt |
| 2 | 2 | ||
| 3 | #define DISABLE_BRANCH_PROFILING | 3 | #define DISABLE_BRANCH_PROFILING |
| 4 | #include <linux/atomic.h> | ||
| 4 | #include <linux/compiler.h> | 5 | #include <linux/compiler.h> |
| 6 | #include <linux/errno.h> | ||
| 7 | #include <linux/export.h> | ||
| 5 | #include <linux/types.h> | 8 | #include <linux/types.h> |
| 6 | #include <linux/file.h> | 9 | #include <linux/file.h> |
| 7 | #include <linux/fs.h> | 10 | #include <linux/fs.h> |
| 11 | #include <linux/init.h> | ||
| 8 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 13 | #include <linux/preempt.h> | ||
| 9 | #include <linux/printk.h> | 14 | #include <linux/printk.h> |
| 10 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
| 11 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 561675589511..5617cc412444 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
| @@ -441,6 +441,8 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image, | |||
| 441 | while (hole_end <= crashk_res.end) { | 441 | while (hole_end <= crashk_res.end) { |
| 442 | unsigned long i; | 442 | unsigned long i; |
| 443 | 443 | ||
| 444 | cond_resched(); | ||
| 445 | |||
| 444 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) | 446 | if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) |
| 445 | break; | 447 | break; |
| 446 | /* See if I overlap any of the segments */ | 448 | /* See if I overlap any of the segments */ |
| @@ -1467,9 +1469,6 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
| 1467 | #endif | 1469 | #endif |
| 1468 | VMCOREINFO_NUMBER(PG_head_mask); | 1470 | VMCOREINFO_NUMBER(PG_head_mask); |
| 1469 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | 1471 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); |
| 1470 | #ifdef CONFIG_X86 | ||
| 1471 | VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE); | ||
| 1472 | #endif | ||
| 1473 | #ifdef CONFIG_HUGETLB_PAGE | 1472 | #ifdef CONFIG_HUGETLB_PAGE |
| 1474 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); | 1473 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); |
| 1475 | #endif | 1474 | #endif |
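The first kexec_core.c hunk drops a cond_resched() into the crash-control hole search, which can walk a large reserved region; the second removes the x86-only KERNEL_IMAGE_SIZE export from the generic vmcoreinfo code. A userspace analogue of the yield-inside-a-long-scan pattern, purely illustrative since cond_resched() is a kernel primitive:

#include <sched.h>
#include <stdio.h>

int main(void)
{
    unsigned long candidates = 0;

    for (unsigned long hole = 0; hole < 100000UL; hole++) {
        sched_yield();              /* where the kernel loop now calls cond_resched() */
        if ((hole & 0xfff) == 0)    /* stand-in for the overlap/placement checks */
            candidates++;
    }
    printf("candidates: %lu\n", candidates);
    return 0;
}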
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 577f2288d19f..a3ce35e0fa1e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args) | |||
| 1926 | int r; | 1926 | int r; |
| 1927 | 1927 | ||
| 1928 | #ifdef CONFIG_KGDB_KDB | 1928 | #ifdef CONFIG_KGDB_KDB |
| 1929 | if (unlikely(kdb_trap_printk)) { | 1929 | /* Allow to pass printk() to kdb but avoid a recursion. */ |
| 1930 | if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { | ||
| 1930 | r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); | 1931 | r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); |
| 1931 | return r; | 1932 | return r; |
| 1932 | } | 1933 | } |
diff --git a/kernel/relay.c b/kernel/relay.c index da79a109dbeb..8f18d314a96a 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -809,11 +809,11 @@ void relay_subbufs_consumed(struct rchan *chan, | |||
| 809 | { | 809 | { |
| 810 | struct rchan_buf *buf; | 810 | struct rchan_buf *buf; |
| 811 | 811 | ||
| 812 | if (!chan) | 812 | if (!chan || cpu >= NR_CPUS) |
| 813 | return; | 813 | return; |
| 814 | 814 | ||
| 815 | buf = *per_cpu_ptr(chan->buf, cpu); | 815 | buf = *per_cpu_ptr(chan->buf, cpu); |
| 816 | if (cpu >= NR_CPUS || !buf || subbufs_consumed > chan->n_subbufs) | 816 | if (!buf || subbufs_consumed > chan->n_subbufs) |
| 817 | return; | 817 | return; |
| 818 | 818 | ||
| 819 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) | 819 | if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed) |
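The relay fix is an ordering one: the old code dereferenced per_cpu_ptr(chan->buf, cpu) before checking cpu >= NR_CPUS, so an out-of-range cpu was only rejected after it had already been used as an index. The rewritten check validates it up front. The same validate-before-index rule in a self-contained sketch (the array and names are made up):

#include <stdio.h>

#define NBUF 4
static int bufs[NBUF];

/* Reject an out-of-range index before it is used, not after the (already
 * out-of-bounds) access, mirroring the "!chan || cpu >= NR_CPUS" check. */
static int read_buf(int cpu)
{
    if (cpu < 0 || cpu >= NBUF)
        return -1;              /* validated first */
    return bufs[cpu];           /* only now is the array access safe */
}

int main(void)
{
    printf("%d\n", read_buf(2));    /* valid index */
    printf("%d\n", read_buf(99));   /* rejected instead of reading past the array */
    return 0;
}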
diff --git a/kernel/signal.c b/kernel/signal.c index 29a410780aa9..ae60996fedff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -2491,6 +2491,13 @@ void __set_current_blocked(const sigset_t *newset) | |||
| 2491 | { | 2491 | { |
| 2492 | struct task_struct *tsk = current; | 2492 | struct task_struct *tsk = current; |
| 2493 | 2493 | ||
| 2494 | /* | ||
| 2495 | * In case the signal mask hasn't changed, there is nothing we need | ||
| 2496 | * to do. The current->blocked shouldn't be modified by other task. | ||
| 2497 | */ | ||
| 2498 | if (sigequalsets(&tsk->blocked, newset)) | ||
| 2499 | return; | ||
| 2500 | |||
| 2494 | spin_lock_irq(&tsk->sighand->siglock); | 2501 | spin_lock_irq(&tsk->sighand->siglock); |
| 2495 | __set_task_blocked(tsk, newset); | 2502 | __set_task_blocked(tsk, newset); |
| 2496 | spin_unlock_irq(&tsk->sighand->siglock); | 2503 | spin_unlock_irq(&tsk->sighand->siglock); |
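__set_current_blocked() now bails out early when the new mask equals the current one, skipping the siglock round-trip entirely; the unlocked sigequalsets() compare is safe because, as the added comment notes, only the task itself updates its own blocked set. A compare-before-lock sketch of the same idea (toy types, a pthread mutex standing in for sighand->siglock):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct task {
    pthread_mutex_t lock;
    unsigned long blocked[2];   /* toy "signal set" */
};

static void set_blocked(struct task *t, const unsigned long newset[2])
{
    /* Nothing to do if the mask is unchanged; saves the lock round-trip.
     * Safe unlocked read here because only this thread writes t->blocked. */
    if (memcmp(t->blocked, newset, sizeof(t->blocked)) == 0)
        return;

    pthread_mutex_lock(&t->lock);
    memcpy(t->blocked, newset, sizeof(t->blocked));
    pthread_mutex_unlock(&t->lock);
}

int main(void)
{
    struct task t = { .lock = PTHREAD_MUTEX_INITIALIZER, .blocked = { 0, 0 } };
    unsigned long newset[2] = { 0x1, 0 };

    set_blocked(&t, newset);    /* changed: takes the lock */
    set_blocked(&t, newset);    /* unchanged: returns early */
    printf("blocked[0]=%lx\n", t.blocked[0]);
    return 0;
}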
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 39b3368f6de6..1475d2545b7e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -2389,9 +2389,11 @@ static void validate_coredump_safety(void) | |||
| 2389 | #ifdef CONFIG_COREDUMP | 2389 | #ifdef CONFIG_COREDUMP |
| 2390 | if (suid_dumpable == SUID_DUMP_ROOT && | 2390 | if (suid_dumpable == SUID_DUMP_ROOT && |
| 2391 | core_pattern[0] != '/' && core_pattern[0] != '|') { | 2391 | core_pattern[0] != '/' && core_pattern[0] != '|') { |
| 2392 | printk(KERN_WARNING "Unsafe core_pattern used with "\ | 2392 | printk(KERN_WARNING |
| 2393 | "suid_dumpable=2. Pipe handler or fully qualified "\ | 2393 | "Unsafe core_pattern used with fs.suid_dumpable=2.\n" |
| 2394 | "core dump path required.\n"); | 2394 | "Pipe handler or fully qualified core dump path required.\n" |
| 2395 | "Set kernel.core_pattern before fs.suid_dumpable.\n" | ||
| 2396 | ); | ||
| 2395 | } | 2397 | } |
| 2396 | #endif | 2398 | #endif |
| 2397 | } | 2399 | } |
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 6eb99c17dbd8..ece4b177052b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c | |||
| @@ -1354,8 +1354,8 @@ static void deprecated_sysctl_warning(const int *name, int nlen) | |||
| 1354 | "warning: process `%s' used the deprecated sysctl " | 1354 | "warning: process `%s' used the deprecated sysctl " |
| 1355 | "system call with ", current->comm); | 1355 | "system call with ", current->comm); |
| 1356 | for (i = 0; i < nlen; i++) | 1356 | for (i = 0; i < nlen; i++) |
| 1357 | printk("%d.", name[i]); | 1357 | printk(KERN_CONT "%d.", name[i]); |
| 1358 | printk("\n"); | 1358 | printk(KERN_CONT "\n"); |
| 1359 | } | 1359 | } |
| 1360 | return; | 1360 | return; |
| 1361 | } | 1361 | } |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9b08ca391aed..3921cf7fea8e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -516,7 +516,8 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, | |||
| 516 | 516 | ||
| 517 | spin_lock_irqsave(&ptr->it_lock, flags); | 517 | spin_lock_irqsave(&ptr->it_lock, flags); |
| 518 | if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { | 518 | if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) { |
| 519 | if (posix_timer_event(ptr, 0) != 0) | 519 | if (IS_ENABLED(CONFIG_POSIX_TIMERS) && |
| 520 | posix_timer_event(ptr, 0) != 0) | ||
| 520 | ptr->it_overrun++; | 521 | ptr->it_overrun++; |
| 521 | } | 522 | } |
| 522 | 523 | ||
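The alarmtimer hunk guards the posix_timer_event() call with IS_ENABLED(CONFIG_POSIX_TIMERS): when posix timers are compiled out the condition is constant-false, so the call is still type-checked but can be discarded by the compiler. A simplified stand-in for the mechanism (the real IS_ENABLED() in <linux/kconfig.h> also copes with =m options):

#include <stdio.h>

/* Simplified stand-in: evaluates to 0 or 1 at compile time. */
#define IS_ENABLED(option) (option)
#define CONFIG_POSIX_TIMERS 0           /* pretend the feature is compiled out */

static int posix_timer_event(void)      /* placeholder for the real call */
{
    return 0;
}

int main(void)
{
    int overrun = 0;

    /* Short-circuit: with the config off the call is never made, and the
     * compiler may drop posix_timer_event() entirely, yet the code still
     * compiles against its prototype. */
    if (IS_ENABLED(CONFIG_POSIX_TIMERS) && posix_timer_event() != 0)
        overrun++;

    printf("overrun=%d\n", overrun);
    return 0;
}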
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 9acb29f280ec..d4b0fa01cae3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -24,32 +24,14 @@ | |||
| 24 | 24 | ||
| 25 | #include <asm/irq_regs.h> | 25 | #include <asm/irq_regs.h> |
| 26 | #include <linux/kvm_para.h> | 26 | #include <linux/kvm_para.h> |
| 27 | #include <linux/perf_event.h> | ||
| 28 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
| 29 | 28 | ||
| 30 | /* | ||
| 31 | * The run state of the lockup detectors is controlled by the content of the | ||
| 32 | * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - | ||
| 33 | * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. | ||
| 34 | * | ||
| 35 | * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' | ||
| 36 | * are variables that are only used as an 'interface' between the parameters | ||
| 37 | * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The | ||
| 38 | * 'watchdog_thresh' variable is handled differently because its value is not | ||
| 39 | * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' | ||
| 40 | * is equal zero. | ||
| 41 | */ | ||
| 42 | #define NMI_WATCHDOG_ENABLED_BIT 0 | ||
| 43 | #define SOFT_WATCHDOG_ENABLED_BIT 1 | ||
| 44 | #define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) | ||
| 45 | #define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) | ||
| 46 | |||
| 47 | static DEFINE_MUTEX(watchdog_proc_mutex); | 29 | static DEFINE_MUTEX(watchdog_proc_mutex); |
| 48 | 30 | ||
| 49 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 31 | #if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) |
| 50 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; | 32 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; |
| 51 | #else | 33 | #else |
| 52 | static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; | 34 | unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; |
| 53 | #endif | 35 | #endif |
| 54 | int __read_mostly nmi_watchdog_enabled; | 36 | int __read_mostly nmi_watchdog_enabled; |
| 55 | int __read_mostly soft_watchdog_enabled; | 37 | int __read_mostly soft_watchdog_enabled; |
| @@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10; | |||
| 59 | #ifdef CONFIG_SMP | 41 | #ifdef CONFIG_SMP |
| 60 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; | 42 | int __read_mostly sysctl_softlockup_all_cpu_backtrace; |
| 61 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; | 43 | int __read_mostly sysctl_hardlockup_all_cpu_backtrace; |
| 62 | #else | ||
| 63 | #define sysctl_softlockup_all_cpu_backtrace 0 | ||
| 64 | #define sysctl_hardlockup_all_cpu_backtrace 0 | ||
| 65 | #endif | 44 | #endif |
| 66 | static struct cpumask watchdog_cpumask __read_mostly; | 45 | static struct cpumask watchdog_cpumask __read_mostly; |
| 67 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); | 46 | unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); |
| @@ -100,50 +79,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn); | |||
| 100 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | 79 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); |
| 101 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | 80 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); |
| 102 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); | 81 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); |
| 103 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 104 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | ||
| 105 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | ||
| 106 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | 82 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); |
| 107 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | ||
| 108 | #endif | ||
| 109 | static unsigned long soft_lockup_nmi_warn; | 83 | static unsigned long soft_lockup_nmi_warn; |
| 110 | 84 | ||
| 111 | /* boot commands */ | ||
| 112 | /* | ||
| 113 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
| 114 | */ | ||
| 115 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 116 | unsigned int __read_mostly hardlockup_panic = | ||
| 117 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
| 118 | static unsigned long hardlockup_allcpu_dumped; | ||
| 119 | /* | ||
| 120 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 121 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 122 | * cases this function can be called to disable hard lockup detection. This | ||
| 123 | * function should only be executed once by the boot processor before the | ||
| 124 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 125 | * possible to override this in hardlockup_panic_setup(). | ||
| 126 | */ | ||
| 127 | void hardlockup_detector_disable(void) | ||
| 128 | { | ||
| 129 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 130 | } | ||
| 131 | |||
| 132 | static int __init hardlockup_panic_setup(char *str) | ||
| 133 | { | ||
| 134 | if (!strncmp(str, "panic", 5)) | ||
| 135 | hardlockup_panic = 1; | ||
| 136 | else if (!strncmp(str, "nopanic", 7)) | ||
| 137 | hardlockup_panic = 0; | ||
| 138 | else if (!strncmp(str, "0", 1)) | ||
| 139 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 140 | else if (!strncmp(str, "1", 1)) | ||
| 141 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
| 142 | return 1; | ||
| 143 | } | ||
| 144 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
| 145 | #endif | ||
| 146 | |||
| 147 | unsigned int __read_mostly softlockup_panic = | 85 | unsigned int __read_mostly softlockup_panic = |
| 148 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; | 86 | CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; |
| 149 | 87 | ||
| @@ -264,32 +202,14 @@ void touch_all_softlockup_watchdogs(void) | |||
| 264 | wq_watchdog_touch(-1); | 202 | wq_watchdog_touch(-1); |
| 265 | } | 203 | } |
| 266 | 204 | ||
| 267 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 268 | void touch_nmi_watchdog(void) | ||
| 269 | { | ||
| 270 | /* | ||
| 271 | * Using __raw here because some code paths have | ||
| 272 | * preemption enabled. If preemption is enabled | ||
| 273 | * then interrupts should be enabled too, in which | ||
| 274 | * case we shouldn't have to worry about the watchdog | ||
| 275 | * going off. | ||
| 276 | */ | ||
| 277 | raw_cpu_write(watchdog_nmi_touch, true); | ||
| 278 | touch_softlockup_watchdog(); | ||
| 279 | } | ||
| 280 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
| 281 | |||
| 282 | #endif | ||
| 283 | |||
| 284 | void touch_softlockup_watchdog_sync(void) | 205 | void touch_softlockup_watchdog_sync(void) |
| 285 | { | 206 | { |
| 286 | __this_cpu_write(softlockup_touch_sync, true); | 207 | __this_cpu_write(softlockup_touch_sync, true); |
| 287 | __this_cpu_write(watchdog_touch_ts, 0); | 208 | __this_cpu_write(watchdog_touch_ts, 0); |
| 288 | } | 209 | } |
| 289 | 210 | ||
| 290 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 291 | /* watchdog detector functions */ | 211 | /* watchdog detector functions */ |
| 292 | static bool is_hardlockup(void) | 212 | bool is_hardlockup(void) |
| 293 | { | 213 | { |
| 294 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); | 214 | unsigned long hrint = __this_cpu_read(hrtimer_interrupts); |
| 295 | 215 | ||
| @@ -299,7 +219,6 @@ static bool is_hardlockup(void) | |||
| 299 | __this_cpu_write(hrtimer_interrupts_saved, hrint); | 219 | __this_cpu_write(hrtimer_interrupts_saved, hrint); |
| 300 | return false; | 220 | return false; |
| 301 | } | 221 | } |
| 302 | #endif | ||
| 303 | 222 | ||
| 304 | static int is_softlockup(unsigned long touch_ts) | 223 | static int is_softlockup(unsigned long touch_ts) |
| 305 | { | 224 | { |
| @@ -313,78 +232,22 @@ static int is_softlockup(unsigned long touch_ts) | |||
| 313 | return 0; | 232 | return 0; |
| 314 | } | 233 | } |
| 315 | 234 | ||
| 316 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 317 | |||
| 318 | static struct perf_event_attr wd_hw_attr = { | ||
| 319 | .type = PERF_TYPE_HARDWARE, | ||
| 320 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
| 321 | .size = sizeof(struct perf_event_attr), | ||
| 322 | .pinned = 1, | ||
| 323 | .disabled = 1, | ||
| 324 | }; | ||
| 325 | |||
| 326 | /* Callback function for perf event subsystem */ | ||
| 327 | static void watchdog_overflow_callback(struct perf_event *event, | ||
| 328 | struct perf_sample_data *data, | ||
| 329 | struct pt_regs *regs) | ||
| 330 | { | ||
| 331 | /* Ensure the watchdog never gets throttled */ | ||
| 332 | event->hw.interrupts = 0; | ||
| 333 | |||
| 334 | if (__this_cpu_read(watchdog_nmi_touch) == true) { | ||
| 335 | __this_cpu_write(watchdog_nmi_touch, false); | ||
| 336 | return; | ||
| 337 | } | ||
| 338 | |||
| 339 | /* check for a hardlockup | ||
| 340 | * This is done by making sure our timer interrupt | ||
| 341 | * is incrementing. The timer interrupt should have | ||
| 342 | * fired multiple times before we overflow'd. If it hasn't | ||
| 343 | * then this is a good indication the cpu is stuck | ||
| 344 | */ | ||
| 345 | if (is_hardlockup()) { | ||
| 346 | int this_cpu = smp_processor_id(); | ||
| 347 | struct pt_regs *regs = get_irq_regs(); | ||
| 348 | |||
| 349 | /* only print hardlockups once */ | ||
| 350 | if (__this_cpu_read(hard_watchdog_warn) == true) | ||
| 351 | return; | ||
| 352 | |||
| 353 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
| 354 | print_modules(); | ||
| 355 | print_irqtrace_events(current); | ||
| 356 | if (regs) | ||
| 357 | show_regs(regs); | ||
| 358 | else | ||
| 359 | dump_stack(); | ||
| 360 | |||
| 361 | /* | ||
| 362 | * Perform all-CPU dump only once to avoid multiple hardlockups | ||
| 363 | * generating interleaving traces | ||
| 364 | */ | ||
| 365 | if (sysctl_hardlockup_all_cpu_backtrace && | ||
| 366 | !test_and_set_bit(0, &hardlockup_allcpu_dumped)) | ||
| 367 | trigger_allbutself_cpu_backtrace(); | ||
| 368 | |||
| 369 | if (hardlockup_panic) | ||
| 370 | nmi_panic(regs, "Hard LOCKUP"); | ||
| 371 | |||
| 372 | __this_cpu_write(hard_watchdog_warn, true); | ||
| 373 | return; | ||
| 374 | } | ||
| 375 | |||
| 376 | __this_cpu_write(hard_watchdog_warn, false); | ||
| 377 | return; | ||
| 378 | } | ||
| 379 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
| 380 | |||
| 381 | static void watchdog_interrupt_count(void) | 235 | static void watchdog_interrupt_count(void) |
| 382 | { | 236 | { |
| 383 | __this_cpu_inc(hrtimer_interrupts); | 237 | __this_cpu_inc(hrtimer_interrupts); |
| 384 | } | 238 | } |
| 385 | 239 | ||
| 386 | static int watchdog_nmi_enable(unsigned int cpu); | 240 | /* |
| 387 | static void watchdog_nmi_disable(unsigned int cpu); | 241 | * These two functions are mostly architecture specific |
| 242 | * defining them as weak here. | ||
| 243 | */ | ||
| 244 | int __weak watchdog_nmi_enable(unsigned int cpu) | ||
| 245 | { | ||
| 246 | return 0; | ||
| 247 | } | ||
| 248 | void __weak watchdog_nmi_disable(unsigned int cpu) | ||
| 249 | { | ||
| 250 | } | ||
| 388 | 251 | ||
| 389 | static int watchdog_enable_all_cpus(void); | 252 | static int watchdog_enable_all_cpus(void); |
| 390 | static void watchdog_disable_all_cpus(void); | 253 | static void watchdog_disable_all_cpus(void); |
| @@ -577,109 +440,6 @@ static void watchdog(unsigned int cpu) | |||
| 577 | watchdog_nmi_disable(cpu); | 440 | watchdog_nmi_disable(cpu); |
| 578 | } | 441 | } |
| 579 | 442 | ||
| 580 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | ||
| 581 | /* | ||
| 582 | * People like the simple clean cpu node info on boot. | ||
| 583 | * Reduce the watchdog noise by only printing messages | ||
| 584 | * that are different from what cpu0 displayed. | ||
| 585 | */ | ||
| 586 | static unsigned long cpu0_err; | ||
| 587 | |||
| 588 | static int watchdog_nmi_enable(unsigned int cpu) | ||
| 589 | { | ||
| 590 | struct perf_event_attr *wd_attr; | ||
| 591 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
| 592 | |||
| 593 | /* nothing to do if the hard lockup detector is disabled */ | ||
| 594 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
| 595 | goto out; | ||
| 596 | |||
| 597 | /* is it already setup and enabled? */ | ||
| 598 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
| 599 | goto out; | ||
| 600 | |||
| 601 | /* it is setup but not enabled */ | ||
| 602 | if (event != NULL) | ||
| 603 | goto out_enable; | ||
| 604 | |||
| 605 | wd_attr = &wd_hw_attr; | ||
| 606 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); | ||
| 607 | |||
| 608 | /* Try to register using hardware perf events */ | ||
| 609 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | ||
| 610 | |||
| 611 | /* save cpu0 error for future comparision */ | ||
| 612 | if (cpu == 0 && IS_ERR(event)) | ||
| 613 | cpu0_err = PTR_ERR(event); | ||
| 614 | |||
| 615 | if (!IS_ERR(event)) { | ||
| 616 | /* only print for cpu0 or different than cpu0 */ | ||
| 617 | if (cpu == 0 || cpu0_err) | ||
| 618 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
| 619 | goto out_save; | ||
| 620 | } | ||
| 621 | |||
| 622 | /* | ||
| 623 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
| 624 | * set up the hardware perf event. The watchdog() function checks | ||
| 625 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
| 626 | * | ||
| 627 | * The barriers are for syncing up watchdog_enabled across all the | ||
| 628 | * cpus, as clear_bit() does not use barriers. | ||
| 629 | */ | ||
| 630 | smp_mb__before_atomic(); | ||
| 631 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
| 632 | smp_mb__after_atomic(); | ||
| 633 | |||
| 634 | /* skip displaying the same error again */ | ||
| 635 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
| 636 | return PTR_ERR(event); | ||
| 637 | |||
| 638 | /* vary the KERN level based on the returned errno */ | ||
| 639 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
| 640 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
| 641 | else if (PTR_ERR(event) == -ENOENT) | ||
| 642 | pr_warn("disabled (cpu%i): hardware events not enabled\n", | ||
| 643 | cpu); | ||
| 644 | else | ||
| 645 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | ||
| 646 | cpu, PTR_ERR(event)); | ||
| 647 | |||
| 648 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
| 649 | |||
| 650 | return PTR_ERR(event); | ||
| 651 | |||
| 652 | /* success path */ | ||
| 653 | out_save: | ||
| 654 | per_cpu(watchdog_ev, cpu) = event; | ||
| 655 | out_enable: | ||
| 656 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
| 657 | out: | ||
| 658 | return 0; | ||
| 659 | } | ||
| 660 | |||
| 661 | static void watchdog_nmi_disable(unsigned int cpu) | ||
| 662 | { | ||
| 663 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
| 664 | |||
| 665 | if (event) { | ||
| 666 | perf_event_disable(event); | ||
| 667 | per_cpu(watchdog_ev, cpu) = NULL; | ||
| 668 | |||
| 669 | /* should be in cleanup, but blocks oprofile */ | ||
| 670 | perf_event_release_kernel(event); | ||
| 671 | } | ||
| 672 | if (cpu == 0) { | ||
| 673 | /* watchdog_nmi_enable() expects this to be zero initially. */ | ||
| 674 | cpu0_err = 0; | ||
| 675 | } | ||
| 676 | } | ||
| 677 | |||
| 678 | #else | ||
| 679 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } | ||
| 680 | static void watchdog_nmi_disable(unsigned int cpu) { return; } | ||
| 681 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
| 682 | |||
| 683 | static struct smp_hotplug_thread watchdog_threads = { | 443 | static struct smp_hotplug_thread watchdog_threads = { |
| 684 | .store = &softlockup_watchdog, | 444 | .store = &softlockup_watchdog, |
| 685 | .thread_should_run = watchdog_should_run, | 445 | .thread_should_run = watchdog_should_run, |
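With the perf-based hard lockup detector moving out of kernel/watchdog.c, watchdog_nmi_enable()/watchdog_nmi_disable() become __weak no-op defaults that the new kernel/watchdog_hld.c (or an architecture) overrides at link time. A userspace illustration of the weak-symbol override mechanism (GCC/Clang attribute syntax; the kernel spells it __weak):

#include <stdio.h>

/* Default (weak) implementations: they only take effect if no stronger
 * definition of the same symbol is linked into the final binary. */
int __attribute__((weak)) watchdog_nmi_enable(unsigned int cpu)
{
    printf("weak default: no hard lockup detector for cpu %u\n", cpu);
    return 0;
}

void __attribute__((weak)) watchdog_nmi_disable(unsigned int cpu)
{
    (void)cpu;                  /* nothing to tear down in the default case */
}

int main(void)
{
    /* Resolves to the strong override when one is linked (as watchdog_hld.c
     * provides in the kernel), otherwise to the weak stubs above. */
    watchdog_nmi_enable(0);
    watchdog_nmi_disable(0);
    return 0;
}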
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c new file mode 100644 index 000000000000..84016c8aee6b --- /dev/null +++ b/kernel/watchdog_hld.c | |||
| @@ -0,0 +1,227 @@ | |||
| 1 | /* | ||
| 2 | * Detect hard lockups on a system | ||
| 3 | * | ||
| 4 | * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. | ||
| 5 | * | ||
| 6 | * Note: Most of this code is borrowed heavily from the original softlockup | ||
| 7 | * detector, so thanks to Ingo for the initial implementation. | ||
| 8 | * Some chunks also taken from the old x86-specific nmi watchdog code, thanks | ||
| 9 | * to those contributors as well. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #define pr_fmt(fmt) "NMI watchdog: " fmt | ||
| 13 | |||
| 14 | #include <linux/nmi.h> | ||
| 15 | #include <linux/module.h> | ||
| 16 | #include <asm/irq_regs.h> | ||
| 17 | #include <linux/perf_event.h> | ||
| 18 | |||
| 19 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | ||
| 20 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | ||
| 21 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | ||
| 22 | |||
| 23 | /* boot commands */ | ||
| 24 | /* | ||
| 25 | * Should we panic when a soft-lockup or hard-lockup occurs: | ||
| 26 | */ | ||
| 27 | unsigned int __read_mostly hardlockup_panic = | ||
| 28 | CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; | ||
| 29 | static unsigned long hardlockup_allcpu_dumped; | ||
| 30 | /* | ||
| 31 | * We may not want to enable hard lockup detection by default in all cases, | ||
| 32 | * for example when running the kernel as a guest on a hypervisor. In these | ||
| 33 | * cases this function can be called to disable hard lockup detection. This | ||
| 34 | * function should only be executed once by the boot processor before the | ||
| 35 | * kernel command line parameters are parsed, because otherwise it is not | ||
| 36 | * possible to override this in hardlockup_panic_setup(). | ||
| 37 | */ | ||
| 38 | void hardlockup_detector_disable(void) | ||
| 39 | { | ||
| 40 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 41 | } | ||
| 42 | |||
| 43 | static int __init hardlockup_panic_setup(char *str) | ||
| 44 | { | ||
| 45 | if (!strncmp(str, "panic", 5)) | ||
| 46 | hardlockup_panic = 1; | ||
| 47 | else if (!strncmp(str, "nopanic", 7)) | ||
| 48 | hardlockup_panic = 0; | ||
| 49 | else if (!strncmp(str, "0", 1)) | ||
| 50 | watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; | ||
| 51 | else if (!strncmp(str, "1", 1)) | ||
| 52 | watchdog_enabled |= NMI_WATCHDOG_ENABLED; | ||
| 53 | return 1; | ||
| 54 | } | ||
| 55 | __setup("nmi_watchdog=", hardlockup_panic_setup); | ||
| 56 | |||
| 57 | void touch_nmi_watchdog(void) | ||
| 58 | { | ||
| 59 | /* | ||
| 60 | * Using __raw here because some code paths have | ||
| 61 | * preemption enabled. If preemption is enabled | ||
| 62 | * then interrupts should be enabled too, in which | ||
| 63 | * case we shouldn't have to worry about the watchdog | ||
| 64 | * going off. | ||
| 65 | */ | ||
| 66 | raw_cpu_write(watchdog_nmi_touch, true); | ||
| 67 | touch_softlockup_watchdog(); | ||
| 68 | } | ||
| 69 | EXPORT_SYMBOL(touch_nmi_watchdog); | ||
| 70 | |||
| 71 | static struct perf_event_attr wd_hw_attr = { | ||
| 72 | .type = PERF_TYPE_HARDWARE, | ||
| 73 | .config = PERF_COUNT_HW_CPU_CYCLES, | ||
| 74 | .size = sizeof(struct perf_event_attr), | ||
| 75 | .pinned = 1, | ||
| 76 | .disabled = 1, | ||
| 77 | }; | ||
| 78 | |||
| 79 | /* Callback function for perf event subsystem */ | ||
| 80 | static void watchdog_overflow_callback(struct perf_event *event, | ||
| 81 | struct perf_sample_data *data, | ||
| 82 | struct pt_regs *regs) | ||
| 83 | { | ||
| 84 | /* Ensure the watchdog never gets throttled */ | ||
| 85 | event->hw.interrupts = 0; | ||
| 86 | |||
| 87 | if (__this_cpu_read(watchdog_nmi_touch) == true) { | ||
| 88 | __this_cpu_write(watchdog_nmi_touch, false); | ||
| 89 | return; | ||
| 90 | } | ||
| 91 | |||
| 92 | /* check for a hardlockup | ||
| 93 | * This is done by making sure our timer interrupt | ||
| 94 | * is incrementing. The timer interrupt should have | ||
| 95 | * fired multiple times before we overflow'd. If it hasn't | ||
| 96 | * then this is a good indication the cpu is stuck | ||
| 97 | */ | ||
| 98 | if (is_hardlockup()) { | ||
| 99 | int this_cpu = smp_processor_id(); | ||
| 100 | |||
| 101 | /* only print hardlockups once */ | ||
| 102 | if (__this_cpu_read(hard_watchdog_warn) == true) | ||
| 103 | return; | ||
| 104 | |||
| 105 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | ||
| 106 | print_modules(); | ||
| 107 | print_irqtrace_events(current); | ||
| 108 | if (regs) | ||
| 109 | show_regs(regs); | ||
| 110 | else | ||
| 111 | dump_stack(); | ||
| 112 | |||
| 113 | /* | ||
| 114 | * Perform all-CPU dump only once to avoid multiple hardlockups | ||
| 115 | * generating interleaving traces | ||
| 116 | */ | ||
| 117 | if (sysctl_hardlockup_all_cpu_backtrace && | ||
| 118 | !test_and_set_bit(0, &hardlockup_allcpu_dumped)) | ||
| 119 | trigger_allbutself_cpu_backtrace(); | ||
| 120 | |||
| 121 | if (hardlockup_panic) | ||
| 122 | nmi_panic(regs, "Hard LOCKUP"); | ||
| 123 | |||
| 124 | __this_cpu_write(hard_watchdog_warn, true); | ||
| 125 | return; | ||
| 126 | } | ||
| 127 | |||
| 128 | __this_cpu_write(hard_watchdog_warn, false); | ||
| 129 | return; | ||
| 130 | } | ||
| 131 | |||
| 132 | /* | ||
| 133 | * People like the simple clean cpu node info on boot. | ||
| 134 | * Reduce the watchdog noise by only printing messages | ||
| 135 | * that are different from what cpu0 displayed. | ||
| 136 | */ | ||
| 137 | static unsigned long cpu0_err; | ||
| 138 | |||
| 139 | int watchdog_nmi_enable(unsigned int cpu) | ||
| 140 | { | ||
| 141 | struct perf_event_attr *wd_attr; | ||
| 142 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
| 143 | |||
| 144 | /* nothing to do if the hard lockup detector is disabled */ | ||
| 145 | if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) | ||
| 146 | goto out; | ||
| 147 | |||
| 148 | /* is it already setup and enabled? */ | ||
| 149 | if (event && event->state > PERF_EVENT_STATE_OFF) | ||
| 150 | goto out; | ||
| 151 | |||
| 152 | /* it is setup but not enabled */ | ||
| 153 | if (event != NULL) | ||
| 154 | goto out_enable; | ||
| 155 | |||
| 156 | wd_attr = &wd_hw_attr; | ||
| 157 | wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); | ||
| 158 | |||
| 159 | /* Try to register using hardware perf events */ | ||
| 160 | event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); | ||
| 161 | |||
| 162 | /* save cpu0 error for future comparision */ | ||
| 163 | if (cpu == 0 && IS_ERR(event)) | ||
| 164 | cpu0_err = PTR_ERR(event); | ||
| 165 | |||
| 166 | if (!IS_ERR(event)) { | ||
| 167 | /* only print for cpu0 or different than cpu0 */ | ||
| 168 | if (cpu == 0 || cpu0_err) | ||
| 169 | pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); | ||
| 170 | goto out_save; | ||
| 171 | } | ||
| 172 | |||
| 173 | /* | ||
| 174 | * Disable the hard lockup detector if _any_ CPU fails to set up | ||
| 175 | * set up the hardware perf event. The watchdog() function checks | ||
| 176 | * the NMI_WATCHDOG_ENABLED bit periodically. | ||
| 177 | * | ||
| 178 | * The barriers are for syncing up watchdog_enabled across all the | ||
| 179 | * cpus, as clear_bit() does not use barriers. | ||
| 180 | */ | ||
| 181 | smp_mb__before_atomic(); | ||
| 182 | clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); | ||
| 183 | smp_mb__after_atomic(); | ||
| 184 | |||
| 185 | /* skip displaying the same error again */ | ||
| 186 | if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) | ||
| 187 | return PTR_ERR(event); | ||
| 188 | |||
| 189 | /* vary the KERN level based on the returned errno */ | ||
| 190 | if (PTR_ERR(event) == -EOPNOTSUPP) | ||
| 191 | pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); | ||
| 192 | else if (PTR_ERR(event) == -ENOENT) | ||
| 193 | pr_warn("disabled (cpu%i): hardware events not enabled\n", | ||
| 194 | cpu); | ||
| 195 | else | ||
| 196 | pr_err("disabled (cpu%i): unable to create perf event: %ld\n", | ||
| 197 | cpu, PTR_ERR(event)); | ||
| 198 | |||
| 199 | pr_info("Shutting down hard lockup detector on all cpus\n"); | ||
| 200 | |||
| 201 | return PTR_ERR(event); | ||
| 202 | |||
| 203 | /* success path */ | ||
| 204 | out_save: | ||
| 205 | per_cpu(watchdog_ev, cpu) = event; | ||
| 206 | out_enable: | ||
| 207 | perf_event_enable(per_cpu(watchdog_ev, cpu)); | ||
| 208 | out: | ||
| 209 | return 0; | ||
| 210 | } | ||
| 211 | |||
| 212 | void watchdog_nmi_disable(unsigned int cpu) | ||
| 213 | { | ||
| 214 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | ||
| 215 | |||
| 216 | if (event) { | ||
| 217 | perf_event_disable(event); | ||
| 218 | per_cpu(watchdog_ev, cpu) = NULL; | ||
| 219 | |||
| 220 | /* should be in cleanup, but blocks oprofile */ | ||
| 221 | perf_event_release_kernel(event); | ||
| 222 | } | ||
| 223 | if (cpu == 0) { | ||
| 224 | /* watchdog_nmi_enable() expects this to be zero initially. */ | ||
| 225 | cpu0_err = 0; | ||
| 226 | } | ||
| 227 | } | ||
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e6327d102184..7446097f72bd 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
| @@ -194,8 +194,8 @@ config GDB_SCRIPTS | |||
| 194 | build directory. If you load vmlinux into gdb, the helper | 194 | build directory. If you load vmlinux into gdb, the helper |
| 195 | scripts will be automatically imported by gdb as well, and | 195 | scripts will be automatically imported by gdb as well, and |
| 196 | additional functions are available to analyze a Linux kernel | 196 | additional functions are available to analyze a Linux kernel |
| 197 | instance. See Documentation/gdb-kernel-debugging.txt for further | 197 | instance. See Documentation/dev-tools/gdb-kernel-debugging.rst |
| 198 | details. | 198 | for further details. |
| 199 | 199 | ||
| 200 | config ENABLE_WARN_DEPRECATED | 200 | config ENABLE_WARN_DEPRECATED |
| 201 | bool "Enable __deprecated logic" | 201 | bool "Enable __deprecated logic" |
| @@ -542,7 +542,7 @@ config DEBUG_KMEMLEAK | |||
| 542 | difference being that the orphan objects are not freed but | 542 | difference being that the orphan objects are not freed but |
| 543 | only shown in /sys/kernel/debug/kmemleak. Enabling this | 543 | only shown in /sys/kernel/debug/kmemleak. Enabling this |
| 544 | feature will introduce an overhead to memory | 544 | feature will introduce an overhead to memory |
| 545 | allocations. See Documentation/kmemleak.txt for more | 545 | allocations. See Documentation/dev-tools/kmemleak.rst for more |
| 546 | details. | 546 | details. |
| 547 | 547 | ||
| 548 | Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances | 548 | Enabling DEBUG_SLAB or SLUB_DEBUG may increase the chances |
| @@ -739,7 +739,7 @@ config KCOV | |||
| 739 | different machines and across reboots. If you need stable PC values, | 739 | different machines and across reboots. If you need stable PC values, |
| 740 | disable RANDOMIZE_BASE. | 740 | disable RANDOMIZE_BASE. |
| 741 | 741 | ||
| 742 | For more details, see Documentation/kcov.txt. | 742 | For more details, see Documentation/dev-tools/kcov.rst. |
| 743 | 743 | ||
| 744 | config KCOV_INSTRUMENT_ALL | 744 | config KCOV_INSTRUMENT_ALL |
| 745 | bool "Instrument all code by default" | 745 | bool "Instrument all code by default" |
diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index bc6e651df68c..a669c193b878 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan | |||
| @@ -10,7 +10,8 @@ config UBSAN | |||
| 10 | This option enables undefined behaviour sanity checker | 10 | This option enables undefined behaviour sanity checker |
| 11 | Compile-time instrumentation is used to detect various undefined | 11 | Compile-time instrumentation is used to detect various undefined |
| 12 | behaviours in runtime. Various types of checks may be enabled | 12 | behaviours in runtime. Various types of checks may be enabled |
| 13 | via boot parameter ubsan_handle (see: Documentation/ubsan.txt). | 13 | via boot parameter ubsan_handle |
| 14 | (see: Documentation/dev-tools/ubsan.rst). | ||
| 14 | 15 | ||
| 15 | config UBSAN_SANITIZE_ALL | 16 | config UBSAN_SANITIZE_ALL |
| 16 | bool "Enable instrumentation for the entire kernel" | 17 | bool "Enable instrumentation for the entire kernel" |
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 2e8c6f7aa56e..0019aca0f328 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 23 | */ | 23 | */ |
| 24 | 24 | ||
| 25 | #include <linux/cpu.h> | ||
| 25 | #include <linux/errno.h> | 26 | #include <linux/errno.h> |
| 26 | #include <linux/init.h> | 27 | #include <linux/init.h> |
| 27 | #include <linux/kernel.h> | 28 | #include <linux/kernel.h> |
| @@ -69,6 +70,11 @@ struct radix_tree_preload { | |||
| 69 | }; | 70 | }; |
| 70 | static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; | 71 | static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; |
| 71 | 72 | ||
| 73 | static inline struct radix_tree_node *entry_to_node(void *ptr) | ||
| 74 | { | ||
| 75 | return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); | ||
| 76 | } | ||
| 77 | |||
| 72 | static inline void *node_to_entry(void *ptr) | 78 | static inline void *node_to_entry(void *ptr) |
| 73 | { | 79 | { |
| 74 | return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); | 80 | return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); |
| @@ -191,13 +197,12 @@ static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag) | |||
| 191 | * Returns next bit offset, or size if nothing found. | 197 | * Returns next bit offset, or size if nothing found. |
| 192 | */ | 198 | */ |
| 193 | static __always_inline unsigned long | 199 | static __always_inline unsigned long |
| 194 | radix_tree_find_next_bit(const unsigned long *addr, | 200 | radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, |
| 195 | unsigned long size, unsigned long offset) | 201 | unsigned long offset) |
| 196 | { | 202 | { |
| 197 | if (!__builtin_constant_p(size)) | 203 | const unsigned long *addr = node->tags[tag]; |
| 198 | return find_next_bit(addr, size, offset); | ||
| 199 | 204 | ||
| 200 | if (offset < size) { | 205 | if (offset < RADIX_TREE_MAP_SIZE) { |
| 201 | unsigned long tmp; | 206 | unsigned long tmp; |
| 202 | 207 | ||
| 203 | addr += offset / BITS_PER_LONG; | 208 | addr += offset / BITS_PER_LONG; |
| @@ -205,14 +210,32 @@ radix_tree_find_next_bit(const unsigned long *addr, | |||
| 205 | if (tmp) | 210 | if (tmp) |
| 206 | return __ffs(tmp) + offset; | 211 | return __ffs(tmp) + offset; |
| 207 | offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); | 212 | offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1); |
| 208 | while (offset < size) { | 213 | while (offset < RADIX_TREE_MAP_SIZE) { |
| 209 | tmp = *++addr; | 214 | tmp = *++addr; |
| 210 | if (tmp) | 215 | if (tmp) |
| 211 | return __ffs(tmp) + offset; | 216 | return __ffs(tmp) + offset; |
| 212 | offset += BITS_PER_LONG; | 217 | offset += BITS_PER_LONG; |
| 213 | } | 218 | } |
| 214 | } | 219 | } |
| 215 | return size; | 220 | return RADIX_TREE_MAP_SIZE; |
| 221 | } | ||
| 222 | |||
| 223 | static unsigned int iter_offset(const struct radix_tree_iter *iter) | ||
| 224 | { | ||
| 225 | return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; | ||
| 226 | } | ||
| 227 | |||
| 228 | /* | ||
| 229 | * The maximum index which can be stored in a radix tree | ||
| 230 | */ | ||
| 231 | static inline unsigned long shift_maxindex(unsigned int shift) | ||
| 232 | { | ||
| 233 | return (RADIX_TREE_MAP_SIZE << shift) - 1; | ||
| 234 | } | ||
| 235 | |||
| 236 | static inline unsigned long node_maxindex(struct radix_tree_node *node) | ||
| 237 | { | ||
| 238 | return shift_maxindex(node->shift); | ||
| 216 | } | 239 | } |
| 217 | 240 | ||
| 218 | #ifndef __KERNEL__ | 241 | #ifndef __KERNEL__ |
| @@ -220,10 +243,11 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) | |||
| 220 | { | 243 | { |
| 221 | unsigned long i; | 244 | unsigned long i; |
| 222 | 245 | ||
| 223 | pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", | 246 | pr_debug("radix node: %p offset %d indices %lu-%lu parent %p tags %lx %lx %lx shift %d count %d exceptional %d\n", |
| 224 | node, node->offset, | 247 | node, node->offset, index, index | node_maxindex(node), |
| 248 | node->parent, | ||
| 225 | node->tags[0][0], node->tags[1][0], node->tags[2][0], | 249 | node->tags[0][0], node->tags[1][0], node->tags[2][0], |
| 226 | node->shift, node->count, node->exceptional, node->parent); | 250 | node->shift, node->count, node->exceptional); |
| 227 | 251 | ||
| 228 | for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { | 252 | for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { |
| 229 | unsigned long first = index | (i << node->shift); | 253 | unsigned long first = index | (i << node->shift); |
| @@ -231,14 +255,16 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) | |||
| 231 | void *entry = node->slots[i]; | 255 | void *entry = node->slots[i]; |
| 232 | if (!entry) | 256 | if (!entry) |
| 233 | continue; | 257 | continue; |
| 234 | if (is_sibling_entry(node, entry)) { | 258 | if (entry == RADIX_TREE_RETRY) { |
| 235 | pr_debug("radix sblng %p offset %ld val %p indices %ld-%ld\n", | 259 | pr_debug("radix retry offset %ld indices %lu-%lu parent %p\n", |
| 236 | entry, i, | 260 | i, first, last, node); |
| 237 | *(void **)entry_to_node(entry), | ||
| 238 | first, last); | ||
| 239 | } else if (!radix_tree_is_internal_node(entry)) { | 261 | } else if (!radix_tree_is_internal_node(entry)) { |
| 240 | pr_debug("radix entry %p offset %ld indices %ld-%ld\n", | 262 | pr_debug("radix entry %p offset %ld indices %lu-%lu parent %p\n", |
| 241 | entry, i, first, last); | 263 | entry, i, first, last, node); |
| 264 | } else if (is_sibling_entry(node, entry)) { | ||
| 265 | pr_debug("radix sblng %p offset %ld indices %lu-%lu parent %p val %p\n", | ||
| 266 | entry, i, first, last, node, | ||
| 267 | *(void **)entry_to_node(entry)); | ||
| 242 | } else { | 268 | } else { |
| 243 | dump_node(entry_to_node(entry), first); | 269 | dump_node(entry_to_node(entry), first); |
| 244 | } | 270 | } |
| @@ -262,7 +288,10 @@ static void radix_tree_dump(struct radix_tree_root *root) | |||
| 262 | * that the caller has pinned this thread of control to the current CPU. | 288 | * that the caller has pinned this thread of control to the current CPU. |
| 263 | */ | 289 | */ |
| 264 | static struct radix_tree_node * | 290 | static struct radix_tree_node * |
| 265 | radix_tree_node_alloc(struct radix_tree_root *root) | 291 | radix_tree_node_alloc(struct radix_tree_root *root, |
| 292 | struct radix_tree_node *parent, | ||
| 293 | unsigned int shift, unsigned int offset, | ||
| 294 | unsigned int count, unsigned int exceptional) | ||
| 266 | { | 295 | { |
| 267 | struct radix_tree_node *ret = NULL; | 296 | struct radix_tree_node *ret = NULL; |
| 268 | gfp_t gfp_mask = root_gfp_mask(root); | 297 | gfp_t gfp_mask = root_gfp_mask(root); |
| @@ -307,6 +336,13 @@ radix_tree_node_alloc(struct radix_tree_root *root) | |||
| 307 | ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); | 336 | ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); |
| 308 | out: | 337 | out: |
| 309 | BUG_ON(radix_tree_is_internal_node(ret)); | 338 | BUG_ON(radix_tree_is_internal_node(ret)); |
| 339 | if (ret) { | ||
| 340 | ret->parent = parent; | ||
| 341 | ret->shift = shift; | ||
| 342 | ret->offset = offset; | ||
| 343 | ret->count = count; | ||
| 344 | ret->exceptional = exceptional; | ||
| 345 | } | ||
| 310 | return ret; | 346 | return ret; |
| 311 | } | 347 | } |
| 312 | 348 | ||
| @@ -314,17 +350,15 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) | |||
| 314 | { | 350 | { |
| 315 | struct radix_tree_node *node = | 351 | struct radix_tree_node *node = |
| 316 | container_of(head, struct radix_tree_node, rcu_head); | 352 | container_of(head, struct radix_tree_node, rcu_head); |
| 317 | int i; | ||
| 318 | 353 | ||
| 319 | /* | 354 | /* |
| 320 | * must only free zeroed nodes into the slab. radix_tree_shrink | 355 | * Must only free zeroed nodes into the slab. We can be left with |
| 321 | * can leave us with a non-NULL entry in the first slot, so clear | 356 | * non-NULL entries by radix_tree_free_nodes, so clear the entries |
| 322 | * that here to make sure. | 357 | * and tags here. |
| 323 | */ | 358 | */ |
| 324 | for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) | 359 | memset(node->slots, 0, sizeof(node->slots)); |
| 325 | tag_clear(node, i, 0); | 360 | memset(node->tags, 0, sizeof(node->tags)); |
| 326 | 361 | INIT_LIST_HEAD(&node->private_list); | |
| 327 | node->slots[0] = NULL; | ||
| 328 | 362 | ||
| 329 | kmem_cache_free(radix_tree_node_cachep, node); | 363 | kmem_cache_free(radix_tree_node_cachep, node); |
| 330 | } | 364 | } |
| @@ -344,7 +378,7 @@ radix_tree_node_free(struct radix_tree_node *node) | |||
| 344 | * To make use of this facility, the radix tree must be initialised without | 378 | * To make use of this facility, the radix tree must be initialised without |
| 345 | * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). | 379 | * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). |
| 346 | */ | 380 | */ |
| 347 | static int __radix_tree_preload(gfp_t gfp_mask, int nr) | 381 | static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) |
| 348 | { | 382 | { |
| 349 | struct radix_tree_preload *rtp; | 383 | struct radix_tree_preload *rtp; |
| 350 | struct radix_tree_node *node; | 384 | struct radix_tree_node *node; |
| @@ -410,6 +444,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) | |||
| 410 | } | 444 | } |
| 411 | EXPORT_SYMBOL(radix_tree_maybe_preload); | 445 | EXPORT_SYMBOL(radix_tree_maybe_preload); |
| 412 | 446 | ||
| 447 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 448 | /* | ||
| 449 | * Preload with enough objects to ensure that we can split a single entry | ||
| 450 | * of order @old_order into many entries of size @new_order | ||
| 451 | */ | ||
| 452 | int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, | ||
| 453 | gfp_t gfp_mask) | ||
| 454 | { | ||
| 455 | unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); | ||
| 456 | unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - | ||
| 457 | (new_order / RADIX_TREE_MAP_SHIFT); | ||
| 458 | unsigned nr = 0; | ||
| 459 | |||
| 460 | WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); | ||
| 461 | BUG_ON(new_order >= old_order); | ||
| 462 | |||
| 463 | while (layers--) | ||
| 464 | nr = nr * RADIX_TREE_MAP_SIZE + 1; | ||
| 465 | return __radix_tree_preload(gfp_mask, top * nr); | ||
| 466 | } | ||
| 467 | #endif | ||
| 468 | |||
| 413 | /* | 469 | /* |
| 414 | * The same as function above, but preload number of nodes required to insert | 470 | * The same as function above, but preload number of nodes required to insert |
| 415 | * (1 << order) continuous naturally-aligned elements. | 471 | * (1 << order) continuous naturally-aligned elements. |
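radix_tree_split_preload() above sizes the node reserve needed to split one order-old_order entry into order-new_order pieces: "top" slots at the highest affected level, times one node per slot for every extra tree level below it. A quick recomputation of that arithmetic for one case, assuming the common RADIX_TREE_MAP_SHIFT value of 6:

#include <stdio.h>

#define RADIX_TREE_MAP_SHIFT 6
#define RADIX_TREE_MAP_SIZE  (1UL << RADIX_TREE_MAP_SHIFT)

/* Mirrors the counting in radix_tree_split_preload(). */
static unsigned long split_preload_count(unsigned old_order, unsigned new_order)
{
    unsigned top = 1U << (old_order % RADIX_TREE_MAP_SHIFT);
    unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) -
                      (new_order / RADIX_TREE_MAP_SHIFT);
    unsigned long nr = 0;

    while (layers--)
        nr = nr * RADIX_TREE_MAP_SIZE + 1;   /* nodes added per extra level */
    return top * nr;
}

int main(void)
{
    /* Splitting an order-9 entry (e.g. a 2MB THP in the page cache) into
     * order-0 entries: 8 top-level slots, one new node each -> 8 nodes. */
    printf("%lu\n", split_preload_count(9, 0));
    return 0;
}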
| @@ -455,19 +511,6 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) | |||
| 455 | return __radix_tree_preload(gfp_mask, nr_nodes); | 511 | return __radix_tree_preload(gfp_mask, nr_nodes); |
| 456 | } | 512 | } |
| 457 | 513 | ||
| 458 | /* | ||
| 459 | * The maximum index which can be stored in a radix tree | ||
| 460 | */ | ||
| 461 | static inline unsigned long shift_maxindex(unsigned int shift) | ||
| 462 | { | ||
| 463 | return (RADIX_TREE_MAP_SIZE << shift) - 1; | ||
| 464 | } | ||
| 465 | |||
| 466 | static inline unsigned long node_maxindex(struct radix_tree_node *node) | ||
| 467 | { | ||
| 468 | return shift_maxindex(node->shift); | ||
| 469 | } | ||
| 470 | |||
| 471 | static unsigned radix_tree_load_root(struct radix_tree_root *root, | 514 | static unsigned radix_tree_load_root(struct radix_tree_root *root, |
| 472 | struct radix_tree_node **nodep, unsigned long *maxindex) | 515 | struct radix_tree_node **nodep, unsigned long *maxindex) |
| 473 | { | 516 | { |
| @@ -505,8 +548,8 @@ static int radix_tree_extend(struct radix_tree_root *root, | |||
| 505 | goto out; | 548 | goto out; |
| 506 | 549 | ||
| 507 | do { | 550 | do { |
| 508 | struct radix_tree_node *node = radix_tree_node_alloc(root); | 551 | struct radix_tree_node *node = radix_tree_node_alloc(root, |
| 509 | 552 | NULL, shift, 0, 1, 0); | |
| 510 | if (!node) | 553 | if (!node) |
| 511 | return -ENOMEM; | 554 | return -ENOMEM; |
| 512 | 555 | ||
| @@ -517,16 +560,11 @@ static int radix_tree_extend(struct radix_tree_root *root, | |||
| 517 | } | 560 | } |
| 518 | 561 | ||
| 519 | BUG_ON(shift > BITS_PER_LONG); | 562 | BUG_ON(shift > BITS_PER_LONG); |
| 520 | node->shift = shift; | ||
| 521 | node->offset = 0; | ||
| 522 | node->count = 1; | ||
| 523 | node->parent = NULL; | ||
| 524 | if (radix_tree_is_internal_node(slot)) { | 563 | if (radix_tree_is_internal_node(slot)) { |
| 525 | entry_to_node(slot)->parent = node; | 564 | entry_to_node(slot)->parent = node; |
| 526 | } else { | 565 | } else if (radix_tree_exceptional_entry(slot)) { |
| 527 | /* Moving an exceptional root->rnode to a node */ | 566 | /* Moving an exceptional root->rnode to a node */ |
| 528 | if (radix_tree_exceptional_entry(slot)) | 567 | node->exceptional = 1; |
| 529 | node->exceptional = 1; | ||
| 530 | } | 568 | } |
| 531 | node->slots[0] = slot; | 569 | node->slots[0] = slot; |
| 532 | slot = node_to_entry(node); | 570 | slot = node_to_entry(node); |
| @@ -665,26 +703,24 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, | |||
| 665 | shift = radix_tree_load_root(root, &child, &maxindex); | 703 | shift = radix_tree_load_root(root, &child, &maxindex); |
| 666 | 704 | ||
| 667 | /* Make sure the tree is high enough. */ | 705 | /* Make sure the tree is high enough. */ |
| 706 | if (order > 0 && max == ((1UL << order) - 1)) | ||
| 707 | max++; | ||
| 668 | if (max > maxindex) { | 708 | if (max > maxindex) { |
| 669 | int error = radix_tree_extend(root, max, shift); | 709 | int error = radix_tree_extend(root, max, shift); |
| 670 | if (error < 0) | 710 | if (error < 0) |
| 671 | return error; | 711 | return error; |
| 672 | shift = error; | 712 | shift = error; |
| 673 | child = root->rnode; | 713 | child = root->rnode; |
| 674 | if (order == shift) | ||
| 675 | shift += RADIX_TREE_MAP_SHIFT; | ||
| 676 | } | 714 | } |
| 677 | 715 | ||
| 678 | while (shift > order) { | 716 | while (shift > order) { |
| 679 | shift -= RADIX_TREE_MAP_SHIFT; | 717 | shift -= RADIX_TREE_MAP_SHIFT; |
| 680 | if (child == NULL) { | 718 | if (child == NULL) { |
| 681 | /* Have to add a child node. */ | 719 | /* Have to add a child node. */ |
| 682 | child = radix_tree_node_alloc(root); | 720 | child = radix_tree_node_alloc(root, node, shift, |
| 721 | offset, 0, 0); | ||
| 683 | if (!child) | 722 | if (!child) |
| 684 | return -ENOMEM; | 723 | return -ENOMEM; |
| 685 | child->shift = shift; | ||
| 686 | child->offset = offset; | ||
| 687 | child->parent = node; | ||
| 688 | rcu_assign_pointer(*slot, node_to_entry(child)); | 724 | rcu_assign_pointer(*slot, node_to_entry(child)); |
| 689 | if (node) | 725 | if (node) |
| 690 | node->count++; | 726 | node->count++; |
| @@ -697,31 +733,125 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, | |||
| 697 | slot = &node->slots[offset]; | 733 | slot = &node->slots[offset]; |
| 698 | } | 734 | } |
| 699 | 735 | ||
| 736 | if (nodep) | ||
| 737 | *nodep = node; | ||
| 738 | if (slotp) | ||
| 739 | *slotp = slot; | ||
| 740 | return 0; | ||
| 741 | } | ||
| 742 | |||
| 700 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | 743 | #ifdef CONFIG_RADIX_TREE_MULTIORDER |
| 701 | /* Insert pointers to the canonical entry */ | 744 | /* |
| 702 | if (order > shift) { | 745 | * Free any nodes below this node. The tree is presumed to not need |
| 703 | unsigned i, n = 1 << (order - shift); | 746 | * shrinking, and any user data in the tree is presumed to not need a |
| 747 | * destructor called on it. If we need to add a destructor, we can | ||
| 748 | * add that functionality later. Note that we may not clear tags or | ||
| 749 | * slots from the tree as an RCU walker may still have a pointer into | ||
| 750 | * this subtree. We could replace the entries with RADIX_TREE_RETRY, | ||
| 751 | * but we'll still have to clear those in rcu_free. | ||
| 752 | */ | ||
| 753 | static void radix_tree_free_nodes(struct radix_tree_node *node) | ||
| 754 | { | ||
| 755 | unsigned offset = 0; | ||
| 756 | struct radix_tree_node *child = entry_to_node(node); | ||
| 757 | |||
| 758 | for (;;) { | ||
| 759 | void *entry = child->slots[offset]; | ||
| 760 | if (radix_tree_is_internal_node(entry) && | ||
| 761 | !is_sibling_entry(child, entry)) { | ||
| 762 | child = entry_to_node(entry); | ||
| 763 | offset = 0; | ||
| 764 | continue; | ||
| 765 | } | ||
| 766 | offset++; | ||
| 767 | while (offset == RADIX_TREE_MAP_SIZE) { | ||
| 768 | struct radix_tree_node *old = child; | ||
| 769 | offset = child->offset + 1; | ||
| 770 | child = child->parent; | ||
| 771 | radix_tree_node_free(old); | ||
| 772 | if (old == entry_to_node(node)) | ||
| 773 | return; | ||
| 774 | } | ||
| 775 | } | ||
| 776 | } | ||
| 777 | |||
| 778 | static inline int insert_entries(struct radix_tree_node *node, void **slot, | ||
| 779 | void *item, unsigned order, bool replace) | ||
| 780 | { | ||
| 781 | struct radix_tree_node *child; | ||
| 782 | unsigned i, n, tag, offset, tags = 0; | ||
| 783 | |||
| 784 | if (node) { | ||
| 785 | if (order > node->shift) | ||
| 786 | n = 1 << (order - node->shift); | ||
| 787 | else | ||
| 788 | n = 1; | ||
| 789 | offset = get_slot_offset(node, slot); | ||
| 790 | } else { | ||
| 791 | n = 1; | ||
| 792 | offset = 0; | ||
| 793 | } | ||
| 794 | |||
| 795 | if (n > 1) { | ||
| 704 | offset = offset & ~(n - 1); | 796 | offset = offset & ~(n - 1); |
| 705 | slot = &node->slots[offset]; | 797 | slot = &node->slots[offset]; |
| 706 | child = node_to_entry(slot); | 798 | } |
| 707 | for (i = 0; i < n; i++) { | 799 | child = node_to_entry(slot); |
| 708 | if (slot[i]) | 800 | |
| 801 | for (i = 0; i < n; i++) { | ||
| 802 | if (slot[i]) { | ||
| 803 | if (replace) { | ||
| 804 | node->count--; | ||
| 805 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
| 806 | if (tag_get(node, tag, offset + i)) | ||
| 807 | tags |= 1 << tag; | ||
| 808 | } else | ||
| 709 | return -EEXIST; | 809 | return -EEXIST; |
| 710 | } | 810 | } |
| 811 | } | ||
| 711 | 812 | ||
| 712 | for (i = 1; i < n; i++) { | 813 | for (i = 0; i < n; i++) { |
| 814 | struct radix_tree_node *old = slot[i]; | ||
| 815 | if (i) { | ||
| 713 | rcu_assign_pointer(slot[i], child); | 816 | rcu_assign_pointer(slot[i], child); |
| 714 | node->count++; | 817 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) |
| 818 | if (tags & (1 << tag)) | ||
| 819 | tag_clear(node, tag, offset + i); | ||
| 820 | } else { | ||
| 821 | rcu_assign_pointer(slot[i], item); | ||
| 822 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
| 823 | if (tags & (1 << tag)) | ||
| 824 | tag_set(node, tag, offset); | ||
| 715 | } | 825 | } |
| 826 | if (radix_tree_is_internal_node(old) && | ||
| 827 | !is_sibling_entry(node, old) && | ||
| 828 | (old != RADIX_TREE_RETRY)) | ||
| 829 | radix_tree_free_nodes(old); | ||
| 830 | if (radix_tree_exceptional_entry(old)) | ||
| 831 | node->exceptional--; | ||
| 716 | } | 832 | } |
| 717 | #endif | 833 | if (node) { |
| 718 | 834 | node->count += n; | |
| 719 | if (nodep) | 835 | if (radix_tree_exceptional_entry(item)) |
| 720 | *nodep = node; | 836 | node->exceptional += n; |
| 721 | if (slotp) | 837 | } |
| 722 | *slotp = slot; | 838 | return n; |
| 723 | return 0; | 839 | } |
| 840 | #else | ||
| 841 | static inline int insert_entries(struct radix_tree_node *node, void **slot, | ||
| 842 | void *item, unsigned order, bool replace) | ||
| 843 | { | ||
| 844 | if (*slot) | ||
| 845 | return -EEXIST; | ||
| 846 | rcu_assign_pointer(*slot, item); | ||
| 847 | if (node) { | ||
| 848 | node->count++; | ||
| 849 | if (radix_tree_exceptional_entry(item)) | ||
| 850 | node->exceptional++; | ||
| 851 | } | ||
| 852 | return 1; | ||
| 724 | } | 853 | } |
| 854 | #endif | ||
| 725 | 855 | ||
| 726 | /** | 856 | /** |
| 727 | * __radix_tree_insert - insert into a radix tree | 857 | * __radix_tree_insert - insert into a radix tree |
| @@ -744,15 +874,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, | |||
| 744 | error = __radix_tree_create(root, index, order, &node, &slot); | 874 | error = __radix_tree_create(root, index, order, &node, &slot); |
| 745 | if (error) | 875 | if (error) |
| 746 | return error; | 876 | return error; |
| 747 | if (*slot != NULL) | 877 | |
| 748 | return -EEXIST; | 878 | error = insert_entries(node, slot, item, order, false); |
| 749 | rcu_assign_pointer(*slot, item); | 879 | if (error < 0) |
| 880 | return error; | ||
| 750 | 881 | ||
| 751 | if (node) { | 882 | if (node) { |
| 752 | unsigned offset = get_slot_offset(node, slot); | 883 | unsigned offset = get_slot_offset(node, slot); |
| 753 | node->count++; | ||
| 754 | if (radix_tree_exceptional_entry(item)) | ||
| 755 | node->exceptional++; | ||
| 756 | BUG_ON(tag_get(node, 0, offset)); | 884 | BUG_ON(tag_get(node, 0, offset)); |
| 757 | BUG_ON(tag_get(node, 1, offset)); | 885 | BUG_ON(tag_get(node, 1, offset)); |
| 758 | BUG_ON(tag_get(node, 2, offset)); | 886 | BUG_ON(tag_get(node, 2, offset)); |
| @@ -850,6 +978,24 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) | |||
| 850 | } | 978 | } |
| 851 | EXPORT_SYMBOL(radix_tree_lookup); | 979 | EXPORT_SYMBOL(radix_tree_lookup); |
| 852 | 980 | ||
| 981 | static inline int slot_count(struct radix_tree_node *node, | ||
| 982 | void **slot) | ||
| 983 | { | ||
| 984 | int n = 1; | ||
| 985 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 986 | void *ptr = node_to_entry(slot); | ||
| 987 | unsigned offset = get_slot_offset(node, slot); | ||
| 988 | int i; | ||
| 989 | |||
| 990 | for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { | ||
| 991 | if (node->slots[offset + i] != ptr) | ||
| 992 | break; | ||
| 993 | n++; | ||
| 994 | } | ||
| 995 | #endif | ||
| 996 | return n; | ||
| 997 | } | ||
| 998 | |||
| 853 | static void replace_slot(struct radix_tree_root *root, | 999 | static void replace_slot(struct radix_tree_root *root, |
| 854 | struct radix_tree_node *node, | 1000 | struct radix_tree_node *node, |
| 855 | void **slot, void *item, | 1001 | void **slot, void *item, |
| @@ -868,12 +1014,35 @@ static void replace_slot(struct radix_tree_root *root, | |||
| 868 | 1014 | ||
| 869 | if (node) { | 1015 | if (node) { |
| 870 | node->count += count; | 1016 | node->count += count; |
| 871 | node->exceptional += exceptional; | 1017 | if (exceptional) { |
| 1018 | exceptional *= slot_count(node, slot); | ||
| 1019 | node->exceptional += exceptional; | ||
| 1020 | } | ||
| 872 | } | 1021 | } |
| 873 | 1022 | ||
| 874 | rcu_assign_pointer(*slot, item); | 1023 | rcu_assign_pointer(*slot, item); |
| 875 | } | 1024 | } |
| 876 | 1025 | ||
| 1026 | static inline void delete_sibling_entries(struct radix_tree_node *node, | ||
| 1027 | void **slot) | ||
| 1028 | { | ||
| 1029 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 1030 | bool exceptional = radix_tree_exceptional_entry(*slot); | ||
| 1031 | void *ptr = node_to_entry(slot); | ||
| 1032 | unsigned offset = get_slot_offset(node, slot); | ||
| 1033 | int i; | ||
| 1034 | |||
| 1035 | for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { | ||
| 1036 | if (node->slots[offset + i] != ptr) | ||
| 1037 | break; | ||
| 1038 | node->slots[offset + i] = NULL; | ||
| 1039 | node->count--; | ||
| 1040 | if (exceptional) | ||
| 1041 | node->exceptional--; | ||
| 1042 | } | ||
| 1043 | #endif | ||
| 1044 | } | ||
| 1045 | |||
| 877 | /** | 1046 | /** |
| 878 | * __radix_tree_replace - replace item in a slot | 1047 | * __radix_tree_replace - replace item in a slot |
| 879 | * @root: radix tree root | 1048 | * @root: radix tree root |
| @@ -891,6 +1060,8 @@ void __radix_tree_replace(struct radix_tree_root *root, | |||
| 891 | void **slot, void *item, | 1060 | void **slot, void *item, |
| 892 | radix_tree_update_node_t update_node, void *private) | 1061 | radix_tree_update_node_t update_node, void *private) |
| 893 | { | 1062 | { |
| 1063 | if (!item) | ||
| 1064 | delete_sibling_entries(node, slot); | ||
| 894 | /* | 1065 | /* |
| 895 | * This function supports replacing exceptional entries and | 1066 | * This function supports replacing exceptional entries and |
| 896 | * deleting entries, but that needs accounting against the | 1067 | * deleting entries, but that needs accounting against the |
| @@ -921,7 +1092,8 @@ void __radix_tree_replace(struct radix_tree_root *root, | |||
| 921 | * NOTE: This cannot be used to switch between non-entries (empty slots), | 1092 | * NOTE: This cannot be used to switch between non-entries (empty slots), |
| 922 | * regular entries, and exceptional entries, as that requires accounting | 1093 | * regular entries, and exceptional entries, as that requires accounting |
| 923 | * inside the radix tree node. When switching from one type of entry or | 1094 | * inside the radix tree node. When switching from one type of entry or |
| 924 | * deleting, use __radix_tree_lookup() and __radix_tree_replace(). | 1095 | * deleting, use __radix_tree_lookup() and __radix_tree_replace() or |
| 1096 | * radix_tree_iter_replace(). | ||
| 925 | */ | 1097 | */ |
| 926 | void radix_tree_replace_slot(struct radix_tree_root *root, | 1098 | void radix_tree_replace_slot(struct radix_tree_root *root, |
| 927 | void **slot, void *item) | 1099 | void **slot, void *item) |
| @@ -930,6 +1102,164 @@ void radix_tree_replace_slot(struct radix_tree_root *root, | |||
| 930 | } | 1102 | } |
| 931 | 1103 | ||
| 932 | /** | 1104 | /** |
| 1105 | * radix_tree_iter_replace - replace item in a slot | ||
| 1106 | * @root: radix tree root | ||
| 1107 | * @slot: pointer to slot | ||
| 1108 | * @item: new item to store in the slot. | ||
| 1109 | * | ||
| 1110 | * For use with radix_tree_split() and radix_tree_for_each_slot(). | ||
| 1111 | * Caller must hold tree write locked across split and replacement. | ||
| 1112 | */ | ||
| 1113 | void radix_tree_iter_replace(struct radix_tree_root *root, | ||
| 1114 | const struct radix_tree_iter *iter, void **slot, void *item) | ||
| 1115 | { | ||
| 1116 | __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 1120 | /** | ||
| 1121 | * radix_tree_join - replace multiple entries with one multiorder entry | ||
| 1122 | * @root: radix tree root | ||
| 1123 | * @index: an index inside the new entry | ||
| 1124 | * @order: order of the new entry | ||
| 1125 | * @item: new entry | ||
| 1126 | * | ||
| 1127 | * Call this function to replace several entries with one larger entry. | ||
| 1128 | * The existing entries are presumed to not need freeing as a result of | ||
| 1129 | * this call. | ||
| 1130 | * | ||
| 1131 | * The replacement entry will have all the tags set on it that were set | ||
| 1132 | * on any of the entries it is replacing. | ||
| 1133 | */ | ||
| 1134 | int radix_tree_join(struct radix_tree_root *root, unsigned long index, | ||
| 1135 | unsigned order, void *item) | ||
| 1136 | { | ||
| 1137 | struct radix_tree_node *node; | ||
| 1138 | void **slot; | ||
| 1139 | int error; | ||
| 1140 | |||
| 1141 | BUG_ON(radix_tree_is_internal_node(item)); | ||
| 1142 | |||
| 1143 | error = __radix_tree_create(root, index, order, &node, &slot); | ||
| 1144 | if (!error) | ||
| 1145 | error = insert_entries(node, slot, item, order, true); | ||
| 1146 | if (error > 0) | ||
| 1147 | error = 0; | ||
| 1148 | |||
| 1149 | return error; | ||
| 1150 | } | ||
| 1151 | |||
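To make radix_tree_join() concrete, here is a hedged usage sketch (it is not part of this patch): it collapses whatever occupies an order-9 aligned block into the single entry "huge_entry". The "tree_lock" parameter is a stand-in for whatever lock serialises writers of the particular tree, and radix_tree_preload() is used because the join may need to allocate nodes while extending the tree.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

/*
 * Illustrative sketch: replace the entries covering the order-9 aligned
 * block containing @index with one multiorder entry.
 */
static int join_to_order_9(struct radix_tree_root *root,
                           spinlock_t *tree_lock,
                           unsigned long index, void *huge_entry)
{
        int err;

        /* The join may have to extend the tree, so preload nodes first. */
        err = radix_tree_preload(GFP_KERNEL);
        if (err)
                return err;

        spin_lock(tree_lock);
        err = radix_tree_join(root, index & ~511UL, 9, huge_entry);
        spin_unlock(tree_lock);
        radix_tree_preload_end();

        return err;
}

As the comment above notes, the replaced entries are not freed by radix_tree_join(); disposing of them remains the caller's responsibility.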
| 1152 | /** | ||
| 1153 | * radix_tree_split - Split an entry into smaller entries | ||
| 1154 | * @root: radix tree root | ||
| 1155 | * @index: An index within the large entry | ||
| 1156 | * @order: Order of new entries | ||
| 1157 | * | ||
| 1158 | * Call this function as the first step in replacing a multiorder entry | ||
| 1159 | * with several entries of lower order. After this function returns, | ||
| 1160 | * loop over the relevant portion of the tree using radix_tree_for_each_slot() | ||
| 1161 | * and call radix_tree_iter_replace() to set up each new entry. | ||
| 1162 | * | ||
| 1163 | * The tags from this entry are replicated to all the new entries. | ||
| 1164 | * | ||
| 1165 | * The radix tree should be locked against modification during the entire | ||
| 1166 | * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which | ||
| 1167 | * should prompt RCU walkers to restart the lookup from the root. | ||
| 1168 | */ | ||
| 1169 | int radix_tree_split(struct radix_tree_root *root, unsigned long index, | ||
| 1170 | unsigned order) | ||
| 1171 | { | ||
| 1172 | struct radix_tree_node *parent, *node, *child; | ||
| 1173 | void **slot; | ||
| 1174 | unsigned int offset, end; | ||
| 1175 | unsigned n, tag, tags = 0; | ||
| 1176 | |||
| 1177 | if (!__radix_tree_lookup(root, index, &parent, &slot)) | ||
| 1178 | return -ENOENT; | ||
| 1179 | if (!parent) | ||
| 1180 | return -ENOENT; | ||
| 1181 | |||
| 1182 | offset = get_slot_offset(parent, slot); | ||
| 1183 | |||
| 1184 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
| 1185 | if (tag_get(parent, tag, offset)) | ||
| 1186 | tags |= 1 << tag; | ||
| 1187 | |||
| 1188 | for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { | ||
| 1189 | if (!is_sibling_entry(parent, parent->slots[end])) | ||
| 1190 | break; | ||
| 1191 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
| 1192 | if (tags & (1 << tag)) | ||
| 1193 | tag_set(parent, tag, end); | ||
| 1194 | /* rcu_assign_pointer ensures tags are set before RETRY */ | ||
| 1195 | rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); | ||
| 1196 | } | ||
| 1197 | rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); | ||
| 1198 | parent->exceptional -= (end - offset); | ||
| 1199 | |||
| 1200 | if (order == parent->shift) | ||
| 1201 | return 0; | ||
| 1202 | if (order > parent->shift) { | ||
| 1203 | while (offset < end) | ||
| 1204 | offset += insert_entries(parent, &parent->slots[offset], | ||
| 1205 | RADIX_TREE_RETRY, order, true); | ||
| 1206 | return 0; | ||
| 1207 | } | ||
| 1208 | |||
| 1209 | node = parent; | ||
| 1210 | |||
| 1211 | for (;;) { | ||
| 1212 | if (node->shift > order) { | ||
| 1213 | child = radix_tree_node_alloc(root, node, | ||
| 1214 | node->shift - RADIX_TREE_MAP_SHIFT, | ||
| 1215 | offset, 0, 0); | ||
| 1216 | if (!child) | ||
| 1217 | goto nomem; | ||
| 1218 | if (node != parent) { | ||
| 1219 | node->count++; | ||
| 1220 | node->slots[offset] = node_to_entry(child); | ||
| 1221 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
| 1222 | if (tags & (1 << tag)) | ||
| 1223 | tag_set(node, tag, offset); | ||
| 1224 | } | ||
| 1225 | |||
| 1226 | node = child; | ||
| 1227 | offset = 0; | ||
| 1228 | continue; | ||
| 1229 | } | ||
| 1230 | |||
| 1231 | n = insert_entries(node, &node->slots[offset], | ||
| 1232 | RADIX_TREE_RETRY, order, false); | ||
| 1233 | BUG_ON(n > RADIX_TREE_MAP_SIZE); | ||
| 1234 | |||
| 1235 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | ||
| 1236 | if (tags & (1 << tag)) | ||
| 1237 | tag_set(node, tag, offset); | ||
| 1238 | offset += n; | ||
| 1239 | |||
| 1240 | while (offset == RADIX_TREE_MAP_SIZE) { | ||
| 1241 | if (node == parent) | ||
| 1242 | break; | ||
| 1243 | offset = node->offset; | ||
| 1244 | child = node; | ||
| 1245 | node = node->parent; | ||
| 1246 | rcu_assign_pointer(node->slots[offset], | ||
| 1247 | node_to_entry(child)); | ||
| 1248 | offset++; | ||
| 1249 | } | ||
| 1250 | if ((node == parent) && (offset == end)) | ||
| 1251 | return 0; | ||
| 1252 | } | ||
| 1253 | |||
| 1254 | nomem: | ||
| 1255 | /* Shouldn't happen; did user forget to preload? */ | ||
| 1256 | /* TODO: free all the allocated nodes */ | ||
| 1257 | WARN_ON(1); | ||
| 1258 | return -ENOMEM; | ||
| 1259 | } | ||
| 1260 | #endif | ||
| 1261 | |||
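The split/replace protocol spelled out in the radix_tree_split() comment is easiest to follow end to end. The sketch below is illustrative only: item_create() is a hypothetical caller-supplied constructor for the smaller entries, the tree lock is assumed to be held across the whole sequence as required above, and the caller is assumed to have preloaded enough nodes (see the nomem comment).

#include <linux/radix-tree.h>

/* Hypothetical helper that builds one replacement entry. */
extern void *item_create(unsigned long index);

/* Illustrative sketch: split an order-9 entry into order-0 entries. */
static int split_to_order_0(struct radix_tree_root *root, unsigned long index)
{
        struct radix_tree_iter iter;
        void **slot;
        unsigned long start = index & ~511UL;
        int err;

        err = radix_tree_split(root, start, 0);
        if (err)
                return err;

        /*
         * radix_tree_split() left RADIX_TREE_RETRY in every slot the old
         * entry covered; walk those slots and install the new entries.
         */
        radix_tree_for_each_slot(slot, root, &iter, start) {
                if (iter.index > (start | 511))
                        break;
                radix_tree_iter_replace(root, &iter, slot,
                                        item_create(iter.index));
        }
        return 0;
}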
| 1262 | /** | ||
| 933 | * radix_tree_tag_set - set a tag on a radix tree node | 1263 | * radix_tree_tag_set - set a tag on a radix tree node |
| 934 | * @root: radix tree root | 1264 | * @root: radix tree root |
| 935 | * @index: index key | 1265 | * @index: index key |
| @@ -990,6 +1320,34 @@ static void node_tag_clear(struct radix_tree_root *root, | |||
| 990 | root_tag_clear(root, tag); | 1320 | root_tag_clear(root, tag); |
| 991 | } | 1321 | } |
| 992 | 1322 | ||
| 1323 | static void node_tag_set(struct radix_tree_root *root, | ||
| 1324 | struct radix_tree_node *node, | ||
| 1325 | unsigned int tag, unsigned int offset) | ||
| 1326 | { | ||
| 1327 | while (node) { | ||
| 1328 | if (tag_get(node, tag, offset)) | ||
| 1329 | return; | ||
| 1330 | tag_set(node, tag, offset); | ||
| 1331 | offset = node->offset; | ||
| 1332 | node = node->parent; | ||
| 1333 | } | ||
| 1334 | |||
| 1335 | if (!root_tag_get(root, tag)) | ||
| 1336 | root_tag_set(root, tag); | ||
| 1337 | } | ||
| 1338 | |||
| 1339 | /** | ||
| 1340 | * radix_tree_iter_tag_set - set a tag on the current iterator entry | ||
| 1341 | * @root: radix tree root | ||
| 1342 | * @iter: iterator state | ||
| 1343 | * @tag: tag to set | ||
| 1344 | */ | ||
| 1345 | void radix_tree_iter_tag_set(struct radix_tree_root *root, | ||
| 1346 | const struct radix_tree_iter *iter, unsigned int tag) | ||
| 1347 | { | ||
| 1348 | node_tag_set(root, iter->node, tag, iter_offset(iter)); | ||
| 1349 | } | ||
| 1350 | |||
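A hedged sketch of how radix_tree_iter_tag_set() is meant to be used: propagating one tag to another across a range during a tagged walk, which is the open-coded replacement for radix_tree_range_tag_if_tagged() (deleted further down in this diff). The lock parameter is a stand-in for whatever protects the tree against concurrent writers.

static void retag_range(struct radix_tree_root *root, spinlock_t *lock,
                        unsigned long start, unsigned long end,
                        unsigned int iftag, unsigned int settag)
{
        struct radix_tree_iter iter;
        void **slot;

        spin_lock_irq(lock);
        radix_tree_for_each_tagged(slot, root, &iter, start, iftag) {
                if (iter.index > end)
                        break;
                /* Sets the tag on this slot and propagates it to the root. */
                radix_tree_iter_tag_set(root, &iter, settag);
        }
        spin_unlock_irq(lock);
}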
| 993 | /** | 1351 | /** |
| 994 | * radix_tree_tag_clear - clear a tag on a radix tree node | 1352 | * radix_tree_tag_clear - clear a tag on a radix tree node |
| 995 | * @root: radix tree root | 1353 | * @root: radix tree root |
| @@ -1085,6 +1443,121 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, | |||
| 1085 | #endif | 1443 | #endif |
| 1086 | } | 1444 | } |
| 1087 | 1445 | ||
| 1446 | /* Construct iter->tags bit-mask from node->tags[tag] array */ | ||
| 1447 | static void set_iter_tags(struct radix_tree_iter *iter, | ||
| 1448 | struct radix_tree_node *node, unsigned offset, | ||
| 1449 | unsigned tag) | ||
| 1450 | { | ||
| 1451 | unsigned tag_long = offset / BITS_PER_LONG; | ||
| 1452 | unsigned tag_bit = offset % BITS_PER_LONG; | ||
| 1453 | |||
| 1454 | iter->tags = node->tags[tag][tag_long] >> tag_bit; | ||
| 1455 | |||
| 1456 | /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ | ||
| 1457 | if (tag_long < RADIX_TREE_TAG_LONGS - 1) { | ||
| 1458 | /* Pick tags from next element */ | ||
| 1459 | if (tag_bit) | ||
| 1460 | iter->tags |= node->tags[tag][tag_long + 1] << | ||
| 1461 | (BITS_PER_LONG - tag_bit); | ||
| 1462 | /* Clip chunk size, here only BITS_PER_LONG tags */ | ||
| 1463 | iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); | ||
| 1464 | } | ||
| 1465 | } | ||
| 1466 | |||
| 1467 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 1468 | static void **skip_siblings(struct radix_tree_node **nodep, | ||
| 1469 | void **slot, struct radix_tree_iter *iter) | ||
| 1470 | { | ||
| 1471 | void *sib = node_to_entry(slot - 1); | ||
| 1472 | |||
| 1473 | while (iter->index < iter->next_index) { | ||
| 1474 | *nodep = rcu_dereference_raw(*slot); | ||
| 1475 | if (*nodep && *nodep != sib) | ||
| 1476 | return slot; | ||
| 1477 | slot++; | ||
| 1478 | iter->index = __radix_tree_iter_add(iter, 1); | ||
| 1479 | iter->tags >>= 1; | ||
| 1480 | } | ||
| 1481 | |||
| 1482 | *nodep = NULL; | ||
| 1483 | return NULL; | ||
| 1484 | } | ||
| 1485 | |||
| 1486 | void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, | ||
| 1487 | unsigned flags) | ||
| 1488 | { | ||
| 1489 | unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; | ||
| 1490 | struct radix_tree_node *node = rcu_dereference_raw(*slot); | ||
| 1491 | |||
| 1492 | slot = skip_siblings(&node, slot, iter); | ||
| 1493 | |||
| 1494 | while (radix_tree_is_internal_node(node)) { | ||
| 1495 | unsigned offset; | ||
| 1496 | unsigned long next_index; | ||
| 1497 | |||
| 1498 | if (node == RADIX_TREE_RETRY) | ||
| 1499 | return slot; | ||
| 1500 | node = entry_to_node(node); | ||
| 1501 | iter->node = node; | ||
| 1502 | iter->shift = node->shift; | ||
| 1503 | |||
| 1504 | if (flags & RADIX_TREE_ITER_TAGGED) { | ||
| 1505 | offset = radix_tree_find_next_bit(node, tag, 0); | ||
| 1506 | if (offset == RADIX_TREE_MAP_SIZE) | ||
| 1507 | return NULL; | ||
| 1508 | slot = &node->slots[offset]; | ||
| 1509 | iter->index = __radix_tree_iter_add(iter, offset); | ||
| 1510 | set_iter_tags(iter, node, offset, tag); | ||
| 1511 | node = rcu_dereference_raw(*slot); | ||
| 1512 | } else { | ||
| 1513 | offset = 0; | ||
| 1514 | slot = &node->slots[0]; | ||
| 1515 | for (;;) { | ||
| 1516 | node = rcu_dereference_raw(*slot); | ||
| 1517 | if (node) | ||
| 1518 | break; | ||
| 1519 | slot++; | ||
| 1520 | offset++; | ||
| 1521 | if (offset == RADIX_TREE_MAP_SIZE) | ||
| 1522 | return NULL; | ||
| 1523 | } | ||
| 1524 | iter->index = __radix_tree_iter_add(iter, offset); | ||
| 1525 | } | ||
| 1526 | if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) | ||
| 1527 | goto none; | ||
| 1528 | next_index = (iter->index | shift_maxindex(iter->shift)) + 1; | ||
| 1529 | if (next_index < iter->next_index) | ||
| 1530 | iter->next_index = next_index; | ||
| 1531 | } | ||
| 1532 | |||
| 1533 | return slot; | ||
| 1534 | none: | ||
| 1535 | iter->next_index = 0; | ||
| 1536 | return NULL; | ||
| 1537 | } | ||
| 1538 | EXPORT_SYMBOL(__radix_tree_next_slot); | ||
| 1539 | #else | ||
| 1540 | static void **skip_siblings(struct radix_tree_node **nodep, | ||
| 1541 | void **slot, struct radix_tree_iter *iter) | ||
| 1542 | { | ||
| 1543 | return slot; | ||
| 1544 | } | ||
| 1545 | #endif | ||
| 1546 | |||
| 1547 | void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) | ||
| 1548 | { | ||
| 1549 | struct radix_tree_node *node; | ||
| 1550 | |||
| 1551 | slot++; | ||
| 1552 | iter->index = __radix_tree_iter_add(iter, 1); | ||
| 1553 | node = rcu_dereference_raw(*slot); | ||
| 1554 | skip_siblings(&node, slot, iter); | ||
| 1555 | iter->next_index = iter->index; | ||
| 1556 | iter->tags = 0; | ||
| 1557 | return NULL; | ||
| 1558 | } | ||
| 1559 | EXPORT_SYMBOL(radix_tree_iter_resume); | ||
| 1560 | |||
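radix_tree_iter_resume() advances the iterator past the current entry and, by zeroing iter->tags and pulling next_index back to index, ends the current chunk so that the next radix_tree_next_chunk() call restarts the walk from the following index. That makes it the right call to issue before dropping the tree lock in the middle of an iteration. A hedged sketch of the batching idiom follows (the lock and batch size are illustrative):

#include <linux/radix-tree.h>
#include <linux/sched.h>

#define PROCESS_BATCH 64        /* illustrative batch size */

static void process_tagged(struct radix_tree_root *root,
                           spinlock_t *lock, unsigned int tag)
{
        struct radix_tree_iter iter;
        void **slot;
        unsigned long done = 0;

        spin_lock_irq(lock);
        radix_tree_for_each_tagged(slot, root, &iter, 0, tag) {
                /* ... act on *slot while the lock is held ... */

                if (++done % PROCESS_BATCH)
                        continue;
                /* Must be called before the lock is dropped. */
                slot = radix_tree_iter_resume(slot, &iter);
                spin_unlock_irq(lock);
                cond_resched();
                spin_lock_irq(lock);
        }
        spin_unlock_irq(lock);
}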
| 1088 | /** | 1561 | /** |
| 1089 | * radix_tree_next_chunk - find next chunk of slots for iteration | 1562 | * radix_tree_next_chunk - find next chunk of slots for iteration |
| 1090 | * | 1563 | * |
| @@ -1110,7 +1583,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
| 1110 | * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. | 1583 | * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. |
| 1111 | * | 1584 | * |
| 1112 | * This condition also used by radix_tree_next_slot() to stop | 1585 | * This condition also used by radix_tree_next_slot() to stop |
| 1113 | * contiguous iterating, and forbid swithing to the next chunk. | 1586 | * contiguous iterating, and forbid switching to the next chunk. |
| 1114 | */ | 1587 | */ |
| 1115 | index = iter->next_index; | 1588 | index = iter->next_index; |
| 1116 | if (!index && iter->index) | 1589 | if (!index && iter->index) |
| @@ -1128,6 +1601,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
| 1128 | iter->index = index; | 1601 | iter->index = index; |
| 1129 | iter->next_index = maxindex + 1; | 1602 | iter->next_index = maxindex + 1; |
| 1130 | iter->tags = 1; | 1603 | iter->tags = 1; |
| 1604 | iter->node = NULL; | ||
| 1131 | __set_iter_shift(iter, 0); | 1605 | __set_iter_shift(iter, 0); |
| 1132 | return (void **)&root->rnode; | 1606 | return (void **)&root->rnode; |
| 1133 | } | 1607 | } |
| @@ -1143,9 +1617,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
| 1143 | return NULL; | 1617 | return NULL; |
| 1144 | 1618 | ||
| 1145 | if (flags & RADIX_TREE_ITER_TAGGED) | 1619 | if (flags & RADIX_TREE_ITER_TAGGED) |
| 1146 | offset = radix_tree_find_next_bit( | 1620 | offset = radix_tree_find_next_bit(node, tag, |
| 1147 | node->tags[tag], | ||
| 1148 | RADIX_TREE_MAP_SIZE, | ||
| 1149 | offset + 1); | 1621 | offset + 1); |
| 1150 | else | 1622 | else |
| 1151 | while (++offset < RADIX_TREE_MAP_SIZE) { | 1623 | while (++offset < RADIX_TREE_MAP_SIZE) { |
| @@ -1165,154 +1637,26 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, | |||
| 1165 | child = rcu_dereference_raw(node->slots[offset]); | 1637 | child = rcu_dereference_raw(node->slots[offset]); |
| 1166 | } | 1638 | } |
| 1167 | 1639 | ||
| 1168 | if ((child == NULL) || (child == RADIX_TREE_RETRY)) | 1640 | if (!child) |
| 1169 | goto restart; | 1641 | goto restart; |
| 1642 | if (child == RADIX_TREE_RETRY) | ||
| 1643 | break; | ||
| 1170 | } while (radix_tree_is_internal_node(child)); | 1644 | } while (radix_tree_is_internal_node(child)); |
| 1171 | 1645 | ||
| 1172 | /* Update the iterator state */ | 1646 | /* Update the iterator state */ |
| 1173 | iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); | 1647 | iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); |
| 1174 | iter->next_index = (index | node_maxindex(node)) + 1; | 1648 | iter->next_index = (index | node_maxindex(node)) + 1; |
| 1649 | iter->node = node; | ||
| 1175 | __set_iter_shift(iter, node->shift); | 1650 | __set_iter_shift(iter, node->shift); |
| 1176 | 1651 | ||
| 1177 | /* Construct iter->tags bit-mask from node->tags[tag] array */ | 1652 | if (flags & RADIX_TREE_ITER_TAGGED) |
| 1178 | if (flags & RADIX_TREE_ITER_TAGGED) { | 1653 | set_iter_tags(iter, node, offset, tag); |
| 1179 | unsigned tag_long, tag_bit; | ||
| 1180 | |||
| 1181 | tag_long = offset / BITS_PER_LONG; | ||
| 1182 | tag_bit = offset % BITS_PER_LONG; | ||
| 1183 | iter->tags = node->tags[tag][tag_long] >> tag_bit; | ||
| 1184 | /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ | ||
| 1185 | if (tag_long < RADIX_TREE_TAG_LONGS - 1) { | ||
| 1186 | /* Pick tags from next element */ | ||
| 1187 | if (tag_bit) | ||
| 1188 | iter->tags |= node->tags[tag][tag_long + 1] << | ||
| 1189 | (BITS_PER_LONG - tag_bit); | ||
| 1190 | /* Clip chunk size, here only BITS_PER_LONG tags */ | ||
| 1191 | iter->next_index = index + BITS_PER_LONG; | ||
| 1192 | } | ||
| 1193 | } | ||
| 1194 | 1654 | ||
| 1195 | return node->slots + offset; | 1655 | return node->slots + offset; |
| 1196 | } | 1656 | } |
| 1197 | EXPORT_SYMBOL(radix_tree_next_chunk); | 1657 | EXPORT_SYMBOL(radix_tree_next_chunk); |
| 1198 | 1658 | ||
| 1199 | /** | 1659 | /** |
| 1200 | * radix_tree_range_tag_if_tagged - for each item in given range set given | ||
| 1201 | * tag if item has another tag set | ||
| 1202 | * @root: radix tree root | ||
| 1203 | * @first_indexp: pointer to a starting index of a range to scan | ||
| 1204 | * @last_index: last index of a range to scan | ||
| 1205 | * @nr_to_tag: maximum number items to tag | ||
| 1206 | * @iftag: tag index to test | ||
| 1207 | * @settag: tag index to set if tested tag is set | ||
| 1208 | * | ||
| 1209 | * This function scans range of radix tree from first_index to last_index | ||
| 1210 | * (inclusive). For each item in the range if iftag is set, the function sets | ||
| 1211 | * also settag. The function stops either after tagging nr_to_tag items or | ||
| 1212 | * after reaching last_index. | ||
| 1213 | * | ||
| 1214 | * The tags must be set from the leaf level only and propagated back up the | ||
| 1215 | * path to the root. We must do this so that we resolve the full path before | ||
| 1216 | * setting any tags on intermediate nodes. If we set tags as we descend, then | ||
| 1217 | * we can get to the leaf node and find that the index that has the iftag | ||
| 1218 | * set is outside the range we are scanning. This results in dangling tags and | ||
| 1219 | * can lead to problems with later tag operations (e.g. livelocks on lookups). | ||
| 1220 | * | ||
| 1221 | * The function returns the number of leaves where the tag was set and sets | ||
| 1222 | * *first_indexp to the first unscanned index. | ||
| 1223 | * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must | ||
| 1224 | * be prepared to handle that. | ||
| 1225 | */ | ||
| 1226 | unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, | ||
| 1227 | unsigned long *first_indexp, unsigned long last_index, | ||
| 1228 | unsigned long nr_to_tag, | ||
| 1229 | unsigned int iftag, unsigned int settag) | ||
| 1230 | { | ||
| 1231 | struct radix_tree_node *parent, *node, *child; | ||
| 1232 | unsigned long maxindex; | ||
| 1233 | unsigned long tagged = 0; | ||
| 1234 | unsigned long index = *first_indexp; | ||
| 1235 | |||
| 1236 | radix_tree_load_root(root, &child, &maxindex); | ||
| 1237 | last_index = min(last_index, maxindex); | ||
| 1238 | if (index > last_index) | ||
| 1239 | return 0; | ||
| 1240 | if (!nr_to_tag) | ||
| 1241 | return 0; | ||
| 1242 | if (!root_tag_get(root, iftag)) { | ||
| 1243 | *first_indexp = last_index + 1; | ||
| 1244 | return 0; | ||
| 1245 | } | ||
| 1246 | if (!radix_tree_is_internal_node(child)) { | ||
| 1247 | *first_indexp = last_index + 1; | ||
| 1248 | root_tag_set(root, settag); | ||
| 1249 | return 1; | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | node = entry_to_node(child); | ||
| 1253 | |||
| 1254 | for (;;) { | ||
| 1255 | unsigned offset = radix_tree_descend(node, &child, index); | ||
| 1256 | if (!child) | ||
| 1257 | goto next; | ||
| 1258 | if (!tag_get(node, iftag, offset)) | ||
| 1259 | goto next; | ||
| 1260 | /* Sibling slots never have tags set on them */ | ||
| 1261 | if (radix_tree_is_internal_node(child)) { | ||
| 1262 | node = entry_to_node(child); | ||
| 1263 | continue; | ||
| 1264 | } | ||
| 1265 | |||
| 1266 | /* tag the leaf */ | ||
| 1267 | tagged++; | ||
| 1268 | tag_set(node, settag, offset); | ||
| 1269 | |||
| 1270 | /* walk back up the path tagging interior nodes */ | ||
| 1271 | parent = node; | ||
| 1272 | for (;;) { | ||
| 1273 | offset = parent->offset; | ||
| 1274 | parent = parent->parent; | ||
| 1275 | if (!parent) | ||
| 1276 | break; | ||
| 1277 | /* stop if we find a node with the tag already set */ | ||
| 1278 | if (tag_get(parent, settag, offset)) | ||
| 1279 | break; | ||
| 1280 | tag_set(parent, settag, offset); | ||
| 1281 | } | ||
| 1282 | next: | ||
| 1283 | /* Go to next entry in node */ | ||
| 1284 | index = ((index >> node->shift) + 1) << node->shift; | ||
| 1285 | /* Overflow can happen when last_index is ~0UL... */ | ||
| 1286 | if (index > last_index || !index) | ||
| 1287 | break; | ||
| 1288 | offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; | ||
| 1289 | while (offset == 0) { | ||
| 1290 | /* | ||
| 1291 | * We've fully scanned this node. Go up. Because | ||
| 1292 | * last_index is guaranteed to be in the tree, what | ||
| 1293 | * we do below cannot wander astray. | ||
| 1294 | */ | ||
| 1295 | node = node->parent; | ||
| 1296 | offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; | ||
| 1297 | } | ||
| 1298 | if (is_sibling_entry(node, node->slots[offset])) | ||
| 1299 | goto next; | ||
| 1300 | if (tagged >= nr_to_tag) | ||
| 1301 | break; | ||
| 1302 | } | ||
| 1303 | /* | ||
| 1304 | * We need not to tag the root tag if there is no tag which is set with | ||
| 1305 | * settag within the range from *first_indexp to last_index. | ||
| 1306 | */ | ||
| 1307 | if (tagged > 0) | ||
| 1308 | root_tag_set(root, settag); | ||
| 1309 | *first_indexp = index; | ||
| 1310 | |||
| 1311 | return tagged; | ||
| 1312 | } | ||
| 1313 | EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); | ||
| 1314 | |||
| 1315 | /** | ||
| 1316 | * radix_tree_gang_lookup - perform multiple lookup on a radix tree | 1660 | * radix_tree_gang_lookup - perform multiple lookup on a radix tree |
| 1317 | * @root: radix tree root | 1661 | * @root: radix tree root |
| 1318 | * @results: where the results of the lookup are placed | 1662 | * @results: where the results of the lookup are placed |
| @@ -1477,105 +1821,6 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, | |||
| 1477 | } | 1821 | } |
| 1478 | EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); | 1822 | EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); |
| 1479 | 1823 | ||
| 1480 | #if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) | ||
| 1481 | #include <linux/sched.h> /* for cond_resched() */ | ||
| 1482 | |||
| 1483 | struct locate_info { | ||
| 1484 | unsigned long found_index; | ||
| 1485 | bool stop; | ||
| 1486 | }; | ||
| 1487 | |||
| 1488 | /* | ||
| 1489 | * This linear search is at present only useful to shmem_unuse_inode(). | ||
| 1490 | */ | ||
| 1491 | static unsigned long __locate(struct radix_tree_node *slot, void *item, | ||
| 1492 | unsigned long index, struct locate_info *info) | ||
| 1493 | { | ||
| 1494 | unsigned long i; | ||
| 1495 | |||
| 1496 | do { | ||
| 1497 | unsigned int shift = slot->shift; | ||
| 1498 | |||
| 1499 | for (i = (index >> shift) & RADIX_TREE_MAP_MASK; | ||
| 1500 | i < RADIX_TREE_MAP_SIZE; | ||
| 1501 | i++, index += (1UL << shift)) { | ||
| 1502 | struct radix_tree_node *node = | ||
| 1503 | rcu_dereference_raw(slot->slots[i]); | ||
| 1504 | if (node == RADIX_TREE_RETRY) | ||
| 1505 | goto out; | ||
| 1506 | if (!radix_tree_is_internal_node(node)) { | ||
| 1507 | if (node == item) { | ||
| 1508 | info->found_index = index; | ||
| 1509 | info->stop = true; | ||
| 1510 | goto out; | ||
| 1511 | } | ||
| 1512 | continue; | ||
| 1513 | } | ||
| 1514 | node = entry_to_node(node); | ||
| 1515 | if (is_sibling_entry(slot, node)) | ||
| 1516 | continue; | ||
| 1517 | slot = node; | ||
| 1518 | break; | ||
| 1519 | } | ||
| 1520 | } while (i < RADIX_TREE_MAP_SIZE); | ||
| 1521 | |||
| 1522 | out: | ||
| 1523 | if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) | ||
| 1524 | info->stop = true; | ||
| 1525 | return index; | ||
| 1526 | } | ||
| 1527 | |||
| 1528 | /** | ||
| 1529 | * radix_tree_locate_item - search through radix tree for item | ||
| 1530 | * @root: radix tree root | ||
| 1531 | * @item: item to be found | ||
| 1532 | * | ||
| 1533 | * Returns index where item was found, or -1 if not found. | ||
| 1534 | * Caller must hold no lock (since this time-consuming function needs | ||
| 1535 | * to be preemptible), and must check afterwards if item is still there. | ||
| 1536 | */ | ||
| 1537 | unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) | ||
| 1538 | { | ||
| 1539 | struct radix_tree_node *node; | ||
| 1540 | unsigned long max_index; | ||
| 1541 | unsigned long cur_index = 0; | ||
| 1542 | struct locate_info info = { | ||
| 1543 | .found_index = -1, | ||
| 1544 | .stop = false, | ||
| 1545 | }; | ||
| 1546 | |||
| 1547 | do { | ||
| 1548 | rcu_read_lock(); | ||
| 1549 | node = rcu_dereference_raw(root->rnode); | ||
| 1550 | if (!radix_tree_is_internal_node(node)) { | ||
| 1551 | rcu_read_unlock(); | ||
| 1552 | if (node == item) | ||
| 1553 | info.found_index = 0; | ||
| 1554 | break; | ||
| 1555 | } | ||
| 1556 | |||
| 1557 | node = entry_to_node(node); | ||
| 1558 | |||
| 1559 | max_index = node_maxindex(node); | ||
| 1560 | if (cur_index > max_index) { | ||
| 1561 | rcu_read_unlock(); | ||
| 1562 | break; | ||
| 1563 | } | ||
| 1564 | |||
| 1565 | cur_index = __locate(node, item, cur_index, &info); | ||
| 1566 | rcu_read_unlock(); | ||
| 1567 | cond_resched(); | ||
| 1568 | } while (!info.stop && cur_index <= max_index); | ||
| 1569 | |||
| 1570 | return info.found_index; | ||
| 1571 | } | ||
| 1572 | #else | ||
| 1573 | unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) | ||
| 1574 | { | ||
| 1575 | return -1; | ||
| 1576 | } | ||
| 1577 | #endif /* CONFIG_SHMEM && CONFIG_SWAP */ | ||
| 1578 | |||
| 1579 | /** | 1824 | /** |
| 1580 | * __radix_tree_delete_node - try to free node after clearing a slot | 1825 | * __radix_tree_delete_node - try to free node after clearing a slot |
| 1581 | * @root: radix tree root | 1826 | * @root: radix tree root |
| @@ -1591,20 +1836,6 @@ void __radix_tree_delete_node(struct radix_tree_root *root, | |||
| 1591 | delete_node(root, node, NULL, NULL); | 1836 | delete_node(root, node, NULL, NULL); |
| 1592 | } | 1837 | } |
| 1593 | 1838 | ||
| 1594 | static inline void delete_sibling_entries(struct radix_tree_node *node, | ||
| 1595 | void *ptr, unsigned offset) | ||
| 1596 | { | ||
| 1597 | #ifdef CONFIG_RADIX_TREE_MULTIORDER | ||
| 1598 | int i; | ||
| 1599 | for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { | ||
| 1600 | if (node->slots[offset + i] != ptr) | ||
| 1601 | break; | ||
| 1602 | node->slots[offset + i] = NULL; | ||
| 1603 | node->count--; | ||
| 1604 | } | ||
| 1605 | #endif | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | /** | 1839 | /** |
| 1609 | * radix_tree_delete_item - delete an item from a radix tree | 1840 | * radix_tree_delete_item - delete an item from a radix tree |
| 1610 | * @root: radix tree root | 1841 | * @root: radix tree root |
| @@ -1644,7 +1875,6 @@ void *radix_tree_delete_item(struct radix_tree_root *root, | |||
| 1644 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) | 1875 | for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) |
| 1645 | node_tag_clear(root, node, tag, offset); | 1876 | node_tag_clear(root, node, tag, offset); |
| 1646 | 1877 | ||
| 1647 | delete_sibling_entries(node, node_to_entry(slot), offset); | ||
| 1648 | __radix_tree_replace(root, node, slot, NULL, NULL, NULL); | 1878 | __radix_tree_replace(root, node, slot, NULL, NULL, NULL); |
| 1649 | 1879 | ||
| 1650 | return entry; | 1880 | return entry; |
diff --git a/mm/compaction.c b/mm/compaction.c index 223464227299..949198d01260 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
| @@ -818,6 +818,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
| 818 | page_count(page) > page_mapcount(page)) | 818 | page_count(page) > page_mapcount(page)) |
| 819 | goto isolate_fail; | 819 | goto isolate_fail; |
| 820 | 820 | ||
| 821 | /* | ||
| 822 | * Only allow to migrate anonymous pages in GFP_NOFS context | ||
| 823 | * because those do not depend on fs locks. | ||
| 824 | */ | ||
| 825 | if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) | ||
| 826 | goto isolate_fail; | ||
| 827 | |||
| 821 | /* If we already hold the lock, we can skip some rechecking */ | 828 | /* If we already hold the lock, we can skip some rechecking */ |
| 822 | if (!locked) { | 829 | if (!locked) { |
| 823 | locked = compact_trylock_irqsave(zone_lru_lock(zone), | 830 | locked = compact_trylock_irqsave(zone_lru_lock(zone), |
| @@ -1677,14 +1684,16 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, | |||
| 1677 | unsigned int alloc_flags, const struct alloc_context *ac, | 1684 | unsigned int alloc_flags, const struct alloc_context *ac, |
| 1678 | enum compact_priority prio) | 1685 | enum compact_priority prio) |
| 1679 | { | 1686 | { |
| 1680 | int may_enter_fs = gfp_mask & __GFP_FS; | ||
| 1681 | int may_perform_io = gfp_mask & __GFP_IO; | 1687 | int may_perform_io = gfp_mask & __GFP_IO; |
| 1682 | struct zoneref *z; | 1688 | struct zoneref *z; |
| 1683 | struct zone *zone; | 1689 | struct zone *zone; |
| 1684 | enum compact_result rc = COMPACT_SKIPPED; | 1690 | enum compact_result rc = COMPACT_SKIPPED; |
| 1685 | 1691 | ||
| 1686 | /* Check if the GFP flags allow compaction */ | 1692 | /* |
| 1687 | if (!may_enter_fs || !may_perform_io) | 1693 | * Check if the GFP flags allow compaction - GFP_NOIO is really |
| 1694 | * tricky context because the migration might require IO | ||
| 1695 | */ | ||
| 1696 | if (!may_perform_io) | ||
| 1688 | return COMPACT_SKIPPED; | 1697 | return COMPACT_SKIPPED; |
| 1689 | 1698 | ||
| 1690 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); | 1699 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio); |
| @@ -1751,6 +1760,7 @@ static void compact_node(int nid) | |||
| 1751 | .mode = MIGRATE_SYNC, | 1760 | .mode = MIGRATE_SYNC, |
| 1752 | .ignore_skip_hint = true, | 1761 | .ignore_skip_hint = true, |
| 1753 | .whole_zone = true, | 1762 | .whole_zone = true, |
| 1763 | .gfp_mask = GFP_KERNEL, | ||
| 1754 | }; | 1764 | }; |
| 1755 | 1765 | ||
| 1756 | 1766 | ||
| @@ -1876,6 +1886,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) | |||
| 1876 | .classzone_idx = pgdat->kcompactd_classzone_idx, | 1886 | .classzone_idx = pgdat->kcompactd_classzone_idx, |
| 1877 | .mode = MIGRATE_SYNC_LIGHT, | 1887 | .mode = MIGRATE_SYNC_LIGHT, |
| 1878 | .ignore_skip_hint = true, | 1888 | .ignore_skip_hint = true, |
| 1889 | .gfp_mask = GFP_KERNEL, | ||
| 1879 | 1890 | ||
| 1880 | }; | 1891 | }; |
| 1881 | trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, | 1892 | trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, |
diff --git a/mm/filemap.c b/mm/filemap.c index b06517b7f97f..32be3c8f3a11 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -2164,12 +2164,12 @@ page_not_uptodate: | |||
| 2164 | } | 2164 | } |
| 2165 | EXPORT_SYMBOL(filemap_fault); | 2165 | EXPORT_SYMBOL(filemap_fault); |
| 2166 | 2166 | ||
| 2167 | void filemap_map_pages(struct fault_env *fe, | 2167 | void filemap_map_pages(struct vm_fault *vmf, |
| 2168 | pgoff_t start_pgoff, pgoff_t end_pgoff) | 2168 | pgoff_t start_pgoff, pgoff_t end_pgoff) |
| 2169 | { | 2169 | { |
| 2170 | struct radix_tree_iter iter; | 2170 | struct radix_tree_iter iter; |
| 2171 | void **slot; | 2171 | void **slot; |
| 2172 | struct file *file = fe->vma->vm_file; | 2172 | struct file *file = vmf->vma->vm_file; |
| 2173 | struct address_space *mapping = file->f_mapping; | 2173 | struct address_space *mapping = file->f_mapping; |
| 2174 | pgoff_t last_pgoff = start_pgoff; | 2174 | pgoff_t last_pgoff = start_pgoff; |
| 2175 | loff_t size; | 2175 | loff_t size; |
| @@ -2225,11 +2225,11 @@ repeat: | |||
| 2225 | if (file->f_ra.mmap_miss > 0) | 2225 | if (file->f_ra.mmap_miss > 0) |
| 2226 | file->f_ra.mmap_miss--; | 2226 | file->f_ra.mmap_miss--; |
| 2227 | 2227 | ||
| 2228 | fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; | 2228 | vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; |
| 2229 | if (fe->pte) | 2229 | if (vmf->pte) |
| 2230 | fe->pte += iter.index - last_pgoff; | 2230 | vmf->pte += iter.index - last_pgoff; |
| 2231 | last_pgoff = iter.index; | 2231 | last_pgoff = iter.index; |
| 2232 | if (alloc_set_pte(fe, NULL, page)) | 2232 | if (alloc_set_pte(vmf, NULL, page)) |
| 2233 | goto unlock; | 2233 | goto unlock; |
| 2234 | unlock_page(page); | 2234 | unlock_page(page); |
| 2235 | goto next; | 2235 | goto next; |
| @@ -2239,7 +2239,7 @@ skip: | |||
| 2239 | put_page(page); | 2239 | put_page(page); |
| 2240 | next: | 2240 | next: |
| 2241 | /* Huge page is mapped? No need to proceed. */ | 2241 | /* Huge page is mapped? No need to proceed. */ |
| 2242 | if (pmd_trans_huge(*fe->pmd)) | 2242 | if (pmd_trans_huge(*vmf->pmd)) |
| 2243 | break; | 2243 | break; |
| 2244 | if (iter.index == end_pgoff) | 2244 | if (iter.index == end_pgoff) |
| 2245 | break; | 2245 | break; |
| @@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); | |||
| 865 | * caller if required (just like with __get_user_pages). "FOLL_GET" | 865 | * caller if required (just like with __get_user_pages). "FOLL_GET" |
| 866 | * is set implicitly if "pages" is non-NULL. | 866 | * is set implicitly if "pages" is non-NULL. |
| 867 | */ | 867 | */ |
| 868 | __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | 868 | static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, |
| 869 | unsigned long start, unsigned long nr_pages, | 869 | struct mm_struct *mm, unsigned long start, |
| 870 | struct page **pages, unsigned int gup_flags) | 870 | unsigned long nr_pages, struct page **pages, |
| 871 | unsigned int gup_flags) | ||
| 871 | { | 872 | { |
| 872 | long ret; | 873 | long ret; |
| 873 | int locked = 1; | 874 | int locked = 1; |
| @@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m | |||
| 879 | up_read(&mm->mmap_sem); | 880 | up_read(&mm->mmap_sem); |
| 880 | return ret; | 881 | return ret; |
| 881 | } | 882 | } |
| 882 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
| 883 | 883 | ||
| 884 | /* | 884 | /* |
| 885 | * get_user_pages_unlocked() is suitable to replace the form: | 885 | * get_user_pages_unlocked() is suitable to replace the form: |
| @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); | |||
| 917 | * only intends to ensure the pages are faulted in. | 917 | * only intends to ensure the pages are faulted in. |
| 918 | * @vmas: array of pointers to vmas corresponding to each page. | 918 | * @vmas: array of pointers to vmas corresponding to each page. |
| 919 | * Or NULL if the caller does not require them. | 919 | * Or NULL if the caller does not require them. |
| 920 | * @locked: pointer to lock flag indicating whether lock is held and | ||
| 921 | * subsequently whether VM_FAULT_RETRY functionality can be | ||
| 922 | * utilised. Lock must initially be held. | ||
| 920 | * | 923 | * |
| 921 | * Returns number of pages pinned. This may be fewer than the number | 924 | * Returns number of pages pinned. This may be fewer than the number |
| 922 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | 925 | * requested. If nr_pages is 0 or negative, returns 0. If no pages |
| @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); | |||
| 960 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, | 963 | long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, |
| 961 | unsigned long start, unsigned long nr_pages, | 964 | unsigned long start, unsigned long nr_pages, |
| 962 | unsigned int gup_flags, struct page **pages, | 965 | unsigned int gup_flags, struct page **pages, |
| 963 | struct vm_area_struct **vmas) | 966 | struct vm_area_struct **vmas, int *locked) |
| 964 | { | 967 | { |
| 965 | return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, | 968 | return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, |
| 966 | NULL, false, | 969 | locked, true, |
| 967 | gup_flags | FOLL_TOUCH | FOLL_REMOTE); | 970 | gup_flags | FOLL_TOUCH | FOLL_REMOTE); |
| 968 | } | 971 | } |
| 969 | EXPORT_SYMBOL(get_user_pages_remote); | 972 | EXPORT_SYMBOL(get_user_pages_remote); |
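A hedged sketch of a get_user_pages_remote() caller using the new @locked argument (it is not taken from this patch). With a non-NULL @locked the core may drop mmap_sem when a fault returns VM_FAULT_RETRY, so @vmas must be NULL and the caller has to check whether the semaphore is still held on return; passing NULL for @locked keeps the previous behaviour.

#include <linux/mm.h>
#include <linux/sched.h>

static long pin_one_remote_page(struct task_struct *tsk, struct mm_struct *mm,
                                unsigned long addr, struct page **page)
{
        int locked = 1;
        long ret;

        down_read(&mm->mmap_sem);
        ret = get_user_pages_remote(tsk, mm, addr, 1, FOLL_WRITE,
                                    page, NULL, &locked);
        /* mmap_sem may already have been dropped on VM_FAULT_RETRY. */
        if (locked)
                up_read(&mm->mmap_sem);
        return ret;
}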
| @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); | |||
| 971 | /* | 974 | /* |
| 972 | * This is the same as get_user_pages_remote(), just with a | 975 | * This is the same as get_user_pages_remote(), just with a |
| 973 | * less-flexible calling convention where we assume that the task | 976 | * less-flexible calling convention where we assume that the task |
| 974 | * and mm being operated on are the current task's. We also | 977 | * and mm being operated on are the current task's and don't allow |
| 975 | * obviously don't pass FOLL_REMOTE in here. | 978 | * passing of a locked parameter. We also obviously don't pass |
| 979 | * FOLL_REMOTE in here. | ||
| 976 | */ | 980 | */ |
| 977 | long get_user_pages(unsigned long start, unsigned long nr_pages, | 981 | long get_user_pages(unsigned long start, unsigned long nr_pages, |
| 978 | unsigned int gup_flags, struct page **pages, | 982 | unsigned int gup_flags, struct page **pages, |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cee42cf05477..10eedbf14421 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, | |||
| 542 | } | 542 | } |
| 543 | EXPORT_SYMBOL_GPL(thp_get_unmapped_area); | 543 | EXPORT_SYMBOL_GPL(thp_get_unmapped_area); |
| 544 | 544 | ||
| 545 | static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | 545 | static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, |
| 546 | gfp_t gfp) | 546 | gfp_t gfp) |
| 547 | { | 547 | { |
| 548 | struct vm_area_struct *vma = fe->vma; | 548 | struct vm_area_struct *vma = vmf->vma; |
| 549 | struct mem_cgroup *memcg; | 549 | struct mem_cgroup *memcg; |
| 550 | pgtable_t pgtable; | 550 | pgtable_t pgtable; |
| 551 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 551 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| 552 | 552 | ||
| 553 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 553 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
| 554 | 554 | ||
| @@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | |||
| 573 | */ | 573 | */ |
| 574 | __SetPageUptodate(page); | 574 | __SetPageUptodate(page); |
| 575 | 575 | ||
| 576 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 576 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| 577 | if (unlikely(!pmd_none(*fe->pmd))) { | 577 | if (unlikely(!pmd_none(*vmf->pmd))) { |
| 578 | spin_unlock(fe->ptl); | 578 | spin_unlock(vmf->ptl); |
| 579 | mem_cgroup_cancel_charge(page, memcg, true); | 579 | mem_cgroup_cancel_charge(page, memcg, true); |
| 580 | put_page(page); | 580 | put_page(page); |
| 581 | pte_free(vma->vm_mm, pgtable); | 581 | pte_free(vma->vm_mm, pgtable); |
| @@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | |||
| 586 | if (userfaultfd_missing(vma)) { | 586 | if (userfaultfd_missing(vma)) { |
| 587 | int ret; | 587 | int ret; |
| 588 | 588 | ||
| 589 | spin_unlock(fe->ptl); | 589 | spin_unlock(vmf->ptl); |
| 590 | mem_cgroup_cancel_charge(page, memcg, true); | 590 | mem_cgroup_cancel_charge(page, memcg, true); |
| 591 | put_page(page); | 591 | put_page(page); |
| 592 | pte_free(vma->vm_mm, pgtable); | 592 | pte_free(vma->vm_mm, pgtable); |
| 593 | ret = handle_userfault(fe, VM_UFFD_MISSING); | 593 | ret = handle_userfault(vmf, VM_UFFD_MISSING); |
| 594 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 594 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
| 595 | return ret; | 595 | return ret; |
| 596 | } | 596 | } |
| @@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, | |||
| 600 | page_add_new_anon_rmap(page, vma, haddr, true); | 600 | page_add_new_anon_rmap(page, vma, haddr, true); |
| 601 | mem_cgroup_commit_charge(page, memcg, false, true); | 601 | mem_cgroup_commit_charge(page, memcg, false, true); |
| 602 | lru_cache_add_active_or_unevictable(page, vma); | 602 | lru_cache_add_active_or_unevictable(page, vma); |
| 603 | pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); | 603 | pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); |
| 604 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | 604 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
| 605 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 605 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
| 606 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 606 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
| 607 | spin_unlock(fe->ptl); | 607 | spin_unlock(vmf->ptl); |
| 608 | count_vm_event(THP_FAULT_ALLOC); | 608 | count_vm_event(THP_FAULT_ALLOC); |
| 609 | } | 609 | } |
| 610 | 610 | ||
| @@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
| 651 | return true; | 651 | return true; |
| 652 | } | 652 | } |
| 653 | 653 | ||
| 654 | int do_huge_pmd_anonymous_page(struct fault_env *fe) | 654 | int do_huge_pmd_anonymous_page(struct vm_fault *vmf) |
| 655 | { | 655 | { |
| 656 | struct vm_area_struct *vma = fe->vma; | 656 | struct vm_area_struct *vma = vmf->vma; |
| 657 | gfp_t gfp; | 657 | gfp_t gfp; |
| 658 | struct page *page; | 658 | struct page *page; |
| 659 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 659 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| 660 | 660 | ||
| 661 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) | 661 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
| 662 | return VM_FAULT_FALLBACK; | 662 | return VM_FAULT_FALLBACK; |
| @@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) | |||
| 664 | return VM_FAULT_OOM; | 664 | return VM_FAULT_OOM; |
| 665 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) | 665 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) |
| 666 | return VM_FAULT_OOM; | 666 | return VM_FAULT_OOM; |
| 667 | if (!(fe->flags & FAULT_FLAG_WRITE) && | 667 | if (!(vmf->flags & FAULT_FLAG_WRITE) && |
| 668 | !mm_forbids_zeropage(vma->vm_mm) && | 668 | !mm_forbids_zeropage(vma->vm_mm) && |
| 669 | transparent_hugepage_use_zero_page()) { | 669 | transparent_hugepage_use_zero_page()) { |
| 670 | pgtable_t pgtable; | 670 | pgtable_t pgtable; |
| @@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) | |||
| 680 | count_vm_event(THP_FAULT_FALLBACK); | 680 | count_vm_event(THP_FAULT_FALLBACK); |
| 681 | return VM_FAULT_FALLBACK; | 681 | return VM_FAULT_FALLBACK; |
| 682 | } | 682 | } |
| 683 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 683 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| 684 | ret = 0; | 684 | ret = 0; |
| 685 | set = false; | 685 | set = false; |
| 686 | if (pmd_none(*fe->pmd)) { | 686 | if (pmd_none(*vmf->pmd)) { |
| 687 | if (userfaultfd_missing(vma)) { | 687 | if (userfaultfd_missing(vma)) { |
| 688 | spin_unlock(fe->ptl); | 688 | spin_unlock(vmf->ptl); |
| 689 | ret = handle_userfault(fe, VM_UFFD_MISSING); | 689 | ret = handle_userfault(vmf, VM_UFFD_MISSING); |
| 690 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 690 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
| 691 | } else { | 691 | } else { |
| 692 | set_huge_zero_page(pgtable, vma->vm_mm, vma, | 692 | set_huge_zero_page(pgtable, vma->vm_mm, vma, |
| 693 | haddr, fe->pmd, zero_page); | 693 | haddr, vmf->pmd, zero_page); |
| 694 | spin_unlock(fe->ptl); | 694 | spin_unlock(vmf->ptl); |
| 695 | set = true; | 695 | set = true; |
| 696 | } | 696 | } |
| 697 | } else | 697 | } else |
| 698 | spin_unlock(fe->ptl); | 698 | spin_unlock(vmf->ptl); |
| 699 | if (!set) | 699 | if (!set) |
| 700 | pte_free(vma->vm_mm, pgtable); | 700 | pte_free(vma->vm_mm, pgtable); |
| 701 | return ret; | 701 | return ret; |
| @@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) | |||
| 707 | return VM_FAULT_FALLBACK; | 707 | return VM_FAULT_FALLBACK; |
| 708 | } | 708 | } |
| 709 | prep_transhuge_page(page); | 709 | prep_transhuge_page(page); |
| 710 | return __do_huge_pmd_anonymous_page(fe, page, gfp); | 710 | return __do_huge_pmd_anonymous_page(vmf, page, gfp); |
| 711 | } | 711 | } |
| 712 | 712 | ||
| 713 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | 713 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
| @@ -879,30 +879,30 @@ out: | |||
| 879 | return ret; | 879 | return ret; |
| 880 | } | 880 | } |
| 881 | 881 | ||
| 882 | void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) | 882 | void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) |
| 883 | { | 883 | { |
| 884 | pmd_t entry; | 884 | pmd_t entry; |
| 885 | unsigned long haddr; | 885 | unsigned long haddr; |
| 886 | 886 | ||
| 887 | fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); | 887 | vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); |
| 888 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) | 888 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) |
| 889 | goto unlock; | 889 | goto unlock; |
| 890 | 890 | ||
| 891 | entry = pmd_mkyoung(orig_pmd); | 891 | entry = pmd_mkyoung(orig_pmd); |
| 892 | haddr = fe->address & HPAGE_PMD_MASK; | 892 | haddr = vmf->address & HPAGE_PMD_MASK; |
| 893 | if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, | 893 | if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, |
| 894 | fe->flags & FAULT_FLAG_WRITE)) | 894 | vmf->flags & FAULT_FLAG_WRITE)) |
| 895 | update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); | 895 | update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); |
| 896 | 896 | ||
| 897 | unlock: | 897 | unlock: |
| 898 | spin_unlock(fe->ptl); | 898 | spin_unlock(vmf->ptl); |
| 899 | } | 899 | } |
| 900 | 900 | ||
| 901 | static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | 901 | static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, |
| 902 | struct page *page) | 902 | struct page *page) |
| 903 | { | 903 | { |
| 904 | struct vm_area_struct *vma = fe->vma; | 904 | struct vm_area_struct *vma = vmf->vma; |
| 905 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 905 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| 906 | struct mem_cgroup *memcg; | 906 | struct mem_cgroup *memcg; |
| 907 | pgtable_t pgtable; | 907 | pgtable_t pgtable; |
| 908 | pmd_t _pmd; | 908 | pmd_t _pmd; |
| @@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | |||
| 921 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 921 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
| 922 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | | 922 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | |
| 923 | __GFP_OTHER_NODE, vma, | 923 | __GFP_OTHER_NODE, vma, |
| 924 | fe->address, page_to_nid(page)); | 924 | vmf->address, page_to_nid(page)); |
| 925 | if (unlikely(!pages[i] || | 925 | if (unlikely(!pages[i] || |
| 926 | mem_cgroup_try_charge(pages[i], vma->vm_mm, | 926 | mem_cgroup_try_charge(pages[i], vma->vm_mm, |
| 927 | GFP_KERNEL, &memcg, false))) { | 927 | GFP_KERNEL, &memcg, false))) { |
| @@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | |||
| 952 | mmun_end = haddr + HPAGE_PMD_SIZE; | 952 | mmun_end = haddr + HPAGE_PMD_SIZE; |
| 953 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | 953 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
| 954 | 954 | ||
| 955 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 955 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| 956 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) | 956 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) |
| 957 | goto out_free_pages; | 957 | goto out_free_pages; |
| 958 | VM_BUG_ON_PAGE(!PageHead(page), page); | 958 | VM_BUG_ON_PAGE(!PageHead(page), page); |
| 959 | 959 | ||
| 960 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); | 960 | pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); |
| 961 | /* leave pmd empty until pte is filled */ | 961 | /* leave pmd empty until pte is filled */ |
| 962 | 962 | ||
| 963 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); | 963 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); |
| 964 | pmd_populate(vma->vm_mm, &_pmd, pgtable); | 964 | pmd_populate(vma->vm_mm, &_pmd, pgtable); |
| 965 | 965 | ||
| 966 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 966 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
| @@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, | |||
| 969 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 969 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 970 | memcg = (void *)page_private(pages[i]); | 970 | memcg = (void *)page_private(pages[i]); |
| 971 | set_page_private(pages[i], 0); | 971 | set_page_private(pages[i], 0); |
| 972 | page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); | 972 | page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); |
| 973 | mem_cgroup_commit_charge(pages[i], memcg, false, false); | 973 | mem_cgroup_commit_charge(pages[i], memcg, false, false); |
| 974 | lru_cache_add_active_or_unevictable(pages[i], vma); | 974 | lru_cache_add_active_or_unevictable(pages[i], vma); |
| 975 | fe->pte = pte_offset_map(&_pmd, haddr); | 975 | vmf->pte = pte_offset_map(&_pmd, haddr); |
| 976 | VM_BUG_ON(!pte_none(*fe->pte)); | 976 | VM_BUG_ON(!pte_none(*vmf->pte)); |
| 977 | set_pte_at(vma->vm_mm, haddr, fe->pte, entry); | 977 | set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); |
| 978 | pte_unmap(fe->pte); | 978 | pte_unmap(vmf->pte); |
| 979 | } | 979 | } |
| 980 | kfree(pages); | 980 | kfree(pages); |
| 981 | 981 | ||
| 982 | smp_wmb(); /* make pte visible before pmd */ | 982 | smp_wmb(); /* make pte visible before pmd */ |
| 983 | pmd_populate(vma->vm_mm, fe->pmd, pgtable); | 983 | pmd_populate(vma->vm_mm, vmf->pmd, pgtable); |
| 984 | page_remove_rmap(page, true); | 984 | page_remove_rmap(page, true); |
| 985 | spin_unlock(fe->ptl); | 985 | spin_unlock(vmf->ptl); |
| 986 | 986 | ||
| 987 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 987 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
| 988 | 988 | ||
| @@ -993,7 +993,7 @@ out: | |||
| 993 | return ret; | 993 | return ret; |
| 994 | 994 | ||
| 995 | out_free_pages: | 995 | out_free_pages: |
| 996 | spin_unlock(fe->ptl); | 996 | spin_unlock(vmf->ptl); |
| 997 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 997 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
| 998 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 998 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
| 999 | memcg = (void *)page_private(pages[i]); | 999 | memcg = (void *)page_private(pages[i]); |
| @@ -1005,23 +1005,23 @@ out_free_pages: | |||
| 1005 | goto out; | 1005 | goto out; |
| 1006 | } | 1006 | } |
| 1007 | 1007 | ||
| 1008 | int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) | 1008 | int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) |
| 1009 | { | 1009 | { |
| 1010 | struct vm_area_struct *vma = fe->vma; | 1010 | struct vm_area_struct *vma = vmf->vma; |
| 1011 | struct page *page = NULL, *new_page; | 1011 | struct page *page = NULL, *new_page; |
| 1012 | struct mem_cgroup *memcg; | 1012 | struct mem_cgroup *memcg; |
| 1013 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 1013 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| 1014 | unsigned long mmun_start; /* For mmu_notifiers */ | 1014 | unsigned long mmun_start; /* For mmu_notifiers */ |
| 1015 | unsigned long mmun_end; /* For mmu_notifiers */ | 1015 | unsigned long mmun_end; /* For mmu_notifiers */ |
| 1016 | gfp_t huge_gfp; /* for allocation and charge */ | 1016 | gfp_t huge_gfp; /* for allocation and charge */ |
| 1017 | int ret = 0; | 1017 | int ret = 0; |
| 1018 | 1018 | ||
| 1019 | fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); | 1019 | vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); |
| 1020 | VM_BUG_ON_VMA(!vma->anon_vma, vma); | 1020 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
| 1021 | if (is_huge_zero_pmd(orig_pmd)) | 1021 | if (is_huge_zero_pmd(orig_pmd)) |
| 1022 | goto alloc; | 1022 | goto alloc; |
| 1023 | spin_lock(fe->ptl); | 1023 | spin_lock(vmf->ptl); |
| 1024 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) | 1024 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) |
| 1025 | goto out_unlock; | 1025 | goto out_unlock; |
| 1026 | 1026 | ||
| 1027 | page = pmd_page(orig_pmd); | 1027 | page = pmd_page(orig_pmd); |
| @@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) | |||
| 1034 | pmd_t entry; | 1034 | pmd_t entry; |
| 1035 | entry = pmd_mkyoung(orig_pmd); | 1035 | entry = pmd_mkyoung(orig_pmd); |
| 1036 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1036 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
| 1037 | if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) | 1037 | if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) |
| 1038 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); | 1038 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
| 1039 | ret |= VM_FAULT_WRITE; | 1039 | ret |= VM_FAULT_WRITE; |
| 1040 | goto out_unlock; | 1040 | goto out_unlock; |
| 1041 | } | 1041 | } |
| 1042 | get_page(page); | 1042 | get_page(page); |
| 1043 | spin_unlock(fe->ptl); | 1043 | spin_unlock(vmf->ptl); |
| 1044 | alloc: | 1044 | alloc: |
| 1045 | if (transparent_hugepage_enabled(vma) && | 1045 | if (transparent_hugepage_enabled(vma) && |
| 1046 | !transparent_hugepage_debug_cow()) { | 1046 | !transparent_hugepage_debug_cow()) { |
| @@ -1053,12 +1053,12 @@ alloc: | |||
| 1053 | prep_transhuge_page(new_page); | 1053 | prep_transhuge_page(new_page); |
| 1054 | } else { | 1054 | } else { |
| 1055 | if (!page) { | 1055 | if (!page) { |
| 1056 | split_huge_pmd(vma, fe->pmd, fe->address); | 1056 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
| 1057 | ret |= VM_FAULT_FALLBACK; | 1057 | ret |= VM_FAULT_FALLBACK; |
| 1058 | } else { | 1058 | } else { |
| 1059 | ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); | 1059 | ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); |
| 1060 | if (ret & VM_FAULT_OOM) { | 1060 | if (ret & VM_FAULT_OOM) { |
| 1061 | split_huge_pmd(vma, fe->pmd, fe->address); | 1061 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
| 1062 | ret |= VM_FAULT_FALLBACK; | 1062 | ret |= VM_FAULT_FALLBACK; |
| 1063 | } | 1063 | } |
| 1064 | put_page(page); | 1064 | put_page(page); |
| @@ -1070,7 +1070,7 @@ alloc: | |||
| 1070 | if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, | 1070 | if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, |
| 1071 | huge_gfp, &memcg, true))) { | 1071 | huge_gfp, &memcg, true))) { |
| 1072 | put_page(new_page); | 1072 | put_page(new_page); |
| 1073 | split_huge_pmd(vma, fe->pmd, fe->address); | 1073 | split_huge_pmd(vma, vmf->pmd, vmf->address); |
| 1074 | if (page) | 1074 | if (page) |
| 1075 | put_page(page); | 1075 | put_page(page); |
| 1076 | ret |= VM_FAULT_FALLBACK; | 1076 | ret |= VM_FAULT_FALLBACK; |
| @@ -1090,11 +1090,11 @@ alloc: | |||
| 1090 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1090 | mmun_end = haddr + HPAGE_PMD_SIZE; |
| 1091 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); | 1091 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
| 1092 | 1092 | ||
| 1093 | spin_lock(fe->ptl); | 1093 | spin_lock(vmf->ptl); |
| 1094 | if (page) | 1094 | if (page) |
| 1095 | put_page(page); | 1095 | put_page(page); |
| 1096 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { | 1096 | if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { |
| 1097 | spin_unlock(fe->ptl); | 1097 | spin_unlock(vmf->ptl); |
| 1098 | mem_cgroup_cancel_charge(new_page, memcg, true); | 1098 | mem_cgroup_cancel_charge(new_page, memcg, true); |
| 1099 | put_page(new_page); | 1099 | put_page(new_page); |
| 1100 | goto out_mn; | 1100 | goto out_mn; |
| @@ -1102,12 +1102,12 @@ alloc: | |||
| 1102 | pmd_t entry; | 1102 | pmd_t entry; |
| 1103 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); | 1103 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
| 1104 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1104 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
| 1105 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); | 1105 | pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); |
| 1106 | page_add_new_anon_rmap(new_page, vma, haddr, true); | 1106 | page_add_new_anon_rmap(new_page, vma, haddr, true); |
| 1107 | mem_cgroup_commit_charge(new_page, memcg, false, true); | 1107 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
| 1108 | lru_cache_add_active_or_unevictable(new_page, vma); | 1108 | lru_cache_add_active_or_unevictable(new_page, vma); |
| 1109 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | 1109 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
| 1110 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); | 1110 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
| 1111 | if (!page) { | 1111 | if (!page) { |
| 1112 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 1112 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
| 1113 | } else { | 1113 | } else { |
| @@ -1117,13 +1117,13 @@ alloc: | |||
| 1117 | } | 1117 | } |
| 1118 | ret |= VM_FAULT_WRITE; | 1118 | ret |= VM_FAULT_WRITE; |
| 1119 | } | 1119 | } |
| 1120 | spin_unlock(fe->ptl); | 1120 | spin_unlock(vmf->ptl); |
| 1121 | out_mn: | 1121 | out_mn: |
| 1122 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); | 1122 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
| 1123 | out: | 1123 | out: |
| 1124 | return ret; | 1124 | return ret; |
| 1125 | out_unlock: | 1125 | out_unlock: |
| 1126 | spin_unlock(fe->ptl); | 1126 | spin_unlock(vmf->ptl); |
| 1127 | return ret; | 1127 | return ret; |
| 1128 | } | 1128 | } |
| 1129 | 1129 | ||
| @@ -1196,12 +1196,12 @@ out: | |||
| 1196 | } | 1196 | } |
| 1197 | 1197 | ||
| 1198 | /* NUMA hinting page fault entry point for trans huge pmds */ | 1198 | /* NUMA hinting page fault entry point for trans huge pmds */ |
| 1199 | int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | 1199 | int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) |
| 1200 | { | 1200 | { |
| 1201 | struct vm_area_struct *vma = fe->vma; | 1201 | struct vm_area_struct *vma = vmf->vma; |
| 1202 | struct anon_vma *anon_vma = NULL; | 1202 | struct anon_vma *anon_vma = NULL; |
| 1203 | struct page *page; | 1203 | struct page *page; |
| 1204 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 1204 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| 1205 | int page_nid = -1, this_nid = numa_node_id(); | 1205 | int page_nid = -1, this_nid = numa_node_id(); |
| 1206 | int target_nid, last_cpupid = -1; | 1206 | int target_nid, last_cpupid = -1; |
| 1207 | bool page_locked; | 1207 | bool page_locked; |
| @@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
| 1209 | bool was_writable; | 1209 | bool was_writable; |
| 1210 | int flags = 0; | 1210 | int flags = 0; |
| 1211 | 1211 | ||
| 1212 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 1212 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| 1213 | if (unlikely(!pmd_same(pmd, *fe->pmd))) | 1213 | if (unlikely(!pmd_same(pmd, *vmf->pmd))) |
| 1214 | goto out_unlock; | 1214 | goto out_unlock; |
| 1215 | 1215 | ||
| 1216 | /* | 1216 | /* |
| @@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
| 1218 | * without disrupting NUMA hinting information. Do not relock and | 1218 | * without disrupting NUMA hinting information. Do not relock and |
| 1219 | * check_same as the page may no longer be mapped. | 1219 | * check_same as the page may no longer be mapped. |
| 1220 | */ | 1220 | */ |
| 1221 | if (unlikely(pmd_trans_migrating(*fe->pmd))) { | 1221 | if (unlikely(pmd_trans_migrating(*vmf->pmd))) { |
| 1222 | page = pmd_page(*fe->pmd); | 1222 | page = pmd_page(*vmf->pmd); |
| 1223 | spin_unlock(fe->ptl); | 1223 | spin_unlock(vmf->ptl); |
| 1224 | wait_on_page_locked(page); | 1224 | wait_on_page_locked(page); |
| 1225 | goto out; | 1225 | goto out; |
| 1226 | } | 1226 | } |
| @@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
| 1253 | 1253 | ||
| 1254 | /* Migration could have started since the pmd_trans_migrating check */ | 1254 | /* Migration could have started since the pmd_trans_migrating check */ |
| 1255 | if (!page_locked) { | 1255 | if (!page_locked) { |
| 1256 | spin_unlock(fe->ptl); | 1256 | spin_unlock(vmf->ptl); |
| 1257 | wait_on_page_locked(page); | 1257 | wait_on_page_locked(page); |
| 1258 | page_nid = -1; | 1258 | page_nid = -1; |
| 1259 | goto out; | 1259 | goto out; |
| @@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
| 1264 | * to serialise splits | 1264 | * to serialise splits |
| 1265 | */ | 1265 | */ |
| 1266 | get_page(page); | 1266 | get_page(page); |
| 1267 | spin_unlock(fe->ptl); | 1267 | spin_unlock(vmf->ptl); |
| 1268 | anon_vma = page_lock_anon_vma_read(page); | 1268 | anon_vma = page_lock_anon_vma_read(page); |
| 1269 | 1269 | ||
| 1270 | /* Confirm the PMD did not change while page_table_lock was released */ | 1270 | /* Confirm the PMD did not change while page_table_lock was released */ |
| 1271 | spin_lock(fe->ptl); | 1271 | spin_lock(vmf->ptl); |
| 1272 | if (unlikely(!pmd_same(pmd, *fe->pmd))) { | 1272 | if (unlikely(!pmd_same(pmd, *vmf->pmd))) { |
| 1273 | unlock_page(page); | 1273 | unlock_page(page); |
| 1274 | put_page(page); | 1274 | put_page(page); |
| 1275 | page_nid = -1; | 1275 | page_nid = -1; |
| @@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) | |||
| 1287 | * Migrate the THP to the requested node, returns with page unlocked | 1287 | * Migrate the THP to the requested node, returns with page unlocked |
| 1288 | * and access rights restored. | 1288 | * and access rights restored. |
| 1289 | */ | 1289 | */ |
| 1290 | spin_unlock(fe->ptl); | 1290 | spin_unlock(vmf->ptl); |
| 1291 | migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, | 1291 | migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, |
| 1292 | fe->pmd, pmd, fe->address, page, target_nid); | 1292 | vmf->pmd, pmd, vmf->address, page, target_nid); |
| 1293 | if (migrated) { | 1293 | if (migrated) { |
| 1294 | flags |= TNF_MIGRATED; | 1294 | flags |= TNF_MIGRATED; |
| 1295 | page_nid = target_nid; | 1295 | page_nid = target_nid; |
| @@ -1304,18 +1304,19 @@ clear_pmdnuma: | |||
| 1304 | pmd = pmd_mkyoung(pmd); | 1304 | pmd = pmd_mkyoung(pmd); |
| 1305 | if (was_writable) | 1305 | if (was_writable) |
| 1306 | pmd = pmd_mkwrite(pmd); | 1306 | pmd = pmd_mkwrite(pmd); |
| 1307 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); | 1307 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); |
| 1308 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); | 1308 | update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); |
| 1309 | unlock_page(page); | 1309 | unlock_page(page); |
| 1310 | out_unlock: | 1310 | out_unlock: |
| 1311 | spin_unlock(fe->ptl); | 1311 | spin_unlock(vmf->ptl); |
| 1312 | 1312 | ||
| 1313 | out: | 1313 | out: |
| 1314 | if (anon_vma) | 1314 | if (anon_vma) |
| 1315 | page_unlock_anon_vma_read(anon_vma); | 1315 | page_unlock_anon_vma_read(anon_vma); |
| 1316 | 1316 | ||
| 1317 | if (page_nid != -1) | 1317 | if (page_nid != -1) |
| 1318 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); | 1318 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, |
| 1319 | vmf->flags); | ||
| 1319 | 1320 | ||
| 1320 | return 0; | 1321 | return 0; |
| 1321 | } | 1322 | } |
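The mm/huge_memory.c hunks above are mechanical: the per-fault context formerly called fault_env is now struct vm_fault, and the handlers read address, flags, pmd and ptl through that one descriptor (orig_pmd is still passed alongside it). As a rough illustration of why this shape scales better than long argument lists, here is a small self-contained userspace sketch; the toy_* names are invented for this example and only loosely mirror the kernel structure.

/*
 * Illustrative userspace sketch, not kernel code: per-fault state is
 * bundled into one descriptor (toy_vmf, loosely modelled on struct
 * vm_fault) so a helper takes a single pointer.
 */
#include <stdio.h>

struct toy_vmf {
	unsigned long address;	/* faulting address */
	unsigned int flags;	/* FAULT_FLAG_*-style bits */
	unsigned long orig_pmd;	/* value observed before taking the lock */
};

/* Adding another field to toy_vmf later would not change this signature. */
static int toy_huge_pmd_wp(const struct toy_vmf *vmf)
{
	unsigned long haddr = vmf->address & ~0x1fffffUL;	/* 2 MiB align */

	printf("write fault at %#lx (huge page at %#lx), flags %#x\n",
	       vmf->address, haddr, vmf->flags);
	return 0;
}

int main(void)
{
	struct toy_vmf vmf = {
		.address = 0x7f0000201234UL,
		.flags = 0x1,
		.orig_pmd = 0xabcUL,
	};

	return toy_huge_pmd_wp(&vmf);
}

Because every helper takes only the descriptor, adding a field such as orig_pte later (as the mm/memory.c hunks below do) does not ripple through the whole call chain.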
diff --git a/mm/internal.h b/mm/internal.h
index 537ac9951f5f..44d68895a9b9 100644
--- a/mm/internal.h
+++ b/mm/internal.h
| @@ -36,7 +36,7 @@ | |||
| 36 | /* Do not use these with a slab allocator */ | 36 | /* Do not use these with a slab allocator */ |
| 37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) | 37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) |
| 38 | 38 | ||
| 39 | int do_swap_page(struct fault_env *fe, pte_t orig_pte); | 39 | int do_swap_page(struct vm_fault *vmf); |
| 40 | 40 | ||
| 41 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 41 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
| 42 | unsigned long floor, unsigned long ceiling); | 42 | unsigned long floor, unsigned long ceiling); |
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 09460955e818..e32389a97030 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
| @@ -875,13 +875,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
| 875 | unsigned long address, pmd_t *pmd, | 875 | unsigned long address, pmd_t *pmd, |
| 876 | int referenced) | 876 | int referenced) |
| 877 | { | 877 | { |
| 878 | pte_t pteval; | ||
| 879 | int swapped_in = 0, ret = 0; | 878 | int swapped_in = 0, ret = 0; |
| 880 | struct fault_env fe = { | 879 | struct vm_fault vmf = { |
| 881 | .vma = vma, | 880 | .vma = vma, |
| 882 | .address = address, | 881 | .address = address, |
| 883 | .flags = FAULT_FLAG_ALLOW_RETRY, | 882 | .flags = FAULT_FLAG_ALLOW_RETRY, |
| 884 | .pmd = pmd, | 883 | .pmd = pmd, |
| 884 | .pgoff = linear_page_index(vma, address), | ||
| 885 | }; | 885 | }; |
| 886 | 886 | ||
| 887 | /* we only decide to swapin, if there is enough young ptes */ | 887 | /* we only decide to swapin, if there is enough young ptes */ |
| @@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
| 889 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | 889 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); |
| 890 | return false; | 890 | return false; |
| 891 | } | 891 | } |
| 892 | fe.pte = pte_offset_map(pmd, address); | 892 | vmf.pte = pte_offset_map(pmd, address); |
| 893 | for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; | 893 | for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; |
| 894 | fe.pte++, fe.address += PAGE_SIZE) { | 894 | vmf.pte++, vmf.address += PAGE_SIZE) { |
| 895 | pteval = *fe.pte; | 895 | vmf.orig_pte = *vmf.pte; |
| 896 | if (!is_swap_pte(pteval)) | 896 | if (!is_swap_pte(vmf.orig_pte)) |
| 897 | continue; | 897 | continue; |
| 898 | swapped_in++; | 898 | swapped_in++; |
| 899 | ret = do_swap_page(&fe, pteval); | 899 | ret = do_swap_page(&vmf); |
| 900 | 900 | ||
| 901 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ | 901 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ |
| 902 | if (ret & VM_FAULT_RETRY) { | 902 | if (ret & VM_FAULT_RETRY) { |
| 903 | down_read(&mm->mmap_sem); | 903 | down_read(&mm->mmap_sem); |
| 904 | if (hugepage_vma_revalidate(mm, address, &fe.vma)) { | 904 | if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { |
| 905 | /* vma is no longer available, don't continue to swapin */ | 905 | /* vma is no longer available, don't continue to swapin */ |
| 906 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | 906 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); |
| 907 | return false; | 907 | return false; |
| @@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, | |||
| 915 | return false; | 915 | return false; |
| 916 | } | 916 | } |
| 917 | /* pte is unmapped now, we need to map it */ | 917 | /* pte is unmapped now, we need to map it */ |
| 918 | fe.pte = pte_offset_map(pmd, fe.address); | 918 | vmf.pte = pte_offset_map(pmd, vmf.address); |
| 919 | } | 919 | } |
| 920 | fe.pte--; | 920 | vmf.pte--; |
| 921 | pte_unmap(fe.pte); | 921 | pte_unmap(vmf.pte); |
| 922 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); | 922 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); |
| 923 | return true; | 923 | return true; |
| 924 | } | 924 | } |
| @@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm, | |||
| 1446 | radix_tree_replace_slot(&mapping->page_tree, slot, | 1446 | radix_tree_replace_slot(&mapping->page_tree, slot, |
| 1447 | new_page + (index % HPAGE_PMD_NR)); | 1447 | new_page + (index % HPAGE_PMD_NR)); |
| 1448 | 1448 | ||
| 1449 | slot = radix_tree_iter_next(&iter); | 1449 | slot = radix_tree_iter_resume(slot, &iter); |
| 1450 | index++; | 1450 | index++; |
| 1451 | continue; | 1451 | continue; |
| 1452 | out_lru: | 1452 | out_lru: |
| @@ -1546,7 +1546,6 @@ tree_unlocked: | |||
| 1546 | /* Put holes back where they were */ | 1546 | /* Put holes back where they were */ |
| 1547 | radix_tree_delete(&mapping->page_tree, | 1547 | radix_tree_delete(&mapping->page_tree, |
| 1548 | iter.index); | 1548 | iter.index); |
| 1549 | slot = radix_tree_iter_next(&iter); | ||
| 1550 | continue; | 1549 | continue; |
| 1551 | } | 1550 | } |
| 1552 | 1551 | ||
| @@ -1557,11 +1556,11 @@ tree_unlocked: | |||
| 1557 | page_ref_unfreeze(page, 2); | 1556 | page_ref_unfreeze(page, 2); |
| 1558 | radix_tree_replace_slot(&mapping->page_tree, | 1557 | radix_tree_replace_slot(&mapping->page_tree, |
| 1559 | slot, page); | 1558 | slot, page); |
| 1559 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 1560 | spin_unlock_irq(&mapping->tree_lock); | 1560 | spin_unlock_irq(&mapping->tree_lock); |
| 1561 | putback_lru_page(page); | 1561 | putback_lru_page(page); |
| 1562 | unlock_page(page); | 1562 | unlock_page(page); |
| 1563 | spin_lock_irq(&mapping->tree_lock); | 1563 | spin_lock_irq(&mapping->tree_lock); |
| 1564 | slot = radix_tree_iter_next(&iter); | ||
| 1565 | } | 1564 | } |
| 1566 | VM_BUG_ON(nr_none); | 1565 | VM_BUG_ON(nr_none); |
| 1567 | spin_unlock_irq(&mapping->tree_lock); | 1566 | spin_unlock_irq(&mapping->tree_lock); |
| @@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, | |||
| 1641 | present++; | 1640 | present++; |
| 1642 | 1641 | ||
| 1643 | if (need_resched()) { | 1642 | if (need_resched()) { |
| 1643 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 1644 | cond_resched_rcu(); | 1644 | cond_resched_rcu(); |
| 1645 | slot = radix_tree_iter_next(&iter); | ||
| 1646 | } | 1645 | } |
| 1647 | } | 1646 | } |
| 1648 | rcu_read_unlock(); | 1647 | rcu_read_unlock(); |
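In the khugepaged.c hunks above, __collapse_huge_page_swapin() now builds the descriptor on its own stack, fills in pgoff, and stores each scanned entry in vmf.orig_pte before calling do_swap_page(&vmf) with no extra argument. (The later khugepaged.c hunks separately switch the shmem collapse paths from radix_tree_iter_next() to radix_tree_iter_resume(), called before dropping tree_lock or rescheduling rather than after; the sketch below covers only the swap-in caller pattern.) A hedged, userspace-only sketch of that pattern, with invented toy_* names and 0 standing in for "not a swap entry":

/*
 * Userspace-only sketch of the caller pattern in
 * __collapse_huge_page_swapin() above; all toy_* names are invented.
 * The caller fills a descriptor on the stack, then walks a range one
 * entry at a time, updating only address and orig_pte per iteration.
 */
#include <stdio.h>

#define TOY_PAGE_SIZE	4096UL
#define TOY_PTES_PER_PMD 8	/* stand-in for HPAGE_PMD_NR */

struct toy_vmf {
	unsigned long address;
	unsigned long orig_pte;
	unsigned int flags;
};

static int toy_do_swap_page(const struct toy_vmf *vmf)
{
	printf("swap in address %#lx, entry %#lx\n",
	       vmf->address, vmf->orig_pte);
	return 0;
}

int main(void)
{
	/* 0 means "not a swap entry" in this toy table. */
	unsigned long ptes[TOY_PTES_PER_PMD] = { 0, 0x42, 0, 0x43, 0, 0, 0x44, 0 };
	unsigned long start = 0x700000000000UL;
	struct toy_vmf vmf = { .address = start, .flags = 0x1 };
	int i;

	for (i = 0; i < TOY_PTES_PER_PMD; i++, vmf.address += TOY_PAGE_SIZE) {
		vmf.orig_pte = ptes[i];		/* like vmf.orig_pte = *vmf.pte */
		if (!vmf.orig_pte)
			continue;
		toy_do_swap_page(&vmf);
	}
	return 0;
}

The point of the pattern is that the loop only touches the two fields that change per iteration; everything else in the descriptor stays as it was initialised.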
diff --git a/mm/memory.c b/mm/memory.c
index 08d8da39de28..455c3e628d52 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -2034,20 +2034,17 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) | |||
| 2034 | * | 2034 | * |
| 2035 | * We do this without the lock held, so that it can sleep if it needs to. | 2035 | * We do this without the lock held, so that it can sleep if it needs to. |
| 2036 | */ | 2036 | */ |
| 2037 | static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | 2037 | static int do_page_mkwrite(struct vm_fault *vmf) |
| 2038 | unsigned long address) | ||
| 2039 | { | 2038 | { |
| 2040 | struct vm_fault vmf; | ||
| 2041 | int ret; | 2039 | int ret; |
| 2040 | struct page *page = vmf->page; | ||
| 2041 | unsigned int old_flags = vmf->flags; | ||
| 2042 | 2042 | ||
| 2043 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2043 | vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; |
| 2044 | vmf.pgoff = page->index; | ||
| 2045 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; | ||
| 2046 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
| 2047 | vmf.page = page; | ||
| 2048 | vmf.cow_page = NULL; | ||
| 2049 | 2044 | ||
| 2050 | ret = vma->vm_ops->page_mkwrite(vma, &vmf); | 2045 | ret = vmf->vma->vm_ops->page_mkwrite(vmf->vma, vmf); |
| 2046 | /* Restore original flags so that caller is not surprised */ | ||
| 2047 | vmf->flags = old_flags; | ||
| 2051 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2048 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
| 2052 | return ret; | 2049 | return ret; |
| 2053 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { | 2050 | if (unlikely(!(ret & VM_FAULT_LOCKED))) { |
| @@ -2063,6 +2060,41 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
| 2063 | } | 2060 | } |
| 2064 | 2061 | ||
| 2065 | /* | 2062 | /* |
| 2063 | * Handle dirtying of a page in shared file mapping on a write fault. | ||
| 2064 | * | ||
| 2065 | * The function expects the page to be locked and unlocks it. | ||
| 2066 | */ | ||
| 2067 | static void fault_dirty_shared_page(struct vm_area_struct *vma, | ||
| 2068 | struct page *page) | ||
| 2069 | { | ||
| 2070 | struct address_space *mapping; | ||
| 2071 | bool dirtied; | ||
| 2072 | bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; | ||
| 2073 | |||
| 2074 | dirtied = set_page_dirty(page); | ||
| 2075 | VM_BUG_ON_PAGE(PageAnon(page), page); | ||
| 2076 | /* | ||
| 2077 | * Take a local copy of the address_space - page.mapping may be zeroed | ||
| 2078 | * by truncate after unlock_page(). The address_space itself remains | ||
| 2079 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s | ||
| 2080 | * release semantics to prevent the compiler from undoing this copying. | ||
| 2081 | */ | ||
| 2082 | mapping = page_rmapping(page); | ||
| 2083 | unlock_page(page); | ||
| 2084 | |||
| 2085 | if ((dirtied || page_mkwrite) && mapping) { | ||
| 2086 | /* | ||
| 2087 | * Some device drivers do not set page.mapping | ||
| 2088 | * but still dirty their pages | ||
| 2089 | */ | ||
| 2090 | balance_dirty_pages_ratelimited(mapping); | ||
| 2091 | } | ||
| 2092 | |||
| 2093 | if (!page_mkwrite) | ||
| 2094 | file_update_time(vma->vm_file); | ||
| 2095 | } | ||
| 2096 | |||
| 2097 | /* | ||
| 2066 | * Handle write page faults for pages that can be reused in the current vma | 2098 | * Handle write page faults for pages that can be reused in the current vma |
| 2067 | * | 2099 | * |
| 2068 | * This can happen either due to the mapping being with the VM_SHARED flag, | 2100 | * This can happen either due to the mapping being with the VM_SHARED flag, |
| @@ -2070,11 +2102,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
| 2070 | * case, all we need to do here is to mark the page as writable and update | 2102 | * case, all we need to do here is to mark the page as writable and update |
| 2071 | * any related book-keeping. | 2103 | * any related book-keeping. |
| 2072 | */ | 2104 | */ |
| 2073 | static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | 2105 | static inline void wp_page_reuse(struct vm_fault *vmf) |
| 2074 | struct page *page, int page_mkwrite, int dirty_shared) | 2106 | __releases(vmf->ptl) |
| 2075 | __releases(fe->ptl) | ||
| 2076 | { | 2107 | { |
| 2077 | struct vm_area_struct *vma = fe->vma; | 2108 | struct vm_area_struct *vma = vmf->vma; |
| 2109 | struct page *page = vmf->page; | ||
| 2078 | pte_t entry; | 2110 | pte_t entry; |
| 2079 | /* | 2111 | /* |
| 2080 | * Clear the pages cpupid information as the existing | 2112 | * Clear the pages cpupid information as the existing |
| @@ -2084,39 +2116,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | |||
| 2084 | if (page) | 2116 | if (page) |
| 2085 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); | 2117 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
| 2086 | 2118 | ||
| 2087 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); | 2119 | flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); |
| 2088 | entry = pte_mkyoung(orig_pte); | 2120 | entry = pte_mkyoung(vmf->orig_pte); |
| 2089 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2121 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2090 | if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) | 2122 | if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) |
| 2091 | update_mmu_cache(vma, fe->address, fe->pte); | 2123 | update_mmu_cache(vma, vmf->address, vmf->pte); |
| 2092 | pte_unmap_unlock(fe->pte, fe->ptl); | 2124 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2093 | |||
| 2094 | if (dirty_shared) { | ||
| 2095 | struct address_space *mapping; | ||
| 2096 | int dirtied; | ||
| 2097 | |||
| 2098 | if (!page_mkwrite) | ||
| 2099 | lock_page(page); | ||
| 2100 | |||
| 2101 | dirtied = set_page_dirty(page); | ||
| 2102 | VM_BUG_ON_PAGE(PageAnon(page), page); | ||
| 2103 | mapping = page->mapping; | ||
| 2104 | unlock_page(page); | ||
| 2105 | put_page(page); | ||
| 2106 | |||
| 2107 | if ((dirtied || page_mkwrite) && mapping) { | ||
| 2108 | /* | ||
| 2109 | * Some device drivers do not set page.mapping | ||
| 2110 | * but still dirty their pages | ||
| 2111 | */ | ||
| 2112 | balance_dirty_pages_ratelimited(mapping); | ||
| 2113 | } | ||
| 2114 | |||
| 2115 | if (!page_mkwrite) | ||
| 2116 | file_update_time(vma->vm_file); | ||
| 2117 | } | ||
| 2118 | |||
| 2119 | return VM_FAULT_WRITE; | ||
| 2120 | } | 2125 | } |
| 2121 | 2126 | ||
| 2122 | /* | 2127 | /* |
| @@ -2135,31 +2140,32 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, | |||
| 2135 | * held to the old page, as well as updating the rmap. | 2140 | * held to the old page, as well as updating the rmap. |
| 2136 | * - In any case, unlock the PTL and drop the reference we took to the old page. | 2141 | * - In any case, unlock the PTL and drop the reference we took to the old page. |
| 2137 | */ | 2142 | */ |
| 2138 | static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | 2143 | static int wp_page_copy(struct vm_fault *vmf) |
| 2139 | struct page *old_page) | ||
| 2140 | { | 2144 | { |
| 2141 | struct vm_area_struct *vma = fe->vma; | 2145 | struct vm_area_struct *vma = vmf->vma; |
| 2142 | struct mm_struct *mm = vma->vm_mm; | 2146 | struct mm_struct *mm = vma->vm_mm; |
| 2147 | struct page *old_page = vmf->page; | ||
| 2143 | struct page *new_page = NULL; | 2148 | struct page *new_page = NULL; |
| 2144 | pte_t entry; | 2149 | pte_t entry; |
| 2145 | int page_copied = 0; | 2150 | int page_copied = 0; |
| 2146 | const unsigned long mmun_start = fe->address & PAGE_MASK; | 2151 | const unsigned long mmun_start = vmf->address & PAGE_MASK; |
| 2147 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; | 2152 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; |
| 2148 | struct mem_cgroup *memcg; | 2153 | struct mem_cgroup *memcg; |
| 2149 | 2154 | ||
| 2150 | if (unlikely(anon_vma_prepare(vma))) | 2155 | if (unlikely(anon_vma_prepare(vma))) |
| 2151 | goto oom; | 2156 | goto oom; |
| 2152 | 2157 | ||
| 2153 | if (is_zero_pfn(pte_pfn(orig_pte))) { | 2158 | if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { |
| 2154 | new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); | 2159 | new_page = alloc_zeroed_user_highpage_movable(vma, |
| 2160 | vmf->address); | ||
| 2155 | if (!new_page) | 2161 | if (!new_page) |
| 2156 | goto oom; | 2162 | goto oom; |
| 2157 | } else { | 2163 | } else { |
| 2158 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, | 2164 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, |
| 2159 | fe->address); | 2165 | vmf->address); |
| 2160 | if (!new_page) | 2166 | if (!new_page) |
| 2161 | goto oom; | 2167 | goto oom; |
| 2162 | cow_user_page(new_page, old_page, fe->address, vma); | 2168 | cow_user_page(new_page, old_page, vmf->address, vma); |
| 2163 | } | 2169 | } |
| 2164 | 2170 | ||
| 2165 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) | 2171 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) |
| @@ -2172,8 +2178,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
| 2172 | /* | 2178 | /* |
| 2173 | * Re-check the pte - we dropped the lock | 2179 | * Re-check the pte - we dropped the lock |
| 2174 | */ | 2180 | */ |
| 2175 | fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); | 2181 | vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); |
| 2176 | if (likely(pte_same(*fe->pte, orig_pte))) { | 2182 | if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { |
| 2177 | if (old_page) { | 2183 | if (old_page) { |
| 2178 | if (!PageAnon(old_page)) { | 2184 | if (!PageAnon(old_page)) { |
| 2179 | dec_mm_counter_fast(mm, | 2185 | dec_mm_counter_fast(mm, |
| @@ -2183,7 +2189,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
| 2183 | } else { | 2189 | } else { |
| 2184 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2190 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2185 | } | 2191 | } |
| 2186 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); | 2192 | flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); |
| 2187 | entry = mk_pte(new_page, vma->vm_page_prot); | 2193 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 2188 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2194 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2189 | /* | 2195 | /* |
| @@ -2192,8 +2198,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
| 2192 | * seen in the presence of one thread doing SMC and another | 2198 | * seen in the presence of one thread doing SMC and another |
| 2193 | * thread doing COW. | 2199 | * thread doing COW. |
| 2194 | */ | 2200 | */ |
| 2195 | ptep_clear_flush_notify(vma, fe->address, fe->pte); | 2201 | ptep_clear_flush_notify(vma, vmf->address, vmf->pte); |
| 2196 | page_add_new_anon_rmap(new_page, vma, fe->address, false); | 2202 | page_add_new_anon_rmap(new_page, vma, vmf->address, false); |
| 2197 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 2203 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
| 2198 | lru_cache_add_active_or_unevictable(new_page, vma); | 2204 | lru_cache_add_active_or_unevictable(new_page, vma); |
| 2199 | /* | 2205 | /* |
| @@ -2201,8 +2207,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
| 2201 | * mmu page tables (such as kvm shadow page tables), we want the | 2207 | * mmu page tables (such as kvm shadow page tables), we want the |
| 2202 | * new page to be mapped directly into the secondary page table. | 2208 | * new page to be mapped directly into the secondary page table. |
| 2203 | */ | 2209 | */ |
| 2204 | set_pte_at_notify(mm, fe->address, fe->pte, entry); | 2210 | set_pte_at_notify(mm, vmf->address, vmf->pte, entry); |
| 2205 | update_mmu_cache(vma, fe->address, fe->pte); | 2211 | update_mmu_cache(vma, vmf->address, vmf->pte); |
| 2206 | if (old_page) { | 2212 | if (old_page) { |
| 2207 | /* | 2213 | /* |
| 2208 | * Only after switching the pte to the new page may | 2214 | * Only after switching the pte to the new page may |
| @@ -2239,7 +2245,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, | |||
| 2239 | if (new_page) | 2245 | if (new_page) |
| 2240 | put_page(new_page); | 2246 | put_page(new_page); |
| 2241 | 2247 | ||
| 2242 | pte_unmap_unlock(fe->pte, fe->ptl); | 2248 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2243 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2249 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 2244 | if (old_page) { | 2250 | if (old_page) { |
| 2245 | /* | 2251 | /* |
| @@ -2263,79 +2269,91 @@ oom: | |||
| 2263 | return VM_FAULT_OOM; | 2269 | return VM_FAULT_OOM; |
| 2264 | } | 2270 | } |
| 2265 | 2271 | ||
| 2272 | /** | ||
| 2273 | * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE | ||
| 2274 | * writeable once the page is prepared | ||
| 2275 | * | ||
| 2276 | * @vmf: structure describing the fault | ||
| 2277 | * | ||
| 2278 | * This function handles all that is needed to finish a write page fault in a | ||
| 2279 | * shared mapping due to PTE being read-only once the mapped page is prepared. | ||
| 2280 | * It handles locking of PTE and modifying it. The function returns 0 on | ||
| 2281 | * success and VM_FAULT_NOPAGE when the PTE got changed before we acquired | ||
| 2282 | * the PTE lock. | ||
| 2283 | * | ||
| 2284 | * The function expects the page to be locked or other protection against | ||
| 2285 | * concurrent faults / writeback (such as DAX radix tree locks). | ||
| 2286 | */ | ||
| 2287 | int finish_mkwrite_fault(struct vm_fault *vmf) | ||
| 2288 | { | ||
| 2289 | WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); | ||
| 2290 | vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, | ||
| 2291 | &vmf->ptl); | ||
| 2292 | /* | ||
| 2293 | * We might have raced with another page fault while we released the | ||
| 2294 | * pte_offset_map_lock. | ||
| 2295 | */ | ||
| 2296 | if (!pte_same(*vmf->pte, vmf->orig_pte)) { | ||
| 2297 | pte_unmap_unlock(vmf->pte, vmf->ptl); | ||
| 2298 | return VM_FAULT_NOPAGE; | ||
| 2299 | } | ||
| 2300 | wp_page_reuse(vmf); | ||
| 2301 | return 0; | ||
| 2302 | } | ||
| 2303 | |||
| 2266 | /* | 2304 | /* |
| 2267 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | 2305 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED |
| 2268 | * mapping | 2306 | * mapping |
| 2269 | */ | 2307 | */ |
| 2270 | static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) | 2308 | static int wp_pfn_shared(struct vm_fault *vmf) |
| 2271 | { | 2309 | { |
| 2272 | struct vm_area_struct *vma = fe->vma; | 2310 | struct vm_area_struct *vma = vmf->vma; |
| 2273 | 2311 | ||
| 2274 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | 2312 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { |
| 2275 | struct vm_fault vmf = { | ||
| 2276 | .page = NULL, | ||
| 2277 | .pgoff = linear_page_index(vma, fe->address), | ||
| 2278 | .virtual_address = | ||
| 2279 | (void __user *)(fe->address & PAGE_MASK), | ||
| 2280 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | ||
| 2281 | }; | ||
| 2282 | int ret; | 2313 | int ret; |
| 2283 | 2314 | ||
| 2284 | pte_unmap_unlock(fe->pte, fe->ptl); | 2315 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2285 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | 2316 | vmf->flags |= FAULT_FLAG_MKWRITE; |
| 2286 | if (ret & VM_FAULT_ERROR) | 2317 | ret = vma->vm_ops->pfn_mkwrite(vma, vmf); |
| 2318 | if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) | ||
| 2287 | return ret; | 2319 | return ret; |
| 2288 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2320 | return finish_mkwrite_fault(vmf); |
| 2289 | &fe->ptl); | ||
| 2290 | /* | ||
| 2291 | * We might have raced with another page fault while we | ||
| 2292 | * released the pte_offset_map_lock. | ||
| 2293 | */ | ||
| 2294 | if (!pte_same(*fe->pte, orig_pte)) { | ||
| 2295 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
| 2296 | return 0; | ||
| 2297 | } | ||
| 2298 | } | 2321 | } |
| 2299 | return wp_page_reuse(fe, orig_pte, NULL, 0, 0); | 2322 | wp_page_reuse(vmf); |
| 2323 | return VM_FAULT_WRITE; | ||
| 2300 | } | 2324 | } |
| 2301 | 2325 | ||
| 2302 | static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, | 2326 | static int wp_page_shared(struct vm_fault *vmf) |
| 2303 | struct page *old_page) | 2327 | __releases(vmf->ptl) |
| 2304 | __releases(fe->ptl) | ||
| 2305 | { | 2328 | { |
| 2306 | struct vm_area_struct *vma = fe->vma; | 2329 | struct vm_area_struct *vma = vmf->vma; |
| 2307 | int page_mkwrite = 0; | ||
| 2308 | 2330 | ||
| 2309 | get_page(old_page); | 2331 | get_page(vmf->page); |
| 2310 | 2332 | ||
| 2311 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2333 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
| 2312 | int tmp; | 2334 | int tmp; |
| 2313 | 2335 | ||
| 2314 | pte_unmap_unlock(fe->pte, fe->ptl); | 2336 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2315 | tmp = do_page_mkwrite(vma, old_page, fe->address); | 2337 | tmp = do_page_mkwrite(vmf); |
| 2316 | if (unlikely(!tmp || (tmp & | 2338 | if (unlikely(!tmp || (tmp & |
| 2317 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2339 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
| 2318 | put_page(old_page); | 2340 | put_page(vmf->page); |
| 2319 | return tmp; | 2341 | return tmp; |
| 2320 | } | 2342 | } |
| 2321 | /* | 2343 | tmp = finish_mkwrite_fault(vmf); |
| 2322 | * Since we dropped the lock we need to revalidate | 2344 | if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { |
| 2323 | * the PTE as someone else may have changed it. If | 2345 | unlock_page(vmf->page); |
| 2324 | * they did, we just return, as we can count on the | 2346 | put_page(vmf->page); |
| 2325 | * MMU to tell us if they didn't also make it writable. | 2347 | return tmp; |
| 2326 | */ | ||
| 2327 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | ||
| 2328 | &fe->ptl); | ||
| 2329 | if (!pte_same(*fe->pte, orig_pte)) { | ||
| 2330 | unlock_page(old_page); | ||
| 2331 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
| 2332 | put_page(old_page); | ||
| 2333 | return 0; | ||
| 2334 | } | 2348 | } |
| 2335 | page_mkwrite = 1; | 2349 | } else { |
| 2350 | wp_page_reuse(vmf); | ||
| 2351 | lock_page(vmf->page); | ||
| 2336 | } | 2352 | } |
| 2353 | fault_dirty_shared_page(vma, vmf->page); | ||
| 2354 | put_page(vmf->page); | ||
| 2337 | 2355 | ||
| 2338 | return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); | 2356 | return VM_FAULT_WRITE; |
| 2339 | } | 2357 | } |
| 2340 | 2358 | ||
| 2341 | /* | 2359 | /* |
| @@ -2356,14 +2374,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, | |||
| 2356 | * but allow concurrent faults), with pte both mapped and locked. | 2374 | * but allow concurrent faults), with pte both mapped and locked. |
| 2357 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2375 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
| 2358 | */ | 2376 | */ |
| 2359 | static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | 2377 | static int do_wp_page(struct vm_fault *vmf) |
| 2360 | __releases(fe->ptl) | 2378 | __releases(vmf->ptl) |
| 2361 | { | 2379 | { |
| 2362 | struct vm_area_struct *vma = fe->vma; | 2380 | struct vm_area_struct *vma = vmf->vma; |
| 2363 | struct page *old_page; | ||
| 2364 | 2381 | ||
| 2365 | old_page = vm_normal_page(vma, fe->address, orig_pte); | 2382 | vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); |
| 2366 | if (!old_page) { | 2383 | if (!vmf->page) { |
| 2367 | /* | 2384 | /* |
| 2368 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | 2385 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
| 2369 | * VM_PFNMAP VMA. | 2386 | * VM_PFNMAP VMA. |
| @@ -2373,33 +2390,33 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2373 | */ | 2390 | */ |
| 2374 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2391 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2375 | (VM_WRITE|VM_SHARED)) | 2392 | (VM_WRITE|VM_SHARED)) |
| 2376 | return wp_pfn_shared(fe, orig_pte); | 2393 | return wp_pfn_shared(vmf); |
| 2377 | 2394 | ||
| 2378 | pte_unmap_unlock(fe->pte, fe->ptl); | 2395 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2379 | return wp_page_copy(fe, orig_pte, old_page); | 2396 | return wp_page_copy(vmf); |
| 2380 | } | 2397 | } |
| 2381 | 2398 | ||
| 2382 | /* | 2399 | /* |
| 2383 | * Take out anonymous pages first, anonymous shared vmas are | 2400 | * Take out anonymous pages first, anonymous shared vmas are |
| 2384 | * not dirty accountable. | 2401 | * not dirty accountable. |
| 2385 | */ | 2402 | */ |
| 2386 | if (PageAnon(old_page) && !PageKsm(old_page)) { | 2403 | if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { |
| 2387 | int total_mapcount; | 2404 | int total_mapcount; |
| 2388 | if (!trylock_page(old_page)) { | 2405 | if (!trylock_page(vmf->page)) { |
| 2389 | get_page(old_page); | 2406 | get_page(vmf->page); |
| 2390 | pte_unmap_unlock(fe->pte, fe->ptl); | 2407 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2391 | lock_page(old_page); | 2408 | lock_page(vmf->page); |
| 2392 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, | 2409 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
| 2393 | fe->address, &fe->ptl); | 2410 | vmf->address, &vmf->ptl); |
| 2394 | if (!pte_same(*fe->pte, orig_pte)) { | 2411 | if (!pte_same(*vmf->pte, vmf->orig_pte)) { |
| 2395 | unlock_page(old_page); | 2412 | unlock_page(vmf->page); |
| 2396 | pte_unmap_unlock(fe->pte, fe->ptl); | 2413 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2397 | put_page(old_page); | 2414 | put_page(vmf->page); |
| 2398 | return 0; | 2415 | return 0; |
| 2399 | } | 2416 | } |
| 2400 | put_page(old_page); | 2417 | put_page(vmf->page); |
| 2401 | } | 2418 | } |
| 2402 | if (reuse_swap_page(old_page, &total_mapcount)) { | 2419 | if (reuse_swap_page(vmf->page, &total_mapcount)) { |
| 2403 | if (total_mapcount == 1) { | 2420 | if (total_mapcount == 1) { |
| 2404 | /* | 2421 | /* |
| 2405 | * The page is all ours. Move it to | 2422 | * The page is all ours. Move it to |
| @@ -2408,24 +2425,25 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2408 | * Protected against the rmap code by | 2425 | * Protected against the rmap code by |
| 2409 | * the page lock. | 2426 | * the page lock. |
| 2410 | */ | 2427 | */ |
| 2411 | page_move_anon_rmap(old_page, vma); | 2428 | page_move_anon_rmap(vmf->page, vma); |
| 2412 | } | 2429 | } |
| 2413 | unlock_page(old_page); | 2430 | unlock_page(vmf->page); |
| 2414 | return wp_page_reuse(fe, orig_pte, old_page, 0, 0); | 2431 | wp_page_reuse(vmf); |
| 2432 | return VM_FAULT_WRITE; | ||
| 2415 | } | 2433 | } |
| 2416 | unlock_page(old_page); | 2434 | unlock_page(vmf->page); |
| 2417 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2435 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2418 | (VM_WRITE|VM_SHARED))) { | 2436 | (VM_WRITE|VM_SHARED))) { |
| 2419 | return wp_page_shared(fe, orig_pte, old_page); | 2437 | return wp_page_shared(vmf); |
| 2420 | } | 2438 | } |
| 2421 | 2439 | ||
| 2422 | /* | 2440 | /* |
| 2423 | * Ok, we need to copy. Oh, well.. | 2441 | * Ok, we need to copy. Oh, well.. |
| 2424 | */ | 2442 | */ |
| 2425 | get_page(old_page); | 2443 | get_page(vmf->page); |
| 2426 | 2444 | ||
| 2427 | pte_unmap_unlock(fe->pte, fe->ptl); | 2445 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2428 | return wp_page_copy(fe, orig_pte, old_page); | 2446 | return wp_page_copy(vmf); |
| 2429 | } | 2447 | } |
| 2430 | 2448 | ||
| 2431 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2449 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
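The wp_page_shared()/wp_pfn_shared() rework above funnels both paths through the new finish_mkwrite_fault(): the entry value saved in vmf->orig_pte is the snapshot, the page-table lock is dropped so ->page_mkwrite or ->pfn_mkwrite can sleep, and the lock is re-taken with the fault proceeding only if the entry still matches. The following toy model (plain C, single-threaded, with a pthread mutex standing in for the PTL and invented toy_* names) sketches that check-after-relock shape; it is illustrative, not the kernel implementation.

/*
 * Toy model of the check-after-relock shape in finish_mkwrite_fault()
 * above; plain C, single-threaded, with a pthread mutex standing in
 * for the page-table lock. All toy_* names are invented.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t toy_ptl = PTHREAD_MUTEX_INITIALIZER;
static unsigned long toy_pte = 0x100;	/* stand-in for *vmf->pte */

struct toy_vmf {
	unsigned long orig_pte;		/* snapshot taken under the lock */
};

static int toy_page_mkwrite(void)
{
	/* a real hook might sleep or start writeback here */
	return 0;
}

static int toy_finish_mkwrite_fault(struct toy_vmf *vmf)
{
	pthread_mutex_lock(&toy_ptl);
	if (toy_pte != vmf->orig_pte) {
		/* entry changed while unlocked: bail, like VM_FAULT_NOPAGE */
		pthread_mutex_unlock(&toy_ptl);
		return -1;
	}
	toy_pte |= 0x2;			/* "make writable", like wp_page_reuse() */
	pthread_mutex_unlock(&toy_ptl);
	return 0;
}

int main(void)
{
	struct toy_vmf vmf;

	pthread_mutex_lock(&toy_ptl);
	vmf.orig_pte = toy_pte;		/* observe the entry under the lock */
	pthread_mutex_unlock(&toy_ptl);	/* drop it before calling the hook */

	if (toy_page_mkwrite())
		return 1;
	if (toy_finish_mkwrite_fault(&vmf))
		printf("entry changed while unlocked, caller must retry\n");
	else
		printf("entry unchanged, now %#lx\n", toy_pte);
	return 0;
}

Splitting the relock-and-recheck step out this way is what lets the DAX and page-cache write-fault paths share one helper instead of each open-coding the pte_same() dance.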
| @@ -2513,9 +2531,9 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
| 2513 | * We return with the mmap_sem locked or unlocked in the same cases | 2531 | * We return with the mmap_sem locked or unlocked in the same cases |
| 2514 | * as does filemap_fault(). | 2532 | * as does filemap_fault(). |
| 2515 | */ | 2533 | */ |
| 2516 | int do_swap_page(struct fault_env *fe, pte_t orig_pte) | 2534 | int do_swap_page(struct vm_fault *vmf) |
| 2517 | { | 2535 | { |
| 2518 | struct vm_area_struct *vma = fe->vma; | 2536 | struct vm_area_struct *vma = vmf->vma; |
| 2519 | struct page *page, *swapcache; | 2537 | struct page *page, *swapcache; |
| 2520 | struct mem_cgroup *memcg; | 2538 | struct mem_cgroup *memcg; |
| 2521 | swp_entry_t entry; | 2539 | swp_entry_t entry; |
| @@ -2524,17 +2542,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2524 | int exclusive = 0; | 2542 | int exclusive = 0; |
| 2525 | int ret = 0; | 2543 | int ret = 0; |
| 2526 | 2544 | ||
| 2527 | if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) | 2545 | if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) |
| 2528 | goto out; | 2546 | goto out; |
| 2529 | 2547 | ||
| 2530 | entry = pte_to_swp_entry(orig_pte); | 2548 | entry = pte_to_swp_entry(vmf->orig_pte); |
| 2531 | if (unlikely(non_swap_entry(entry))) { | 2549 | if (unlikely(non_swap_entry(entry))) { |
| 2532 | if (is_migration_entry(entry)) { | 2550 | if (is_migration_entry(entry)) { |
| 2533 | migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); | 2551 | migration_entry_wait(vma->vm_mm, vmf->pmd, |
| 2552 | vmf->address); | ||
| 2534 | } else if (is_hwpoison_entry(entry)) { | 2553 | } else if (is_hwpoison_entry(entry)) { |
| 2535 | ret = VM_FAULT_HWPOISON; | 2554 | ret = VM_FAULT_HWPOISON; |
| 2536 | } else { | 2555 | } else { |
| 2537 | print_bad_pte(vma, fe->address, orig_pte, NULL); | 2556 | print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); |
| 2538 | ret = VM_FAULT_SIGBUS; | 2557 | ret = VM_FAULT_SIGBUS; |
| 2539 | } | 2558 | } |
| 2540 | goto out; | 2559 | goto out; |
| @@ -2542,16 +2561,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2542 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2561 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
| 2543 | page = lookup_swap_cache(entry); | 2562 | page = lookup_swap_cache(entry); |
| 2544 | if (!page) { | 2563 | if (!page) { |
| 2545 | page = swapin_readahead(entry, | 2564 | page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, |
| 2546 | GFP_HIGHUSER_MOVABLE, vma, fe->address); | 2565 | vmf->address); |
| 2547 | if (!page) { | 2566 | if (!page) { |
| 2548 | /* | 2567 | /* |
| 2549 | * Back out if somebody else faulted in this pte | 2568 | * Back out if somebody else faulted in this pte |
| 2550 | * while we released the pte lock. | 2569 | * while we released the pte lock. |
| 2551 | */ | 2570 | */ |
| 2552 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, | 2571 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
| 2553 | fe->address, &fe->ptl); | 2572 | vmf->address, &vmf->ptl); |
| 2554 | if (likely(pte_same(*fe->pte, orig_pte))) | 2573 | if (likely(pte_same(*vmf->pte, vmf->orig_pte))) |
| 2555 | ret = VM_FAULT_OOM; | 2574 | ret = VM_FAULT_OOM; |
| 2556 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2575 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
| 2557 | goto unlock; | 2576 | goto unlock; |
| @@ -2573,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2573 | } | 2592 | } |
| 2574 | 2593 | ||
| 2575 | swapcache = page; | 2594 | swapcache = page; |
| 2576 | locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); | 2595 | locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); |
| 2577 | 2596 | ||
| 2578 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2597 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
| 2579 | if (!locked) { | 2598 | if (!locked) { |
| @@ -2590,7 +2609,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2590 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2609 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
| 2591 | goto out_page; | 2610 | goto out_page; |
| 2592 | 2611 | ||
| 2593 | page = ksm_might_need_to_copy(page, vma, fe->address); | 2612 | page = ksm_might_need_to_copy(page, vma, vmf->address); |
| 2594 | if (unlikely(!page)) { | 2613 | if (unlikely(!page)) { |
| 2595 | ret = VM_FAULT_OOM; | 2614 | ret = VM_FAULT_OOM; |
| 2596 | page = swapcache; | 2615 | page = swapcache; |
| @@ -2606,9 +2625,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2606 | /* | 2625 | /* |
| 2607 | * Back out if somebody else already faulted in this pte. | 2626 | * Back out if somebody else already faulted in this pte. |
| 2608 | */ | 2627 | */ |
| 2609 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2628 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
| 2610 | &fe->ptl); | 2629 | &vmf->ptl); |
| 2611 | if (unlikely(!pte_same(*fe->pte, orig_pte))) | 2630 | if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) |
| 2612 | goto out_nomap; | 2631 | goto out_nomap; |
| 2613 | 2632 | ||
| 2614 | if (unlikely(!PageUptodate(page))) { | 2633 | if (unlikely(!PageUptodate(page))) { |
| @@ -2629,22 +2648,23 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2629 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2648 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
| 2630 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); | 2649 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); |
| 2631 | pte = mk_pte(page, vma->vm_page_prot); | 2650 | pte = mk_pte(page, vma->vm_page_prot); |
| 2632 | if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { | 2651 | if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { |
| 2633 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2652 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
| 2634 | fe->flags &= ~FAULT_FLAG_WRITE; | 2653 | vmf->flags &= ~FAULT_FLAG_WRITE; |
| 2635 | ret |= VM_FAULT_WRITE; | 2654 | ret |= VM_FAULT_WRITE; |
| 2636 | exclusive = RMAP_EXCLUSIVE; | 2655 | exclusive = RMAP_EXCLUSIVE; |
| 2637 | } | 2656 | } |
| 2638 | flush_icache_page(vma, page); | 2657 | flush_icache_page(vma, page); |
| 2639 | if (pte_swp_soft_dirty(orig_pte)) | 2658 | if (pte_swp_soft_dirty(vmf->orig_pte)) |
| 2640 | pte = pte_mksoft_dirty(pte); | 2659 | pte = pte_mksoft_dirty(pte); |
| 2641 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); | 2660 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); |
| 2661 | vmf->orig_pte = pte; | ||
| 2642 | if (page == swapcache) { | 2662 | if (page == swapcache) { |
| 2643 | do_page_add_anon_rmap(page, vma, fe->address, exclusive); | 2663 | do_page_add_anon_rmap(page, vma, vmf->address, exclusive); |
| 2644 | mem_cgroup_commit_charge(page, memcg, true, false); | 2664 | mem_cgroup_commit_charge(page, memcg, true, false); |
| 2645 | activate_page(page); | 2665 | activate_page(page); |
| 2646 | } else { /* ksm created a completely new copy */ | 2666 | } else { /* ksm created a completely new copy */ |
| 2647 | page_add_new_anon_rmap(page, vma, fe->address, false); | 2667 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
| 2648 | mem_cgroup_commit_charge(page, memcg, false, false); | 2668 | mem_cgroup_commit_charge(page, memcg, false, false); |
| 2649 | lru_cache_add_active_or_unevictable(page, vma); | 2669 | lru_cache_add_active_or_unevictable(page, vma); |
| 2650 | } | 2670 | } |
| @@ -2667,22 +2687,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) | |||
| 2667 | put_page(swapcache); | 2687 | put_page(swapcache); |
| 2668 | } | 2688 | } |
| 2669 | 2689 | ||
| 2670 | if (fe->flags & FAULT_FLAG_WRITE) { | 2690 | if (vmf->flags & FAULT_FLAG_WRITE) { |
| 2671 | ret |= do_wp_page(fe, pte); | 2691 | ret |= do_wp_page(vmf); |
| 2672 | if (ret & VM_FAULT_ERROR) | 2692 | if (ret & VM_FAULT_ERROR) |
| 2673 | ret &= VM_FAULT_ERROR; | 2693 | ret &= VM_FAULT_ERROR; |
| 2674 | goto out; | 2694 | goto out; |
| 2675 | } | 2695 | } |
| 2676 | 2696 | ||
| 2677 | /* No need to invalidate - it was non-present before */ | 2697 | /* No need to invalidate - it was non-present before */ |
| 2678 | update_mmu_cache(vma, fe->address, fe->pte); | 2698 | update_mmu_cache(vma, vmf->address, vmf->pte); |
| 2679 | unlock: | 2699 | unlock: |
| 2680 | pte_unmap_unlock(fe->pte, fe->ptl); | 2700 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2681 | out: | 2701 | out: |
| 2682 | return ret; | 2702 | return ret; |
| 2683 | out_nomap: | 2703 | out_nomap: |
| 2684 | mem_cgroup_cancel_charge(page, memcg, false); | 2704 | mem_cgroup_cancel_charge(page, memcg, false); |
| 2685 | pte_unmap_unlock(fe->pte, fe->ptl); | 2705 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2686 | out_page: | 2706 | out_page: |
| 2687 | unlock_page(page); | 2707 | unlock_page(page); |
| 2688 | out_release: | 2708 | out_release: |
| @@ -2733,9 +2753,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
| 2733 | * but allow concurrent faults), and pte mapped but not yet locked. | 2753 | * but allow concurrent faults), and pte mapped but not yet locked. |
| 2734 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2754 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
| 2735 | */ | 2755 | */ |
| 2736 | static int do_anonymous_page(struct fault_env *fe) | 2756 | static int do_anonymous_page(struct vm_fault *vmf) |
| 2737 | { | 2757 | { |
| 2738 | struct vm_area_struct *vma = fe->vma; | 2758 | struct vm_area_struct *vma = vmf->vma; |
| 2739 | struct mem_cgroup *memcg; | 2759 | struct mem_cgroup *memcg; |
| 2740 | struct page *page; | 2760 | struct page *page; |
| 2741 | pte_t entry; | 2761 | pte_t entry; |
| @@ -2745,7 +2765,7 @@ static int do_anonymous_page(struct fault_env *fe) | |||
| 2745 | return VM_FAULT_SIGBUS; | 2765 | return VM_FAULT_SIGBUS; |
| 2746 | 2766 | ||
| 2747 | /* Check if we need to add a guard page to the stack */ | 2767 | /* Check if we need to add a guard page to the stack */ |
| 2748 | if (check_stack_guard_page(vma, fe->address) < 0) | 2768 | if (check_stack_guard_page(vma, vmf->address) < 0) |
| 2749 | return VM_FAULT_SIGSEGV; | 2769 | return VM_FAULT_SIGSEGV; |
| 2750 | 2770 | ||
| 2751 | /* | 2771 | /* |
| @@ -2758,26 +2778,26 @@ static int do_anonymous_page(struct fault_env *fe) | |||
| 2758 | * | 2778 | * |
| 2759 | * Here we only have down_read(mmap_sem). | 2779 | * Here we only have down_read(mmap_sem). |
| 2760 | */ | 2780 | */ |
| 2761 | if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) | 2781 | if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) |
| 2762 | return VM_FAULT_OOM; | 2782 | return VM_FAULT_OOM; |
| 2763 | 2783 | ||
| 2764 | /* See the comment in pte_alloc_one_map() */ | 2784 | /* See the comment in pte_alloc_one_map() */ |
| 2765 | if (unlikely(pmd_trans_unstable(fe->pmd))) | 2785 | if (unlikely(pmd_trans_unstable(vmf->pmd))) |
| 2766 | return 0; | 2786 | return 0; |
| 2767 | 2787 | ||
| 2768 | /* Use the zero-page for reads */ | 2788 | /* Use the zero-page for reads */ |
| 2769 | if (!(fe->flags & FAULT_FLAG_WRITE) && | 2789 | if (!(vmf->flags & FAULT_FLAG_WRITE) && |
| 2770 | !mm_forbids_zeropage(vma->vm_mm)) { | 2790 | !mm_forbids_zeropage(vma->vm_mm)) { |
| 2771 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), | 2791 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), |
| 2772 | vma->vm_page_prot)); | 2792 | vma->vm_page_prot)); |
| 2773 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2793 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, |
| 2774 | &fe->ptl); | 2794 | vmf->address, &vmf->ptl); |
| 2775 | if (!pte_none(*fe->pte)) | 2795 | if (!pte_none(*vmf->pte)) |
| 2776 | goto unlock; | 2796 | goto unlock; |
| 2777 | /* Deliver the page fault to userland, check inside PT lock */ | 2797 | /* Deliver the page fault to userland, check inside PT lock */ |
| 2778 | if (userfaultfd_missing(vma)) { | 2798 | if (userfaultfd_missing(vma)) { |
| 2779 | pte_unmap_unlock(fe->pte, fe->ptl); | 2799 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2780 | return handle_userfault(fe, VM_UFFD_MISSING); | 2800 | return handle_userfault(vmf, VM_UFFD_MISSING); |
| 2781 | } | 2801 | } |
| 2782 | goto setpte; | 2802 | goto setpte; |
| 2783 | } | 2803 | } |
| @@ -2785,7 +2805,7 @@ static int do_anonymous_page(struct fault_env *fe) | |||
| 2785 | /* Allocate our own private page. */ | 2805 | /* Allocate our own private page. */ |
| 2786 | if (unlikely(anon_vma_prepare(vma))) | 2806 | if (unlikely(anon_vma_prepare(vma))) |
| 2787 | goto oom; | 2807 | goto oom; |
| 2788 | page = alloc_zeroed_user_highpage_movable(vma, fe->address); | 2808 | page = alloc_zeroed_user_highpage_movable(vma, vmf->address); |
| 2789 | if (!page) | 2809 | if (!page) |
| 2790 | goto oom; | 2810 | goto oom; |
| 2791 | 2811 | ||
| @@ -2803,30 +2823,30 @@ static int do_anonymous_page(struct fault_env *fe) | |||
| 2803 | if (vma->vm_flags & VM_WRITE) | 2823 | if (vma->vm_flags & VM_WRITE) |
| 2804 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2824 | entry = pte_mkwrite(pte_mkdirty(entry)); |
| 2805 | 2825 | ||
| 2806 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2826 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
| 2807 | &fe->ptl); | 2827 | &vmf->ptl); |
| 2808 | if (!pte_none(*fe->pte)) | 2828 | if (!pte_none(*vmf->pte)) |
| 2809 | goto release; | 2829 | goto release; |
| 2810 | 2830 | ||
| 2811 | /* Deliver the page fault to userland, check inside PT lock */ | 2831 | /* Deliver the page fault to userland, check inside PT lock */ |
| 2812 | if (userfaultfd_missing(vma)) { | 2832 | if (userfaultfd_missing(vma)) { |
| 2813 | pte_unmap_unlock(fe->pte, fe->ptl); | 2833 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2814 | mem_cgroup_cancel_charge(page, memcg, false); | 2834 | mem_cgroup_cancel_charge(page, memcg, false); |
| 2815 | put_page(page); | 2835 | put_page(page); |
| 2816 | return handle_userfault(fe, VM_UFFD_MISSING); | 2836 | return handle_userfault(vmf, VM_UFFD_MISSING); |
| 2817 | } | 2837 | } |
| 2818 | 2838 | ||
| 2819 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2839 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
| 2820 | page_add_new_anon_rmap(page, vma, fe->address, false); | 2840 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
| 2821 | mem_cgroup_commit_charge(page, memcg, false, false); | 2841 | mem_cgroup_commit_charge(page, memcg, false, false); |
| 2822 | lru_cache_add_active_or_unevictable(page, vma); | 2842 | lru_cache_add_active_or_unevictable(page, vma); |
| 2823 | setpte: | 2843 | setpte: |
| 2824 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); | 2844 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); |
| 2825 | 2845 | ||
| 2826 | /* No need to invalidate - it was non-present before */ | 2846 | /* No need to invalidate - it was non-present before */ |
| 2827 | update_mmu_cache(vma, fe->address, fe->pte); | 2847 | update_mmu_cache(vma, vmf->address, vmf->pte); |
| 2828 | unlock: | 2848 | unlock: |
| 2829 | pte_unmap_unlock(fe->pte, fe->ptl); | 2849 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 2830 | return 0; | 2850 | return 0; |
| 2831 | release: | 2851 | release: |
| 2832 | mem_cgroup_cancel_charge(page, memcg, false); | 2852 | mem_cgroup_cancel_charge(page, memcg, false); |
| @@ -2843,62 +2863,50 @@ oom: | |||
| 2843 | * released depending on flags and vma->vm_ops->fault() return value. | 2863 | * released depending on flags and vma->vm_ops->fault() return value. |
| 2844 | * See filemap_fault() and __lock_page_retry(). | 2864 | * See filemap_fault() and __lock_page_retry(). |
| 2845 | */ | 2865 | */ |
| 2846 | static int __do_fault(struct fault_env *fe, pgoff_t pgoff, | 2866 | static int __do_fault(struct vm_fault *vmf) |
| 2847 | struct page *cow_page, struct page **page, void **entry) | ||
| 2848 | { | 2867 | { |
| 2849 | struct vm_area_struct *vma = fe->vma; | 2868 | struct vm_area_struct *vma = vmf->vma; |
| 2850 | struct vm_fault vmf; | ||
| 2851 | int ret; | 2869 | int ret; |
| 2852 | 2870 | ||
| 2853 | vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); | 2871 | ret = vma->vm_ops->fault(vma, vmf); |
| 2854 | vmf.pgoff = pgoff; | 2872 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | |
| 2855 | vmf.flags = fe->flags; | 2873 | VM_FAULT_DONE_COW))) |
| 2856 | vmf.page = NULL; | ||
| 2857 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | ||
| 2858 | vmf.cow_page = cow_page; | ||
| 2859 | |||
| 2860 | ret = vma->vm_ops->fault(vma, &vmf); | ||
| 2861 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | ||
| 2862 | return ret; | ||
| 2863 | if (ret & VM_FAULT_DAX_LOCKED) { | ||
| 2864 | *entry = vmf.entry; | ||
| 2865 | return ret; | 2874 | return ret; |
| 2866 | } | ||
| 2867 | 2875 | ||
| 2868 | if (unlikely(PageHWPoison(vmf.page))) { | 2876 | if (unlikely(PageHWPoison(vmf->page))) { |
| 2869 | if (ret & VM_FAULT_LOCKED) | 2877 | if (ret & VM_FAULT_LOCKED) |
| 2870 | unlock_page(vmf.page); | 2878 | unlock_page(vmf->page); |
| 2871 | put_page(vmf.page); | 2879 | put_page(vmf->page); |
| 2880 | vmf->page = NULL; | ||
| 2872 | return VM_FAULT_HWPOISON; | 2881 | return VM_FAULT_HWPOISON; |
| 2873 | } | 2882 | } |
| 2874 | 2883 | ||
| 2875 | if (unlikely(!(ret & VM_FAULT_LOCKED))) | 2884 | if (unlikely(!(ret & VM_FAULT_LOCKED))) |
| 2876 | lock_page(vmf.page); | 2885 | lock_page(vmf->page); |
| 2877 | else | 2886 | else |
| 2878 | VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); | 2887 | VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); |
| 2879 | 2888 | ||
| 2880 | *page = vmf.page; | ||
| 2881 | return ret; | 2889 | return ret; |
| 2882 | } | 2890 | } |
| 2883 | 2891 | ||
| 2884 | static int pte_alloc_one_map(struct fault_env *fe) | 2892 | static int pte_alloc_one_map(struct vm_fault *vmf) |
| 2885 | { | 2893 | { |
| 2886 | struct vm_area_struct *vma = fe->vma; | 2894 | struct vm_area_struct *vma = vmf->vma; |
| 2887 | 2895 | ||
| 2888 | if (!pmd_none(*fe->pmd)) | 2896 | if (!pmd_none(*vmf->pmd)) |
| 2889 | goto map_pte; | 2897 | goto map_pte; |
| 2890 | if (fe->prealloc_pte) { | 2898 | if (vmf->prealloc_pte) { |
| 2891 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 2899 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| 2892 | if (unlikely(!pmd_none(*fe->pmd))) { | 2900 | if (unlikely(!pmd_none(*vmf->pmd))) { |
| 2893 | spin_unlock(fe->ptl); | 2901 | spin_unlock(vmf->ptl); |
| 2894 | goto map_pte; | 2902 | goto map_pte; |
| 2895 | } | 2903 | } |
| 2896 | 2904 | ||
| 2897 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 2905 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
| 2898 | pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); | 2906 | pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); |
| 2899 | spin_unlock(fe->ptl); | 2907 | spin_unlock(vmf->ptl); |
| 2900 | fe->prealloc_pte = 0; | 2908 | vmf->prealloc_pte = 0; |
| 2901 | } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { | 2909 | } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { |
| 2902 | return VM_FAULT_OOM; | 2910 | return VM_FAULT_OOM; |
| 2903 | } | 2911 | } |
| 2904 | map_pte: | 2912 | map_pte: |
| @@ -2913,11 +2921,11 @@ map_pte: | |||
| 2913 | * through an atomic read in C, which is what pmd_trans_unstable() | 2921 | * through an atomic read in C, which is what pmd_trans_unstable() |
| 2914 | * provides. | 2922 | * provides. |
| 2915 | */ | 2923 | */ |
| 2916 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) | 2924 | if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) |
| 2917 | return VM_FAULT_NOPAGE; | 2925 | return VM_FAULT_NOPAGE; |
| 2918 | 2926 | ||
| 2919 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | 2927 | vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, |
| 2920 | &fe->ptl); | 2928 | &vmf->ptl); |
| 2921 | return 0; | 2929 | return 0; |
| 2922 | } | 2930 | } |
| 2923 | 2931 | ||
| @@ -2935,24 +2943,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, | |||
| 2935 | return true; | 2943 | return true; |
| 2936 | } | 2944 | } |
| 2937 | 2945 | ||
| 2938 | static void deposit_prealloc_pte(struct fault_env *fe) | 2946 | static void deposit_prealloc_pte(struct vm_fault *vmf) |
| 2939 | { | 2947 | { |
| 2940 | struct vm_area_struct *vma = fe->vma; | 2948 | struct vm_area_struct *vma = vmf->vma; |
| 2941 | 2949 | ||
| 2942 | pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); | 2950 | pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); |
| 2943 | /* | 2951 | /* |
| 2944 | * We are going to consume the prealloc table, | 2952 | * We are going to consume the prealloc table, |
| 2945 | * count that as nr_ptes. | 2953 | * count that as nr_ptes. |
| 2946 | */ | 2954 | */ |
| 2947 | atomic_long_inc(&vma->vm_mm->nr_ptes); | 2955 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
| 2948 | fe->prealloc_pte = 0; | 2956 | vmf->prealloc_pte = 0; |
| 2949 | } | 2957 | } |
| 2950 | 2958 | ||
| 2951 | static int do_set_pmd(struct fault_env *fe, struct page *page) | 2959 | static int do_set_pmd(struct vm_fault *vmf, struct page *page) |
| 2952 | { | 2960 | { |
| 2953 | struct vm_area_struct *vma = fe->vma; | 2961 | struct vm_area_struct *vma = vmf->vma; |
| 2954 | bool write = fe->flags & FAULT_FLAG_WRITE; | 2962 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
| 2955 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | 2963 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
| 2956 | pmd_t entry; | 2964 | pmd_t entry; |
| 2957 | int i, ret; | 2965 | int i, ret; |
| 2958 | 2966 | ||
| @@ -2966,15 +2974,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
| 2966 | * Archs like ppc64 need additional space to store information | 2974 | * Archs like ppc64 need additional space to store information |
| 2967 | * related to pte entry. Use the preallocated table for that. | 2975 | * related to pte entry. Use the preallocated table for that. |
| 2968 | */ | 2976 | */ |
| 2969 | if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { | 2977 | if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { |
| 2970 | fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); | 2978 | vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); |
| 2971 | if (!fe->prealloc_pte) | 2979 | if (!vmf->prealloc_pte) |
| 2972 | return VM_FAULT_OOM; | 2980 | return VM_FAULT_OOM; |
| 2973 | smp_wmb(); /* See comment in __pte_alloc() */ | 2981 | smp_wmb(); /* See comment in __pte_alloc() */ |
| 2974 | } | 2982 | } |
| 2975 | 2983 | ||
| 2976 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | 2984 | vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); |
| 2977 | if (unlikely(!pmd_none(*fe->pmd))) | 2985 | if (unlikely(!pmd_none(*vmf->pmd))) |
| 2978 | goto out; | 2986 | goto out; |
| 2979 | 2987 | ||
| 2980 | for (i = 0; i < HPAGE_PMD_NR; i++) | 2988 | for (i = 0; i < HPAGE_PMD_NR; i++) |
| @@ -2990,11 +2998,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
| 2990 | * deposit and withdraw with pmd lock held | 2998 | * deposit and withdraw with pmd lock held |
| 2991 | */ | 2999 | */ |
| 2992 | if (arch_needs_pgtable_deposit()) | 3000 | if (arch_needs_pgtable_deposit()) |
| 2993 | deposit_prealloc_pte(fe); | 3001 | deposit_prealloc_pte(vmf); |
| 2994 | 3002 | ||
| 2995 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | 3003 | set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); |
| 2996 | 3004 | ||
| 2997 | update_mmu_cache_pmd(vma, haddr, fe->pmd); | 3005 | update_mmu_cache_pmd(vma, haddr, vmf->pmd); |
| 2998 | 3006 | ||
| 2999 | /* fault is handled */ | 3007 | /* fault is handled */ |
| 3000 | ret = 0; | 3008 | ret = 0; |
| @@ -3005,13 +3013,13 @@ out: | |||
| 3005 | * withdraw with pmd lock held. | 3013 | * withdraw with pmd lock held. |
| 3006 | */ | 3014 | */ |
| 3007 | if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) | 3015 | if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) |
| 3008 | fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, | 3016 | vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, |
| 3009 | fe->pmd); | 3017 | vmf->pmd); |
| 3010 | spin_unlock(fe->ptl); | 3018 | spin_unlock(vmf->ptl); |
| 3011 | return ret; | 3019 | return ret; |
| 3012 | } | 3020 | } |
| 3013 | #else | 3021 | #else |
| 3014 | static int do_set_pmd(struct fault_env *fe, struct page *page) | 3022 | static int do_set_pmd(struct vm_fault *vmf, struct page *page) |
| 3015 | { | 3023 | { |
| 3016 | BUILD_BUG(); | 3024 | BUILD_BUG(); |
| 3017 | return 0; | 3025 | return 0; |
| @@ -3022,41 +3030,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) | |||
| 3022 | * alloc_set_pte - setup new PTE entry for given page and add reverse page | 3030 | * alloc_set_pte - setup new PTE entry for given page and add reverse page |
| 3023 | * mapping. If needed, the function allocates a page table or uses a pre-allocated one. | 3031 | * mapping. If needed, the function allocates a page table or uses a pre-allocated one. |
| 3024 | * | 3032 | * |
| 3025 | * @fe: fault environment | 3033 | * @vmf: fault environment |
| 3026 | * @memcg: memcg to charge page (only for private mappings) | 3034 | * @memcg: memcg to charge page (only for private mappings) |
| 3027 | * @page: page to map | 3035 | * @page: page to map |
| 3028 | * | 3036 | * |
| 3029 | * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. | 3037 | * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on |
| 3038 | * return. | ||
| 3030 | * | 3039 | * |
| 3031 | * Target users are page handler itself and implementations of | 3040 | * Target users are page handler itself and implementations of |
| 3032 | * vm_ops->map_pages. | 3041 | * vm_ops->map_pages. |
| 3033 | */ | 3042 | */ |
| 3034 | int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | 3043 | int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, |
| 3035 | struct page *page) | 3044 | struct page *page) |
| 3036 | { | 3045 | { |
| 3037 | struct vm_area_struct *vma = fe->vma; | 3046 | struct vm_area_struct *vma = vmf->vma; |
| 3038 | bool write = fe->flags & FAULT_FLAG_WRITE; | 3047 | bool write = vmf->flags & FAULT_FLAG_WRITE; |
| 3039 | pte_t entry; | 3048 | pte_t entry; |
| 3040 | int ret; | 3049 | int ret; |
| 3041 | 3050 | ||
| 3042 | if (pmd_none(*fe->pmd) && PageTransCompound(page) && | 3051 | if (pmd_none(*vmf->pmd) && PageTransCompound(page) && |
| 3043 | IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { | 3052 | IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { |
| 3044 | /* THP on COW? */ | 3053 | /* THP on COW? */ |
| 3045 | VM_BUG_ON_PAGE(memcg, page); | 3054 | VM_BUG_ON_PAGE(memcg, page); |
| 3046 | 3055 | ||
| 3047 | ret = do_set_pmd(fe, page); | 3056 | ret = do_set_pmd(vmf, page); |
| 3048 | if (ret != VM_FAULT_FALLBACK) | 3057 | if (ret != VM_FAULT_FALLBACK) |
| 3049 | goto fault_handled; | 3058 | goto fault_handled; |
| 3050 | } | 3059 | } |
| 3051 | 3060 | ||
| 3052 | if (!fe->pte) { | 3061 | if (!vmf->pte) { |
| 3053 | ret = pte_alloc_one_map(fe); | 3062 | ret = pte_alloc_one_map(vmf); |
| 3054 | if (ret) | 3063 | if (ret) |
| 3055 | goto fault_handled; | 3064 | goto fault_handled; |
| 3056 | } | 3065 | } |
| 3057 | 3066 | ||
| 3058 | /* Re-check under ptl */ | 3067 | /* Re-check under ptl */ |
| 3059 | if (unlikely(!pte_none(*fe->pte))) { | 3068 | if (unlikely(!pte_none(*vmf->pte))) { |
| 3060 | ret = VM_FAULT_NOPAGE; | 3069 | ret = VM_FAULT_NOPAGE; |
| 3061 | goto fault_handled; | 3070 | goto fault_handled; |
| 3062 | } | 3071 | } |
| @@ -3068,28 +3077,60 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, | |||
| 3068 | /* copy-on-write page */ | 3077 | /* copy-on-write page */ |
| 3069 | if (write && !(vma->vm_flags & VM_SHARED)) { | 3078 | if (write && !(vma->vm_flags & VM_SHARED)) { |
| 3070 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 3079 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
| 3071 | page_add_new_anon_rmap(page, vma, fe->address, false); | 3080 | page_add_new_anon_rmap(page, vma, vmf->address, false); |
| 3072 | mem_cgroup_commit_charge(page, memcg, false, false); | 3081 | mem_cgroup_commit_charge(page, memcg, false, false); |
| 3073 | lru_cache_add_active_or_unevictable(page, vma); | 3082 | lru_cache_add_active_or_unevictable(page, vma); |
| 3074 | } else { | 3083 | } else { |
| 3075 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); | 3084 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); |
| 3076 | page_add_file_rmap(page, false); | 3085 | page_add_file_rmap(page, false); |
| 3077 | } | 3086 | } |
| 3078 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); | 3087 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); |
| 3079 | 3088 | ||
| 3080 | /* no need to invalidate: a not-present page won't be cached */ | 3089 | /* no need to invalidate: a not-present page won't be cached */ |
| 3081 | update_mmu_cache(vma, fe->address, fe->pte); | 3090 | update_mmu_cache(vma, vmf->address, vmf->pte); |
| 3082 | ret = 0; | 3091 | ret = 0; |
| 3083 | 3092 | ||
| 3084 | fault_handled: | 3093 | fault_handled: |
| 3085 | /* preallocated pagetable is unused: free it */ | 3094 | /* preallocated pagetable is unused: free it */ |
| 3086 | if (fe->prealloc_pte) { | 3095 | if (vmf->prealloc_pte) { |
| 3087 | pte_free(fe->vma->vm_mm, fe->prealloc_pte); | 3096 | pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); |
| 3088 | fe->prealloc_pte = 0; | 3097 | vmf->prealloc_pte = 0; |
| 3089 | } | 3098 | } |
| 3090 | return ret; | 3099 | return ret; |
| 3091 | } | 3100 | } |
| 3092 | 3101 | ||
| 3102 | |||
| 3103 | /** | ||
| 3104 | * finish_fault - finish page fault once we have prepared the page to fault | ||
| 3105 | * | ||
| 3106 | * @vmf: structure describing the fault | ||
| 3107 | * | ||
| 3108 | * This function handles all that is needed to finish a page fault once the | ||
| 3109 | * page to fault in is prepared. It handles locking of PTEs, inserts PTE for | ||
| 3110 | * given page, adds reverse page mapping, handles memcg charges and LRU | ||
| 3111 | * addition. The function returns 0 on success, VM_FAULT_ code in case of | ||
| 3112 | * error. | ||
| 3113 | * | ||
| 3114 | * The function expects the page to be locked and on success it consumes a | ||
| 3115 | * reference of a page being mapped (for the PTE which maps it). | ||
| 3116 | */ | ||
| 3117 | int finish_fault(struct vm_fault *vmf) | ||
| 3118 | { | ||
| 3119 | struct page *page; | ||
| 3120 | int ret; | ||
| 3121 | |||
| 3122 | /* Did we COW the page? */ | ||
| 3123 | if ((vmf->flags & FAULT_FLAG_WRITE) && | ||
| 3124 | !(vmf->vma->vm_flags & VM_SHARED)) | ||
| 3125 | page = vmf->cow_page; | ||
| 3126 | else | ||
| 3127 | page = vmf->page; | ||
| 3128 | ret = alloc_set_pte(vmf, vmf->memcg, page); | ||
| 3129 | if (vmf->pte) | ||
| 3130 | pte_unmap_unlock(vmf->pte, vmf->ptl); | ||
| 3131 | return ret; | ||
| 3132 | } | ||
| 3133 | |||
| 3093 | static unsigned long fault_around_bytes __read_mostly = | 3134 | static unsigned long fault_around_bytes __read_mostly = |
| 3094 | rounddown_pow_of_two(65536); | 3135 | rounddown_pow_of_two(65536); |
| 3095 | 3136 | ||
| @@ -3154,17 +3195,18 @@ late_initcall(fault_around_debugfs); | |||
| 3154 | * fault_around_pages() value (and therefore to page order). This way it's | 3195 | * fault_around_pages() value (and therefore to page order). This way it's |
| 3155 | * easier to guarantee that we don't cross page table boundaries. | 3196 | * easier to guarantee that we don't cross page table boundaries. |
| 3156 | */ | 3197 | */ |
| 3157 | static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) | 3198 | static int do_fault_around(struct vm_fault *vmf) |
| 3158 | { | 3199 | { |
| 3159 | unsigned long address = fe->address, nr_pages, mask; | 3200 | unsigned long address = vmf->address, nr_pages, mask; |
| 3201 | pgoff_t start_pgoff = vmf->pgoff; | ||
| 3160 | pgoff_t end_pgoff; | 3202 | pgoff_t end_pgoff; |
| 3161 | int off, ret = 0; | 3203 | int off, ret = 0; |
| 3162 | 3204 | ||
| 3163 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 3205 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
| 3164 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 3206 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
| 3165 | 3207 | ||
| 3166 | fe->address = max(address & mask, fe->vma->vm_start); | 3208 | vmf->address = max(address & mask, vmf->vma->vm_start); |
| 3167 | off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 3209 | off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
| 3168 | start_pgoff -= off; | 3210 | start_pgoff -= off; |
| 3169 | 3211 | ||
| 3170 | /* | 3212 | /* |
| @@ -3172,45 +3214,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) | |||
| 3172 | * or fault_around_pages() from start_pgoff, depending what is nearest. | 3214 | * or fault_around_pages() from start_pgoff, depending what is nearest. |
| 3173 | */ | 3215 | */ |
| 3174 | end_pgoff = start_pgoff - | 3216 | end_pgoff = start_pgoff - |
| 3175 | ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 3217 | ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + |
| 3176 | PTRS_PER_PTE - 1; | 3218 | PTRS_PER_PTE - 1; |
| 3177 | end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, | 3219 | end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, |
| 3178 | start_pgoff + nr_pages - 1); | 3220 | start_pgoff + nr_pages - 1); |
| 3179 | 3221 | ||
| 3180 | if (pmd_none(*fe->pmd)) { | 3222 | if (pmd_none(*vmf->pmd)) { |
| 3181 | fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); | 3223 | vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, |
| 3182 | if (!fe->prealloc_pte) | 3224 | vmf->address); |
| 3225 | if (!vmf->prealloc_pte) | ||
| 3183 | goto out; | 3226 | goto out; |
| 3184 | smp_wmb(); /* See comment in __pte_alloc() */ | 3227 | smp_wmb(); /* See comment in __pte_alloc() */ |
| 3185 | } | 3228 | } |
| 3186 | 3229 | ||
| 3187 | fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); | 3230 | vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); |
| 3188 | 3231 | ||
| 3189 | /* Huge page is mapped? Page fault is solved */ | 3232 | /* Huge page is mapped? Page fault is solved */ |
| 3190 | if (pmd_trans_huge(*fe->pmd)) { | 3233 | if (pmd_trans_huge(*vmf->pmd)) { |
| 3191 | ret = VM_FAULT_NOPAGE; | 3234 | ret = VM_FAULT_NOPAGE; |
| 3192 | goto out; | 3235 | goto out; |
| 3193 | } | 3236 | } |
| 3194 | 3237 | ||
| 3195 | /* ->map_pages() hasn't done anything useful. Cold page cache? */ | 3238 | /* ->map_pages() hasn't done anything useful. Cold page cache? */ |
| 3196 | if (!fe->pte) | 3239 | if (!vmf->pte) |
| 3197 | goto out; | 3240 | goto out; |
| 3198 | 3241 | ||
| 3199 | /* check if the page fault is solved */ | 3242 | /* check if the page fault is solved */ |
| 3200 | fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); | 3243 | vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); |
| 3201 | if (!pte_none(*fe->pte)) | 3244 | if (!pte_none(*vmf->pte)) |
| 3202 | ret = VM_FAULT_NOPAGE; | 3245 | ret = VM_FAULT_NOPAGE; |
| 3203 | pte_unmap_unlock(fe->pte, fe->ptl); | 3246 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 3204 | out: | 3247 | out: |
| 3205 | fe->address = address; | 3248 | vmf->address = address; |
| 3206 | fe->pte = NULL; | 3249 | vmf->pte = NULL; |
| 3207 | return ret; | 3250 | return ret; |
| 3208 | } | 3251 | } |
| 3209 | 3252 | ||
| 3210 | static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) | 3253 | static int do_read_fault(struct vm_fault *vmf) |
| 3211 | { | 3254 | { |
| 3212 | struct vm_area_struct *vma = fe->vma; | 3255 | struct vm_area_struct *vma = vmf->vma; |
| 3213 | struct page *fault_page; | ||
| 3214 | int ret = 0; | 3256 | int ret = 0; |
| 3215 | 3257 | ||
| 3216 | /* | 3258 | /* |
| @@ -3219,80 +3261,67 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) | |||
| 3219 | * something). | 3261 | * something). |
| 3220 | */ | 3262 | */ |
| 3221 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { | 3263 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
| 3222 | ret = do_fault_around(fe, pgoff); | 3264 | ret = do_fault_around(vmf); |
| 3223 | if (ret) | 3265 | if (ret) |
| 3224 | return ret; | 3266 | return ret; |
| 3225 | } | 3267 | } |
| 3226 | 3268 | ||
| 3227 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); | 3269 | ret = __do_fault(vmf); |
| 3228 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3270 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 3229 | return ret; | 3271 | return ret; |
| 3230 | 3272 | ||
| 3231 | ret |= alloc_set_pte(fe, NULL, fault_page); | 3273 | ret |= finish_fault(vmf); |
| 3232 | if (fe->pte) | 3274 | unlock_page(vmf->page); |
| 3233 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
| 3234 | unlock_page(fault_page); | ||
| 3235 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3275 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 3236 | put_page(fault_page); | 3276 | put_page(vmf->page); |
| 3237 | return ret; | 3277 | return ret; |
| 3238 | } | 3278 | } |
| 3239 | 3279 | ||
| 3240 | static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) | 3280 | static int do_cow_fault(struct vm_fault *vmf) |
| 3241 | { | 3281 | { |
| 3242 | struct vm_area_struct *vma = fe->vma; | 3282 | struct vm_area_struct *vma = vmf->vma; |
| 3243 | struct page *fault_page, *new_page; | ||
| 3244 | void *fault_entry; | ||
| 3245 | struct mem_cgroup *memcg; | ||
| 3246 | int ret; | 3283 | int ret; |
| 3247 | 3284 | ||
| 3248 | if (unlikely(anon_vma_prepare(vma))) | 3285 | if (unlikely(anon_vma_prepare(vma))) |
| 3249 | return VM_FAULT_OOM; | 3286 | return VM_FAULT_OOM; |
| 3250 | 3287 | ||
| 3251 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); | 3288 | vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); |
| 3252 | if (!new_page) | 3289 | if (!vmf->cow_page) |
| 3253 | return VM_FAULT_OOM; | 3290 | return VM_FAULT_OOM; |
| 3254 | 3291 | ||
| 3255 | if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, | 3292 | if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, |
| 3256 | &memcg, false)) { | 3293 | &vmf->memcg, false)) { |
| 3257 | put_page(new_page); | 3294 | put_page(vmf->cow_page); |
| 3258 | return VM_FAULT_OOM; | 3295 | return VM_FAULT_OOM; |
| 3259 | } | 3296 | } |
| 3260 | 3297 | ||
| 3261 | ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); | 3298 | ret = __do_fault(vmf); |
| 3262 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3299 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 3263 | goto uncharge_out; | 3300 | goto uncharge_out; |
| 3301 | if (ret & VM_FAULT_DONE_COW) | ||
| 3302 | return ret; | ||
| 3264 | 3303 | ||
| 3265 | if (!(ret & VM_FAULT_DAX_LOCKED)) | 3304 | copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); |
| 3266 | copy_user_highpage(new_page, fault_page, fe->address, vma); | 3305 | __SetPageUptodate(vmf->cow_page); |
| 3267 | __SetPageUptodate(new_page); | ||
| 3268 | 3306 | ||
| 3269 | ret |= alloc_set_pte(fe, memcg, new_page); | 3307 | ret |= finish_fault(vmf); |
| 3270 | if (fe->pte) | 3308 | unlock_page(vmf->page); |
| 3271 | pte_unmap_unlock(fe->pte, fe->ptl); | 3309 | put_page(vmf->page); |
| 3272 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | ||
| 3273 | unlock_page(fault_page); | ||
| 3274 | put_page(fault_page); | ||
| 3275 | } else { | ||
| 3276 | dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); | ||
| 3277 | } | ||
| 3278 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3310 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 3279 | goto uncharge_out; | 3311 | goto uncharge_out; |
| 3280 | return ret; | 3312 | return ret; |
| 3281 | uncharge_out: | 3313 | uncharge_out: |
| 3282 | mem_cgroup_cancel_charge(new_page, memcg, false); | 3314 | mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); |
| 3283 | put_page(new_page); | 3315 | put_page(vmf->cow_page); |
| 3284 | return ret; | 3316 | return ret; |
| 3285 | } | 3317 | } |
| 3286 | 3318 | ||
| 3287 | static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | 3319 | static int do_shared_fault(struct vm_fault *vmf) |
| 3288 | { | 3320 | { |
| 3289 | struct vm_area_struct *vma = fe->vma; | 3321 | struct vm_area_struct *vma = vmf->vma; |
| 3290 | struct page *fault_page; | ||
| 3291 | struct address_space *mapping; | ||
| 3292 | int dirtied = 0; | ||
| 3293 | int ret, tmp; | 3322 | int ret, tmp; |
| 3294 | 3323 | ||
| 3295 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); | 3324 | ret = __do_fault(vmf); |
| 3296 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3325 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
| 3297 | return ret; | 3326 | return ret; |
| 3298 | 3327 | ||
| @@ -3301,46 +3330,24 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | |||
| 3301 | * about to become writable | 3330 | * about to become writable |
| 3302 | */ | 3331 | */ |
| 3303 | if (vma->vm_ops->page_mkwrite) { | 3332 | if (vma->vm_ops->page_mkwrite) { |
| 3304 | unlock_page(fault_page); | 3333 | unlock_page(vmf->page); |
| 3305 | tmp = do_page_mkwrite(vma, fault_page, fe->address); | 3334 | tmp = do_page_mkwrite(vmf); |
| 3306 | if (unlikely(!tmp || | 3335 | if (unlikely(!tmp || |
| 3307 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 3336 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
| 3308 | put_page(fault_page); | 3337 | put_page(vmf->page); |
| 3309 | return tmp; | 3338 | return tmp; |
| 3310 | } | 3339 | } |
| 3311 | } | 3340 | } |
| 3312 | 3341 | ||
| 3313 | ret |= alloc_set_pte(fe, NULL, fault_page); | 3342 | ret |= finish_fault(vmf); |
| 3314 | if (fe->pte) | ||
| 3315 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
| 3316 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | 3343 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | |
| 3317 | VM_FAULT_RETRY))) { | 3344 | VM_FAULT_RETRY))) { |
| 3318 | unlock_page(fault_page); | 3345 | unlock_page(vmf->page); |
| 3319 | put_page(fault_page); | 3346 | put_page(vmf->page); |
| 3320 | return ret; | 3347 | return ret; |
| 3321 | } | 3348 | } |
| 3322 | 3349 | ||
| 3323 | if (set_page_dirty(fault_page)) | 3350 | fault_dirty_shared_page(vma, vmf->page); |
| 3324 | dirtied = 1; | ||
| 3325 | /* | ||
| 3326 | * Take a local copy of the address_space - page.mapping may be zeroed | ||
| 3327 | * by truncate after unlock_page(). The address_space itself remains | ||
| 3328 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s | ||
| 3329 | * release semantics to prevent the compiler from undoing this copying. | ||
| 3330 | */ | ||
| 3331 | mapping = page_rmapping(fault_page); | ||
| 3332 | unlock_page(fault_page); | ||
| 3333 | if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { | ||
| 3334 | /* | ||
| 3335 | * Some device drivers do not set page.mapping but still | ||
| 3336 | * dirty their pages | ||
| 3337 | */ | ||
| 3338 | balance_dirty_pages_ratelimited(mapping); | ||
| 3339 | } | ||
| 3340 | |||
| 3341 | if (!vma->vm_ops->page_mkwrite) | ||
| 3342 | file_update_time(vma->vm_file); | ||
| 3343 | |||
| 3344 | return ret; | 3351 | return ret; |
| 3345 | } | 3352 | } |
| 3346 | 3353 | ||
| @@ -3350,19 +3357,18 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) | |||
| 3350 | * The mmap_sem may have been released depending on flags and our | 3357 | * The mmap_sem may have been released depending on flags and our |
| 3351 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3358 | * return value. See filemap_fault() and __lock_page_or_retry(). |
| 3352 | */ | 3359 | */ |
| 3353 | static int do_fault(struct fault_env *fe) | 3360 | static int do_fault(struct vm_fault *vmf) |
| 3354 | { | 3361 | { |
| 3355 | struct vm_area_struct *vma = fe->vma; | 3362 | struct vm_area_struct *vma = vmf->vma; |
| 3356 | pgoff_t pgoff = linear_page_index(vma, fe->address); | ||
| 3357 | 3363 | ||
| 3358 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3364 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ |
| 3359 | if (!vma->vm_ops->fault) | 3365 | if (!vma->vm_ops->fault) |
| 3360 | return VM_FAULT_SIGBUS; | 3366 | return VM_FAULT_SIGBUS; |
| 3361 | if (!(fe->flags & FAULT_FLAG_WRITE)) | 3367 | if (!(vmf->flags & FAULT_FLAG_WRITE)) |
| 3362 | return do_read_fault(fe, pgoff); | 3368 | return do_read_fault(vmf); |
| 3363 | if (!(vma->vm_flags & VM_SHARED)) | 3369 | if (!(vma->vm_flags & VM_SHARED)) |
| 3364 | return do_cow_fault(fe, pgoff); | 3370 | return do_cow_fault(vmf); |
| 3365 | return do_shared_fault(fe, pgoff); | 3371 | return do_shared_fault(vmf); |
| 3366 | } | 3372 | } |
| 3367 | 3373 | ||
| 3368 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3374 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
| @@ -3380,14 +3386,15 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | |||
| 3380 | return mpol_misplaced(page, vma, addr); | 3386 | return mpol_misplaced(page, vma, addr); |
| 3381 | } | 3387 | } |
| 3382 | 3388 | ||
| 3383 | static int do_numa_page(struct fault_env *fe, pte_t pte) | 3389 | static int do_numa_page(struct vm_fault *vmf) |
| 3384 | { | 3390 | { |
| 3385 | struct vm_area_struct *vma = fe->vma; | 3391 | struct vm_area_struct *vma = vmf->vma; |
| 3386 | struct page *page = NULL; | 3392 | struct page *page = NULL; |
| 3387 | int page_nid = -1; | 3393 | int page_nid = -1; |
| 3388 | int last_cpupid; | 3394 | int last_cpupid; |
| 3389 | int target_nid; | 3395 | int target_nid; |
| 3390 | bool migrated = false; | 3396 | bool migrated = false; |
| 3397 | pte_t pte = vmf->orig_pte; | ||
| 3391 | bool was_writable = pte_write(pte); | 3398 | bool was_writable = pte_write(pte); |
| 3392 | int flags = 0; | 3399 | int flags = 0; |
| 3393 | 3400 | ||
| @@ -3400,10 +3407,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
| 3400 | * page table entry is not accessible, so there would be no | 3407 | * page table entry is not accessible, so there would be no |
| 3401 | * concurrent hardware modifications to the PTE. | 3408 | * concurrent hardware modifications to the PTE. |
| 3402 | */ | 3409 | */ |
| 3403 | fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); | 3410 | vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); |
| 3404 | spin_lock(fe->ptl); | 3411 | spin_lock(vmf->ptl); |
| 3405 | if (unlikely(!pte_same(*fe->pte, pte))) { | 3412 | if (unlikely(!pte_same(*vmf->pte, pte))) { |
| 3406 | pte_unmap_unlock(fe->pte, fe->ptl); | 3413 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 3407 | goto out; | 3414 | goto out; |
| 3408 | } | 3415 | } |
| 3409 | 3416 | ||
| @@ -3412,18 +3419,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
| 3412 | pte = pte_mkyoung(pte); | 3419 | pte = pte_mkyoung(pte); |
| 3413 | if (was_writable) | 3420 | if (was_writable) |
| 3414 | pte = pte_mkwrite(pte); | 3421 | pte = pte_mkwrite(pte); |
| 3415 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); | 3422 | set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); |
| 3416 | update_mmu_cache(vma, fe->address, fe->pte); | 3423 | update_mmu_cache(vma, vmf->address, vmf->pte); |
| 3417 | 3424 | ||
| 3418 | page = vm_normal_page(vma, fe->address, pte); | 3425 | page = vm_normal_page(vma, vmf->address, pte); |
| 3419 | if (!page) { | 3426 | if (!page) { |
| 3420 | pte_unmap_unlock(fe->pte, fe->ptl); | 3427 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 3421 | return 0; | 3428 | return 0; |
| 3422 | } | 3429 | } |
| 3423 | 3430 | ||
| 3424 | /* TODO: handle PTE-mapped THP */ | 3431 | /* TODO: handle PTE-mapped THP */ |
| 3425 | if (PageCompound(page)) { | 3432 | if (PageCompound(page)) { |
| 3426 | pte_unmap_unlock(fe->pte, fe->ptl); | 3433 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 3427 | return 0; | 3434 | return 0; |
| 3428 | } | 3435 | } |
| 3429 | 3436 | ||
| @@ -3447,9 +3454,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) | |||
| 3447 | 3454 | ||
| 3448 | last_cpupid = page_cpupid_last(page); | 3455 | last_cpupid = page_cpupid_last(page); |
| 3449 | page_nid = page_to_nid(page); | 3456 | page_nid = page_to_nid(page); |
| 3450 | target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, | 3457 | target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, |
| 3451 | &flags); | 3458 | &flags); |
| 3452 | pte_unmap_unlock(fe->pte, fe->ptl); | 3459 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 3453 | if (target_nid == -1) { | 3460 | if (target_nid == -1) { |
| 3454 | put_page(page); | 3461 | put_page(page); |
| 3455 | goto out; | 3462 | goto out; |
| @@ -3469,28 +3476,28 @@ out: | |||
| 3469 | return 0; | 3476 | return 0; |
| 3470 | } | 3477 | } |
| 3471 | 3478 | ||
| 3472 | static int create_huge_pmd(struct fault_env *fe) | 3479 | static int create_huge_pmd(struct vm_fault *vmf) |
| 3473 | { | 3480 | { |
| 3474 | struct vm_area_struct *vma = fe->vma; | 3481 | struct vm_area_struct *vma = vmf->vma; |
| 3475 | if (vma_is_anonymous(vma)) | 3482 | if (vma_is_anonymous(vma)) |
| 3476 | return do_huge_pmd_anonymous_page(fe); | 3483 | return do_huge_pmd_anonymous_page(vmf); |
| 3477 | if (vma->vm_ops->pmd_fault) | 3484 | if (vma->vm_ops->pmd_fault) |
| 3478 | return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, | 3485 | return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, |
| 3479 | fe->flags); | 3486 | vmf->flags); |
| 3480 | return VM_FAULT_FALLBACK; | 3487 | return VM_FAULT_FALLBACK; |
| 3481 | } | 3488 | } |
| 3482 | 3489 | ||
| 3483 | static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) | 3490 | static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) |
| 3484 | { | 3491 | { |
| 3485 | if (vma_is_anonymous(fe->vma)) | 3492 | if (vma_is_anonymous(vmf->vma)) |
| 3486 | return do_huge_pmd_wp_page(fe, orig_pmd); | 3493 | return do_huge_pmd_wp_page(vmf, orig_pmd); |
| 3487 | if (fe->vma->vm_ops->pmd_fault) | 3494 | if (vmf->vma->vm_ops->pmd_fault) |
| 3488 | return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, | 3495 | return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, |
| 3489 | fe->flags); | 3496 | vmf->pmd, vmf->flags); |
| 3490 | 3497 | ||
| 3491 | /* COW handled on pte level: split pmd */ | 3498 | /* COW handled on pte level: split pmd */ |
| 3492 | VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); | 3499 | VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); |
| 3493 | __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); | 3500 | __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); |
| 3494 | 3501 | ||
| 3495 | return VM_FAULT_FALLBACK; | 3502 | return VM_FAULT_FALLBACK; |
| 3496 | } | 3503 | } |
| @@ -3515,21 +3522,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) | |||
| 3515 | * The mmap_sem may have been released depending on flags and our return value. | 3522 | * The mmap_sem may have been released depending on flags and our return value. |
| 3516 | * See filemap_fault() and __lock_page_or_retry(). | 3523 | * See filemap_fault() and __lock_page_or_retry(). |
| 3517 | */ | 3524 | */ |
| 3518 | static int handle_pte_fault(struct fault_env *fe) | 3525 | static int handle_pte_fault(struct vm_fault *vmf) |
| 3519 | { | 3526 | { |
| 3520 | pte_t entry; | 3527 | pte_t entry; |
| 3521 | 3528 | ||
| 3522 | if (unlikely(pmd_none(*fe->pmd))) { | 3529 | if (unlikely(pmd_none(*vmf->pmd))) { |
| 3523 | /* | 3530 | /* |
| 3524 | * Leave __pte_alloc() until later: because vm_ops->fault may | 3531 | * Leave __pte_alloc() until later: because vm_ops->fault may |
| 3525 | * want to allocate huge page, and if we expose page table | 3532 | * want to allocate huge page, and if we expose page table |
| 3526 | * for an instant, it will be difficult to retract from | 3533 | * for an instant, it will be difficult to retract from |
| 3527 | * concurrent faults and from rmap lookups. | 3534 | * concurrent faults and from rmap lookups. |
| 3528 | */ | 3535 | */ |
| 3529 | fe->pte = NULL; | 3536 | vmf->pte = NULL; |
| 3530 | } else { | 3537 | } else { |
| 3531 | /* See comment in pte_alloc_one_map() */ | 3538 | /* See comment in pte_alloc_one_map() */ |
| 3532 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) | 3539 | if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) |
| 3533 | return 0; | 3540 | return 0; |
| 3534 | /* | 3541 | /* |
| 3535 | * A regular pmd is established and it can't morph into a huge | 3542 | * A regular pmd is established and it can't morph into a huge |
| @@ -3537,9 +3544,8 @@ static int handle_pte_fault(struct fault_env *fe) | |||
| 3537 | * mmap_sem read mode and khugepaged takes it in write mode. | 3544 | * mmap_sem read mode and khugepaged takes it in write mode. |
| 3538 | * So now it's safe to run pte_offset_map(). | 3545 | * So now it's safe to run pte_offset_map(). |
| 3539 | */ | 3546 | */ |
| 3540 | fe->pte = pte_offset_map(fe->pmd, fe->address); | 3547 | vmf->pte = pte_offset_map(vmf->pmd, vmf->address); |
| 3541 | 3548 | vmf->orig_pte = *vmf->pte; | |
| 3542 | entry = *fe->pte; | ||
| 3543 | 3549 | ||
| 3544 | /* | 3550 | /* |
| 3545 | * some architectures can have larger ptes than wordsize, | 3551 | * some architectures can have larger ptes than wordsize, |
| @@ -3550,38 +3556,39 @@ static int handle_pte_fault(struct fault_env *fe) | |||
| 3550 | * ptl lock held. So here a barrier will do. | 3556 | * ptl lock held. So here a barrier will do. |
| 3551 | */ | 3557 | */ |
| 3552 | barrier(); | 3558 | barrier(); |
| 3553 | if (pte_none(entry)) { | 3559 | if (pte_none(vmf->orig_pte)) { |
| 3554 | pte_unmap(fe->pte); | 3560 | pte_unmap(vmf->pte); |
| 3555 | fe->pte = NULL; | 3561 | vmf->pte = NULL; |
| 3556 | } | 3562 | } |
| 3557 | } | 3563 | } |
| 3558 | 3564 | ||
| 3559 | if (!fe->pte) { | 3565 | if (!vmf->pte) { |
| 3560 | if (vma_is_anonymous(fe->vma)) | 3566 | if (vma_is_anonymous(vmf->vma)) |
| 3561 | return do_anonymous_page(fe); | 3567 | return do_anonymous_page(vmf); |
| 3562 | else | 3568 | else |
| 3563 | return do_fault(fe); | 3569 | return do_fault(vmf); |
| 3564 | } | 3570 | } |
| 3565 | 3571 | ||
| 3566 | if (!pte_present(entry)) | 3572 | if (!pte_present(vmf->orig_pte)) |
| 3567 | return do_swap_page(fe, entry); | 3573 | return do_swap_page(vmf); |
| 3568 | 3574 | ||
| 3569 | if (pte_protnone(entry) && vma_is_accessible(fe->vma)) | 3575 | if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) |
| 3570 | return do_numa_page(fe, entry); | 3576 | return do_numa_page(vmf); |
| 3571 | 3577 | ||
| 3572 | fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); | 3578 | vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); |
| 3573 | spin_lock(fe->ptl); | 3579 | spin_lock(vmf->ptl); |
| 3574 | if (unlikely(!pte_same(*fe->pte, entry))) | 3580 | entry = vmf->orig_pte; |
| 3581 | if (unlikely(!pte_same(*vmf->pte, entry))) | ||
| 3575 | goto unlock; | 3582 | goto unlock; |
| 3576 | if (fe->flags & FAULT_FLAG_WRITE) { | 3583 | if (vmf->flags & FAULT_FLAG_WRITE) { |
| 3577 | if (!pte_write(entry)) | 3584 | if (!pte_write(entry)) |
| 3578 | return do_wp_page(fe, entry); | 3585 | return do_wp_page(vmf); |
| 3579 | entry = pte_mkdirty(entry); | 3586 | entry = pte_mkdirty(entry); |
| 3580 | } | 3587 | } |
| 3581 | entry = pte_mkyoung(entry); | 3588 | entry = pte_mkyoung(entry); |
| 3582 | if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, | 3589 | if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, |
| 3583 | fe->flags & FAULT_FLAG_WRITE)) { | 3590 | vmf->flags & FAULT_FLAG_WRITE)) { |
| 3584 | update_mmu_cache(fe->vma, fe->address, fe->pte); | 3591 | update_mmu_cache(vmf->vma, vmf->address, vmf->pte); |
| 3585 | } else { | 3592 | } else { |
| 3586 | /* | 3593 | /* |
| 3587 | * This is needed only for protection faults but the arch code | 3594 | * This is needed only for protection faults but the arch code |
| @@ -3589,11 +3596,11 @@ static int handle_pte_fault(struct fault_env *fe) | |||
| 3589 | * This still avoids useless tlb flushes for .text page faults | 3596 | * This still avoids useless tlb flushes for .text page faults |
| 3590 | * with threads. | 3597 | * with threads. |
| 3591 | */ | 3598 | */ |
| 3592 | if (fe->flags & FAULT_FLAG_WRITE) | 3599 | if (vmf->flags & FAULT_FLAG_WRITE) |
| 3593 | flush_tlb_fix_spurious_fault(fe->vma, fe->address); | 3600 | flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); |
| 3594 | } | 3601 | } |
| 3595 | unlock: | 3602 | unlock: |
| 3596 | pte_unmap_unlock(fe->pte, fe->ptl); | 3603 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
| 3597 | return 0; | 3604 | return 0; |
| 3598 | } | 3605 | } |
| 3599 | 3606 | ||
| @@ -3606,10 +3613,12 @@ unlock: | |||
| 3606 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | 3613 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
| 3607 | unsigned int flags) | 3614 | unsigned int flags) |
| 3608 | { | 3615 | { |
| 3609 | struct fault_env fe = { | 3616 | struct vm_fault vmf = { |
| 3610 | .vma = vma, | 3617 | .vma = vma, |
| 3611 | .address = address, | 3618 | .address = address & PAGE_MASK, |
| 3612 | .flags = flags, | 3619 | .flags = flags, |
| 3620 | .pgoff = linear_page_index(vma, address), | ||
| 3621 | .gfp_mask = __get_fault_gfp_mask(vma), | ||
| 3613 | }; | 3622 | }; |
| 3614 | struct mm_struct *mm = vma->vm_mm; | 3623 | struct mm_struct *mm = vma->vm_mm; |
| 3615 | pgd_t *pgd; | 3624 | pgd_t *pgd; |
| @@ -3619,35 +3628,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, | |||
| 3619 | pud = pud_alloc(mm, pgd, address); | 3628 | pud = pud_alloc(mm, pgd, address); |
| 3620 | if (!pud) | 3629 | if (!pud) |
| 3621 | return VM_FAULT_OOM; | 3630 | return VM_FAULT_OOM; |
| 3622 | fe.pmd = pmd_alloc(mm, pud, address); | 3631 | vmf.pmd = pmd_alloc(mm, pud, address); |
| 3623 | if (!fe.pmd) | 3632 | if (!vmf.pmd) |
| 3624 | return VM_FAULT_OOM; | 3633 | return VM_FAULT_OOM; |
| 3625 | if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { | 3634 | if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { |
| 3626 | int ret = create_huge_pmd(&fe); | 3635 | int ret = create_huge_pmd(&vmf); |
| 3627 | if (!(ret & VM_FAULT_FALLBACK)) | 3636 | if (!(ret & VM_FAULT_FALLBACK)) |
| 3628 | return ret; | 3637 | return ret; |
| 3629 | } else { | 3638 | } else { |
| 3630 | pmd_t orig_pmd = *fe.pmd; | 3639 | pmd_t orig_pmd = *vmf.pmd; |
| 3631 | int ret; | 3640 | int ret; |
| 3632 | 3641 | ||
| 3633 | barrier(); | 3642 | barrier(); |
| 3634 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { | 3643 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { |
| 3635 | if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) | 3644 | if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) |
| 3636 | return do_huge_pmd_numa_page(&fe, orig_pmd); | 3645 | return do_huge_pmd_numa_page(&vmf, orig_pmd); |
| 3637 | 3646 | ||
| 3638 | if ((fe.flags & FAULT_FLAG_WRITE) && | 3647 | if ((vmf.flags & FAULT_FLAG_WRITE) && |
| 3639 | !pmd_write(orig_pmd)) { | 3648 | !pmd_write(orig_pmd)) { |
| 3640 | ret = wp_huge_pmd(&fe, orig_pmd); | 3649 | ret = wp_huge_pmd(&vmf, orig_pmd); |
| 3641 | if (!(ret & VM_FAULT_FALLBACK)) | 3650 | if (!(ret & VM_FAULT_FALLBACK)) |
| 3642 | return ret; | 3651 | return ret; |
| 3643 | } else { | 3652 | } else { |
| 3644 | huge_pmd_set_accessed(&fe, orig_pmd); | 3653 | huge_pmd_set_accessed(&vmf, orig_pmd); |
| 3645 | return 0; | 3654 | return 0; |
| 3646 | } | 3655 | } |
| 3647 | } | 3656 | } |
| 3648 | } | 3657 | } |
| 3649 | 3658 | ||
| 3650 | return handle_pte_fault(&fe); | 3659 | return handle_pte_fault(&vmf); |
| 3651 | } | 3660 | } |
| 3652 | 3661 | ||
| 3653 | /* | 3662 | /* |
| @@ -3808,8 +3817,8 @@ out: | |||
| 3808 | return -EINVAL; | 3817 | return -EINVAL; |
| 3809 | } | 3818 | } |
| 3810 | 3819 | ||
| 3811 | static inline int follow_pte(struct mm_struct *mm, unsigned long address, | 3820 | int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, |
| 3812 | pte_t **ptepp, spinlock_t **ptlp) | 3821 | spinlock_t **ptlp) |
| 3813 | { | 3822 | { |
| 3814 | int res; | 3823 | int res; |
| 3815 | 3824 | ||
| @@ -3919,7 +3928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
| 3919 | struct page *page = NULL; | 3928 | struct page *page = NULL; |
| 3920 | 3929 | ||
| 3921 | ret = get_user_pages_remote(tsk, mm, addr, 1, | 3930 | ret = get_user_pages_remote(tsk, mm, addr, 1, |
| 3922 | gup_flags, &page, &vma); | 3931 | gup_flags, &page, &vma, NULL); |
| 3923 | if (ret <= 0) { | 3932 | if (ret <= 0) { |
| 3924 | #ifndef CONFIG_HAVE_IOREMAP_PROT | 3933 | #ifndef CONFIG_HAVE_IOREMAP_PROT |
| 3925 | break; | 3934 | break; |
diff --git a/mm/nommu.c b/mm/nommu.c index 27bc543128e5..210d7ec2843c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, | |||
| 176 | } | 176 | } |
| 177 | EXPORT_SYMBOL(get_user_pages_locked); | 177 | EXPORT_SYMBOL(get_user_pages_locked); |
| 178 | 178 | ||
| 179 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | 179 | static long __get_user_pages_unlocked(struct task_struct *tsk, |
| 180 | unsigned long start, unsigned long nr_pages, | 180 | struct mm_struct *mm, unsigned long start, |
| 181 | struct page **pages, unsigned int gup_flags) | 181 | unsigned long nr_pages, struct page **pages, |
| 182 | unsigned int gup_flags) | ||
| 182 | { | 183 | { |
| 183 | long ret; | 184 | long ret; |
| 184 | down_read(&mm->mmap_sem); | 185 | down_read(&mm->mmap_sem); |
| @@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | |||
| 187 | up_read(&mm->mmap_sem); | 188 | up_read(&mm->mmap_sem); |
| 188 | return ret; | 189 | return ret; |
| 189 | } | 190 | } |
| 190 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
| 191 | 191 | ||
| 192 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, | 192 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, |
| 193 | struct page **pages, unsigned int gup_flags) | 193 | struct page **pages, unsigned int gup_flags) |
| @@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1801 | } | 1801 | } |
| 1802 | EXPORT_SYMBOL(filemap_fault); | 1802 | EXPORT_SYMBOL(filemap_fault); |
| 1803 | 1803 | ||
| 1804 | void filemap_map_pages(struct fault_env *fe, | 1804 | void filemap_map_pages(struct vm_fault *vmf, |
| 1805 | pgoff_t start_pgoff, pgoff_t end_pgoff) | 1805 | pgoff_t start_pgoff, pgoff_t end_pgoff) |
| 1806 | { | 1806 | { |
| 1807 | BUG(); | 1807 | BUG(); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 52e2f8e3b472..290e8b7d3181 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping, | |||
| 2106 | pgoff_t start, pgoff_t end) | 2106 | pgoff_t start, pgoff_t end) |
| 2107 | { | 2107 | { |
| 2108 | #define WRITEBACK_TAG_BATCH 4096 | 2108 | #define WRITEBACK_TAG_BATCH 4096 |
| 2109 | unsigned long tagged; | 2109 | unsigned long tagged = 0; |
| 2110 | 2110 | struct radix_tree_iter iter; | |
| 2111 | do { | 2111 | void **slot; |
| 2112 | spin_lock_irq(&mapping->tree_lock); | 2112 | |
| 2113 | tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, | 2113 | spin_lock_irq(&mapping->tree_lock); |
| 2114 | &start, end, WRITEBACK_TAG_BATCH, | 2114 | radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, |
| 2115 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); | 2115 | PAGECACHE_TAG_DIRTY) { |
| 2116 | if (iter.index > end) | ||
| 2117 | break; | ||
| 2118 | radix_tree_iter_tag_set(&mapping->page_tree, &iter, | ||
| 2119 | PAGECACHE_TAG_TOWRITE); | ||
| 2120 | tagged++; | ||
| 2121 | if ((tagged % WRITEBACK_TAG_BATCH) != 0) | ||
| 2122 | continue; | ||
| 2123 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 2116 | spin_unlock_irq(&mapping->tree_lock); | 2124 | spin_unlock_irq(&mapping->tree_lock); |
| 2117 | WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); | ||
| 2118 | cond_resched(); | 2125 | cond_resched(); |
| 2119 | /* We check 'start' to handle wrapping when end == ~0UL */ | 2126 | spin_lock_irq(&mapping->tree_lock); |
| 2120 | } while (tagged >= WRITEBACK_TAG_BATCH && start); | 2127 | } |
| 2128 | spin_unlock_irq(&mapping->tree_lock); | ||
| 2121 | } | 2129 | } |
| 2122 | EXPORT_SYMBOL(tag_pages_for_writeback); | 2130 | EXPORT_SYMBOL(tag_pages_for_writeback); |
| 2123 | 2131 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f64e7bcb43b7..2c6d5f64feca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, | |||
| 3925 | return page; | 3925 | return page; |
| 3926 | } | 3926 | } |
| 3927 | 3927 | ||
| 3928 | void __page_frag_drain(struct page *page, unsigned int order, | ||
| 3929 | unsigned int count) | ||
| 3930 | { | ||
| 3931 | VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); | ||
| 3932 | |||
| 3933 | if (page_ref_sub_and_test(page, count)) { | ||
| 3934 | if (order == 0) | ||
| 3935 | free_hot_cold_page(page, false); | ||
| 3936 | else | ||
| 3937 | __free_pages_ok(page, order); | ||
| 3938 | } | ||
| 3939 | } | ||
| 3940 | EXPORT_SYMBOL(__page_frag_drain); | ||
| 3941 | |||
| 3928 | void *__alloc_page_frag(struct page_frag_cache *nc, | 3942 | void *__alloc_page_frag(struct page_frag_cache *nc, |
| 3929 | unsigned int fragsz, gfp_t gfp_mask) | 3943 | unsigned int fragsz, gfp_t gfp_mask) |
| 3930 | { | 3944 | { |
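The new __page_frag_drain() above drops count references from the page backing a page-frag cache in one atomic step and frees the page once they reach zero. For an order-0, non-compound page this is roughly the batched form of the open-coded loop below; the sketch is illustrative only, and the helper exists precisely so callers can avoid these per-reference atomics:

#include <linux/mm.h>

/* Roughly what __page_frag_drain(page, 0, count) replaces for an
 * order-0 page: drop the leftover references one at a time. */
static void drain_by_put(struct page *page, unsigned int count)
{
	while (count--)
		put_page(page);
}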
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..84d0c7eada2b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
| @@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
| 88 | ssize_t rc = 0; | 88 | ssize_t rc = 0; |
| 89 | unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES | 89 | unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES |
| 90 | / sizeof(struct pages *); | 90 | / sizeof(struct pages *); |
| 91 | unsigned int flags = FOLL_REMOTE; | 91 | unsigned int flags = 0; |
| 92 | 92 | ||
| 93 | /* Work out address and page range required */ | 93 | /* Work out address and page range required */ |
| 94 | if (len == 0) | 94 | if (len == 0) |
| @@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
| 100 | 100 | ||
| 101 | while (!rc && nr_pages && iov_iter_count(iter)) { | 101 | while (!rc && nr_pages && iov_iter_count(iter)) { |
| 102 | int pages = min(nr_pages, max_pages_per_loop); | 102 | int pages = min(nr_pages, max_pages_per_loop); |
| 103 | int locked = 1; | ||
| 103 | size_t bytes; | 104 | size_t bytes; |
| 104 | 105 | ||
| 105 | /* | 106 | /* |
| 106 | * Get the pages we're interested in. We must | 107 | * Get the pages we're interested in. We must |
| 107 | * add FOLL_REMOTE because task/mm might not | 108 | * access remotely because task/mm might not |
| 108 | * current/current->mm | 109 | * current/current->mm |
| 109 | */ | 110 | */ |
| 110 | pages = __get_user_pages_unlocked(task, mm, pa, pages, | 111 | down_read(&mm->mmap_sem); |
| 111 | process_pages, flags); | 112 | pages = get_user_pages_remote(task, mm, pa, pages, flags, |
| 113 | process_pages, NULL, &locked); | ||
| 114 | if (locked) | ||
| 115 | up_read(&mm->mmap_sem); | ||
| 112 | if (pages <= 0) | 116 | if (pages <= 0) |
| 113 | return -EFAULT; | 117 | return -EFAULT; |
| 114 | 118 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index abd7403aba41..54287d443806 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, | |||
| 661 | swapped++; | 661 | swapped++; |
| 662 | 662 | ||
| 663 | if (need_resched()) { | 663 | if (need_resched()) { |
| 664 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 664 | cond_resched_rcu(); | 665 | cond_resched_rcu(); |
| 665 | slot = radix_tree_iter_next(&iter); | ||
| 666 | } | 666 | } |
| 667 | } | 667 | } |
| 668 | 668 | ||
| @@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode) | |||
| 1049 | clear_inode(inode); | 1049 | clear_inode(inode); |
| 1050 | } | 1050 | } |
| 1051 | 1051 | ||
| 1052 | static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) | ||
| 1053 | { | ||
| 1054 | struct radix_tree_iter iter; | ||
| 1055 | void **slot; | ||
| 1056 | unsigned long found = -1; | ||
| 1057 | unsigned int checked = 0; | ||
| 1058 | |||
| 1059 | rcu_read_lock(); | ||
| 1060 | radix_tree_for_each_slot(slot, root, &iter, 0) { | ||
| 1061 | if (*slot == item) { | ||
| 1062 | found = iter.index; | ||
| 1063 | break; | ||
| 1064 | } | ||
| 1065 | checked++; | ||
| 1066 | if ((checked % 4096) != 0) | ||
| 1067 | continue; | ||
| 1068 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 1069 | cond_resched_rcu(); | ||
| 1070 | } | ||
| 1071 | |||
| 1072 | rcu_read_unlock(); | ||
| 1073 | return found; | ||
| 1074 | } | ||
| 1075 | |||
| 1052 | /* | 1076 | /* |
| 1053 | * If swap found in inode, free it and move page from swapcache to filecache. | 1077 | * If swap found in inode, free it and move page from swapcache to filecache. |
| 1054 | */ | 1078 | */ |
| @@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
| 1062 | int error = 0; | 1086 | int error = 0; |
| 1063 | 1087 | ||
| 1064 | radswap = swp_to_radix_entry(swap); | 1088 | radswap = swp_to_radix_entry(swap); |
| 1065 | index = radix_tree_locate_item(&mapping->page_tree, radswap); | 1089 | index = find_swap_entry(&mapping->page_tree, radswap); |
| 1066 | if (index == -1) | 1090 | if (index == -1) |
| 1067 | return -EAGAIN; /* tell shmem_unuse we found nothing */ | 1091 | return -EAGAIN; /* tell shmem_unuse we found nothing */ |
| 1068 | 1092 | ||
| @@ -2447,8 +2471,8 @@ static void shmem_tag_pins(struct address_space *mapping) | |||
| 2447 | } | 2471 | } |
| 2448 | 2472 | ||
| 2449 | if (need_resched()) { | 2473 | if (need_resched()) { |
| 2474 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 2450 | cond_resched_rcu(); | 2475 | cond_resched_rcu(); |
| 2451 | slot = radix_tree_iter_next(&iter); | ||
| 2452 | } | 2476 | } |
| 2453 | } | 2477 | } |
| 2454 | rcu_read_unlock(); | 2478 | rcu_read_unlock(); |
| @@ -2517,8 +2541,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) | |||
| 2517 | spin_unlock_irq(&mapping->tree_lock); | 2541 | spin_unlock_irq(&mapping->tree_lock); |
| 2518 | continue_resched: | 2542 | continue_resched: |
| 2519 | if (need_resched()) { | 2543 | if (need_resched()) { |
| 2544 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 2520 | cond_resched_rcu(); | 2545 | cond_resched_rcu(); |
| 2521 | slot = radix_tree_iter_next(&iter); | ||
| 2522 | } | 2546 | } |
| 2523 | } | 2547 | } |
| 2524 | rcu_read_unlock(); | 2548 | rcu_read_unlock(); |
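The shmem.c hunks above all converge on one idiom: radix_tree_iter_resume() replaces radix_tree_iter_next() and is called before the RCU read lock may be dropped, after which the walk continues from the next index. A compact sketch of the pattern on an already-populated tree (count_slots() is illustrative):

	#include <linux/radix-tree.h>
	#include <linux/sched.h>

	static unsigned long count_slots(struct radix_tree_root *root)
	{
		struct radix_tree_iter iter;
		void **slot;
		unsigned long count = 0;

		rcu_read_lock();
		radix_tree_for_each_slot(slot, root, &iter, 0) {
			count++;
			if (need_resched()) {
				slot = radix_tree_iter_resume(slot, &iter);
				cond_resched_rcu();	/* may drop and retake rcu_read_lock() */
			}
		}
		rcu_read_unlock();
		return count;
	}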
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2d59c9be40e1..5f63f6dcaabb 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c | |||
| @@ -762,16 +762,17 @@ static const struct net_proto_family rxrpc_family_ops = { | |||
| 762 | static int __init af_rxrpc_init(void) | 762 | static int __init af_rxrpc_init(void) |
| 763 | { | 763 | { |
| 764 | int ret = -1; | 764 | int ret = -1; |
| 765 | unsigned int tmp; | ||
| 765 | 766 | ||
| 766 | BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); | 767 | BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); |
| 767 | 768 | ||
| 768 | get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); | 769 | get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); |
| 769 | rxrpc_epoch |= RXRPC_RANDOM_EPOCH; | 770 | rxrpc_epoch |= RXRPC_RANDOM_EPOCH; |
| 770 | get_random_bytes(&rxrpc_client_conn_ids.cur, | 771 | get_random_bytes(&tmp, sizeof(tmp)); |
| 771 | sizeof(rxrpc_client_conn_ids.cur)); | 772 | tmp &= 0x3fffffff; |
| 772 | rxrpc_client_conn_ids.cur &= 0x3fffffff; | 773 | if (tmp == 0) |
| 773 | if (rxrpc_client_conn_ids.cur == 0) | 774 | tmp = 1; |
| 774 | rxrpc_client_conn_ids.cur = 1; | 775 | idr_set_cursor(&rxrpc_client_conn_ids, tmp); |
| 775 | 776 | ||
| 776 | ret = -ENOMEM; | 777 | ret = -ENOMEM; |
| 777 | rxrpc_call_jar = kmem_cache_create( | 778 | rxrpc_call_jar = kmem_cache_create( |
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 60ef9605167e..6cbcdcc29853 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c | |||
| @@ -263,12 +263,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) | |||
| 263 | * times the maximum number of client conns away from the current | 263 | * times the maximum number of client conns away from the current |
| 264 | * allocation point to try and keep the IDs concentrated. | 264 | * allocation point to try and keep the IDs concentrated. |
| 265 | */ | 265 | */ |
| 266 | id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); | 266 | id_cursor = idr_get_cursor(&rxrpc_client_conn_ids); |
| 267 | id = conn->proto.cid >> RXRPC_CIDSHIFT; | 267 | id = conn->proto.cid >> RXRPC_CIDSHIFT; |
| 268 | distance = id - id_cursor; | 268 | distance = id - id_cursor; |
| 269 | if (distance < 0) | 269 | if (distance < 0) |
| 270 | distance = -distance; | 270 | distance = -distance; |
| 271 | limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; | 271 | limit = max(rxrpc_max_client_connections * 4, 1024U); |
| 272 | if (distance > limit) | 272 | if (distance > limit) |
| 273 | goto mark_dont_reuse; | 273 | goto mark_dont_reuse; |
| 274 | 274 | ||
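Both rxrpc hunks stop touching rxrpc_client_conn_ids.cur directly and go through the idr cursor accessors instead. A minimal sketch of the pair on a cyclically-allocated IDR (the helper name and mask value mirror the patch but are illustrative here):

	#include <linux/idr.h>
	#include <linux/random.h>

	static DEFINE_IDR(conn_ids);

	static void seed_conn_ids(void)
	{
		unsigned int start;

		get_random_bytes(&start, sizeof(start));
		start &= 0x3fffffff;
		if (!start)
			start = 1;
		idr_set_cursor(&conn_ids, start);

		/* idr_get_cursor(&conn_ids) now returns start until an
		 * idr_alloc_cyclic() call advances it, which is what the
		 * conn_client.c distance check above relies on. */
	}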
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 682b73af7766..838ffa78cfda 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c | |||
| @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, | |||
| 881 | * the execve(). | 881 | * the execve(). |
| 882 | */ | 882 | */ |
| 883 | if (get_user_pages_remote(current, bprm->mm, pos, 1, | 883 | if (get_user_pages_remote(current, bprm->mm, pos, 1, |
| 884 | FOLL_FORCE, &page, NULL) <= 0) | 884 | FOLL_FORCE, &page, NULL, NULL) <= 0) |
| 885 | return false; | 885 | return false; |
| 886 | #else | 886 | #else |
| 887 | page = bprm->page[pos / PAGE_SIZE]; | 887 | page = bprm->page[pos / PAGE_SIZE]; |
diff --git a/tools/include/asm/bug.h b/tools/include/asm/bug.h index 9e5f4846967f..beda1a884b50 100644 --- a/tools/include/asm/bug.h +++ b/tools/include/asm/bug.h | |||
| @@ -12,6 +12,17 @@ | |||
| 12 | unlikely(__ret_warn_on); \ | 12 | unlikely(__ret_warn_on); \ |
| 13 | }) | 13 | }) |
| 14 | 14 | ||
| 15 | #define WARN_ON_ONCE(condition) ({ \ | ||
| 16 | static int __warned; \ | ||
| 17 | int __ret_warn_once = !!(condition); \ | ||
| 18 | \ | ||
| 19 | if (unlikely(__ret_warn_once && !__warned)) { \ | ||
| 20 | __warned = true; \ | ||
| 21 | WARN_ON(1); \ | ||
| 22 | } \ | ||
| 23 | unlikely(__ret_warn_once); \ | ||
| 24 | }) | ||
| 25 | |||
| 15 | #define WARN_ONCE(condition, format...) ({ \ | 26 | #define WARN_ONCE(condition, format...) ({ \ |
| 16 | static int __warned; \ | 27 | static int __warned; \ |
| 17 | int __ret_warn_once = !!(condition); \ | 28 | int __ret_warn_once = !!(condition); \ |
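Like its kernel counterpart, the WARN_ON_ONCE() added above warns only on the first trigger but still evaluates to the condition each time, so callers can branch on it directly. Illustrative usage (check_count() is not part of the patch):

	#include <asm/bug.h>

	static int check_count(int nr_allocated)
	{
		if (WARN_ON_ONCE(nr_allocated < 0))
			return -1;	/* warned once, but the branch is taken every time */
		return 0;
	}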
diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index 43c1c5021e4b..eef41d500e9e 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h | |||
| @@ -35,6 +35,32 @@ static inline void bitmap_zero(unsigned long *dst, int nbits) | |||
| 35 | } | 35 | } |
| 36 | } | 36 | } |
| 37 | 37 | ||
| 38 | static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) | ||
| 39 | { | ||
| 40 | unsigned int nlongs = BITS_TO_LONGS(nbits); | ||
| 41 | if (!small_const_nbits(nbits)) { | ||
| 42 | unsigned int len = (nlongs - 1) * sizeof(unsigned long); | ||
| 43 | memset(dst, 0xff, len); | ||
| 44 | } | ||
| 45 | dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); | ||
| 46 | } | ||
| 47 | |||
| 48 | static inline int bitmap_empty(const unsigned long *src, unsigned nbits) | ||
| 49 | { | ||
| 50 | if (small_const_nbits(nbits)) | ||
| 51 | return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); | ||
| 52 | |||
| 53 | return find_first_bit(src, nbits) == nbits; | ||
| 54 | } | ||
| 55 | |||
| 56 | static inline int bitmap_full(const unsigned long *src, unsigned int nbits) | ||
| 57 | { | ||
| 58 | if (small_const_nbits(nbits)) | ||
| 59 | return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); | ||
| 60 | |||
| 61 | return find_first_zero_bit(src, nbits) == nbits; | ||
| 62 | } | ||
| 63 | |||
| 38 | static inline int bitmap_weight(const unsigned long *src, int nbits) | 64 | static inline int bitmap_weight(const unsigned long *src, int nbits) |
| 39 | { | 65 | { |
| 40 | if (small_const_nbits(nbits)) | 66 | if (small_const_nbits(nbits)) |
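A quick sanity sketch for the three bitmap helpers added above, assuming BITS_TO_LONGS() and the find_*_bit() implementations provided elsewhere in this series' tools headers (the demo function and DEMO_BITS are illustrative):

	#include <assert.h>
	#include <linux/bitmap.h>

	#define DEMO_BITS 100

	static void bitmap_helpers_demo(void)
	{
		unsigned long map[BITS_TO_LONGS(DEMO_BITS)];

		bitmap_zero(map, DEMO_BITS);
		assert(bitmap_empty(map, DEMO_BITS));
		assert(!bitmap_full(map, DEMO_BITS));

		bitmap_fill(map, DEMO_BITS);	/* sets bits 0..DEMO_BITS-1 only */
		assert(bitmap_full(map, DEMO_BITS));
	}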
diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl index d08e214ec6e7..be93ab02b490 100755 --- a/tools/testing/ktest/ktest.pl +++ b/tools/testing/ktest/ktest.pl | |||
| @@ -719,14 +719,14 @@ sub set_value { | |||
| 719 | 719 | ||
| 720 | if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { | 720 | if ($buildonly && $lvalue =~ /^TEST_TYPE(\[.*\])?$/ && $prvalue ne "build") { |
| 721 | # Note if a test is something other than build, then we | 721 | # Note if a test is something other than build, then we |
| 722 | # will need other manditory options. | 722 | # will need other mandatory options. |
| 723 | if ($prvalue ne "install") { | 723 | if ($prvalue ne "install") { |
| 724 | # for bisect, we need to check BISECT_TYPE | 724 | # for bisect, we need to check BISECT_TYPE |
| 725 | if ($prvalue ne "bisect") { | 725 | if ($prvalue ne "bisect") { |
| 726 | $buildonly = 0; | 726 | $buildonly = 0; |
| 727 | } | 727 | } |
| 728 | } else { | 728 | } else { |
| 729 | # install still limits some manditory options. | 729 | # install still limits some mandatory options. |
| 730 | $buildonly = 2; | 730 | $buildonly = 2; |
| 731 | } | 731 | } |
| 732 | } | 732 | } |
| @@ -735,7 +735,7 @@ sub set_value { | |||
| 735 | if ($prvalue ne "install") { | 735 | if ($prvalue ne "install") { |
| 736 | $buildonly = 0; | 736 | $buildonly = 0; |
| 737 | } else { | 737 | } else { |
| 738 | # install still limits some manditory options. | 738 | # install still limits some mandatory options. |
| 739 | $buildonly = 2; | 739 | $buildonly = 2; |
| 740 | } | 740 | } |
| 741 | } | 741 | } |
| @@ -3989,7 +3989,7 @@ sub make_min_config { | |||
| 3989 | } | 3989 | } |
| 3990 | } | 3990 | } |
| 3991 | 3991 | ||
| 3992 | # Save off all the current mandidory configs | 3992 | # Save off all the current mandatory configs |
| 3993 | open (OUT, ">$temp_config") | 3993 | open (OUT, ">$temp_config") |
| 3994 | or die "Can't write to $temp_config"; | 3994 | or die "Can't write to $temp_config"; |
| 3995 | foreach my $config (keys %keep_configs) { | 3995 | foreach my $config (keys %keep_configs) { |
diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index f2e07f2fd4b4..3635e4d3eca7 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile | |||
| @@ -1,10 +1,14 @@ | |||
| 1 | 1 | ||
| 2 | CFLAGS += -I. -g -O2 -Wall -D_LGPL_SOURCE | 2 | CFLAGS += -I. -I../../include -g -O2 -Wall -D_LGPL_SOURCE |
| 3 | LDFLAGS += -lpthread -lurcu | 3 | LDFLAGS += -lpthread -lurcu |
| 4 | TARGETS = main | 4 | TARGETS = main |
| 5 | OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ | 5 | OFILES = main.o radix-tree.o linux.o test.o tag_check.o find_next_bit.o \ |
| 6 | regression1.o regression2.o regression3.o multiorder.o \ | 6 | regression1.o regression2.o regression3.o multiorder.o \ |
| 7 | iteration_check.o | 7 | iteration_check.o benchmark.o |
| 8 | |||
| 9 | ifdef BENCHMARK | ||
| 10 | CFLAGS += -DBENCHMARK=1 | ||
| 11 | endif | ||
| 8 | 12 | ||
| 9 | targets: $(TARGETS) | 13 | targets: $(TARGETS) |
| 10 | 14 | ||
| @@ -14,7 +18,12 @@ main: $(OFILES) | |||
| 14 | clean: | 18 | clean: |
| 15 | $(RM) -f $(TARGETS) *.o radix-tree.c | 19 | $(RM) -f $(TARGETS) *.o radix-tree.c |
| 16 | 20 | ||
| 17 | $(OFILES): *.h */*.h ../../../include/linux/radix-tree.h ../../include/linux/*.h | 21 | find_next_bit.o: ../../lib/find_bit.c |
| 22 | $(CC) $(CFLAGS) -c -o $@ $< | ||
| 23 | |||
| 24 | $(OFILES): *.h */*.h \ | ||
| 25 | ../../include/linux/*.h \ | ||
| 26 | ../../../include/linux/radix-tree.h | ||
| 18 | 27 | ||
| 19 | radix-tree.c: ../../../lib/radix-tree.c | 28 | radix-tree.c: ../../../lib/radix-tree.c |
| 20 | sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ | 29 | sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ |
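With the Makefile change above, the larger-node benchmark build is opt-in. Assuming GNU make in tools/testing/radix-tree, a typical invocation would be:

	make clean
	make BENCHMARK=1    # adds -DBENCHMARK=1, selecting RADIX_TREE_MAP_SHIFT 6 (kernel.h hunk below)
	./main -s 12345     # -s pins the random seed added in main.c below; the value is arbitrary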
diff --git a/tools/testing/radix-tree/benchmark.c b/tools/testing/radix-tree/benchmark.c new file mode 100644 index 000000000000..215ca86c7605 --- /dev/null +++ b/tools/testing/radix-tree/benchmark.c | |||
| @@ -0,0 +1,98 @@ | |||
| 1 | /* | ||
| 2 | * benchmark.c: | ||
| 3 | * Author: Konstantin Khlebnikov <koct9i@gmail.com> | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify it | ||
| 6 | * under the terms and conditions of the GNU General Public License, | ||
| 7 | * version 2, as published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
| 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
| 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
| 12 | * more details. | ||
| 13 | */ | ||
| 14 | #include <linux/radix-tree.h> | ||
| 15 | #include <linux/slab.h> | ||
| 16 | #include <linux/errno.h> | ||
| 17 | #include <time.h> | ||
| 18 | #include "test.h" | ||
| 19 | |||
| 20 | #define NSEC_PER_SEC 1000000000L | ||
| 21 | |||
| 22 | static long long benchmark_iter(struct radix_tree_root *root, bool tagged) | ||
| 23 | { | ||
| 24 | volatile unsigned long sink = 0; | ||
| 25 | struct radix_tree_iter iter; | ||
| 26 | struct timespec start, finish; | ||
| 27 | long long nsec; | ||
| 28 | int l, loops = 1; | ||
| 29 | void **slot; | ||
| 30 | |||
| 31 | #ifdef BENCHMARK | ||
| 32 | again: | ||
| 33 | #endif | ||
| 34 | clock_gettime(CLOCK_MONOTONIC, &start); | ||
| 35 | for (l = 0; l < loops; l++) { | ||
| 36 | if (tagged) { | ||
| 37 | radix_tree_for_each_tagged(slot, root, &iter, 0, 0) | ||
| 38 | sink ^= (unsigned long)slot; | ||
| 39 | } else { | ||
| 40 | radix_tree_for_each_slot(slot, root, &iter, 0) | ||
| 41 | sink ^= (unsigned long)slot; | ||
| 42 | } | ||
| 43 | } | ||
| 44 | clock_gettime(CLOCK_MONOTONIC, &finish); | ||
| 45 | |||
| 46 | nsec = (finish.tv_sec - start.tv_sec) * NSEC_PER_SEC + | ||
| 47 | (finish.tv_nsec - start.tv_nsec); | ||
| 48 | |||
| 49 | #ifdef BENCHMARK | ||
| 50 | if (loops == 1 && nsec * 5 < NSEC_PER_SEC) { | ||
| 51 | loops = NSEC_PER_SEC / nsec / 4 + 1; | ||
| 52 | goto again; | ||
| 53 | } | ||
| 54 | #endif | ||
| 55 | |||
| 56 | nsec /= loops; | ||
| 57 | return nsec; | ||
| 58 | } | ||
| 59 | |||
| 60 | static void benchmark_size(unsigned long size, unsigned long step, int order) | ||
| 61 | { | ||
| 62 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 63 | long long normal, tagged; | ||
| 64 | unsigned long index; | ||
| 65 | |||
| 66 | for (index = 0 ; index < size ; index += step) { | ||
| 67 | item_insert_order(&tree, index, order); | ||
| 68 | radix_tree_tag_set(&tree, index, 0); | ||
| 69 | } | ||
| 70 | |||
| 71 | tagged = benchmark_iter(&tree, true); | ||
| 72 | normal = benchmark_iter(&tree, false); | ||
| 73 | |||
| 74 | printf("Size %ld, step %6ld, order %d tagged %10lld ns, normal %10lld ns\n", | ||
| 75 | size, step, order, tagged, normal); | ||
| 76 | |||
| 77 | item_kill_tree(&tree); | ||
| 78 | rcu_barrier(); | ||
| 79 | } | ||
| 80 | |||
| 81 | void benchmark(void) | ||
| 82 | { | ||
| 83 | unsigned long size[] = {1 << 10, 1 << 20, 0}; | ||
| 84 | unsigned long step[] = {1, 2, 7, 15, 63, 64, 65, | ||
| 85 | 128, 256, 512, 12345, 0}; | ||
| 86 | int c, s; | ||
| 87 | |||
| 88 | printf("starting benchmarks\n"); | ||
| 89 | printf("RADIX_TREE_MAP_SHIFT = %d\n", RADIX_TREE_MAP_SHIFT); | ||
| 90 | |||
| 91 | for (c = 0; size[c]; c++) | ||
| 92 | for (s = 0; step[s]; s++) | ||
| 93 | benchmark_size(size[c], step[s], 0); | ||
| 94 | |||
| 95 | for (c = 0; size[c]; c++) | ||
| 96 | for (s = 0; step[s]; s++) | ||
| 97 | benchmark_size(size[c], step[s] << 9, 9); | ||
| 98 | } | ||
diff --git a/tools/testing/radix-tree/find_next_bit.c b/tools/testing/radix-tree/find_next_bit.c deleted file mode 100644 index d1c2178bb2d4..000000000000 --- a/tools/testing/radix-tree/find_next_bit.c +++ /dev/null | |||
| @@ -1,57 +0,0 @@ | |||
| 1 | /* find_next_bit.c: fallback find next bit implementation | ||
| 2 | * | ||
| 3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | ||
| 4 | * Written by David Howells (dhowells@redhat.com) | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public License | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the License, or (at your option) any later version. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/types.h> | ||
| 13 | #include <linux/bitops.h> | ||
| 14 | |||
| 15 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) | ||
| 16 | |||
| 17 | /* | ||
| 18 | * Find the next set bit in a memory region. | ||
| 19 | */ | ||
| 20 | unsigned long find_next_bit(const unsigned long *addr, unsigned long size, | ||
| 21 | unsigned long offset) | ||
| 22 | { | ||
| 23 | const unsigned long *p = addr + BITOP_WORD(offset); | ||
| 24 | unsigned long result = offset & ~(BITS_PER_LONG-1); | ||
| 25 | unsigned long tmp; | ||
| 26 | |||
| 27 | if (offset >= size) | ||
| 28 | return size; | ||
| 29 | size -= result; | ||
| 30 | offset %= BITS_PER_LONG; | ||
| 31 | if (offset) { | ||
| 32 | tmp = *(p++); | ||
| 33 | tmp &= (~0UL << offset); | ||
| 34 | if (size < BITS_PER_LONG) | ||
| 35 | goto found_first; | ||
| 36 | if (tmp) | ||
| 37 | goto found_middle; | ||
| 38 | size -= BITS_PER_LONG; | ||
| 39 | result += BITS_PER_LONG; | ||
| 40 | } | ||
| 41 | while (size & ~(BITS_PER_LONG-1)) { | ||
| 42 | if ((tmp = *(p++))) | ||
| 43 | goto found_middle; | ||
| 44 | result += BITS_PER_LONG; | ||
| 45 | size -= BITS_PER_LONG; | ||
| 46 | } | ||
| 47 | if (!size) | ||
| 48 | return result; | ||
| 49 | tmp = *p; | ||
| 50 | |||
| 51 | found_first: | ||
| 52 | tmp &= (~0UL >> (BITS_PER_LONG - size)); | ||
| 53 | if (tmp == 0UL) /* Are any bits set? */ | ||
| 54 | return result + size; /* Nope. */ | ||
| 55 | found_middle: | ||
| 56 | return result + __ffs(tmp); | ||
| 57 | } | ||
diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c index 9adb8e7415a6..7572b7ed930e 100644 --- a/tools/testing/radix-tree/iteration_check.c +++ b/tools/testing/radix-tree/iteration_check.c | |||
| @@ -16,35 +16,50 @@ | |||
| 16 | #include <pthread.h> | 16 | #include <pthread.h> |
| 17 | #include "test.h" | 17 | #include "test.h" |
| 18 | 18 | ||
| 19 | #define NUM_THREADS 4 | 19 | #define NUM_THREADS 5 |
| 20 | #define TAG 0 | 20 | #define MAX_IDX 100 |
| 21 | #define TAG 0 | ||
| 22 | #define NEW_TAG 1 | ||
| 23 | |||
| 21 | static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; | 24 | static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER; |
| 22 | static pthread_t threads[NUM_THREADS]; | 25 | static pthread_t threads[NUM_THREADS]; |
| 23 | RADIX_TREE(tree, GFP_KERNEL); | 26 | static unsigned int seeds[3]; |
| 24 | bool test_complete; | 27 | static RADIX_TREE(tree, GFP_KERNEL); |
| 28 | static bool test_complete; | ||
| 29 | static int max_order; | ||
| 25 | 30 | ||
| 26 | /* relentlessly fill the tree with tagged entries */ | 31 | /* relentlessly fill the tree with tagged entries */ |
| 27 | static void *add_entries_fn(void *arg) | 32 | static void *add_entries_fn(void *arg) |
| 28 | { | 33 | { |
| 29 | int pgoff; | 34 | rcu_register_thread(); |
| 30 | 35 | ||
| 31 | while (!test_complete) { | 36 | while (!test_complete) { |
| 32 | for (pgoff = 0; pgoff < 100; pgoff++) { | 37 | unsigned long pgoff; |
| 38 | int order; | ||
| 39 | |||
| 40 | for (pgoff = 0; pgoff < MAX_IDX; pgoff++) { | ||
| 33 | pthread_mutex_lock(&tree_lock); | 41 | pthread_mutex_lock(&tree_lock); |
| 34 | if (item_insert(&tree, pgoff) == 0) | 42 | for (order = max_order; order >= 0; order--) { |
| 35 | item_tag_set(&tree, pgoff, TAG); | 43 | if (item_insert_order(&tree, pgoff, order) |
| 44 | == 0) { | ||
| 45 | item_tag_set(&tree, pgoff, TAG); | ||
| 46 | break; | ||
| 47 | } | ||
| 48 | } | ||
| 36 | pthread_mutex_unlock(&tree_lock); | 49 | pthread_mutex_unlock(&tree_lock); |
| 37 | } | 50 | } |
| 38 | } | 51 | } |
| 39 | 52 | ||
| 53 | rcu_unregister_thread(); | ||
| 54 | |||
| 40 | return NULL; | 55 | return NULL; |
| 41 | } | 56 | } |
| 42 | 57 | ||
| 43 | /* | 58 | /* |
| 44 | * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find | 59 | * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find |
| 45 | * things that have been removed and randomly resetting our iteration to the | 60 | * things that have been removed and randomly resetting our iteration to the |
| 46 | * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and | 61 | * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and |
| 47 | * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a | 62 | * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a |
| 48 | * NULL 'slot' variable. | 63 | * NULL 'slot' variable. |
| 49 | */ | 64 | */ |
| 50 | static void *tagged_iteration_fn(void *arg) | 65 | static void *tagged_iteration_fn(void *arg) |
| @@ -52,17 +67,12 @@ static void *tagged_iteration_fn(void *arg) | |||
| 52 | struct radix_tree_iter iter; | 67 | struct radix_tree_iter iter; |
| 53 | void **slot; | 68 | void **slot; |
| 54 | 69 | ||
| 70 | rcu_register_thread(); | ||
| 71 | |||
| 55 | while (!test_complete) { | 72 | while (!test_complete) { |
| 56 | rcu_read_lock(); | 73 | rcu_read_lock(); |
| 57 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { | 74 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, TAG) { |
| 58 | void *entry; | 75 | void *entry = radix_tree_deref_slot(slot); |
| 59 | int i; | ||
| 60 | |||
| 61 | /* busy wait to let removals happen */ | ||
| 62 | for (i = 0; i < 1000000; i++) | ||
| 63 | ; | ||
| 64 | |||
| 65 | entry = radix_tree_deref_slot(slot); | ||
| 66 | if (unlikely(!entry)) | 76 | if (unlikely(!entry)) |
| 67 | continue; | 77 | continue; |
| 68 | 78 | ||
| @@ -71,20 +81,26 @@ static void *tagged_iteration_fn(void *arg) | |||
| 71 | continue; | 81 | continue; |
| 72 | } | 82 | } |
| 73 | 83 | ||
| 74 | if (rand() % 50 == 0) | 84 | if (rand_r(&seeds[0]) % 50 == 0) { |
| 75 | slot = radix_tree_iter_next(&iter); | 85 | slot = radix_tree_iter_resume(slot, &iter); |
| 86 | rcu_read_unlock(); | ||
| 87 | rcu_barrier(); | ||
| 88 | rcu_read_lock(); | ||
| 89 | } | ||
| 76 | } | 90 | } |
| 77 | rcu_read_unlock(); | 91 | rcu_read_unlock(); |
| 78 | } | 92 | } |
| 79 | 93 | ||
| 94 | rcu_unregister_thread(); | ||
| 95 | |||
| 80 | return NULL; | 96 | return NULL; |
| 81 | } | 97 | } |
| 82 | 98 | ||
| 83 | /* | 99 | /* |
| 84 | * Iterate over the entries, doing a radix_tree_iter_retry() as we find things | 100 | * Iterate over the entries, doing a radix_tree_iter_retry() as we find things |
| 85 | * that have been removed and randomly resetting our iteration to the next | 101 | * that have been removed and randomly resetting our iteration to the next |
| 86 | * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and | 102 | * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and |
| 87 | * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a | 103 | * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a |
| 88 | * NULL 'slot' variable. | 104 | * NULL 'slot' variable. |
| 89 | */ | 105 | */ |
| 90 | static void *untagged_iteration_fn(void *arg) | 106 | static void *untagged_iteration_fn(void *arg) |
| @@ -92,17 +108,12 @@ static void *untagged_iteration_fn(void *arg) | |||
| 92 | struct radix_tree_iter iter; | 108 | struct radix_tree_iter iter; |
| 93 | void **slot; | 109 | void **slot; |
| 94 | 110 | ||
| 111 | rcu_register_thread(); | ||
| 112 | |||
| 95 | while (!test_complete) { | 113 | while (!test_complete) { |
| 96 | rcu_read_lock(); | 114 | rcu_read_lock(); |
| 97 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | 115 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { |
| 98 | void *entry; | 116 | void *entry = radix_tree_deref_slot(slot); |
| 99 | int i; | ||
| 100 | |||
| 101 | /* busy wait to let removals happen */ | ||
| 102 | for (i = 0; i < 1000000; i++) | ||
| 103 | ; | ||
| 104 | |||
| 105 | entry = radix_tree_deref_slot(slot); | ||
| 106 | if (unlikely(!entry)) | 117 | if (unlikely(!entry)) |
| 107 | continue; | 118 | continue; |
| 108 | 119 | ||
| @@ -111,12 +122,18 @@ static void *untagged_iteration_fn(void *arg) | |||
| 111 | continue; | 122 | continue; |
| 112 | } | 123 | } |
| 113 | 124 | ||
| 114 | if (rand() % 50 == 0) | 125 | if (rand_r(&seeds[1]) % 50 == 0) { |
| 115 | slot = radix_tree_iter_next(&iter); | 126 | slot = radix_tree_iter_resume(slot, &iter); |
| 127 | rcu_read_unlock(); | ||
| 128 | rcu_barrier(); | ||
| 129 | rcu_read_lock(); | ||
| 130 | } | ||
| 116 | } | 131 | } |
| 117 | rcu_read_unlock(); | 132 | rcu_read_unlock(); |
| 118 | } | 133 | } |
| 119 | 134 | ||
| 135 | rcu_unregister_thread(); | ||
| 136 | |||
| 120 | return NULL; | 137 | return NULL; |
| 121 | } | 138 | } |
| 122 | 139 | ||
| @@ -126,47 +143,71 @@ static void *untagged_iteration_fn(void *arg) | |||
| 126 | */ | 143 | */ |
| 127 | static void *remove_entries_fn(void *arg) | 144 | static void *remove_entries_fn(void *arg) |
| 128 | { | 145 | { |
| 146 | rcu_register_thread(); | ||
| 147 | |||
| 129 | while (!test_complete) { | 148 | while (!test_complete) { |
| 130 | int pgoff; | 149 | int pgoff; |
| 131 | 150 | ||
| 132 | pgoff = rand() % 100; | 151 | pgoff = rand_r(&seeds[2]) % MAX_IDX; |
| 133 | 152 | ||
| 134 | pthread_mutex_lock(&tree_lock); | 153 | pthread_mutex_lock(&tree_lock); |
| 135 | item_delete(&tree, pgoff); | 154 | item_delete(&tree, pgoff); |
| 136 | pthread_mutex_unlock(&tree_lock); | 155 | pthread_mutex_unlock(&tree_lock); |
| 137 | } | 156 | } |
| 138 | 157 | ||
| 158 | rcu_unregister_thread(); | ||
| 159 | |||
| 160 | return NULL; | ||
| 161 | } | ||
| 162 | |||
| 163 | static void *tag_entries_fn(void *arg) | ||
| 164 | { | ||
| 165 | rcu_register_thread(); | ||
| 166 | |||
| 167 | while (!test_complete) { | ||
| 168 | tag_tagged_items(&tree, &tree_lock, 0, MAX_IDX, 10, TAG, | ||
| 169 | NEW_TAG); | ||
| 170 | } | ||
| 171 | rcu_unregister_thread(); | ||
| 139 | return NULL; | 172 | return NULL; |
| 140 | } | 173 | } |
| 141 | 174 | ||
| 142 | /* This is a unit test for a bug found by the syzkaller tester */ | 175 | /* This is a unit test for a bug found by the syzkaller tester */ |
| 143 | void iteration_test(void) | 176 | void iteration_test(unsigned order, unsigned test_duration) |
| 144 | { | 177 | { |
| 145 | int i; | 178 | int i; |
| 146 | 179 | ||
| 147 | printf("Running iteration tests for 10 seconds\n"); | 180 | printf("Running %siteration tests for %d seconds\n", |
| 181 | order > 0 ? "multiorder " : "", test_duration); | ||
| 148 | 182 | ||
| 149 | srand(time(0)); | 183 | max_order = order; |
| 150 | test_complete = false; | 184 | test_complete = false; |
| 151 | 185 | ||
| 186 | for (i = 0; i < 3; i++) | ||
| 187 | seeds[i] = rand(); | ||
| 188 | |||
| 152 | if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { | 189 | if (pthread_create(&threads[0], NULL, tagged_iteration_fn, NULL)) { |
| 153 | perror("pthread_create"); | 190 | perror("create tagged iteration thread"); |
| 154 | exit(1); | 191 | exit(1); |
| 155 | } | 192 | } |
| 156 | if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { | 193 | if (pthread_create(&threads[1], NULL, untagged_iteration_fn, NULL)) { |
| 157 | perror("pthread_create"); | 194 | perror("create untagged iteration thread"); |
| 158 | exit(1); | 195 | exit(1); |
| 159 | } | 196 | } |
| 160 | if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { | 197 | if (pthread_create(&threads[2], NULL, add_entries_fn, NULL)) { |
| 161 | perror("pthread_create"); | 198 | perror("create add entry thread"); |
| 162 | exit(1); | 199 | exit(1); |
| 163 | } | 200 | } |
| 164 | if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { | 201 | if (pthread_create(&threads[3], NULL, remove_entries_fn, NULL)) { |
| 165 | perror("pthread_create"); | 202 | perror("create remove entry thread"); |
| 203 | exit(1); | ||
| 204 | } | ||
| 205 | if (pthread_create(&threads[4], NULL, tag_entries_fn, NULL)) { | ||
| 206 | perror("create tag entry thread"); | ||
| 166 | exit(1); | 207 | exit(1); |
| 167 | } | 208 | } |
| 168 | 209 | ||
| 169 | sleep(10); | 210 | sleep(test_duration); |
| 170 | test_complete = true; | 211 | test_complete = true; |
| 171 | 212 | ||
| 172 | for (i = 0; i < NUM_THREADS; i++) { | 213 | for (i = 0; i < NUM_THREADS; i++) { |
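The iteration_check.c changes register every worker thread with liburcu before it enters a read-side section; calling rcu_read_lock() from an unregistered pthread is not allowed. A stripped-down sketch of the per-thread pattern used above (rcu_worker() is illustrative):

	#include <pthread.h>
	#include <urcu.h>

	static void *rcu_worker(void *arg)
	{
		rcu_register_thread();		/* must precede any rcu_read_lock() */

		rcu_read_lock();
		/* ... read-side work on the shared tree ... */
		rcu_read_unlock();

		rcu_unregister_thread();	/* before the thread exits */
		return NULL;
	}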
diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index 154823737b20..d31ea7c9abec 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c | |||
| @@ -1,14 +1,26 @@ | |||
| 1 | #include <stdlib.h> | 1 | #include <stdlib.h> |
| 2 | #include <string.h> | 2 | #include <string.h> |
| 3 | #include <malloc.h> | 3 | #include <malloc.h> |
| 4 | #include <pthread.h> | ||
| 4 | #include <unistd.h> | 5 | #include <unistd.h> |
| 5 | #include <assert.h> | 6 | #include <assert.h> |
| 6 | 7 | ||
| 7 | #include <linux/mempool.h> | 8 | #include <linux/mempool.h> |
| 9 | #include <linux/poison.h> | ||
| 8 | #include <linux/slab.h> | 10 | #include <linux/slab.h> |
| 11 | #include <linux/radix-tree.h> | ||
| 9 | #include <urcu/uatomic.h> | 12 | #include <urcu/uatomic.h> |
| 10 | 13 | ||
| 11 | int nr_allocated; | 14 | int nr_allocated; |
| 15 | int preempt_count; | ||
| 16 | |||
| 17 | struct kmem_cache { | ||
| 18 | pthread_mutex_t lock; | ||
| 19 | int size; | ||
| 20 | int nr_objs; | ||
| 21 | void *objs; | ||
| 22 | void (*ctor)(void *); | ||
| 23 | }; | ||
| 12 | 24 | ||
| 13 | void *mempool_alloc(mempool_t *pool, int gfp_mask) | 25 | void *mempool_alloc(mempool_t *pool, int gfp_mask) |
| 14 | { | 26 | { |
| @@ -33,19 +45,59 @@ mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, | |||
| 33 | 45 | ||
| 34 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) | 46 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags) |
| 35 | { | 47 | { |
| 36 | void *ret = malloc(cachep->size); | 48 | struct radix_tree_node *node; |
| 37 | if (cachep->ctor) | 49 | |
| 38 | cachep->ctor(ret); | 50 | if (flags & __GFP_NOWARN) |
| 51 | return NULL; | ||
| 52 | |||
| 53 | pthread_mutex_lock(&cachep->lock); | ||
| 54 | if (cachep->nr_objs) { | ||
| 55 | cachep->nr_objs--; | ||
| 56 | node = cachep->objs; | ||
| 57 | cachep->objs = node->private_data; | ||
| 58 | pthread_mutex_unlock(&cachep->lock); | ||
| 59 | node->private_data = NULL; | ||
| 60 | } else { | ||
| 61 | pthread_mutex_unlock(&cachep->lock); | ||
| 62 | node = malloc(cachep->size); | ||
| 63 | if (cachep->ctor) | ||
| 64 | cachep->ctor(node); | ||
| 65 | } | ||
| 66 | |||
| 39 | uatomic_inc(&nr_allocated); | 67 | uatomic_inc(&nr_allocated); |
| 40 | return ret; | 68 | return node; |
| 41 | } | 69 | } |
| 42 | 70 | ||
| 43 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) | 71 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) |
| 44 | { | 72 | { |
| 45 | assert(objp); | 73 | assert(objp); |
| 46 | uatomic_dec(&nr_allocated); | 74 | uatomic_dec(&nr_allocated); |
| 47 | memset(objp, 0, cachep->size); | 75 | pthread_mutex_lock(&cachep->lock); |
| 48 | free(objp); | 76 | if (cachep->nr_objs > 10) { |
| 77 | memset(objp, POISON_FREE, cachep->size); | ||
| 78 | free(objp); | ||
| 79 | } else { | ||
| 80 | struct radix_tree_node *node = objp; | ||
| 81 | cachep->nr_objs++; | ||
| 82 | node->private_data = cachep->objs; | ||
| 83 | cachep->objs = node; | ||
| 84 | } | ||
| 85 | pthread_mutex_unlock(&cachep->lock); | ||
| 86 | } | ||
| 87 | |||
| 88 | void *kmalloc(size_t size, gfp_t gfp) | ||
| 89 | { | ||
| 90 | void *ret = malloc(size); | ||
| 91 | uatomic_inc(&nr_allocated); | ||
| 92 | return ret; | ||
| 93 | } | ||
| 94 | |||
| 95 | void kfree(void *p) | ||
| 96 | { | ||
| 97 | if (!p) | ||
| 98 | return; | ||
| 99 | uatomic_dec(&nr_allocated); | ||
| 100 | free(p); | ||
| 49 | } | 101 | } |
| 50 | 102 | ||
| 51 | struct kmem_cache * | 103 | struct kmem_cache * |
| @@ -54,7 +106,10 @@ kmem_cache_create(const char *name, size_t size, size_t offset, | |||
| 54 | { | 106 | { |
| 55 | struct kmem_cache *ret = malloc(sizeof(*ret)); | 107 | struct kmem_cache *ret = malloc(sizeof(*ret)); |
| 56 | 108 | ||
| 109 | pthread_mutex_init(&ret->lock, NULL); | ||
| 57 | ret->size = size; | 110 | ret->size = size; |
| 111 | ret->nr_objs = 0; | ||
| 112 | ret->objs = NULL; | ||
| 58 | ret->ctor = ctor; | 113 | ret->ctor = ctor; |
| 59 | return ret; | 114 | return ret; |
| 60 | } | 115 | } |
diff --git a/tools/testing/radix-tree/linux/bitops.h b/tools/testing/radix-tree/linux/bitops.h index 71d58427ab60..a13e9bc76eec 100644 --- a/tools/testing/radix-tree/linux/bitops.h +++ b/tools/testing/radix-tree/linux/bitops.h | |||
| @@ -2,9 +2,14 @@ | |||
| 2 | #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ | 2 | #define _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ |
| 3 | 3 | ||
| 4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
| 5 | #include <linux/bitops/find.h> | ||
| 6 | #include <linux/bitops/hweight.h> | ||
| 7 | #include <linux/kernel.h> | ||
| 5 | 8 | ||
| 6 | #define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) | 9 | #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) |
| 7 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) | 10 | #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) |
| 11 | #define BITS_PER_BYTE 8 | ||
| 12 | #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) | ||
| 8 | 13 | ||
| 9 | /** | 14 | /** |
| 10 | * __set_bit - Set a bit in memory | 15 | * __set_bit - Set a bit in memory |
| @@ -17,16 +22,16 @@ | |||
| 17 | */ | 22 | */ |
| 18 | static inline void __set_bit(int nr, volatile unsigned long *addr) | 23 | static inline void __set_bit(int nr, volatile unsigned long *addr) |
| 19 | { | 24 | { |
| 20 | unsigned long mask = BITOP_MASK(nr); | 25 | unsigned long mask = BIT_MASK(nr); |
| 21 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 26 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
| 22 | 27 | ||
| 23 | *p |= mask; | 28 | *p |= mask; |
| 24 | } | 29 | } |
| 25 | 30 | ||
| 26 | static inline void __clear_bit(int nr, volatile unsigned long *addr) | 31 | static inline void __clear_bit(int nr, volatile unsigned long *addr) |
| 27 | { | 32 | { |
| 28 | unsigned long mask = BITOP_MASK(nr); | 33 | unsigned long mask = BIT_MASK(nr); |
| 29 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 34 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
| 30 | 35 | ||
| 31 | *p &= ~mask; | 36 | *p &= ~mask; |
| 32 | } | 37 | } |
| @@ -42,8 +47,8 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) | |||
| 42 | */ | 47 | */ |
| 43 | static inline void __change_bit(int nr, volatile unsigned long *addr) | 48 | static inline void __change_bit(int nr, volatile unsigned long *addr) |
| 44 | { | 49 | { |
| 45 | unsigned long mask = BITOP_MASK(nr); | 50 | unsigned long mask = BIT_MASK(nr); |
| 46 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 51 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
| 47 | 52 | ||
| 48 | *p ^= mask; | 53 | *p ^= mask; |
| 49 | } | 54 | } |
| @@ -59,8 +64,8 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) | |||
| 59 | */ | 64 | */ |
| 60 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | 65 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) |
| 61 | { | 66 | { |
| 62 | unsigned long mask = BITOP_MASK(nr); | 67 | unsigned long mask = BIT_MASK(nr); |
| 63 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 68 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
| 64 | unsigned long old = *p; | 69 | unsigned long old = *p; |
| 65 | 70 | ||
| 66 | *p = old | mask; | 71 | *p = old | mask; |
| @@ -78,8 +83,8 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | |||
| 78 | */ | 83 | */ |
| 79 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | 84 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) |
| 80 | { | 85 | { |
| 81 | unsigned long mask = BITOP_MASK(nr); | 86 | unsigned long mask = BIT_MASK(nr); |
| 82 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 87 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
| 83 | unsigned long old = *p; | 88 | unsigned long old = *p; |
| 84 | 89 | ||
| 85 | *p = old & ~mask; | 90 | *p = old & ~mask; |
| @@ -90,8 +95,8 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | |||
| 90 | static inline int __test_and_change_bit(int nr, | 95 | static inline int __test_and_change_bit(int nr, |
| 91 | volatile unsigned long *addr) | 96 | volatile unsigned long *addr) |
| 92 | { | 97 | { |
| 93 | unsigned long mask = BITOP_MASK(nr); | 98 | unsigned long mask = BIT_MASK(nr); |
| 94 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 99 | unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); |
| 95 | unsigned long old = *p; | 100 | unsigned long old = *p; |
| 96 | 101 | ||
| 97 | *p = old ^ mask; | 102 | *p = old ^ mask; |
| @@ -105,7 +110,7 @@ static inline int __test_and_change_bit(int nr, | |||
| 105 | */ | 110 | */ |
| 106 | static inline int test_bit(int nr, const volatile unsigned long *addr) | 111 | static inline int test_bit(int nr, const volatile unsigned long *addr) |
| 107 | { | 112 | { |
| 108 | return 1UL & (addr[BITOP_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); | 113 | return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1))); |
| 109 | } | 114 | } |
| 110 | 115 | ||
| 111 | /** | 116 | /** |
| @@ -147,4 +152,9 @@ unsigned long find_next_bit(const unsigned long *addr, | |||
| 147 | unsigned long size, | 152 | unsigned long size, |
| 148 | unsigned long offset); | 153 | unsigned long offset); |
| 149 | 154 | ||
| 155 | static inline unsigned long hweight_long(unsigned long w) | ||
| 156 | { | ||
| 157 | return sizeof(w) == 4 ? hweight32(w) : hweight64(w); | ||
| 158 | } | ||
| 159 | |||
| 150 | #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ | 160 | #endif /* _ASM_GENERIC_BITOPS_NON_ATOMIC_H_ */ |
diff --git a/tools/testing/radix-tree/linux/bitops/non-atomic.h b/tools/testing/radix-tree/linux/bitops/non-atomic.h index 46a825cf2ae1..6a1bcb9d2c4a 100644 --- a/tools/testing/radix-tree/linux/bitops/non-atomic.h +++ b/tools/testing/radix-tree/linux/bitops/non-atomic.h | |||
| @@ -3,7 +3,6 @@ | |||
| 3 | 3 | ||
| 4 | #include <asm/types.h> | 4 | #include <asm/types.h> |
| 5 | 5 | ||
| 6 | #define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) | ||
| 7 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) | 6 | #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) |
| 8 | 7 | ||
| 9 | /** | 8 | /** |
| @@ -17,7 +16,7 @@ | |||
| 17 | */ | 16 | */ |
| 18 | static inline void __set_bit(int nr, volatile unsigned long *addr) | 17 | static inline void __set_bit(int nr, volatile unsigned long *addr) |
| 19 | { | 18 | { |
| 20 | unsigned long mask = BITOP_MASK(nr); | 19 | unsigned long mask = BIT_MASK(nr); |
| 21 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 20 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
| 22 | 21 | ||
| 23 | *p |= mask; | 22 | *p |= mask; |
| @@ -25,7 +24,7 @@ static inline void __set_bit(int nr, volatile unsigned long *addr) | |||
| 25 | 24 | ||
| 26 | static inline void __clear_bit(int nr, volatile unsigned long *addr) | 25 | static inline void __clear_bit(int nr, volatile unsigned long *addr) |
| 27 | { | 26 | { |
| 28 | unsigned long mask = BITOP_MASK(nr); | 27 | unsigned long mask = BIT_MASK(nr); |
| 29 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 28 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
| 30 | 29 | ||
| 31 | *p &= ~mask; | 30 | *p &= ~mask; |
| @@ -42,7 +41,7 @@ static inline void __clear_bit(int nr, volatile unsigned long *addr) | |||
| 42 | */ | 41 | */ |
| 43 | static inline void __change_bit(int nr, volatile unsigned long *addr) | 42 | static inline void __change_bit(int nr, volatile unsigned long *addr) |
| 44 | { | 43 | { |
| 45 | unsigned long mask = BITOP_MASK(nr); | 44 | unsigned long mask = BIT_MASK(nr); |
| 46 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 45 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
| 47 | 46 | ||
| 48 | *p ^= mask; | 47 | *p ^= mask; |
| @@ -59,7 +58,7 @@ static inline void __change_bit(int nr, volatile unsigned long *addr) | |||
| 59 | */ | 58 | */ |
| 60 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | 59 | static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) |
| 61 | { | 60 | { |
| 62 | unsigned long mask = BITOP_MASK(nr); | 61 | unsigned long mask = BIT_MASK(nr); |
| 63 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 62 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
| 64 | unsigned long old = *p; | 63 | unsigned long old = *p; |
| 65 | 64 | ||
| @@ -78,7 +77,7 @@ static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) | |||
| 78 | */ | 77 | */ |
| 79 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | 78 | static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) |
| 80 | { | 79 | { |
| 81 | unsigned long mask = BITOP_MASK(nr); | 80 | unsigned long mask = BIT_MASK(nr); |
| 82 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 81 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
| 83 | unsigned long old = *p; | 82 | unsigned long old = *p; |
| 84 | 83 | ||
| @@ -90,7 +89,7 @@ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) | |||
| 90 | static inline int __test_and_change_bit(int nr, | 89 | static inline int __test_and_change_bit(int nr, |
| 91 | volatile unsigned long *addr) | 90 | volatile unsigned long *addr) |
| 92 | { | 91 | { |
| 93 | unsigned long mask = BITOP_MASK(nr); | 92 | unsigned long mask = BIT_MASK(nr); |
| 94 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); | 93 | unsigned long *p = ((unsigned long *)addr) + BITOP_WORD(nr); |
| 95 | unsigned long old = *p; | 94 | unsigned long old = *p; |
| 96 | 95 | ||
diff --git a/tools/testing/radix-tree/linux/bug.h b/tools/testing/radix-tree/linux/bug.h index ccbe444977df..23b8ed52f8c8 100644 --- a/tools/testing/radix-tree/linux/bug.h +++ b/tools/testing/radix-tree/linux/bug.h | |||
| @@ -1 +1 @@ | |||
| #define WARN_ON_ONCE(x) assert(x) | #include "asm/bug.h" | ||
diff --git a/tools/testing/radix-tree/linux/gfp.h b/tools/testing/radix-tree/linux/gfp.h index 5201b915f631..5b09b2ce6c33 100644 --- a/tools/testing/radix-tree/linux/gfp.h +++ b/tools/testing/radix-tree/linux/gfp.h | |||
| @@ -3,8 +3,24 @@ | |||
| 3 | 3 | ||
| 4 | #define __GFP_BITS_SHIFT 26 | 4 | #define __GFP_BITS_SHIFT 26 |
| 5 | #define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) | 5 | #define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) |
| 6 | #define __GFP_WAIT 1 | 6 | |
| 7 | #define __GFP_ACCOUNT 0 | 7 | #define __GFP_HIGH 0x20u |
| 8 | #define __GFP_NOWARN 0 | 8 | #define __GFP_IO 0x40u |
| 9 | #define __GFP_FS 0x80u | ||
| 10 | #define __GFP_NOWARN 0x200u | ||
| 11 | #define __GFP_ATOMIC 0x80000u | ||
| 12 | #define __GFP_ACCOUNT 0x100000u | ||
| 13 | #define __GFP_DIRECT_RECLAIM 0x400000u | ||
| 14 | #define __GFP_KSWAPD_RECLAIM 0x2000000u | ||
| 15 | |||
| 16 | #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM|__GFP_KSWAPD_RECLAIM) | ||
| 17 | |||
| 18 | #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) | ||
| 19 | #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) | ||
| 20 | |||
| 21 | static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) | ||
| 22 | { | ||
| 23 | return !!(gfp_flags & __GFP_DIRECT_RECLAIM); | ||
| 24 | } | ||
| 9 | 25 | ||
| 10 | #endif | 26 | #endif |
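Given the flag values above, gfpflags_allow_blocking() is true exactly when __GFP_DIRECT_RECLAIM is set, matching the kernel's semantics. A tiny illustrative check:

	#include <assert.h>
	#include <linux/gfp.h>

	static void gfp_demo(void)
	{
		assert(gfpflags_allow_blocking(GFP_KERNEL));	/* __GFP_RECLAIM includes direct reclaim */
		assert(!gfpflags_allow_blocking(GFP_ATOMIC));	/* kswapd reclaim only */
	}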
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index be98a47b4e1b..9b43b4975d83 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h | |||
| @@ -8,9 +8,14 @@ | |||
| 8 | #include <limits.h> | 8 | #include <limits.h> |
| 9 | 9 | ||
| 10 | #include "../../include/linux/compiler.h" | 10 | #include "../../include/linux/compiler.h" |
| 11 | #include "../../include/linux/err.h" | ||
| 11 | #include "../../../include/linux/kconfig.h" | 12 | #include "../../../include/linux/kconfig.h" |
| 12 | 13 | ||
| 14 | #ifdef BENCHMARK | ||
| 15 | #define RADIX_TREE_MAP_SHIFT 6 | ||
| 16 | #else | ||
| 13 | #define RADIX_TREE_MAP_SHIFT 3 | 17 | #define RADIX_TREE_MAP_SHIFT 3 |
| 18 | #endif | ||
| 14 | 19 | ||
| 15 | #ifndef NULL | 20 | #ifndef NULL |
| 16 | #define NULL 0 | 21 | #define NULL 0 |
| @@ -43,4 +48,17 @@ static inline int in_interrupt(void) | |||
| 43 | { | 48 | { |
| 44 | return 0; | 49 | return 0; |
| 45 | } | 50 | } |
| 51 | |||
| 52 | /* | ||
| 53 | * This looks more complex than it should be. But we need to | ||
| 54 | * get the type for the ~ right in round_down (it needs to be | ||
| 55 | * as wide as the result!), and we want to evaluate the macro | ||
| 56 | * arguments just once each. | ||
| 57 | */ | ||
| 58 | #define __round_mask(x, y) ((__typeof__(x))((y)-1)) | ||
| 59 | #define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) | ||
| 60 | #define round_down(x, y) ((x) & ~__round_mask(x, y)) | ||
| 61 | |||
| 62 | #define xchg(ptr, x) uatomic_xchg(ptr, x) | ||
| 63 | |||
| 46 | #endif /* _KERNEL_H */ | 64 | #endif /* _KERNEL_H */ |
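The rounding macros above rely on y being a power of two, since they mask rather than divide. A worked example:

	/* round_up(7, 4):   __round_mask(7, 4) = 3, so ((7 - 1) | 3) + 1 = 8 */
	/* round_down(7, 4): 7 & ~3 = 4 */
	/* round_up(8, 4):   ((8 - 1) | 3) + 1 = 8; aligned values are unchanged */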
diff --git a/tools/testing/radix-tree/linux/preempt.h b/tools/testing/radix-tree/linux/preempt.h index 6210672e3baa..65c04c226965 100644 --- a/tools/testing/radix-tree/linux/preempt.h +++ b/tools/testing/radix-tree/linux/preempt.h | |||
| @@ -1,4 +1,4 @@ | |||
| 1 | /* */ | 1 | extern int preempt_count; |
| 2 | 2 | ||
| 3 | #define preempt_disable() do { } while (0) | 3 | #define preempt_disable() uatomic_inc(&preempt_count) |
| 4 | #define preempt_enable() do { } while (0) | 4 | #define preempt_enable() uatomic_dec(&preempt_count) |
diff --git a/tools/testing/radix-tree/linux/slab.h b/tools/testing/radix-tree/linux/slab.h index 6d5a34770fd4..e40337f41a38 100644 --- a/tools/testing/radix-tree/linux/slab.h +++ b/tools/testing/radix-tree/linux/slab.h | |||
| @@ -7,15 +7,8 @@ | |||
| 7 | #define SLAB_PANIC 2 | 7 | #define SLAB_PANIC 2 |
| 8 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ | 8 | #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ |
| 9 | 9 | ||
| 10 | static inline int gfpflags_allow_blocking(gfp_t mask) | 10 | void *kmalloc(size_t size, gfp_t); |
| 11 | { | 11 | void kfree(void *); |
| 12 | return 1; | ||
| 13 | } | ||
| 14 | |||
| 15 | struct kmem_cache { | ||
| 16 | int size; | ||
| 17 | void (*ctor)(void *); | ||
| 18 | }; | ||
| 19 | 12 | ||
| 20 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags); | 13 | void *kmem_cache_alloc(struct kmem_cache *cachep, int flags); |
| 21 | void kmem_cache_free(struct kmem_cache *cachep, void *objp); | 14 | void kmem_cache_free(struct kmem_cache *cachep, void *objp); |
diff --git a/tools/testing/radix-tree/linux/types.h b/tools/testing/radix-tree/linux/types.h index faa0b6ff9ca8..8491d89873bb 100644 --- a/tools/testing/radix-tree/linux/types.h +++ b/tools/testing/radix-tree/linux/types.h | |||
| @@ -6,8 +6,6 @@ | |||
| 6 | #define __rcu | 6 | #define __rcu |
| 7 | #define __read_mostly | 7 | #define __read_mostly |
| 8 | 8 | ||
| 9 | #define BITS_PER_LONG (sizeof(long) * 8) | ||
| 10 | |||
| 11 | static inline void INIT_LIST_HEAD(struct list_head *list) | 9 | static inline void INIT_LIST_HEAD(struct list_head *list) |
| 12 | { | 10 | { |
| 13 | list->next = list; | 11 | list->next = list; |
diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index daa9010693e8..f7e9801a6754 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c | |||
| @@ -67,7 +67,6 @@ void big_gang_check(bool long_run) | |||
| 67 | 67 | ||
| 68 | for (i = 0; i < (long_run ? 1000 : 3); i++) { | 68 | for (i = 0; i < (long_run ? 1000 : 3); i++) { |
| 69 | __big_gang_check(); | 69 | __big_gang_check(); |
| 70 | srand(time(0)); | ||
| 71 | printf("%d ", i); | 70 | printf("%d ", i); |
| 72 | fflush(stdout); | 71 | fflush(stdout); |
| 73 | } | 72 | } |
| @@ -206,8 +205,7 @@ void copy_tag_check(void) | |||
| 206 | } | 205 | } |
| 207 | 206 | ||
| 208 | // printf("\ncopying tags...\n"); | 207 | // printf("\ncopying tags...\n"); |
| 209 | cur = start; | 208 | tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1); |
| 210 | tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1); | ||
| 211 | 209 | ||
| 212 | // printf("checking copied tags\n"); | 210 | // printf("checking copied tags\n"); |
| 213 | assert(tagged == count); | 211 | assert(tagged == count); |
| @@ -215,16 +213,13 @@ void copy_tag_check(void) | |||
| 215 | 213 | ||
| 216 | /* Copy tags in several rounds */ | 214 | /* Copy tags in several rounds */ |
| 217 | // printf("\ncopying tags...\n"); | 215 | // printf("\ncopying tags...\n"); |
| 218 | cur = start; | 216 | tmp = rand() % (count / 10 + 2); |
| 219 | do { | 217 | tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2); |
| 220 | tmp = rand() % (count/10+2); | 218 | assert(tagged == count); |
| 221 | tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2); | ||
| 222 | } while (tmp == tagged); | ||
| 223 | 219 | ||
| 224 | // printf("%lu %lu %lu\n", tagged, tmp, count); | 220 | // printf("%lu %lu %lu\n", tagged, tmp, count); |
| 225 | // printf("checking copied tags\n"); | 221 | // printf("checking copied tags\n"); |
| 226 | check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); | 222 | check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); |
| 227 | assert(tagged < tmp); | ||
| 228 | verify_tag_consistency(&tree, 0); | 223 | verify_tag_consistency(&tree, 0); |
| 229 | verify_tag_consistency(&tree, 1); | 224 | verify_tag_consistency(&tree, 1); |
| 230 | verify_tag_consistency(&tree, 2); | 225 | verify_tag_consistency(&tree, 2); |
| @@ -240,7 +235,7 @@ static void __locate_check(struct radix_tree_root *tree, unsigned long index, | |||
| 240 | 235 | ||
| 241 | item_insert_order(tree, index, order); | 236 | item_insert_order(tree, index, order); |
| 242 | item = item_lookup(tree, index); | 237 | item = item_lookup(tree, index); |
| 243 | index2 = radix_tree_locate_item(tree, item); | 238 | index2 = find_item(tree, item); |
| 244 | if (index != index2) { | 239 | if (index != index2) { |
| 245 | printf("index %ld order %d inserted; found %ld\n", | 240 | printf("index %ld order %d inserted; found %ld\n", |
| 246 | index, order, index2); | 241 | index, order, index2); |
| @@ -274,17 +269,17 @@ static void locate_check(void) | |||
| 274 | index += (1UL << order)) { | 269 | index += (1UL << order)) { |
| 275 | __locate_check(&tree, index + offset, order); | 270 | __locate_check(&tree, index + offset, order); |
| 276 | } | 271 | } |
| 277 | if (radix_tree_locate_item(&tree, &tree) != -1) | 272 | if (find_item(&tree, &tree) != -1) |
| 278 | abort(); | 273 | abort(); |
| 279 | 274 | ||
| 280 | item_kill_tree(&tree); | 275 | item_kill_tree(&tree); |
| 281 | } | 276 | } |
| 282 | } | 277 | } |
| 283 | 278 | ||
| 284 | if (radix_tree_locate_item(&tree, &tree) != -1) | 279 | if (find_item(&tree, &tree) != -1) |
| 285 | abort(); | 280 | abort(); |
| 286 | __locate_check(&tree, -1, 0); | 281 | __locate_check(&tree, -1, 0); |
| 287 | if (radix_tree_locate_item(&tree, &tree) != -1) | 282 | if (find_item(&tree, &tree) != -1) |
| 288 | abort(); | 283 | abort(); |
| 289 | item_kill_tree(&tree); | 284 | item_kill_tree(&tree); |
| 290 | } | 285 | } |
| @@ -293,50 +288,80 @@ static void single_thread_tests(bool long_run) | |||
| 293 | { | 288 | { |
| 294 | int i; | 289 | int i; |
| 295 | 290 | ||
| 296 | printf("starting single_thread_tests: %d allocated\n", nr_allocated); | 291 | printf("starting single_thread_tests: %d allocated, preempt %d\n", |
| 292 | nr_allocated, preempt_count); | ||
| 297 | multiorder_checks(); | 293 | multiorder_checks(); |
| 298 | printf("after multiorder_check: %d allocated\n", nr_allocated); | 294 | rcu_barrier(); |
| 295 | printf("after multiorder_check: %d allocated, preempt %d\n", | ||
| 296 | nr_allocated, preempt_count); | ||
| 299 | locate_check(); | 297 | locate_check(); |
| 300 | printf("after locate_check: %d allocated\n", nr_allocated); | 298 | rcu_barrier(); |
| 299 | printf("after locate_check: %d allocated, preempt %d\n", | ||
| 300 | nr_allocated, preempt_count); | ||
| 301 | tag_check(); | 301 | tag_check(); |
| 302 | printf("after tag_check: %d allocated\n", nr_allocated); | 302 | rcu_barrier(); |
| 303 | printf("after tag_check: %d allocated, preempt %d\n", | ||
| 304 | nr_allocated, preempt_count); | ||
| 303 | gang_check(); | 305 | gang_check(); |
| 304 | printf("after gang_check: %d allocated\n", nr_allocated); | 306 | rcu_barrier(); |
| 307 | printf("after gang_check: %d allocated, preempt %d\n", | ||
| 308 | nr_allocated, preempt_count); | ||
| 305 | add_and_check(); | 309 | add_and_check(); |
| 306 | printf("after add_and_check: %d allocated\n", nr_allocated); | 310 | rcu_barrier(); |
| 311 | printf("after add_and_check: %d allocated, preempt %d\n", | ||
| 312 | nr_allocated, preempt_count); | ||
| 307 | dynamic_height_check(); | 313 | dynamic_height_check(); |
| 308 | printf("after dynamic_height_check: %d allocated\n", nr_allocated); | 314 | rcu_barrier(); |
| 315 | printf("after dynamic_height_check: %d allocated, preempt %d\n", | ||
| 316 | nr_allocated, preempt_count); | ||
| 309 | big_gang_check(long_run); | 317 | big_gang_check(long_run); |
| 310 | printf("after big_gang_check: %d allocated\n", nr_allocated); | 318 | rcu_barrier(); |
| 319 | printf("after big_gang_check: %d allocated, preempt %d\n", | ||
| 320 | nr_allocated, preempt_count); | ||
| 311 | for (i = 0; i < (long_run ? 2000 : 3); i++) { | 321 | for (i = 0; i < (long_run ? 2000 : 3); i++) { |
| 312 | copy_tag_check(); | 322 | copy_tag_check(); |
| 313 | printf("%d ", i); | 323 | printf("%d ", i); |
| 314 | fflush(stdout); | 324 | fflush(stdout); |
| 315 | } | 325 | } |
| 316 | printf("after copy_tag_check: %d allocated\n", nr_allocated); | 326 | rcu_barrier(); |
| 327 | printf("after copy_tag_check: %d allocated, preempt %d\n", | ||
| 328 | nr_allocated, preempt_count); | ||
| 317 | } | 329 | } |
| 318 | 330 | ||
| 319 | int main(int argc, char **argv) | 331 | int main(int argc, char **argv) |
| 320 | { | 332 | { |
| 321 | bool long_run = false; | 333 | bool long_run = false; |
| 322 | int opt; | 334 | int opt; |
| 335 | unsigned int seed = time(NULL); | ||
| 323 | 336 | ||
| 324 | while ((opt = getopt(argc, argv, "l")) != -1) { | 337 | while ((opt = getopt(argc, argv, "ls:")) != -1) { |
| 325 | if (opt == 'l') | 338 | if (opt == 'l') |
| 326 | long_run = true; | 339 | long_run = true; |
| 340 | else if (opt == 's') | ||
| 341 | seed = strtoul(optarg, NULL, 0); | ||
| 327 | } | 342 | } |
| 328 | 343 | ||
| 344 | printf("random seed %u\n", seed); | ||
| 345 | srand(seed); | ||
| 346 | |||
| 329 | rcu_register_thread(); | 347 | rcu_register_thread(); |
| 330 | radix_tree_init(); | 348 | radix_tree_init(); |
| 331 | 349 | ||
| 332 | regression1_test(); | 350 | regression1_test(); |
| 333 | regression2_test(); | 351 | regression2_test(); |
| 334 | regression3_test(); | 352 | regression3_test(); |
| 335 | iteration_test(); | 353 | iteration_test(0, 10); |
| 354 | iteration_test(7, 20); | ||
| 336 | single_thread_tests(long_run); | 355 | single_thread_tests(long_run); |
| 337 | 356 | ||
| 338 | sleep(1); | 357 | /* Free any remaining preallocated nodes */ |
| 339 | printf("after sleep(1): %d allocated\n", nr_allocated); | 358 | radix_tree_cpu_dead(0); |
| 359 | |||
| 360 | benchmark(); | ||
| 361 | |||
| 362 | rcu_barrier(); | ||
| 363 | printf("after rcu_barrier: %d allocated, preempt %d\n", | ||
| 364 | nr_allocated, preempt_count); | ||
| 340 | rcu_unregister_thread(); | 365 | rcu_unregister_thread(); |
| 341 | 366 | ||
| 342 | exit(0); | 367 | exit(0); |
diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index d1be94667a30..f79812a5e070 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c | |||
| @@ -26,7 +26,6 @@ static void __multiorder_tag_test(int index, int order) | |||
| 26 | { | 26 | { |
| 27 | RADIX_TREE(tree, GFP_KERNEL); | 27 | RADIX_TREE(tree, GFP_KERNEL); |
| 28 | int base, err, i; | 28 | int base, err, i; |
| 29 | unsigned long first = 0; | ||
| 30 | 29 | ||
| 31 | /* our canonical entry */ | 30 | /* our canonical entry */ |
| 32 | base = index & ~((1 << order) - 1); | 31 | base = index & ~((1 << order) - 1); |
| @@ -60,7 +59,7 @@ static void __multiorder_tag_test(int index, int order) | |||
| 60 | assert(!radix_tree_tag_get(&tree, i, 1)); | 59 | assert(!radix_tree_tag_get(&tree, i, 1)); |
| 61 | } | 60 | } |
| 62 | 61 | ||
| 63 | assert(radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 10, 0, 1) == 1); | 62 | assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1); |
| 64 | assert(radix_tree_tag_clear(&tree, index, 0)); | 63 | assert(radix_tree_tag_clear(&tree, index, 0)); |
| 65 | 64 | ||
| 66 | for_each_index(i, base, order) { | 65 | for_each_index(i, base, order) { |
| @@ -76,8 +75,27 @@ static void __multiorder_tag_test(int index, int order) | |||
| 76 | item_kill_tree(&tree); | 75 | item_kill_tree(&tree); |
| 77 | } | 76 | } |
| 78 | 77 | ||
| 78 | static void __multiorder_tag_test2(unsigned order, unsigned long index2) | ||
| 79 | { | ||
| 80 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 81 | unsigned long index = (1 << order); | ||
| 82 | index2 += index; | ||
| 83 | |||
| 84 | assert(item_insert_order(&tree, 0, order) == 0); | ||
| 85 | assert(item_insert(&tree, index2) == 0); | ||
| 86 | |||
| 87 | assert(radix_tree_tag_set(&tree, 0, 0)); | ||
| 88 | assert(radix_tree_tag_set(&tree, index2, 0)); | ||
| 89 | |||
| 90 | assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 2); | ||
| 91 | |||
| 92 | item_kill_tree(&tree); | ||
| 93 | } | ||
| 94 | |||
| 79 | static void multiorder_tag_tests(void) | 95 | static void multiorder_tag_tests(void) |
| 80 | { | 96 | { |
| 97 | int i, j; | ||
| 98 | |||
| 81 | /* test multi-order entry for indices 0-7 with no sibling pointers */ | 99 | /* test multi-order entry for indices 0-7 with no sibling pointers */ |
| 82 | __multiorder_tag_test(0, 3); | 100 | __multiorder_tag_test(0, 3); |
| 83 | __multiorder_tag_test(5, 3); | 101 | __multiorder_tag_test(5, 3); |
| @@ -117,6 +135,10 @@ static void multiorder_tag_tests(void) | |||
| 117 | __multiorder_tag_test(300, 8); | 135 | __multiorder_tag_test(300, 8); |
| 118 | 136 | ||
| 119 | __multiorder_tag_test(0x12345678UL, 8); | 137 | __multiorder_tag_test(0x12345678UL, 8); |
| 138 | |||
| 139 | for (i = 1; i < 10; i++) | ||
| 140 | for (j = 0; j < (10 << i); j++) | ||
| 141 | __multiorder_tag_test2(i, j); | ||
| 120 | } | 142 | } |
| 121 | 143 | ||
| 122 | static void multiorder_check(unsigned long index, int order) | 144 | static void multiorder_check(unsigned long index, int order) |
| @@ -125,7 +147,7 @@ static void multiorder_check(unsigned long index, int order) | |||
| 125 | unsigned long min = index & ~((1UL << order) - 1); | 147 | unsigned long min = index & ~((1UL << order) - 1); |
| 126 | unsigned long max = min + (1UL << order); | 148 | unsigned long max = min + (1UL << order); |
| 127 | void **slot; | 149 | void **slot; |
| 128 | struct item *item2 = item_create(min); | 150 | struct item *item2 = item_create(min, order); |
| 129 | RADIX_TREE(tree, GFP_KERNEL); | 151 | RADIX_TREE(tree, GFP_KERNEL); |
| 130 | 152 | ||
| 131 | printf("Multiorder index %ld, order %d\n", index, order); | 153 | printf("Multiorder index %ld, order %d\n", index, order); |
| @@ -231,11 +253,14 @@ void multiorder_iteration(void) | |||
| 231 | radix_tree_for_each_slot(slot, &tree, &iter, j) { | 253 | radix_tree_for_each_slot(slot, &tree, &iter, j) { |
| 232 | int height = order[i] / RADIX_TREE_MAP_SHIFT; | 254 | int height = order[i] / RADIX_TREE_MAP_SHIFT; |
| 233 | int shift = height * RADIX_TREE_MAP_SHIFT; | 255 | int shift = height * RADIX_TREE_MAP_SHIFT; |
| 234 | int mask = (1 << order[i]) - 1; | 256 | unsigned long mask = (1UL << order[i]) - 1; |
| 257 | struct item *item = *slot; | ||
| 235 | 258 | ||
| 236 | assert(iter.index >= (index[i] &~ mask)); | 259 | assert((iter.index | mask) == (index[i] | mask)); |
| 237 | assert(iter.index <= (index[i] | mask)); | ||
| 238 | assert(iter.shift == shift); | 260 | assert(iter.shift == shift); |
| 261 | assert(!radix_tree_is_internal_node(item)); | ||
| 262 | assert((item->index | mask) == (index[i] | mask)); | ||
| 263 | assert(item->order == order[i]); | ||
| 239 | i++; | 264 | i++; |
| 240 | } | 265 | } |
| 241 | } | 266 | } |
| @@ -248,7 +273,6 @@ void multiorder_tagged_iteration(void) | |||
| 248 | RADIX_TREE(tree, GFP_KERNEL); | 273 | RADIX_TREE(tree, GFP_KERNEL); |
| 249 | struct radix_tree_iter iter; | 274 | struct radix_tree_iter iter; |
| 250 | void **slot; | 275 | void **slot; |
| 251 | unsigned long first = 0; | ||
| 252 | int i, j; | 276 | int i, j; |
| 253 | 277 | ||
| 254 | printf("Multiorder tagged iteration test\n"); | 278 | printf("Multiorder tagged iteration test\n"); |
| @@ -269,7 +293,7 @@ void multiorder_tagged_iteration(void) | |||
| 269 | assert(radix_tree_tag_set(&tree, tag_index[i], 1)); | 293 | assert(radix_tree_tag_set(&tree, tag_index[i], 1)); |
| 270 | 294 | ||
| 271 | for (j = 0; j < 256; j++) { | 295 | for (j = 0; j < 256; j++) { |
| 272 | int mask, k; | 296 | int k; |
| 273 | 297 | ||
| 274 | for (i = 0; i < TAG_ENTRIES; i++) { | 298 | for (i = 0; i < TAG_ENTRIES; i++) { |
| 275 | for (k = i; index[k] < tag_index[i]; k++) | 299 | for (k = i; index[k] < tag_index[i]; k++) |
| @@ -279,18 +303,22 @@ void multiorder_tagged_iteration(void) | |||
| 279 | } | 303 | } |
| 280 | 304 | ||
| 281 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { | 305 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { |
| 306 | unsigned long mask; | ||
| 307 | struct item *item = *slot; | ||
| 282 | for (k = i; index[k] < tag_index[i]; k++) | 308 | for (k = i; index[k] < tag_index[i]; k++) |
| 283 | ; | 309 | ; |
| 284 | mask = (1 << order[k]) - 1; | 310 | mask = (1UL << order[k]) - 1; |
| 285 | 311 | ||
| 286 | assert(iter.index >= (tag_index[i] &~ mask)); | 312 | assert((iter.index | mask) == (tag_index[i] | mask)); |
| 287 | assert(iter.index <= (tag_index[i] | mask)); | 313 | assert(!radix_tree_is_internal_node(item)); |
| 314 | assert((item->index | mask) == (tag_index[i] | mask)); | ||
| 315 | assert(item->order == order[k]); | ||
| 288 | i++; | 316 | i++; |
| 289 | } | 317 | } |
| 290 | } | 318 | } |
| 291 | 319 | ||
| 292 | radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, | 320 | assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) == |
| 293 | MT_NUM_ENTRIES, 1, 2); | 321 | TAG_ENTRIES); |
| 294 | 322 | ||
| 295 | for (j = 0; j < 256; j++) { | 323 | for (j = 0; j < 256; j++) { |
| 296 | int mask, k; | 324 | int mask, k; |
| @@ -303,19 +331,21 @@ void multiorder_tagged_iteration(void) | |||
| 303 | } | 331 | } |
| 304 | 332 | ||
| 305 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { | 333 | radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { |
| 334 | struct item *item = *slot; | ||
| 306 | for (k = i; index[k] < tag_index[i]; k++) | 335 | for (k = i; index[k] < tag_index[i]; k++) |
| 307 | ; | 336 | ; |
| 308 | mask = (1 << order[k]) - 1; | 337 | mask = (1 << order[k]) - 1; |
| 309 | 338 | ||
| 310 | assert(iter.index >= (tag_index[i] &~ mask)); | 339 | assert((iter.index | mask) == (tag_index[i] | mask)); |
| 311 | assert(iter.index <= (tag_index[i] | mask)); | 340 | assert(!radix_tree_is_internal_node(item)); |
| 341 | assert((item->index | mask) == (tag_index[i] | mask)); | ||
| 342 | assert(item->order == order[k]); | ||
| 312 | i++; | 343 | i++; |
| 313 | } | 344 | } |
| 314 | } | 345 | } |
| 315 | 346 | ||
| 316 | first = 1; | 347 | assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0) |
| 317 | radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, | 348 | == TAG_ENTRIES); |
| 318 | MT_NUM_ENTRIES, 1, 0); | ||
| 319 | i = 0; | 349 | i = 0; |
| 320 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { | 350 | radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { |
| 321 | assert(iter.index == tag_index[i]); | 351 | assert(iter.index == tag_index[i]); |
| @@ -325,6 +355,261 @@ void multiorder_tagged_iteration(void) | |||
| 325 | item_kill_tree(&tree); | 355 | item_kill_tree(&tree); |
| 326 | } | 356 | } |
| 327 | 357 | ||
| 358 | static void multiorder_join1(unsigned long index, | ||
| 359 | unsigned order1, unsigned order2) | ||
| 360 | { | ||
| 361 | unsigned long loc; | ||
| 362 | void *item, *item2 = item_create(index + 1, order1); | ||
| 363 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 364 | |||
| 365 | item_insert_order(&tree, index, order2); | ||
| 366 | item = radix_tree_lookup(&tree, index); | ||
| 367 | radix_tree_join(&tree, index + 1, order1, item2); | ||
| 368 | loc = find_item(&tree, item); | ||
| 369 | if (loc == -1) | ||
| 370 | free(item); | ||
| 371 | item = radix_tree_lookup(&tree, index + 1); | ||
| 372 | assert(item == item2); | ||
| 373 | item_kill_tree(&tree); | ||
| 374 | } | ||
| 375 | |||
| 376 | static void multiorder_join2(unsigned order1, unsigned order2) | ||
| 377 | { | ||
| 378 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 379 | struct radix_tree_node *node; | ||
| 380 | void *item1 = item_create(0, order1); | ||
| 381 | void *item2; | ||
| 382 | |||
| 383 | item_insert_order(&tree, 0, order2); | ||
| 384 | radix_tree_insert(&tree, 1 << order2, (void *)0x12UL); | ||
| 385 | item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); | ||
| 386 | assert(item2 == (void *)0x12UL); | ||
| 387 | assert(node->exceptional == 1); | ||
| 388 | |||
| 389 | radix_tree_join(&tree, 0, order1, item1); | ||
| 390 | item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); | ||
| 391 | assert(item2 == item1); | ||
| 392 | assert(node->exceptional == 0); | ||
| 393 | item_kill_tree(&tree); | ||
| 394 | } | ||
| 395 | |||
| 396 | /* | ||
| 397 | * This test revealed an accounting bug for exceptional entries at one point. | ||
| 398 | * Nodes were being freed back into the pool with an elevated exception count | ||
| 399 | * by radix_tree_join() and then radix_tree_split() was failing to zero the | ||
| 400 | * count of exceptional entries. | ||
| 401 | */ | ||
| 402 | static void multiorder_join3(unsigned int order) | ||
| 403 | { | ||
| 404 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 405 | struct radix_tree_node *node; | ||
| 406 | void **slot; | ||
| 407 | struct radix_tree_iter iter; | ||
| 408 | unsigned long i; | ||
| 409 | |||
| 410 | for (i = 0; i < (1 << order); i++) { | ||
| 411 | radix_tree_insert(&tree, i, (void *)0x12UL); | ||
| 412 | } | ||
| 413 | |||
| 414 | radix_tree_join(&tree, 0, order, (void *)0x16UL); | ||
| 415 | rcu_barrier(); | ||
| 416 | |||
| 417 | radix_tree_split(&tree, 0, 0); | ||
| 418 | |||
| 419 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
| 420 | radix_tree_iter_replace(&tree, &iter, slot, (void *)0x12UL); | ||
| 421 | } | ||
| 422 | |||
| 423 | __radix_tree_lookup(&tree, 0, &node, NULL); | ||
| 424 | assert(node->exceptional == node->count); | ||
| 425 | |||
| 426 | item_kill_tree(&tree); | ||
| 427 | } | ||
| 428 | |||
| 429 | static void multiorder_join(void) | ||
| 430 | { | ||
| 431 | int i, j, idx; | ||
| 432 | |||
| 433 | for (idx = 0; idx < 1024; idx = idx * 2 + 3) { | ||
| 434 | for (i = 1; i < 15; i++) { | ||
| 435 | for (j = 0; j < i; j++) { | ||
| 436 | multiorder_join1(idx, i, j); | ||
| 437 | } | ||
| 438 | } | ||
| 439 | } | ||
| 440 | |||
| 441 | for (i = 1; i < 15; i++) { | ||
| 442 | for (j = 0; j < i; j++) { | ||
| 443 | multiorder_join2(i, j); | ||
| 444 | } | ||
| 445 | } | ||
| 446 | |||
| 447 | for (i = 3; i < 10; i++) { | ||
| 448 | multiorder_join3(i); | ||
| 449 | } | ||
| 450 | } | ||
| 451 | |||
| 452 | static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc) | ||
| 453 | { | ||
| 454 | struct radix_tree_preload *rtp = &radix_tree_preloads; | ||
| 455 | if (rtp->nr != 0) | ||
| 456 | printf("split(%u %u) remaining %u\n", old_order, new_order, | ||
| 457 | rtp->nr); | ||
| 458 | /* | ||
| 459 | * Can't check for equality here as some nodes may have been | ||
| 460 | * RCU-freed while we ran. But we should never finish with more | ||
| 461 | * nodes allocated since they should have all been preloaded. | ||
| 462 | */ | ||
| 463 | if (nr_allocated > alloc) | ||
| 464 | printf("split(%u %u) allocated %u %u\n", old_order, new_order, | ||
| 465 | alloc, nr_allocated); | ||
| 466 | } | ||
| 467 | |||
| 468 | static void __multiorder_split(int old_order, int new_order) | ||
| 469 | { | ||
| 470 | RADIX_TREE(tree, GFP_ATOMIC); | ||
| 471 | void **slot; | ||
| 472 | struct radix_tree_iter iter; | ||
| 473 | unsigned alloc; | ||
| 474 | |||
| 475 | radix_tree_preload(GFP_KERNEL); | ||
| 476 | assert(item_insert_order(&tree, 0, old_order) == 0); | ||
| 477 | radix_tree_preload_end(); | ||
| 478 | |||
| 479 | /* Wipe out the preloaded cache or it'll confuse check_mem() */ | ||
| 480 | radix_tree_cpu_dead(0); | ||
| 481 | |||
| 482 | radix_tree_tag_set(&tree, 0, 2); | ||
| 483 | |||
| 484 | radix_tree_split_preload(old_order, new_order, GFP_KERNEL); | ||
| 485 | alloc = nr_allocated; | ||
| 486 | radix_tree_split(&tree, 0, new_order); | ||
| 487 | check_mem(old_order, new_order, alloc); | ||
| 488 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
| 489 | radix_tree_iter_replace(&tree, &iter, slot, | ||
| 490 | item_create(iter.index, new_order)); | ||
| 491 | } | ||
| 492 | radix_tree_preload_end(); | ||
| 493 | |||
| 494 | item_kill_tree(&tree); | ||
| 495 | } | ||
| 496 | |||
| 497 | static void __multiorder_split2(int old_order, int new_order) | ||
| 498 | { | ||
| 499 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 500 | void **slot; | ||
| 501 | struct radix_tree_iter iter; | ||
| 502 | struct radix_tree_node *node; | ||
| 503 | void *item; | ||
| 504 | |||
| 505 | __radix_tree_insert(&tree, 0, old_order, (void *)0x12); | ||
| 506 | |||
| 507 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
| 508 | assert(item == (void *)0x12); | ||
| 509 | assert(node->exceptional > 0); | ||
| 510 | |||
| 511 | radix_tree_split(&tree, 0, new_order); | ||
| 512 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
| 513 | radix_tree_iter_replace(&tree, &iter, slot, | ||
| 514 | item_create(iter.index, new_order)); | ||
| 515 | } | ||
| 516 | |||
| 517 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
| 518 | assert(item != (void *)0x12); | ||
| 519 | assert(node->exceptional == 0); | ||
| 520 | |||
| 521 | item_kill_tree(&tree); | ||
| 522 | } | ||
| 523 | |||
| 524 | static void __multiorder_split3(int old_order, int new_order) | ||
| 525 | { | ||
| 526 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 527 | void **slot; | ||
| 528 | struct radix_tree_iter iter; | ||
| 529 | struct radix_tree_node *node; | ||
| 530 | void *item; | ||
| 531 | |||
| 532 | __radix_tree_insert(&tree, 0, old_order, (void *)0x12); | ||
| 533 | |||
| 534 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
| 535 | assert(item == (void *)0x12); | ||
| 536 | assert(node->exceptional > 0); | ||
| 537 | |||
| 538 | radix_tree_split(&tree, 0, new_order); | ||
| 539 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
| 540 | radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16); | ||
| 541 | } | ||
| 542 | |||
| 543 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
| 544 | assert(item == (void *)0x16); | ||
| 545 | assert(node->exceptional > 0); | ||
| 546 | |||
| 547 | item_kill_tree(&tree); | ||
| 548 | |||
| 549 | __radix_tree_insert(&tree, 0, old_order, (void *)0x12); | ||
| 550 | |||
| 551 | item = __radix_tree_lookup(&tree, 0, &node, NULL); | ||
| 552 | assert(item == (void *)0x12); | ||
| 553 | assert(node->exceptional > 0); | ||
| 554 | |||
| 555 | radix_tree_split(&tree, 0, new_order); | ||
| 556 | radix_tree_for_each_slot(slot, &tree, &iter, 0) { | ||
| 557 | if (iter.index == (1 << new_order)) | ||
| 558 | radix_tree_iter_replace(&tree, &iter, slot, | ||
| 559 | (void *)0x16); | ||
| 560 | else | ||
| 561 | radix_tree_iter_replace(&tree, &iter, slot, NULL); | ||
| 562 | } | ||
| 563 | |||
| 564 | item = __radix_tree_lookup(&tree, 1 << new_order, &node, NULL); | ||
| 565 | assert(item == (void *)0x16); | ||
| 566 | assert(node->count == node->exceptional); | ||
| 567 | do { | ||
| 568 | node = node->parent; | ||
| 569 | if (!node) | ||
| 570 | break; | ||
| 571 | assert(node->count == 1); | ||
| 572 | assert(node->exceptional == 0); | ||
| 573 | } while (1); | ||
| 574 | |||
| 575 | item_kill_tree(&tree); | ||
| 576 | } | ||
| 577 | |||
| 578 | static void multiorder_split(void) | ||
| 579 | { | ||
| 580 | int i, j; | ||
| 581 | |||
| 582 | for (i = 3; i < 11; i++) | ||
| 583 | for (j = 0; j < i; j++) { | ||
| 584 | __multiorder_split(i, j); | ||
| 585 | __multiorder_split2(i, j); | ||
| 586 | __multiorder_split3(i, j); | ||
| 587 | } | ||
| 588 | } | ||
| 589 | |||
| 590 | static void multiorder_account(void) | ||
| 591 | { | ||
| 592 | RADIX_TREE(tree, GFP_KERNEL); | ||
| 593 | struct radix_tree_node *node; | ||
| 594 | void **slot; | ||
| 595 | |||
| 596 | item_insert_order(&tree, 0, 5); | ||
| 597 | |||
| 598 | __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); | ||
| 599 | __radix_tree_lookup(&tree, 0, &node, NULL); | ||
| 600 | assert(node->count == node->exceptional * 2); | ||
| 601 | radix_tree_delete(&tree, 1 << 5); | ||
| 602 | assert(node->exceptional == 0); | ||
| 603 | |||
| 604 | __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); | ||
| 605 | __radix_tree_lookup(&tree, 1 << 5, &node, &slot); | ||
| 606 | assert(node->count == node->exceptional * 2); | ||
| 607 | __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL); | ||
| 608 | assert(node->exceptional == 0); | ||
| 609 | |||
| 610 | item_kill_tree(&tree); | ||
| 611 | } | ||
| 612 | |||
| 328 | void multiorder_checks(void) | 613 | void multiorder_checks(void) |
| 329 | { | 614 | { |
| 330 | int i; | 615 | int i; |
| @@ -342,4 +627,9 @@ void multiorder_checks(void) | |||
| 342 | multiorder_tag_tests(); | 627 | multiorder_tag_tests(); |
| 343 | multiorder_iteration(); | 628 | multiorder_iteration(); |
| 344 | multiorder_tagged_iteration(); | 629 | multiorder_tagged_iteration(); |
| 630 | multiorder_join(); | ||
| 631 | multiorder_split(); | ||
| 632 | multiorder_account(); | ||
| 633 | |||
| 634 | radix_tree_cpu_dead(0); | ||
| 345 | } | 635 | } |
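All of the new split tests above follow the same sequence: preload enough nodes, ask radix_tree_split() to break the old multiorder entry into order-new_order slots (which initially still point at the old item), then walk those slots and install real replacements with radix_tree_iter_replace(). A condensed sketch of that sequence, modelled on __multiorder_split() above; it assumes the radix_tree_split()/radix_tree_iter_replace() API introduced by this series and a tree whose only occupant is the entry being split, and it omits error handling:

static void split_and_replace(struct radix_tree_root *root, unsigned long index,
			      unsigned old_order, unsigned new_order)
{
	struct radix_tree_iter iter;
	void **slot;

	radix_tree_split_preload(old_order, new_order, GFP_KERNEL);
	radix_tree_split(root, index, new_order);

	/* give every slot left behind by the split a real entry */
	radix_tree_for_each_slot(slot, root, &iter, index)
		radix_tree_iter_replace(root, &iter, slot,
					item_create(iter.index, new_order));

	radix_tree_preload_end();
}

radix_tree_join() is the inverse operation exercised by multiorder_join1-3: it overwrites an existing range with a single multiorder entry, and the tests check that the node's count and exceptional accounting stay consistent while it does so.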
diff --git a/tools/testing/radix-tree/rcupdate.c b/tools/testing/radix-tree/rcupdate.c deleted file mode 100644 index 31a2d14225d6..000000000000 --- a/tools/testing/radix-tree/rcupdate.c +++ /dev/null | |||
| @@ -1,86 +0,0 @@ | |||
| 1 | #include <linux/rcupdate.h> | ||
| 2 | #include <pthread.h> | ||
| 3 | #include <stdio.h> | ||
| 4 | #include <assert.h> | ||
| 5 | |||
| 6 | static pthread_mutex_t rculock = PTHREAD_MUTEX_INITIALIZER; | ||
| 7 | static struct rcu_head *rcuhead_global = NULL; | ||
| 8 | static __thread int nr_rcuhead = 0; | ||
| 9 | static __thread struct rcu_head *rcuhead = NULL; | ||
| 10 | static __thread struct rcu_head *rcutail = NULL; | ||
| 11 | |||
| 12 | static pthread_cond_t rcu_worker_cond = PTHREAD_COND_INITIALIZER; | ||
| 13 | |||
| 14 | /* switch to urcu implementation when it is merged. */ | ||
| 15 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head)) | ||
| 16 | { | ||
| 17 | head->func = func; | ||
| 18 | head->next = rcuhead; | ||
| 19 | rcuhead = head; | ||
| 20 | if (!rcutail) | ||
| 21 | rcutail = head; | ||
| 22 | nr_rcuhead++; | ||
| 23 | if (nr_rcuhead >= 1000) { | ||
| 24 | int signal = 0; | ||
| 25 | |||
| 26 | pthread_mutex_lock(&rculock); | ||
| 27 | if (!rcuhead_global) | ||
| 28 | signal = 1; | ||
| 29 | rcutail->next = rcuhead_global; | ||
| 30 | rcuhead_global = head; | ||
| 31 | pthread_mutex_unlock(&rculock); | ||
| 32 | |||
| 33 | nr_rcuhead = 0; | ||
| 34 | rcuhead = NULL; | ||
| 35 | rcutail = NULL; | ||
| 36 | |||
| 37 | if (signal) { | ||
| 38 | pthread_cond_signal(&rcu_worker_cond); | ||
| 39 | } | ||
| 40 | } | ||
| 41 | } | ||
| 42 | |||
| 43 | static void *rcu_worker(void *arg) | ||
| 44 | { | ||
| 45 | struct rcu_head *r; | ||
| 46 | |||
| 47 | rcupdate_thread_init(); | ||
| 48 | |||
| 49 | while (1) { | ||
| 50 | pthread_mutex_lock(&rculock); | ||
| 51 | while (!rcuhead_global) { | ||
| 52 | pthread_cond_wait(&rcu_worker_cond, &rculock); | ||
| 53 | } | ||
| 54 | r = rcuhead_global; | ||
| 55 | rcuhead_global = NULL; | ||
| 56 | |||
| 57 | pthread_mutex_unlock(&rculock); | ||
| 58 | |||
| 59 | synchronize_rcu(); | ||
| 60 | |||
| 61 | while (r) { | ||
| 62 | struct rcu_head *tmp = r->next; | ||
| 63 | r->func(r); | ||
| 64 | r = tmp; | ||
| 65 | } | ||
| 66 | } | ||
| 67 | |||
| 68 | rcupdate_thread_exit(); | ||
| 69 | |||
| 70 | return NULL; | ||
| 71 | } | ||
| 72 | |||
| 73 | static pthread_t worker_thread; | ||
| 74 | void rcupdate_init(void) | ||
| 75 | { | ||
| 76 | pthread_create(&worker_thread, NULL, rcu_worker, NULL); | ||
| 77 | } | ||
| 78 | |||
| 79 | void rcupdate_thread_init(void) | ||
| 80 | { | ||
| 81 | rcu_register_thread(); | ||
| 82 | } | ||
| 83 | void rcupdate_thread_exit(void) | ||
| 84 | { | ||
| 85 | rcu_unregister_thread(); | ||
| 86 | } | ||
diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 63bf347aaf33..a41325d7a170 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c | |||
| @@ -50,6 +50,7 @@ | |||
| 50 | #include <stdio.h> | 50 | #include <stdio.h> |
| 51 | 51 | ||
| 52 | #include "regression.h" | 52 | #include "regression.h" |
| 53 | #include "test.h" | ||
| 53 | 54 | ||
| 54 | #define PAGECACHE_TAG_DIRTY 0 | 55 | #define PAGECACHE_TAG_DIRTY 0 |
| 55 | #define PAGECACHE_TAG_WRITEBACK 1 | 56 | #define PAGECACHE_TAG_WRITEBACK 1 |
| @@ -90,7 +91,7 @@ void regression2_test(void) | |||
| 90 | /* 1. */ | 91 | /* 1. */ |
| 91 | start = 0; | 92 | start = 0; |
| 92 | end = max_slots - 2; | 93 | end = max_slots - 2; |
| 93 | radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1, | 94 | tag_tagged_items(&mt_tree, NULL, start, end, 1, |
| 94 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); | 95 | PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); |
| 95 | 96 | ||
| 96 | /* 2. */ | 97 | /* 2. */ |
diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c index 1f06ed73d0a8..b594841fae85 100644 --- a/tools/testing/radix-tree/regression3.c +++ b/tools/testing/radix-tree/regression3.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | * In the following radix_tree_next_slot the current chunk size becomes zero. | 5 | * In the following radix_tree_next_slot the current chunk size becomes zero. |
| 6 | * This isn't checked, and it tries to dereference a null pointer in slot. | 6 | * This isn't checked, and it tries to dereference a null pointer in slot. |

| 7 | * | 7 | * |
| 8 | * Helper radix_tree_iter_next resets slot to NULL and next_index to index + 1; | 8 | * Helper radix_tree_iter_resume resets slot to NULL and next_index to index + 1; |
| 9 | * for tagged iteration it must also reset the cached tags in the iterator to | 9 | * for tagged iteration it must also reset the cached tags in the iterator to |
| 10 | * abort the next radix_tree_next_slot and take the slow path into radix_tree_next_chunk. | 10 | * abort the next radix_tree_next_slot and take the slow path into radix_tree_next_chunk. |
| 11 | * | 11 | * |
| @@ -88,7 +88,7 @@ void regression3_test(void) | |||
| 88 | printf("slot %ld %p\n", iter.index, *slot); | 88 | printf("slot %ld %p\n", iter.index, *slot); |
| 89 | if (!iter.index) { | 89 | if (!iter.index) { |
| 90 | printf("next at %ld\n", iter.index); | 90 | printf("next at %ld\n", iter.index); |
| 91 | slot = radix_tree_iter_next(&iter); | 91 | slot = radix_tree_iter_resume(slot, &iter); |
| 92 | } | 92 | } |
| 93 | } | 93 | } |
| 94 | 94 | ||
| @@ -96,7 +96,7 @@ void regression3_test(void) | |||
| 96 | printf("contig %ld %p\n", iter.index, *slot); | 96 | printf("contig %ld %p\n", iter.index, *slot); |
| 97 | if (!iter.index) { | 97 | if (!iter.index) { |
| 98 | printf("next at %ld\n", iter.index); | 98 | printf("next at %ld\n", iter.index); |
| 99 | slot = radix_tree_iter_next(&iter); | 99 | slot = radix_tree_iter_resume(slot, &iter); |
| 100 | } | 100 | } |
| 101 | } | 101 | } |
| 102 | 102 | ||
| @@ -106,7 +106,7 @@ void regression3_test(void) | |||
| 106 | printf("tagged %ld %p\n", iter.index, *slot); | 106 | printf("tagged %ld %p\n", iter.index, *slot); |
| 107 | if (!iter.index) { | 107 | if (!iter.index) { |
| 108 | printf("next at %ld\n", iter.index); | 108 | printf("next at %ld\n", iter.index); |
| 109 | slot = radix_tree_iter_next(&iter); | 109 | slot = radix_tree_iter_resume(slot, &iter); |
| 110 | } | 110 | } |
| 111 | } | 111 | } |
| 112 | 112 | ||
diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c index b0ac05741750..fd98c132207a 100644 --- a/tools/testing/radix-tree/tag_check.c +++ b/tools/testing/radix-tree/tag_check.c | |||
| @@ -23,7 +23,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag) | |||
| 23 | item_tag_set(tree, index, tag); | 23 | item_tag_set(tree, index, tag); |
| 24 | ret = item_tag_get(tree, index, tag); | 24 | ret = item_tag_get(tree, index, tag); |
| 25 | assert(ret != 0); | 25 | assert(ret != 0); |
| 26 | ret = radix_tree_range_tag_if_tagged(tree, &first, ~0UL, 10, tag, !tag); | 26 | ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag); |
| 27 | assert(ret == 1); | 27 | assert(ret == 1); |
| 28 | ret = item_tag_get(tree, index, !tag); | 28 | ret = item_tag_get(tree, index, !tag); |
| 29 | assert(ret != 0); | 29 | assert(ret != 0); |
| @@ -51,6 +51,7 @@ void simple_checks(void) | |||
| 51 | verify_tag_consistency(&tree, 1); | 51 | verify_tag_consistency(&tree, 1); |
| 52 | printf("before item_kill_tree: %d allocated\n", nr_allocated); | 52 | printf("before item_kill_tree: %d allocated\n", nr_allocated); |
| 53 | item_kill_tree(&tree); | 53 | item_kill_tree(&tree); |
| 54 | rcu_barrier(); | ||
| 54 | printf("after item_kill_tree: %d allocated\n", nr_allocated); | 55 | printf("after item_kill_tree: %d allocated\n", nr_allocated); |
| 55 | } | 56 | } |
| 56 | 57 | ||
| @@ -319,10 +320,13 @@ static void single_check(void) | |||
| 319 | assert(ret == 0); | 320 | assert(ret == 0); |
| 320 | verify_tag_consistency(&tree, 0); | 321 | verify_tag_consistency(&tree, 0); |
| 321 | verify_tag_consistency(&tree, 1); | 322 | verify_tag_consistency(&tree, 1); |
| 322 | ret = radix_tree_range_tag_if_tagged(&tree, &first, 10, 10, 0, 1); | 323 | ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1); |
| 323 | assert(ret == 1); | 324 | assert(ret == 1); |
| 324 | ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); | 325 | ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); |
| 325 | assert(ret == 1); | 326 | assert(ret == 1); |
| 327 | item_tag_clear(&tree, 0, 0); | ||
| 328 | ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 0); | ||
| 329 | assert(ret == 0); | ||
| 326 | item_kill_tree(&tree); | 330 | item_kill_tree(&tree); |
| 327 | } | 331 | } |
| 328 | 332 | ||
| @@ -331,12 +335,16 @@ void tag_check(void) | |||
| 331 | single_check(); | 335 | single_check(); |
| 332 | extend_checks(); | 336 | extend_checks(); |
| 333 | contract_checks(); | 337 | contract_checks(); |
| 338 | rcu_barrier(); | ||
| 334 | printf("after extend_checks: %d allocated\n", nr_allocated); | 339 | printf("after extend_checks: %d allocated\n", nr_allocated); |
| 335 | __leak_check(); | 340 | __leak_check(); |
| 336 | leak_check(); | 341 | leak_check(); |
| 342 | rcu_barrier(); | ||
| 337 | printf("after leak_check: %d allocated\n", nr_allocated); | 343 | printf("after leak_check: %d allocated\n", nr_allocated); |
| 338 | simple_checks(); | 344 | simple_checks(); |
| 345 | rcu_barrier(); | ||
| 339 | printf("after simple_checks: %d allocated\n", nr_allocated); | 346 | printf("after simple_checks: %d allocated\n", nr_allocated); |
| 340 | thrash_tags(); | 347 | thrash_tags(); |
| 348 | rcu_barrier(); | ||
| 341 | printf("after thrash_tags: %d allocated\n", nr_allocated); | 349 | printf("after thrash_tags: %d allocated\n", nr_allocated); |
| 342 | } | 350 | } |
diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index a6e8099eaf4f..e5726e373646 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c | |||
| @@ -24,21 +24,29 @@ int item_tag_get(struct radix_tree_root *root, unsigned long index, int tag) | |||
| 24 | return radix_tree_tag_get(root, index, tag); | 24 | return radix_tree_tag_get(root, index, tag); |
| 25 | } | 25 | } |
| 26 | 26 | ||
| 27 | int __item_insert(struct radix_tree_root *root, struct item *item, | 27 | int __item_insert(struct radix_tree_root *root, struct item *item) |
| 28 | unsigned order) | ||
| 29 | { | 28 | { |
| 30 | return __radix_tree_insert(root, item->index, order, item); | 29 | return __radix_tree_insert(root, item->index, item->order, item); |
| 31 | } | 30 | } |
| 32 | 31 | ||
| 33 | int item_insert(struct radix_tree_root *root, unsigned long index) | 32 | int item_insert(struct radix_tree_root *root, unsigned long index) |
| 34 | { | 33 | { |
| 35 | return __item_insert(root, item_create(index), 0); | 34 | return __item_insert(root, item_create(index, 0)); |
| 36 | } | 35 | } |
| 37 | 36 | ||
| 38 | int item_insert_order(struct radix_tree_root *root, unsigned long index, | 37 | int item_insert_order(struct radix_tree_root *root, unsigned long index, |
| 39 | unsigned order) | 38 | unsigned order) |
| 40 | { | 39 | { |
| 41 | return __item_insert(root, item_create(index), order); | 40 | return __item_insert(root, item_create(index, order)); |
| 41 | } | ||
| 42 | |||
| 43 | void item_sanity(struct item *item, unsigned long index) | ||
| 44 | { | ||
| 45 | unsigned long mask; | ||
| 46 | assert(!radix_tree_is_internal_node(item)); | ||
| 47 | assert(item->order < BITS_PER_LONG); | ||
| 48 | mask = (1UL << item->order) - 1; | ||
| 49 | assert((item->index | mask) == (index | mask)); | ||
| 42 | } | 50 | } |
| 43 | 51 | ||
| 44 | int item_delete(struct radix_tree_root *root, unsigned long index) | 52 | int item_delete(struct radix_tree_root *root, unsigned long index) |
| @@ -46,18 +54,19 @@ int item_delete(struct radix_tree_root *root, unsigned long index) | |||
| 46 | struct item *item = radix_tree_delete(root, index); | 54 | struct item *item = radix_tree_delete(root, index); |
| 47 | 55 | ||
| 48 | if (item) { | 56 | if (item) { |
| 49 | assert(item->index == index); | 57 | item_sanity(item, index); |
| 50 | free(item); | 58 | free(item); |
| 51 | return 1; | 59 | return 1; |
| 52 | } | 60 | } |
| 53 | return 0; | 61 | return 0; |
| 54 | } | 62 | } |
| 55 | 63 | ||
| 56 | struct item *item_create(unsigned long index) | 64 | struct item *item_create(unsigned long index, unsigned int order) |
| 57 | { | 65 | { |
| 58 | struct item *ret = malloc(sizeof(*ret)); | 66 | struct item *ret = malloc(sizeof(*ret)); |
| 59 | 67 | ||
| 60 | ret->index = index; | 68 | ret->index = index; |
| 69 | ret->order = order; | ||
| 61 | return ret; | 70 | return ret; |
| 62 | } | 71 | } |
| 63 | 72 | ||
| @@ -66,8 +75,8 @@ void item_check_present(struct radix_tree_root *root, unsigned long index) | |||
| 66 | struct item *item; | 75 | struct item *item; |
| 67 | 76 | ||
| 68 | item = radix_tree_lookup(root, index); | 77 | item = radix_tree_lookup(root, index); |
| 69 | assert(item != 0); | 78 | assert(item != NULL); |
| 70 | assert(item->index == index); | 79 | item_sanity(item, index); |
| 71 | } | 80 | } |
| 72 | 81 | ||
| 73 | struct item *item_lookup(struct radix_tree_root *root, unsigned long index) | 82 | struct item *item_lookup(struct radix_tree_root *root, unsigned long index) |
| @@ -80,7 +89,7 @@ void item_check_absent(struct radix_tree_root *root, unsigned long index) | |||
| 80 | struct item *item; | 89 | struct item *item; |
| 81 | 90 | ||
| 82 | item = radix_tree_lookup(root, index); | 91 | item = radix_tree_lookup(root, index); |
| 83 | assert(item == 0); | 92 | assert(item == NULL); |
| 84 | } | 93 | } |
| 85 | 94 | ||
| 86 | /* | 95 | /* |
| @@ -142,6 +151,62 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, | |||
| 142 | assert(nfound == 0); | 151 | assert(nfound == 0); |
| 143 | } | 152 | } |
| 144 | 153 | ||
| 154 | /* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */ | ||
| 155 | int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock, | ||
| 156 | unsigned long start, unsigned long end, unsigned batch, | ||
| 157 | unsigned iftag, unsigned thentag) | ||
| 158 | { | ||
| 159 | unsigned long tagged = 0; | ||
| 160 | struct radix_tree_iter iter; | ||
| 161 | void **slot; | ||
| 162 | |||
| 163 | if (batch == 0) | ||
| 164 | batch = 1; | ||
| 165 | |||
| 166 | if (lock) | ||
| 167 | pthread_mutex_lock(lock); | ||
| 168 | radix_tree_for_each_tagged(slot, root, &iter, start, iftag) { | ||
| 169 | if (iter.index > end) | ||
| 170 | break; | ||
| 171 | radix_tree_iter_tag_set(root, &iter, thentag); | ||
| 172 | tagged++; | ||
| 173 | if ((tagged % batch) != 0) | ||
| 174 | continue; | ||
| 175 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 176 | if (lock) { | ||
| 177 | pthread_mutex_unlock(lock); | ||
| 178 | rcu_barrier(); | ||
| 179 | pthread_mutex_lock(lock); | ||
| 180 | } | ||
| 181 | } | ||
| 182 | if (lock) | ||
| 183 | pthread_mutex_unlock(lock); | ||
| 184 | |||
| 185 | return tagged; | ||
| 186 | } | ||
| 187 | |||
| 188 | /* Use the same pattern as find_swap_entry() in mm/shmem.c */ | ||
| 189 | unsigned long find_item(struct radix_tree_root *root, void *item) | ||
| 190 | { | ||
| 191 | struct radix_tree_iter iter; | ||
| 192 | void **slot; | ||
| 193 | unsigned long found = -1; | ||
| 194 | unsigned long checked = 0; | ||
| 195 | |||
| 196 | radix_tree_for_each_slot(slot, root, &iter, 0) { | ||
| 197 | if (*slot == item) { | ||
| 198 | found = iter.index; | ||
| 199 | break; | ||
| 200 | } | ||
| 201 | checked++; | ||
| 202 | if ((checked % 4) != 0) | ||
| 203 | continue; | ||
| 204 | slot = radix_tree_iter_resume(slot, &iter); | ||
| 205 | } | ||
| 206 | |||
| 207 | return found; | ||
| 208 | } | ||
| 209 | |||
| 145 | static int verify_node(struct radix_tree_node *slot, unsigned int tag, | 210 | static int verify_node(struct radix_tree_node *slot, unsigned int tag, |
| 146 | int tagged) | 211 | int tagged) |
| 147 | { | 212 | { |
| @@ -200,9 +265,16 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag) | |||
| 200 | 265 | ||
| 201 | void item_kill_tree(struct radix_tree_root *root) | 266 | void item_kill_tree(struct radix_tree_root *root) |
| 202 | { | 267 | { |
| 268 | struct radix_tree_iter iter; | ||
| 269 | void **slot; | ||
| 203 | struct item *items[32]; | 270 | struct item *items[32]; |
| 204 | int nfound; | 271 | int nfound; |
| 205 | 272 | ||
| 273 | radix_tree_for_each_slot(slot, root, &iter, 0) { | ||
| 274 | if (radix_tree_exceptional_entry(*slot)) | ||
| 275 | radix_tree_delete(root, iter.index); | ||
| 276 | } | ||
| 277 | |||
| 206 | while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { | 278 | while ((nfound = radix_tree_gang_lookup(root, (void **)items, 0, 32))) { |
| 207 | int i; | 279 | int i; |
| 208 | 280 | ||
diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 217fb2403f09..056a23b56467 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h | |||
| @@ -5,11 +5,11 @@ | |||
| 5 | 5 | ||
| 6 | struct item { | 6 | struct item { |
| 7 | unsigned long index; | 7 | unsigned long index; |
| 8 | unsigned int order; | ||
| 8 | }; | 9 | }; |
| 9 | 10 | ||
| 10 | struct item *item_create(unsigned long index); | 11 | struct item *item_create(unsigned long index, unsigned int order); |
| 11 | int __item_insert(struct radix_tree_root *root, struct item *item, | 12 | int __item_insert(struct radix_tree_root *root, struct item *item); |
| 12 | unsigned order); | ||
| 13 | int item_insert(struct radix_tree_root *root, unsigned long index); | 13 | int item_insert(struct radix_tree_root *root, unsigned long index); |
| 14 | int item_insert_order(struct radix_tree_root *root, unsigned long index, | 14 | int item_insert_order(struct radix_tree_root *root, unsigned long index, |
| 15 | unsigned order); | 15 | unsigned order); |
| @@ -25,9 +25,15 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, | |||
| 25 | unsigned long nr, int chunk); | 25 | unsigned long nr, int chunk); |
| 26 | void item_kill_tree(struct radix_tree_root *root); | 26 | void item_kill_tree(struct radix_tree_root *root); |
| 27 | 27 | ||
| 28 | int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, | ||
| 29 | unsigned long start, unsigned long end, unsigned batch, | ||
| 30 | unsigned iftag, unsigned thentag); | ||
| 31 | unsigned long find_item(struct radix_tree_root *, void *item); | ||
| 32 | |||
| 28 | void tag_check(void); | 33 | void tag_check(void); |
| 29 | void multiorder_checks(void); | 34 | void multiorder_checks(void); |
| 30 | void iteration_test(void); | 35 | void iteration_test(unsigned order, unsigned duration); |
| 36 | void benchmark(void); | ||
| 31 | 37 | ||
| 32 | struct item * | 38 | struct item * |
| 33 | item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); | 39 | item_tag_set(struct radix_tree_root *root, unsigned long index, int tag); |
| @@ -40,7 +46,14 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); | |||
| 40 | extern int nr_allocated; | 46 | extern int nr_allocated; |
| 41 | 47 | ||
| 42 | /* Normally private parts of lib/radix-tree.c */ | 48 | /* Normally private parts of lib/radix-tree.c */ |
| 49 | struct radix_tree_node *entry_to_node(void *ptr); | ||
| 43 | void radix_tree_dump(struct radix_tree_root *root); | 50 | void radix_tree_dump(struct radix_tree_root *root); |
| 44 | int root_tag_get(struct radix_tree_root *root, unsigned int tag); | 51 | int root_tag_get(struct radix_tree_root *root, unsigned int tag); |
| 45 | unsigned long node_maxindex(struct radix_tree_node *); | 52 | unsigned long node_maxindex(struct radix_tree_node *); |
| 46 | unsigned long shift_maxindex(unsigned int shift); | 53 | unsigned long shift_maxindex(unsigned int shift); |
| 54 | int radix_tree_cpu_dead(unsigned int cpu); | ||
| 55 | struct radix_tree_preload { | ||
| 56 | unsigned nr; | ||
| 57 | struct radix_tree_node *nodes; | ||
| 58 | }; | ||
| 59 | extern struct radix_tree_preload radix_tree_preloads; | ||
diff --git a/usr/Kconfig b/usr/Kconfig index 572dcf7b6a44..6278f135256d 100644 --- a/usr/Kconfig +++ b/usr/Kconfig | |||
| @@ -98,3 +98,130 @@ config RD_LZ4 | |||
| 98 | help | 98 | help |
| 99 | Support loading of a LZ4 encoded initial ramdisk or cpio buffer | 99 | Support loading of a LZ4 encoded initial ramdisk or cpio buffer |
| 100 | If unsure, say N. | 100 | If unsure, say N. |
| 101 | |||
| 102 | choice | ||
| 103 | prompt "Built-in initramfs compression mode" | ||
| 104 | depends on INITRAMFS_SOURCE!="" | ||
| 105 | optional | ||
| 106 | help | ||
| 107 | This option allows you to decide by which algorithm the builtin | ||
| 108 | initramfs will be compressed. Several compression algorithms are | ||
| 109 | available, which differ in efficiency, compression and | ||
| 110 | decompression speed. Compression speed is only relevant | ||
| 111 | when building a kernel. Decompression speed is relevant at | ||
| 112 | each boot. Also, the memory usage during decompression may become | ||
| 113 | relevant on memory-constrained systems; this usually depends on the | ||
| 114 | dictionary size of the algorithm, with algorithms like XZ and LZMA | ||
| 115 | featuring large dictionary sizes. | ||
| 116 | |||
| 117 | High compression options are mostly useful for users who are | ||
| 118 | low on RAM, since they reduce the memory consumption during | ||
| 119 | boot. | ||
| 120 | |||
| 121 | Keep in mind that your build system needs to provide the appropriate | ||
| 122 | compression tool to compress the generated initram cpio file for | ||
| 123 | embedding. | ||
| 124 | |||
| 125 | If in doubt, select 'None'. | ||
| 126 | |||
| 127 | config INITRAMFS_COMPRESSION_NONE | ||
| 128 | bool "None" | ||
| 129 | help | ||
| 130 | Do not compress the built-in initramfs at all. This may sound wasteful | ||
| 131 | in space, but you should be aware that the built-in initramfs will be | ||
| 132 | compressed at a later stage anyway, along with the rest of the kernel, | ||
| 133 | on those architectures that support this. However, not compressing the | ||
| 134 | initramfs may lead to slightly higher memory consumption for a | ||
| 135 | short time at boot, while both the cpio image and the unpacked | ||
| 136 | filesystem image will be present in memory simultaneously. | ||
| 137 | |||
| 138 | config INITRAMFS_COMPRESSION_GZIP | ||
| 139 | bool "Gzip" | ||
| 140 | depends on RD_GZIP | ||
| 141 | help | ||
| 142 | Use the old and well tested gzip compression algorithm. Gzip provides | ||
| 143 | a good balance between compression ratio and decompression speed and | ||
| 144 | has a reasonable compression speed. It is also more likely to be | ||
| 145 | supported by your build system as the gzip tool is present by default | ||
| 146 | on most distros. | ||
| 147 | |||
| 148 | config INITRAMFS_COMPRESSION_BZIP2 | ||
| 149 | bool "Bzip2" | ||
| 150 | depends on RD_BZIP2 | ||
| 151 | help | ||
| 152 | Its compression ratio and speed are intermediate. Decompression speed | ||
| 153 | is slowest among the choices. The initramfs size is about 10% smaller | ||
| 154 | with bzip2, in comparison to gzip. Bzip2 uses a large amount of | ||
| 155 | memory. For modern kernels you will need at least 8MB RAM or more for | ||
| 156 | booting. | ||
| 157 | |||
| 158 | If you choose this, keep in mind that you need to have the bzip2 tool | ||
| 159 | available to be able to compress the initram. | ||
| 160 | |||
| 161 | config INITRAMFS_COMPRESSION_LZMA | ||
| 162 | bool "LZMA" | ||
| 163 | depends on RD_LZMA | ||
| 164 | help | ||
| 165 | This algorithm's compression ratio is the best, but it uses a large | ||
| 166 | dictionary, which might cause issues on memory-constrained systems. | ||
| 167 | Decompression speed falls between the other choices. Compression is | ||
| 168 | slowest. The initramfs size is about 33% smaller with LZMA in | ||
| 169 | comparison to gzip. | ||
| 170 | |||
| 171 | If you choose this, keep in mind that you may need to install the xz | ||
| 172 | or lzma tools to be able to compress the initram. | ||
| 173 | |||
| 174 | config INITRAMFS_COMPRESSION_XZ | ||
| 175 | bool "XZ" | ||
| 176 | depends on RD_XZ | ||
| 177 | help | ||
| 178 | XZ uses the LZMA2 algorithm and has a large dictionary which may cause | ||
| 179 | problems on memory-constrained systems. The initramfs size is about | ||
| 180 | 30% smaller with XZ in comparison to gzip. Decompression speed is | ||
| 181 | better than that of bzip2 but worse than gzip and LZO. Compression is | ||
| 182 | slow. | ||
| 183 | |||
| 184 | If you choose this, keep in mind that you may need to install the xz | ||
| 185 | tool to be able to compress the initram. | ||
| 186 | |||
| 187 | config INITRAMFS_COMPRESSION_LZO | ||
| 188 | bool "LZO" | ||
| 189 | depends on RD_LZO | ||
| 190 | help | ||
| 191 | Its compression ratio is the second poorest amongst the choices. The | ||
| 192 | kernel size is about 10% bigger than gzip. Despite that, its | ||
| 193 | decompression speed is the second fastest and its compression speed | ||
| 194 | is quite fast too. | ||
| 195 | |||
| 196 | If you choose this, keep in mind that you may need to install the lzop | ||
| 197 | tool to be able to compress the initram. | ||
| 198 | |||
| 199 | config INITRAMFS_COMPRESSION_LZ4 | ||
| 200 | bool "LZ4" | ||
| 201 | depends on RD_LZ4 | ||
| 202 | help | ||
| 203 | Its compression ratio is the poorest amongst the choices. The kernel | ||
| 204 | size is about 15% bigger than gzip; however its decompression speed | ||
| 205 | is the fastest. | ||
| 206 | |||
| 207 | If you choose this, keep in mind that most distros don't provide lz4 | ||
| 208 | by default which could cause a build failure. | ||
| 209 | |||
| 210 | endchoice | ||
| 211 | |||
| 212 | config INITRAMFS_COMPRESSION | ||
| 213 | string | ||
| 214 | default "" if INITRAMFS_COMPRESSION_NONE | ||
| 215 | default ".gz" if INITRAMFS_COMPRESSION_GZIP | ||
| 216 | default ".bz2" if INITRAMFS_COMPRESSION_BZIP2 | ||
| 217 | default ".lzma" if INITRAMFS_COMPRESSION_LZMA | ||
| 218 | default ".xz" if INITRAMFS_COMPRESSION_XZ | ||
| 219 | default ".lzo" if INITRAMFS_COMPRESSION_LZO | ||
| 220 | default ".lz4" if INITRAMFS_COMPRESSION_LZ4 | ||
| 221 | default ".gz" if RD_GZIP | ||
| 222 | default ".lz4" if RD_LZ4 | ||
| 223 | default ".lzo" if RD_LZO | ||
| 224 | default ".xz" if RD_XZ | ||
| 225 | default ".lzma" if RD_LZMA | ||
| 226 | default ".bz2" if RD_BZIP2 | ||
| 227 | default "" | ||
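Putting the new symbols together: picking an entry in the choice sets the matching boolean, which fixes the CONFIG_INITRAMFS_COMPRESSION string, and usr/Makefile (next hunk) uses that string as the cpio suffix. An illustrative .config fragment for the XZ case; the INITRAMFS_SOURCE path is invented for the example:

# Illustrative fragment only; the source path below is hypothetical.
CONFIG_INITRAMFS_SOURCE="usr/my_initramfs_list"
CONFIG_RD_XZ=y
CONFIG_INITRAMFS_COMPRESSION_XZ=y
CONFIG_INITRAMFS_COMPRESSION=".xz"

With this fragment, suffix_y expands to .xz and the embedded image becomes usr/initramfs_data.cpio.xz.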
diff --git a/usr/Makefile b/usr/Makefile index e767f019accf..17a513268325 100644 --- a/usr/Makefile +++ b/usr/Makefile | |||
| @@ -5,25 +5,7 @@ | |||
| 5 | klibcdirs:; | 5 | klibcdirs:; |
| 6 | PHONY += klibcdirs | 6 | PHONY += klibcdirs |
| 7 | 7 | ||
| 8 | 8 | suffix_y = $(CONFIG_INITRAMFS_COMPRESSION) | |
| 9 | # Bzip2 | ||
| 10 | suffix_$(CONFIG_RD_BZIP2) = .bz2 | ||
| 11 | |||
| 12 | # Lzma | ||
| 13 | suffix_$(CONFIG_RD_LZMA) = .lzma | ||
| 14 | |||
| 15 | # XZ | ||
| 16 | suffix_$(CONFIG_RD_XZ) = .xz | ||
| 17 | |||
| 18 | # Lzo | ||
| 19 | suffix_$(CONFIG_RD_LZO) = .lzo | ||
| 20 | |||
| 21 | # Lz4 | ||
| 22 | suffix_$(CONFIG_RD_LZ4) = .lz4 | ||
| 23 | |||
| 24 | # Gzip | ||
| 25 | suffix_$(CONFIG_RD_GZIP) = .gz | ||
| 26 | |||
| 27 | AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)" | 9 | AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/initramfs_data.cpio$(suffix_y)" |
| 28 | 10 | ||
| 29 | # Generate builtin.o based on initramfs_data.o | 11 | # Generate builtin.o based on initramfs_data.o |
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index efeceb0a222d..3815e940fbea 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c | |||
| @@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work) | |||
| 76 | struct kvm_vcpu *vcpu = apf->vcpu; | 76 | struct kvm_vcpu *vcpu = apf->vcpu; |
| 77 | unsigned long addr = apf->addr; | 77 | unsigned long addr = apf->addr; |
| 78 | gva_t gva = apf->gva; | 78 | gva_t gva = apf->gva; |
| 79 | int locked = 1; | ||
| 79 | 80 | ||
| 80 | might_sleep(); | 81 | might_sleep(); |
| 81 | 82 | ||
| 82 | /* | 83 | /* |
| 83 | * This work is run asynchronously to the task which owns | 84 | * This work is run asynchronously to the task which owns |
| 84 | * mm and might be done in another context, so we must | 85 | * mm and might be done in another context, so we must |
| 85 | * use FOLL_REMOTE. | 86 | * access remotely. |
| 86 | */ | 87 | */ |
| 87 | __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, | 88 | down_read(&mm->mmap_sem); |
| 88 | FOLL_WRITE | FOLL_REMOTE); | 89 | get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, |
| 90 | &locked); | ||
| 91 | if (locked) | ||
| 92 | up_read(&mm->mmap_sem); | ||
| 89 | 93 | ||
| 90 | kvm_async_page_present_sync(vcpu, apf); | 94 | kvm_async_page_present_sync(vcpu, apf); |
| 91 | 95 | ||
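The conversion above relies on the locking convention of get_user_pages_remote(): the callee may drop mmap_sem itself and reports that through *locked, so the caller must release the semaphore only if it is still held. A stripped-down sketch of that calling pattern, closely following the hunk above and assuming the get_user_pages_remote() signature used by this series:

static void fault_in_remote_page(struct mm_struct *mm, unsigned long addr)
{
	int locked = 1;

	down_read(&mm->mmap_sem);
	/* NULL task and NULL pages: we only want the page faulted in */
	get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL,
			      &locked);
	if (locked)	/* callee did not drop mmap_sem on our behalf */
		up_read(&mm->mmap_sem);
}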
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 823544c166be..de102cae7125 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
| @@ -1418,13 +1418,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, | |||
| 1418 | npages = get_user_page_nowait(addr, write_fault, page); | 1418 | npages = get_user_page_nowait(addr, write_fault, page); |
| 1419 | up_read(¤t->mm->mmap_sem); | 1419 | up_read(¤t->mm->mmap_sem); |
| 1420 | } else { | 1420 | } else { |
| 1421 | unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; | 1421 | unsigned int flags = FOLL_HWPOISON; |
| 1422 | 1422 | ||
| 1423 | if (write_fault) | 1423 | if (write_fault) |
| 1424 | flags |= FOLL_WRITE; | 1424 | flags |= FOLL_WRITE; |
| 1425 | 1425 | ||
| 1426 | npages = __get_user_pages_unlocked(current, current->mm, addr, 1, | 1426 | npages = get_user_pages_unlocked(addr, 1, page, flags); |
| 1427 | page, flags); | ||
| 1428 | } | 1427 | } |
| 1429 | if (npages != 1) | 1428 | if (npages != 1) |
| 1430 | return npages; | 1429 | return npages; |
