 34 files changed, 493 insertions(+), 171 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index b8a717c4f863..fc1718500181 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10453,9 +10453,11 @@ S:	Maintained
 F:	drivers/net/ethernet/dlink/sundance.c
 
 SUPERH
+M:	Yoshinori Sato <ysato@users.sourceforge.jp>
+M:	Rich Felker <dalias@libc.org>
 L:	linux-sh@vger.kernel.org
 Q:	http://patchwork.kernel.org/project/linux-sh/list/
-S:	Orphan
+S:	Maintained
 F:	Documentation/sh/
 F:	arch/sh/
 F:	drivers/sh/
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 534a60ae282e..0eca3812527e 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1200,10 +1200,7 @@ error:
 	while (i--)
 		if (pages[i])
 			__free_pages(pages[i], 0);
-	if (array_size <= PAGE_SIZE)
-		kfree(pages);
-	else
-		vfree(pages);
+	kvfree(pages);
 	return NULL;
 }
 
@@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
 			       size_t size, struct dma_attrs *attrs)
 {
 	int count = size >> PAGE_SHIFT;
-	int array_size = count * sizeof(struct page *);
 	int i;
 
 	if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) {
@@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages,
 			__free_pages(pages[i], 0);
 	}
 
-	if (array_size <= PAGE_SIZE)
-		kfree(pages);
-	else
-		vfree(pages);
+	kvfree(pages);
 	return 0;
 }
 
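
Nearly every hunk in this series collapses the same open-coded kfree()/vfree() choice into a single kvfree() call. For reference, kvfree() (in mm/util.c) keys off the pointer itself rather than the allocation size, which is why call sites no longer need to track how the buffer was allocated; a rough sketch of its body:

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))	/* came from vmalloc()/vzalloc() */
			vfree(addr);
		else				/* otherwise kmalloc()/kzalloc() */
			kfree(addr);
	}
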
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h
index 1544fabcd7f9..c57fd1ea9689 100644
--- a/arch/x86/include/asm/pmem.h
+++ b/arch/x86/include/asm/pmem.h
@@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void)
 }
 
 /**
- * __arch_wb_cache_pmem - write back a cache range with CLWB
+ * arch_wb_cache_pmem - write back a cache range with CLWB
  * @vaddr:	virtual start address
  * @size:	number of bytes to write back
  *
  * Write back a cache range using the CLWB (cache line write back)
  * instruction. This function requires explicit ordering with an
- * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation.
+ * arch_wmb_pmem() call.
  */
-static inline void __arch_wb_cache_pmem(void *vaddr, size_t size)
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
 {
 	u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
 	unsigned long clflush_mask = x86_clflush_size - 1;
+	void *vaddr = (void __force *)addr;
 	void *vend = vaddr + size;
 	void *p;
 
@@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes,
 	len = copy_from_iter_nocache(vaddr, bytes, i);
 
 	if (__iter_needs_pmem_wb(i))
-		__arch_wb_cache_pmem(vaddr, bytes);
+		arch_wb_cache_pmem(addr, bytes);
 
 	return len;
 }
@@ -133,7 +134,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 	void *vaddr = (void __force *)addr;
 
 	memset(vaddr, 0, size);
-	__arch_wb_cache_pmem(vaddr, size);
+	arch_wb_cache_pmem(addr, size);
 }
 
 static inline bool __arch_has_wmb_pmem(void)
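
The first hunk above ends just before the flush loop, which this patch leaves untouched; for context, the remainder of arch_wb_cache_pmem() walks the range one cache line at a time, roughly:

	/* loop body elided from the hunk above (unchanged by this patch) */
	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
	     p < vend; p += x86_clflush_size)
		clwb(p);
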
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6682c5daf742..6e6bc1059301 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -32,6 +32,7 @@
 #include <linux/hardirq.h>
 #include <linux/pstore.h>
 #include <linux/vmalloc.h>
+#include <linux/mm.h> /* kvfree() */
 #include <acpi/apei.h>
 
 #include "apei-internal.h"
@@ -532,10 +533,7 @@ retry:
 			return -ENOMEM;
 		memcpy(new_entries, entries,
 		       erst_record_id_cache.len * sizeof(entries[0]));
-		if (erst_record_id_cache.size < PAGE_SIZE)
-			kfree(entries);
-		else
-			vfree(entries);
+		kvfree(entries);
 		erst_record_id_cache.entries = entries = new_entries;
 		erst_record_id_cache.size = new_size;
 	}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 0dabc9b93725..92d6fc020a65 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number)
 	}
 }
 
-static void bm_vk_free(void *ptr, int v)
+static inline void bm_vk_free(void *ptr)
 {
-	if (v)
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 /*
@@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 {
 	struct page **old_pages = b->bm_pages;
 	struct page **new_pages, *page;
-	unsigned int i, bytes, vmalloced = 0;
+	unsigned int i, bytes;
 	unsigned long have = b->bm_number_of_pages;
 
 	BUG_ON(have == 0 && old_pages != NULL);
@@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 				PAGE_KERNEL);
 		if (!new_pages)
 			return NULL;
-		vmalloced = 1;
 	}
 
 	if (want >= have) {
@@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
 			if (!page) {
 				bm_free_pages(new_pages + have, i - have);
-				bm_vk_free(new_pages, vmalloced);
+				bm_vk_free(new_pages);
 				return NULL;
 			}
 			/* we want to know which page it is
@@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
 		 */
 	}
 
-	if (vmalloced)
-		b->bm_flags |= BM_P_VMALLOCED;
-	else
-		b->bm_flags &= ~BM_P_VMALLOCED;
-
 	return new_pages;
 }
 
@@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device)
 	if (!expect(device->bitmap))
 		return;
 	bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
-	bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
+	bm_vk_free(device->bitmap->bm_pages);
 	kfree(device->bitmap);
 	device->bitmap = NULL;
 }
@@ -643,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 	unsigned long want, have, onpages; /* number of pages */
 	struct page **npages, **opages = NULL;
 	int err = 0, growing;
-	int opages_vmalloced;
 
 	if (!expect(b))
 		return -ENOMEM;
@@ -656,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 	if (capacity == b->bm_dev_capacity)
 		goto out;
 
-	opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
-
 	if (capacity == 0) {
 		spin_lock_irq(&b->bm_lock);
 		opages = b->bm_pages;
@@ -671,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 		b->bm_dev_capacity = 0;
 		spin_unlock_irq(&b->bm_lock);
 		bm_free_pages(opages, onpages);
-		bm_vk_free(opages, opages_vmalloced);
+		bm_vk_free(opages);
 		goto out;
 	}
 	bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
@@ -744,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
 
 	spin_unlock_irq(&b->bm_lock);
 	if (opages != npages)
-		bm_vk_free(opages, opages_vmalloced);
+		bm_vk_free(opages);
 	if (!growing)
 		b->bm_set = bm_count_bits(b);
 	drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b6844feb9f9b..34bc84efc29e 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -536,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */
 /* definition of bits in bm_flags to be used in drbd_bm_lock
  * and drbd_bitmap_io and friends. */
 enum bm_flag {
-	/* do we need to kfree, or vfree bm_pages? */
-	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
-
 	/* currently locked for bulk operation */
 	BM_LOCKED_MASK = 0xf,
 
diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c
index f1d7fa45c275..f3f92d5fcda0 100644
--- a/drivers/char/mspec.c
+++ b/drivers/char/mspec.c
@@ -93,14 +93,11 @@ struct vma_data {
 	spinlock_t lock;	/* Serialize access to this structure. */
 	int count;		/* Number of pages allocated. */
 	enum mspec_page_type type; /* Type of pages allocated. */
-	int flags;		/* See VMD_xxx below. */
 	unsigned long vm_start;	/* Original (unsplit) base. */
 	unsigned long vm_end;	/* Original (unsplit) end. */
 	unsigned long maddr[0];	/* Array of MSPEC addresses. */
 };
 
-#define VMD_VMALLOCED 0x1	/* vmalloc'd rather than kmalloc'd */
-
 /* used on shub2 to clear FOP cache in the HUB */
 static unsigned long scratch_page[MAX_NUMNODES];
 #define SH2_AMO_CACHE_ENTRIES	4
@@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma)
 			       "failed to zero page %ld\n", my_page);
 	}
 
-	if (vdata->flags & VMD_VMALLOCED)
-		vfree(vdata);
-	else
-		kfree(vdata);
+	kvfree(vdata);
 }
 
 /*
@@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
 					enum mspec_page_type type)
 {
 	struct vma_data *vdata;
-	int pages, vdata_size, flags = 0;
+	int pages, vdata_size;
 
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
@@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma,
 	vdata_size = sizeof(struct vma_data) + pages * sizeof(long);
 	if (vdata_size <= PAGE_SIZE)
 		vdata = kzalloc(vdata_size, GFP_KERNEL);
-	else {
+	else
 		vdata = vzalloc(vdata_size);
-		flags = VMD_VMALLOCED;
-	}
 	if (!vdata)
 		return -ENOMEM;
 
 	vdata->vm_start = vma->vm_start;
 	vdata->vm_end = vma->vm_end;
-	vdata->flags = flags;
 	vdata->type = type;
 	spin_lock_init(&vdata->lock);
 	atomic_set(&vdata->refcnt, 1);
diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c
index c3b80fd65d62..7b30b307674b 100644
--- a/drivers/gpu/drm/drm_hashtab.c
+++ b/drivers/gpu/drm/drm_hashtab.c
@@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item);
 void drm_ht_remove(struct drm_open_hash *ht)
 {
 	if (ht->table) {
-		if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order)
-			kfree(ht->table);
-		else
-			vfree(ht->table);
+		kvfree(ht->table);
 		ht->table = NULL;
 	}
 }
diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
index d6273e143324..a80d993b882e 100644
--- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
+++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h
@@ -151,16 +151,12 @@ do { \
 
 #define LIBCFS_FREE(ptr, size)					\
 do {								\
-	int s = (size);						\
 	if (unlikely((ptr) == NULL)) {				\
 		CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \
-		       "%s:%d\n", s, __FILE__, __LINE__);	\
+		       "%s:%d\n", (int)(size), __FILE__, __LINE__); \
 		break;						\
 	}							\
-	if (unlikely(s > LIBCFS_VMALLOC_SIZE))			\
-		vfree(ptr);					\
-	else							\
-		kfree(ptr);					\
+	kvfree(ptr);						\
 } while (0)
 
 /******************************************************************************/
diff --git a/fs/block_dev.c b/fs/block_dev.c
index ba762ea07f67..60895e500e15 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
 		return;
 
 	invalidate_bh_lrus();
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index f829fe963f5b..5104d84c4f64 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
 } while (0)
 
 
-#define CODA_FREE(ptr,size) \
-    do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+#define CODA_FREE(ptr, size) kvfree((ptr))
 
 /* inode to cnode access functions */
 
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pagevec.h>
 #include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
@@ -245,6 +246,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
 		struct address_space *mapping = inode->i_mapping;
@@ -324,6 +326,199 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 	return 0;
 }
 
+#define NO_SECTOR -1
+#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT))
+
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+		sector_t sector, bool pmd_entry, bool dirty)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	pgoff_t pmd_index = DAX_PMD_INDEX(index);
+	int type, error = 0;
+	void *entry;
+
+	WARN_ON_ONCE(pmd_entry && !dirty);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	entry = radix_tree_lookup(page_tree, pmd_index);
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+		index = pmd_index;
+		goto dirty;
+	}
+
+	entry = radix_tree_lookup(page_tree, index);
+	if (entry) {
+		type = RADIX_DAX_TYPE(entry);
+		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+					type != RADIX_DAX_PMD)) {
+			error = -EIO;
+			goto unlock;
+		}
+
+		if (!pmd_entry || type == RADIX_DAX_PMD)
+			goto dirty;
+
+		/*
+		 * We only insert dirty PMD entries into the radix tree.  This
+		 * means we don't need to worry about removing a dirty PTE
+		 * entry and inserting a clean PMD entry, thus reducing the
+		 * range we would flush with a follow-up fsync/msync call.
+		 */
+		radix_tree_delete(&mapping->page_tree, index);
+		mapping->nrexceptional--;
+	}
+
+	if (sector == NO_SECTOR) {
+		/*
+		 * This can happen during correct operation if our pfn_mkwrite
+		 * fault raced against a hole punch operation.  If this
+		 * happens the pte that was hole punched will have been
+		 * unmapped and the radix tree entry will have been removed by
+		 * the time we are called, but the call will still happen.  We
+		 * will return all the way up to wp_pfn_shared(), where the
+		 * pte_same() check will fail, eventually causing page fault
+		 * to be retried by the CPU.
+		 */
+		goto unlock;
+	}
+
+	error = radix_tree_insert(page_tree, index,
+			RADIX_DAX_ENTRY(sector, pmd_entry));
+	if (error)
+		goto unlock;
+
+	mapping->nrexceptional++;
+ dirty:
+	if (dirty)
+		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return error;
+}
+
+static int dax_writeback_one(struct block_device *bdev,
+		struct address_space *mapping, pgoff_t index, void *entry)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int type = RADIX_DAX_TYPE(entry);
+	struct radix_tree_node *node;
+	struct blk_dax_ctl dax;
+	void **slot;
+	int ret = 0;
+
+	spin_lock_irq(&mapping->tree_lock);
+	/*
+	 * Regular page slots are stabilized by the page lock even
+	 * without the tree itself locked.  These unlocked entries
+	 * need verification under the tree lock.
+	 */
+	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+
+	/* another fsync thread may have already written back this entry */
+	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto unlock;
+
+	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	dax.sector = RADIX_DAX_SECTOR(entry);
+	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+	spin_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * We cannot hold tree_lock while calling dax_map_atomic() because it
+	 * eventually calls cond_resched().
+	 */
+	ret = dax_map_atomic(bdev, &dax);
+	if (ret < 0)
+		return ret;
+
+	if (WARN_ON_ONCE(ret < dax.size)) {
+		ret = -EIO;
+		goto unmap;
+	}
+
+	wb_cache_pmem(dax.addr, dax.size);
+
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+	spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+	dax_unmap_atomic(bdev, &dax);
+	return ret;
+
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ */
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end)
+{
+	struct inode *inode = mapping->host;
+	struct block_device *bdev = inode->i_sb->s_bdev;
+	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct pagevec pvec;
+	bool done = false;
+	int i, ret = 0;
+	void *entry;
+
+	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+		return -EIO;
+
+	start_index = start >> PAGE_CACHE_SHIFT;
+	end_index = end >> PAGE_CACHE_SHIFT;
+	pmd_index = DAX_PMD_INDEX(start_index);
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+	rcu_read_unlock();
+
+	/* see if the start of our range is covered by a PMD entry */
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+		start_index = pmd_index;
+
+	tag_pages_for_writeback(mapping, start_index, end_index);
+
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		pvec.nr = find_get_entries_tag(mapping, start_index,
+				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+				pvec.pages, indices);
+
+		if (pvec.nr == 0)
+			break;
+
+		for (i = 0; i < pvec.nr; i++) {
+			if (indices[i] > end_index) {
+				done = true;
+				break;
+			}
+
+			ret = dax_writeback_one(bdev, mapping, indices[i],
+					pvec.pages[i]);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	wmb_pmem();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
@@ -363,6 +558,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
+	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+			vmf->flags & FAULT_FLAG_WRITE);
+	if (error)
+		goto out;
+
 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
 
  out:
@@ -408,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	memset(&bh, 0, sizeof(bh));
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
  repeat:
@@ -487,6 +688,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		delete_from_page_cache(page);
 		unlock_page(page);
 		page_cache_release(page);
+		page = NULL;
 	}
 
 	/*
@@ -590,7 +792,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct block_device *bdev;
 	pgoff_t size, pgoff;
 	sector_t block;
-	int result = 0;
+	int error, result = 0;
+	bool alloc = false;
 
 	/* dax pmd mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
@@ -624,13 +827,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	if (get_block(inode, block, &bh, write) != 0)
+
+	if (get_block(inode, block, &bh, 0) != 0)
 		return VM_FAULT_SIGBUS;
+
+	if (!buffer_mapped(&bh) && write) {
+		if (get_block(inode, block, &bh, 1) != 0)
+			return VM_FAULT_SIGBUS;
+		alloc = true;
+	}
+
 	bdev = bh.b_bdev;
-	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -639,19 +850,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	 */
 	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
 		dax_pmd_dbg(&bh, address, "allocated block too small");
-		goto fallback;
+		return VM_FAULT_FALLBACK;
 	}
 
 	/*
 	 * If we allocated new storage, make sure no process has any
 	 * zero pages covering this hole
 	 */
-	if (buffer_new(&bh)) {
-		i_mmap_unlock_read(mapping);
-		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_read(mapping);
+	if (alloc) {
+		loff_t lstart = pgoff << PAGE_SHIFT;
+		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+		truncate_pagecache_range(inode, lstart, lend);
 	}
 
+	i_mmap_lock_read(mapping);
+
 	/*
 	 * If a truncate happened while we were allocating blocks, we may
 	 * leave blocks allocated to the file that are beyond EOF. We can't
@@ -664,7 +878,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		goto out;
 	}
 	if ((pgoff | PG_PMD_COLOUR) >= size) {
-		dax_pmd_dbg(&bh, address, "pgoff unaligned");
+		dax_pmd_dbg(&bh, address,
+				"offset + huge page size > file size");
 		goto fallback;
 	}
 
@@ -732,6 +947,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
+	/*
+	 * For PTE faults we insert a radix tree entry for reads, and
+	 * leave it clean.  Then on the first write we dirty the radix
+	 * tree entry via the dax_pfn_mkwrite() path.  This sequence
+	 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+	 * call into get_block() to translate the pgoff to a sector in
+	 * order to be able to create a new radix tree entry.
+	 *
+	 * The PMD path doesn't have an equivalent to
+	 * dax_pfn_mkwrite(), though, so for a read followed by a
+	 * write we traverse all the way through __dax_pmd_fault()
+	 * twice.  This means we can just skip inserting a radix tree
+	 * entry completely on the initial read and just wait until
+	 * the write to insert a dirty entry.
+	 */
+	if (write) {
+		error = dax_radix_entry(mapping, pgoff, dax.sector,
+				true, true);
+		if (error) {
+			dax_pmd_dbg(&bh, address,
+					"PMD radix insertion failed");
+			goto fallback;
+		}
+	}
+
 	dev_dbg(part_to_dev(bdev->bd_part),
 		"%s: %s addr: %lx pfn: %lx sect: %llx\n",
 		__func__, current->comm, address,
@@ -790,15 +1030,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 * dax_pfn_mkwrite - handle first write to DAX page
 * @vma: The virtual memory area where the fault occurred
 * @vmf: The description of the fault
- *
 */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct file *file = vma->vm_file;
 
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	sb_end_pagefault(sb);
+	/*
+	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+	 * RADIX_DAX_PTE entry already exists in the radix tree from a
+	 * previous call to __dax_fault().  We just want to look up that PTE
+	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
+	 * saves us from having to make a call to get_block() here to look
+	 * up the sector.
+	 */
+	dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -835,6 +1080,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_CACHE_SIZE;
 	err = get_block(inode, index, &bh, 0);
 	if (err < 0)
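
Taken together, the fs/dax.c changes give data-integrity syncs on DAX files a real writeback path. A condensed sketch of the call chain this series establishes (simplified; locking and error handling elided):

	/*
	 * fsync(fd) on a DAX file, simplified:
	 *
	 * vfs_fsync_range()
	 *   -> ->fsync()                          (ext2/ext4/xfs)
	 *     -> filemap_write_and_wait_range()
	 *       -> dax_writeback_mapping_range()   walk TOWRITE-tagged entries
	 *         -> dax_writeback_one()           wb_cache_pmem() per entry
	 *       -> wmb_pmem()                      single fence at the end
	 */
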
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 11a42c5a09ae..2c88d683cd91 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 {
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ext2_inode_info *ei = EXT2_I(inode);
-	int ret = VM_FAULT_NOPAGE;
 	loff_t size;
+	int ret;
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
@@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vma, vmf);
 
 	up_read(&ei->dax_sem);
 	sb_end_pagefault(inode->i_sb);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 749b222e6498..8c8965cc4aab 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -291,8 +291,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
 {
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	int ret = VM_FAULT_NOPAGE;
 	loff_t size;
+	int ret;
 
 	sb_start_pagefault(sb);
 	file_update_time(vma->vm_file);
@@ -300,6 +300,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vma, vmf);
 	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(sb);
 
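
The ext2 and ext4 hunks are deliberately identical in shape: take the pagefault/freeze protection and the filesystem's mmap/truncate lock, check i_size, and only then delegate to dax_pfn_mkwrite() so the radix tree entry gets its dirty tag. A minimal sketch of the shared pattern (the function name is illustrative, not from the patch):

	static int example_dax_pfn_mkwrite(struct vm_area_struct *vma,
					   struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vma->vm_file);
		loff_t size;
		int ret;

		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
		/* take the fs-specific fault/truncate lock here */
		size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (vmf->pgoff >= size)
			ret = VM_FAULT_SIGBUS;
		else
			ret = dax_pfn_mkwrite(vma, vmf); /* dirties the DAX radix entry */
		/* drop the fs-specific lock here */
		sb_end_pagefault(inode->i_sb);
		return ret;
	}
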
diff --git a/fs/inode.c b/fs/inode.c
index e491e54d2430..1e6dd388ba7f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -495,7 +495,7 @@ void clear_inode(struct inode *inode)
 	 */
 	spin_lock_irq(&inode->i_data.tree_lock);
 	BUG_ON(inode->i_data.nrpages);
-	BUG_ON(inode->i_data.nrshadows);
+	BUG_ON(inode->i_data.nrexceptional);
 	spin_unlock_irq(&inode->i_data.tree_lock);
 	BUG_ON(!list_empty(&inode->i_data.private_list));
 	BUG_ON(!(inode->i_state & I_FREEING));
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a3750f902adc..0ae91ad6df2d 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mtd/mtd.h>
+#include <linux/mm.h> /* kvfree() */
 #include "nodelist.h"
 
 static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
@@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
 	return 0;
 
  out_free:
-#ifndef __ECOS
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-#endif
-	kfree(c->blocks);
+	kvfree(c->blocks);
 
 	return ret;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2caf1682036d..bead25ae8fe4 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 out_root:
	jffs2_free_ino_caches(c);
	jffs2_free_raw_node_refs(c);
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-		kfree(c->blocks);
+	kvfree(c->blocks);
 out_inohash:
	jffs2_clear_xattr_subsystem(c);
	kfree(c->inocache_list);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index bb080c272149..0a9a114bb9d1 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
 
 	jffs2_free_ino_caches(c);
 	jffs2_free_raw_node_refs(c);
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-		kfree(c->blocks);
+	kvfree(c->blocks);
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
 	jffs2_clear_xattr_subsystem(c);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 0fbb4c7c72e8..a522c15a0bfd 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
 {
 	int i;
 	int nr_groups = bitmap->s_nr_groups;
-	int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
-						nr_groups);
 
 	for (i = 0; i < nr_groups; i++)
 		if (bitmap->s_block_bitmap[i])
 			brelse(bitmap->s_block_bitmap[i]);
 
-	if (size <= PAGE_SIZE)
-		kfree(bitmap);
-	else
-		vfree(bitmap);
+	kvfree(bitmap);
 }
 
 static void udf_free_partition(struct udf_part_map *map)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ebe9b8290a70..55e16e2402a7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault(
 /*
  * pfn_mkwrite was originally inteneded to ensure we capture time stamp
  * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
- * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
- * barrier in place.
+ * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
+ * to ensure we serialise the fault barrier in place.
  */
 static int
 xfs_filemap_pfn_mkwrite(
@@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite(
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else if (IS_DAX(inode))
+		ret = dax_pfn_mkwrite(vma, vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index b415e521528d..8204c3dc3800 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -36,4 +36,11 @@ static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
 	return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
 }
+
+static inline bool dax_mapping(struct address_space *mapping)
+{
+	return mapping->host && IS_DAX(mapping->host);
+}
+int dax_writeback_mapping_range(struct address_space *mapping, loff_t start,
+		loff_t end);
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eb73d74ed992..0d7570320d63 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -433,7 +433,8 @@ struct address_space {
 	struct rw_semaphore	i_mmap_rwsem;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
-	unsigned long		nrshadows;	/* number of shadow entries */
+	/* number of shadow or DAX exceptional entries */
+	unsigned long		nrexceptional;
 	pgoff_t			writeback_index;/* writeback starts here */
 	const struct address_space_operations *a_ops;	/* methods */
 	unsigned long		flags;		/* error bits/gfp mask */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 4d08b6c33557..92395a0a7dc5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 			       unsigned int nr_pages, struct page **pages);
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+			int tag, unsigned int nr_entries,
+			struct page **entries, pgoff_t *indices);
 
 struct page *grab_cache_page_write_begin(struct address_space *mapping,
 			pgoff_t index, unsigned flags);
diff --git a/include/linux/pmem.h b/include/linux/pmem.h
index acfea8ce4a07..7c3d11a6b4ad 100644
--- a/include/linux/pmem.h
+++ b/include/linux/pmem.h
@@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size)
 {
 	BUG();
 }
+
+static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size)
+{
+	BUG();
+}
 #endif
 
 /*
  * Architectures that define ARCH_HAS_PMEM_API must provide
  * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(),
- * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem().
+ * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem()
+ * and arch_has_wmb_pmem().
 */
 static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size)
 {
@@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size)
 	else
 		default_clear_pmem(addr, size);
 }
+
+/**
+ * wb_cache_pmem - write back processor cache for PMEM memory range
+ * @addr:	virtual start address
+ * @size:	number of bytes to write back
+ *
+ * Write back the processor cache range starting at 'addr' for 'size' bytes.
+ * This function requires explicit ordering with a wmb_pmem() call.
+ */
+static inline void wb_cache_pmem(void __pmem *addr, size_t size)
+{
+	if (arch_has_pmem_api())
+		arch_wb_cache_pmem(addr, size);
+}
 #endif /* __PMEM_H__ */
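
The contract of the new export mirrors the rest of the PMEM API: wb_cache_pmem() calls may be batched over many ranges, and only a final wmb_pmem() makes them durable. A minimal usage sketch, assuming addr came from something like dax_map_atomic():

	/* CPU stores to a DAX mapping can still be sitting in cache */
	wb_cache_pmem(addr, size);	/* write back the dirtied range */
	wmb_pmem();			/* one fence orders all prior flushes */
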
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 57e7d87d2d4c..7c88ad156a29 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -51,6 +51,15 @@
 #define RADIX_TREE_EXCEPTIONAL_ENTRY	2
 #define RADIX_TREE_EXCEPTIONAL_SHIFT	2
 
+#define RADIX_DAX_MASK	0xf
+#define RADIX_DAX_SHIFT	4
+#define RADIX_DAX_PTE  (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_PMD  (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY)
+#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK)
+#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT))
+#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \
+	RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE)))
+
 static inline int radix_tree_is_indirect_ptr(void *ptr)
 {
 	return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR);
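
With these macros a DAX radix tree entry is just a sector number shifted up by RADIX_DAX_SHIFT, with the low bits carrying the type and the exceptional-entry marker. A worked round-trip, following the definitions above:

	void *e = RADIX_DAX_ENTRY(1234, false);	/* (1234 << 4) | 0x4 | 0x2 */

	RADIX_DAX_TYPE(e);			/* == RADIX_DAX_PTE (0x6) */
	RADIX_DAX_SECTOR(e);			/* == 1234 */
	radix_tree_exceptional_entry(e);	/* true: bit 1 is set */
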
diff --git a/ipc/sem.c b/ipc/sem.c
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1493,7 +1493,7 @@ out_rcu_wakeup:
 	wake_up_sem_queue_do(&tasks);
 out_free:
 	if (sem_io != fast_sem_io)
-		ipc_free(sem_io, sizeof(ushort)*nsems);
+		ipc_free(sem_io);
 	return err;
 }
 
diff --git a/ipc/util.c b/ipc/util.c
index 0f401d94b7c6..798cad18dd87 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -414,17 +414,12 @@ void *ipc_alloc(int size)
 /**
  * ipc_free - free ipc space
  * @ptr: pointer returned by ipc_alloc
- * @size: size of block
  *
- * Free a block created with ipc_alloc(). The caller must know the size
- * used in the allocation call.
+ * Free a block created with ipc_alloc().
 */
-void ipc_free(void *ptr, int size)
+void ipc_free(void *ptr)
 {
-	if (size > PAGE_SIZE)
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 /**
diff --git a/ipc/util.h b/ipc/util.h
index 3a8a5a0eca62..51f7ca58ac67 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg);
 * both function can sleep
 */
 void *ipc_alloc(int size);
-void ipc_free(void *ptr, int size);
+void ipc_free(void *ptr);
 
 /*
 * For allocation that need to be freed by RCU.
diff --git a/mm/filemap.c b/mm/filemap.c
index 847ee43c2806..2e7c8d980d5e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -11,6 +11,7 @@
 */
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/capability.h>
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
 
 	if (shadow) {
-		mapping->nrshadows++;
+		mapping->nrexceptional++;
 		/*
-		 * Make sure the nrshadows update is committed before
+		 * Make sure the nrexceptional update is committed before
 		 * the nrpages update so that final truncate racing
 		 * with reclaim does not see both counters 0 at the
 		 * same time and miss a shadow entry.
@@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 {
 	int err = 0;
 
+	if (dax_mapping(mapping) && mapping->nrexceptional) {
+		err = dax_writeback_mapping_range(mapping, lstart, lend);
+		if (err)
+			return err;
+	}
+
 	if (mapping->nrpages) {
 		err = __filemap_fdatawrite_range(mapping, lstart, lend,
 						 WB_SYNC_ALL);
@@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
+
+		if (WARN_ON(dax_mapping(mapping)))
+			return -EINVAL;
+
 		if (shadowp)
 			*shadowp = p;
-		mapping->nrshadows--;
+		mapping->nrexceptional--;
 		if (node)
 			workingset_node_shadows_dec(node);
 	}
@@ -1245,9 +1256,9 @@ repeat:
 		if (radix_tree_deref_retry(page))
 			goto restart;
 		/*
-		 * A shadow entry of a recently evicted page,
-		 * or a swap entry from shmem/tmpfs.  Return
-		 * it without attempting to raise page count.
+		 * A shadow entry of a recently evicted page, a swap
+		 * entry from shmem/tmpfs or a DAX entry.  Return it
+		 * without attempting to raise page count.
 		 */
 		goto export;
 	}
@@ -1494,6 +1505,74 @@ repeat:
 }
 EXPORT_SYMBOL(find_get_pages_tag);
 
+/**
+ * find_get_entries_tag - find and return entries that match @tag
+ * @mapping: the address_space to search
+ * @start: the starting page cache index
+ * @tag: the tag index
+ * @nr_entries: the maximum number of entries
+ * @entries: where the resulting entries are placed
+ * @indices: the cache indices corresponding to the entries in @entries
+ *
+ * Like find_get_entries, except we only return entries which are tagged with
+ * @tag.
+ */
+unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
+			int tag, unsigned int nr_entries,
+			struct page **entries, pgoff_t *indices)
+{
+	void **slot;
+	unsigned int ret = 0;
+	struct radix_tree_iter iter;
+
+	if (!nr_entries)
+		return 0;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_tagged(slot, &mapping->page_tree,
+				   &iter, start, tag) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			continue;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+
+			/*
+			 * A shadow entry of a recently evicted page, a swap
+			 * entry from shmem/tmpfs or a DAX entry. Return it
+			 * without attempting to raise page count.
+			 */
+			goto export;
+		}
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *slot)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+export:
+		indices[ret] = iter.index;
+		entries[ret] = page;
+		if (++ret == nr_entries)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(find_get_entries_tag);
+
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
  * a _large_ part of the i/o request. Imagine the worst scenario:
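find_get_entries_tag() is the tagged counterpart of find_get_entries(); the DAX writeback code added elsewhere in this series is its intended caller. A sketch of the batching pattern such a caller would use, where EXAMPLE_BATCH and the per-entry work are illustrative only:

	#define EXAMPLE_BATCH 16	/* hypothetical batch size */

	static void example_walk_tagged_entries(struct address_space *mapping)
	{
		struct page *entries[EXAMPLE_BATCH];
		pgoff_t indices[EXAMPLE_BATCH];
		pgoff_t start = 0;
		unsigned int i, n;

		do {
			n = find_get_entries_tag(mapping, start,
					PAGECACHE_TAG_TOWRITE, EXAMPLE_BATCH,
					entries, indices);
			for (i = 0; i < n; i++) {
				start = indices[i] + 1;	/* resume past this entry */
				/* Exceptional entries can be inspected directly;
				 * real pages come back with a raised refcount
				 * and need page_cache_release() when done. */
			}
		} while (n == EXAMPLE_BATCH);	/* a short batch means exhausted */
	}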
diff --git a/mm/percpu.c b/mm/percpu.c
index 8a943b97a053..998607adf6eb 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size)
 /**
  * pcpu_mem_free - free memory
  * @ptr: memory to free
- * @size: size of the area
  *
  * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc().
  */
-static void pcpu_mem_free(void *ptr, size_t size)
+static void pcpu_mem_free(void *ptr)
 {
-	if (size <= PAGE_SIZE)
-		kfree(ptr);
-	else
-		vfree(ptr);
+	kvfree(ptr);
 }
 
 /**
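kvfree() (mm/util.c) decides between kfree() and vfree() by asking is_vmalloc_addr(), which is what makes dropping the size argument safe. The conversion reduced to its essence, with the old_style/new_style names purely illustrative:

	/* Before: the caller had to remember how the buffer was allocated. */
	static void old_style_free(void *ptr, size_t size)
	{
		if (size <= PAGE_SIZE)		/* assumed kmalloc'd */
			kfree(ptr);
		else				/* assumed vmalloc'd */
			vfree(ptr);
	}

	/* After: kvfree() routes on is_vmalloc_addr() internally. */
	static void new_style_free(void *ptr)
	{
		kvfree(ptr);
	}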
@@ -463,8 +459,8 @@ out_unlock:
 	 * pcpu_mem_free() might end up calling vfree() which uses
 	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
 	 */
-	pcpu_mem_free(old, old_size);
-	pcpu_mem_free(new, new_size);
+	pcpu_mem_free(old);
+	pcpu_mem_free(new);
 
 	return 0;
 }
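The comment above survives the conversion because kvfree() can still reach vfree(). A sketch of the constraint, with flags/old/new standing in for the locals of the real function:

	spin_lock_irqsave(&pcpu_lock, flags);
	/* ... install the new map, stash the old pointer ... */
	spin_unlock_irqrestore(&pcpu_lock, flags);

	pcpu_mem_free(old);	/* safe: vfree()'s IRQ-unsafe lock is fine here */
	pcpu_mem_free(new);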
@@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 	chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC *
 						sizeof(chunk->map[0]));
 	if (!chunk->map) {
-		pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+		pcpu_mem_free(chunk);
 		return NULL;
 	}
 
@@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
 {
 	if (!chunk)
 		return;
-	pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
-	pcpu_mem_free(chunk, pcpu_chunk_struct_size);
+	pcpu_mem_free(chunk->map);
+	pcpu_mem_free(chunk);
 }
 
 /**
diff --git a/mm/truncate.c b/mm/truncate.c
index 76e35ad97102..e3ee0e27cd17 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
 		return;
 
 	spin_lock_irq(&mapping->tree_lock);
-	/*
-	 * Regular page slots are stabilized by the page lock even
-	 * without the tree itself locked. These unlocked entries
-	 * need verification under the tree lock.
-	 */
-	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
-		goto unlock;
-	if (*slot != entry)
-		goto unlock;
-	radix_tree_replace_slot(slot, NULL);
-	mapping->nrshadows--;
-	if (!node)
-		goto unlock;
-	workingset_node_shadows_dec(node);
-	/*
-	 * Don't track node without shadow entries.
-	 *
-	 * Avoid acquiring the list_lru lock if already untracked.
-	 * The list_empty() test is safe as node->private_list is
-	 * protected by mapping->tree_lock.
-	 */
-	if (!workingset_node_shadows(node) &&
-	    !list_empty(&node->private_list))
-		list_lru_del(&workingset_shadow_nodes, &node->private_list);
-	__radix_tree_delete_node(&mapping->page_tree, node);
+
+	if (dax_mapping(mapping)) {
+		if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+			mapping->nrexceptional--;
+	} else {
+		/*
+		 * Regular page slots are stabilized by the page lock even
+		 * without the tree itself locked. These unlocked entries
+		 * need verification under the tree lock.
+		 */
+		if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+					&slot))
+			goto unlock;
+		if (*slot != entry)
+			goto unlock;
+		radix_tree_replace_slot(slot, NULL);
+		mapping->nrexceptional--;
+		if (!node)
+			goto unlock;
+		workingset_node_shadows_dec(node);
+		/*
+		 * Don't track node without shadow entries.
+		 *
+		 * Avoid acquiring the list_lru lock if already untracked.
+		 * The list_empty() test is safe as node->private_list is
+		 * protected by mapping->tree_lock.
+		 */
+		if (!workingset_node_shadows(node) &&
+		    !list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+					&node->private_list);
+		__radix_tree_delete_node(&mapping->page_tree, node);
+	}
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
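The DAX branch needs no open-coded verification because radix_tree_delete_item() performs the compare itself: it removes the slot only if it still holds the item passed in, and returns that item on success. In sketch form, the branch is equivalent to:

	void *ret = radix_tree_delete_item(&mapping->page_tree, index, entry);
	if (ret)	/* ret == entry: the slot still held it and is now gone */
		mapping->nrexceptional--;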
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	int		i;
 
 	cleancache_invalidate_inode(mapping);
-	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
 		return;
 
 	/* Offsets within partial pages */
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-	unsigned long nrshadows;
+	unsigned long nrexceptional;
 	unsigned long nrpages;
 
 	/*
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
 
 	/*
 	 * When reclaim installs eviction entries, it increases
-	 * nrshadows first, then decreases nrpages. Make sure we see
+	 * nrexceptional first, then decreases nrpages. Make sure we see
 	 * this in the right order or we might miss an entry.
 	 */
 	nrpages = mapping->nrpages;
 	smp_rmb();
-	nrshadows = mapping->nrshadows;
+	nrexceptional = mapping->nrexceptional;
 
-	if (nrpages || nrshadows) {
+	if (nrpages || nrexceptional) {
 		/*
 		 * As truncation uses a lockless tree lookup, cycle
 		 * the tree lock to make sure any ongoing tree
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd620b65db52..eb3dd37ccd7c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,6 +46,7 @@
 #include <linux/oom.h>
 #include <linux/prefetch.h>
 #include <linux/printk.h>
+#include <linux/dax.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	 * inode reclaim needs to empty out the radix tree or
 	 * the nodes are lost. Don't plant shadows behind its
 	 * back.
+	 *
+	 * We also don't store shadows for DAX mappings because the
+	 * only page cache pages found in these are zero pages
+	 * covering holes, and because we don't want to mix DAX
+	 * exceptional entries and shadow exceptional entries in the
+	 * same page_tree.
 	 */
 	if (reclaimed && page_is_file_cache(page) &&
-	    !mapping_exiting(mapping))
+	    !mapping_exiting(mapping) && !dax_mapping(mapping))
 		shadow = workingset_eviction(mapping, page);
 	__delete_from_page_cache(page, shadow, memcg);
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
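dax_mapping(), used as the guard throughout this series, is a trivial predicate on the host inode. Its assumed shape (quoted from memory, so treat as an approximation of include/linux/dax.h rather than a verbatim copy):

	static inline bool dax_mapping(struct address_space *mapping)
	{
		return mapping->host && IS_DAX(mapping->host);
	}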
diff --git a/mm/workingset.c b/mm/workingset.c
index aa017133744b..61ead9e5549d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 			node->slots[i] = NULL;
 			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
 			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-			BUG_ON(!mapping->nrshadows);
-			mapping->nrshadows--;
+			BUG_ON(!mapping->nrexceptional);
+			mapping->nrexceptional--;
 		}
 	}
 	BUG_ON(node->count);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 744e5936c10d..7aea0ccb6be6 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
 
 	if (!n->tn_bits)
 		kmem_cache_free(trie_leaf_kmem, n);
-	else if (n->tn_bits <= TNODE_KMALLOC_MAX)
-		kfree(n);
 	else
-		vfree(n);
+		kvfree(n);
 }
 
 #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
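kvfree() is correct here only because both allocation paths produce pointers it can classify. The matching allocator in fib_trie.c, reproduced from memory (treat as an assumption), picks kzalloc() or vzalloc() by size, mirroring the branch that the old free path used:

	static void *tnode_alloc(size_t size)
	{
		if (size <= PAGE_SIZE)
			return kzalloc(size, GFP_KERNEL);
		else
			return vzalloc(size);
	}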