Diffstat (limited to 'fs/dax.c')
-rw-r--r-- | fs/dax.c | 392 |
1 file changed, 175 insertions, 217 deletions
@@ -42,6 +42,9 @@ | |||
42 | #define DAX_WAIT_TABLE_BITS 12 | 42 | #define DAX_WAIT_TABLE_BITS 12 |
43 | #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) | 43 | #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) |
44 | 44 | ||
45 | /* The 'colour' (ie low bits) within a PMD of a page offset. */ | ||
46 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) | ||
47 | |||
45 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; | 48 | static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; |
46 | 49 | ||
47 | static int __init init_dax_wait_table(void) | 50 | static int __init init_dax_wait_table(void) |
@@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void) | |||
54 | } | 57 | } |
55 | fs_initcall(init_dax_wait_table); | 58 | fs_initcall(init_dax_wait_table); |
56 | 59 | ||
60 | /* | ||
61 | * We use lowest available bit in exceptional entry for locking, one bit for | ||
62 | * the entry size (PMD) and two more to tell us if the entry is a zero page or | ||
63 | * an empty entry that is just used for locking. In total four special bits. | ||
64 | * | ||
65 | * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE | ||
66 | * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem | ||
67 | * block allocation. | ||
68 | */ | ||
69 | #define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) | ||
70 | #define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) | ||
71 | #define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) | ||
72 | #define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) | ||
73 | #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) | ||
74 | |||
75 | static unsigned long dax_radix_sector(void *entry) | ||
76 | { | ||
77 | return (unsigned long)entry >> RADIX_DAX_SHIFT; | ||
78 | } | ||
79 | |||
80 | static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) | ||
81 | { | ||
82 | return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | | ||
83 | ((unsigned long)sector << RADIX_DAX_SHIFT) | | ||
84 | RADIX_DAX_ENTRY_LOCK); | ||
85 | } | ||
86 | |||
87 | static unsigned int dax_radix_order(void *entry) | ||
88 | { | ||
89 | if ((unsigned long)entry & RADIX_DAX_PMD) | ||
90 | return PMD_SHIFT - PAGE_SHIFT; | ||
91 | return 0; | ||
92 | } | ||
93 | |||
57 | static int dax_is_pmd_entry(void *entry) | 94 | static int dax_is_pmd_entry(void *entry) |
58 | { | 95 | { |
59 | return (unsigned long)entry & RADIX_DAX_PMD; | 96 | return (unsigned long)entry & RADIX_DAX_PMD; |
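The entry encoding introduced above can be checked in isolation with a small user-space sketch. This is not part of the patch; the RADIX_TREE_EXCEPTIONAL_* values are pasted in as assumptions (2 is their usual value in kernels of this era), and the arithmetic mirrors dax_radix_locked_entry() and dax_radix_sector():

    #include <stdio.h>

    #define RADIX_TREE_EXCEPTIONAL_ENTRY    2UL    /* assumed header value */
    #define RADIX_TREE_EXCEPTIONAL_SHIFT    2      /* assumed header value */
    #define RADIX_DAX_SHIFT         (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
    #define RADIX_DAX_ENTRY_LOCK    (1UL << RADIX_TREE_EXCEPTIONAL_SHIFT)
    #define RADIX_DAX_PMD           (1UL << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))

    int main(void)
    {
            unsigned long sector = 0x1234;
            /* what dax_radix_locked_entry(sector, RADIX_DAX_PMD) would build */
            unsigned long entry = RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_PMD |
                            (sector << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK;

            /* dax_radix_sector() simply reverses the shift; the flag bits
             * sit below RADIX_DAX_SHIFT and drop out */
            printf("entry=%#lx sector=%#lx pmd=%d locked=%d\n", entry,
                   entry >> RADIX_DAX_SHIFT,
                   !!(entry & RADIX_DAX_PMD),
                   !!(entry & RADIX_DAX_ENTRY_LOCK));
            return 0;
    }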
@@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry) | |||
66 | 103 | ||
67 | static int dax_is_zero_entry(void *entry) | 104 | static int dax_is_zero_entry(void *entry) |
68 | { | 105 | { |
69 | return (unsigned long)entry & RADIX_DAX_HZP; | 106 | return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; |
70 | } | 107 | } |
71 | 108 | ||
72 | static int dax_is_empty_entry(void *entry) | 109 | static int dax_is_empty_entry(void *entry) |
@@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, | |||
98 | * the range covered by the PMD map to the same bit lock. | 135 | * the range covered by the PMD map to the same bit lock. |
99 | */ | 136 | */ |
100 | if (dax_is_pmd_entry(entry)) | 137 | if (dax_is_pmd_entry(entry)) |
101 | index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); | 138 | index &= ~PG_PMD_COLOUR; |
102 | 139 | ||
103 | key->mapping = mapping; | 140 | key->mapping = mapping; |
104 | key->entry_start = index; | 141 | key->entry_start = index; |
@@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo | |||
121 | } | 158 | } |
122 | 159 | ||
123 | /* | 160 | /* |
161 | * We do not necessarily hold the mapping->tree_lock when we call this | ||
162 | * function so it is possible that 'entry' is no longer a valid item in the | ||
163 | * radix tree. This is okay because all we really need to do is to find the | ||
164 | * correct waitqueue where tasks might be waiting for that old 'entry' and | ||
165 | * wake them. | ||
166 | */ | ||
167 | static void dax_wake_mapping_entry_waiter(struct address_space *mapping, | ||
168 | pgoff_t index, void *entry, bool wake_all) | ||
169 | { | ||
170 | struct exceptional_entry_key key; | ||
171 | wait_queue_head_t *wq; | ||
172 | |||
173 | wq = dax_entry_waitqueue(mapping, index, entry, &key); | ||
174 | |||
175 | /* | ||
176 | * Checking for locked entry and prepare_to_wait_exclusive() happens | ||
177 | * under mapping->tree_lock, ditto for entry handling in our callers. | ||
178 | * So at this point all tasks that could have seen our entry locked | ||
179 | * must be in the waitqueue and the following check will see them. | ||
180 | */ | ||
181 | if (waitqueue_active(wq)) | ||
182 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | ||
183 | } | ||
184 | |||
185 | /* | ||
124 | * Check whether the given slot is locked. The function must be called with | 186 | * Check whether the given slot is locked. The function must be called with |
125 | * mapping->tree_lock held | 187 | * mapping->tree_lock held |
126 | */ | 188 | */ |
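The comment on dax_wake_mapping_entry_waiter() is easiest to read next to a caller. The following is a simplified, hypothetical caller modelled on the truncate/invalidate path; radix_tree_delete() and the surrounding checks are assumptions (that path is not shown in these hunks), and only the locking/wakeup shape matters:

    /*
     * Drop the entry under tree_lock, then wake all waiters.  The wakeup
     * runs after tree_lock is released; the stale 'entry' value is only
     * used to locate the correct hashed waitqueue.
     */
    spin_lock_irq(&mapping->tree_lock);
    entry = get_unlocked_mapping_entry(mapping, index, NULL);
    if (entry && radix_tree_exceptional_entry(entry)) {
            radix_tree_delete(&mapping->page_tree, index);
            mapping->nrexceptional--;
    }
    put_unlocked_mapping_entry(mapping, index, entry);
    spin_unlock_irq(&mapping->tree_lock);
    if (entry)
            dax_wake_mapping_entry_waiter(mapping, index, entry, true);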
@@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, | |||
181 | for (;;) { | 243 | for (;;) { |
182 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, | 244 | entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, |
183 | &slot); | 245 | &slot); |
184 | if (!entry || !radix_tree_exceptional_entry(entry) || | 246 | if (!entry || |
247 | WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || | ||
185 | !slot_locked(mapping, slot)) { | 248 | !slot_locked(mapping, slot)) { |
186 | if (slotp) | 249 | if (slotp) |
187 | *slotp = slot; | 250 | *slotp = slot; |
@@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, | |||
216 | } | 279 | } |
217 | 280 | ||
218 | static void put_locked_mapping_entry(struct address_space *mapping, | 281 | static void put_locked_mapping_entry(struct address_space *mapping, |
219 | pgoff_t index, void *entry) | 282 | pgoff_t index) |
220 | { | 283 | { |
221 | if (!radix_tree_exceptional_entry(entry)) { | 284 | dax_unlock_mapping_entry(mapping, index); |
222 | unlock_page(entry); | ||
223 | put_page(entry); | ||
224 | } else { | ||
225 | dax_unlock_mapping_entry(mapping, index); | ||
226 | } | ||
227 | } | 285 | } |
228 | 286 | ||
229 | /* | 287 | /* |
@@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, | |||
233 | static void put_unlocked_mapping_entry(struct address_space *mapping, | 291 | static void put_unlocked_mapping_entry(struct address_space *mapping, |
234 | pgoff_t index, void *entry) | 292 | pgoff_t index, void *entry) |
235 | { | 293 | { |
236 | if (!radix_tree_exceptional_entry(entry)) | 294 | if (!entry) |
237 | return; | 295 | return; |
238 | 296 | ||
239 | /* We have to wake up next waiter for the radix tree entry lock */ | 297 | /* We have to wake up next waiter for the radix tree entry lock */ |
@@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, | |||
241 | } | 299 | } |
242 | 300 | ||
243 | /* | 301 | /* |
244 | * Find radix tree entry at given index. If it points to a page, return with | 302 | * Find radix tree entry at given index. If it points to an exceptional entry, |
245 | * the page locked. If it points to the exceptional entry, return with the | 303 | * return it with the radix tree entry locked. If the radix tree doesn't |
246 | * radix tree entry locked. If the radix tree doesn't contain given index, | 304 | * contain given index, create an empty exceptional entry for the index and |
247 | * create empty exceptional entry for the index and return with it locked. | 305 | * return with it locked. |
248 | * | 306 | * |
249 | * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will | 307 | * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will |
250 | * either return that locked entry or will return an error. This error will | 308 | * either return that locked entry or will return an error. This error will |
251 | * happen if there are any 4k entries (either zero pages or DAX entries) | 309 | * happen if there are any 4k entries within the 2MiB range that we are |
252 | * within the 2MiB range that we are requesting. | 310 | * requesting. |
253 | * | 311 | * |
254 | * We always favor 4k entries over 2MiB entries. There isn't a flow where we | 312 | * We always favor 4k entries over 2MiB entries. There isn't a flow where we |
255 | * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB | 313 | * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB |
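A sketch of how a PMD caller consumes the -EEXIST case described above; this mirrors the PMD fault hunk later in this patch rather than adding anything new:

    entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
    if (IS_ERR(entry))
            goto fallback;  /* e.g. -EEXIST: 4k entries already in the 2MiB range */
    /* ... handle the fault against the locked entry ... */
    put_locked_mapping_entry(mapping, pgoff);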
@@ -276,18 +334,21 @@ restart: | |||
276 | spin_lock_irq(&mapping->tree_lock); | 334 | spin_lock_irq(&mapping->tree_lock); |
277 | entry = get_unlocked_mapping_entry(mapping, index, &slot); | 335 | entry = get_unlocked_mapping_entry(mapping, index, &slot); |
278 | 336 | ||
337 | if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { | ||
338 | entry = ERR_PTR(-EIO); | ||
339 | goto out_unlock; | ||
340 | } | ||
341 | |||
279 | if (entry) { | 342 | if (entry) { |
280 | if (size_flag & RADIX_DAX_PMD) { | 343 | if (size_flag & RADIX_DAX_PMD) { |
281 | if (!radix_tree_exceptional_entry(entry) || | 344 | if (dax_is_pte_entry(entry)) { |
282 | dax_is_pte_entry(entry)) { | ||
283 | put_unlocked_mapping_entry(mapping, index, | 345 | put_unlocked_mapping_entry(mapping, index, |
284 | entry); | 346 | entry); |
285 | entry = ERR_PTR(-EEXIST); | 347 | entry = ERR_PTR(-EEXIST); |
286 | goto out_unlock; | 348 | goto out_unlock; |
287 | } | 349 | } |
288 | } else { /* trying to grab a PTE entry */ | 350 | } else { /* trying to grab a PTE entry */ |
289 | if (radix_tree_exceptional_entry(entry) && | 351 | if (dax_is_pmd_entry(entry) && |
290 | dax_is_pmd_entry(entry) && | ||
291 | (dax_is_zero_entry(entry) || | 352 | (dax_is_zero_entry(entry) || |
292 | dax_is_empty_entry(entry))) { | 353 | dax_is_empty_entry(entry))) { |
293 | pmd_downgrade = true; | 354 | pmd_downgrade = true; |
@@ -321,7 +382,7 @@ restart: | |||
321 | mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); | 382 | mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); |
322 | if (err) { | 383 | if (err) { |
323 | if (pmd_downgrade) | 384 | if (pmd_downgrade) |
324 | put_locked_mapping_entry(mapping, index, entry); | 385 | put_locked_mapping_entry(mapping, index); |
325 | return ERR_PTR(err); | 386 | return ERR_PTR(err); |
326 | } | 387 | } |
327 | spin_lock_irq(&mapping->tree_lock); | 388 | spin_lock_irq(&mapping->tree_lock); |
@@ -371,52 +432,12 @@ restart: | |||
371 | spin_unlock_irq(&mapping->tree_lock); | 432 | spin_unlock_irq(&mapping->tree_lock); |
372 | return entry; | 433 | return entry; |
373 | } | 434 | } |
374 | /* Normal page in radix tree? */ | ||
375 | if (!radix_tree_exceptional_entry(entry)) { | ||
376 | struct page *page = entry; | ||
377 | |||
378 | get_page(page); | ||
379 | spin_unlock_irq(&mapping->tree_lock); | ||
380 | lock_page(page); | ||
381 | /* Page got truncated? Retry... */ | ||
382 | if (unlikely(page->mapping != mapping)) { | ||
383 | unlock_page(page); | ||
384 | put_page(page); | ||
385 | goto restart; | ||
386 | } | ||
387 | return page; | ||
388 | } | ||
389 | entry = lock_slot(mapping, slot); | 435 | entry = lock_slot(mapping, slot); |
390 | out_unlock: | 436 | out_unlock: |
391 | spin_unlock_irq(&mapping->tree_lock); | 437 | spin_unlock_irq(&mapping->tree_lock); |
392 | return entry; | 438 | return entry; |
393 | } | 439 | } |
394 | 440 | ||
395 | /* | ||
396 | * We do not necessarily hold the mapping->tree_lock when we call this | ||
397 | * function so it is possible that 'entry' is no longer a valid item in the | ||
398 | * radix tree. This is okay because all we really need to do is to find the | ||
399 | * correct waitqueue where tasks might be waiting for that old 'entry' and | ||
400 | * wake them. | ||
401 | */ | ||
402 | void dax_wake_mapping_entry_waiter(struct address_space *mapping, | ||
403 | pgoff_t index, void *entry, bool wake_all) | ||
404 | { | ||
405 | struct exceptional_entry_key key; | ||
406 | wait_queue_head_t *wq; | ||
407 | |||
408 | wq = dax_entry_waitqueue(mapping, index, entry, &key); | ||
409 | |||
410 | /* | ||
411 | * Checking for locked entry and prepare_to_wait_exclusive() happens | ||
412 | * under mapping->tree_lock, ditto for entry handling in our callers. | ||
413 | * So at this point all tasks that could have seen our entry locked | ||
414 | * must be in the waitqueue and the following check will see them. | ||
415 | */ | ||
416 | if (waitqueue_active(wq)) | ||
417 | __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); | ||
418 | } | ||
419 | |||
420 | static int __dax_invalidate_mapping_entry(struct address_space *mapping, | 441 | static int __dax_invalidate_mapping_entry(struct address_space *mapping, |
421 | pgoff_t index, bool trunc) | 442 | pgoff_t index, bool trunc) |
422 | { | 443 | { |
@@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, | |||
426 | 447 | ||
427 | spin_lock_irq(&mapping->tree_lock); | 448 | spin_lock_irq(&mapping->tree_lock); |
428 | entry = get_unlocked_mapping_entry(mapping, index, NULL); | 449 | entry = get_unlocked_mapping_entry(mapping, index, NULL); |
429 | if (!entry || !radix_tree_exceptional_entry(entry)) | 450 | if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) |
430 | goto out; | 451 | goto out; |
431 | if (!trunc && | 452 | if (!trunc && |
432 | (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || | 453 | (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || |
@@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, | |||
468 | return __dax_invalidate_mapping_entry(mapping, index, false); | 489 | return __dax_invalidate_mapping_entry(mapping, index, false); |
469 | } | 490 | } |
470 | 491 | ||
471 | /* | ||
472 | * The user has performed a load from a hole in the file. Allocating | ||
473 | * a new page in the file would cause excessive storage usage for | ||
474 | * workloads with sparse files. We allocate a page cache page instead. | ||
475 | * We'll kick it out of the page cache if it's ever written to, | ||
476 | * otherwise it will simply fall out of the page cache under memory | ||
477 | * pressure without ever having been dirtied. | ||
478 | */ | ||
479 | static int dax_load_hole(struct address_space *mapping, void **entry, | ||
480 | struct vm_fault *vmf) | ||
481 | { | ||
482 | struct inode *inode = mapping->host; | ||
483 | struct page *page; | ||
484 | int ret; | ||
485 | |||
486 | /* Hole page already exists? Return it... */ | ||
487 | if (!radix_tree_exceptional_entry(*entry)) { | ||
488 | page = *entry; | ||
489 | goto finish_fault; | ||
490 | } | ||
491 | |||
492 | /* This will replace locked radix tree entry with a hole page */ | ||
493 | page = find_or_create_page(mapping, vmf->pgoff, | ||
494 | vmf->gfp_mask | __GFP_ZERO); | ||
495 | if (!page) { | ||
496 | ret = VM_FAULT_OOM; | ||
497 | goto out; | ||
498 | } | ||
499 | |||
500 | finish_fault: | ||
501 | vmf->page = page; | ||
502 | ret = finish_fault(vmf); | ||
503 | vmf->page = NULL; | ||
504 | *entry = page; | ||
505 | if (!ret) { | ||
506 | /* Grab reference for PTE that is now referencing the page */ | ||
507 | get_page(page); | ||
508 | ret = VM_FAULT_NOPAGE; | ||
509 | } | ||
510 | out: | ||
511 | trace_dax_load_hole(inode, vmf, ret); | ||
512 | return ret; | ||
513 | } | ||
514 | |||
515 | static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, | 492 | static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, |
516 | sector_t sector, size_t size, struct page *to, | 493 | sector_t sector, size_t size, struct page *to, |
517 | unsigned long vaddr) | 494 | unsigned long vaddr) |
@@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, | |||
552 | unsigned long flags) | 529 | unsigned long flags) |
553 | { | 530 | { |
554 | struct radix_tree_root *page_tree = &mapping->page_tree; | 531 | struct radix_tree_root *page_tree = &mapping->page_tree; |
555 | int error = 0; | ||
556 | bool hole_fill = false; | ||
557 | void *new_entry; | 532 | void *new_entry; |
558 | pgoff_t index = vmf->pgoff; | 533 | pgoff_t index = vmf->pgoff; |
559 | 534 | ||
560 | if (vmf->flags & FAULT_FLAG_WRITE) | 535 | if (vmf->flags & FAULT_FLAG_WRITE) |
561 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 536 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
562 | 537 | ||
563 | /* Replacing hole page with block mapping? */ | 538 | if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { |
564 | if (!radix_tree_exceptional_entry(entry)) { | 539 | /* we are replacing a zero page with block mapping */ |
565 | hole_fill = true; | 540 | if (dax_is_pmd_entry(entry)) |
566 | /* | 541 | unmap_mapping_range(mapping, |
567 | * Unmap the page now before we remove it from page cache below. | 542 | (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, |
568 | * The page is locked so it cannot be faulted in again. | 543 | PMD_SIZE, 0); |
569 | */ | 544 | else /* pte entry */ |
570 | unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, | 545 | unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, |
571 | PAGE_SIZE, 0); | 546 | PAGE_SIZE, 0); |
572 | error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); | ||
573 | if (error) | ||
574 | return ERR_PTR(error); | ||
575 | } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { | ||
576 | /* replacing huge zero page with PMD block mapping */ | ||
577 | unmap_mapping_range(mapping, | ||
578 | (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); | ||
579 | } | 547 | } |
580 | 548 | ||
581 | spin_lock_irq(&mapping->tree_lock); | 549 | spin_lock_irq(&mapping->tree_lock); |
582 | new_entry = dax_radix_locked_entry(sector, flags); | 550 | new_entry = dax_radix_locked_entry(sector, flags); |
583 | 551 | ||
584 | if (hole_fill) { | 552 | if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
585 | __delete_from_page_cache(entry, NULL); | ||
586 | /* Drop pagecache reference */ | ||
587 | put_page(entry); | ||
588 | error = __radix_tree_insert(page_tree, index, | ||
589 | dax_radix_order(new_entry), new_entry); | ||
590 | if (error) { | ||
591 | new_entry = ERR_PTR(error); | ||
592 | goto unlock; | ||
593 | } | ||
594 | mapping->nrexceptional++; | ||
595 | } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { | ||
596 | /* | 553 | /* |
597 | * Only swap our new entry into the radix tree if the current | 554 | * Only swap our new entry into the radix tree if the current |
598 | * entry is a zero page or an empty entry. If a normal PTE or | 555 | * entry is a zero page or an empty entry. If a normal PTE or |
@@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, | |||
609 | WARN_ON_ONCE(ret != entry); | 566 | WARN_ON_ONCE(ret != entry); |
610 | __radix_tree_replace(page_tree, node, slot, | 567 | __radix_tree_replace(page_tree, node, slot, |
611 | new_entry, NULL, NULL); | 568 | new_entry, NULL, NULL); |
569 | entry = new_entry; | ||
612 | } | 570 | } |
571 | |||
613 | if (vmf->flags & FAULT_FLAG_WRITE) | 572 | if (vmf->flags & FAULT_FLAG_WRITE) |
614 | radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); | 573 | radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); |
615 | unlock: | 574 | |
616 | spin_unlock_irq(&mapping->tree_lock); | 575 | spin_unlock_irq(&mapping->tree_lock); |
617 | if (hole_fill) { | 576 | return entry; |
618 | radix_tree_preload_end(); | ||
619 | /* | ||
620 | * We don't need hole page anymore, it has been replaced with | ||
621 | * locked radix tree entry now. | ||
622 | */ | ||
623 | if (mapping->a_ops->freepage) | ||
624 | mapping->a_ops->freepage(entry); | ||
625 | unlock_page(entry); | ||
626 | put_page(entry); | ||
627 | } | ||
628 | return new_entry; | ||
629 | } | 577 | } |
630 | 578 | ||
631 | static inline unsigned long | 579 | static inline unsigned long |
@@ -646,11 +594,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, | |||
646 | pte_t pte, *ptep = NULL; | 594 | pte_t pte, *ptep = NULL; |
647 | pmd_t *pmdp = NULL; | 595 | pmd_t *pmdp = NULL; |
648 | spinlock_t *ptl; | 596 | spinlock_t *ptl; |
649 | bool changed; | ||
650 | 597 | ||
651 | i_mmap_lock_read(mapping); | 598 | i_mmap_lock_read(mapping); |
652 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { | 599 | vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { |
653 | unsigned long address; | 600 | unsigned long address, start, end; |
654 | 601 | ||
655 | cond_resched(); | 602 | cond_resched(); |
656 | 603 | ||
@@ -658,8 +605,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, | |||
658 | continue; | 605 | continue; |
659 | 606 | ||
660 | address = pgoff_address(index, vma); | 607 | address = pgoff_address(index, vma); |
661 | changed = false; | 608 | |
662 | if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) | 609 | /* |
610 | * Note because we provide start/end to follow_pte_pmd it will | ||
611 | * call mmu_notifier_invalidate_range_start() on our behalf | ||
612 | * before taking any lock. | ||
613 | */ | ||
614 | if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) | ||
663 | continue; | 615 | continue; |
664 | 616 | ||
665 | if (pmdp) { | 617 | if (pmdp) { |
@@ -676,7 +628,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, | |||
676 | pmd = pmd_wrprotect(pmd); | 628 | pmd = pmd_wrprotect(pmd); |
677 | pmd = pmd_mkclean(pmd); | 629 | pmd = pmd_mkclean(pmd); |
678 | set_pmd_at(vma->vm_mm, address, pmdp, pmd); | 630 | set_pmd_at(vma->vm_mm, address, pmdp, pmd); |
679 | changed = true; | 631 | mmu_notifier_invalidate_range(vma->vm_mm, start, end); |
680 | unlock_pmd: | 632 | unlock_pmd: |
681 | spin_unlock(ptl); | 633 | spin_unlock(ptl); |
682 | #endif | 634 | #endif |
@@ -691,13 +643,12 @@ unlock_pmd: | |||
691 | pte = pte_wrprotect(pte); | 643 | pte = pte_wrprotect(pte); |
692 | pte = pte_mkclean(pte); | 644 | pte = pte_mkclean(pte); |
693 | set_pte_at(vma->vm_mm, address, ptep, pte); | 645 | set_pte_at(vma->vm_mm, address, ptep, pte); |
694 | changed = true; | 646 | mmu_notifier_invalidate_range(vma->vm_mm, start, end); |
695 | unlock_pte: | 647 | unlock_pte: |
696 | pte_unmap_unlock(ptep, ptl); | 648 | pte_unmap_unlock(ptep, ptl); |
697 | } | 649 | } |
698 | 650 | ||
699 | if (changed) | 651 | mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); |
700 | mmu_notifier_invalidate_page(vma->vm_mm, address); | ||
701 | } | 652 | } |
702 | i_mmap_unlock_read(mapping); | 653 | i_mmap_unlock_read(mapping); |
703 | } | 654 | } |
@@ -724,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev, | |||
724 | spin_lock_irq(&mapping->tree_lock); | 675 | spin_lock_irq(&mapping->tree_lock); |
725 | entry2 = get_unlocked_mapping_entry(mapping, index, &slot); | 676 | entry2 = get_unlocked_mapping_entry(mapping, index, &slot); |
726 | /* Entry got punched out / reallocated? */ | 677 | /* Entry got punched out / reallocated? */ |
727 | if (!entry2 || !radix_tree_exceptional_entry(entry2)) | 678 | if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) |
728 | goto put_unlocked; | 679 | goto put_unlocked; |
729 | /* | 680 | /* |
730 | * Entry got reallocated elsewhere? No need to writeback. We have to | 681 | * Entry got reallocated elsewhere? No need to writeback. We have to |
@@ -796,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev, | |||
796 | trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); | 747 | trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); |
797 | dax_unlock: | 748 | dax_unlock: |
798 | dax_read_unlock(id); | 749 | dax_read_unlock(id); |
799 | put_locked_mapping_entry(mapping, index, entry); | 750 | put_locked_mapping_entry(mapping, index); |
800 | return ret; | 751 | return ret; |
801 | 752 | ||
802 | put_unlocked: | 753 | put_unlocked: |
@@ -871,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); | |||
871 | 822 | ||
872 | static int dax_insert_mapping(struct address_space *mapping, | 823 | static int dax_insert_mapping(struct address_space *mapping, |
873 | struct block_device *bdev, struct dax_device *dax_dev, | 824 | struct block_device *bdev, struct dax_device *dax_dev, |
874 | sector_t sector, size_t size, void **entryp, | 825 | sector_t sector, size_t size, void *entry, |
875 | struct vm_area_struct *vma, struct vm_fault *vmf) | 826 | struct vm_area_struct *vma, struct vm_fault *vmf) |
876 | { | 827 | { |
877 | unsigned long vaddr = vmf->address; | 828 | unsigned long vaddr = vmf->address; |
878 | void *entry = *entryp; | ||
879 | void *ret, *kaddr; | 829 | void *ret, *kaddr; |
880 | pgoff_t pgoff; | 830 | pgoff_t pgoff; |
881 | int id, rc; | 831 | int id, rc; |
@@ -896,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping, | |||
896 | ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); | 846 | ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); |
897 | if (IS_ERR(ret)) | 847 | if (IS_ERR(ret)) |
898 | return PTR_ERR(ret); | 848 | return PTR_ERR(ret); |
899 | *entryp = ret; | ||
900 | 849 | ||
901 | trace_dax_insert_mapping(mapping->host, vmf, ret); | 850 | trace_dax_insert_mapping(mapping->host, vmf, ret); |
902 | return vm_insert_mixed(vma, vaddr, pfn); | 851 | if (vmf->flags & FAULT_FLAG_WRITE) |
852 | return vm_insert_mixed_mkwrite(vma, vaddr, pfn); | ||
853 | else | ||
854 | return vm_insert_mixed(vma, vaddr, pfn); | ||
903 | } | 855 | } |
904 | 856 | ||
905 | /** | 857 | /* |
906 | * dax_pfn_mkwrite - handle first write to DAX page | 858 | * The user has performed a load from a hole in the file. Allocating a new |
907 | * @vmf: The description of the fault | 859 | * page in the file would cause excessive storage usage for workloads with |
860 | * sparse files. Instead we insert a read-only mapping of the 4k zero page. | ||
861 | * If this page is ever written to we will re-fault and change the mapping to | ||
862 | * point to real DAX storage instead. | ||
908 | */ | 863 | */ |
909 | int dax_pfn_mkwrite(struct vm_fault *vmf) | 864 | static int dax_load_hole(struct address_space *mapping, void *entry, |
865 | struct vm_fault *vmf) | ||
910 | { | 866 | { |
911 | struct file *file = vmf->vma->vm_file; | ||
912 | struct address_space *mapping = file->f_mapping; | ||
913 | struct inode *inode = mapping->host; | 867 | struct inode *inode = mapping->host; |
914 | void *entry, **slot; | 868 | unsigned long vaddr = vmf->address; |
915 | pgoff_t index = vmf->pgoff; | 869 | int ret = VM_FAULT_NOPAGE; |
870 | struct page *zero_page; | ||
871 | void *entry2; | ||
916 | 872 | ||
917 | spin_lock_irq(&mapping->tree_lock); | 873 | zero_page = ZERO_PAGE(0); |
918 | entry = get_unlocked_mapping_entry(mapping, index, &slot); | 874 | if (unlikely(!zero_page)) { |
919 | if (!entry || !radix_tree_exceptional_entry(entry)) { | 875 | ret = VM_FAULT_OOM; |
920 | if (entry) | 876 | goto out; |
921 | put_unlocked_mapping_entry(mapping, index, entry); | ||
922 | spin_unlock_irq(&mapping->tree_lock); | ||
923 | trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); | ||
924 | return VM_FAULT_NOPAGE; | ||
925 | } | 877 | } |
926 | radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); | 878 | |
927 | entry = lock_slot(mapping, slot); | 879 | entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, |
928 | spin_unlock_irq(&mapping->tree_lock); | 880 | RADIX_DAX_ZERO_PAGE); |
929 | /* | 881 | if (IS_ERR(entry2)) { |
930 | * If we race with somebody updating the PTE and finish_mkwrite_fault() | 882 | ret = VM_FAULT_SIGBUS; |
931 | * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry | 883 | goto out; |
932 | * the fault in either case. | 884 | } |
933 | */ | 885 | |
934 | finish_mkwrite_fault(vmf); | 886 | vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); |
935 | put_locked_mapping_entry(mapping, index, entry); | 887 | out: |
936 | trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); | 888 | trace_dax_load_hole(inode, vmf, ret); |
937 | return VM_FAULT_NOPAGE; | 889 | return ret; |
938 | } | 890 | } |
939 | EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); | ||
940 | 891 | ||
941 | static bool dax_range_is_aligned(struct block_device *bdev, | 892 | static bool dax_range_is_aligned(struct block_device *bdev, |
942 | unsigned int offset, unsigned int length) | 893 | unsigned int offset, unsigned int length) |
@@ -1056,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, | |||
1056 | if (map_len > end - pos) | 1007 | if (map_len > end - pos) |
1057 | map_len = end - pos; | 1008 | map_len = end - pos; |
1058 | 1009 | ||
1010 | /* | ||
1011 | * The userspace address for the memory copy has already been | ||
1012 | * validated via access_ok() in either vfs_read() or | ||
1013 | * vfs_write(), depending on which operation we are doing. | ||
1014 | */ | ||
1059 | if (iov_iter_rw(iter) == WRITE) | 1015 | if (iov_iter_rw(iter) == WRITE) |
1060 | map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, | 1016 | map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, |
1061 | map_len, iter); | 1017 | map_len, iter); |
@@ -1220,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, | |||
1220 | major = VM_FAULT_MAJOR; | 1176 | major = VM_FAULT_MAJOR; |
1221 | } | 1177 | } |
1222 | error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, | 1178 | error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, |
1223 | sector, PAGE_SIZE, &entry, vmf->vma, vmf); | 1179 | sector, PAGE_SIZE, entry, vmf->vma, vmf); |
1224 | /* -EBUSY is fine, somebody else faulted on the same PTE */ | 1180 | /* -EBUSY is fine, somebody else faulted on the same PTE */ |
1225 | if (error == -EBUSY) | 1181 | if (error == -EBUSY) |
1226 | error = 0; | 1182 | error = 0; |
@@ -1228,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, | |||
1228 | case IOMAP_UNWRITTEN: | 1184 | case IOMAP_UNWRITTEN: |
1229 | case IOMAP_HOLE: | 1185 | case IOMAP_HOLE: |
1230 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { | 1186 | if (!(vmf->flags & FAULT_FLAG_WRITE)) { |
1231 | vmf_ret = dax_load_hole(mapping, &entry, vmf); | 1187 | vmf_ret = dax_load_hole(mapping, entry, vmf); |
1232 | goto finish_iomap; | 1188 | goto finish_iomap; |
1233 | } | 1189 | } |
1234 | /*FALLTHRU*/ | 1190 | /*FALLTHRU*/ |
@@ -1255,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, | |||
1255 | ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); | 1211 | ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); |
1256 | } | 1212 | } |
1257 | unlock_entry: | 1213 | unlock_entry: |
1258 | put_locked_mapping_entry(mapping, vmf->pgoff, entry); | 1214 | put_locked_mapping_entry(mapping, vmf->pgoff); |
1259 | out: | 1215 | out: |
1260 | trace_dax_pte_fault_done(inode, vmf, vmf_ret); | 1216 | trace_dax_pte_fault_done(inode, vmf, vmf_ret); |
1261 | return vmf_ret; | 1217 | return vmf_ret; |
1262 | } | 1218 | } |
1263 | 1219 | ||
1264 | #ifdef CONFIG_FS_DAX_PMD | 1220 | #ifdef CONFIG_FS_DAX_PMD |
1265 | /* | ||
1266 | * The 'colour' (ie low bits) within a PMD of a page offset. This comes up | ||
1267 | * more often than one might expect in the below functions. | ||
1268 | */ | ||
1269 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) | ||
1270 | |||
1271 | static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, | 1221 | static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, |
1272 | loff_t pos, void **entryp) | 1222 | loff_t pos, void *entry) |
1273 | { | 1223 | { |
1274 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; | 1224 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
1275 | const sector_t sector = dax_iomap_sector(iomap, pos); | 1225 | const sector_t sector = dax_iomap_sector(iomap, pos); |
@@ -1280,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, | |||
1280 | void *ret = NULL, *kaddr; | 1230 | void *ret = NULL, *kaddr; |
1281 | long length = 0; | 1231 | long length = 0; |
1282 | pgoff_t pgoff; | 1232 | pgoff_t pgoff; |
1283 | pfn_t pfn; | 1233 | pfn_t pfn = {}; |
1284 | int id; | 1234 | int id; |
1285 | 1235 | ||
1286 | if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) | 1236 | if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) |
@@ -1300,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, | |||
1300 | goto unlock_fallback; | 1250 | goto unlock_fallback; |
1301 | dax_read_unlock(id); | 1251 | dax_read_unlock(id); |
1302 | 1252 | ||
1303 | ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, | 1253 | ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, |
1304 | RADIX_DAX_PMD); | 1254 | RADIX_DAX_PMD); |
1305 | if (IS_ERR(ret)) | 1255 | if (IS_ERR(ret)) |
1306 | goto fallback; | 1256 | goto fallback; |
1307 | *entryp = ret; | ||
1308 | 1257 | ||
1309 | trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); | 1258 | trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); |
1310 | return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, | 1259 | return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, |
@@ -1318,7 +1267,7 @@ fallback: | |||
1318 | } | 1267 | } |
1319 | 1268 | ||
1320 | static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | 1269 | static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, |
1321 | void **entryp) | 1270 | void *entry) |
1322 | { | 1271 | { |
1323 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; | 1272 | struct address_space *mapping = vmf->vma->vm_file->f_mapping; |
1324 | unsigned long pmd_addr = vmf->address & PMD_MASK; | 1273 | unsigned long pmd_addr = vmf->address & PMD_MASK; |
@@ -1333,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | |||
1333 | if (unlikely(!zero_page)) | 1282 | if (unlikely(!zero_page)) |
1334 | goto fallback; | 1283 | goto fallback; |
1335 | 1284 | ||
1336 | ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, | 1285 | ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, |
1337 | RADIX_DAX_PMD | RADIX_DAX_HZP); | 1286 | RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); |
1338 | if (IS_ERR(ret)) | 1287 | if (IS_ERR(ret)) |
1339 | goto fallback; | 1288 | goto fallback; |
1340 | *entryp = ret; | ||
1341 | 1289 | ||
1342 | ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); | 1290 | ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); |
1343 | if (!pmd_none(*(vmf->pmd))) { | 1291 | if (!pmd_none(*(vmf->pmd))) { |
@@ -1383,6 +1331,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, | |||
1383 | 1331 | ||
1384 | trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); | 1332 | trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); |
1385 | 1333 | ||
1334 | /* | ||
1335 | * Make sure that the faulting address's PMD offset (color) matches | ||
1336 | * the PMD offset from the start of the file. This is necessary so | ||
1337 | * that a PMD range in the page table overlaps exactly with a PMD | ||
1338 | * range in the radix tree. | ||
1339 | */ | ||
1340 | if ((vmf->pgoff & PG_PMD_COLOUR) != | ||
1341 | ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) | ||
1342 | goto fallback; | ||
1343 | |||
1386 | /* Fall back to PTEs if we're going to COW */ | 1344 | /* Fall back to PTEs if we're going to COW */ |
1387 | if (write && !(vma->vm_flags & VM_SHARED)) | 1345 | if (write && !(vma->vm_flags & VM_SHARED)) |
1388 | goto fallback; | 1346 | goto fallback; |
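With 2MiB PMDs and 4KiB pages (an assumption; x86-64 values), PG_PMD_COLOUR is 511, so the check above requires the low nine bits of the file page offset and of the virtual page number to agree. A standalone illustration, not part of the patch:

    #include <assert.h>

    #define PAGE_SHIFT      12      /* assumed 4KiB pages */
    #define PG_PMD_COLOUR   511UL   /* assumed 2MiB PMD: (2MiB >> 12) - 1 */

    int main(void)
    {
            unsigned long pgoff   = 0x200;                  /* file offset 2MiB / 4KiB */
            unsigned long address = 0x7f0000200000UL;       /* same colour (0) */

            /* both have colour 0, so one PMD can map this range coherently */
            assert((pgoff & PG_PMD_COLOUR) ==
                   ((address >> PAGE_SHIFT) & PG_PMD_COLOUR));
            return 0;
    }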
@@ -1403,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, | |||
1403 | goto fallback; | 1361 | goto fallback; |
1404 | 1362 | ||
1405 | /* | 1363 | /* |
1406 | * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX | 1364 | * grab_mapping_entry() will make sure we get a 2MiB empty entry, a |
1407 | * PMD or a HZP entry. If it can't (because a 4k page is already in | 1365 | * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page |
1408 | * the tree, for instance), it will return -EEXIST and we just fall | 1366 | * is already in the tree, for instance), it will return -EEXIST and |
1409 | * back to 4k entries. | 1367 | * we just fall back to 4k entries. |
1410 | */ | 1368 | */ |
1411 | entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); | 1369 | entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); |
1412 | if (IS_ERR(entry)) | 1370 | if (IS_ERR(entry)) |
@@ -1439,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, | |||
1439 | 1397 | ||
1440 | switch (iomap.type) { | 1398 | switch (iomap.type) { |
1441 | case IOMAP_MAPPED: | 1399 | case IOMAP_MAPPED: |
1442 | result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); | 1400 | result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); |
1443 | break; | 1401 | break; |
1444 | case IOMAP_UNWRITTEN: | 1402 | case IOMAP_UNWRITTEN: |
1445 | case IOMAP_HOLE: | 1403 | case IOMAP_HOLE: |
1446 | if (WARN_ON_ONCE(write)) | 1404 | if (WARN_ON_ONCE(write)) |
1447 | break; | 1405 | break; |
1448 | result = dax_pmd_load_hole(vmf, &iomap, &entry); | 1406 | result = dax_pmd_load_hole(vmf, &iomap, entry); |
1449 | break; | 1407 | break; |
1450 | default: | 1408 | default: |
1451 | WARN_ON_ONCE(1); | 1409 | WARN_ON_ONCE(1); |
@@ -1468,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, | |||
1468 | &iomap); | 1426 | &iomap); |
1469 | } | 1427 | } |
1470 | unlock_entry: | 1428 | unlock_entry: |
1471 | put_locked_mapping_entry(mapping, pgoff, entry); | 1429 | put_locked_mapping_entry(mapping, pgoff); |
1472 | fallback: | 1430 | fallback: |
1473 | if (result == VM_FAULT_FALLBACK) { | 1431 | if (result == VM_FAULT_FALLBACK) { |
1474 | split_huge_pmd(vma, vmf->pmd, vmf->address); | 1432 | split_huge_pmd(vma, vmf->pmd, vmf->address); |