author    Ross Zwisler <ross.zwisler@linux.intel.com>    2016-01-22 18:10:40 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-01-22 20:02:18 -0500
commit    f9fe48bece3af2d60e1bad65db4825f5a025dd36 (patch)
tree      78f56ae2a1636fa9f8ec26466d4a94a16eadd5dc /mm
parent    3f4a2670deea53e3765e24a7f46aafe6f077cb68 (diff)
dax: support dirty DAX entries in radix tree
Add support for tracking dirty DAX entries in the struct address_space
radix tree.  This tree is already used for dirty page writeback, and it
already supports the use of exceptional (non struct page*) entries.

In order to properly track dirty DAX pages we will insert new
exceptional entries into the radix tree that represent dirty DAX PTE or
PMD pages.  These exceptional entries will also contain the writeback
addresses for the PTE or PMD faults that we can use at fsync/msync
time.

There are currently two types of exceptional entries (shmem and shadow)
that can be placed into the radix tree, and this adds a third.  We rely
on the fact that only one type of exceptional entry can be found in a
given radix tree based on its usage.  This happens for free with DAX vs
shmem but we explicitly prevent shadow entries from being added to
radix trees for DAX mappings.

The only shadow entries that would be generated for DAX radix trees
would be to track zero page mappings that were created for holes.
These pages would receive minimal benefit from having shadow entries,
and the choice to have only one type of exceptional entry in a given
radix tree makes the logic simpler both in clear_exceptional_entry()
and in the rest of DAX.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: "Theodore Ts'o" <tytso@mit.edu>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jan Kara <jack@suse.com>
Cc: Jeff Layton <jlayton@poochiereds.net>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
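For orientation before the diff: the mm/ hunks below lean on two helpers whose definitions sit outside the 'mm' portion shown here: radix_tree_exceptional_entry(), which distinguishes a tagged (non struct page*) slot from a real page pointer, and dax_mapping(), which this series adds in include/linux/dax.h. The sketch below is a rough approximation of what those checks amount to, not the in-tree definitions; the names entry_is_exceptional() and mapping_is_dax() are illustrative stand-ins.

/* Rough sketch only: approximates the predicates used by the hunks below. */
#include <linux/fs.h>
#include <linux/radix-tree.h>

/*
 * Exceptional radix tree entries carry a low tag bit, so they can never
 * be mistaken for a struct page pointer.
 */
static inline bool entry_is_exceptional(void *entry)
{
	return (unsigned long)entry & RADIX_TREE_EXCEPTIONAL_ENTRY;
}

/*
 * A mapping is DAX-backed when its host inode carries the DAX flag; a
 * test of this kind is what routes clear_exceptional_entry() and
 * __remove_mapping() onto their DAX paths.
 */
static inline bool mapping_is_dax(struct address_space *mapping)
{
	return mapping->host && IS_DAX(mapping->host);
}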
Diffstat (limited to 'mm')
-rw-r--r--   mm/filemap.c      17
-rw-r--r--   mm/truncate.c     69
-rw-r--r--   mm/vmscan.c        9
-rw-r--r--   mm/workingset.c    4
4 files changed, 60 insertions, 39 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 847ee43c2806..7b8be78cfd9e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -11,6 +11,7 @@
  */
 #include <linux/export.h>
 #include <linux/compiler.h>
+#include <linux/dax.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/capability.h>
@@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
 
 	if (shadow) {
-		mapping->nrshadows++;
+		mapping->nrexceptional++;
 		/*
-		 * Make sure the nrshadows update is committed before
+		 * Make sure the nrexceptional update is committed before
 		 * the nrpages update so that final truncate racing
 		 * with reclaim does not see both counters 0 at the
 		 * same time and miss a shadow entry.
@@ -579,9 +580,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
+
+		if (WARN_ON(dax_mapping(mapping)))
+			return -EINVAL;
+
 		if (shadowp)
 			*shadowp = p;
-		mapping->nrshadows--;
+		mapping->nrexceptional--;
 		if (node)
 			workingset_node_shadows_dec(node);
 	}
@@ -1245,9 +1250,9 @@ repeat:
 			if (radix_tree_deref_retry(page))
 				goto restart;
 			/*
-			 * A shadow entry of a recently evicted page,
-			 * or a swap entry from shmem/tmpfs.  Return
-			 * it without attempting to raise page count.
+			 * A shadow entry of a recently evicted page, a swap
+			 * entry from shmem/tmpfs or a DAX entry.  Return it
+			 * without attempting to raise page count.
 			 */
 			goto export;
 		}
diff --git a/mm/truncate.c b/mm/truncate.c
index 76e35ad97102..e3ee0e27cd17 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
 
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/dax.h>
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping,
 		return;
 
 	spin_lock_irq(&mapping->tree_lock);
-	/*
-	 * Regular page slots are stabilized by the page lock even
-	 * without the tree itself locked.  These unlocked entries
-	 * need verification under the tree lock.
-	 */
-	if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
-		goto unlock;
-	if (*slot != entry)
-		goto unlock;
-	radix_tree_replace_slot(slot, NULL);
-	mapping->nrshadows--;
-	if (!node)
-		goto unlock;
-	workingset_node_shadows_dec(node);
-	/*
-	 * Don't track node without shadow entries.
-	 *
-	 * Avoid acquiring the list_lru lock if already untracked.
-	 * The list_empty() test is safe as node->private_list is
-	 * protected by mapping->tree_lock.
-	 */
-	if (!workingset_node_shadows(node) &&
-	    !list_empty(&node->private_list))
-		list_lru_del(&workingset_shadow_nodes, &node->private_list);
-	__radix_tree_delete_node(&mapping->page_tree, node);
+
+	if (dax_mapping(mapping)) {
+		if (radix_tree_delete_item(&mapping->page_tree, index, entry))
+			mapping->nrexceptional--;
+	} else {
+		/*
+		 * Regular page slots are stabilized by the page lock even
+		 * without the tree itself locked.  These unlocked entries
+		 * need verification under the tree lock.
+		 */
+		if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+					&slot))
+			goto unlock;
+		if (*slot != entry)
+			goto unlock;
+		radix_tree_replace_slot(slot, NULL);
+		mapping->nrexceptional--;
+		if (!node)
+			goto unlock;
+		workingset_node_shadows_dec(node);
+		/*
+		 * Don't track node without shadow entries.
+		 *
+		 * Avoid acquiring the list_lru lock if already untracked.
+		 * The list_empty() test is safe as node->private_list is
+		 * protected by mapping->tree_lock.
+		 */
+		if (!workingset_node_shadows(node) &&
+		    !list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+					&node->private_list);
+		__radix_tree_delete_node(&mapping->page_tree, node);
+	}
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
@@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	int		i;
 
 	cleancache_invalidate_inode(mapping);
-	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
 		return;
 
 	/* Offsets within partial pages */
@@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages);
  */
 void truncate_inode_pages_final(struct address_space *mapping)
 {
-	unsigned long nrshadows;
+	unsigned long nrexceptional;
 	unsigned long nrpages;
 
 	/*
@@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping)
 
 	/*
 	 * When reclaim installs eviction entries, it increases
-	 * nrshadows first, then decreases nrpages.  Make sure we see
+	 * nrexceptional first, then decreases nrpages.  Make sure we see
 	 * this in the right order or we might miss an entry.
 	 */
 	nrpages = mapping->nrpages;
 	smp_rmb();
-	nrshadows = mapping->nrshadows;
+	nrexceptional = mapping->nrexceptional;
 
-	if (nrpages || nrshadows) {
+	if (nrpages || nrexceptional) {
 		/*
 		 * As truncation uses a lockless tree lookup, cycle
 		 * the tree lock to make sure any ongoing tree
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd620b65db52..eb3dd37ccd7c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,6 +46,7 @@
 #include <linux/oom.h>
 #include <linux/prefetch.h>
 #include <linux/printk.h>
+#include <linux/dax.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 		 * inode reclaim needs to empty out the radix tree or
 		 * the nodes are lost.  Don't plant shadows behind its
 		 * back.
+		 *
+		 * We also don't store shadows for DAX mappings because the
+		 * only page cache pages found in these are zero pages
+		 * covering holes, and because we don't want to mix DAX
+		 * exceptional entries and shadow exceptional entries in the
+		 * same page_tree.
 		 */
 		if (reclaimed && page_is_file_cache(page) &&
-		    !mapping_exiting(mapping))
+		    !mapping_exiting(mapping) && !dax_mapping(mapping))
 			shadow = workingset_eviction(mapping, page);
 		__delete_from_page_cache(page, shadow, memcg);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
diff --git a/mm/workingset.c b/mm/workingset.c
index aa017133744b..61ead9e5549d 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 			node->slots[i] = NULL;
 			BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
 			node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
-			BUG_ON(!mapping->nrshadows);
-			mapping->nrshadows--;
+			BUG_ON(!mapping->nrexceptional);
+			mapping->nrexceptional--;
 		}
 	}
 	BUG_ON(node->count);