author    Matthew Wilcox <matthew.r.wilcox@intel.com>    2015-02-16 18:59:02 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2015-02-16 20:56:03 -0500
commit    f7ca90b160307d63aaedab8bd451c24a182db20f (patch)
tree      687eb94acbc8ebfab6d5e12a57dc336ce21b7c64
parent    289c6aedac981533331428bc933fff21ae332c9e (diff)
dax,ext2: replace the XIP page fault handler with the DAX page fault handler
Instead of calling aops->get_xip_mem from the fault handler, the
filesystem passes a get_block_t that is used to find the appropriate
blocks.

This requires that all architectures implement copy_user_page().  At the
time of writing, mips and arm do not.  Patches exist and are in progress.

[akpm@linux-foundation.org: remap_file_pages went away]
Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andreas Dilger <andreas.dilger@intel.com>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ralf Baechle <ralf@linux-mips.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  fs/dax.c            | 241
-rw-r--r--  fs/ext2/file.c      |  34
-rw-r--r--  include/linux/fs.h  |   4
-rw-r--r--  mm/filemap_xip.c    | 206
4 files changed, 276 insertions(+), 209 deletions(-)
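For context before the diff: dax_fault() takes the same get_block_t callback
that the buffered and direct-I/O paths already use. The sketch below is
illustrative only and not part of this patch; the typedef is the one from
include/linux/fs.h, while myfs_dax_fault and myfs_get_block are hypothetical
names standing in for a filesystem's own glue (ext2's real version appears in
the fs/ext2/file.c hunk further down).

typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);

/* Hypothetical wiring, modelled on the ext2 hunk in this patch: */
static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        /* myfs_get_block is this filesystem's block-mapping callback */
        return dax_fault(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_dax_vm_ops = {
        .fault          = myfs_dax_fault,
        .page_mkwrite   = myfs_dax_fault,  /* dax_mkwrite() aliases dax_fault() */
};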
diff --git a/fs/dax.c b/fs/dax.c
index 69c3126a05b4..553e55b93495 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -19,9 +19,13 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
+#include <linux/vmstat.h>
 
 int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 {
@@ -221,3 +225,240 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
 	return retval;
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
+
+/*
+ * The user has performed a load from a hole in the file.  Allocating
+ * a new page in the file would cause excessive storage usage for
+ * workloads with sparse files.  We allocate a page cache page instead.
+ * We'll kick it out of the page cache if it's ever written to,
+ * otherwise it will simply fall out of the page cache under memory
+ * pressure without ever having been dirtied.
+ */
+static int dax_load_hole(struct address_space *mapping, struct page *page,
+                        struct vm_fault *vmf)
+{
+        unsigned long size;
+        struct inode *inode = mapping->host;
+        if (!page)
+                page = find_or_create_page(mapping, vmf->pgoff,
+                                                GFP_KERNEL | __GFP_ZERO);
+        if (!page)
+                return VM_FAULT_OOM;
+        /* Recheck i_size under page lock to avoid truncate race */
+        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        if (vmf->pgoff >= size) {
+                unlock_page(page);
+                page_cache_release(page);
+                return VM_FAULT_SIGBUS;
+        }
+
+        vmf->page = page;
+        return VM_FAULT_LOCKED;
+}
+
+static int copy_user_bh(struct page *to, struct buffer_head *bh,
+                        unsigned blkbits, unsigned long vaddr)
+{
+        void *vfrom, *vto;
+        if (dax_get_addr(bh, &vfrom, blkbits) < 0)
+                return -EIO;
+        vto = kmap_atomic(to);
+        copy_user_page(vto, vfrom, vaddr, to);
+        kunmap_atomic(vto);
+        return 0;
+}
+
+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+                        struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        struct address_space *mapping = inode->i_mapping;
+        sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+        unsigned long vaddr = (unsigned long)vmf->virtual_address;
+        void *addr;
+        unsigned long pfn;
+        pgoff_t size;
+        int error;
+
+        i_mmap_lock_read(mapping);
+
+        /*
+         * Check truncate didn't happen while we were allocating a block.
+         * If it did, this block may or may not be still allocated to the
+         * file.  We can't tell the filesystem to free it because we can't
+         * take i_mutex here.  In the worst case, the file still has blocks
+         * allocated past the end of the file.
+         */
+        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        if (unlikely(vmf->pgoff >= size)) {
+                error = -EIO;
+                goto out;
+        }
+
+        error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
+        if (error < 0)
+                goto out;
+        if (error < PAGE_SIZE) {
+                error = -EIO;
+                goto out;
+        }
+
+        if (buffer_unwritten(bh) || buffer_new(bh))
+                clear_page(addr);
+
+        error = vm_insert_mixed(vma, vaddr, pfn);
+
+ out:
+        i_mmap_unlock_read(mapping);
+
+        if (bh->b_end_io)
+                bh->b_end_io(bh, 1);
+
+        return error;
+}
+
+static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                        get_block_t get_block)
+{
+        struct file *file = vma->vm_file;
+        struct address_space *mapping = file->f_mapping;
+        struct inode *inode = mapping->host;
+        struct page *page;
+        struct buffer_head bh;
+        unsigned long vaddr = (unsigned long)vmf->virtual_address;
+        unsigned blkbits = inode->i_blkbits;
+        sector_t block;
+        pgoff_t size;
+        int error;
+        int major = 0;
+
+        size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        if (vmf->pgoff >= size)
+                return VM_FAULT_SIGBUS;
+
+        memset(&bh, 0, sizeof(bh));
+        block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+        bh.b_size = PAGE_SIZE;
+
+ repeat:
+        page = find_get_page(mapping, vmf->pgoff);
+        if (page) {
+                if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+                        page_cache_release(page);
+                        return VM_FAULT_RETRY;
+                }
+                if (unlikely(page->mapping != mapping)) {
+                        unlock_page(page);
+                        page_cache_release(page);
+                        goto repeat;
+                }
+                size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+                if (unlikely(vmf->pgoff >= size)) {
+                        /*
+                         * We have a struct page covering a hole in the file
+                         * from a read fault and we've raced with a truncate
+                         */
+                        error = -EIO;
+                        goto unlock_page;
+                }
+        }
+
+        error = get_block(inode, block, &bh, 0);
+        if (!error && (bh.b_size < PAGE_SIZE))
+                error = -EIO;           /* fs corruption? */
+        if (error)
+                goto unlock_page;
+
+        if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
+                if (vmf->flags & FAULT_FLAG_WRITE) {
+                        error = get_block(inode, block, &bh, 1);
+                        count_vm_event(PGMAJFAULT);
+                        mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+                        major = VM_FAULT_MAJOR;
+                        if (!error && (bh.b_size < PAGE_SIZE))
+                                error = -EIO;
+                        if (error)
+                                goto unlock_page;
+                } else {
+                        return dax_load_hole(mapping, page, vmf);
+                }
+        }
+
+        if (vmf->cow_page) {
+                struct page *new_page = vmf->cow_page;
+                if (buffer_written(&bh))
+                        error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+                else
+                        clear_user_highpage(new_page, vaddr);
+                if (error)
+                        goto unlock_page;
+                vmf->page = page;
+                if (!page) {
+                        i_mmap_lock_read(mapping);
+                        /* Check we didn't race with truncate */
+                        size = (i_size_read(inode) + PAGE_SIZE - 1) >>
+                                                                PAGE_SHIFT;
+                        if (vmf->pgoff >= size) {
+                                i_mmap_unlock_read(mapping);
+                                error = -EIO;
+                                goto out;
+                        }
+                }
+                return VM_FAULT_LOCKED;
+        }
+
+        /* Check we didn't race with a read fault installing a new page */
+        if (!page && major)
+                page = find_lock_page(mapping, vmf->pgoff);
+
+        if (page) {
+                unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+                                                        PAGE_CACHE_SIZE, 0);
+                delete_from_page_cache(page);
+                unlock_page(page);
+                page_cache_release(page);
+        }
+
+        error = dax_insert_mapping(inode, &bh, vma, vmf);
+
+ out:
+        if (error == -ENOMEM)
+                return VM_FAULT_OOM | major;
+        /* -EBUSY is fine, somebody else faulted on the same PTE */
+        if ((error < 0) && (error != -EBUSY))
+                return VM_FAULT_SIGBUS | major;
+        return VM_FAULT_NOPAGE | major;
+
+ unlock_page:
+        if (page) {
+                unlock_page(page);
+                page_cache_release(page);
+        }
+        goto out;
+}
+
+/**
+ * dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files.
+ */
+int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+                        get_block_t get_block)
+{
+        int result;
+        struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+        if (vmf->flags & FAULT_FLAG_WRITE) {
+                sb_start_pagefault(sb);
+                file_update_time(vma->vm_file);
+        }
+        result = do_dax_fault(vma, vmf, get_block);
+        if (vmf->flags & FAULT_FLAG_WRITE)
+                sb_end_pagefault(sb);
+
+        return result;
+}
+EXPORT_SYMBOL_GPL(dax_fault);
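A note on the callback contract the code above depends on: do_dax_fault()
first probes with create == 0 and, for a write fault into a hole, retries
with create == 1. The sketch below is illustrative only and not from this
patch; myfs_lookup_block and myfs_alloc_block are hypothetical helpers, while
map_bh() and set_buffer_new() are the standard buffer_head conventions that
dax_insert_mapping() relies on when it decides to clear_page() a freshly
allocated block.

static int myfs_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh, int create)
{
        sector_t phys = myfs_lookup_block(inode, iblock);  /* hypothetical */

        if (phys) {
                map_bh(bh, inode->i_sb, phys);  /* marks bh mapped */
                return 0;
        }
        if (!create)
                return 0;               /* bh left unmapped: a hole */
        phys = myfs_alloc_block(inode, iblock);            /* hypothetical */
        if (!phys)
                return -ENOSPC;
        map_bh(bh, inode->i_sb, phys);
        set_buffer_new(bh);     /* new block: DAX must zero it before use */
        return 0;
}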
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a247123fd798..a61c93fd9dce 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
 #include "xattr.h"
 #include "acl.h"
 
+#ifdef CONFIG_EXT2_FS_XIP
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        return dax_fault(vma, vmf, ext2_get_block);
+}
+
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+        return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+        .fault          = ext2_dax_fault,
+        .page_mkwrite   = ext2_dax_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+        if (!IS_DAX(file_inode(file)))
+                return generic_file_mmap(file, vma);
+
+        file_accessed(file);
+        vma->vm_ops = &ext2_dax_vm_ops;
+        vma->vm_flags |= VM_MIXEDMAP;
+        return 0;
+}
+#else
+#define ext2_file_mmap  generic_file_mmap
+#endif
+
 /*
  * Called when filp is released. This happens when all file descriptors
  * for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_COMPAT
         .compat_ioctl   = ext2_compat_ioctl,
 #endif
-        .mmap           = generic_file_mmap,
+        .mmap           = ext2_file_mmap,
         .open           = dquot_file_open,
         .release        = ext2_release_file,
         .fsync          = ext2_fsync,
@@ -89,7 +119,7 @@ const struct file_operations ext2_xip_file_operations = {
 #ifdef CONFIG_COMPAT
         .compat_ioctl   = ext2_compat_ioctl,
 #endif
-        .mmap           = xip_file_mmap,
+        .mmap           = ext2_file_mmap,
         .open           = dquot_file_open,
         .release        = ext2_release_file,
         .fsync          = ext2_fsync,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8084934a5676..6bad6d4c579b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -51,6 +51,7 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
+struct vm_fault;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -2590,9 +2591,10 @@ extern int nonseekable_open(struct inode * inode, struct file * filp);
 ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
                 loff_t, get_block_t, dio_iodone_t, int flags);
 int dax_clear_blocks(struct inode *, sector_t block, long size);
+int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+#define dax_mkwrite(vma, vmf, gb)       dax_fault(vma, vmf, gb)
 
 #ifdef CONFIG_FS_XIP
-extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
 #else
 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9c869f402c07..59fb387b2238 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -23,212 +23,6 @@
 #include <asm/io.h>
 
 /*
- * We do use our own empty page to avoid interference with other users
- * of ZERO_PAGE(), such as /dev/zero
- */
-static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
-static struct page *__xip_sparse_page;
-
-/* called under xip_sparse_mutex */
-static struct page *xip_sparse_page(void)
-{
-        if (!__xip_sparse_page) {
-                struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
-
-                if (page)
-                        __xip_sparse_page = page;
-        }
-        return __xip_sparse_page;
-}
-
-/*
- * __xip_unmap is invoked from xip_unmap and xip_write
- *
- * This function walks all vmas of the address_space and unmaps the
- * __xip_sparse_page when found at pgoff.
- */
-static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
-{
-        struct vm_area_struct *vma;
-        struct page *page;
-        unsigned count;
-        int locked = 0;
-
-        count = read_seqcount_begin(&xip_sparse_seq);
-
-        page = __xip_sparse_page;
-        if (!page)
-                return;
-
-retry:
-        i_mmap_lock_read(mapping);
-        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
-                pte_t *pte, pteval;
-                spinlock_t *ptl;
-                struct mm_struct *mm = vma->vm_mm;
-                unsigned long address = vma->vm_start +
-                        ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-
-                BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-                pte = page_check_address(page, mm, address, &ptl, 1);
-                if (pte) {
-                        /* Nuke the page table entry. */
-                        flush_cache_page(vma, address, pte_pfn(*pte));
-                        pteval = ptep_clear_flush(vma, address, pte);
-                        page_remove_rmap(page);
-                        dec_mm_counter(mm, MM_FILEPAGES);
-                        BUG_ON(pte_dirty(pteval));
-                        pte_unmap_unlock(pte, ptl);
-                        /* must invalidate_page _before_ freeing the page */
-                        mmu_notifier_invalidate_page(mm, address);
-                        page_cache_release(page);
-                }
-        }
-        i_mmap_unlock_read(mapping);
-
-        if (locked) {
-                mutex_unlock(&xip_sparse_mutex);
-        } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
-                mutex_lock(&xip_sparse_mutex);
-                locked = 1;
-                goto retry;
-        }
-}
-
-/*
- * xip_fault() is invoked via the vma operations vector for a
- * mapped memory region to read in file data during a page fault.
- *
- * This function is derived from filemap_fault, but used for execute in place
- */
-static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-        struct file *file = vma->vm_file;
-        struct address_space *mapping = file->f_mapping;
-        struct inode *inode = mapping->host;
-        pgoff_t size;
-        void *xip_mem;
-        unsigned long xip_pfn;
-        struct page *page;
-        int error;
-
-        /* XXX: are VM_FAULT_ codes OK? */
-again:
-        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-        if (vmf->pgoff >= size)
-                return VM_FAULT_SIGBUS;
-
-        error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
-                                                &xip_mem, &xip_pfn);
-        if (likely(!error))
-                goto found;
-        if (error != -ENODATA)
-                return VM_FAULT_OOM;
-
-        /* sparse block */
-        if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
-            (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
-            (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
-                int err;
-
-                /* maybe shared writable, allocate new block */
-                mutex_lock(&xip_sparse_mutex);
-                error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
-                                                        &xip_mem, &xip_pfn);
-                mutex_unlock(&xip_sparse_mutex);
-                if (error)
-                        return VM_FAULT_SIGBUS;
-                /* unmap sparse mappings at pgoff from all other vmas */
-                __xip_unmap(mapping, vmf->pgoff);
-
-found:
-                /*
-                 * We must recheck i_size under i_mmap_rwsem to prevent races
-                 * with truncation
-                 */
-                i_mmap_lock_read(mapping);
-                size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-                                                        PAGE_CACHE_SHIFT;
-                if (unlikely(vmf->pgoff >= size)) {
-                        i_mmap_unlock_read(mapping);
-                        return VM_FAULT_SIGBUS;
-                }
-                err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
-                                                        xip_pfn);
-                i_mmap_unlock_read(mapping);
-                if (err == -ENOMEM)
-                        return VM_FAULT_OOM;
-                /*
-                 * err == -EBUSY is fine, we've raced against another thread
-                 * that faulted-in the same page
-                 */
-                if (err != -EBUSY)
-                        BUG_ON(err);
-                return VM_FAULT_NOPAGE;
-        } else {
-                int err, ret = VM_FAULT_OOM;
-
-                mutex_lock(&xip_sparse_mutex);
-                write_seqcount_begin(&xip_sparse_seq);
-                error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
-                                                        &xip_mem, &xip_pfn);
-                if (unlikely(!error)) {
-                        write_seqcount_end(&xip_sparse_seq);
-                        mutex_unlock(&xip_sparse_mutex);
-                        goto again;
-                }
-                if (error != -ENODATA)
-                        goto out;
-
-                /*
-                 * We must recheck i_size under i_mmap_rwsem to prevent races
-                 * with truncation
-                 */
-                i_mmap_lock_read(mapping);
-                size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-                                                        PAGE_CACHE_SHIFT;
-                if (unlikely(vmf->pgoff >= size)) {
-                        ret = VM_FAULT_SIGBUS;
-                        goto unlock;
-                }
-                /* not shared and writable, use xip_sparse_page() */
-                page = xip_sparse_page();
-                if (!page)
-                        goto unlock;
-                err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
-                                                        page);
-                if (err == -ENOMEM)
-                        goto unlock;
-
-                ret = VM_FAULT_NOPAGE;
-unlock:
-                i_mmap_unlock_read(mapping);
-out:
-                write_seqcount_end(&xip_sparse_seq);
-                mutex_unlock(&xip_sparse_mutex);
-
-                return ret;
-        }
-}
-
-static const struct vm_operations_struct xip_file_vm_ops = {
-        .fault          = xip_file_fault,
-        .page_mkwrite   = filemap_page_mkwrite,
-};
-
-int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
-        BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
-
-        file_accessed(file);
-        vma->vm_ops = &xip_file_vm_ops;
-        vma->vm_flags |= VM_MIXEDMAP;
-        return 0;
-}
-EXPORT_SYMBOL_GPL(xip_file_mmap);
-
-/*
  * truncate a page used for execute in place
  * functionality is analog to block_truncate_page but does use get_xip_mem
  * to get the page instead of page cache