aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--MAINTAINERS6
-rw-r--r--fs/Makefile1
-rw-r--r--fs/dax.c186
-rw-r--r--fs/ext2/file.c6
-rw-r--r--fs/ext2/inode.c8
-rw-r--r--include/linux/fs.h12
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/filemap_xip.c234
8 files changed, 214 insertions, 245 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 348f5c16ef50..8670c224c833 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3151,6 +3151,12 @@ L: linux-i2c@vger.kernel.org
3151S: Maintained 3151S: Maintained
3152F: drivers/i2c/busses/i2c-diolan-u2c.c 3152F: drivers/i2c/busses/i2c-diolan-u2c.c
3153 3153
3154DIRECT ACCESS (DAX)
3155M: Matthew Wilcox <willy@linux.intel.com>
3156L: linux-fsdevel@vger.kernel.org
3157S: Supported
3158F: fs/dax.c
3159
3154DIRECTORY NOTIFICATION (DNOTIFY) 3160DIRECTORY NOTIFICATION (DNOTIFY)
3155M: Eric Paris <eparis@parisplace.org> 3161M: Eric Paris <eparis@parisplace.org>
3156S: Maintained 3162S: Maintained
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..0534444e257c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
28obj-$(CONFIG_TIMERFD) += timerfd.o 28obj-$(CONFIG_TIMERFD) += timerfd.o
29obj-$(CONFIG_EVENTFD) += eventfd.o 29obj-$(CONFIG_EVENTFD) += eventfd.o
30obj-$(CONFIG_AIO) += aio.o 30obj-$(CONFIG_AIO) += aio.o
31obj-$(CONFIG_FS_XIP) += dax.o
31obj-$(CONFIG_FILE_LOCKING) += locks.o 32obj-$(CONFIG_FILE_LOCKING) += locks.o
32obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o 33obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
33obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o 34obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..1a2bdbfa3ea9
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,186 @@
1/*
2 * fs/dax.c - Direct Access filesystem code
3 * Copyright (c) 2013-2014 Intel Corporation
4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/atomic.h>
18#include <linux/blkdev.h>
19#include <linux/buffer_head.h>
20#include <linux/fs.h>
21#include <linux/genhd.h>
22#include <linux/mutex.h>
23#include <linux/uio.h>
24
25static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
26{
27 unsigned long pfn;
28 sector_t sector = bh->b_blocknr << (blkbits - 9);
29 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
30}
31
/*
 * Clear the parts of a buffer that the pending I/O will not overwrite.
 * @addr:  start of the buffer
 * @size:  length of the buffer in bytes
 * @first: offset inside the buffer at which the I/O begins
 * @pos:   file offset at which the I/O begins
 * @end:   file offset at which the I/O ends
 *
 * Bytes [0, @first) in front of the I/O and bytes [tail, @size) behind it
 * are zeroed, where tail = @first + (@end - @pos).  Nothing behind the I/O
 * is touched when the I/O reaches or passes the end of the buffer.
 */
static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
			loff_t end)
{
	char *buf = addr;
	loff_t tail = first + (end - pos);	/* buffer offset just past the I/O */

	if (first != 0)
		memset(buf, 0, first);
	if (tail < size)
		memset(buf + tail, 0, size - tail);
}
42
/* True when @bh is mapped and is not flagged as unwritten. */
static bool buffer_written(struct buffer_head *bh)
{
	if (!buffer_mapped(bh))
		return false;

	return !buffer_unwritten(bh);
}
47
48/*
49 * When ext4 encounters a hole, it returns without modifying the buffer_head
50 * which means that we can't trust b_size. To cope with this, we set b_state
51 * to 0 before calling get_block and, if any bit is set, we know we can trust
52 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
53 * and would save us time calling get_block repeatedly.
54 */
55static bool buffer_size_valid(struct buffer_head *bh)
56{
57 return bh->b_state != 0;
58}
59
60static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
61 loff_t start, loff_t end, get_block_t get_block,
62 struct buffer_head *bh)
63{
64 ssize_t retval = 0;
65 loff_t pos = start;
66 loff_t max = start;
67 loff_t bh_max = start;
68 void *addr;
69 bool hole = false;
70
71 if (rw != WRITE)
72 end = min(end, i_size_read(inode));
73
74 while (pos < end) {
75 unsigned len;
76 if (pos == max) {
77 unsigned blkbits = inode->i_blkbits;
78 sector_t block = pos >> blkbits;
79 unsigned first = pos - (block << blkbits);
80 long size;
81
82 if (pos == bh_max) {
83 bh->b_size = PAGE_ALIGN(end - pos);
84 bh->b_state = 0;
85 retval = get_block(inode, block, bh,
86 rw == WRITE);
87 if (retval)
88 break;
89 if (!buffer_size_valid(bh))
90 bh->b_size = 1 << blkbits;
91 bh_max = pos - first + bh->b_size;
92 } else {
93 unsigned done = bh->b_size -
94 (bh_max - (pos - first));
95 bh->b_blocknr += done >> blkbits;
96 bh->b_size -= done;
97 }
98
99 hole = (rw != WRITE) && !buffer_written(bh);
100 if (hole) {
101 addr = NULL;
102 size = bh->b_size - first;
103 } else {
104 retval = dax_get_addr(bh, &addr, blkbits);
105 if (retval < 0)
106 break;
107 if (buffer_unwritten(bh) || buffer_new(bh))
108 dax_new_buf(addr, retval, first, pos,
109 end);
110 addr += first;
111 size = retval - first;
112 }
113 max = min(pos + size, end);
114 }
115
116 if (rw == WRITE)
117 len = copy_from_iter(addr, max - pos, iter);
118 else if (!hole)
119 len = copy_to_iter(addr, max - pos, iter);
120 else
121 len = iov_iter_zero(max - pos, iter);
122
123 if (!len)
124 break;
125
126 pos += len;
127 addr += len;
128 }
129
130 return (pos == start) ? retval : pos - start;
131}
132
133/**
134 * dax_do_io - Perform I/O to a DAX file
135 * @rw: READ to read or WRITE to write
136 * @iocb: The control block for this I/O
137 * @inode: The file which the I/O is directed at
138 * @iter: The addresses to do I/O from or to
139 * @pos: The file offset where the I/O starts
140 * @get_block: The filesystem method used to translate file offsets to blocks
141 * @end_io: A filesystem callback for I/O completion
142 * @flags: See below
143 *
144 * This function uses the same locking scheme as do_blockdev_direct_IO:
145 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
146 * caller for writes. For reads, we take and release the i_mutex ourselves.
147 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
148 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
149 * is in progress.
150 */
151ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
152 struct iov_iter *iter, loff_t pos,
153 get_block_t get_block, dio_iodone_t end_io, int flags)
154{
155 struct buffer_head bh;
156 ssize_t retval = -EINVAL;
157 loff_t end = pos + iov_iter_count(iter);
158
159 memset(&bh, 0, sizeof(bh));
160
161 if ((flags & DIO_LOCKING) && (rw == READ)) {
162 struct address_space *mapping = inode->i_mapping;
163 mutex_lock(&inode->i_mutex);
164 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
165 if (retval) {
166 mutex_unlock(&inode->i_mutex);
167 goto out;
168 }
169 }
170
171 /* Protects against truncate */
172 atomic_inc(&inode->i_dio_count);
173
174 retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
175
176 if ((flags & DIO_LOCKING) && (rw == READ))
177 mutex_unlock(&inode->i_mutex);
178
179 if ((retval > 0) && end_io)
180 end_io(iocb, pos, retval, bh.b_private);
181
182 inode_dio_done(inode);
183 out:
184 return retval;
185}
186EXPORT_SYMBOL_GPL(dax_do_io);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..a247123fd798 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -81,8 +81,10 @@ const struct file_operations ext2_file_operations = {
81#ifdef CONFIG_EXT2_FS_XIP 81#ifdef CONFIG_EXT2_FS_XIP
82const struct file_operations ext2_xip_file_operations = { 82const struct file_operations ext2_xip_file_operations = {
83 .llseek = generic_file_llseek, 83 .llseek = generic_file_llseek,
84 .read = xip_file_read, 84 .read = new_sync_read,
85 .write = xip_file_write, 85 .write = new_sync_write,
86 .read_iter = generic_file_read_iter,
87 .write_iter = generic_file_write_iter,
86 .unlocked_ioctl = ext2_ioctl, 88 .unlocked_ioctl = ext2_ioctl,
87#ifdef CONFIG_COMPAT 89#ifdef CONFIG_COMPAT
88 .compat_ioctl = ext2_compat_ioctl, 90 .compat_ioctl = ext2_compat_ioctl,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0cb04486577d..3ccd5fd47d66 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -859,7 +859,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
859 size_t count = iov_iter_count(iter); 859 size_t count = iov_iter_count(iter);
860 ssize_t ret; 860 ssize_t ret;
861 861
862 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block); 862 if (IS_DAX(inode))
863 ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
864 NULL, DIO_LOCKING);
865 else
866 ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
867 ext2_get_block);
863 if (ret < 0 && (rw & WRITE)) 868 if (ret < 0 && (rw & WRITE))
864 ext2_write_failed(mapping, offset + count); 869 ext2_write_failed(mapping, offset + count);
865 return ret; 870 return ret;
@@ -888,6 +893,7 @@ const struct address_space_operations ext2_aops = {
888const struct address_space_operations ext2_aops_xip = { 893const struct address_space_operations ext2_aops_xip = {
889 .bmap = ext2_bmap, 894 .bmap = ext2_bmap,
890 .get_xip_mem = ext2_get_xip_mem, 895 .get_xip_mem = ext2_get_xip_mem,
896 .direct_IO = ext2_direct_IO,
891}; 897};
892 898
893const struct address_space_operations ext2_nobh_aops = { 899const struct address_space_operations ext2_nobh_aops = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb373bb5cf03..241c3c030fb5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2587,12 +2587,11 @@ extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
2587extern int generic_file_open(struct inode * inode, struct file * filp); 2587extern int generic_file_open(struct inode * inode, struct file * filp);
2588extern int nonseekable_open(struct inode * inode, struct file * filp); 2588extern int nonseekable_open(struct inode * inode, struct file * filp);
2589 2589
2590ssize_t dax_do_io(int rw, struct kiocb *, struct inode *, struct iov_iter *,
2591 loff_t, get_block_t, dio_iodone_t, int flags);
2592
2590#ifdef CONFIG_FS_XIP 2593#ifdef CONFIG_FS_XIP
2591extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
2592 loff_t *ppos);
2593extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma); 2594extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
2594extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
2595 size_t len, loff_t *ppos);
2596extern int xip_truncate_page(struct address_space *mapping, loff_t from); 2595extern int xip_truncate_page(struct address_space *mapping, loff_t from);
2597#else 2596#else
2598static inline int xip_truncate_page(struct address_space *mapping, loff_t from) 2597static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
@@ -2756,6 +2755,11 @@ extern int generic_show_options(struct seq_file *m, struct dentry *root);
2756extern void save_mount_options(struct super_block *sb, char *options); 2755extern void save_mount_options(struct super_block *sb, char *options);
2757extern void replace_mount_options(struct super_block *sb, char *options); 2756extern void replace_mount_options(struct super_block *sb, char *options);
2758 2757
2758static inline bool io_is_direct(struct file *filp)
2759{
2760 return (filp->f_flags & O_DIRECT) || IS_DAX(file_inode(filp));
2761}
2762
2759static inline ino_t parent_ino(struct dentry *dentry) 2763static inline ino_t parent_ino(struct dentry *dentry)
2760{ 2764{
2761 ino_t res; 2765 ino_t res;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1578c224285e..ad7242043bdb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1695 loff_t *ppos = &iocb->ki_pos; 1695 loff_t *ppos = &iocb->ki_pos;
1696 loff_t pos = *ppos; 1696 loff_t pos = *ppos;
1697 1697
1698 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1698 if (io_is_direct(file)) {
1699 if (file->f_flags & O_DIRECT) {
1700 struct address_space *mapping = file->f_mapping; 1699 struct address_space *mapping = file->f_mapping;
1701 struct inode *inode = mapping->host; 1700 struct inode *inode = mapping->host;
1702 size_t count = iov_iter_count(iter); 1701 size_t count = iov_iter_count(iter);
@@ -2584,8 +2583,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2584 if (err) 2583 if (err)
2585 goto out; 2584 goto out;
2586 2585
2587 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2586 if (io_is_direct(file)) {
2588 if (unlikely(file->f_flags & O_DIRECT)) {
2589 loff_t endbyte; 2587 loff_t endbyte;
2590 2588
2591 written = generic_file_direct_write(iocb, from, pos); 2589 written = generic_file_direct_write(iocb, from, pos);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 59e1c5585748..9c869f402c07 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -43,119 +43,6 @@ static struct page *xip_sparse_page(void)
43} 43}
44 44
45/* 45/*
46 * This is a file read routine for execute in place files, and uses
47 * the mapping->a_ops->get_xip_mem() function for the actual low-level
48 * stuff.
49 *
50 * Note the struct file* is not used at all. It may be NULL.
51 */
52static ssize_t
53do_xip_mapping_read(struct address_space *mapping,
54 struct file_ra_state *_ra,
55 struct file *filp,
56 char __user *buf,
57 size_t len,
58 loff_t *ppos)
59{
60 struct inode *inode = mapping->host;
61 pgoff_t index, end_index;
62 unsigned long offset;
63 loff_t isize, pos;
64 size_t copied = 0, error = 0;
65
66 BUG_ON(!mapping->a_ops->get_xip_mem);
67
68 pos = *ppos;
69 index = pos >> PAGE_CACHE_SHIFT;
70 offset = pos & ~PAGE_CACHE_MASK;
71
72 isize = i_size_read(inode);
73 if (!isize)
74 goto out;
75
76 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
77 do {
78 unsigned long nr, left;
79 void *xip_mem;
80 unsigned long xip_pfn;
81 int zero = 0;
82
83 /* nr is the maximum number of bytes to copy from this page */
84 nr = PAGE_CACHE_SIZE;
85 if (index >= end_index) {
86 if (index > end_index)
87 goto out;
88 nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
89 if (nr <= offset) {
90 goto out;
91 }
92 }
93 nr = nr - offset;
94 if (nr > len - copied)
95 nr = len - copied;
96
97 error = mapping->a_ops->get_xip_mem(mapping, index, 0,
98 &xip_mem, &xip_pfn);
99 if (unlikely(error)) {
100 if (error == -ENODATA) {
101 /* sparse */
102 zero = 1;
103 } else
104 goto out;
105 }
106
107 /* If users can be writing to this page using arbitrary
108 * virtual addresses, take care about potential aliasing
109 * before reading the page on the kernel side.
110 */
111 if (mapping_writably_mapped(mapping))
112 /* address based flush */ ;
113
114 /*
115 * Ok, we have the mem, so now we can copy it to user space...
116 *
117 * The actor routine returns how many bytes were actually used..
118 * NOTE! This may not be the same as how much of a user buffer
119 * we filled up (we may be padding etc), so we can only update
120 * "pos" here (the actor routine has to update the user buffer
121 * pointers and the remaining count).
122 */
123 if (!zero)
124 left = __copy_to_user(buf+copied, xip_mem+offset, nr);
125 else
126 left = __clear_user(buf + copied, nr);
127
128 if (left) {
129 error = -EFAULT;
130 goto out;
131 }
132
133 copied += (nr - left);
134 offset += (nr - left);
135 index += offset >> PAGE_CACHE_SHIFT;
136 offset &= ~PAGE_CACHE_MASK;
137 } while (copied < len);
138
139out:
140 *ppos = pos + copied;
141 if (filp)
142 file_accessed(filp);
143
144 return (copied ? copied : error);
145}
146
147ssize_t
148xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
149{
150 if (!access_ok(VERIFY_WRITE, buf, len))
151 return -EFAULT;
152
153 return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
154 buf, len, ppos);
155}
156EXPORT_SYMBOL_GPL(xip_file_read);
157
158/*
159 * __xip_unmap is invoked from xip_unmap and xip_write 46 * __xip_unmap is invoked from xip_unmap and xip_write
160 * 47 *
161 * This function walks all vmas of the address_space and unmaps the 48 * This function walks all vmas of the address_space and unmaps the
@@ -341,127 +228,6 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
341} 228}
342EXPORT_SYMBOL_GPL(xip_file_mmap); 229EXPORT_SYMBOL_GPL(xip_file_mmap);
343 230
344static ssize_t
345__xip_file_write(struct file *filp, const char __user *buf,
346 size_t count, loff_t pos, loff_t *ppos)
347{
348 struct address_space * mapping = filp->f_mapping;
349 const struct address_space_operations *a_ops = mapping->a_ops;
350 struct inode *inode = mapping->host;
351 long status = 0;
352 size_t bytes;
353 ssize_t written = 0;
354
355 BUG_ON(!mapping->a_ops->get_xip_mem);
356
357 do {
358 unsigned long index;
359 unsigned long offset;
360 size_t copied;
361 void *xip_mem;
362 unsigned long xip_pfn;
363
364 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
365 index = pos >> PAGE_CACHE_SHIFT;
366 bytes = PAGE_CACHE_SIZE - offset;
367 if (bytes > count)
368 bytes = count;
369
370 status = a_ops->get_xip_mem(mapping, index, 0,
371 &xip_mem, &xip_pfn);
372 if (status == -ENODATA) {
373 /* we allocate a new page unmap it */
374 mutex_lock(&xip_sparse_mutex);
375 status = a_ops->get_xip_mem(mapping, index, 1,
376 &xip_mem, &xip_pfn);
377 mutex_unlock(&xip_sparse_mutex);
378 if (!status)
379 /* unmap page at pgoff from all other vmas */
380 __xip_unmap(mapping, index);
381 }
382
383 if (status)
384 break;
385
386 copied = bytes -
387 __copy_from_user_nocache(xip_mem + offset, buf, bytes);
388
389 if (likely(copied > 0)) {
390 status = copied;
391
392 if (status >= 0) {
393 written += status;
394 count -= status;
395 pos += status;
396 buf += status;
397 }
398 }
399 if (unlikely(copied != bytes))
400 if (status >= 0)
401 status = -EFAULT;
402 if (status < 0)
403 break;
404 } while (count);
405 *ppos = pos;
406 /*
407 * No need to use i_size_read() here, the i_size
408 * cannot change under us because we hold i_mutex.
409 */
410 if (pos > inode->i_size) {
411 i_size_write(inode, pos);
412 mark_inode_dirty(inode);
413 }
414
415 return written ? written : status;
416}
417
418ssize_t
419xip_file_write(struct file *filp, const char __user *buf, size_t len,
420 loff_t *ppos)
421{
422 struct address_space *mapping = filp->f_mapping;
423 struct inode *inode = mapping->host;
424 size_t count;
425 loff_t pos;
426 ssize_t ret;
427
428 mutex_lock(&inode->i_mutex);
429
430 if (!access_ok(VERIFY_READ, buf, len)) {
431 ret=-EFAULT;
432 goto out_up;
433 }
434
435 pos = *ppos;
436 count = len;
437
438 /* We can write back this queue in page reclaim */
439 current->backing_dev_info = inode_to_bdi(inode);
440
441 ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
442 if (ret)
443 goto out_backing;
444 if (count == 0)
445 goto out_backing;
446
447 ret = file_remove_suid(filp);
448 if (ret)
449 goto out_backing;
450
451 ret = file_update_time(filp);
452 if (ret)
453 goto out_backing;
454
455 ret = __xip_file_write (filp, buf, count, pos, ppos);
456
457 out_backing:
458 current->backing_dev_info = NULL;
459 out_up:
460 mutex_unlock(&inode->i_mutex);
461 return ret;
462}
463EXPORT_SYMBOL_GPL(xip_file_write);
464
465/* 231/*
466 * truncate a page used for execute in place 232 * truncate a page used for execute in place
467 * functionality is analog to block_truncate_page but does use get_xip_mem 233 * functionality is analog to block_truncate_page but does use get_xip_mem