author		Jan Kara <jack@suse.cz>	2016-11-20 20:48:36 -0500
committer	Theodore Ts'o <tytso@mit.edu>	2016-11-20 20:48:36 -0500
commit		dd936e4313fa3f60abd6e67abb3cb66fc9a018d1 (patch)
tree		fd9b89b7922a9c28bdb614ae2d49853d94772093
parent		00697eed386d57c5267aad5b8343f027b10da0c0 (diff)
dax: rip out get_block based IO support
No one uses the get_block based DAX I/O functions anymore. Rip them out
and update the documentation.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r--	Documentation/filesystems/dax.txt	 22
-rw-r--r--	fs/dax.c	315
-rw-r--r--	include/linux/dax.h	 12
3 files changed, 11 insertions(+), 338 deletions(-)
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 23d18b8a49d5..a7e6e14aeb08 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers
 Filesystem support consists of
 - adding support to mark inodes as being DAX by setting the S_DAX flag in
   i_flags
-- implementing the direct_IO address space operation, and calling
-  dax_do_io() instead of blockdev_direct_IO() if S_DAX is set
+- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw()
+  when inode has S_DAX flag set
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
-  include handlers for fault, pmd_fault and page_mkwrite (which should
-  probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the
-  appropriate get_block() callback)
-- calling dax_truncate_page() instead of block_truncate_page() for DAX files
-- calling dax_zero_page_range() instead of zero_user() for DAX files
+  include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
+  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
+  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
+  iomap operations.
+- calling iomap_zero_range() passing appropriate iomap operations instead of
+  block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
   truncates and page faults
 
-The get_block() callback passed to the DAX functions may return
-uninitialised extents. If it does, it must ensure that simultaneous
-calls to get_block() (for example by a page-fault racing with a read()
-or a write()) work correctly.
+The iomap handlers for allocating blocks must make sure that allocated blocks
+are zeroed out and converted to written extents before being returned to avoid
+exposure of uninitialized data through mmap.
 
 These filesystems may be used for inspiration:
 - ext2: see Documentation/filesystems/ext2.txt
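
For concreteness, here is a minimal sketch of the ->read_iter/->write_iter
wiring the updated documentation describes, loosely modeled on ext2's
conversion from this series. The example_* names (in particular
example_iomap_ops) are placeholders for the filesystem's own symbols, not
interfaces added by this patch:

#include <linux/fs.h>
#include <linux/dax.h>
#include <linux/uio.h>

/* Filled in with the filesystem's ->iomap_begin/->iomap_end (placeholder) */
static struct iomap_ops example_iomap_ops;

static ssize_t example_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* skip atime update for zero-length reads */

	inode_lock_shared(inode);
	/* dax_iomap_rw() maps each file range via the iomap_ops callbacks */
	ret = dax_iomap_rw(iocb, to, &example_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}

static ssize_t example_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		goto out;
	ret = file_remove_privs(iocb->ki_filp);
	if (ret)
		goto out;
	ret = file_update_time(iocb->ki_filp);
	if (ret)
		goto out;

	ret = dax_iomap_rw(iocb, from, &example_iomap_ops);
	/* extend i_size if the write went past the old end of file */
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		mark_inode_dirty(inode);
	}
out:
	inode_unlock(inode);
	return ret;
}

The filesystem's regular ->read_iter/->write_iter would dispatch to these
when IS_DAX(inode) is set and fall back to the generic paths otherwise.
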
diff --git a/fs/dax.c b/fs/dax.c
index 28af41b9da3a..ad131cd2605d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -116,168 +116,6 @@ struct page *read_dax_sector(struct block_device *bdev, sector_t n)
 	return page;
 }
 
-static bool buffer_written(struct buffer_head *bh)
-{
-	return buffer_mapped(bh) && !buffer_unwritten(bh);
-}
-
-static sector_t to_sector(const struct buffer_head *bh,
-		const struct inode *inode)
-{
-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
-
-	return sector;
-}
-
-static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
-		      loff_t start, loff_t end, get_block_t get_block,
-		      struct buffer_head *bh)
-{
-	loff_t pos = start, max = start, bh_max = start;
-	bool hole = false;
-	struct block_device *bdev = NULL;
-	int rw = iov_iter_rw(iter), rc;
-	long map_len = 0;
-	struct blk_dax_ctl dax = {
-		.addr = ERR_PTR(-EIO),
-	};
-	unsigned blkbits = inode->i_blkbits;
-	sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
-								>> blkbits;
-
-	if (rw == READ)
-		end = min(end, i_size_read(inode));
-
-	while (pos < end) {
-		size_t len;
-		if (pos == max) {
-			long page = pos >> PAGE_SHIFT;
-			sector_t block = page << (PAGE_SHIFT - blkbits);
-			unsigned first = pos - (block << blkbits);
-			long size;
-
-			if (pos == bh_max) {
-				bh->b_size = PAGE_ALIGN(end - pos);
-				bh->b_state = 0;
-				rc = get_block(inode, block, bh, rw == WRITE);
-				if (rc)
-					break;
-				bh_max = pos - first + bh->b_size;
-				bdev = bh->b_bdev;
-				/*
-				 * We allow uninitialized buffers for writes
-				 * beyond EOF as those cannot race with faults
-				 */
-				WARN_ON_ONCE(
-					(buffer_new(bh) && block < file_blks) ||
-					(rw == WRITE && buffer_unwritten(bh)));
-			} else {
-				unsigned done = bh->b_size -
-						(bh_max - (pos - first));
-				bh->b_blocknr += done >> blkbits;
-				bh->b_size -= done;
-			}
-
-			hole = rw == READ && !buffer_written(bh);
-			if (hole) {
-				size = bh->b_size - first;
-			} else {
-				dax_unmap_atomic(bdev, &dax);
-				dax.sector = to_sector(bh, inode);
-				dax.size = bh->b_size;
-				map_len = dax_map_atomic(bdev, &dax);
-				if (map_len < 0) {
-					rc = map_len;
-					break;
-				}
-				dax.addr += first;
-				size = map_len - first;
-			}
-			/*
-			 * pos + size is one past the last offset for IO,
-			 * so pos + size can overflow loff_t at extreme offsets.
-			 * Cast to u64 to catch this and get the true minimum.
-			 */
-			max = min_t(u64, pos + size, end);
-		}
-
-		if (iov_iter_rw(iter) == WRITE) {
-			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
-		} else if (!hole)
-			len = copy_to_iter((void __force *) dax.addr, max - pos,
-					iter);
-		else
-			len = iov_iter_zero(max - pos, iter);
-
-		if (!len) {
-			rc = -EFAULT;
-			break;
-		}
-
-		pos += len;
-		if (!IS_ERR(dax.addr))
-			dax.addr += len;
-	}
-
-	dax_unmap_atomic(bdev, &dax);
-
-	return (pos == start) ? rc : pos - start;
-}
-
-/**
- * dax_do_io - Perform I/O to a DAX file
- * @iocb: The control block for this I/O
- * @inode: The file which the I/O is directed at
- * @iter: The addresses to do I/O from or to
- * @get_block: The filesystem method used to translate file offsets to blocks
- * @end_io: A filesystem callback for I/O completion
- * @flags: See below
- *
- * This function uses the same locking scheme as do_blockdev_direct_IO:
- * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
- * caller for writes.  For reads, we take and release the i_mutex ourselves.
- * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
- * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
- * is in progress.
- */
-ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		  struct iov_iter *iter, get_block_t get_block,
-		  dio_iodone_t end_io, int flags)
-{
-	struct buffer_head bh;
-	ssize_t retval = -EINVAL;
-	loff_t pos = iocb->ki_pos;
-	loff_t end = pos + iov_iter_count(iter);
-
-	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-
-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-		inode_lock(inode);
-
-	/* Protects against truncate */
-	if (!(flags & DIO_SKIP_DIO_COUNT))
-		inode_dio_begin(inode);
-
-	retval = dax_io(inode, iter, pos, end, get_block, &bh);
-
-	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-		inode_unlock(inode);
-
-	if (end_io) {
-		int err;
-
-		err = end_io(iocb, pos, retval, bh.b_private);
-		if (err)
-			retval = err;
-	}
-
-	if (!(flags & DIO_SKIP_DIO_COUNT))
-		inode_dio_end(inode);
-	return retval;
-}
-EXPORT_SYMBOL_GPL(dax_do_io);
-
 /*
  * DAX radix tree locking
  */
@@ -920,105 +758,6 @@ static int dax_insert_mapping(struct address_space *mapping,
 }
 
 /**
- * dax_fault - handle a page fault on a DAX file
- * @vma: The virtual memory area where the fault occurred
- * @vmf: The description of the fault
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * When a page fault occurs, filesystems may call this helper in their
- * fault handler for DAX files. dax_fault() assumes the caller has done all
- * the necessary locking for the page fault to proceed successfully.
- */
-int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
-{
-	struct file *file = vma->vm_file;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	void *entry;
-	struct buffer_head bh;
-	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	unsigned blkbits = inode->i_blkbits;
-	sector_t block;
-	pgoff_t size;
-	int error;
-	int major = 0;
-
-	/*
-	 * Check whether offset isn't beyond end of file now. Caller is supposed
-	 * to hold locks serializing us with truncate / punch hole so this is
-	 * a reliable test.
-	 */
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		return VM_FAULT_SIGBUS;
-
-	memset(&bh, 0, sizeof(bh));
-	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
-	bh.b_bdev = inode->i_sb->s_bdev;
-	bh.b_size = PAGE_SIZE;
-
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
-	error = get_block(inode, block, &bh, 0);
-	if (!error && (bh.b_size < PAGE_SIZE))
-		error = -EIO;		/* fs corruption? */
-	if (error)
-		goto unlock_entry;
-
-	if (vmf->cow_page) {
-		struct page *new_page = vmf->cow_page;
-		if (buffer_written(&bh))
-			error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode),
-					bh.b_size, new_page, vaddr);
-		else
-			clear_user_highpage(new_page, vaddr);
-		if (error)
-			goto unlock_entry;
-		if (!radix_tree_exceptional_entry(entry)) {
-			vmf->page = entry;
-			return VM_FAULT_LOCKED;
-		}
-		vmf->entry = entry;
-		return VM_FAULT_DAX_LOCKED;
-	}
-
-	if (!buffer_mapped(&bh)) {
-		if (vmf->flags & FAULT_FLAG_WRITE) {
-			error = get_block(inode, block, &bh, 1);
-			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
-			if (!error && (bh.b_size < PAGE_SIZE))
-				error = -EIO;
-			if (error)
-				goto unlock_entry;
-		} else {
-			return dax_load_hole(mapping, entry, vmf);
-		}
-	}
-
-	/* Filesystem should not return unwritten buffers to us! */
-	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode),
-			bh.b_size, &entry, vma, vmf);
- unlock_entry:
-	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if ((error < 0) && (error != -EBUSY))
-		return VM_FAULT_SIGBUS | major;
-	return VM_FAULT_NOPAGE | major;
-}
-EXPORT_SYMBOL_GPL(dax_fault);
-
-/**
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
@@ -1078,60 +817,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-/**
- * dax_zero_page_range - zero a range within a page of a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @length: The number of bytes to zero
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * This function can be called by a filesystem when it is zeroing part of a
- * page in a DAX file.  This is intended for hole-punch operations.  If
- * you are truncating a file, the helper function dax_truncate_page() may be
- * more convenient.
- */
-int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
-							get_block_t get_block)
-{
-	struct buffer_head bh;
-	pgoff_t index = from >> PAGE_SHIFT;
-	unsigned offset = from & (PAGE_SIZE-1);
-	int err;
-
-	/* Block boundary? Nothing to do */
-	if (!length)
-		return 0;
-	if (WARN_ON_ONCE((offset + length) > PAGE_SIZE))
-		return -EINVAL;
-
-	memset(&bh, 0, sizeof(bh));
-	bh.b_bdev = inode->i_sb->s_bdev;
-	bh.b_size = PAGE_SIZE;
-	err = get_block(inode, index, &bh, 0);
-	if (err < 0 || !buffer_written(&bh))
-		return err;
-
-	return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode),
-			offset, length);
-}
-EXPORT_SYMBOL_GPL(dax_zero_page_range);
-
-/**
- * dax_truncate_page - handle a partial page being truncated in a DAX file
- * @inode: The file being truncated
- * @from: The file offset that is being truncated to
- * @get_block: The filesystem method used to translate file offsets to blocks
- *
- * Similar to block_truncate_page(), this function can be called by a
- * filesystem when it is truncating a DAX file to handle the partial page.
- */
-int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
-{
-	unsigned length = PAGE_ALIGN(from) - from;
-	return dax_zero_page_range(inode, from, length, get_block);
-}
-EXPORT_SYMBOL_GPL(dax_truncate_page);
-
 #ifdef CONFIG_FS_IOMAP
 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
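
The dax_fault() removed above took a get_block callback; its replacement
dax_iomap_fault() takes iomap operations instead. A hedged sketch of the
corresponding mmap wiring, again loosely following ext2's conversion; in
practice a filesystem also brackets these handlers with its own locking
against truncate and wraps dax_pfn_mkwrite() rather than wiring it directly.
As before, the example_* names and example_iomap_ops are placeholders:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/dax.h>

static int example_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	int ret;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vma->vm_file);
	}
	/* maps the faulting offset through iomap_ops instead of get_block */
	ret = dax_iomap_fault(vma, vmf, &example_iomap_ops);
	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);
	return ret;
}

static const struct vm_operations_struct example_dax_vm_ops = {
	.fault		= example_dax_fault,
	.page_mkwrite	= example_dax_fault,
	.pfn_mkwrite	= dax_pfn_mkwrite,	/* normally wrapped, see above */
};

static int example_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (!IS_DAX(file_inode(file)))
		return generic_file_mmap(file, vma);

	file_accessed(file);
	vma->vm_ops = &example_dax_vm_ops;
	/* also set VM_HUGEPAGE once a ->pmd_fault handler is wired up */
	vma->vm_flags |= VM_MIXEDMAP;
	return 0;
}
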
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8d1a5c47945f..0afade8bd3d7 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -38,13 +38,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		struct iomap_ops *ops);
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
-		get_block_t, dio_iodone_t, int flags);
-int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
-int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		struct iomap_ops *ops);
-int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		pgoff_t index, void *entry, bool wake_all);
@@ -73,12 +68,6 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
 }
 #endif
 
-static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-				pmd_t *pmd, unsigned int flags, get_block_t gb)
-{
-	return VM_FAULT_FALLBACK;
-}
-
 #ifdef CONFIG_FS_DAX_PMD
 static inline unsigned int dax_radix_order(void *entry)
 {
@@ -101,7 +90,6 @@ static inline int dax_iomap_pmd_fault(struct vm_area_struct *vma,
 }
 #endif
 int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
-#define dax_mkwrite(vma, vmf, gb)	dax_fault(vma, vmf, gb)
 
 static inline bool vma_is_dax(struct vm_area_struct *vma)
 {
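
Finally, with dax_truncate_page() and dax_zero_page_range() gone,
partial-page zeroing on truncate goes through iomap_zero_range(), as the
updated dax.txt says. A minimal sketch, assuming the same placeholder
example_iomap_ops:

#include <linux/fs.h>
#include <linux/iomap.h>

static int example_dax_truncate_tail(struct inode *inode, loff_t newsize)
{
	/*
	 * Zero from the new EOF to the end of its page, as the removed
	 * dax_truncate_page() used to; nothing to do when the new size
	 * is already page aligned.
	 */
	unsigned int partial = PAGE_ALIGN(newsize) - newsize;

	if (!partial)
		return 0;
	return iomap_zero_range(inode, newsize, partial, NULL,
				&example_iomap_ops);
}

A filesystem would call something like this from its setsize/truncate path
for DAX inodes, where block_truncate_page() was used before.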