aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile1
-rw-r--r--fs/buffer.c76
-rw-r--r--fs/internal.h3
-rw-r--r--fs/iomap.c497
-rw-r--r--fs/nfsd/blocklayout.c1
-rw-r--r--fs/nfsd/blocklayoutxdr.c1
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c101
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h9
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h3
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c51
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h18
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_btree.c27
-rw-r--r--fs/xfs/libxfs/xfs_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c59
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c31
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h43
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c38
-rw-r--r--fs/xfs/libxfs/xfs_format.h66
-rw-r--r--fs/xfs/libxfs/xfs_fs.h8
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c28
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c2
-rw-r--r--fs/xfs/xfs_aops.c332
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_attr_inactive.c2
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c381
-rw-r--r--fs/xfs/xfs_bmap_util.h3
-rw-r--r--fs/xfs/xfs_buf.c236
-rw-r--r--fs/xfs/xfs_buf.h7
-rw-r--r--fs/xfs/xfs_buf_item.c31
-rw-r--r--fs/xfs/xfs_dquot.c1
-rw-r--r--fs/xfs/xfs_dquot_item.c2
-rw-r--r--fs/xfs/xfs_error.c5
-rw-r--r--fs/xfs/xfs_error.h2
-rw-r--r--fs/xfs/xfs_extfree_item.c2
-rw-r--r--fs/xfs/xfs_file.c425
-rw-r--r--fs/xfs/xfs_fsops.c105
-rw-r--r--fs/xfs/xfs_icache.c2
-rw-r--r--fs/xfs/xfs_icache.h1
-rw-r--r--fs/xfs/xfs_inode.c16
-rw-r--r--fs/xfs/xfs_inode.h20
-rw-r--r--fs/xfs/xfs_inode_item.c1
-rw-r--r--fs/xfs/xfs_ioctl.c27
-rw-r--r--fs/xfs/xfs_ioctl.h3
-rw-r--r--fs/xfs/xfs_ioctl32.c6
-rw-r--r--fs/xfs/xfs_iomap.c171
-rw-r--r--fs/xfs/xfs_iomap.h7
-rw-r--r--fs/xfs/xfs_iops.c113
-rw-r--r--fs/xfs/xfs_linux.h7
-rw-r--r--fs/xfs/xfs_log.c13
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_cil.c258
-rw-r--r--fs/xfs/xfs_mount.c10
-rw-r--r--fs/xfs/xfs_ondisk.h31
-rw-r--r--fs/xfs/xfs_pnfs.c27
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_super.c19
-rw-r--r--fs/xfs/xfs_super.h2
-rw-r--r--fs/xfs/xfs_sysfs.c3
-rw-r--r--fs/xfs/xfs_trace.h25
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--include/linux/exportfs.h16
-rw-r--r--include/linux/iomap.h70
66 files changed, 2026 insertions, 1440 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index b8fcb416be72..4524916fa200 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
10 10
11if BLOCK 11if BLOCK
12 12
13config FS_IOMAP
14 bool
15
13source "fs/ext2/Kconfig" 16source "fs/ext2/Kconfig"
14source "fs/ext4/Kconfig" 17source "fs/ext4/Kconfig"
15source "fs/jbd2/Kconfig" 18source "fs/jbd2/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13b62d3..ed2b63257ba9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
49obj-$(CONFIG_SYSCTL) += drop_caches.o 49obj-$(CONFIG_SYSCTL) += drop_caches.o
50 50
51obj-$(CONFIG_FHANDLE) += fhandle.o 51obj-$(CONFIG_FHANDLE) += fhandle.o
52obj-$(CONFIG_FS_IOMAP) += iomap.o
52 53
53obj-y += quota/ 54obj-y += quota/
54 55
diff --git a/fs/buffer.c b/fs/buffer.c
index b9fa1be75e69..9c8eb9b6db6a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
21#include <linux/kernel.h> 21#include <linux/kernel.h>
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/iomap.h>
24#include <linux/mm.h> 25#include <linux/mm.h>
25#include <linux/percpu.h> 26#include <linux/percpu.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
@@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1892} 1893}
1893EXPORT_SYMBOL(page_zero_new_buffers); 1894EXPORT_SYMBOL(page_zero_new_buffers);
1894 1895
1895int __block_write_begin(struct page *page, loff_t pos, unsigned len, 1896static void
1896 get_block_t *get_block) 1897iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
1898 struct iomap *iomap)
1899{
1900 loff_t offset = block << inode->i_blkbits;
1901
1902 bh->b_bdev = iomap->bdev;
1903
1904 /*
1905 * Block points to offset in file we need to map, iomap contains
1906 * the offset at which the map starts. If the map ends before the
1907 * current block, then do not map the buffer and let the caller
1908 * handle it.
1909 */
1910 BUG_ON(offset >= iomap->offset + iomap->length);
1911
1912 switch (iomap->type) {
1913 case IOMAP_HOLE:
1914 /*
1915 * If the buffer is not up to date or beyond the current EOF,
1916 * we need to mark it as new to ensure sub-block zeroing is
1917 * executed if necessary.
1918 */
1919 if (!buffer_uptodate(bh) ||
1920 (offset >= i_size_read(inode)))
1921 set_buffer_new(bh);
1922 break;
1923 case IOMAP_DELALLOC:
1924 if (!buffer_uptodate(bh) ||
1925 (offset >= i_size_read(inode)))
1926 set_buffer_new(bh);
1927 set_buffer_uptodate(bh);
1928 set_buffer_mapped(bh);
1929 set_buffer_delay(bh);
1930 break;
1931 case IOMAP_UNWRITTEN:
1932 /*
1933 * For unwritten regions, we always need to ensure that
1934 * sub-block writes cause the regions in the block we are not
1935 * writing to are zeroed. Set the buffer as new to ensure this.
1936 */
1937 set_buffer_new(bh);
1938 set_buffer_unwritten(bh);
1939 /* FALLTHRU */
1940 case IOMAP_MAPPED:
1941 if (offset >= i_size_read(inode))
1942 set_buffer_new(bh);
1943 bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
1944 ((offset - iomap->offset) >> inode->i_blkbits);
1945 set_buffer_mapped(bh);
1946 break;
1947 }
1948}
1949
1950int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
1951 get_block_t *get_block, struct iomap *iomap)
1897{ 1952{
1898 unsigned from = pos & (PAGE_SIZE - 1); 1953 unsigned from = pos & (PAGE_SIZE - 1);
1899 unsigned to = from + len; 1954 unsigned to = from + len;
@@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1929 clear_buffer_new(bh); 1984 clear_buffer_new(bh);
1930 if (!buffer_mapped(bh)) { 1985 if (!buffer_mapped(bh)) {
1931 WARN_ON(bh->b_size != blocksize); 1986 WARN_ON(bh->b_size != blocksize);
1932 err = get_block(inode, block, bh, 1); 1987 if (get_block) {
1933 if (err) 1988 err = get_block(inode, block, bh, 1);
1934 break; 1989 if (err)
1990 break;
1991 } else {
1992 iomap_to_bh(inode, block, bh, iomap);
1993 }
1994
1935 if (buffer_new(bh)) { 1995 if (buffer_new(bh)) {
1936 unmap_underlying_metadata(bh->b_bdev, 1996 unmap_underlying_metadata(bh->b_bdev,
1937 bh->b_blocknr); 1997 bh->b_blocknr);
@@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1972 page_zero_new_buffers(page, from, to); 2032 page_zero_new_buffers(page, from, to);
1973 return err; 2033 return err;
1974} 2034}
2035
2036int __block_write_begin(struct page *page, loff_t pos, unsigned len,
2037 get_block_t *get_block)
2038{
2039 return __block_write_begin_int(page, pos, len, get_block, NULL);
2040}
1975EXPORT_SYMBOL(__block_write_begin); 2041EXPORT_SYMBOL(__block_write_begin);
1976 2042
1977static int __block_commit_write(struct inode *inode, struct page *page, 2043static int __block_commit_write(struct inode *inode, struct page *page,
diff --git a/fs/internal.h b/fs/internal.h
index f57ced528cde..cef0913e5d41 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
11 11
12struct super_block; 12struct super_block;
13struct file_system_type; 13struct file_system_type;
14struct iomap;
14struct linux_binprm; 15struct linux_binprm;
15struct path; 16struct path;
16struct mount; 17struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
39 * buffer.c 40 * buffer.c
40 */ 41 */
41extern void guard_bio_eod(int rw, struct bio *bio); 42extern void guard_bio_eod(int rw, struct bio *bio);
43extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
44 get_block_t *get_block, struct iomap *iomap);
42 45
43/* 46/*
44 * char_dev.c 47 * char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 000000000000..48141b8eff5f
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,497 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * Copyright (c) 2016 Christoph Hellwig.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#include <linux/module.h>
15#include <linux/compiler.h>
16#include <linux/fs.h>
17#include <linux/iomap.h>
18#include <linux/uaccess.h>
19#include <linux/gfp.h>
20#include <linux/mm.h>
21#include <linux/swap.h>
22#include <linux/pagemap.h>
23#include <linux/file.h>
24#include <linux/uio.h>
25#include <linux/backing-dev.h>
26#include <linux/buffer_head.h>
27#include <linux/dax.h>
28#include "internal.h"
29
30typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
31 void *data, struct iomap *iomap);
32
33/*
34 * Execute a iomap write on a segment of the mapping that spans a
35 * contiguous range of pages that have identical block mapping state.
36 *
37 * This avoids the need to map pages individually, do individual allocations
38 * for each page and most importantly avoid the need for filesystem specific
39 * locking per page. Instead, all the operations are amortised over the entire
40 * range of pages. It is assumed that the filesystems will lock whatever
41 * resources they require in the iomap_begin call, and release them in the
42 * iomap_end call.
43 */
44static loff_t
45iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
46 struct iomap_ops *ops, void *data, iomap_actor_t actor)
47{
48 struct iomap iomap = { 0 };
49 loff_t written = 0, ret;
50
51 /*
52 * Need to map a range from start position for length bytes. This can
53 * span multiple pages - it is only guaranteed to return a range of a
54 * single type of pages (e.g. all into a hole, all mapped or all
55 * unwritten). Failure at this point has nothing to undo.
56 *
57 * If allocation is required for this range, reserve the space now so
58 * that the allocation is guaranteed to succeed later on. Once we copy
59 * the data into the page cache pages, then we cannot fail otherwise we
60 * expose transient stale data. If the reserve fails, we can safely
61 * back out at this point as there is nothing to undo.
62 */
63 ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
64 if (ret)
65 return ret;
66 if (WARN_ON(iomap.offset > pos))
67 return -EIO;
68
69 /*
70 * Cut down the length to the one actually provided by the filesystem,
71 * as it might not be able to give us the whole size that we requested.
72 */
73 if (iomap.offset + iomap.length < pos + length)
74 length = iomap.offset + iomap.length - pos;
75
76 /*
77 * Now that we have guaranteed that the space allocation will succeed.
78 * we can do the copy-in page by page without having to worry about
79 * failures exposing transient data.
80 */
81 written = actor(inode, pos, length, data, &iomap);
82
83 /*
84 * Now the data has been copied, commit the range we've copied. This
85 * should not fail unless the filesystem has had a fatal error.
86 */
87 ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
88 flags, &iomap);
89
90 return written ? written : ret;
91}
92
93static void
94iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
95{
96 loff_t i_size = i_size_read(inode);
97
98 /*
99 * Only truncate newly allocated pages beyoned EOF, even if the
100 * write started inside the existing inode size.
101 */
102 if (pos + len > i_size)
103 truncate_pagecache_range(inode, max(pos, i_size), pos + len);
104}
105
106static int
107iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
108 struct page **pagep, struct iomap *iomap)
109{
110 pgoff_t index = pos >> PAGE_SHIFT;
111 struct page *page;
112 int status = 0;
113
114 BUG_ON(pos + len > iomap->offset + iomap->length);
115
116 page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
117 if (!page)
118 return -ENOMEM;
119
120 status = __block_write_begin_int(page, pos, len, NULL, iomap);
121 if (unlikely(status)) {
122 unlock_page(page);
123 put_page(page);
124 page = NULL;
125
126 iomap_write_failed(inode, pos, len);
127 }
128
129 *pagep = page;
130 return status;
131}
132
133static int
134iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
135 unsigned copied, struct page *page)
136{
137 int ret;
138
139 ret = generic_write_end(NULL, inode->i_mapping, pos, len,
140 copied, page, NULL);
141 if (ret < len)
142 iomap_write_failed(inode, pos, len);
143 return ret;
144}
145
146static loff_t
147iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
148 struct iomap *iomap)
149{
150 struct iov_iter *i = data;
151 long status = 0;
152 ssize_t written = 0;
153 unsigned int flags = AOP_FLAG_NOFS;
154
155 /*
156 * Copies from kernel address space cannot fail (NFSD is a big user).
157 */
158 if (!iter_is_iovec(i))
159 flags |= AOP_FLAG_UNINTERRUPTIBLE;
160
161 do {
162 struct page *page;
163 unsigned long offset; /* Offset into pagecache page */
164 unsigned long bytes; /* Bytes to write to page */
165 size_t copied; /* Bytes copied from user */
166
167 offset = (pos & (PAGE_SIZE - 1));
168 bytes = min_t(unsigned long, PAGE_SIZE - offset,
169 iov_iter_count(i));
170again:
171 if (bytes > length)
172 bytes = length;
173
174 /*
175 * Bring in the user page that we will copy from _first_.
176 * Otherwise there's a nasty deadlock on copying from the
177 * same page as we're writing to, without it being marked
178 * up-to-date.
179 *
180 * Not only is this an optimisation, but it is also required
181 * to check that the address is actually valid, when atomic
182 * usercopies are used, below.
183 */
184 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
185 status = -EFAULT;
186 break;
187 }
188
189 status = iomap_write_begin(inode, pos, bytes, flags, &page,
190 iomap);
191 if (unlikely(status))
192 break;
193
194 if (mapping_writably_mapped(inode->i_mapping))
195 flush_dcache_page(page);
196
197 pagefault_disable();
198 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
199 pagefault_enable();
200
201 flush_dcache_page(page);
202 mark_page_accessed(page);
203
204 status = iomap_write_end(inode, pos, bytes, copied, page);
205 if (unlikely(status < 0))
206 break;
207 copied = status;
208
209 cond_resched();
210
211 iov_iter_advance(i, copied);
212 if (unlikely(copied == 0)) {
213 /*
214 * If we were unable to copy any data at all, we must
215 * fall back to a single segment length write.
216 *
217 * If we didn't fallback here, we could livelock
218 * because not all segments in the iov can be copied at
219 * once without a pagefault.
220 */
221 bytes = min_t(unsigned long, PAGE_SIZE - offset,
222 iov_iter_single_seg_count(i));
223 goto again;
224 }
225 pos += copied;
226 written += copied;
227 length -= copied;
228
229 balance_dirty_pages_ratelimited(inode->i_mapping);
230 } while (iov_iter_count(i) && length);
231
232 return written ? written : status;
233}
234
235ssize_t
236iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
237 struct iomap_ops *ops)
238{
239 struct inode *inode = iocb->ki_filp->f_mapping->host;
240 loff_t pos = iocb->ki_pos, ret = 0, written = 0;
241
242 while (iov_iter_count(iter)) {
243 ret = iomap_apply(inode, pos, iov_iter_count(iter),
244 IOMAP_WRITE, ops, iter, iomap_write_actor);
245 if (ret <= 0)
246 break;
247 pos += ret;
248 written += ret;
249 }
250
251 return written ? written : ret;
252}
253EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
254
255static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
256 unsigned bytes, struct iomap *iomap)
257{
258 struct page *page;
259 int status;
260
261 status = iomap_write_begin(inode, pos, bytes,
262 AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
263 if (status)
264 return status;
265
266 zero_user(page, offset, bytes);
267 mark_page_accessed(page);
268
269 return iomap_write_end(inode, pos, bytes, bytes, page);
270}
271
272static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
273 struct iomap *iomap)
274{
275 sector_t sector = iomap->blkno +
276 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
277
278 return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
279}
280
281static loff_t
282iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
283 void *data, struct iomap *iomap)
284{
285 bool *did_zero = data;
286 loff_t written = 0;
287 int status;
288
289 /* already zeroed? we're done. */
290 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
291 return count;
292
293 do {
294 unsigned offset, bytes;
295
296 offset = pos & (PAGE_SIZE - 1); /* Within page */
297 bytes = min_t(unsigned, PAGE_SIZE - offset, count);
298
299 if (IS_DAX(inode))
300 status = iomap_dax_zero(pos, offset, bytes, iomap);
301 else
302 status = iomap_zero(inode, pos, offset, bytes, iomap);
303 if (status < 0)
304 return status;
305
306 pos += bytes;
307 count -= bytes;
308 written += bytes;
309 if (did_zero)
310 *did_zero = true;
311 } while (count > 0);
312
313 return written;
314}
315
316int
317iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
318 struct iomap_ops *ops)
319{
320 loff_t ret;
321
322 while (len > 0) {
323 ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
324 ops, did_zero, iomap_zero_range_actor);
325 if (ret <= 0)
326 return ret;
327
328 pos += ret;
329 len -= ret;
330 }
331
332 return 0;
333}
334EXPORT_SYMBOL_GPL(iomap_zero_range);
335
336int
337iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
338 struct iomap_ops *ops)
339{
340 unsigned blocksize = (1 << inode->i_blkbits);
341 unsigned off = pos & (blocksize - 1);
342
343 /* Block boundary? Nothing to do */
344 if (!off)
345 return 0;
346 return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
347}
348EXPORT_SYMBOL_GPL(iomap_truncate_page);
349
350static loff_t
351iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
352 void *data, struct iomap *iomap)
353{
354 struct page *page = data;
355 int ret;
356
357 ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
358 NULL, iomap);
359 if (ret)
360 return ret;
361
362 block_commit_write(page, 0, length);
363 return length;
364}
365
366int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
367 struct iomap_ops *ops)
368{
369 struct page *page = vmf->page;
370 struct inode *inode = file_inode(vma->vm_file);
371 unsigned long length;
372 loff_t offset, size;
373 ssize_t ret;
374
375 lock_page(page);
376 size = i_size_read(inode);
377 if ((page->mapping != inode->i_mapping) ||
378 (page_offset(page) > size)) {
379 /* We overload EFAULT to mean page got truncated */
380 ret = -EFAULT;
381 goto out_unlock;
382 }
383
384 /* page is wholly or partially inside EOF */
385 if (((page->index + 1) << PAGE_SHIFT) > size)
386 length = size & ~PAGE_MASK;
387 else
388 length = PAGE_SIZE;
389
390 offset = page_offset(page);
391 while (length > 0) {
392 ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
393 ops, page, iomap_page_mkwrite_actor);
394 if (unlikely(ret <= 0))
395 goto out_unlock;
396 offset += ret;
397 length -= ret;
398 }
399
400 set_page_dirty(page);
401 wait_for_stable_page(page);
402 return 0;
403out_unlock:
404 unlock_page(page);
405 return ret;
406}
407EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
408
409struct fiemap_ctx {
410 struct fiemap_extent_info *fi;
411 struct iomap prev;
412};
413
414static int iomap_to_fiemap(struct fiemap_extent_info *fi,
415 struct iomap *iomap, u32 flags)
416{
417 switch (iomap->type) {
418 case IOMAP_HOLE:
419 /* skip holes */
420 return 0;
421 case IOMAP_DELALLOC:
422 flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
423 break;
424 case IOMAP_UNWRITTEN:
425 flags |= FIEMAP_EXTENT_UNWRITTEN;
426 break;
427 case IOMAP_MAPPED:
428 break;
429 }
430
431 return fiemap_fill_next_extent(fi, iomap->offset,
432 iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
433 iomap->length, flags | FIEMAP_EXTENT_MERGED);
434
435}
436
437static loff_t
438iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
439 struct iomap *iomap)
440{
441 struct fiemap_ctx *ctx = data;
442 loff_t ret = length;
443
444 if (iomap->type == IOMAP_HOLE)
445 return length;
446
447 ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
448 ctx->prev = *iomap;
449 switch (ret) {
450 case 0: /* success */
451 return length;
452 case 1: /* extent array full */
453 return 0;
454 default:
455 return ret;
456 }
457}
458
459int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
460 loff_t start, loff_t len, struct iomap_ops *ops)
461{
462 struct fiemap_ctx ctx;
463 loff_t ret;
464
465 memset(&ctx, 0, sizeof(ctx));
466 ctx.fi = fi;
467 ctx.prev.type = IOMAP_HOLE;
468
469 ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
470 if (ret)
471 return ret;
472
473 ret = filemap_write_and_wait(inode->i_mapping);
474 if (ret)
475 return ret;
476
477 while (len > 0) {
478 ret = iomap_apply(inode, start, len, 0, ops, &ctx,
479 iomap_fiemap_actor);
480 if (ret < 0)
481 return ret;
482 if (ret == 0)
483 break;
484
485 start += ret;
486 len -= ret;
487 }
488
489 if (ctx.prev.type != IOMAP_HOLE) {
490 ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
491 if (ret < 0)
492 return ret;
493 }
494
495 return 0;
496}
497EXPORT_SYMBOL_GPL(iomap_fiemap);
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 31f3df193bdb..ad2c05e80a83 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
2 * Copyright (c) 2014-2016 Christoph Hellwig. 2 * Copyright (c) 2014-2016 Christoph Hellwig.
3 */ 3 */
4#include <linux/exportfs.h> 4#include <linux/exportfs.h>
5#include <linux/iomap.h>
5#include <linux/genhd.h> 6#include <linux/genhd.h>
6#include <linux/slab.h> 7#include <linux/slab.h>
7#include <linux/pr.h> 8#include <linux/pr.h>
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6c3b316f932e..4ebaaf4b8d8a 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/sunrpc/svc.h> 4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h> 5#include <linux/exportfs.h>
6#include <linux/iomap.h>
6#include <linux/nfs4.h> 7#include <linux/nfs4.h>
7 8
8#include "nfsd.h" 9#include "nfsd.h"
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4df61ea..35faf128f36d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
4 depends on (64BIT || LBDAF) 4 depends on (64BIT || LBDAF)
5 select EXPORTFS 5 select EXPORTFS
6 select LIBCRC32C 6 select LIBCRC32C
7 select FS_IOMAP
7 help 8 help
8 XFS is a high performance journaling filesystem which originated 9 XFS is a high performance journaling filesystem which originated
9 on the SGI IRIX platform. It is completely multi-threaded, can 10 on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a708e38b494c..88c26b827a2d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -84,7 +84,7 @@ xfs_alloc_lookup_ge(
84 * Lookup the first record less than or equal to [bno, len] 84 * Lookup the first record less than or equal to [bno, len]
85 * in the btree given by cur. 85 * in the btree given by cur.
86 */ 86 */
87int /* error */ 87static int /* error */
88xfs_alloc_lookup_le( 88xfs_alloc_lookup_le(
89 struct xfs_btree_cur *cur, /* btree cursor */ 89 struct xfs_btree_cur *cur, /* btree cursor */
90 xfs_agblock_t bno, /* starting block of extent */ 90 xfs_agblock_t bno, /* starting block of extent */
@@ -1839,19 +1839,8 @@ void
1839xfs_alloc_compute_maxlevels( 1839xfs_alloc_compute_maxlevels(
1840 xfs_mount_t *mp) /* file system mount structure */ 1840 xfs_mount_t *mp) /* file system mount structure */
1841{ 1841{
1842 int level; 1842 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
1843 uint maxblocks; 1843 (mp->m_sb.sb_agblocks + 1) / 2);
1844 uint maxleafents;
1845 int minleafrecs;
1846 int minnoderecs;
1847
1848 maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
1849 minleafrecs = mp->m_alloc_mnr[0];
1850 minnoderecs = mp->m_alloc_mnr[1];
1851 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1852 for (level = 1; maxblocks > 1; level++)
1853 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1854 mp->m_ag_maxlevels = level;
1855} 1844}
1856 1845
1857/* 1846/*
@@ -2658,55 +2647,79 @@ error0:
2658 return error; 2647 return error;
2659} 2648}
2660 2649
2661/* 2650/* Ensure that the freelist is at full capacity. */
2662 * Free an extent. 2651int
2663 * Just break up the extent address and hand off to xfs_free_ag_extent 2652xfs_free_extent_fix_freelist(
2664 * after fixing up the freelist. 2653 struct xfs_trans *tp,
2665 */ 2654 xfs_agnumber_t agno,
2666int /* error */ 2655 struct xfs_buf **agbp)
2667xfs_free_extent(
2668 xfs_trans_t *tp, /* transaction pointer */
2669 xfs_fsblock_t bno, /* starting block number of extent */
2670 xfs_extlen_t len) /* length of extent */
2671{ 2656{
2672 xfs_alloc_arg_t args; 2657 struct xfs_alloc_arg args;
2673 int error; 2658 int error;
2674 2659
2675 ASSERT(len != 0); 2660 memset(&args, 0, sizeof(struct xfs_alloc_arg));
2676 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2677 args.tp = tp; 2661 args.tp = tp;
2678 args.mp = tp->t_mountp; 2662 args.mp = tp->t_mountp;
2663 args.agno = agno;
2679 2664
2680 /* 2665 /*
2681 * validate that the block number is legal - the enables us to detect 2666 * validate that the block number is legal - the enables us to detect
2682 * and handle a silent filesystem corruption rather than crashing. 2667 * and handle a silent filesystem corruption rather than crashing.
2683 */ 2668 */
2684 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2685 if (args.agno >= args.mp->m_sb.sb_agcount) 2669 if (args.agno >= args.mp->m_sb.sb_agcount)
2686 return -EFSCORRUPTED; 2670 return -EFSCORRUPTED;
2687 2671
2688 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2689 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2690 return -EFSCORRUPTED;
2691
2692 args.pag = xfs_perag_get(args.mp, args.agno); 2672 args.pag = xfs_perag_get(args.mp, args.agno);
2693 ASSERT(args.pag); 2673 ASSERT(args.pag);
2694 2674
2695 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 2675 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2696 if (error) 2676 if (error)
2697 goto error0; 2677 goto out;
2678
2679 *agbp = args.agbp;
2680out:
2681 xfs_perag_put(args.pag);
2682 return error;
2683}
2684
2685/*
2686 * Free an extent.
2687 * Just break up the extent address and hand off to xfs_free_ag_extent
2688 * after fixing up the freelist.
2689 */
2690int /* error */
2691xfs_free_extent(
2692 struct xfs_trans *tp, /* transaction pointer */
2693 xfs_fsblock_t bno, /* starting block number of extent */
2694 xfs_extlen_t len) /* length of extent */
2695{
2696 struct xfs_mount *mp = tp->t_mountp;
2697 struct xfs_buf *agbp;
2698 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
2699 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
2700 int error;
2701
2702 ASSERT(len != 0);
2703
2704 error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
2705 if (error)
2706 return error;
2707
2708 XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
2698 2709
2699 /* validate the extent size is legal now we have the agf locked */ 2710 /* validate the extent size is legal now we have the agf locked */
2700 if (args.agbno + len > 2711 XFS_WANT_CORRUPTED_GOTO(mp,
2701 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { 2712 agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
2702 error = -EFSCORRUPTED; 2713 err);
2703 goto error0;
2704 }
2705 2714
2706 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2715 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
2707 if (!error) 2716 if (error)
2708 xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); 2717 goto err;
2709error0: 2718
2710 xfs_perag_put(args.pag); 2719 xfs_extent_busy_insert(tp, agno, agbno, len, 0);
2720 return 0;
2721
2722err:
2723 xfs_trans_brelse(tp, agbp);
2711 return error; 2724 return error;
2712} 2725}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 135eb3d24db7..cf268b2d0b6c 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -212,13 +212,6 @@ xfs_free_extent(
212 xfs_fsblock_t bno, /* starting block number of extent */ 212 xfs_fsblock_t bno, /* starting block number of extent */
213 xfs_extlen_t len); /* length of extent */ 213 xfs_extlen_t len); /* length of extent */
214 214
215int /* error */
216xfs_alloc_lookup_le(
217 struct xfs_btree_cur *cur, /* btree cursor */
218 xfs_agblock_t bno, /* starting block of extent */
219 xfs_extlen_t len, /* length of extent */
220 int *stat); /* success/failure */
221
222int /* error */ 215int /* error */
223xfs_alloc_lookup_ge( 216xfs_alloc_lookup_ge(
224 struct xfs_btree_cur *cur, /* btree cursor */ 217 struct xfs_btree_cur *cur, /* btree cursor */
@@ -236,5 +229,7 @@ xfs_alloc_get_rec(
236int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, 229int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
237 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); 230 xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
238int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); 231int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
232int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
233 struct xfs_buf **agbp);
239 234
240#endif /* __XFS_ALLOC_H__ */ 235#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 882c8d338891..4f2aed04f827 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args);
50int xfs_attr_shortform_getvalue(struct xfs_da_args *args); 50int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
51int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); 51int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
52int xfs_attr_shortform_remove(struct xfs_da_args *args); 52int xfs_attr_shortform_remove(struct xfs_da_args *args);
53int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
54int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); 53int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
55int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); 54int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
56void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); 55void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
88void xfs_attr3_leaf_unbalance(struct xfs_da_state *state, 87void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
89 struct xfs_da_state_blk *drop_blk, 88 struct xfs_da_state_blk *drop_blk,
90 struct xfs_da_state_blk *save_blk); 89 struct xfs_da_state_blk *save_blk);
91int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
92
93/* 90/*
94 * Utility routines. 91 * Utility routines.
95 */ 92 */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 932381caef1b..2f2c85cc8117 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -570,14 +570,12 @@ xfs_bmap_validate_ret(
570 */ 570 */
571void 571void
572xfs_bmap_add_free( 572xfs_bmap_add_free(
573 struct xfs_mount *mp, /* mount point structure */
574 struct xfs_bmap_free *flist, /* list of extents */
573 xfs_fsblock_t bno, /* fs block number of extent */ 575 xfs_fsblock_t bno, /* fs block number of extent */
574 xfs_filblks_t len, /* length of extent */ 576 xfs_filblks_t len) /* length of extent */
575 xfs_bmap_free_t *flist, /* list of extents */
576 xfs_mount_t *mp) /* mount point structure */
577{ 577{
578 xfs_bmap_free_item_t *cur; /* current (next) element */ 578 struct xfs_bmap_free_item *new; /* new element */
579 xfs_bmap_free_item_t *new; /* new element */
580 xfs_bmap_free_item_t *prev; /* previous element */
581#ifdef DEBUG 579#ifdef DEBUG
582 xfs_agnumber_t agno; 580 xfs_agnumber_t agno;
583 xfs_agblock_t agbno; 581 xfs_agblock_t agbno;
@@ -597,17 +595,7 @@ xfs_bmap_add_free(
597 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 595 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
598 new->xbfi_startblock = bno; 596 new->xbfi_startblock = bno;
599 new->xbfi_blockcount = (xfs_extlen_t)len; 597 new->xbfi_blockcount = (xfs_extlen_t)len;
600 for (prev = NULL, cur = flist->xbf_first; 598 list_add(&new->xbfi_list, &flist->xbf_flist);
601 cur != NULL;
602 prev = cur, cur = cur->xbfi_next) {
603 if (cur->xbfi_startblock >= bno)
604 break;
605 }
606 if (prev)
607 prev->xbfi_next = new;
608 else
609 flist->xbf_first = new;
610 new->xbfi_next = cur;
611 flist->xbf_count++; 599 flist->xbf_count++;
612} 600}
613 601
@@ -617,14 +605,10 @@ xfs_bmap_add_free(
617 */ 605 */
618void 606void
619xfs_bmap_del_free( 607xfs_bmap_del_free(
620 xfs_bmap_free_t *flist, /* free item list header */ 608 struct xfs_bmap_free *flist, /* free item list header */
621 xfs_bmap_free_item_t *prev, /* previous item on list, if any */ 609 struct xfs_bmap_free_item *free) /* list item to be freed */
622 xfs_bmap_free_item_t *free) /* list item to be freed */
623{ 610{
624 if (prev) 611 list_del(&free->xbfi_list);
625 prev->xbfi_next = free->xbfi_next;
626 else
627 flist->xbf_first = free->xbfi_next;
628 flist->xbf_count--; 612 flist->xbf_count--;
629 kmem_zone_free(xfs_bmap_free_item_zone, free); 613 kmem_zone_free(xfs_bmap_free_item_zone, free);
630} 614}
@@ -634,17 +618,16 @@ xfs_bmap_del_free(
634 */ 618 */
635void 619void
636xfs_bmap_cancel( 620xfs_bmap_cancel(
637 xfs_bmap_free_t *flist) /* list of bmap_free_items */ 621 struct xfs_bmap_free *flist) /* list of bmap_free_items */
638{ 622{
639 xfs_bmap_free_item_t *free; /* free list item */ 623 struct xfs_bmap_free_item *free; /* free list item */
640 xfs_bmap_free_item_t *next;
641 624
642 if (flist->xbf_count == 0) 625 if (flist->xbf_count == 0)
643 return; 626 return;
644 ASSERT(flist->xbf_first != NULL); 627 while (!list_empty(&flist->xbf_flist)) {
645 for (free = flist->xbf_first; free; free = next) { 628 free = list_first_entry(&flist->xbf_flist,
646 next = free->xbfi_next; 629 struct xfs_bmap_free_item, xbfi_list);
647 xfs_bmap_del_free(flist, NULL, free); 630 xfs_bmap_del_free(flist, free);
648 } 631 }
649 ASSERT(flist->xbf_count == 0); 632 ASSERT(flist->xbf_count == 0);
650} 633}
@@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents(
699 cblock = XFS_BUF_TO_BLOCK(cbp); 682 cblock = XFS_BUF_TO_BLOCK(cbp);
700 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 683 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
701 return error; 684 return error;
702 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); 685 xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
703 ip->i_d.di_nblocks--; 686 ip->i_d.di_nblocks--;
704 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); 687 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
705 xfs_trans_binval(tp, cbp); 688 xfs_trans_binval(tp, cbp);
@@ -5073,8 +5056,8 @@ xfs_bmap_del_extent(
5073 * If we need to, add to list of extents to delete. 5056 * If we need to, add to list of extents to delete.
5074 */ 5057 */
5075 if (do_fx) 5058 if (do_fx)
5076 xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, 5059 xfs_bmap_add_free(mp, flist, del->br_startblock,
5077 mp); 5060 del->br_blockcount);
5078 /* 5061 /*
5079 * Adjust inode # blocks in the file. 5062 * Adjust inode # blocks in the file.
5080 */ 5063 */
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 423a34e832bd..f1f3ae6c0a3f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -62,12 +62,12 @@ struct xfs_bmalloca {
62 * List of extents to be free "later". 62 * List of extents to be free "later".
63 * The list is kept sorted on xbf_startblock. 63 * The list is kept sorted on xbf_startblock.
64 */ 64 */
65typedef struct xfs_bmap_free_item 65struct xfs_bmap_free_item
66{ 66{
67 xfs_fsblock_t xbfi_startblock;/* starting fs block number */ 67 xfs_fsblock_t xbfi_startblock;/* starting fs block number */
68 xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */ 68 xfs_extlen_t xbfi_blockcount;/* number of blocks in extent */
69 struct xfs_bmap_free_item *xbfi_next; /* link to next entry */ 69 struct list_head xbfi_list;
70} xfs_bmap_free_item_t; 70};
71 71
72/* 72/*
73 * Header for free extent list. 73 * Header for free extent list.
@@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item
85 */ 85 */
86typedef struct xfs_bmap_free 86typedef struct xfs_bmap_free
87{ 87{
88 xfs_bmap_free_item_t *xbf_first; /* list of to-be-free extents */ 88 struct list_head xbf_flist; /* list of to-be-free extents */
89 int xbf_count; /* count of items on list */ 89 int xbf_count; /* count of items on list */
90 int xbf_low; /* alloc in low mode */ 90 int xbf_low; /* alloc in low mode */
91} xfs_bmap_free_t; 91} xfs_bmap_free_t;
@@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w)
141 141
142static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) 142static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
143{ 143{
144 ((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \ 144 INIT_LIST_HEAD(&flp->xbf_flist);
145 (flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK); 145 flp->xbf_count = 0;
146 flp->xbf_low = 0;
147 *fbp = NULLFSBLOCK;
146} 148}
147 149
148/* 150/*
@@ -191,8 +193,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
191 193
192int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); 194int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
193void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); 195void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
194void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, 196void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
195 struct xfs_bmap_free *flist, struct xfs_mount *mp); 197 xfs_fsblock_t bno, xfs_filblks_t len);
196void xfs_bmap_cancel(struct xfs_bmap_free *flist); 198void xfs_bmap_cancel(struct xfs_bmap_free *flist);
197int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, 199int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
198 struct xfs_inode *ip); 200 struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6282f6e708af..db0c71e470c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -526,7 +526,7 @@ xfs_bmbt_free_block(
526 struct xfs_trans *tp = cur->bc_tp; 526 struct xfs_trans *tp = cur->bc_tp;
527 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); 527 xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
528 528
529 xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp); 529 xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
530 ip->i_d.di_nblocks--; 530 ip->i_d.di_nblocks--;
531 531
532 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 532 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 1f88e1ce770f..07eeb0b4ca74 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -543,12 +543,12 @@ xfs_btree_ptr_addr(
543 */ 543 */
544STATIC struct xfs_btree_block * 544STATIC struct xfs_btree_block *
545xfs_btree_get_iroot( 545xfs_btree_get_iroot(
546 struct xfs_btree_cur *cur) 546 struct xfs_btree_cur *cur)
547{ 547{
548 struct xfs_ifork *ifp; 548 struct xfs_ifork *ifp;
549 549
550 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); 550 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
551 return (struct xfs_btree_block *)ifp->if_broot; 551 return (struct xfs_btree_block *)ifp->if_broot;
552} 552}
553 553
554/* 554/*
@@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify(
4152 4152
4153 return true; 4153 return true;
4154} 4154}
4155
4156/*
4157 * Calculate the number of btree levels needed to store a given number of
4158 * records in a short-format btree.
4159 */
4160uint
4161xfs_btree_compute_maxlevels(
4162 struct xfs_mount *mp,
4163 uint *limits,
4164 unsigned long len)
4165{
4166 uint level;
4167 unsigned long maxblocks;
4168
4169 maxblocks = (len + limits[0] - 1) / limits[0];
4170 for (level = 1; maxblocks > 1; level++)
4171 maxblocks = (maxblocks + limits[1] - 1) / limits[1];
4172 return level;
4173}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 2e874be70209..785a99682159 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
474 474
475bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); 475bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
476bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); 476bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
477uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
478 unsigned long len);
477 479
478#endif /* __XFS_BTREE_H__ */ 480#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 097bf7717d80..0f1f165f4048 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -356,7 +356,6 @@ xfs_da3_split(
356 struct xfs_da_state_blk *newblk; 356 struct xfs_da_state_blk *newblk;
357 struct xfs_da_state_blk *addblk; 357 struct xfs_da_state_blk *addblk;
358 struct xfs_da_intnode *node; 358 struct xfs_da_intnode *node;
359 struct xfs_buf *bp;
360 int max; 359 int max;
361 int action = 0; 360 int action = 0;
362 int error; 361 int error;
@@ -397,7 +396,9 @@ xfs_da3_split(
397 break; 396 break;
398 } 397 }
399 /* 398 /*
400 * Entry wouldn't fit, split the leaf again. 399 * Entry wouldn't fit, split the leaf again. The new
400 * extrablk will be consumed by xfs_da3_node_split if
401 * the node is split.
401 */ 402 */
402 state->extravalid = 1; 403 state->extravalid = 1;
403 if (state->inleaf) { 404 if (state->inleaf) {
@@ -446,6 +447,14 @@ xfs_da3_split(
446 return 0; 447 return 0;
447 448
448 /* 449 /*
450 * xfs_da3_node_split() should have consumed any extra blocks we added
451 * during a double leaf split in the attr fork. This is guaranteed as
452 * we can't be here if the attr fork only has a single leaf block.
453 */
454 ASSERT(state->extravalid == 0 ||
455 state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
456
457 /*
449 * Split the root node. 458 * Split the root node.
450 */ 459 */
451 ASSERT(state->path.active == 0); 460 ASSERT(state->path.active == 0);
@@ -457,43 +466,33 @@ xfs_da3_split(
457 } 466 }
458 467
459 /* 468 /*
460 * Update pointers to the node which used to be block 0 and 469 * Update pointers to the node which used to be block 0 and just got
461 * just got bumped because of the addition of a new root node. 470 * bumped because of the addition of a new root node. Note that the
462 * There might be three blocks involved if a double split occurred, 471 * original block 0 could be at any position in the list of blocks in
463 * and the original block 0 could be at any position in the list. 472 * the tree.
464 * 473 *
465 * Note: the magic numbers and sibling pointers are in the same 474 * Note: the magic numbers and sibling pointers are in the same physical
466 * physical place for both v2 and v3 headers (by design). Hence it 475 * place for both v2 and v3 headers (by design). Hence it doesn't matter
467 * doesn't matter which version of the xfs_da_intnode structure we use 476 * which version of the xfs_da_intnode structure we use here as the
468 * here as the result will be the same using either structure. 477 * result will be the same using either structure.
469 */ 478 */
470 node = oldblk->bp->b_addr; 479 node = oldblk->bp->b_addr;
471 if (node->hdr.info.forw) { 480 if (node->hdr.info.forw) {
472 if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { 481 ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
473 bp = addblk->bp; 482 node = addblk->bp->b_addr;
474 } else {
475 ASSERT(state->extravalid);
476 bp = state->extrablk.bp;
477 }
478 node = bp->b_addr;
479 node->hdr.info.back = cpu_to_be32(oldblk->blkno); 483 node->hdr.info.back = cpu_to_be32(oldblk->blkno);
480 xfs_trans_log_buf(state->args->trans, bp, 484 xfs_trans_log_buf(state->args->trans, addblk->bp,
481 XFS_DA_LOGRANGE(node, &node->hdr.info, 485 XFS_DA_LOGRANGE(node, &node->hdr.info,
482 sizeof(node->hdr.info))); 486 sizeof(node->hdr.info)));
483 } 487 }
484 node = oldblk->bp->b_addr; 488 node = oldblk->bp->b_addr;
485 if (node->hdr.info.back) { 489 if (node->hdr.info.back) {
486 if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { 490 ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
487 bp = addblk->bp; 491 node = addblk->bp->b_addr;
488 } else {
489 ASSERT(state->extravalid);
490 bp = state->extrablk.bp;
491 }
492 node = bp->b_addr;
493 node->hdr.info.forw = cpu_to_be32(oldblk->blkno); 492 node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
494 xfs_trans_log_buf(state->args->trans, bp, 493 xfs_trans_log_buf(state->args->trans, addblk->bp,
495 XFS_DA_LOGRANGE(node, &node->hdr.info, 494 XFS_DA_LOGRANGE(node, &node->hdr.info,
496 sizeof(node->hdr.info))); 495 sizeof(node->hdr.info)));
497 } 496 }
498 addblk->bp = NULL; 497 addblk->bp = NULL;
499 return 0; 498 return 0;
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 9d624a622946..f1e8d4dbb600 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
40 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ 40 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
41 41
42 count += len; /* name */ 42 count += len; /* name */
43 count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : 43 count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
44 sizeof(xfs_dir2_ino4_t); /* ino # */
45 return count; 44 return count;
46} 45}
47 46
@@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
125static xfs_ino_t 124static xfs_ino_t
126xfs_dir2_sf_get_ino( 125xfs_dir2_sf_get_ino(
127 struct xfs_dir2_sf_hdr *hdr, 126 struct xfs_dir2_sf_hdr *hdr,
128 xfs_dir2_inou_t *from) 127 __uint8_t *from)
129{ 128{
130 if (hdr->i8count) 129 if (hdr->i8count)
131 return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; 130 return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
132 else 131 else
133 return get_unaligned_be32(&from->i4.i); 132 return get_unaligned_be32(from);
134} 133}
135 134
136static void 135static void
137xfs_dir2_sf_put_ino( 136xfs_dir2_sf_put_ino(
138 struct xfs_dir2_sf_hdr *hdr, 137 struct xfs_dir2_sf_hdr *hdr,
139 xfs_dir2_inou_t *to, 138 __uint8_t *to,
140 xfs_ino_t ino) 139 xfs_ino_t ino)
141{ 140{
142 ASSERT((ino & 0xff00000000000000ULL) == 0); 141 ASSERT((ino & 0xff00000000000000ULL) == 0);
143 142
144 if (hdr->i8count) 143 if (hdr->i8count)
145 put_unaligned_be64(ino, &to->i8.i); 144 put_unaligned_be64(ino, to);
146 else 145 else
147 put_unaligned_be32(ino, &to->i4.i); 146 put_unaligned_be32(ino, to);
148} 147}
149 148
150static xfs_ino_t 149static xfs_ino_t
151xfs_dir2_sf_get_parent_ino( 150xfs_dir2_sf_get_parent_ino(
152 struct xfs_dir2_sf_hdr *hdr) 151 struct xfs_dir2_sf_hdr *hdr)
153{ 152{
154 return xfs_dir2_sf_get_ino(hdr, &hdr->parent); 153 return xfs_dir2_sf_get_ino(hdr, hdr->parent);
155} 154}
156 155
157static void 156static void
@@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
159 struct xfs_dir2_sf_hdr *hdr, 158 struct xfs_dir2_sf_hdr *hdr,
160 xfs_ino_t ino) 159 xfs_ino_t ino)
161{ 160{
162 xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); 161 xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
163} 162}
164 163
165/* 164/*
@@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
173 struct xfs_dir2_sf_hdr *hdr, 172 struct xfs_dir2_sf_hdr *hdr,
174 struct xfs_dir2_sf_entry *sfep) 173 struct xfs_dir2_sf_entry *sfep)
175{ 174{
176 return xfs_dir2_sf_get_ino(hdr, 175 return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
177 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
178} 176}
179 177
180static void 178static void
@@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
183 struct xfs_dir2_sf_entry *sfep, 181 struct xfs_dir2_sf_entry *sfep,
184 xfs_ino_t ino) 182 xfs_ino_t ino)
185{ 183{
186 xfs_dir2_sf_put_ino(hdr, 184 xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
187 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
188} 185}
189 186
190static xfs_ino_t 187static xfs_ino_t
@@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
192 struct xfs_dir2_sf_hdr *hdr, 189 struct xfs_dir2_sf_hdr *hdr,
193 struct xfs_dir2_sf_entry *sfep) 190 struct xfs_dir2_sf_entry *sfep)
194{ 191{
195 return xfs_dir2_sf_get_ino(hdr, 192 return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
196 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
197} 193}
198 194
199static void 195static void
@@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
202 struct xfs_dir2_sf_entry *sfep, 198 struct xfs_dir2_sf_entry *sfep,
203 xfs_ino_t ino) 199 xfs_ino_t ino)
204{ 200{
205 xfs_dir2_sf_put_ino(hdr, 201 xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
206 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
207} 202}
208 203
209 204
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 8d4d8bce41bf..685f23b67056 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t;
192typedef uint xfs_dir2_data_aoff_t; /* argument form */ 192typedef uint xfs_dir2_data_aoff_t; /* argument form */
193 193
194/* 194/*
195 * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
196 * Only need 16 bits, this is the byte offset into the single block form.
197 */
198typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
199
200/*
201 * Offset in data space of a data entry. 195 * Offset in data space of a data entry.
202 */ 196 */
203typedef __uint32_t xfs_dir2_dataptr_t; 197typedef __uint32_t xfs_dir2_dataptr_t;
@@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t;
214 */ 208 */
215typedef __uint32_t xfs_dir2_db_t; 209typedef __uint32_t xfs_dir2_db_t;
216 210
217/* 211#define XFS_INO32_SIZE 4
218 * Inode number stored as 8 8-bit values. 212#define XFS_INO64_SIZE 8
219 */ 213#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
220typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
221
222/*
223 * Inode number stored as 4 8-bit values.
224 * Works a lot of the time, when all the inode numbers in a directory
225 * fit in 32 bits.
226 */
227typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
228 214
229typedef union {
230 xfs_dir2_ino8_t i8;
231 xfs_dir2_ino4_t i4;
232} xfs_dir2_inou_t;
233#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) 215#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
234 216
235/* 217/*
@@ -246,39 +228,38 @@ typedef union {
246typedef struct xfs_dir2_sf_hdr { 228typedef struct xfs_dir2_sf_hdr {
247 __uint8_t count; /* count of entries */ 229 __uint8_t count; /* count of entries */
248 __uint8_t i8count; /* count of 8-byte inode #s */ 230 __uint8_t i8count; /* count of 8-byte inode #s */
249 xfs_dir2_inou_t parent; /* parent dir inode number */ 231 __uint8_t parent[8]; /* parent dir inode number */
250} __arch_pack xfs_dir2_sf_hdr_t; 232} __packed xfs_dir2_sf_hdr_t;
251 233
252typedef struct xfs_dir2_sf_entry { 234typedef struct xfs_dir2_sf_entry {
253 __u8 namelen; /* actual name length */ 235 __u8 namelen; /* actual name length */
254 xfs_dir2_sf_off_t offset; /* saved offset */ 236 __u8 offset[2]; /* saved offset */
255 __u8 name[]; /* name, variable size */ 237 __u8 name[]; /* name, variable size */
256 /* 238 /*
257 * A single byte containing the file type field follows the inode 239 * A single byte containing the file type field follows the inode
258 * number for version 3 directory entries. 240 * number for version 3 directory entries.
259 * 241 *
260 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a 242 * A 64-bit or 32-bit inode number follows here, at a variable offset
261 * variable offset after the name. 243 * after the name.
262 */ 244 */
263} __arch_pack xfs_dir2_sf_entry_t; 245} xfs_dir2_sf_entry_t;
264 246
265static inline int xfs_dir2_sf_hdr_size(int i8count) 247static inline int xfs_dir2_sf_hdr_size(int i8count)
266{ 248{
267 return sizeof(struct xfs_dir2_sf_hdr) - 249 return sizeof(struct xfs_dir2_sf_hdr) -
268 (i8count == 0) * 250 (i8count == 0) * XFS_INO64_DIFF;
269 (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
270} 251}
271 252
272static inline xfs_dir2_data_aoff_t 253static inline xfs_dir2_data_aoff_t
273xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) 254xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
274{ 255{
275 return get_unaligned_be16(&sfep->offset.i); 256 return get_unaligned_be16(sfep->offset);
276} 257}
277 258
278static inline void 259static inline void
279xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) 260xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
280{ 261{
281 put_unaligned_be16(off, &sfep->offset.i); 262 put_unaligned_be16(off, sfep->offset);
282} 263}
283 264
284static inline struct xfs_dir2_sf_entry * 265static inline struct xfs_dir2_sf_entry *
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e5bb9cc3b243..c6809ff41197 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
126 /* 126 /*
127 * Calculate the new size, see if we should give up yet. 127 * Calculate the new size, see if we should give up yet.
128 */ 128 */
129 size = xfs_dir2_sf_hdr_size(i8count) + /* header */ 129 size = xfs_dir2_sf_hdr_size(i8count) + /* header */
130 count + /* namelen */ 130 count * 3 * sizeof(u8) + /* namelen + offset */
131 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ 131 namelen + /* name */
132 namelen + /* name */ 132 (i8count ? /* inumber */
133 (i8count ? /* inumber */ 133 count * XFS_INO64_SIZE :
134 (uint)sizeof(xfs_dir2_ino8_t) * count : 134 count * XFS_INO32_SIZE);
135 (uint)sizeof(xfs_dir2_ino4_t) * count);
136 if (size > XFS_IFORK_DSIZE(dp)) 135 if (size > XFS_IFORK_DSIZE(dp))
137 return size; /* size value is a failure */ 136 return size; /* size value is a failure */
138 } 137 }
@@ -319,10 +318,7 @@ xfs_dir2_sf_addname(
319 /* 318 /*
320 * Yes, adjust the inode size. old count + (parent + new) 319 * Yes, adjust the inode size. old count + (parent + new)
321 */ 320 */
322 incr_isize += 321 incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
323 (sfp->count + 2) *
324 ((uint)sizeof(xfs_dir2_ino8_t) -
325 (uint)sizeof(xfs_dir2_ino4_t));
326 objchange = 1; 322 objchange = 1;
327 } 323 }
328 324
@@ -897,11 +893,7 @@ xfs_dir2_sf_replace(
897 int error; /* error return value */ 893 int error; /* error return value */
898 int newsize; /* new inode size */ 894 int newsize; /* new inode size */
899 895
900 newsize = 896 newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
901 dp->i_df.if_bytes +
902 (sfp->count + 1) *
903 ((uint)sizeof(xfs_dir2_ino8_t) -
904 (uint)sizeof(xfs_dir2_ino4_t));
905 /* 897 /*
906 * Won't fit as shortform, convert to block then do replace. 898 * Won't fit as shortform, convert to block then do replace.
907 */ 899 */
@@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4(
1022 /* 1014 /*
1023 * Compute the new inode size. 1015 * Compute the new inode size.
1024 */ 1016 */
1025 newsize = 1017 newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
1026 oldsize -
1027 (oldsfp->count + 1) *
1028 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1029 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); 1018 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1030 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); 1019 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1031 /* 1020 /*
@@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4(
1048 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), 1037 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1049 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { 1038 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1050 sfep->namelen = oldsfep->namelen; 1039 sfep->namelen = oldsfep->namelen;
1051 sfep->offset = oldsfep->offset; 1040 memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
1052 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1041 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1053 dp->d_ops->sf_put_ino(sfp, sfep, 1042 dp->d_ops->sf_put_ino(sfp, sfep,
1054 dp->d_ops->sf_get_ino(oldsfp, oldsfep)); 1043 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8(
1098 /* 1087 /*
1099 * Compute the new inode size (nb: entry count + 1 for parent) 1088 * Compute the new inode size (nb: entry count + 1 for parent)
1100 */ 1089 */
1101 newsize = 1090 newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
1102 oldsize +
1103 (oldsfp->count + 1) *
1104 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1105 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); 1091 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1106 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); 1092 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1107 /* 1093 /*
@@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8(
1124 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), 1110 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1125 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { 1111 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1126 sfep->namelen = oldsfep->namelen; 1112 sfep->namelen = oldsfep->namelen;
1127 sfep->offset = oldsfep->offset; 1113 memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
1128 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1114 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1129 dp->d_ops->sf_put_ino(sfp, sfep, 1115 dp->d_ops->sf_put_ino(sfp, sfep,
1130 dp->d_ops->sf_get_ino(oldsfp, oldsfep)); 1116 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index dc97eb21af07..adb204d40f22 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
1435 * with the crc feature bit, and all accesses to them must be conditional on 1435 * with the crc feature bit, and all accesses to them must be conditional on
1436 * that flag. 1436 * that flag.
1437 */ 1437 */
1438/* short form block header */
1439struct xfs_btree_block_shdr {
1440 __be32 bb_leftsib;
1441 __be32 bb_rightsib;
1442
1443 __be64 bb_blkno;
1444 __be64 bb_lsn;
1445 uuid_t bb_uuid;
1446 __be32 bb_owner;
1447 __le32 bb_crc;
1448};
1449
1450/* long form block header */
1451struct xfs_btree_block_lhdr {
1452 __be64 bb_leftsib;
1453 __be64 bb_rightsib;
1454
1455 __be64 bb_blkno;
1456 __be64 bb_lsn;
1457 uuid_t bb_uuid;
1458 __be64 bb_owner;
1459 __le32 bb_crc;
1460 __be32 bb_pad; /* padding for alignment */
1461};
1462
1438struct xfs_btree_block { 1463struct xfs_btree_block {
1439 __be32 bb_magic; /* magic number for block type */ 1464 __be32 bb_magic; /* magic number for block type */
1440 __be16 bb_level; /* 0 is a leaf */ 1465 __be16 bb_level; /* 0 is a leaf */
1441 __be16 bb_numrecs; /* current # of data records */ 1466 __be16 bb_numrecs; /* current # of data records */
1442 union { 1467 union {
1443 struct { 1468 struct xfs_btree_block_shdr s;
1444 __be32 bb_leftsib; 1469 struct xfs_btree_block_lhdr l;
1445 __be32 bb_rightsib;
1446
1447 __be64 bb_blkno;
1448 __be64 bb_lsn;
1449 uuid_t bb_uuid;
1450 __be32 bb_owner;
1451 __le32 bb_crc;
1452 } s; /* short form pointers */
1453 struct {
1454 __be64 bb_leftsib;
1455 __be64 bb_rightsib;
1456
1457 __be64 bb_blkno;
1458 __be64 bb_lsn;
1459 uuid_t bb_uuid;
1460 __be64 bb_owner;
1461 __le32 bb_crc;
1462 __be32 bb_pad; /* padding for alignment */
1463 } l; /* long form pointers */
1464 } bb_u; /* rest */ 1470 } bb_u; /* rest */
1465}; 1471};
1466 1472
1467#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ 1473/* size of a short form block */
1468#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ 1474#define XFS_BTREE_SBLOCK_LEN \
1475 (offsetof(struct xfs_btree_block, bb_u) + \
1476 offsetof(struct xfs_btree_block_shdr, bb_blkno))
1477/* size of a long form block */
1478#define XFS_BTREE_LBLOCK_LEN \
1479 (offsetof(struct xfs_btree_block, bb_u) + \
1480 offsetof(struct xfs_btree_block_lhdr, bb_blkno))
1469 1481
1470/* sizes of CRC enabled btree blocks */ 1482/* sizes of CRC enabled btree blocks */
1471#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) 1483#define XFS_BTREE_SBLOCK_CRC_LEN \
1472#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) 1484 (offsetof(struct xfs_btree_block, bb_u) + \
1485 sizeof(struct xfs_btree_block_shdr))
1486#define XFS_BTREE_LBLOCK_CRC_LEN \
1487 (offsetof(struct xfs_btree_block, bb_u) + \
1488 sizeof(struct xfs_btree_block_lhdr))
1473 1489
1474#define XFS_BTREE_SBLOCK_CRC_OFF \ 1490#define XFS_BTREE_SBLOCK_CRC_OFF \
1475 offsetof(struct xfs_btree_block, bb_u.s.bb_crc) 1491 offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index fffe3d01bd9f..f5ec9c5ccae6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -521,12 +521,8 @@ typedef struct xfs_swapext
521#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 521#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
522/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 522/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
523 523
524/* XFS_IOC_FREEZE -- FIFREEZE 119 */ 524#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
525/* XFS_IOC_THAW -- FITHAW 120 */ 525#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
526#ifndef FIFREEZE
527#define XFS_IOC_FREEZE _IOWR('X', 119, int)
528#define XFS_IOC_THAW _IOWR('X', 120, int)
529#endif
530 526
531#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 527#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
532#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 528#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 22297f9b0fd5..4b1e408169a8 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk(
1828 1828
1829 if (!xfs_inobt_issparse(rec->ir_holemask)) { 1829 if (!xfs_inobt_issparse(rec->ir_holemask)) {
1830 /* not sparse, calculate extent info directly */ 1830 /* not sparse, calculate extent info directly */
1831 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, 1831 xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
1832 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), 1832 mp->m_ialloc_blks);
1833 mp->m_ialloc_blks, flist, mp);
1834 return; 1833 return;
1835 } 1834 }
1836 1835
@@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk(
1873 1872
1874 ASSERT(agbno % mp->m_sb.sb_spino_align == 0); 1873 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1875 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); 1874 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1876 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, 1875 xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
1877 flist, mp); 1876 contigblk);
1878 1877
1879 /* reset range to current bit and carry on... */ 1878 /* reset range to current bit and carry on... */
1880 startidx = endidx = nextbit; 1879 startidx = endidx = nextbit;
@@ -2395,20 +2394,11 @@ void
2395xfs_ialloc_compute_maxlevels( 2394xfs_ialloc_compute_maxlevels(
2396 xfs_mount_t *mp) /* file system mount structure */ 2395 xfs_mount_t *mp) /* file system mount structure */
2397{ 2396{
2398 int level; 2397 uint inodes;
2399 uint maxblocks; 2398
2400 uint maxleafents; 2399 inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2401 int minleafrecs; 2400 mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
2402 int minnoderecs; 2401 inodes);
2403
2404 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
2405 XFS_INODES_PER_CHUNK_LOG;
2406 minleafrecs = mp->m_inobt_mnr[0];
2407 minnoderecs = mp->m_inobt_mnr[1];
2408 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
2409 for (level = 1; maxblocks > 1; level++)
2410 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
2411 mp->m_in_maxlevels = level;
2412} 2402}
2413 2403
2414/* 2404/*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 951c044e24e4..e2e1106c9fad 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
70 * Get a buffer for the bitmap or summary file block specified. 70 * Get a buffer for the bitmap or summary file block specified.
71 * The buffer is returned read and locked. 71 * The buffer is returned read and locked.
72 */ 72 */
73int 73static int
74xfs_rtbuf_get( 74xfs_rtbuf_get(
75 xfs_mount_t *mp, /* file system mount structure */ 75 xfs_mount_t *mp, /* file system mount structure */
76 xfs_trans_t *tp, /* transaction pointer */ 76 xfs_trans_t *tp, /* transaction pointer */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 87d2b215cbbd..7575cfc3ad15 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
87 * We're now finished for good with this page. Update the page state via the 87 * We're now finished for good with this page. Update the page state via the
88 * associated buffer_heads, paying attention to the start and end offsets that 88 * associated buffer_heads, paying attention to the start and end offsets that
89 * we need to process on the page. 89 * we need to process on the page.
90 *
91 * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
92 * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
93 * the page at all, as we may be racing with memory reclaim and it can free both
94 * the bufferhead chain and the page as it will see the page as clean and
95 * unused.
90 */ 96 */
91static void 97static void
92xfs_finish_page_writeback( 98xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
95 int error) 101 int error)
96{ 102{
97 unsigned int end = bvec->bv_offset + bvec->bv_len - 1; 103 unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
98 struct buffer_head *head, *bh; 104 struct buffer_head *head, *bh, *next;
99 unsigned int off = 0; 105 unsigned int off = 0;
106 unsigned int bsize;
100 107
101 ASSERT(bvec->bv_offset < PAGE_SIZE); 108 ASSERT(bvec->bv_offset < PAGE_SIZE);
102 ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); 109 ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
105 112
106 bh = head = page_buffers(bvec->bv_page); 113 bh = head = page_buffers(bvec->bv_page);
107 114
115 bsize = bh->b_size;
108 do { 116 do {
117 next = bh->b_this_page;
109 if (off < bvec->bv_offset) 118 if (off < bvec->bv_offset)
110 goto next_bh; 119 goto next_bh;
111 if (off > end) 120 if (off > end)
112 break; 121 break;
113 bh->b_end_io(bh, !error); 122 bh->b_end_io(bh, !error);
114next_bh: 123next_bh:
115 off += bh->b_size; 124 off += bsize;
116 } while ((bh = bh->b_this_page) != head); 125 } while ((bh = next) != head);
117} 126}
118 127
119/* 128/*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
1041 1050
1042 trace_xfs_releasepage(page->mapping->host, page, 0, 0); 1051 trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1043 1052
1053 /*
1054 * mm accommodates an old ext3 case where clean pages might not have had
1055 * the dirty bit cleared. Thus, it can send actual dirty pages to
1056 * ->releasepage() via shrink_active_list(). Conversely,
1057 * block_invalidatepage() can send pages that are still marked dirty
1058 * but otherwise have invalidated buffers.
1059 *
1060 * We've historically freed buffers on the latter. Instead, quietly
1061 * filter out all dirty pages to avoid spurious buffer state warnings.
1062 * This can likely be removed once shrink_active_list() is fixed.
1063 */
1064 if (PageDirty(page))
1065 return 0;
1066
1044 xfs_count_page_state(page, &delalloc, &unwritten); 1067 xfs_count_page_state(page, &delalloc, &unwritten);
1045 1068
1046 if (WARN_ON_ONCE(delalloc)) 1069 if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
1144 ssize_t size; 1167 ssize_t size;
1145 int new = 0; 1168 int new = 0;
1146 1169
1170 BUG_ON(create && !direct);
1171
1147 if (XFS_FORCED_SHUTDOWN(mp)) 1172 if (XFS_FORCED_SHUTDOWN(mp))
1148 return -EIO; 1173 return -EIO;
1149 1174
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
1151 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1176 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1152 size = bh_result->b_size; 1177 size = bh_result->b_size;
1153 1178
1154 if (!create && direct && offset >= i_size_read(inode)) 1179 if (!create && offset >= i_size_read(inode))
1155 return 0; 1180 return 0;
1156 1181
1157 /* 1182 /*
1158 * Direct I/O is usually done on preallocated files, so try getting 1183 * Direct I/O is usually done on preallocated files, so try getting
1159 * a block mapping without an exclusive lock first. For buffered 1184 * a block mapping without an exclusive lock first.
1160 * writes we already have the exclusive iolock anyway, so avoiding
1161 * a lock roundtrip here by taking the ilock exclusive from the
1162 * beginning is a useful micro optimization.
1163 */ 1185 */
1164 if (create && !direct) { 1186 lockmode = xfs_ilock_data_map_shared(ip);
1165 lockmode = XFS_ILOCK_EXCL;
1166 xfs_ilock(ip, lockmode);
1167 } else {
1168 lockmode = xfs_ilock_data_map_shared(ip);
1169 }
1170 1187
1171 ASSERT(offset <= mp->m_super->s_maxbytes); 1188 ASSERT(offset <= mp->m_super->s_maxbytes);
1172 if (offset + size > mp->m_super->s_maxbytes) 1189 if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
1185 (imap.br_startblock == HOLESTARTBLOCK || 1202 (imap.br_startblock == HOLESTARTBLOCK ||
1186 imap.br_startblock == DELAYSTARTBLOCK) || 1203 imap.br_startblock == DELAYSTARTBLOCK) ||
1187 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { 1204 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1188 if (direct || xfs_get_extsz_hint(ip)) { 1205 /*
1189 /* 1206 * xfs_iomap_write_direct() expects the shared lock. It
1190 * xfs_iomap_write_direct() expects the shared lock. It 1207 * is unlocked on return.
1191 * is unlocked on return. 1208 */
1192 */ 1209 if (lockmode == XFS_ILOCK_EXCL)
1193 if (lockmode == XFS_ILOCK_EXCL) 1210 xfs_ilock_demote(ip, lockmode);
1194 xfs_ilock_demote(ip, lockmode);
1195
1196 error = xfs_iomap_write_direct(ip, offset, size,
1197 &imap, nimaps);
1198 if (error)
1199 return error;
1200 new = 1;
1201 1211
1202 } else { 1212 error = xfs_iomap_write_direct(ip, offset, size,
1203 /* 1213 &imap, nimaps);
1204 * Delalloc reservations do not require a transaction, 1214 if (error)
1205 * we can go on without dropping the lock here. If we 1215 return error;
1206 * are allocating a new delalloc block, make sure that 1216 new = 1;
1207 * we set the new flag so that we mark the buffer new so
1208 * that we know that it is newly allocated if the write
1209 * fails.
1210 */
1211 if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1212 new = 1;
1213 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1214 if (error)
1215 goto out_unlock;
1216 1217
1217 xfs_iunlock(ip, lockmode);
1218 }
1219 trace_xfs_get_blocks_alloc(ip, offset, size, 1218 trace_xfs_get_blocks_alloc(ip, offset, size,
1220 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1219 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1221 : XFS_IO_DELALLOC, &imap); 1220 : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
1236 } 1235 }
1237 1236
1238 /* trim mapping down to size requested */ 1237 /* trim mapping down to size requested */
1239 if (direct || size > (1 << inode->i_blkbits)) 1238 xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
1240 xfs_map_trim_size(inode, iblock, bh_result,
1241 &imap, offset, size);
1242 1239
1243 /* 1240 /*
1244 * For unwritten extents do not report a disk address in the buffered 1241 * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
1251 if (ISUNWRITTEN(&imap)) 1248 if (ISUNWRITTEN(&imap))
1252 set_buffer_unwritten(bh_result); 1249 set_buffer_unwritten(bh_result);
1253 /* direct IO needs special help */ 1250 /* direct IO needs special help */
1254 if (create && direct) { 1251 if (create) {
1255 if (dax_fault) 1252 if (dax_fault)
1256 ASSERT(!ISUNWRITTEN(&imap)); 1253 ASSERT(!ISUNWRITTEN(&imap));
1257 else 1254 else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
1280 (new || ISUNWRITTEN(&imap)))) 1277 (new || ISUNWRITTEN(&imap))))
1281 set_buffer_new(bh_result); 1278 set_buffer_new(bh_result);
1282 1279
1283 if (imap.br_startblock == DELAYSTARTBLOCK) { 1280 BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
1284 BUG_ON(direct);
1285 if (create) {
1286 set_buffer_uptodate(bh_result);
1287 set_buffer_mapped(bh_result);
1288 set_buffer_delay(bh_result);
1289 }
1290 }
1291 1281
1292 return 0; 1282 return 0;
1293 1283
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
1337 * whereas if we have flags set we will always be called in task context 1327 * whereas if we have flags set we will always be called in task context
1338 * (i.e. from a workqueue). 1328 * (i.e. from a workqueue).
1339 */ 1329 */
1340STATIC int 1330int
1341xfs_end_io_direct_write( 1331xfs_end_io_direct_write(
1342 struct kiocb *iocb, 1332 struct kiocb *iocb,
1343 loff_t offset, 1333 loff_t offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
1408 struct kiocb *iocb, 1398 struct kiocb *iocb,
1409 struct iov_iter *iter) 1399 struct iov_iter *iter)
1410{ 1400{
1411 struct inode *inode = iocb->ki_filp->f_mapping->host;
1412 dio_iodone_t *endio = NULL;
1413 int flags = 0;
1414 struct block_device *bdev;
1415
1416 if (iov_iter_rw(iter) == WRITE) {
1417 endio = xfs_end_io_direct_write;
1418 flags = DIO_ASYNC_EXTEND;
1419 }
1420
1421 if (IS_DAX(inode)) {
1422 return dax_do_io(iocb, inode, iter,
1423 xfs_get_blocks_direct, endio, 0);
1424 }
1425
1426 bdev = xfs_find_bdev_for_inode(inode);
1427 return __blockdev_direct_IO(iocb, inode, bdev, iter,
1428 xfs_get_blocks_direct, endio, NULL, flags);
1429}
1430
1431/*
1432 * Punch out the delalloc blocks we have already allocated.
1433 *
1434 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1435 * as the page is still locked at this point.
1436 */
1437STATIC void
1438xfs_vm_kill_delalloc_range(
1439 struct inode *inode,
1440 loff_t start,
1441 loff_t end)
1442{
1443 struct xfs_inode *ip = XFS_I(inode);
1444 xfs_fileoff_t start_fsb;
1445 xfs_fileoff_t end_fsb;
1446 int error;
1447
1448 start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1449 end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1450 if (end_fsb <= start_fsb)
1451 return;
1452
1453 xfs_ilock(ip, XFS_ILOCK_EXCL);
1454 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1455 end_fsb - start_fsb);
1456 if (error) {
1457 /* something screwed, just bail */
1458 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1459 xfs_alert(ip->i_mount,
1460 "xfs_vm_write_failed: unable to clean up ino %lld",
1461 ip->i_ino);
1462 }
1463 }
1464 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1465}
1466
1467STATIC void
1468xfs_vm_write_failed(
1469 struct inode *inode,
1470 struct page *page,
1471 loff_t pos,
1472 unsigned len)
1473{
1474 loff_t block_offset;
1475 loff_t block_start;
1476 loff_t block_end;
1477 loff_t from = pos & (PAGE_SIZE - 1);
1478 loff_t to = from + len;
1479 struct buffer_head *bh, *head;
1480 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1481
1482 /* 1401 /*
1483 * The request pos offset might be 32 or 64 bit, this is all fine 1402 * We just need the method present so that open/fcntl allow direct I/O.
1484 * on 64-bit platform. However, for 64-bit pos request on 32-bit
1485 * platform, the high 32-bit will be masked off if we evaluate the
1486 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
1487 * 0xfffff000 as an unsigned long, hence the result is incorrect
1488 * which could cause the following ASSERT failed in most cases.
1489 * In order to avoid this, we can evaluate the block_offset of the
1490 * start of the page by using shifts rather than masks the mismatch
1491 * problem.
1492 */ 1403 */
1493 block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; 1404 return -EINVAL;
1494
1495 ASSERT(block_offset + from == pos);
1496
1497 head = page_buffers(page);
1498 block_start = 0;
1499 for (bh = head; bh != head || !block_start;
1500 bh = bh->b_this_page, block_start = block_end,
1501 block_offset += bh->b_size) {
1502 block_end = block_start + bh->b_size;
1503
1504 /* skip buffers before the write */
1505 if (block_end <= from)
1506 continue;
1507
1508 /* if the buffer is after the write, we're done */
1509 if (block_start >= to)
1510 break;
1511
1512 /*
1513 * Process delalloc and unwritten buffers beyond EOF. We can
1514 * encounter unwritten buffers in the event that a file has
1515 * post-EOF unwritten extents and an extending write happens to
1516 * fail (e.g., an unaligned write that also involves a delalloc
1517 * to the same page).
1518 */
1519 if (!buffer_delay(bh) && !buffer_unwritten(bh))
1520 continue;
1521
1522 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1523 block_offset < i_size_read(inode))
1524 continue;
1525
1526 if (buffer_delay(bh))
1527 xfs_vm_kill_delalloc_range(inode, block_offset,
1528 block_offset + bh->b_size);
1529
1530 /*
1531 * This buffer does not contain data anymore. make sure anyone
1532 * who finds it knows that for certain.
1533 */
1534 clear_buffer_delay(bh);
1535 clear_buffer_uptodate(bh);
1536 clear_buffer_mapped(bh);
1537 clear_buffer_new(bh);
1538 clear_buffer_dirty(bh);
1539 clear_buffer_unwritten(bh);
1540 }
1541
1542}
1543
1544/*
1545 * This used to call block_write_begin(), but it unlocks and releases the page
1546 * on error, and we need that page to be able to punch stale delalloc blocks out
1547 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
1548 * the appropriate point.
1549 */
1550STATIC int
1551xfs_vm_write_begin(
1552 struct file *file,
1553 struct address_space *mapping,
1554 loff_t pos,
1555 unsigned len,
1556 unsigned flags,
1557 struct page **pagep,
1558 void **fsdata)
1559{
1560 pgoff_t index = pos >> PAGE_SHIFT;
1561 struct page *page;
1562 int status;
1563 struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
1564
1565 ASSERT(len <= PAGE_SIZE);
1566
1567 page = grab_cache_page_write_begin(mapping, index, flags);
1568 if (!page)
1569 return -ENOMEM;
1570
1571 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1572 if (xfs_mp_fail_writes(mp))
1573 status = -EIO;
1574 if (unlikely(status)) {
1575 struct inode *inode = mapping->host;
1576 size_t isize = i_size_read(inode);
1577
1578 xfs_vm_write_failed(inode, page, pos, len);
1579 unlock_page(page);
1580
1581 /*
1582 * If the write is beyond EOF, we only want to kill blocks
1583 * allocated in this write, not blocks that were previously
1584 * written successfully.
1585 */
1586 if (xfs_mp_fail_writes(mp))
1587 isize = 0;
1588 if (pos + len > isize) {
1589 ssize_t start = max_t(ssize_t, pos, isize);
1590
1591 truncate_pagecache_range(inode, start, pos + len);
1592 }
1593
1594 put_page(page);
1595 page = NULL;
1596 }
1597
1598 *pagep = page;
1599 return status;
1600}
1601
1602/*
1603 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
1604 * this specific write because they will never be written. Previous writes
1605 * beyond EOF where block allocation succeeded do not need to be trashed, so
1606 * only new blocks from this write should be trashed. For blocks within
1607 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
1608 * written with all the other valid data.
1609 */
1610STATIC int
1611xfs_vm_write_end(
1612 struct file *file,
1613 struct address_space *mapping,
1614 loff_t pos,
1615 unsigned len,
1616 unsigned copied,
1617 struct page *page,
1618 void *fsdata)
1619{
1620 int ret;
1621
1622 ASSERT(len <= PAGE_SIZE);
1623
1624 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1625 if (unlikely(ret < len)) {
1626 struct inode *inode = mapping->host;
1627 size_t isize = i_size_read(inode);
1628 loff_t to = pos + len;
1629
1630 if (to > isize) {
1631 /* only kill blocks in this write beyond EOF */
1632 if (pos > isize)
1633 isize = pos;
1634 xfs_vm_kill_delalloc_range(inode, isize, to);
1635 truncate_pagecache_range(inode, isize, to);
1636 }
1637 }
1638 return ret;
1639} 1405}
1640 1406
1641STATIC sector_t 1407STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
1748 .set_page_dirty = xfs_vm_set_page_dirty, 1514 .set_page_dirty = xfs_vm_set_page_dirty,
1749 .releasepage = xfs_vm_releasepage, 1515 .releasepage = xfs_vm_releasepage,
1750 .invalidatepage = xfs_vm_invalidatepage, 1516 .invalidatepage = xfs_vm_invalidatepage,
1751 .write_begin = xfs_vm_write_begin,
1752 .write_end = xfs_vm_write_end,
1753 .bmap = xfs_vm_bmap, 1517 .bmap = xfs_vm_bmap,
1754 .direct_IO = xfs_vm_direct_IO, 1518 .direct_IO = xfs_vm_direct_IO,
1755 .migratepage = buffer_migrate_page, 1519 .migratepage = buffer_migrate_page,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 814aab790713..bf2d9a141a73 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
60int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, 60int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
61 struct buffer_head *map_bh, int create); 61 struct buffer_head *map_bh, int create);
62 62
63int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
64 ssize_t size, void *private);
65
63extern void xfs_count_page_state(struct page *, int *, int *); 66extern void xfs_count_page_state(struct page *, int *, int *);
64extern struct block_device *xfs_find_bdev_for_inode(struct inode *); 67extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
65 68
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 55d214981ed2..be0b79d8900f 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
322 * Recurse (gasp!) through the attribute nodes until we find leaves. 322 * Recurse (gasp!) through the attribute nodes until we find leaves.
323 * We're doing a depth-first traversal in order to invalidate everything. 323 * We're doing a depth-first traversal in order to invalidate everything.
324 */ 324 */
325int 325static int
326xfs_attr3_root_inactive( 326xfs_attr3_root_inactive(
327 struct xfs_trans **trans, 327 struct xfs_trans **trans,
328 struct xfs_inode *dp) 328 struct xfs_inode *dp)
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index d25f26b22ac9..25e76cd6c053 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
65 * we have to calculate each entries' hashvalue and sort them before 65 * we have to calculate each entries' hashvalue and sort them before
66 * we can begin returning them to the user. 66 * we can begin returning them to the user.
67 */ 67 */
68int 68static int
69xfs_attr_shortform_list(xfs_attr_list_context_t *context) 69xfs_attr_shortform_list(xfs_attr_list_context_t *context)
70{ 70{
71 attrlist_cursor_kern_t *cursor; 71 attrlist_cursor_kern_t *cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 586bb64e674b..cd4a850564f2 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -79,6 +79,23 @@ xfs_zero_extent(
79 GFP_NOFS, true); 79 GFP_NOFS, true);
80} 80}
81 81
82/* Sort bmap items by AG. */
83static int
84xfs_bmap_free_list_cmp(
85 void *priv,
86 struct list_head *a,
87 struct list_head *b)
88{
89 struct xfs_mount *mp = priv;
90 struct xfs_bmap_free_item *ra;
91 struct xfs_bmap_free_item *rb;
92
93 ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
94 rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
95 return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
96 XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
97}
98
82/* 99/*
83 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi 100 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
84 * caller. Frees all the extents that need freeing, which must be done 101 * caller. Frees all the extents that need freeing, which must be done
@@ -99,14 +116,15 @@ xfs_bmap_finish(
99 int error; /* error return value */ 116 int error; /* error return value */
100 int committed;/* xact committed or not */ 117 int committed;/* xact committed or not */
101 struct xfs_bmap_free_item *free; /* free extent item */ 118 struct xfs_bmap_free_item *free; /* free extent item */
102 struct xfs_bmap_free_item *next; /* next item on free list */
103 119
104 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 120 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
105 if (flist->xbf_count == 0) 121 if (flist->xbf_count == 0)
106 return 0; 122 return 0;
107 123
124 list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
125
108 efi = xfs_trans_get_efi(*tp, flist->xbf_count); 126 efi = xfs_trans_get_efi(*tp, flist->xbf_count);
109 for (free = flist->xbf_first; free; free = free->xbfi_next) 127 list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
110 xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, 128 xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
111 free->xbfi_blockcount); 129 free->xbfi_blockcount);
112 130
@@ -125,9 +143,7 @@ xfs_bmap_finish(
125 if (committed) { 143 if (committed) {
126 xfs_efi_release(efi); 144 xfs_efi_release(efi);
127 xfs_force_shutdown((*tp)->t_mountp, 145 xfs_force_shutdown((*tp)->t_mountp,
128 (error == -EFSCORRUPTED) ? 146 SHUTDOWN_META_IO_ERROR);
129 SHUTDOWN_CORRUPT_INCORE :
130 SHUTDOWN_META_IO_ERROR);
131 } 147 }
132 return error; 148 return error;
133 } 149 }
@@ -138,15 +154,15 @@ xfs_bmap_finish(
138 * on error. 154 * on error.
139 */ 155 */
140 efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); 156 efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
141 for (free = flist->xbf_first; free != NULL; free = next) { 157 while (!list_empty(&flist->xbf_flist)) {
142 next = free->xbfi_next; 158 free = list_first_entry(&flist->xbf_flist,
143 159 struct xfs_bmap_free_item, xbfi_list);
144 error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, 160 error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
145 free->xbfi_blockcount); 161 free->xbfi_blockcount);
146 if (error) 162 if (error)
147 return error; 163 return error;
148 164
149 xfs_bmap_del_free(flist, NULL, free); 165 xfs_bmap_del_free(flist, free);
150 } 166 }
151 167
152 return 0; 168 return 0;
@@ -409,7 +425,7 @@ xfs_bmap_count_tree(
409/* 425/*
410 * Count fsblocks of the given fork. 426 * Count fsblocks of the given fork.
411 */ 427 */
412int /* error */ 428static int /* error */
413xfs_bmap_count_blocks( 429xfs_bmap_count_blocks(
414 xfs_trans_t *tp, /* transaction pointer */ 430 xfs_trans_t *tp, /* transaction pointer */
415 xfs_inode_t *ip, /* incore inode */ 431 xfs_inode_t *ip, /* incore inode */
@@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range(
799 if (error) 815 if (error)
800 break; 816 break;
801 817
802 ASSERT(!flist.xbf_count && !flist.xbf_first); 818 ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
803next_block: 819next_block:
804 start_fsb++; 820 start_fsb++;
805 remaining--; 821 remaining--;
@@ -1089,99 +1105,120 @@ error1: /* Just cancel transaction */
1089 return error; 1105 return error;
1090} 1106}
1091 1107
1092/* 1108static int
1093 * Zero file bytes between startoff and endoff inclusive. 1109xfs_unmap_extent(
1094 * The iolock is held exclusive and no blocks are buffered. 1110 struct xfs_inode *ip,
1095 * 1111 xfs_fileoff_t startoffset_fsb,
1096 * This function is used by xfs_free_file_space() to zero 1112 xfs_filblks_t len_fsb,
1097 * partial blocks when the range to free is not block aligned. 1113 int *done)
1098 * When unreserving space with boundaries that are not block
1099 * aligned we round up the start and round down the end
1100 * boundaries and then use this function to zero the parts of
1101 * the blocks that got dropped during the rounding.
1102 */
1103STATIC int
1104xfs_zero_remaining_bytes(
1105 xfs_inode_t *ip,
1106 xfs_off_t startoff,
1107 xfs_off_t endoff)
1108{ 1114{
1109 xfs_bmbt_irec_t imap; 1115 struct xfs_mount *mp = ip->i_mount;
1110 xfs_fileoff_t offset_fsb; 1116 struct xfs_trans *tp;
1111 xfs_off_t lastoffset; 1117 struct xfs_bmap_free free_list;
1112 xfs_off_t offset; 1118 xfs_fsblock_t firstfsb;
1113 xfs_buf_t *bp; 1119 uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1114 xfs_mount_t *mp = ip->i_mount; 1120 int error;
1115 int nimap;
1116 int error = 0;
1117 1121
1118 /* 1122 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1119 * Avoid doing I/O beyond eof - it's not necessary 1123 if (error) {
1120 * since nothing can read beyond eof. The space will 1124 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1121 * be zeroed when the file is extended anyway. 1125 return error;
1122 */ 1126 }
1123 if (startoff >= XFS_ISIZE(ip))
1124 return 0;
1125 1127
1126 if (endoff > XFS_ISIZE(ip)) 1128 xfs_ilock(ip, XFS_ILOCK_EXCL);
1127 endoff = XFS_ISIZE(ip); 1129 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
1130 ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
1131 if (error)
1132 goto out_trans_cancel;
1128 1133
1129 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 1134 xfs_trans_ijoin(tp, ip, 0);
1130 uint lock_mode;
1131 1135
1132 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1136 xfs_bmap_init(&free_list, &firstfsb);
1133 nimap = 1; 1137 error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
1138 &free_list, done);
1139 if (error)
1140 goto out_bmap_cancel;
1134 1141
1135 lock_mode = xfs_ilock_data_map_shared(ip); 1142 error = xfs_bmap_finish(&tp, &free_list, NULL);
1136 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); 1143 if (error)
1137 xfs_iunlock(ip, lock_mode); 1144 goto out_bmap_cancel;
1138 1145
1139 if (error || nimap < 1) 1146 error = xfs_trans_commit(tp);
1140 break; 1147out_unlock:
1141 ASSERT(imap.br_blockcount >= 1); 1148 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1142 ASSERT(imap.br_startoff == offset_fsb); 1149 return error;
1143 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1144 1150
1145 if (imap.br_startblock == HOLESTARTBLOCK || 1151out_bmap_cancel:
1146 imap.br_state == XFS_EXT_UNWRITTEN) { 1152 xfs_bmap_cancel(&free_list);
1147 /* skip the entire extent */ 1153out_trans_cancel:
1148 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1154 xfs_trans_cancel(tp);
1149 imap.br_blockcount) - 1; 1155 goto out_unlock;
1150 continue; 1156}
1151 }
1152 1157
1153 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; 1158static int
1154 if (lastoffset > endoff) 1159xfs_adjust_extent_unmap_boundaries(
1155 lastoffset = endoff; 1160 struct xfs_inode *ip,
1161 xfs_fileoff_t *startoffset_fsb,
1162 xfs_fileoff_t *endoffset_fsb)
1163{
1164 struct xfs_mount *mp = ip->i_mount;
1165 struct xfs_bmbt_irec imap;
1166 int nimap, error;
1167 xfs_extlen_t mod = 0;
1156 1168
1157 /* DAX can just zero the backing device directly */ 1169 nimap = 1;
1158 if (IS_DAX(VFS_I(ip))) { 1170 error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
1159 error = dax_zero_page_range(VFS_I(ip), offset, 1171 if (error)
1160 lastoffset - offset + 1, 1172 return error;
1161 xfs_get_blocks_direct);
1162 if (error)
1163 return error;
1164 continue;
1165 }
1166 1173
1167 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? 1174 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1168 mp->m_rtdev_targp : mp->m_ddev_targp, 1175 xfs_daddr_t block;
1169 xfs_fsb_to_db(ip, imap.br_startblock),
1170 BTOBB(mp->m_sb.sb_blocksize),
1171 0, &bp, NULL);
1172 if (error)
1173 return error;
1174 1176
1175 memset(bp->b_addr + 1177 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1176 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 1178 block = imap.br_startblock;
1177 0, lastoffset - offset + 1); 1179 mod = do_div(block, mp->m_sb.sb_rextsize);
1180 if (mod)
1181 *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1182 }
1178 1183
1179 error = xfs_bwrite(bp); 1184 nimap = 1;
1180 xfs_buf_relse(bp); 1185 error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
1181 if (error) 1186 if (error)
1182 return error; 1187 return error;
1188
1189 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1190 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1191 mod++;
1192 if (mod && mod != mp->m_sb.sb_rextsize)
1193 *endoffset_fsb -= mod;
1183 } 1194 }
1184 return error; 1195
1196 return 0;
1197}
1198
1199static int
1200xfs_flush_unmap_range(
1201 struct xfs_inode *ip,
1202 xfs_off_t offset,
1203 xfs_off_t len)
1204{
1205 struct xfs_mount *mp = ip->i_mount;
1206 struct inode *inode = VFS_I(ip);
1207 xfs_off_t rounding, start, end;
1208 int error;
1209
1210 /* wait for the completion of any pending DIOs */
1211 inode_dio_wait(inode);
1212
1213 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
1214 start = round_down(offset, rounding);
1215 end = round_up(offset + len, rounding) - 1;
1216
1217 error = filemap_write_and_wait_range(inode->i_mapping, start, end);
1218 if (error)
1219 return error;
1220 truncate_pagecache_range(inode, start, end);
1221 return 0;
1185} 1222}
1186 1223
1187int 1224int
@@ -1190,24 +1227,10 @@ xfs_free_file_space(
1190 xfs_off_t offset, 1227 xfs_off_t offset,
1191 xfs_off_t len) 1228 xfs_off_t len)
1192{ 1229{
1193 int done; 1230 struct xfs_mount *mp = ip->i_mount;
1194 xfs_fileoff_t endoffset_fsb;
1195 int error;
1196 xfs_fsblock_t firstfsb;
1197 xfs_bmap_free_t free_list;
1198 xfs_bmbt_irec_t imap;
1199 xfs_off_t ioffset;
1200 xfs_off_t iendoffset;
1201 xfs_extlen_t mod=0;
1202 xfs_mount_t *mp;
1203 int nimap;
1204 uint resblks;
1205 xfs_off_t rounding;
1206 int rt;
1207 xfs_fileoff_t startoffset_fsb; 1231 xfs_fileoff_t startoffset_fsb;
1208 xfs_trans_t *tp; 1232 xfs_fileoff_t endoffset_fsb;
1209 1233 int done = 0, error;
1210 mp = ip->i_mount;
1211 1234
1212 trace_xfs_free_file_space(ip); 1235 trace_xfs_free_file_space(ip);
1213 1236
@@ -1215,135 +1238,45 @@ xfs_free_file_space(
1215 if (error) 1238 if (error)
1216 return error; 1239 return error;
1217 1240
1218 error = 0;
1219 if (len <= 0) /* if nothing being freed */ 1241 if (len <= 0) /* if nothing being freed */
1220 return error; 1242 return 0;
1221 rt = XFS_IS_REALTIME_INODE(ip);
1222 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1223 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1224
1225 /* wait for the completion of any pending DIOs */
1226 inode_dio_wait(VFS_I(ip));
1227 1243
1228 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); 1244 error = xfs_flush_unmap_range(ip, offset, len);
1229 ioffset = round_down(offset, rounding);
1230 iendoffset = round_up(offset + len, rounding) - 1;
1231 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
1232 iendoffset);
1233 if (error) 1245 if (error)
1234 goto out; 1246 return error;
1235 truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); 1247
1248 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1249 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1236 1250
1237 /* 1251 /*
1238 * Need to zero the stuff we're not freeing, on disk. 1252 * Need to zero the stuff we're not freeing, on disk. If it's a RT file
1239 * If it's a realtime file & can't use unwritten extents then we 1253 * and we can't use unwritten extents then we actually need to ensure
1240 * actually need to zero the extent edges. Otherwise xfs_bunmapi 1254 * to zero the whole extent, otherwise we just need to take of block
1241 * will take care of it for us. 1255 * boundaries, and xfs_bunmapi will handle the rest.
1242 */ 1256 */
1243 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { 1257 if (XFS_IS_REALTIME_INODE(ip) &&
1244 nimap = 1; 1258 !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1245 error = xfs_bmapi_read(ip, startoffset_fsb, 1, 1259 error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
1246 &imap, &nimap, 0); 1260 &endoffset_fsb);
1247 if (error)
1248 goto out;
1249 ASSERT(nimap == 0 || nimap == 1);
1250 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1251 xfs_daddr_t block;
1252
1253 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1254 block = imap.br_startblock;
1255 mod = do_div(block, mp->m_sb.sb_rextsize);
1256 if (mod)
1257 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1258 }
1259 nimap = 1;
1260 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1261 &imap, &nimap, 0);
1262 if (error) 1261 if (error)
1263 goto out; 1262 return error;
1264 ASSERT(nimap == 0 || nimap == 1);
1265 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1266 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1267 mod++;
1268 if (mod && (mod != mp->m_sb.sb_rextsize))
1269 endoffset_fsb -= mod;
1270 }
1271 }
1272 if ((done = (endoffset_fsb <= startoffset_fsb)))
1273 /*
1274 * One contiguous piece to clear
1275 */
1276 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1277 else {
1278 /*
1279 * Some full blocks, possibly two pieces to clear
1280 */
1281 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1282 error = xfs_zero_remaining_bytes(ip, offset,
1283 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1284 if (!error &&
1285 XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1286 error = xfs_zero_remaining_bytes(ip,
1287 XFS_FSB_TO_B(mp, endoffset_fsb),
1288 offset + len - 1);
1289 } 1263 }
1290 1264
1291 /* 1265 if (endoffset_fsb > startoffset_fsb) {
1292 * free file space until done or until there is an error 1266 while (!done) {
1293 */ 1267 error = xfs_unmap_extent(ip, startoffset_fsb,
1294 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 1268 endoffset_fsb - startoffset_fsb, &done);
1295 while (!error && !done) { 1269 if (error)
1296 1270 return error;
1297 /*
1298 * allocate and setup the transaction. Allow this
1299 * transaction to dip into the reserve blocks to ensure
1300 * the freeing of the space succeeds at ENOSPC.
1301 */
1302 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
1303 &tp);
1304 if (error) {
1305 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1306 break;
1307 } 1271 }
1308 xfs_ilock(ip, XFS_ILOCK_EXCL);
1309 error = xfs_trans_reserve_quota(tp, mp,
1310 ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1311 resblks, 0, XFS_QMOPT_RES_REGBLKS);
1312 if (error)
1313 goto error1;
1314
1315 xfs_trans_ijoin(tp, ip, 0);
1316
1317 /*
1318 * issue the bunmapi() call to free the blocks
1319 */
1320 xfs_bmap_init(&free_list, &firstfsb);
1321 error = xfs_bunmapi(tp, ip, startoffset_fsb,
1322 endoffset_fsb - startoffset_fsb,
1323 0, 2, &firstfsb, &free_list, &done);
1324 if (error)
1325 goto error0;
1326
1327 /*
1328 * complete the transaction
1329 */
1330 error = xfs_bmap_finish(&tp, &free_list, NULL);
1331 if (error)
1332 goto error0;
1333
1334 error = xfs_trans_commit(tp);
1335 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1336 } 1272 }
1337 1273
1338 out: 1274 /*
1339 return error; 1275 * Now that we've unmap all full blocks we'll have to zero out any
1340 1276 * partial block at the beginning and/or end. xfs_zero_range is
1341 error0: 1277 * smart enough to skip any holes, including those we just created.
1342 xfs_bmap_cancel(&free_list); 1278 */
1343 error1: 1279 return xfs_zero_range(ip, offset, len, NULL);
1344 xfs_trans_cancel(tp);
1345 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1346 goto out;
1347} 1280}
1348 1281
1349/* 1282/*
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index af97d9a1dfb4..f20071432ca6 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -31,8 +31,6 @@ struct xfs_bmalloca;
31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); 31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, 32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
33 int whichfork, int *eof); 33 int whichfork, int *eof);
34int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
35 int whichfork, int *count);
36int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, 34int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
37 xfs_fileoff_t start_fsb, xfs_fileoff_t length); 35 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
38 36
@@ -43,7 +41,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
43 41
44/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ 42/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
45void xfs_bmap_del_free(struct xfs_bmap_free *flist, 43void xfs_bmap_del_free(struct xfs_bmap_free *flist,
46 struct xfs_bmap_free_item *prev,
47 struct xfs_bmap_free_item *free); 44 struct xfs_bmap_free_item *free);
48int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, 45int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
49 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, 46 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a87a0d5477bd..47a318ce82e0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,6 +80,47 @@ xfs_buf_vmap_len(
80} 80}
81 81
82/* 82/*
83 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
84 * this buffer. The count is incremented once per buffer (per hold cycle)
85 * because the corresponding decrement is deferred to buffer release. Buffers
86 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
87 * tracking adds unnecessary overhead. This is used for sychronization purposes
88 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
89 * in-flight buffers.
90 *
91 * Buffers that are never released (e.g., superblock, iclog buffers) must set
92 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
93 * never reaches zero and unmount hangs indefinitely.
94 */
95static inline void
96xfs_buf_ioacct_inc(
97 struct xfs_buf *bp)
98{
99 if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
100 return;
101
102 ASSERT(bp->b_flags & XBF_ASYNC);
103 bp->b_flags |= _XBF_IN_FLIGHT;
104 percpu_counter_inc(&bp->b_target->bt_io_count);
105}
106
107/*
108 * Clear the in-flight state on a buffer about to be released to the LRU or
109 * freed and unaccount from the buftarg.
110 */
111static inline void
112xfs_buf_ioacct_dec(
113 struct xfs_buf *bp)
114{
115 if (!(bp->b_flags & _XBF_IN_FLIGHT))
116 return;
117
118 ASSERT(bp->b_flags & XBF_ASYNC);
119 bp->b_flags &= ~_XBF_IN_FLIGHT;
120 percpu_counter_dec(&bp->b_target->bt_io_count);
121}
122
123/*
83 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 124 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
84 * b_lru_ref count so that the buffer is freed immediately when the buffer 125 * b_lru_ref count so that the buffer is freed immediately when the buffer
85 * reference count falls to zero. If the buffer is already on the LRU, we need 126 * reference count falls to zero. If the buffer is already on the LRU, we need
@@ -102,6 +143,14 @@ xfs_buf_stale(
102 */ 143 */
103 bp->b_flags &= ~_XBF_DELWRI_Q; 144 bp->b_flags &= ~_XBF_DELWRI_Q;
104 145
146 /*
147 * Once the buffer is marked stale and unlocked, a subsequent lookup
148 * could reset b_flags. There is no guarantee that the buffer is
149 * unaccounted (released to LRU) before that occurs. Drop in-flight
150 * status now to preserve accounting consistency.
151 */
152 xfs_buf_ioacct_dec(bp);
153
105 spin_lock(&bp->b_lock); 154 spin_lock(&bp->b_lock);
106 atomic_set(&bp->b_lru_ref, 0); 155 atomic_set(&bp->b_lru_ref, 0);
107 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 156 if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
815 struct xfs_buf *bp; 864 struct xfs_buf *bp;
816 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 865 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
817 866
818 bp = _xfs_buf_alloc(target, &map, 1, 0); 867 /* flags might contain irrelevant bits, pass only what we care about */
868 bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
819 if (unlikely(bp == NULL)) 869 if (unlikely(bp == NULL))
820 goto fail; 870 goto fail;
821 871
@@ -866,63 +916,85 @@ xfs_buf_hold(
866} 916}
867 917
868/* 918/*
869 * Releases a hold on the specified buffer. If the 919 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
870 * the hold count is 1, calls xfs_buf_free. 920 * placed on LRU or freed (depending on b_lru_ref).
871 */ 921 */
872void 922void
873xfs_buf_rele( 923xfs_buf_rele(
874 xfs_buf_t *bp) 924 xfs_buf_t *bp)
875{ 925{
876 struct xfs_perag *pag = bp->b_pag; 926 struct xfs_perag *pag = bp->b_pag;
927 bool release;
928 bool freebuf = false;
877 929
878 trace_xfs_buf_rele(bp, _RET_IP_); 930 trace_xfs_buf_rele(bp, _RET_IP_);
879 931
880 if (!pag) { 932 if (!pag) {
881 ASSERT(list_empty(&bp->b_lru)); 933 ASSERT(list_empty(&bp->b_lru));
882 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 934 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
883 if (atomic_dec_and_test(&bp->b_hold)) 935 if (atomic_dec_and_test(&bp->b_hold)) {
936 xfs_buf_ioacct_dec(bp);
884 xfs_buf_free(bp); 937 xfs_buf_free(bp);
938 }
885 return; 939 return;
886 } 940 }
887 941
888 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 942 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
889 943
890 ASSERT(atomic_read(&bp->b_hold) > 0); 944 ASSERT(atomic_read(&bp->b_hold) > 0);
891 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
892 spin_lock(&bp->b_lock);
893 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
894 /*
895 * If the buffer is added to the LRU take a new
896 * reference to the buffer for the LRU and clear the
897 * (now stale) dispose list state flag
898 */
899 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
900 bp->b_state &= ~XFS_BSTATE_DISPOSE;
901 atomic_inc(&bp->b_hold);
902 }
903 spin_unlock(&bp->b_lock);
904 spin_unlock(&pag->pag_buf_lock);
905 } else {
906 /*
907 * most of the time buffers will already be removed from
908 * the LRU, so optimise that case by checking for the
909 * XFS_BSTATE_DISPOSE flag indicating the last list the
910 * buffer was on was the disposal list
911 */
912 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
913 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
914 } else {
915 ASSERT(list_empty(&bp->b_lru));
916 }
917 spin_unlock(&bp->b_lock);
918 945
919 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 946 release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
920 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 947 spin_lock(&bp->b_lock);
921 spin_unlock(&pag->pag_buf_lock); 948 if (!release) {
922 xfs_perag_put(pag); 949 /*
923 xfs_buf_free(bp); 950 * Drop the in-flight state if the buffer is already on the LRU
951 * and it holds the only reference. This is racy because we
952 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
953 * ensures the decrement occurs only once per-buf.
954 */
955 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
956 xfs_buf_ioacct_dec(bp);
957 goto out_unlock;
958 }
959
960 /* the last reference has been dropped ... */
961 xfs_buf_ioacct_dec(bp);
962 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
963 /*
964 * If the buffer is added to the LRU take a new reference to the
965 * buffer for the LRU and clear the (now stale) dispose list
966 * state flag
967 */
968 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
969 bp->b_state &= ~XFS_BSTATE_DISPOSE;
970 atomic_inc(&bp->b_hold);
971 }
972 spin_unlock(&pag->pag_buf_lock);
973 } else {
974 /*
975 * most of the time buffers will already be removed from the
976 * LRU, so optimise that case by checking for the
977 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
978 * was on was the disposal list
979 */
980 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
981 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
982 } else {
983 ASSERT(list_empty(&bp->b_lru));
924 } 984 }
985
986 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
987 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
988 spin_unlock(&pag->pag_buf_lock);
989 xfs_perag_put(pag);
990 freebuf = true;
925 } 991 }
992
993out_unlock:
994 spin_unlock(&bp->b_lock);
995
996 if (freebuf)
997 xfs_buf_free(bp);
926} 998}
927 999
928 1000
@@ -944,10 +1016,12 @@ xfs_buf_trylock(
944 int locked; 1016 int locked;
945 1017
946 locked = down_trylock(&bp->b_sema) == 0; 1018 locked = down_trylock(&bp->b_sema) == 0;
947 if (locked) 1019 if (locked) {
948 XB_SET_OWNER(bp); 1020 XB_SET_OWNER(bp);
949 1021 trace_xfs_buf_trylock(bp, _RET_IP_);
950 trace_xfs_buf_trylock(bp, _RET_IP_); 1022 } else {
1023 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1024 }
951 return locked; 1025 return locked;
952} 1026}
953 1027
@@ -1339,6 +1413,7 @@ xfs_buf_submit(
1339 * xfs_buf_ioend too early. 1413 * xfs_buf_ioend too early.
1340 */ 1414 */
1341 atomic_set(&bp->b_io_remaining, 1); 1415 atomic_set(&bp->b_io_remaining, 1);
1416 xfs_buf_ioacct_inc(bp);
1342 _xfs_buf_ioapply(bp); 1417 _xfs_buf_ioapply(bp);
1343 1418
1344 /* 1419 /*
@@ -1524,13 +1599,19 @@ xfs_wait_buftarg(
1524 int loop = 0; 1599 int loop = 0;
1525 1600
1526 /* 1601 /*
1527 * We need to flush the buffer workqueue to ensure that all IO 1602 * First wait on the buftarg I/O count for all in-flight buffers to be
1528 * completion processing is 100% done. Just waiting on buffer locks is 1603 * released. This is critical as new buffers do not make the LRU until
1529 * not sufficient for async IO as the reference count held over IO is 1604 * they are released.
1530 * not released until after the buffer lock is dropped. Hence we need to 1605 *
1531 * ensure here that all reference counts have been dropped before we 1606 * Next, flush the buffer workqueue to ensure all completion processing
1532 * start walking the LRU list. 1607 * has finished. Just waiting on buffer locks is not sufficient for
1608 * async IO as the reference count held over IO is not released until
1609 * after the buffer lock is dropped. Hence we need to ensure here that
1610 * all reference counts have been dropped before we start walking the
1611 * LRU list.
1533 */ 1612 */
1613 while (percpu_counter_sum(&btp->bt_io_count))
1614 delay(100);
1534 drain_workqueue(btp->bt_mount->m_buf_workqueue); 1615 drain_workqueue(btp->bt_mount->m_buf_workqueue);
1535 1616
1536 /* loop until there is nothing left on the lru list. */ 1617 /* loop until there is nothing left on the lru list. */
@@ -1627,6 +1708,8 @@ xfs_free_buftarg(
1627 struct xfs_buftarg *btp) 1708 struct xfs_buftarg *btp)
1628{ 1709{
1629 unregister_shrinker(&btp->bt_shrinker); 1710 unregister_shrinker(&btp->bt_shrinker);
1711 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
1712 percpu_counter_destroy(&btp->bt_io_count);
1630 list_lru_destroy(&btp->bt_lru); 1713 list_lru_destroy(&btp->bt_lru);
1631 1714
1632 if (mp->m_flags & XFS_MOUNT_BARRIER) 1715 if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1691,6 +1774,9 @@ xfs_alloc_buftarg(
1691 if (list_lru_init(&btp->bt_lru)) 1774 if (list_lru_init(&btp->bt_lru))
1692 goto error; 1775 goto error;
1693 1776
1777 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
1778 goto error;
1779
1694 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; 1780 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
1695 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; 1781 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
1696 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1782 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1774,18 +1860,33 @@ xfs_buf_cmp(
1774 return 0; 1860 return 0;
1775} 1861}
1776 1862
1863/*
1864 * submit buffers for write.
1865 *
1866 * When we have a large buffer list, we do not want to hold all the buffers
1867 * locked while we block on the request queue waiting for IO dispatch. To avoid
1868 * this problem, we lock and submit buffers in groups of 50, thereby minimising
1869 * the lock hold times for lists which may contain thousands of objects.
1870 *
1871 * To do this, we sort the buffer list before we walk the list to lock and
1872 * submit buffers, and we plug and unplug around each group of buffers we
1873 * submit.
1874 */
1777static int 1875static int
1778__xfs_buf_delwri_submit( 1876xfs_buf_delwri_submit_buffers(
1779 struct list_head *buffer_list, 1877 struct list_head *buffer_list,
1780 struct list_head *io_list, 1878 struct list_head *wait_list)
1781 bool wait)
1782{ 1879{
1783 struct blk_plug plug;
1784 struct xfs_buf *bp, *n; 1880 struct xfs_buf *bp, *n;
1881 LIST_HEAD (submit_list);
1785 int pinned = 0; 1882 int pinned = 0;
1883 struct blk_plug plug;
1786 1884
1885 list_sort(NULL, buffer_list, xfs_buf_cmp);
1886
1887 blk_start_plug(&plug);
1787 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 1888 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1788 if (!wait) { 1889 if (!wait_list) {
1789 if (xfs_buf_ispinned(bp)) { 1890 if (xfs_buf_ispinned(bp)) {
1790 pinned++; 1891 pinned++;
1791 continue; 1892 continue;
@@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit(
1808 continue; 1909 continue;
1809 } 1910 }
1810 1911
1811 list_move_tail(&bp->b_list, io_list);
1812 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1912 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1813 }
1814
1815 list_sort(NULL, io_list, xfs_buf_cmp);
1816
1817 blk_start_plug(&plug);
1818 list_for_each_entry_safe(bp, n, io_list, b_list) {
1819 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
1820 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1821 1913
1822 /* 1914 /*
1823 * we do all Io submission async. This means if we need to wait 1915 * We do all IO submission async. This means if we need
1824 * for IO completion we need to take an extra reference so the 1916 * to wait for IO completion we need to take an extra
1825 * buffer is still valid on the other side. 1917 * reference so the buffer is still valid on the other
1918 * side. We need to move the buffer onto the io_list
1919 * at this point so the caller can still access it.
1826 */ 1920 */
1827 if (wait) 1921 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
1922 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1923 if (wait_list) {
1828 xfs_buf_hold(bp); 1924 xfs_buf_hold(bp);
1829 else 1925 list_move_tail(&bp->b_list, wait_list);
1926 } else
1830 list_del_init(&bp->b_list); 1927 list_del_init(&bp->b_list);
1831 1928
1832 xfs_buf_submit(bp); 1929 xfs_buf_submit(bp);
@@ -1849,8 +1946,7 @@ int
1849xfs_buf_delwri_submit_nowait( 1946xfs_buf_delwri_submit_nowait(
1850 struct list_head *buffer_list) 1947 struct list_head *buffer_list)
1851{ 1948{
1852 LIST_HEAD (io_list); 1949 return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
1853 return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1854} 1950}
1855 1951
1856/* 1952/*
@@ -1865,15 +1961,15 @@ int
1865xfs_buf_delwri_submit( 1961xfs_buf_delwri_submit(
1866 struct list_head *buffer_list) 1962 struct list_head *buffer_list)
1867{ 1963{
1868 LIST_HEAD (io_list); 1964 LIST_HEAD (wait_list);
1869 int error = 0, error2; 1965 int error = 0, error2;
1870 struct xfs_buf *bp; 1966 struct xfs_buf *bp;
1871 1967
1872 __xfs_buf_delwri_submit(buffer_list, &io_list, true); 1968 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
1873 1969
1874 /* Wait for IO to complete. */ 1970 /* Wait for IO to complete. */
1875 while (!list_empty(&io_list)) { 1971 while (!list_empty(&wait_list)) {
1876 bp = list_first_entry(&io_list, struct xfs_buf, b_list); 1972 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1877 1973
1878 list_del_init(&bp->b_list); 1974 list_del_init(&bp->b_list);
1879 1975
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8bfb974f0772..1c2e52b2d926 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -43,6 +43,7 @@ typedef enum {
43#define XBF_READ (1 << 0) /* buffer intended for reading from device */ 43#define XBF_READ (1 << 0) /* buffer intended for reading from device */
44#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ 44#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
45#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ 45#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
46#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
46#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ 47#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
47#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 48#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
48#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ 49#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
@@ -62,6 +63,7 @@ typedef enum {
62#define _XBF_KMEM (1 << 21)/* backed by heap memory */ 63#define _XBF_KMEM (1 << 21)/* backed by heap memory */
63#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ 64#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
64#define _XBF_COMPOUND (1 << 23)/* compound buffer */ 65#define _XBF_COMPOUND (1 << 23)/* compound buffer */
66#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
65 67
66typedef unsigned int xfs_buf_flags_t; 68typedef unsigned int xfs_buf_flags_t;
67 69
@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
81 { _XBF_PAGES, "PAGES" }, \ 83 { _XBF_PAGES, "PAGES" }, \
82 { _XBF_KMEM, "KMEM" }, \ 84 { _XBF_KMEM, "KMEM" }, \
83 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 85 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
84 { _XBF_COMPOUND, "COMPOUND" } 86 { _XBF_COMPOUND, "COMPOUND" }, \
87 { _XBF_IN_FLIGHT, "IN_FLIGHT" }
85 88
86 89
87/* 90/*
@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
115 /* LRU control structures */ 118 /* LRU control structures */
116 struct shrinker bt_shrinker; 119 struct shrinker bt_shrinker;
117 struct list_lru bt_lru; 120 struct list_lru bt_lru;
121
122 struct percpu_counter bt_io_count;
118} xfs_buftarg_t; 123} xfs_buftarg_t;
119 124
120struct xfs_buf; 125struct xfs_buf;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 34257992934c..e455f9098d49 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -359,7 +359,7 @@ xfs_buf_item_format(
359 for (i = 0; i < bip->bli_format_count; i++) { 359 for (i = 0; i < bip->bli_format_count; i++) {
360 xfs_buf_item_format_segment(bip, lv, &vecp, offset, 360 xfs_buf_item_format_segment(bip, lv, &vecp, offset,
361 &bip->bli_formats[i]); 361 &bip->bli_formats[i]);
362 offset += bp->b_maps[i].bm_len; 362 offset += BBTOB(bp->b_maps[i].bm_len);
363 } 363 }
364 364
365 /* 365 /*
@@ -915,20 +915,28 @@ xfs_buf_item_log(
915 for (i = 0; i < bip->bli_format_count; i++) { 915 for (i = 0; i < bip->bli_format_count; i++) {
916 if (start > last) 916 if (start > last)
917 break; 917 break;
918 end = start + BBTOB(bp->b_maps[i].bm_len); 918 end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
919
920 /* skip to the map that includes the first byte to log */
919 if (first > end) { 921 if (first > end) {
920 start += BBTOB(bp->b_maps[i].bm_len); 922 start += BBTOB(bp->b_maps[i].bm_len);
921 continue; 923 continue;
922 } 924 }
925
926 /*
927 * Trim the range to this segment and mark it in the bitmap.
928 * Note that we must convert buffer offsets to segment relative
929 * offsets (e.g., the first byte of each segment is byte 0 of
930 * that segment).
931 */
923 if (first < start) 932 if (first < start)
924 first = start; 933 first = start;
925 if (end > last) 934 if (end > last)
926 end = last; 935 end = last;
927 936 xfs_buf_item_log_segment(first - start, end - start,
928 xfs_buf_item_log_segment(first, end,
929 &bip->bli_formats[i].blf_data_map[0]); 937 &bip->bli_formats[i].blf_data_map[0]);
930 938
931 start += bp->b_maps[i].bm_len; 939 start += BBTOB(bp->b_maps[i].bm_len);
932 } 940 }
933} 941}
934 942
@@ -949,6 +957,7 @@ xfs_buf_item_free(
949 xfs_buf_log_item_t *bip) 957 xfs_buf_log_item_t *bip)
950{ 958{
951 xfs_buf_item_free_format(bip); 959 xfs_buf_item_free_format(bip);
960 kmem_free(bip->bli_item.li_lv_shadow);
952 kmem_zone_free(xfs_buf_item_zone, bip); 961 kmem_zone_free(xfs_buf_item_zone, bip);
953} 962}
954 963
@@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error(
1073 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 1082 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1074 ASSERT(bp->b_iodone != NULL); 1083 ASSERT(bp->b_iodone != NULL);
1075 1084
1085 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1086
1076 /* 1087 /*
1077 * If the write was asynchronous then no one will be looking for the 1088 * If the write was asynchronous then no one will be looking for the
1078 * error. If this is the first failure of this type, clear the error 1089 * error. If this is the first failure of this type, clear the error
@@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error(
1080 * async write failure at least once, but we also need to set the buffer 1091 * async write failure at least once, but we also need to set the buffer
1081 * up to behave correctly now for repeated failures. 1092 * up to behave correctly now for repeated failures.
1082 */ 1093 */
1083 if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || 1094 if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
1084 bp->b_last_error != bp->b_error) { 1095 bp->b_last_error != bp->b_error) {
1085 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | 1096 bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
1086 XBF_DONE | XBF_WRITE_FAIL);
1087 bp->b_last_error = bp->b_error; 1097 bp->b_last_error = bp->b_error;
1088 bp->b_retries = 0; 1098 if (cfg->retry_timeout && !bp->b_first_retry_time)
1089 bp->b_first_retry_time = jiffies; 1099 bp->b_first_retry_time = jiffies;
1090 1100
1091 xfs_buf_ioerror(bp, 0); 1101 xfs_buf_ioerror(bp, 0);
1092 xfs_buf_submit(bp); 1102 xfs_buf_submit(bp);
@@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error(
1097 * Repeated failure on an async write. Take action according to the 1107 * Repeated failure on an async write. Take action according to the
1098 * error configuration we have been set up to use. 1108 * error configuration we have been set up to use.
1099 */ 1109 */
1100 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1101 1110
1102 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1111 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1103 ++bp->b_retries > cfg->max_retries) 1112 ++bp->b_retries > cfg->max_retries)
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index e0646659ce16..ccb0811963b2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
74{ 74{
75 ASSERT(list_empty(&dqp->q_lru)); 75 ASSERT(list_empty(&dqp->q_lru));
76 76
77 kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
77 mutex_destroy(&dqp->q_qlock); 78 mutex_destroy(&dqp->q_qlock);
78 79
79 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); 80 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 814cff94e78f..2c7a1629e064 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
370 spin_lock(&ailp->xa_lock); 370 spin_lock(&ailp->xa_lock);
371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); 371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
372 372
373 kmem_free(qfs->qql_item.li_lv_shadow);
374 kmem_free(lip->li_lv_shadow);
373 kmem_free(qfs); 375 kmem_free(qfs);
374 kmem_free(qfe); 376 kmem_free(qfe);
375 return (xfs_lsn_t)-1; 377 return (xfs_lsn_t)-1;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 88693a98fac5..ed7ee4e8af73 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
55} 55}
56 56
57int 57int
58xfs_errortag_add(int error_tag, xfs_mount_t *mp) 58xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
59{ 59{
60 int i; 60 int i;
61 int len; 61 int len;
62 int64_t fsid; 62 int64_t fsid;
63 63
64 if (error_tag >= XFS_ERRTAG_MAX)
65 return -EINVAL;
66
64 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); 67 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
65 68
66 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 69 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 4ed3042a0f16..2e4f67f68856 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
128 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 128 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
129 (rf)))) 129 (rf))))
130 130
131extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); 131extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
132extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); 132extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
133#else 133#else
134#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 134#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 4aa0153214f9..ab779460ecbf 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,6 +40,7 @@ void
40xfs_efi_item_free( 40xfs_efi_item_free(
41 struct xfs_efi_log_item *efip) 41 struct xfs_efi_log_item *efip)
42{ 42{
43 kmem_free(efip->efi_item.li_lv_shadow);
43 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) 44 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
44 kmem_free(efip); 45 kmem_free(efip);
45 else 46 else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
300STATIC void 301STATIC void
301xfs_efd_item_free(struct xfs_efd_log_item *efdp) 302xfs_efd_item_free(struct xfs_efd_log_item *efdp)
302{ 303{
304 kmem_free(efdp->efd_item.li_lv_shadow);
303 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) 305 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
304 kmem_free(efdp); 306 kmem_free(efdp);
305 else 307 else
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1b3dc9dd8861..ed95e5bb04e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
37#include "xfs_log.h" 37#include "xfs_log.h"
38#include "xfs_icache.h" 38#include "xfs_icache.h"
39#include "xfs_pnfs.h" 39#include "xfs_pnfs.h"
40#include "xfs_iomap.h"
40 41
41#include <linux/dcache.h> 42#include <linux/dcache.h>
42#include <linux/falloc.h> 43#include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
80} 81}
81 82
82/* 83/*
83 * xfs_iozero clears the specified range supplied via the page cache (except in 84 * Clear the specified ranges to zero through either the pagecache or DAX.
84 * the DAX case). Writes through the page cache will allocate blocks over holes, 85 * Holes and unwritten extents will be left as-is as they already are zeroed.
85 * though the callers usually map the holes first and avoid them. If a block is
86 * not completely zeroed, then it will be read from disk before being partially
87 * zeroed.
88 *
89 * In the DAX case, we can just directly write to the underlying pages. This
90 * will not allocate blocks, but will avoid holes and unwritten extents and so
91 * not do unnecessary work.
92 */ 86 */
93int 87int
94xfs_iozero( 88xfs_zero_range(
95 struct xfs_inode *ip, /* inode */ 89 struct xfs_inode *ip,
96 loff_t pos, /* offset in file */ 90 xfs_off_t pos,
97 size_t count) /* size of data to zero */ 91 xfs_off_t count,
92 bool *did_zero)
98{ 93{
99 struct page *page; 94 return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
100 struct address_space *mapping;
101 int status = 0;
102
103
104 mapping = VFS_I(ip)->i_mapping;
105 do {
106 unsigned offset, bytes;
107 void *fsdata;
108
109 offset = (pos & (PAGE_SIZE -1)); /* Within page */
110 bytes = PAGE_SIZE - offset;
111 if (bytes > count)
112 bytes = count;
113
114 if (IS_DAX(VFS_I(ip))) {
115 status = dax_zero_page_range(VFS_I(ip), pos, bytes,
116 xfs_get_blocks_direct);
117 if (status)
118 break;
119 } else {
120 status = pagecache_write_begin(NULL, mapping, pos, bytes,
121 AOP_FLAG_UNINTERRUPTIBLE,
122 &page, &fsdata);
123 if (status)
124 break;
125
126 zero_user(page, offset, bytes);
127
128 status = pagecache_write_end(NULL, mapping, pos, bytes,
129 bytes, page, fsdata);
130 WARN_ON(status <= 0); /* can't return less than zero! */
131 status = 0;
132 }
133 pos += bytes;
134 count -= bytes;
135 } while (count);
136
137 return status;
138} 95}
139 96
140int 97int
@@ -282,48 +239,35 @@ xfs_file_fsync(
282} 239}
283 240
284STATIC ssize_t 241STATIC ssize_t
285xfs_file_read_iter( 242xfs_file_dio_aio_read(
286 struct kiocb *iocb, 243 struct kiocb *iocb,
287 struct iov_iter *to) 244 struct iov_iter *to)
288{ 245{
289 struct file *file = iocb->ki_filp; 246 struct address_space *mapping = iocb->ki_filp->f_mapping;
290 struct inode *inode = file->f_mapping->host; 247 struct inode *inode = mapping->host;
291 struct xfs_inode *ip = XFS_I(inode); 248 struct xfs_inode *ip = XFS_I(inode);
292 struct xfs_mount *mp = ip->i_mount; 249 loff_t isize = i_size_read(inode);
293 size_t size = iov_iter_count(to); 250 size_t count = iov_iter_count(to);
251 struct iov_iter data;
252 struct xfs_buftarg *target;
294 ssize_t ret = 0; 253 ssize_t ret = 0;
295 int ioflags = 0;
296 xfs_fsize_t n;
297 loff_t pos = iocb->ki_pos;
298 254
299 XFS_STATS_INC(mp, xs_read_calls); 255 trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
300
301 if (unlikely(iocb->ki_flags & IOCB_DIRECT))
302 ioflags |= XFS_IO_ISDIRECT;
303 if (file->f_mode & FMODE_NOCMTIME)
304 ioflags |= XFS_IO_INVIS;
305
306 if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
307 xfs_buftarg_t *target =
308 XFS_IS_REALTIME_INODE(ip) ?
309 mp->m_rtdev_targp : mp->m_ddev_targp;
310 /* DIO must be aligned to device logical sector size */
311 if ((pos | size) & target->bt_logical_sectormask) {
312 if (pos == i_size_read(inode))
313 return 0;
314 return -EINVAL;
315 }
316 }
317 256
318 n = mp->m_super->s_maxbytes - pos; 257 if (!count)
319 if (n <= 0 || size == 0) 258 return 0; /* skip atime */
320 return 0;
321 259
322 if (n < size) 260 if (XFS_IS_REALTIME_INODE(ip))
323 size = n; 261 target = ip->i_mount->m_rtdev_targp;
262 else
263 target = ip->i_mount->m_ddev_targp;
324 264
325 if (XFS_FORCED_SHUTDOWN(mp)) 265 /* DIO must be aligned to device logical sector size */
326 return -EIO; 266 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
267 if (iocb->ki_pos == isize)
268 return 0;
269 return -EINVAL;
270 }
327 271
328 /* 272 /*
329 * Locking is a bit tricky here. If we take an exclusive lock for direct 273 * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
336 * serialisation. 280 * serialisation.
337 */ 281 */
338 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 282 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
339 if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { 283 if (mapping->nrpages) {
340 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 284 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
341 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 285 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
342 286
@@ -351,8 +295,8 @@ xfs_file_read_iter(
351 * flush and reduce the chances of repeated iolock cycles going 295 * flush and reduce the chances of repeated iolock cycles going
352 * forward. 296 * forward.
353 */ 297 */
354 if (inode->i_mapping->nrpages) { 298 if (mapping->nrpages) {
355 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); 299 ret = filemap_write_and_wait(mapping);
356 if (ret) { 300 if (ret) {
357 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); 301 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
358 return ret; 302 return ret;
@@ -363,20 +307,95 @@ xfs_file_read_iter(
363 * we fail to invalidate a page, but this should never 307 * we fail to invalidate a page, but this should never
364 * happen on XFS. Warn if it does fail. 308 * happen on XFS. Warn if it does fail.
365 */ 309 */
366 ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); 310 ret = invalidate_inode_pages2(mapping);
367 WARN_ON_ONCE(ret); 311 WARN_ON_ONCE(ret);
368 ret = 0; 312 ret = 0;
369 } 313 }
370 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 314 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
371 } 315 }
372 316
373 trace_xfs_file_read(ip, size, pos, ioflags); 317 data = *to;
318 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
319 xfs_get_blocks_direct, NULL, NULL, 0);
320 if (ret > 0) {
321 iocb->ki_pos += ret;
322 iov_iter_advance(to, ret);
323 }
324 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
374 325
326 file_accessed(iocb->ki_filp);
327 return ret;
328}
329
330static noinline ssize_t
331xfs_file_dax_read(
332 struct kiocb *iocb,
333 struct iov_iter *to)
334{
335 struct address_space *mapping = iocb->ki_filp->f_mapping;
336 struct inode *inode = mapping->host;
337 struct xfs_inode *ip = XFS_I(inode);
338 struct iov_iter data = *to;
339 size_t count = iov_iter_count(to);
340 ssize_t ret = 0;
341
342 trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
343
344 if (!count)
345 return 0; /* skip atime */
346
347 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
348 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
349 if (ret > 0) {
350 iocb->ki_pos += ret;
351 iov_iter_advance(to, ret);
352 }
353 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
354
355 file_accessed(iocb->ki_filp);
356 return ret;
357}
358
359STATIC ssize_t
360xfs_file_buffered_aio_read(
361 struct kiocb *iocb,
362 struct iov_iter *to)
363{
364 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
365 ssize_t ret;
366
367 trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
368
369 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
375 ret = generic_file_read_iter(iocb, to); 370 ret = generic_file_read_iter(iocb, to);
371 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
372
373 return ret;
374}
375
376STATIC ssize_t
377xfs_file_read_iter(
378 struct kiocb *iocb,
379 struct iov_iter *to)
380{
381 struct inode *inode = file_inode(iocb->ki_filp);
382 struct xfs_mount *mp = XFS_I(inode)->i_mount;
383 ssize_t ret = 0;
384
385 XFS_STATS_INC(mp, xs_read_calls);
386
387 if (XFS_FORCED_SHUTDOWN(mp))
388 return -EIO;
389
390 if (IS_DAX(inode))
391 ret = xfs_file_dax_read(iocb, to);
392 else if (iocb->ki_flags & IOCB_DIRECT)
393 ret = xfs_file_dio_aio_read(iocb, to);
394 else
395 ret = xfs_file_buffered_aio_read(iocb, to);
396
376 if (ret > 0) 397 if (ret > 0)
377 XFS_STATS_ADD(mp, xs_read_bytes, ret); 398 XFS_STATS_ADD(mp, xs_read_bytes, ret);
378
379 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
380 return ret; 399 return ret;
381} 400}
382 401
@@ -389,18 +408,14 @@ xfs_file_splice_read(
389 unsigned int flags) 408 unsigned int flags)
390{ 409{
391 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); 410 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
392 int ioflags = 0;
393 ssize_t ret; 411 ssize_t ret;
394 412
395 XFS_STATS_INC(ip->i_mount, xs_read_calls); 413 XFS_STATS_INC(ip->i_mount, xs_read_calls);
396 414
397 if (infilp->f_mode & FMODE_NOCMTIME)
398 ioflags |= XFS_IO_INVIS;
399
400 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 415 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
401 return -EIO; 416 return -EIO;
402 417
403 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 418 trace_xfs_file_splice_read(ip, count, *ppos);
404 419
405 /* 420 /*
406 * DAX inodes cannot use the page cache for splice, so we have to push 421
@@ -424,49 +439,6 @@ out:
424} 439}
425 440
426/* 441/*
427 * This routine is called to handle zeroing any space in the last block of the
428 * file that is beyond the EOF. We do this since the size is being increased
429 * without writing anything to that block and we don't want to read the
430 * garbage on the disk.
431 */
432STATIC int /* error (positive) */
433xfs_zero_last_block(
434 struct xfs_inode *ip,
435 xfs_fsize_t offset,
436 xfs_fsize_t isize,
437 bool *did_zeroing)
438{
439 struct xfs_mount *mp = ip->i_mount;
440 xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
441 int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
442 int zero_len;
443 int nimaps = 1;
444 int error = 0;
445 struct xfs_bmbt_irec imap;
446
447 xfs_ilock(ip, XFS_ILOCK_EXCL);
448 error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
449 xfs_iunlock(ip, XFS_ILOCK_EXCL);
450 if (error)
451 return error;
452
453 ASSERT(nimaps > 0);
454
455 /*
456 * If the block underlying isize is just a hole, then there
457 * is nothing to zero.
458 */
459 if (imap.br_startblock == HOLESTARTBLOCK)
460 return 0;
461
462 zero_len = mp->m_sb.sb_blocksize - zero_offset;
463 if (isize + zero_len > offset)
464 zero_len = offset - isize;
465 *did_zeroing = true;
466 return xfs_iozero(ip, isize, zero_len);
467}
468
469/*
470 * Zero any on disk space between the current EOF and the new, larger EOF. 442 * Zero any on disk space between the current EOF and the new, larger EOF.
471 * 443 *
472 * This handles the normal case of zeroing the remainder of the last block in 444 * This handles the normal case of zeroing the remainder of the last block in
@@ -484,94 +456,11 @@ xfs_zero_eof(
484 xfs_fsize_t isize, /* current inode size */ 456 xfs_fsize_t isize, /* current inode size */
485 bool *did_zeroing) 457 bool *did_zeroing)
486{ 458{
487 struct xfs_mount *mp = ip->i_mount;
488 xfs_fileoff_t start_zero_fsb;
489 xfs_fileoff_t end_zero_fsb;
490 xfs_fileoff_t zero_count_fsb;
491 xfs_fileoff_t last_fsb;
492 xfs_fileoff_t zero_off;
493 xfs_fsize_t zero_len;
494 int nimaps;
495 int error = 0;
496 struct xfs_bmbt_irec imap;
497
498 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 459 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
499 ASSERT(offset > isize); 460 ASSERT(offset > isize);
500 461
501 trace_xfs_zero_eof(ip, isize, offset - isize); 462 trace_xfs_zero_eof(ip, isize, offset - isize);
502 463 return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
503 /*
504 * First handle zeroing the block on which isize resides.
505 *
506 * We only zero a part of that block so it is handled specially.
507 */
508 if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
509 error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
510 if (error)
511 return error;
512 }
513
514 /*
515 * Calculate the range between the new size and the old where blocks
516 * needing to be zeroed may exist.
517 *
518 * To get the block where the last byte in the file currently resides,
519 * we need to subtract one from the size and truncate back to a block
520 * boundary. We subtract 1 in case the size is exactly on a block
521 * boundary.
522 */
523 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
524 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
525 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
526 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
527 if (last_fsb == end_zero_fsb) {
528 /*
529 * The size was only incremented on its last block.
530 * We took care of that above, so just return.
531 */
532 return 0;
533 }
534
535 ASSERT(start_zero_fsb <= end_zero_fsb);
536 while (start_zero_fsb <= end_zero_fsb) {
537 nimaps = 1;
538 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
539
540 xfs_ilock(ip, XFS_ILOCK_EXCL);
541 error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
542 &imap, &nimaps, 0);
543 xfs_iunlock(ip, XFS_ILOCK_EXCL);
544 if (error)
545 return error;
546
547 ASSERT(nimaps > 0);
548
549 if (imap.br_state == XFS_EXT_UNWRITTEN ||
550 imap.br_startblock == HOLESTARTBLOCK) {
551 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
552 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
553 continue;
554 }
555
556 /*
557 * There are blocks we need to zero.
558 */
559 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
560 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
561
562 if ((zero_off + zero_len) > offset)
563 zero_len = offset - zero_off;
564
565 error = xfs_iozero(ip, zero_off, zero_len);
566 if (error)
567 return error;
568
569 *did_zeroing = true;
570 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
571 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
572 }
573
574 return 0;
575} 464}
576 465
577/* 466/*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
722 mp->m_rtdev_targp : mp->m_ddev_targp; 611 mp->m_rtdev_targp : mp->m_ddev_targp;
723 612
724 /* DIO must be aligned to device logical sector size */ 613 /* DIO must be aligned to device logical sector size */
725 if (!IS_DAX(inode) && 614 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
726 ((iocb->ki_pos | count) & target->bt_logical_sectormask))
727 return -EINVAL; 615 return -EINVAL;
728 616
729 /* "unaligned" here means not aligned to a filesystem block */ 617 /* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
762 end = iocb->ki_pos + count - 1; 650 end = iocb->ki_pos + count - 1;
763 651
764 /* 652 /*
765 * See xfs_file_read_iter() for why we do a full-file flush here. 653 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
766 */ 654 */
767 if (mapping->nrpages) { 655 if (mapping->nrpages) {
768 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); 656 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
789 iolock = XFS_IOLOCK_SHARED; 677 iolock = XFS_IOLOCK_SHARED;
790 } 678 }
791 679
792 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 680 trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
793 681
794 data = *from; 682 data = *from;
795 ret = mapping->a_ops->direct_IO(iocb, &data); 683 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
684 xfs_get_blocks_direct, xfs_end_io_direct_write,
685 NULL, DIO_ASYNC_EXTEND);
796 686
797 /* see generic_file_direct_write() for why this is necessary */ 687 /* see generic_file_direct_write() for why this is necessary */
798 if (mapping->nrpages) { 688 if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
809 xfs_rw_iunlock(ip, iolock); 699 xfs_rw_iunlock(ip, iolock);
810 700
811 /* 701 /*
812 * No fallback to buffered IO on errors for XFS. DAX can result in 702 * No fallback to buffered IO on errors for XFS, direct IO will either
813 * partial writes, but direct IO will either complete fully or fail. 703 * complete fully or fail.
814 */ 704 */
815 ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); 705 ASSERT(ret < 0 || ret == count);
706 return ret;
707}
708
709static noinline ssize_t
710xfs_file_dax_write(
711 struct kiocb *iocb,
712 struct iov_iter *from)
713{
714 struct address_space *mapping = iocb->ki_filp->f_mapping;
715 struct inode *inode = mapping->host;
716 struct xfs_inode *ip = XFS_I(inode);
717 struct xfs_mount *mp = ip->i_mount;
718 ssize_t ret = 0;
719 int unaligned_io = 0;
720 int iolock;
721 struct iov_iter data;
722
723 /* "unaligned" here means not aligned to a filesystem block */
724 if ((iocb->ki_pos & mp->m_blockmask) ||
725 ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
726 unaligned_io = 1;
727 iolock = XFS_IOLOCK_EXCL;
728 } else if (mapping->nrpages) {
729 iolock = XFS_IOLOCK_EXCL;
730 } else {
731 iolock = XFS_IOLOCK_SHARED;
732 }
733 xfs_rw_ilock(ip, iolock);
734
735 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
736 if (ret)
737 goto out;
738
739 /*
740 * Yes, even DAX files can have page cache attached to them: A zeroed
741 * page is inserted into the pagecache when we have to serve a write
742 * fault on a hole. It should never be dirtied and can simply be
743 * dropped from the pagecache once we get real data for the page.
744 */
745 if (mapping->nrpages) {
746 ret = invalidate_inode_pages2(mapping);
747 WARN_ON_ONCE(ret);
748 }
749
750 if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
751 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
752 iolock = XFS_IOLOCK_SHARED;
753 }
754
755 trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
756
757 data = *from;
758 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
759 xfs_end_io_direct_write, 0);
760 if (ret > 0) {
761 iocb->ki_pos += ret;
762 iov_iter_advance(from, ret);
763 }
764out:
765 xfs_rw_iunlock(ip, iolock);
816 return ret; 766 return ret;
817} 767}
818 768
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
839 current->backing_dev_info = inode_to_bdi(inode); 789 current->backing_dev_info = inode_to_bdi(inode);
840 790
841write_retry: 791write_retry:
842 trace_xfs_file_buffered_write(ip, iov_iter_count(from), 792 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
843 iocb->ki_pos, 0); 793 ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
844 ret = generic_perform_write(file, from, iocb->ki_pos);
845 if (likely(ret >= 0)) 794 if (likely(ret >= 0))
846 iocb->ki_pos += ret; 795 iocb->ki_pos += ret;
847 796
@@ -895,7 +844,9 @@ xfs_file_write_iter(
895 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 844 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
896 return -EIO; 845 return -EIO;
897 846
898 if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) 847 if (IS_DAX(inode))
848 ret = xfs_file_dax_write(iocb, from);
849 else if (iocb->ki_flags & IOCB_DIRECT)
899 ret = xfs_file_dio_aio_write(iocb, from); 850 ret = xfs_file_dio_aio_write(iocb, from);
900 else 851 else
901 ret = xfs_file_buffered_aio_write(iocb, from); 852 ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
1553 if (IS_DAX(inode)) { 1504 if (IS_DAX(inode)) {
1554 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); 1505 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
1555 } else { 1506 } else {
1556 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); 1507 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
1557 ret = block_page_mkwrite_return(ret); 1508 ret = block_page_mkwrite_return(ret);
1558 } 1509 }
1559 1510
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b4d75825ae37..7191c3878b4a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -667,8 +667,11 @@ xfs_reserve_blocks(
667 __uint64_t *inval, 667 __uint64_t *inval,
668 xfs_fsop_resblks_t *outval) 668 xfs_fsop_resblks_t *outval)
669{ 669{
670 __int64_t lcounter, delta, fdblks_delta; 670 __int64_t lcounter, delta;
671 __int64_t fdblks_delta = 0;
671 __uint64_t request; 672 __uint64_t request;
673 __int64_t free;
674 int error = 0;
672 675
673 /* If inval is null, report current values and return */ 676 /* If inval is null, report current values and return */
674 if (inval == (__uint64_t *)NULL) { 677 if (inval == (__uint64_t *)NULL) {
@@ -682,24 +685,23 @@ xfs_reserve_blocks(
682 request = *inval; 685 request = *inval;
683 686
684 /* 687 /*
685 * With per-cpu counters, this becomes an interesting 688 * With per-cpu counters, this becomes an interesting problem. we need
686 * problem. we needto work out if we are freeing or allocation 689 to work out if we are freeing or allocating blocks first, then we can
687 * blocks first, then we can do the modification as necessary. 690 * do the modification as necessary.
688 * 691 *
689 * We do this under the m_sb_lock so that if we are near 692 * We do this under the m_sb_lock so that if we are near ENOSPC, we will
690 * ENOSPC, we will hold out any changes while we work out 693 * hold out any changes while we work out what to do. This means that
691 * what to do. This means that the amount of free space can 694 * the amount of free space can change while we do this, so we need to
692 * change while we do this, so we need to retry if we end up 695 * retry if we end up trying to reserve more space than is available.
693 * trying to reserve more space than is available.
694 */ 696 */
695retry:
696 spin_lock(&mp->m_sb_lock); 697 spin_lock(&mp->m_sb_lock);
697 698
698 /* 699 /*
699 * If our previous reservation was larger than the current value, 700 * If our previous reservation was larger than the current value,
700 * then move any unused blocks back to the free pool. 701 * then move any unused blocks back to the free pool. Modify the resblks
702 * counters directly since we shouldn't have any problems unreserving
703 * space.
701 */ 704 */
702 fdblks_delta = 0;
703 if (mp->m_resblks > request) { 705 if (mp->m_resblks > request) {
704 lcounter = mp->m_resblks_avail - request; 706 lcounter = mp->m_resblks_avail - request;
705 if (lcounter > 0) { /* release unused blocks */ 707 if (lcounter > 0) { /* release unused blocks */
@@ -707,54 +709,67 @@ retry:
707 mp->m_resblks_avail -= lcounter; 709 mp->m_resblks_avail -= lcounter;
708 } 710 }
709 mp->m_resblks = request; 711 mp->m_resblks = request;
710 } else { 712 if (fdblks_delta) {
711 __int64_t free; 713 spin_unlock(&mp->m_sb_lock);
714 error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
715 spin_lock(&mp->m_sb_lock);
716 }
717
718 goto out;
719 }
712 720
721 /*
722 * If the request is larger than the current reservation, reserve the
723 * blocks before we update the reserve counters. Sample m_fdblocks and
724 * perform a partial reservation if the request exceeds free space.
725 */
726 error = -ENOSPC;
727 do {
713 free = percpu_counter_sum(&mp->m_fdblocks) - 728 free = percpu_counter_sum(&mp->m_fdblocks) -
714 XFS_ALLOC_SET_ASIDE(mp); 729 XFS_ALLOC_SET_ASIDE(mp);
715 if (!free) 730 if (!free)
716 goto out; /* ENOSPC and fdblks_delta = 0 */ 731 break;
717 732
718 delta = request - mp->m_resblks; 733 delta = request - mp->m_resblks;
719 lcounter = free - delta; 734 lcounter = free - delta;
720 if (lcounter < 0) { 735 if (lcounter < 0)
721 /* We can't satisfy the request, just get what we can */ 736 /* We can't satisfy the request, just get what we can */
722 mp->m_resblks += free; 737 fdblks_delta = free;
723 mp->m_resblks_avail += free; 738 else
724 fdblks_delta = -free; 739 fdblks_delta = delta;
725 } else {
726 fdblks_delta = -delta;
727 mp->m_resblks = request;
728 mp->m_resblks_avail += delta;
729 }
730 }
731out:
732 if (outval) {
733 outval->resblks = mp->m_resblks;
734 outval->resblks_avail = mp->m_resblks_avail;
735 }
736 spin_unlock(&mp->m_sb_lock);
737 740
738 if (fdblks_delta) {
739 /* 741 /*
740 * If we are putting blocks back here, m_resblks_avail is 742 * We'll either succeed in getting space from the free block
741 * already at its max so this will put it in the free pool. 743 * count or we'll get an ENOSPC. If we get a ENOSPC, it means
742 * 744 * things changed while we were calculating fdblks_delta and so
743 * If we need space, we'll either succeed in getting it 745 * we should try again to see if there is anything left to
744 * from the free block count or we'll get an enospc. If 746 * reserve.
745 * we get a ENOSPC, it means things changed while we were
746 * calculating fdblks_delta and so we should try again to
747 * see if there is anything left to reserve.
748 * 747 *
749 * Don't set the reserved flag here - we don't want to reserve 748 * Don't set the reserved flag here - we don't want to reserve
750 * the extra reserve blocks from the reserve..... 749 * the extra reserve blocks from the reserve.....
751 */ 750 */
752 int error; 751 spin_unlock(&mp->m_sb_lock);
753 error = xfs_mod_fdblocks(mp, fdblks_delta, 0); 752 error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
754 if (error == -ENOSPC) 753 spin_lock(&mp->m_sb_lock);
755 goto retry; 754 } while (error == -ENOSPC);
755
756 /*
757 * Update the reserve counters if blocks have been successfully
758 * allocated.
759 */
760 if (!error && fdblks_delta) {
761 mp->m_resblks += fdblks_delta;
762 mp->m_resblks_avail += fdblks_delta;
756 } 763 }
757 return 0; 764
765out:
766 if (outval) {
767 outval->resblks = mp->m_resblks;
768 outval->resblks_avail = mp->m_resblks_avail;
769 }
770
771 spin_unlock(&mp->m_sb_lock);
772 return error;
758} 773}
759 774
760int 775int
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 99ee6eee5e0b..fb39a66914dd 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -765,7 +765,7 @@ restart:
765 * Background scanning to trim post-EOF preallocated space. This is queued 765 * Background scanning to trim post-EOF preallocated space. This is queued
766 * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 766 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
767 */ 767 */
768STATIC void 768void
769xfs_queue_eofblocks( 769xfs_queue_eofblocks(
770 struct xfs_mount *mp) 770 struct xfs_mount *mp)
771{ 771{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 62f1f91c32cb..05bac99bef75 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
68int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 68int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
69int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); 69int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
70void xfs_eofblocks_worker(struct work_struct *); 70void xfs_eofblocks_worker(struct work_struct *);
71void xfs_queue_eofblocks(struct xfs_mount *);
71 72
72int xfs_inode_ag_iterator(struct xfs_mount *mp, 73int xfs_inode_ag_iterator(struct xfs_mount *mp,
73 int (*execute)(struct xfs_inode *ip, int flags, void *args), 74 int (*execute)(struct xfs_inode *ip, int flags, void *args),
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ee6799e0476f..8825bcfd314c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
431 * lock more than one at a time, lockdep will report false positives saying we 431 * lock more than one at a time, lockdep will report false positives saying we
432 * have violated locking orders. 432 * have violated locking orders.
433 */ 433 */
434void 434static void
435xfs_lock_inodes( 435xfs_lock_inodes(
436 xfs_inode_t **ips, 436 xfs_inode_t **ips,
437 int inodes, 437 int inodes,
@@ -667,14 +667,6 @@ xfs_ip2xflags(
667 return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); 667 return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
668} 668}
669 669
670uint
671xfs_dic2xflags(
672 struct xfs_dinode *dip)
673{
674 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
675 be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
676}
677
678/* 670/*
679 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match 671 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
680 * is allowed, otherwise it has to be an exact match. If a CI match is found, 672 * is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -748,7 +740,7 @@ out_unlock:
748 * are not linked into the directory structure - they are attached 740 * are not linked into the directory structure - they are attached
749 * directly to the superblock - and so have no parent. 741 * directly to the superblock - and so have no parent.
750 */ 742 */
751int 743static int
752xfs_ialloc( 744xfs_ialloc(
753 xfs_trans_t *tp, 745 xfs_trans_t *tp,
754 xfs_inode_t *pip, 746 xfs_inode_t *pip,
@@ -1085,7 +1077,7 @@ xfs_dir_ialloc(
1085 * link count to go to zero, move the inode to AGI unlinked list so that it can 1077 * link count to go to zero, move the inode to AGI unlinked list so that it can
1086 * be freed when the last active reference goes away via xfs_inactive(). 1078 * be freed when the last active reference goes away via xfs_inactive().
1087 */ 1079 */
1088int /* error */ 1080static int /* error */
1089xfs_droplink( 1081xfs_droplink(
1090 xfs_trans_t *tp, 1082 xfs_trans_t *tp,
1091 xfs_inode_t *ip) 1083 xfs_inode_t *ip)
@@ -1104,7 +1096,7 @@ xfs_droplink(
1104/* 1096/*
1105 * Increment the link count on an inode & log the change. 1097 * Increment the link count on an inode & log the change.
1106 */ 1098 */
1107int 1099static int
1108xfs_bumplink( 1100xfs_bumplink(
1109 xfs_trans_t *tp, 1101 xfs_trans_t *tp,
1110 xfs_inode_t *ip) 1102 xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e52d7c7aeb5b..8eb78ec4a6e2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
395int xfs_isilocked(xfs_inode_t *, uint); 395int xfs_isilocked(xfs_inode_t *, uint);
396uint xfs_ilock_data_map_shared(struct xfs_inode *); 396uint xfs_ilock_data_map_shared(struct xfs_inode *);
397uint xfs_ilock_attr_map_shared(struct xfs_inode *); 397uint xfs_ilock_attr_map_shared(struct xfs_inode *);
398int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
399 xfs_nlink_t, xfs_dev_t, prid_t, int,
400 struct xfs_buf **, xfs_inode_t **);
401 398
402uint xfs_ip2xflags(struct xfs_inode *); 399uint xfs_ip2xflags(struct xfs_inode *);
403uint xfs_dic2xflags(struct xfs_dinode *);
404int xfs_ifree(struct xfs_trans *, xfs_inode_t *, 400int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
405 struct xfs_bmap_free *); 401 struct xfs_bmap_free *);
406int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, 402int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
@@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *);
411#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 407#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
412 408
413int xfs_iflush(struct xfs_inode *, struct xfs_buf **); 409int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
414void xfs_lock_inodes(xfs_inode_t **, int, uint);
415void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 410void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
416 411
417xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); 412xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
@@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
419int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, 414int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
420 xfs_nlink_t, xfs_dev_t, prid_t, int, 415 xfs_nlink_t, xfs_dev_t, prid_t, int,
421 struct xfs_inode **, int *); 416 struct xfs_inode **, int *);
422int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
423int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
424 417
425/* from xfs_file.c */ 418/* from xfs_file.c */
426enum xfs_prealloc_flags { 419enum xfs_prealloc_flags {
@@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
434 enum xfs_prealloc_flags flags); 427 enum xfs_prealloc_flags flags);
435int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, 428int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
436 xfs_fsize_t isize, bool *did_zeroing); 429 xfs_fsize_t isize, bool *did_zeroing);
437int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); 430int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
431 bool *did_zero);
438loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, 432loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
439 loff_t eof, int whence); 433 loff_t eof, int whence);
440 434
@@ -479,14 +473,4 @@ do { \
479 473
480extern struct kmem_zone *xfs_inode_zone; 474extern struct kmem_zone *xfs_inode_zone;
481 475
482/*
483 * Flags for read/write calls
484 */
485#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
486#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
487
488#define XFS_IO_FLAGS \
489 { XFS_IO_ISDIRECT, "DIRECT" }, \
490 { XFS_IO_INVIS, "INVIS"}
491
492#endif /* __XFS_INODE_H__ */ 476#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a1b07612224c..892c2aced207 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,6 +651,7 @@ void
651xfs_inode_item_destroy( 651xfs_inode_item_destroy(
652 xfs_inode_t *ip) 652 xfs_inode_t *ip)
653{ 653{
654 kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
654 kmem_zone_free(xfs_ili_zone, ip->i_itemp); 655 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
655} 656}
656 657
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 63a6ff2cfc68..9a7c87809d3b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(
595 595
596int 596int
597xfs_ioc_space( 597xfs_ioc_space(
598 struct xfs_inode *ip,
599 struct inode *inode,
600 struct file *filp, 598 struct file *filp,
601 int ioflags,
602 unsigned int cmd, 599 unsigned int cmd,
603 xfs_flock64_t *bf) 600 xfs_flock64_t *bf)
604{ 601{
602 struct inode *inode = file_inode(filp);
603 struct xfs_inode *ip = XFS_I(inode);
605 struct iattr iattr; 604 struct iattr iattr;
606 enum xfs_prealloc_flags flags = 0; 605 enum xfs_prealloc_flags flags = 0;
607 uint iolock = XFS_IOLOCK_EXCL; 606 uint iolock = XFS_IOLOCK_EXCL;
@@ -626,7 +625,7 @@ xfs_ioc_space(
626 625
627 if (filp->f_flags & O_DSYNC) 626 if (filp->f_flags & O_DSYNC)
628 flags |= XFS_PREALLOC_SYNC; 627 flags |= XFS_PREALLOC_SYNC;
629 if (ioflags & XFS_IO_INVIS) 628 if (filp->f_mode & FMODE_NOCMTIME)
630 flags |= XFS_PREALLOC_INVISIBLE; 629 flags |= XFS_PREALLOC_INVISIBLE;
631 630
632 error = mnt_want_write_file(filp); 631 error = mnt_want_write_file(filp);
@@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1464 1463
1465STATIC int 1464STATIC int
1466xfs_ioc_getbmap( 1465xfs_ioc_getbmap(
1467 struct xfs_inode *ip, 1466 struct file *file,
1468 int ioflags,
1469 unsigned int cmd, 1467 unsigned int cmd,
1470 void __user *arg) 1468 void __user *arg)
1471{ 1469{
@@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
1479 return -EINVAL; 1477 return -EINVAL;
1480 1478
1481 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1479 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1482 if (ioflags & XFS_IO_INVIS) 1480 if (file->f_mode & FMODE_NOCMTIME)
1483 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; 1481 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1484 1482
1485 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, 1483 error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
1486 (__force struct getbmap *)arg+1); 1484 (__force struct getbmap *)arg+1);
1487 if (error) 1485 if (error)
1488 return error; 1486 return error;
@@ -1575,6 +1573,11 @@ xfs_ioc_swapext(
1575 goto out_put_tmp_file; 1573 goto out_put_tmp_file;
1576 } 1574 }
1577 1575
1576 /*
1577 * We need to ensure that the fds passed in point to XFS inodes
1578 * before we cast and access them as XFS structures as we have no
1579 * control over what the user passes us here.
1580 */
1578 if (f.file->f_op != &xfs_file_operations || 1581 if (f.file->f_op != &xfs_file_operations ||
1579 tmp.file->f_op != &xfs_file_operations) { 1582 tmp.file->f_op != &xfs_file_operations) {
1580 error = -EINVAL; 1583 error = -EINVAL;
@@ -1625,12 +1628,8 @@ xfs_file_ioctl(
1625 struct xfs_inode *ip = XFS_I(inode); 1628 struct xfs_inode *ip = XFS_I(inode);
1626 struct xfs_mount *mp = ip->i_mount; 1629 struct xfs_mount *mp = ip->i_mount;
1627 void __user *arg = (void __user *)p; 1630 void __user *arg = (void __user *)p;
1628 int ioflags = 0;
1629 int error; 1631 int error;
1630 1632
1631 if (filp->f_mode & FMODE_NOCMTIME)
1632 ioflags |= XFS_IO_INVIS;
1633
1634 trace_xfs_file_ioctl(ip); 1633 trace_xfs_file_ioctl(ip);
1635 1634
1636 switch (cmd) { 1635 switch (cmd) {
@@ -1649,7 +1648,7 @@ xfs_file_ioctl(
1649 1648
1650 if (copy_from_user(&bf, arg, sizeof(bf))) 1649 if (copy_from_user(&bf, arg, sizeof(bf)))
1651 return -EFAULT; 1650 return -EFAULT;
1652 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 1651 return xfs_ioc_space(filp, cmd, &bf);
1653 } 1652 }
1654 case XFS_IOC_DIOINFO: { 1653 case XFS_IOC_DIOINFO: {
1655 struct dioattr da; 1654 struct dioattr da;
@@ -1708,7 +1707,7 @@ xfs_file_ioctl(
1708 1707
1709 case XFS_IOC_GETBMAP: 1708 case XFS_IOC_GETBMAP:
1710 case XFS_IOC_GETBMAPA: 1709 case XFS_IOC_GETBMAPA:
1711 return xfs_ioc_getbmap(ip, ioflags, cmd, arg); 1710 return xfs_ioc_getbmap(filp, cmd, arg);
1712 1711
1713 case XFS_IOC_GETBMAPX: 1712 case XFS_IOC_GETBMAPX:
1714 return xfs_ioc_getbmapx(ip, arg); 1713 return xfs_ioc_getbmapx(ip, arg);
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 77c02c7900b6..8b52881bfd90 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -20,10 +20,7 @@
20 20
21extern int 21extern int
22xfs_ioc_space( 22xfs_ioc_space(
23 struct xfs_inode *ip,
24 struct inode *inode,
25 struct file *filp, 23 struct file *filp,
26 int ioflags,
27 unsigned int cmd, 24 unsigned int cmd,
28 xfs_flock64_t *bf); 25 xfs_flock64_t *bf);
29 26
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1a05d8ae327d..321f57721b92 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
532 struct xfs_inode *ip = XFS_I(inode); 532 struct xfs_inode *ip = XFS_I(inode);
533 struct xfs_mount *mp = ip->i_mount; 533 struct xfs_mount *mp = ip->i_mount;
534 void __user *arg = (void __user *)p; 534 void __user *arg = (void __user *)p;
535 int ioflags = 0;
536 int error; 535 int error;
537 536
538 if (filp->f_mode & FMODE_NOCMTIME)
539 ioflags |= XFS_IO_INVIS;
540
541 trace_xfs_file_compat_ioctl(ip); 537 trace_xfs_file_compat_ioctl(ip);
542 538
543 switch (cmd) { 539 switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
589 if (xfs_compat_flock64_copyin(&bf, arg)) 585 if (xfs_compat_flock64_copyin(&bf, arg))
590 return -EFAULT; 586 return -EFAULT;
591 cmd = _NATIVE_IOC(cmd, struct xfs_flock64); 587 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
592 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 588 return xfs_ioc_space(filp, cmd, &bf);
593 } 589 }
594 case XFS_IOC_FSGEOMETRY_V1_32: 590 case XFS_IOC_FSGEOMETRY_V1_32:
595 return xfs_compat_ioc_fsgeometry_v1(mp, arg); 591 return xfs_compat_ioc_fsgeometry_v1(mp, arg);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 58391355a44d..620fc9120444 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/iomap.h>
18#include "xfs.h" 19#include "xfs.h"
19#include "xfs_fs.h" 20#include "xfs_fs.h"
20#include "xfs_shared.h" 21#include "xfs_shared.h"
@@ -940,3 +941,173 @@ error_on_bmapi_transaction:
940 xfs_iunlock(ip, XFS_ILOCK_EXCL); 941 xfs_iunlock(ip, XFS_ILOCK_EXCL);
941 return error; 942 return error;
942} 943}
944
945void
946xfs_bmbt_to_iomap(
947 struct xfs_inode *ip,
948 struct iomap *iomap,
949 struct xfs_bmbt_irec *imap)
950{
951 struct xfs_mount *mp = ip->i_mount;
952
953 if (imap->br_startblock == HOLESTARTBLOCK) {
954 iomap->blkno = IOMAP_NULL_BLOCK;
955 iomap->type = IOMAP_HOLE;
956 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
957 iomap->blkno = IOMAP_NULL_BLOCK;
958 iomap->type = IOMAP_DELALLOC;
959 } else {
960 iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
961 if (imap->br_state == XFS_EXT_UNWRITTEN)
962 iomap->type = IOMAP_UNWRITTEN;
963 else
964 iomap->type = IOMAP_MAPPED;
965 }
966 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
967 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
968 iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
969}
970
971static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
972{
973 return !nimaps ||
974 imap->br_startblock == HOLESTARTBLOCK ||
975 imap->br_startblock == DELAYSTARTBLOCK;
976}
977
978static int
979xfs_file_iomap_begin(
980 struct inode *inode,
981 loff_t offset,
982 loff_t length,
983 unsigned flags,
984 struct iomap *iomap)
985{
986 struct xfs_inode *ip = XFS_I(inode);
987 struct xfs_mount *mp = ip->i_mount;
988 struct xfs_bmbt_irec imap;
989 xfs_fileoff_t offset_fsb, end_fsb;
990 int nimaps = 1, error = 0;
991
992 if (XFS_FORCED_SHUTDOWN(mp))
993 return -EIO;
994
995 xfs_ilock(ip, XFS_ILOCK_EXCL);
996
997 ASSERT(offset <= mp->m_super->s_maxbytes);
998 if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
999 length = mp->m_super->s_maxbytes - offset;
1000 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1001 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1002
1003 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1004 &nimaps, XFS_BMAPI_ENTIRE);
1005 if (error) {
1006 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1007 return error;
1008 }
1009
1010 if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
1011 /*
1012 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
1013 * pages to keep the chunks of work done where somewhat symmetric
1014 * with the work writeback does. This is a completely arbitrary
1015 * number pulled out of thin air as a best guess for initial
1016 * testing.
1017 *
1018 * Note that the values needs to be less than 32-bits wide until
1019 * the lower level functions are updated.
1020 */
1021 length = min_t(loff_t, length, 1024 * PAGE_SIZE);
1022 if (xfs_get_extsz_hint(ip)) {
1023 /*
1024 * xfs_iomap_write_direct() expects the shared lock. It
1025 * is unlocked on return.
1026 */
1027 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1028 error = xfs_iomap_write_direct(ip, offset, length, &imap,
1029 nimaps);
1030 } else {
1031 error = xfs_iomap_write_delay(ip, offset, length, &imap);
1032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1033 }
1034
1035 if (error)
1036 return error;
1037
1038 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
1039 xfs_bmbt_to_iomap(ip, iomap, &imap);
1040 } else if (nimaps) {
1041 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1042 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
1043 xfs_bmbt_to_iomap(ip, iomap, &imap);
1044 } else {
1045 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1046 trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
1047 iomap->blkno = IOMAP_NULL_BLOCK;
1048 iomap->type = IOMAP_HOLE;
1049 iomap->offset = offset;
1050 iomap->length = length;
1051 }
1052
1053 return 0;
1054}
1055
1056static int
1057xfs_file_iomap_end_delalloc(
1058 struct xfs_inode *ip,
1059 loff_t offset,
1060 loff_t length,
1061 ssize_t written)
1062{
1063 struct xfs_mount *mp = ip->i_mount;
1064 xfs_fileoff_t start_fsb;
1065 xfs_fileoff_t end_fsb;
1066 int error = 0;
1067
1068 start_fsb = XFS_B_TO_FSB(mp, offset + written);
1069 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1070
1071 /*
1072 * Trim back delalloc blocks if we didn't manage to write the whole
1073 * range reserved.
1074 *
1075 * We don't need to care about racing delalloc as we hold i_mutex
1076 * across the reserve/allocate/unreserve calls. If there are delalloc
1077 * blocks in the range, they are ours.
1078 */
1079 if (start_fsb < end_fsb) {
1080 xfs_ilock(ip, XFS_ILOCK_EXCL);
1081 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1082 end_fsb - start_fsb);
1083 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1084
1085 if (error && !XFS_FORCED_SHUTDOWN(mp)) {
1086 xfs_alert(mp, "%s: unable to clean up ino %lld",
1087 __func__, ip->i_ino);
1088 return error;
1089 }
1090 }
1091
1092 return 0;
1093}
1094
1095static int
1096xfs_file_iomap_end(
1097 struct inode *inode,
1098 loff_t offset,
1099 loff_t length,
1100 ssize_t written,
1101 unsigned flags,
1102 struct iomap *iomap)
1103{
1104 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
1105 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
1106 length, written);
1107 return 0;
1108}
1109
1110struct iomap_ops xfs_iomap_ops = {
1111 .iomap_begin = xfs_file_iomap_begin,
1112 .iomap_end = xfs_file_iomap_end,
1113};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e663d744..e066d045e2ff 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#include <linux/iomap.h>
22
21struct xfs_inode; 23struct xfs_inode;
22struct xfs_bmbt_irec; 24struct xfs_bmbt_irec;
23 25
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
29 struct xfs_bmbt_irec *); 31 struct xfs_bmbt_irec *);
30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); 32int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
31 33
34void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
35 struct xfs_bmbt_irec *);
36
37extern struct iomap_ops xfs_iomap_ops;
38
32#endif /* __XFS_IOMAP_H__*/ 39#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5d4eba6972e..ab820f84ed50 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,12 +38,13 @@
38#include "xfs_dir2.h" 38#include "xfs_dir2.h"
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_pnfs.h" 40#include "xfs_pnfs.h"
41#include "xfs_iomap.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
44#include <linux/posix_acl.h> 45#include <linux/posix_acl.h>
45#include <linux/security.h> 46#include <linux/security.h>
46#include <linux/fiemap.h> 47#include <linux/iomap.h>
47#include <linux/slab.h> 48#include <linux/slab.h>
48 49
49/* 50/*
@@ -801,20 +802,30 @@ xfs_setattr_size(
801 return error; 802 return error;
802 803
803 /* 804 /*
805 * Wait for all direct I/O to complete.
806 */
807 inode_dio_wait(inode);
808
809 /*
804 * File data changes must be complete before we start the transaction to 810 * File data changes must be complete before we start the transaction to
805 * modify the inode. This needs to be done before joining the inode to 811 * modify the inode. This needs to be done before joining the inode to
806 * the transaction because the inode cannot be unlocked once it is a 812 * the transaction because the inode cannot be unlocked once it is a
807 * part of the transaction. 813 * part of the transaction.
808 * 814 *
809 * Start with zeroing any data block beyond EOF that we may expose on 815 * Start with zeroing any data beyond EOF that we may expose on file
810 * file extension. 816 * extension, or zeroing out the rest of the block on a downward
817 * truncate.
811 */ 818 */
812 if (newsize > oldsize) { 819 if (newsize > oldsize) {
813 error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); 820 error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
814 if (error) 821 } else {
815 return error; 822 error = iomap_truncate_page(inode, newsize, &did_zeroing,
823 &xfs_iomap_ops);
816 } 824 }
817 825
826 if (error)
827 return error;
828
818 /* 829 /*
819 * We are going to log the inode size change in this transaction so 830 * We are going to log the inode size change in this transaction so
820 * any previous writes that are beyond the on disk EOF and the new 831 * any previous writes that are beyond the on disk EOF and the new
@@ -823,17 +834,14 @@ xfs_setattr_size(
823 * problem. Note that this includes any block zeroing we did above; 834 * problem. Note that this includes any block zeroing we did above;
824 * otherwise those blocks may not be zeroed after a crash. 835 * otherwise those blocks may not be zeroed after a crash.
825 */ 836 */
826 if (newsize > ip->i_d.di_size && 837 if (did_zeroing ||
827 (oldsize != ip->i_d.di_size || did_zeroing)) { 838 (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
828 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 839 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
829 ip->i_d.di_size, newsize); 840 ip->i_d.di_size, newsize);
830 if (error) 841 if (error)
831 return error; 842 return error;
832 } 843 }
833 844
834 /* Now wait for all direct I/O to complete. */
835 inode_dio_wait(inode);
836
837 /* 845 /*
838 * We've already locked out new page faults, so now we can safely remove 846 * We've already locked out new page faults, so now we can safely remove
839 * pages from the page cache knowing they won't get refaulted until we 847 * pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +859,6 @@ xfs_setattr_size(
851 * to hope that the caller sees ENOMEM and retries the truncate 859 * to hope that the caller sees ENOMEM and retries the truncate
852 * operation. 860 * operation.
853 */ 861 */
854 if (IS_DAX(inode))
855 error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
856 else
857 error = block_truncate_page(inode->i_mapping, newsize,
858 xfs_get_blocks);
859 if (error)
860 return error;
861 truncate_setsize(inode, newsize); 862 truncate_setsize(inode, newsize);
862 863
863 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); 864 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@@ -998,51 +999,6 @@ xfs_vn_update_time(
998 return xfs_trans_commit(tp); 999 return xfs_trans_commit(tp);
999} 1000}
1000 1001
1001#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
1002
1003/*
1004 * Call fiemap helper to fill in user data.
1005 * Returns positive errors to xfs_getbmap.
1006 */
1007STATIC int
1008xfs_fiemap_format(
1009 void **arg,
1010 struct getbmapx *bmv,
1011 int *full)
1012{
1013 int error;
1014 struct fiemap_extent_info *fieinfo = *arg;
1015 u32 fiemap_flags = 0;
1016 u64 logical, physical, length;
1017
1018 /* Do nothing for a hole */
1019 if (bmv->bmv_block == -1LL)
1020 return 0;
1021
1022 logical = BBTOB(bmv->bmv_offset);
1023 physical = BBTOB(bmv->bmv_block);
1024 length = BBTOB(bmv->bmv_length);
1025
1026 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
1027 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
1028 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
1029 fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
1030 FIEMAP_EXTENT_UNKNOWN);
1031 physical = 0; /* no block yet */
1032 }
1033 if (bmv->bmv_oflags & BMV_OF_LAST)
1034 fiemap_flags |= FIEMAP_EXTENT_LAST;
1035
1036 error = fiemap_fill_next_extent(fieinfo, logical, physical,
1037 length, fiemap_flags);
1038 if (error > 0) {
1039 error = 0;
1040 *full = 1; /* user array now full */
1041 }
1042
1043 return error;
1044}
1045
1046STATIC int 1002STATIC int
1047xfs_vn_fiemap( 1003xfs_vn_fiemap(
1048 struct inode *inode, 1004 struct inode *inode,
@@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
1050 u64 start, 1006 u64 start,
1051 u64 length) 1007 u64 length)
1052{ 1008{
1053 xfs_inode_t *ip = XFS_I(inode);
1054 struct getbmapx bm;
1055 int error; 1009 int error;
1056 1010
1057 error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); 1011 xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
1058 if (error) 1012 error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
1059 return error; 1013 xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
1060
1061 /* Set up bmap header for xfs internal routine */
1062 bm.bmv_offset = BTOBBT(start);
1063 /* Special case for whole file */
1064 if (length == FIEMAP_MAX_OFFSET)
1065 bm.bmv_length = -1LL;
1066 else
1067 bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
1068
1069 /* We add one because in getbmap world count includes the header */
1070 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
1071 fieinfo->fi_extents_max + 1;
1072 bm.bmv_count = min_t(__s32, bm.bmv_count,
1073 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
1074 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
1075 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
1076 bm.bmv_iflags |= BMV_IF_ATTRFORK;
1077 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
1078 bm.bmv_iflags |= BMV_IF_DELALLOC;
1079
1080 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
1081 if (error)
1082 return error;
1083 1014
1084 return 0; 1015 return error;
1085} 1016}
1086 1017
1087STATIC int 1018STATIC int
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index a8192dc797dc..b8d64d520e12 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
328 return x; 328 return x;
329} 329}
330 330
331/* ARM old ABI has some weird alignment/padding */
332#if defined(__arm__) && !defined(__ARM_EABI__)
333#define __arch_pack __attribute__((packed))
334#else
335#define __arch_pack
336#endif
337
338#define ASSERT_ALWAYS(expr) \ 331#define ASSERT_ALWAYS(expr) \
339 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) 332 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
340 333
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bde02f1fba73..3b74fa011bb1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -788,7 +788,7 @@ xfs_log_mount_cancel(
788 * As far as I know, there weren't any dependencies on the old behaviour. 788 * As far as I know, there weren't any dependencies on the old behaviour.
789 */ 789 */
790 790
791int 791static int
792xfs_log_unmount_write(xfs_mount_t *mp) 792xfs_log_unmount_write(xfs_mount_t *mp)
793{ 793{
794 struct xlog *log = mp->m_log; 794 struct xlog *log = mp->m_log;
@@ -1036,7 +1036,7 @@ xfs_log_space_wake(
1036 * there's no point in running a dummy transaction at this point because we 1036 * there's no point in running a dummy transaction at this point because we
1037 * can't start trying to idle the log until both the CIL and AIL are empty. 1037 * can't start trying to idle the log until both the CIL and AIL are empty.
1038 */ 1038 */
1039int 1039static int
1040xfs_log_need_covered(xfs_mount_t *mp) 1040xfs_log_need_covered(xfs_mount_t *mp)
1041{ 1041{
1042 struct xlog *log = mp->m_log; 1042 struct xlog *log = mp->m_log;
@@ -1177,7 +1177,7 @@ xlog_space_left(
1177 * The log manager needs its own routine, in order to control what 1177 * The log manager needs its own routine, in order to control what
1178 * happens with the buffer after the write completes. 1178 * happens with the buffer after the write completes.
1179 */ 1179 */
1180void 1180static void
1181xlog_iodone(xfs_buf_t *bp) 1181xlog_iodone(xfs_buf_t *bp)
1182{ 1182{
1183 struct xlog_in_core *iclog = bp->b_fspriv; 1183 struct xlog_in_core *iclog = bp->b_fspriv;
@@ -1302,7 +1302,7 @@ xfs_log_work_queue(
1302 * disk. If there is nothing dirty, then we might need to cover the log to 1302 * disk. If there is nothing dirty, then we might need to cover the log to
1303 * indicate that the filesystem is idle. 1303 * indicate that the filesystem is idle.
1304 */ 1304 */
1305void 1305static void
1306xfs_log_worker( 1306xfs_log_worker(
1307 struct work_struct *work) 1307 struct work_struct *work)
1308{ 1308{
@@ -1415,7 +1415,7 @@ xlog_alloc_log(
1415 */ 1415 */
1416 error = -ENOMEM; 1416 error = -ENOMEM;
1417 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, 1417 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
1418 BTOBB(log->l_iclog_size), 0); 1418 BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
1419 if (!bp) 1419 if (!bp)
1420 goto out_free_log; 1420 goto out_free_log;
1421 1421
@@ -1454,7 +1454,8 @@ xlog_alloc_log(
1454 prev_iclog = iclog; 1454 prev_iclog = iclog;
1455 1455
1456 bp = xfs_buf_get_uncached(mp->m_logdev_targp, 1456 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1457 BTOBB(log->l_iclog_size), 0); 1457 BTOBB(log->l_iclog_size),
1458 XBF_NO_IOACCT);
1458 if (!bp) 1459 if (!bp)
1459 goto out_free_iclog; 1460 goto out_free_iclog;
1460 1461
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 80ba0c047090..b5e71072fde5 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp,
163 __uint8_t clientid, 163 __uint8_t clientid,
164 bool permanent); 164 bool permanent);
165int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); 165int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
166int xfs_log_unmount_write(struct xfs_mount *mp);
167void xfs_log_unmount(struct xfs_mount *mp); 166void xfs_log_unmount(struct xfs_mount *mp);
168int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 167int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
169int xfs_log_need_covered(struct xfs_mount *mp);
170
171void xlog_iodone(struct xfs_buf *);
172 168
173struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 169struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
174void xfs_log_ticket_put(struct xlog_ticket *ticket); 170void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
178bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 174bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
179 175
180void xfs_log_work_queue(struct xfs_mount *mp); 176void xfs_log_work_queue(struct xfs_mount *mp);
181void xfs_log_worker(struct work_struct *work);
182void xfs_log_quiesce(struct xfs_mount *mp); 177void xfs_log_quiesce(struct xfs_mount *mp);
183bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); 178bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
184 179
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5e54e7955ea6..a4ab192e1792 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
78 log->l_cilp->xc_ctx->sequence = 1; 78 log->l_cilp->xc_ctx->sequence = 1;
79} 79}
80 80
81static inline int
82xlog_cil_iovec_space(
83 uint niovecs)
84{
85 return round_up((sizeof(struct xfs_log_vec) +
86 niovecs * sizeof(struct xfs_log_iovec)),
87 sizeof(uint64_t));
88}
89
90/*
91 * Allocate or pin log vector buffers for CIL insertion.
92 *
93 * The CIL currently uses disposable buffers for copying a snapshot of the
94 * modified items into the log during a push. The biggest problem with this is
95 * the requirement to allocate the disposable buffer during the commit if:
96 * a) does not exist; or
97 * b) it is too small
98 *
99 * If we do this allocation within xlog_cil_insert_format_items(), it is done
100 * under the xc_ctx_lock, which means that a CIL push cannot occur during
101 * the memory allocation. This means that we have a potential deadlock situation
102 * under low memory conditions when we have lots of dirty metadata pinned in
103 * the CIL and we need a CIL commit to occur to free memory.
104 *
105 * To avoid this, we need to move the memory allocation outside the
106 * xc_ctx_lock, but because the log vector buffers are disposable, that opens
107 * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
108 * vector buffers between the check and the formatting of the item into the
109 * log vector buffer within the xc_ctx_lock.
110 *
111 * Because the log vector buffer needs to be unchanged during the CIL push
112 * process, we cannot share the buffer between the transaction commit (which
113 * modifies the buffer) and the CIL push context that is writing the changes
114 * into the log. This means skipping preallocation of buffer space is
115 * unreliable, but we most definitely do not want to be allocating and freeing
116 * buffers unnecessarily during commits when overwrites can be done safely.
117 *
118 * The simplest solution to this problem is to allocate a shadow buffer when a
119 * log item is committed for the second time, and then to only use this buffer
120 * if necessary. The buffer can remain attached to the log item until such time
121 * it is needed, and this is the buffer that is reallocated to match the size of
122 * the incoming modification. Then during the formatting of the item we can swap
123 * the active buffer with the new one if we can't reuse the existing buffer. We
124 * don't free the old buffer as it may be reused on the next modification if
125 * it's size is right, otherwise we'll free and reallocate it at that point.
126 *
127 * This function builds a vector for the changes in each log item in the
128 * transaction. It then works out the length of the buffer needed for each log
129 * item, allocates them and attaches the vector to the log item in preparation
130 * for the formatting step which occurs under the xc_ctx_lock.
131 *
132 * While this means the memory footprint goes up, it avoids the repeated
133 * alloc/free pattern that repeated modifications of an item would otherwise
134 * cause, and hence minimises the CPU overhead of such behaviour.
135 */
136static void
137xlog_cil_alloc_shadow_bufs(
138 struct xlog *log,
139 struct xfs_trans *tp)
140{
141 struct xfs_log_item_desc *lidp;
142
143 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
144 struct xfs_log_item *lip = lidp->lid_item;
145 struct xfs_log_vec *lv;
146 int niovecs = 0;
147 int nbytes = 0;
148 int buf_size;
149 bool ordered = false;
150
151 /* Skip items which aren't dirty in this transaction. */
152 if (!(lidp->lid_flags & XFS_LID_DIRTY))
153 continue;
154
155 /* get number of vecs and size of data to be stored */
156 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
157
158 /*
159 * Ordered items need to be tracked but we do not wish to write
160 * them. We need a logvec to track the object, but we do not
161 * need an iovec or buffer to be allocated for copying data.
162 */
163 if (niovecs == XFS_LOG_VEC_ORDERED) {
164 ordered = true;
165 niovecs = 0;
166 nbytes = 0;
167 }
168
169 /*
170 * We 64-bit align the length of each iovec so that the start
171 * of the next one is naturally aligned. We'll need to
172 * account for that slack space here. Then round nbytes up
173 * to 64-bit alignment so that the initial buffer alignment is
174 * easy to calculate and verify.
175 */
176 nbytes += niovecs * sizeof(uint64_t);
177 nbytes = round_up(nbytes, sizeof(uint64_t));
178
179 /*
180 * The data buffer needs to start 64-bit aligned, so round up
181 * that space to ensure we can align it appropriately and not
182 * overrun the buffer.
183 */
184 buf_size = nbytes + xlog_cil_iovec_space(niovecs);
185
186 /*
187 * if we have no shadow buffer, or it is too small, we need to
188 * reallocate it.
189 */
190 if (!lip->li_lv_shadow ||
191 buf_size > lip->li_lv_shadow->lv_size) {
192
193 /*
194 * We free and allocate here as a realloc would copy
195 * unecessary data. We don't use kmem_zalloc() for the
196 * same reason - we don't need to zero the data area in
197 * the buffer, only the log vector header and the iovec
198 * storage.
199 */
200 kmem_free(lip->li_lv_shadow);
201
202 lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
203 memset(lv, 0, xlog_cil_iovec_space(niovecs));
204
205 lv->lv_item = lip;
206 lv->lv_size = buf_size;
207 if (ordered)
208 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
209 else
210 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
211 lip->li_lv_shadow = lv;
212 } else {
213 /* same or smaller, optimise common overwrite case */
214 lv = lip->li_lv_shadow;
215 if (ordered)
216 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
217 else
218 lv->lv_buf_len = 0;
219 lv->lv_bytes = 0;
220 lv->lv_next = NULL;
221 }
222
223 /* Ensure the lv is set up according to ->iop_size */
224 lv->lv_niovecs = niovecs;
225
226 /* The allocated data region lies beyond the iovec region */
227 lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
228 }
229
230}
231
81/* 232/*
82 * Prepare the log item for insertion into the CIL. Calculate the difference in 233 * Prepare the log item for insertion into the CIL. Calculate the difference in
83 * log space and vectors it will consume, and if it is a new item pin it as 234 * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
100 /* 251 /*
101 * If there is no old LV, this is the first time we've seen the item in 252 * If there is no old LV, this is the first time we've seen the item in
102 * this CIL context and so we need to pin it. If we are replacing the 253 * this CIL context and so we need to pin it. If we are replacing the
103 * old_lv, then remove the space it accounts for and free it. 254 * old_lv, then remove the space it accounts for and make it the shadow
255 * buffer for later freeing. In both cases we are now switching to the
 256	 * shadow buffer, so update the pointer to it appropriately.
104 */ 257 */
105 if (!old_lv) 258 if (!old_lv) {
106 lv->lv_item->li_ops->iop_pin(lv->lv_item); 259 lv->lv_item->li_ops->iop_pin(lv->lv_item);
107 else if (old_lv != lv) { 260 lv->lv_item->li_lv_shadow = NULL;
261 } else if (old_lv != lv) {
108 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); 262 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
109 263
110 *diff_len -= old_lv->lv_bytes; 264 *diff_len -= old_lv->lv_bytes;
111 *diff_iovecs -= old_lv->lv_niovecs; 265 *diff_iovecs -= old_lv->lv_niovecs;
112 kmem_free(old_lv); 266 lv->lv_item->li_lv_shadow = old_lv;
113 } 267 }
114 268
115 /* attach new log vector to log item */ 269 /* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
133 * write it out asynchronously without needing to relock the object that was 287 * write it out asynchronously without needing to relock the object that was
134 * modified at the time it gets written into the iclog. 288 * modified at the time it gets written into the iclog.
135 * 289 *
136 * This function builds a vector for the changes in each log item in the 290 * This function takes the prepared log vectors attached to each log item, and
137 * transaction. It then works out the length of the buffer needed for each log 291 * formats the changes into the log vector buffer. The buffer it uses is
138 * item, allocates them and formats the vector for the item into the buffer. 292 * dependent on the current state of the vector in the CIL - the shadow lv is
139 * The buffer is then attached to the log item are then inserted into the 293 * guaranteed to be large enough for the current modification, but we will only
140 * Committed Item List for tracking until the next checkpoint is written out. 294 * use that if we can't reuse the existing lv. If we can't reuse the existing
 295 * lv, then simply swap it out for the shadow lv. We don't free it - that is
 296 * done lazily either by the next modification or the freeing of the log item.
141 * 297 *
142 * We don't set up region headers during this process; we simply copy the 298 * We don't set up region headers during this process; we simply copy the
143 * regions into the flat buffer. We can do this because we still have to do a 299 * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
170 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 326 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
171 struct xfs_log_item *lip = lidp->lid_item; 327 struct xfs_log_item *lip = lidp->lid_item;
172 struct xfs_log_vec *lv; 328 struct xfs_log_vec *lv;
173 struct xfs_log_vec *old_lv; 329 struct xfs_log_vec *old_lv = NULL;
174 int niovecs = 0; 330 struct xfs_log_vec *shadow;
175 int nbytes = 0;
176 int buf_size;
177 bool ordered = false; 331 bool ordered = false;
178 332
179 /* Skip items which aren't dirty in this transaction. */ 333 /* Skip items which aren't dirty in this transaction. */
180 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 334 if (!(lidp->lid_flags & XFS_LID_DIRTY))
181 continue; 335 continue;
182 336
183 /* get number of vecs and size of data to be stored */
184 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
185
186 /* Skip items that do not have any vectors for writing */
187 if (!niovecs)
188 continue;
189
190 /* 337 /*
191 * Ordered items need to be tracked but we do not wish to write 338 * The formatting size information is already attached to
192 * them. We need a logvec to track the object, but we do not 339 * the shadow lv on the log item.
193 * need an iovec or buffer to be allocated for copying data.
194 */ 340 */
195 if (niovecs == XFS_LOG_VEC_ORDERED) { 341 shadow = lip->li_lv_shadow;
342 if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
196 ordered = true; 343 ordered = true;
197 niovecs = 0;
198 nbytes = 0;
199 }
200 344
201 /* 345 /* Skip items that do not have any vectors for writing */
202 * We 64-bit align the length of each iovec so that the start 346 if (!shadow->lv_niovecs && !ordered)
203 * of the next one is naturally aligned. We'll need to 347 continue;
204 * account for that slack space here. Then round nbytes up
205 * to 64-bit alignment so that the initial buffer alignment is
206 * easy to calculate and verify.
207 */
208 nbytes += niovecs * sizeof(uint64_t);
209 nbytes = round_up(nbytes, sizeof(uint64_t));
210
211 /* grab the old item if it exists for reservation accounting */
212 old_lv = lip->li_lv;
213
214 /*
215 * The data buffer needs to start 64-bit aligned, so round up
216 * that space to ensure we can align it appropriately and not
217 * overrun the buffer.
218 */
219 buf_size = nbytes +
220 round_up((sizeof(struct xfs_log_vec) +
221 niovecs * sizeof(struct xfs_log_iovec)),
222 sizeof(uint64_t));
223 348
224 /* compare to existing item size */ 349 /* compare to existing item size */
225 if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { 350 old_lv = lip->li_lv;
351 if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
226 /* same or smaller, optimise common overwrite case */ 352 /* same or smaller, optimise common overwrite case */
227 lv = lip->li_lv; 353 lv = lip->li_lv;
228 lv->lv_next = NULL; 354 lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
236 */ 362 */
237 *diff_iovecs -= lv->lv_niovecs; 363 *diff_iovecs -= lv->lv_niovecs;
238 *diff_len -= lv->lv_bytes; 364 *diff_len -= lv->lv_bytes;
365
366 /* Ensure the lv is set up according to ->iop_size */
367 lv->lv_niovecs = shadow->lv_niovecs;
368
369 /* reset the lv buffer information for new formatting */
370 lv->lv_buf_len = 0;
371 lv->lv_bytes = 0;
372 lv->lv_buf = (char *)lv +
373 xlog_cil_iovec_space(lv->lv_niovecs);
239 } else { 374 } else {
240 /* allocate new data chunk */ 375 /* switch to shadow buffer! */
241 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); 376 lv = shadow;
242 lv->lv_item = lip; 377 lv->lv_item = lip;
243 lv->lv_size = buf_size;
244 if (ordered) { 378 if (ordered) {
245 /* track as an ordered logvec */ 379 /* track as an ordered logvec */
246 ASSERT(lip->li_lv == NULL); 380 ASSERT(lip->li_lv == NULL);
247 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
248 goto insert; 381 goto insert;
249 } 382 }
250 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
251 } 383 }
252 384
253 /* Ensure the lv is set up according to ->iop_size */
254 lv->lv_niovecs = niovecs;
255
256 /* The allocated data region lies beyond the iovec region */
257 lv->lv_buf_len = 0;
258 lv->lv_bytes = 0;
259 lv->lv_buf = (char *)lv + buf_size - nbytes;
260 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); 385 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
261
262 lip->li_ops->iop_format(lip, lv); 386 lip->li_ops->iop_format(lip, lv);
263insert: 387insert:
264 ASSERT(lv->lv_buf_len <= nbytes);
265 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); 388 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
266 } 389 }
267} 390}
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
783 struct xlog *log = mp->m_log; 906 struct xlog *log = mp->m_log;
784 struct xfs_cil *cil = log->l_cilp; 907 struct xfs_cil *cil = log->l_cilp;
785 908
909 /*
910 * Do all necessary memory allocation before we lock the CIL.
911 * This ensures the allocation does not deadlock with a CIL
912 * push in memory reclaim (e.g. from kswapd).
913 */
914 xlog_cil_alloc_shadow_bufs(log, tp);
915
786 /* lock out background commit */ 916 /* lock out background commit */
787 down_read(&cil->xc_ctx_lock); 917 down_read(&cil->xc_ctx_lock);
788 918
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e39b02351b4a..970c19ba2f56 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -272,13 +272,15 @@ xfs_readsb(
272 buf_ops = NULL; 272 buf_ops = NULL;
273 273
274 /* 274 /*
275 * Allocate a (locked) buffer to hold the superblock. 275 * Allocate a (locked) buffer to hold the superblock. This will be kept
276 * This will be kept around at all times to optimize 276 * around at all times to optimize access to the superblock. Therefore,
277 * access to the superblock. 277 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
278 * elevated.
278 */ 279 */
279reread: 280reread:
280 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 281 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
281 BTOBB(sector_size), 0, &bp, buf_ops); 282 BTOBB(sector_size), XBF_NO_IOACCT, &bp,
283 buf_ops);
282 if (error) { 284 if (error) {
283 if (loud) 285 if (loud)
284 xfs_warn(mp, "SB validate failed with error %d.", error); 286 xfs_warn(mp, "SB validate failed with error %d.", error);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 184c44effdd5..0cc8d8f74356 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -22,6 +22,11 @@
22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ 22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
23 #structname ") is wrong, expected " #size) 23 #structname ") is wrong, expected " #size)
24 24
25#define XFS_CHECK_OFFSET(structname, member, off) \
26 BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
27 "XFS: offsetof(" #structname ", " #member ") is wrong, " \
28 "expected " #off)
29
25static inline void __init 30static inline void __init
26xfs_check_ondisk_structs(void) 31xfs_check_ondisk_structs(void)
27{ 32{
@@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
34 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); 39 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
35 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); 40 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
36 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); 41 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
42 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
43 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
37 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); 44 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
38 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); 45 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
39 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); 46 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
@@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void)
75 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); 82 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
76 */ 83 */
77 84
85 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
86 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
87 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
88 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
89 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
90 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
91 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
78 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); 92 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
79 XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); 93 XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
94 XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
95 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
96 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
97 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
98 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
80 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); 99 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
81 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); 100 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
82 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); 101 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
83 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); 102 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
84 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); 103 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
85 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); 104 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
86 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); 105 XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
106 XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
87 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); 107 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
88 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); 108 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
89 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
90 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
91 XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
92 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); 109 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
93 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); 110 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
94 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); 111 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
95 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); 112 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
96 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); 113 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
114 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
115 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
116 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
97 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); 117 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
98 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
99 118
100 /* log structures */ 119 /* log structures */
101 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); 120 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index d5b756669fb5..0f14b2e4bf6c 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014 Christoph Hellwig.
3 */ 3 */
4#include <linux/iomap.h>
4#include "xfs.h" 5#include "xfs.h"
5#include "xfs_format.h" 6#include "xfs_format.h"
6#include "xfs_log_format.h" 7#include "xfs_log_format.h"
@@ -79,32 +80,6 @@ xfs_fs_get_uuid(
79 return 0; 80 return 0;
80} 81}
81 82
82static void
83xfs_bmbt_to_iomap(
84 struct xfs_inode *ip,
85 struct iomap *iomap,
86 struct xfs_bmbt_irec *imap)
87{
88 struct xfs_mount *mp = ip->i_mount;
89
90 if (imap->br_startblock == HOLESTARTBLOCK) {
91 iomap->blkno = IOMAP_NULL_BLOCK;
92 iomap->type = IOMAP_HOLE;
93 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
94 iomap->blkno = IOMAP_NULL_BLOCK;
95 iomap->type = IOMAP_DELALLOC;
96 } else {
97 iomap->blkno =
98 XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
99 if (imap->br_state == XFS_EXT_UNWRITTEN)
100 iomap->type = IOMAP_UNWRITTEN;
101 else
102 iomap->type = IOMAP_MAPPED;
103 }
104 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
105 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
106}
107
108/* 83/*
109 * Get a layout for the pNFS client. 84 * Get a layout for the pNFS client.
110 */ 85 */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 76c0a4a9bb17..355dd9e1cb64 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -98,8 +98,6 @@ xfs_growfs_rt(
98/* 98/*
99 * From xfs_rtbitmap.c 99 * From xfs_rtbitmap.c
100 */ 100 */
101int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
102 xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
103int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, 101int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
104 xfs_rtblock_t start, xfs_extlen_t len, int val, 102 xfs_rtblock_t start, xfs_extlen_t len, int val,
105 xfs_rtblock_t *new, int *stat); 103 xfs_rtblock_t *new, int *stat);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 11ea5d51db56..0303f1005f88 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -546,7 +546,7 @@ xfs_showargs(
546 546
547 return 0; 547 return 0;
548} 548}
549__uint64_t 549static __uint64_t
550xfs_max_file_offset( 550xfs_max_file_offset(
551 unsigned int blockshift) 551 unsigned int blockshift)
552{ 552{
@@ -1294,6 +1294,7 @@ xfs_fs_remount(
1294 */ 1294 */
1295 xfs_restore_resvblks(mp); 1295 xfs_restore_resvblks(mp);
1296 xfs_log_work_queue(mp); 1296 xfs_log_work_queue(mp);
1297 xfs_queue_eofblocks(mp);
1297 } 1298 }
1298 1299
1299 /* rw -> ro */ 1300 /* rw -> ro */
@@ -1306,6 +1307,13 @@ xfs_fs_remount(
1306 * return it to the same size. 1307 * return it to the same size.
1307 */ 1308 */
1308 xfs_save_resvblks(mp); 1309 xfs_save_resvblks(mp);
1310
1311 /*
1312 * Cancel background eofb scanning so it cannot race with the
1313 * final log force+buftarg wait and deadlock the remount.
1314 */
1315 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1316
1309 xfs_quiesce_attr(mp); 1317 xfs_quiesce_attr(mp);
1310 mp->m_flags |= XFS_MOUNT_RDONLY; 1318 mp->m_flags |= XFS_MOUNT_RDONLY;
1311 } 1319 }
@@ -1565,10 +1573,6 @@ xfs_fs_fill_super(
1565 } 1573 }
1566 } 1574 }
1567 1575
1568 if (xfs_sb_version_hassparseinodes(&mp->m_sb))
1569 xfs_alert(mp,
1570 "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
1571
1572 error = xfs_mountfs(mp); 1576 error = xfs_mountfs(mp);
1573 if (error) 1577 if (error)
1574 goto out_filestream_unmount; 1578 goto out_filestream_unmount;
@@ -1692,8 +1696,9 @@ xfs_init_zones(void)
1692 if (!xfs_log_ticket_zone) 1696 if (!xfs_log_ticket_zone)
1693 goto out_free_ioend_bioset; 1697 goto out_free_ioend_bioset;
1694 1698
1695 xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), 1699 xfs_bmap_free_item_zone = kmem_zone_init(
1696 "xfs_bmap_free_item"); 1700 sizeof(struct xfs_bmap_free_item),
1701 "xfs_bmap_free_item");
1697 if (!xfs_bmap_free_item_zone) 1702 if (!xfs_bmap_free_item_zone)
1698 goto out_destroy_log_ticket_zone; 1703 goto out_destroy_log_ticket_zone;
1699 1704
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2dfb1ce4585f..529bce9fc37e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,8 +61,6 @@ struct xfs_mount;
61struct xfs_buftarg; 61struct xfs_buftarg;
62struct block_device; 62struct block_device;
63 63
64extern __uint64_t xfs_max_file_offset(unsigned int);
65
66extern void xfs_flush_inodes(struct xfs_mount *mp); 64extern void xfs_flush_inodes(struct xfs_mount *mp);
67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 65extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
68extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, 66extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 4c2c55086208..79cfd3fc5324 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -634,6 +634,9 @@ xfs_error_get_cfg(
634{ 634{
635 struct xfs_error_cfg *cfg; 635 struct xfs_error_cfg *cfg;
636 636
637 if (error < 0)
638 error = -error;
639
637 switch (error) { 640 switch (error) {
638 case EIO: 641 case EIO:
639 cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; 642 cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ea94ee0fe5ea..145169093fe0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
354DEFINE_BUF_EVENT(xfs_buf_bawrite); 354DEFINE_BUF_EVENT(xfs_buf_bawrite);
355DEFINE_BUF_EVENT(xfs_buf_lock); 355DEFINE_BUF_EVENT(xfs_buf_lock);
356DEFINE_BUF_EVENT(xfs_buf_lock_done); 356DEFINE_BUF_EVENT(xfs_buf_lock_done);
357DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
357DEFINE_BUF_EVENT(xfs_buf_trylock); 358DEFINE_BUF_EVENT(xfs_buf_trylock);
358DEFINE_BUF_EVENT(xfs_buf_unlock); 359DEFINE_BUF_EVENT(xfs_buf_unlock);
359DEFINE_BUF_EVENT(xfs_buf_iowait); 360DEFINE_BUF_EVENT(xfs_buf_iowait);
@@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
1134) 1135)
1135 1136
1136DECLARE_EVENT_CLASS(xfs_file_class, 1137DECLARE_EVENT_CLASS(xfs_file_class,
1137 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), 1138 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
1138 TP_ARGS(ip, count, offset, flags), 1139 TP_ARGS(ip, count, offset),
1139 TP_STRUCT__entry( 1140 TP_STRUCT__entry(
1140 __field(dev_t, dev) 1141 __field(dev_t, dev)
1141 __field(xfs_ino_t, ino) 1142 __field(xfs_ino_t, ino)
1142 __field(xfs_fsize_t, size) 1143 __field(xfs_fsize_t, size)
1143 __field(loff_t, offset) 1144 __field(loff_t, offset)
1144 __field(size_t, count) 1145 __field(size_t, count)
1145 __field(int, flags)
1146 ), 1146 ),
1147 TP_fast_assign( 1147 TP_fast_assign(
1148 __entry->dev = VFS_I(ip)->i_sb->s_dev; 1148 __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
1150 __entry->size = ip->i_d.di_size; 1150 __entry->size = ip->i_d.di_size;
1151 __entry->offset = offset; 1151 __entry->offset = offset;
1152 __entry->count = count; 1152 __entry->count = count;
1153 __entry->flags = flags;
1154 ), 1153 ),
1155 TP_printk("dev %d:%d ino 0x%llx size 0x%llx " 1154 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
1156 "offset 0x%llx count 0x%zx ioflags %s",
1157 MAJOR(__entry->dev), MINOR(__entry->dev), 1155 MAJOR(__entry->dev), MINOR(__entry->dev),
1158 __entry->ino, 1156 __entry->ino,
1159 __entry->size, 1157 __entry->size,
1160 __entry->offset, 1158 __entry->offset,
1161 __entry->count, 1159 __entry->count)
1162 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
1163) 1160)
1164 1161
1165#define DEFINE_RW_EVENT(name) \ 1162#define DEFINE_RW_EVENT(name) \
1166DEFINE_EVENT(xfs_file_class, name, \ 1163DEFINE_EVENT(xfs_file_class, name, \
1167 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ 1164 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
1168 TP_ARGS(ip, count, offset, flags)) 1165 TP_ARGS(ip, count, offset))
1169DEFINE_RW_EVENT(xfs_file_read); 1166DEFINE_RW_EVENT(xfs_file_buffered_read);
1167DEFINE_RW_EVENT(xfs_file_direct_read);
1168DEFINE_RW_EVENT(xfs_file_dax_read);
1170DEFINE_RW_EVENT(xfs_file_buffered_write); 1169DEFINE_RW_EVENT(xfs_file_buffered_write);
1171DEFINE_RW_EVENT(xfs_file_direct_write); 1170DEFINE_RW_EVENT(xfs_file_direct_write);
1171DEFINE_RW_EVENT(xfs_file_dax_write);
1172DEFINE_RW_EVENT(xfs_file_splice_read); 1172DEFINE_RW_EVENT(xfs_file_splice_read);
1173 1173
1174DECLARE_EVENT_CLASS(xfs_page_class, 1174DECLARE_EVENT_CLASS(xfs_page_class,
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
1295DEFINE_IOMAP_EVENT(xfs_get_blocks_found); 1295DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
1296DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); 1296DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
1297DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); 1297DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
1298DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
1299DEFINE_IOMAP_EVENT(xfs_iomap_found);
1300DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
1298 1301
1299DECLARE_EVENT_CLASS(xfs_simple_io_class, 1302DECLARE_EVENT_CLASS(xfs_simple_io_class,
1300 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1303 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9a462e892e4f..9b2b9fa89331 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
52 /* delayed logging */ 52 /* delayed logging */
53 struct list_head li_cil; /* CIL pointers */ 53 struct list_head li_cil; /* CIL pointers */
54 struct xfs_log_vec *li_lv; /* active log vector */ 54 struct xfs_log_vec *li_lv; /* active log vector */
55 struct xfs_log_vec *li_lv_shadow; /* standby vector */
55 xfs_lsn_t li_seq; /* CIL commit seq */ 56 xfs_lsn_t li_seq; /* CIL commit seq */
56} xfs_log_item_t; 57} xfs_log_item_t;
57 58
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index d8414502edb4..b03c0625fa6e 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -6,6 +6,7 @@
6struct dentry; 6struct dentry;
7struct iattr; 7struct iattr;
8struct inode; 8struct inode;
9struct iomap;
9struct super_block; 10struct super_block;
10struct vfsmount; 11struct vfsmount;
11 12
@@ -187,21 +188,6 @@ struct fid {
187 * get_name is not (which is possibly inconsistent) 188 * get_name is not (which is possibly inconsistent)
188 */ 189 */
189 190
190/* types of block ranges for multipage write mappings. */
191#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
192#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
193#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
194#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
195
196#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
197
198struct iomap {
199 sector_t blkno; /* first sector of mapping */
200 loff_t offset; /* file offset of mapping, bytes */
201 u64 length; /* length of mapping, bytes */
202 int type; /* type of mapping */
203};
204
205struct export_operations { 191struct export_operations {
206 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, 192 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
207 struct inode *parent); 193 struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 000000000000..3267df461012
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,70 @@
1#ifndef LINUX_IOMAP_H
2#define LINUX_IOMAP_H 1
3
4#include <linux/types.h>
5
6struct fiemap_extent_info;
7struct inode;
8struct iov_iter;
9struct kiocb;
10struct vm_area_struct;
11struct vm_fault;
12
13/*
14 * Types of block ranges for iomap mappings:
15 */
16#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
17#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
18#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
19#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
20
21/*
22 * Magic value for blkno:
23 */
24#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
25
26struct iomap {
27 sector_t blkno; /* 1st sector of mapping, 512b units */
28 loff_t offset; /* file offset of mapping, bytes */
29 u64 length; /* length of mapping, bytes */
30 int type; /* type of mapping */
31 struct block_device *bdev; /* block device for I/O */
32};
33
34/*
35 * Flags for iomap_begin / iomap_end. No flag implies a read.
36 */
37#define IOMAP_WRITE (1 << 0)
38#define IOMAP_ZERO (1 << 1)
39
40struct iomap_ops {
41 /*
42 * Return the existing mapping at pos, or reserve space starting at
43 * pos for up to length, as long as we can do it as a single mapping.
44 * The actual length is returned in iomap->length.
45 */
46 int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
47 unsigned flags, struct iomap *iomap);
48
49 /*
50	 * Commit and/or unreserve space previously allocated using iomap_begin.
51	 * Written indicates the length of the successful write operation which
52	 * needs to be committed, while the rest needs to be unreserved.
53 * Written might be zero if no data was written.
54 */
55 int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
56 ssize_t written, unsigned flags, struct iomap *iomap);
57};
58
59ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
60 struct iomap_ops *ops);
61int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
62 bool *did_zero, struct iomap_ops *ops);
63int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
64 struct iomap_ops *ops);
65int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
66 struct iomap_ops *ops);
67int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
68 loff_t start, loff_t len, struct iomap_ops *ops);
69
70#endif /* LINUX_IOMAP_H */