author     Linus Torvalds <torvalds@linux-foundation.org>  2016-07-27 12:53:35 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2016-07-27 12:53:35 -0400
commit     0e6acf0204da5b8705722a5f6806a4f55ed379d6
tree       4a8a9bf9daba9c734a0fdde417ae1cb472ca396d
parent     0e06f5c0deeef0332a5da2ecb8f1fcf3e024d958
parent     f2bdfda9a1c668539bc85baf5625f6f14bc510b1
Merge tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs
Pull xfs updates from Dave Chinner:
 "The major addition is the new iomap based block mapping infrastructure.
  We've been kicking this about locally for years, but there are other
  filesystems that want to use it too (e.g. gfs2). Now it is fully
  working, reviewed and ready to be merged and used by other filesystems.

  There are a lot of other fixes and cleanups in the tree, but those are
  XFS internal things and none are of the scale or visibility of the
  iomap changes. See below for details.

  I am likely to send another pull request next week - we're just about
  ready to merge some new functionality (on disk block->owner reverse
  mapping infrastructure), but that's a huge chunk of code (74 files
  changed, 7283 insertions(+), 1114 deletions(-)) so I'm keeping that
  separate to all the "normal" pull request changes so they don't get
  lost in the noise.

  Summary of changes in this update:

   - generic iomap based IO path infrastructure
   - generic iomap based fiemap implementation
   - xfs iomap based IO path implementation
   - buffer error handling fixes
   - tracking of in flight buffer IO for unmount serialisation
   - direct IO and DAX IO path separation and simplification
   - shortform directory format definition changes for wider platform
     compatibility
   - various buffer cache fixes
   - cleanups in preparation for rmap merge
   - error injection cleanups and fixes
   - log item format buffer memory allocation restructuring to prevent
     rare OOM reclaim deadlocks
   - sparse inode chunks are now fully supported"

* tag 'xfs-for-linus-4.8-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (53 commits)
  xfs: remove EXPERIMENTAL tag from sparse inode feature
  xfs: bufferhead chains are invalid after end_page_writeback
  xfs: allocate log vector buffers outside CIL context lock
  libxfs: directory node splitting does not have an extra block
  xfs: remove dax code from object file when disabled
  xfs: skip dirty pages in ->releasepage()
  xfs: remove __arch_pack
  xfs: kill xfs_dir2_inou_t
  xfs: kill xfs_dir2_sf_off_t
  xfs: split direct I/O and DAX path
  xfs: direct calls in the direct I/O path
  xfs: stop using generic_file_read_iter for direct I/O
  xfs: split xfs_file_read_iter into buffered and direct I/O helpers
  xfs: remove s_maxbytes enforcement in xfs_file_read_iter
  xfs: kill ioflags
  xfs: don't pass ioflags around in the ioctl path
  xfs: track and serialize in-flight async buffers against unmount
  xfs: exclude never-released buffers from buftarg I/O accounting
  xfs: don't reset b_retries to 0 on every failure
  xfs: remove extraneous buffer flag changes
  ...
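At the core of the new infrastructure is the pair of callbacks that iomap_apply() (in the new fs/iomap.c below) wraps around every operation: ->iomap_begin() maps, or for writes reserves, one contiguous range and fills in a struct iomap, and ->iomap_end() lets the filesystem clean up afterwards. A rough sketch of what a filesystem supplies is shown here; the "foofs" names and its extent helpers are invented for illustration, and only the callback signatures and struct iomap fields come from the code below:

/*
 * Illustrative only: a minimal iomap_ops implementation for an imaginary
 * filesystem "foofs".  foofs_lookup_extent(), foofs_unreserve() and
 * struct foofs_extent are hypothetical helpers.
 */
static int
foofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
		unsigned flags, struct iomap *iomap)
{
	struct foofs_extent	ext;
	int			error;

	/* Map (or, for IOMAP_WRITE, reserve) one contiguous extent. */
	error = foofs_lookup_extent(inode, offset, length,
				    (flags & IOMAP_WRITE), &ext);
	if (error)
		return error;

	iomap->type = ext.is_hole ? IOMAP_HOLE :
		      ext.is_unwritten ? IOMAP_UNWRITTEN : IOMAP_MAPPED;
	iomap->blkno = ext.daddr;		/* in 512-byte units */
	iomap->offset = ext.file_offset;	/* where the mapping starts */
	iomap->length = ext.len;		/* may be shorter than asked for */
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static int
foofs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
		ssize_t written, unsigned flags, struct iomap *iomap)
{
	/* Trim back any reservation beyond what was actually written. */
	if ((flags & IOMAP_WRITE) && written < length)
		return foofs_unreserve(inode, offset + written,
				       length - written);
	return 0;
}

static struct iomap_ops foofs_iomap_ops = {
	.iomap_begin	= foofs_iomap_begin,
	.iomap_end	= foofs_iomap_end,
};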
-rw-r--r--  fs/Kconfig  3
-rw-r--r--  fs/Makefile  1
-rw-r--r--  fs/buffer.c  76
-rw-r--r--  fs/internal.h  3
-rw-r--r--  fs/iomap.c  497
-rw-r--r--  fs/nfsd/blocklayout.c  1
-rw-r--r--  fs/nfsd/blocklayoutxdr.c  1
-rw-r--r--  fs/xfs/Kconfig  1
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c  101
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h  9
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h  3
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c  51
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h  18
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c  2
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c  27
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h  2
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c  59
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.c  31
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h  43
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c  38
-rw-r--r--  fs/xfs/libxfs/xfs_format.h  66
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h  8
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c  28
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c  2
-rw-r--r--  fs/xfs/xfs_aops.c  332
-rw-r--r--  fs/xfs/xfs_aops.h  3
-rw-r--r--  fs/xfs/xfs_attr_inactive.c  2
-rw-r--r--  fs/xfs/xfs_attr_list.c  2
-rw-r--r--  fs/xfs/xfs_bmap_util.c  381
-rw-r--r--  fs/xfs/xfs_bmap_util.h  3
-rw-r--r--  fs/xfs/xfs_buf.c  236
-rw-r--r--  fs/xfs/xfs_buf.h  7
-rw-r--r--  fs/xfs/xfs_buf_item.c  31
-rw-r--r--  fs/xfs/xfs_dquot.c  1
-rw-r--r--  fs/xfs/xfs_dquot_item.c  2
-rw-r--r--  fs/xfs/xfs_error.c  5
-rw-r--r--  fs/xfs/xfs_error.h  2
-rw-r--r--  fs/xfs/xfs_extfree_item.c  2
-rw-r--r--  fs/xfs/xfs_file.c  425
-rw-r--r--  fs/xfs/xfs_fsops.c  105
-rw-r--r--  fs/xfs/xfs_icache.c  2
-rw-r--r--  fs/xfs/xfs_icache.h  1
-rw-r--r--  fs/xfs/xfs_inode.c  16
-rw-r--r--  fs/xfs/xfs_inode.h  20
-rw-r--r--  fs/xfs/xfs_inode_item.c  1
-rw-r--r--  fs/xfs/xfs_ioctl.c  27
-rw-r--r--  fs/xfs/xfs_ioctl.h  3
-rw-r--r--  fs/xfs/xfs_ioctl32.c  6
-rw-r--r--  fs/xfs/xfs_iomap.c  171
-rw-r--r--  fs/xfs/xfs_iomap.h  7
-rw-r--r--  fs/xfs/xfs_iops.c  113
-rw-r--r--  fs/xfs/xfs_linux.h  7
-rw-r--r--  fs/xfs/xfs_log.c  13
-rw-r--r--  fs/xfs/xfs_log.h  5
-rw-r--r--  fs/xfs/xfs_log_cil.c  258
-rw-r--r--  fs/xfs/xfs_mount.c  10
-rw-r--r--  fs/xfs/xfs_ondisk.h  31
-rw-r--r--  fs/xfs/xfs_pnfs.c  27
-rw-r--r--  fs/xfs/xfs_rtalloc.h  2
-rw-r--r--  fs/xfs/xfs_super.c  19
-rw-r--r--  fs/xfs/xfs_super.h  2
-rw-r--r--  fs/xfs/xfs_sysfs.c  3
-rw-r--r--  fs/xfs/xfs_trace.h  25
-rw-r--r--  fs/xfs/xfs_trans.h  1
-rw-r--r--  include/linux/exportfs.h  16
-rw-r--r--  include/linux/iomap.h  70
66 files changed, 2026 insertions, 1440 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index b8fcb416be72..4524916fa200 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -10,6 +10,9 @@ config DCACHE_WORD_ACCESS
 
 if BLOCK
 
+config FS_IOMAP
+	bool
+
 source "fs/ext2/Kconfig"
 source "fs/ext4/Kconfig"
 source "fs/jbd2/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 85b6e13b62d3..ed2b63257ba9 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -49,6 +49,7 @@ obj-$(CONFIG_COREDUMP) += coredump.o
 obj-$(CONFIG_SYSCTL)		+= drop_caches.o
 
 obj-$(CONFIG_FHANDLE)		+= fhandle.o
+obj-$(CONFIG_FS_IOMAP)		+= iomap.o
 
 obj-y				+= quota/
 
diff --git a/fs/buffer.c b/fs/buffer.c
index b9fa1be75e69..9c8eb9b6db6a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -21,6 +21,7 @@
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/mm.h>
 #include <linux/percpu.h>
 #include <linux/slab.h>
@@ -1892,8 +1893,62 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 }
 EXPORT_SYMBOL(page_zero_new_buffers);
 
-int __block_write_begin(struct page *page, loff_t pos, unsigned len,
-		get_block_t *get_block)
+static void
+iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
+		struct iomap *iomap)
+{
+	loff_t offset = block << inode->i_blkbits;
+
+	bh->b_bdev = iomap->bdev;
+
+	/*
+	 * Block points to offset in file we need to map, iomap contains
+	 * the offset at which the map starts. If the map ends before the
+	 * current block, then do not map the buffer and let the caller
+	 * handle it.
+	 */
+	BUG_ON(offset >= iomap->offset + iomap->length);
+
+	switch (iomap->type) {
+	case IOMAP_HOLE:
+		/*
+		 * If the buffer is not up to date or beyond the current EOF,
+		 * we need to mark it as new to ensure sub-block zeroing is
+		 * executed if necessary.
+		 */
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		break;
+	case IOMAP_DELALLOC:
+		if (!buffer_uptodate(bh) ||
+		    (offset >= i_size_read(inode)))
+			set_buffer_new(bh);
+		set_buffer_uptodate(bh);
+		set_buffer_mapped(bh);
+		set_buffer_delay(bh);
+		break;
+	case IOMAP_UNWRITTEN:
+		/*
+		 * For unwritten regions, we always need to ensure that
+		 * sub-block writes cause the regions in the block we are not
+		 * writing to are zeroed. Set the buffer as new to ensure this.
+		 */
+		set_buffer_new(bh);
+		set_buffer_unwritten(bh);
+		/* FALLTHRU */
+	case IOMAP_MAPPED:
+		if (offset >= i_size_read(inode))
+			set_buffer_new(bh);
+		bh->b_blocknr = (iomap->blkno >> (inode->i_blkbits - 9)) +
+				((offset - iomap->offset) >> inode->i_blkbits);
+		set_buffer_mapped(bh);
+		break;
+	}
+}
+
+int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap)
 {
 	unsigned from = pos & (PAGE_SIZE - 1);
 	unsigned to = from + len;
@@ -1929,9 +1984,14 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {
 			WARN_ON(bh->b_size != blocksize);
-			err = get_block(inode, block, bh, 1);
-			if (err)
-				break;
+			if (get_block) {
+				err = get_block(inode, block, bh, 1);
+				if (err)
+					break;
+			} else {
+				iomap_to_bh(inode, block, bh, iomap);
+			}
+
 			if (buffer_new(bh)) {
 				unmap_underlying_metadata(bh->b_bdev,
 							  bh->b_blocknr);
@@ -1972,6 +2032,12 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 		page_zero_new_buffers(page, from, to);
 	return err;
 }
+
+int __block_write_begin(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block)
+{
+	return __block_write_begin_int(page, pos, len, get_block, NULL);
+}
 EXPORT_SYMBOL(__block_write_begin);
 
 static int __block_commit_write(struct inode *inode, struct page *page,
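The b_blocknr arithmetic in iomap_to_bh() above converts iomap->blkno, which is expressed in 512-byte units, into filesystem blocks and then adds the buffer's distance from the start of the mapping. As a worked example with invented numbers: for 4096-byte blocks (i_blkbits = 12), iomap->blkno = 2048, iomap->offset = 1 MiB and a buffer at file offset 1 MiB + 8 KiB, the result is (2048 >> 3) + (8192 >> 12) = 256 + 2, i.e. block 258.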
diff --git a/fs/internal.h b/fs/internal.h
index f57ced528cde..cef0913e5d41 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct file_system_type;
+struct iomap;
 struct linux_binprm;
 struct path;
 struct mount;
@@ -39,6 +40,8 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
  * buffer.c
  */
 extern void guard_bio_eod(int rw, struct bio *bio);
+extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
+		get_block_t *get_block, struct iomap *iomap);
 
 /*
  * char_dev.c
diff --git a/fs/iomap.c b/fs/iomap.c
new file mode 100644
index 000000000000..48141b8eff5f
--- /dev/null
+++ b/fs/iomap.c
@@ -0,0 +1,497 @@
1/*
2 * Copyright (C) 2010 Red Hat, Inc.
3 * Copyright (c) 2016 Christoph Hellwig.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 */
14#include <linux/module.h>
15#include <linux/compiler.h>
16#include <linux/fs.h>
17#include <linux/iomap.h>
18#include <linux/uaccess.h>
19#include <linux/gfp.h>
20#include <linux/mm.h>
21#include <linux/swap.h>
22#include <linux/pagemap.h>
23#include <linux/file.h>
24#include <linux/uio.h>
25#include <linux/backing-dev.h>
26#include <linux/buffer_head.h>
27#include <linux/dax.h>
28#include "internal.h"
29
30typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
31 void *data, struct iomap *iomap);
32
33/*
34 * Execute a iomap write on a segment of the mapping that spans a
35 * contiguous range of pages that have identical block mapping state.
36 *
37 * This avoids the need to map pages individually, do individual allocations
38 * for each page and most importantly avoid the need for filesystem specific
39 * locking per page. Instead, all the operations are amortised over the entire
40 * range of pages. It is assumed that the filesystems will lock whatever
41 * resources they require in the iomap_begin call, and release them in the
42 * iomap_end call.
43 */
44static loff_t
45iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
46 struct iomap_ops *ops, void *data, iomap_actor_t actor)
47{
48 struct iomap iomap = { 0 };
49 loff_t written = 0, ret;
50
51 /*
52 * Need to map a range from start position for length bytes. This can
53 * span multiple pages - it is only guaranteed to return a range of a
54 * single type of pages (e.g. all into a hole, all mapped or all
55 * unwritten). Failure at this point has nothing to undo.
56 *
57 * If allocation is required for this range, reserve the space now so
58 * that the allocation is guaranteed to succeed later on. Once we copy
59 * the data into the page cache pages, then we cannot fail otherwise we
60 * expose transient stale data. If the reserve fails, we can safely
61 * back out at this point as there is nothing to undo.
62 */
63 ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
64 if (ret)
65 return ret;
66 if (WARN_ON(iomap.offset > pos))
67 return -EIO;
68
69 /*
70 * Cut down the length to the one actually provided by the filesystem,
71 * as it might not be able to give us the whole size that we requested.
72 */
73 if (iomap.offset + iomap.length < pos + length)
74 length = iomap.offset + iomap.length - pos;
75
76 /*
77 * Now that we have guaranteed that the space allocation will succeed.
78 * we can do the copy-in page by page without having to worry about
79 * failures exposing transient data.
80 */
81 written = actor(inode, pos, length, data, &iomap);
82
83 /*
84 * Now the data has been copied, commit the range we've copied. This
85 * should not fail unless the filesystem has had a fatal error.
86 */
87 ret = ops->iomap_end(inode, pos, length, written > 0 ? written : 0,
88 flags, &iomap);
89
90 return written ? written : ret;
91}
92
93static void
94iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
95{
96 loff_t i_size = i_size_read(inode);
97
98 /*
99 * Only truncate newly allocated pages beyoned EOF, even if the
100 * write started inside the existing inode size.
101 */
102 if (pos + len > i_size)
103 truncate_pagecache_range(inode, max(pos, i_size), pos + len);
104}
105
106static int
107iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
108 struct page **pagep, struct iomap *iomap)
109{
110 pgoff_t index = pos >> PAGE_SHIFT;
111 struct page *page;
112 int status = 0;
113
114 BUG_ON(pos + len > iomap->offset + iomap->length);
115
116 page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
117 if (!page)
118 return -ENOMEM;
119
120 status = __block_write_begin_int(page, pos, len, NULL, iomap);
121 if (unlikely(status)) {
122 unlock_page(page);
123 put_page(page);
124 page = NULL;
125
126 iomap_write_failed(inode, pos, len);
127 }
128
129 *pagep = page;
130 return status;
131}
132
133static int
134iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
135 unsigned copied, struct page *page)
136{
137 int ret;
138
139 ret = generic_write_end(NULL, inode->i_mapping, pos, len,
140 copied, page, NULL);
141 if (ret < len)
142 iomap_write_failed(inode, pos, len);
143 return ret;
144}
145
146static loff_t
147iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
148 struct iomap *iomap)
149{
150 struct iov_iter *i = data;
151 long status = 0;
152 ssize_t written = 0;
153 unsigned int flags = AOP_FLAG_NOFS;
154
155 /*
156 * Copies from kernel address space cannot fail (NFSD is a big user).
157 */
158 if (!iter_is_iovec(i))
159 flags |= AOP_FLAG_UNINTERRUPTIBLE;
160
161 do {
162 struct page *page;
163 unsigned long offset; /* Offset into pagecache page */
164 unsigned long bytes; /* Bytes to write to page */
165 size_t copied; /* Bytes copied from user */
166
167 offset = (pos & (PAGE_SIZE - 1));
168 bytes = min_t(unsigned long, PAGE_SIZE - offset,
169 iov_iter_count(i));
170again:
171 if (bytes > length)
172 bytes = length;
173
174 /*
175 * Bring in the user page that we will copy from _first_.
176 * Otherwise there's a nasty deadlock on copying from the
177 * same page as we're writing to, without it being marked
178 * up-to-date.
179 *
180 * Not only is this an optimisation, but it is also required
181 * to check that the address is actually valid, when atomic
182 * usercopies are used, below.
183 */
184 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
185 status = -EFAULT;
186 break;
187 }
188
189 status = iomap_write_begin(inode, pos, bytes, flags, &page,
190 iomap);
191 if (unlikely(status))
192 break;
193
194 if (mapping_writably_mapped(inode->i_mapping))
195 flush_dcache_page(page);
196
197 pagefault_disable();
198 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
199 pagefault_enable();
200
201 flush_dcache_page(page);
202 mark_page_accessed(page);
203
204 status = iomap_write_end(inode, pos, bytes, copied, page);
205 if (unlikely(status < 0))
206 break;
207 copied = status;
208
209 cond_resched();
210
211 iov_iter_advance(i, copied);
212 if (unlikely(copied == 0)) {
213 /*
214 * If we were unable to copy any data at all, we must
215 * fall back to a single segment length write.
216 *
217 * If we didn't fallback here, we could livelock
218 * because not all segments in the iov can be copied at
219 * once without a pagefault.
220 */
221 bytes = min_t(unsigned long, PAGE_SIZE - offset,
222 iov_iter_single_seg_count(i));
223 goto again;
224 }
225 pos += copied;
226 written += copied;
227 length -= copied;
228
229 balance_dirty_pages_ratelimited(inode->i_mapping);
230 } while (iov_iter_count(i) && length);
231
232 return written ? written : status;
233}
234
235ssize_t
236iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
237 struct iomap_ops *ops)
238{
239 struct inode *inode = iocb->ki_filp->f_mapping->host;
240 loff_t pos = iocb->ki_pos, ret = 0, written = 0;
241
242 while (iov_iter_count(iter)) {
243 ret = iomap_apply(inode, pos, iov_iter_count(iter),
244 IOMAP_WRITE, ops, iter, iomap_write_actor);
245 if (ret <= 0)
246 break;
247 pos += ret;
248 written += ret;
249 }
250
251 return written ? written : ret;
252}
253EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
254
255static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
256 unsigned bytes, struct iomap *iomap)
257{
258 struct page *page;
259 int status;
260
261 status = iomap_write_begin(inode, pos, bytes,
262 AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap);
263 if (status)
264 return status;
265
266 zero_user(page, offset, bytes);
267 mark_page_accessed(page);
268
269 return iomap_write_end(inode, pos, bytes, bytes, page);
270}
271
272static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
273 struct iomap *iomap)
274{
275 sector_t sector = iomap->blkno +
276 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
277
278 return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
279}
280
281static loff_t
282iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
283 void *data, struct iomap *iomap)
284{
285 bool *did_zero = data;
286 loff_t written = 0;
287 int status;
288
289 /* already zeroed? we're done. */
290 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
291 return count;
292
293 do {
294 unsigned offset, bytes;
295
296 offset = pos & (PAGE_SIZE - 1); /* Within page */
297 bytes = min_t(unsigned, PAGE_SIZE - offset, count);
298
299 if (IS_DAX(inode))
300 status = iomap_dax_zero(pos, offset, bytes, iomap);
301 else
302 status = iomap_zero(inode, pos, offset, bytes, iomap);
303 if (status < 0)
304 return status;
305
306 pos += bytes;
307 count -= bytes;
308 written += bytes;
309 if (did_zero)
310 *did_zero = true;
311 } while (count > 0);
312
313 return written;
314}
315
316int
317iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
318 struct iomap_ops *ops)
319{
320 loff_t ret;
321
322 while (len > 0) {
323 ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
324 ops, did_zero, iomap_zero_range_actor);
325 if (ret <= 0)
326 return ret;
327
328 pos += ret;
329 len -= ret;
330 }
331
332 return 0;
333}
334EXPORT_SYMBOL_GPL(iomap_zero_range);
335
336int
337iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
338 struct iomap_ops *ops)
339{
340 unsigned blocksize = (1 << inode->i_blkbits);
341 unsigned off = pos & (blocksize - 1);
342
343 /* Block boundary? Nothing to do */
344 if (!off)
345 return 0;
346 return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
347}
348EXPORT_SYMBOL_GPL(iomap_truncate_page);
349
350static loff_t
351iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
352 void *data, struct iomap *iomap)
353{
354 struct page *page = data;
355 int ret;
356
357 ret = __block_write_begin_int(page, pos & ~PAGE_MASK, length,
358 NULL, iomap);
359 if (ret)
360 return ret;
361
362 block_commit_write(page, 0, length);
363 return length;
364}
365
366int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
367 struct iomap_ops *ops)
368{
369 struct page *page = vmf->page;
370 struct inode *inode = file_inode(vma->vm_file);
371 unsigned long length;
372 loff_t offset, size;
373 ssize_t ret;
374
375 lock_page(page);
376 size = i_size_read(inode);
377 if ((page->mapping != inode->i_mapping) ||
378 (page_offset(page) > size)) {
379 /* We overload EFAULT to mean page got truncated */
380 ret = -EFAULT;
381 goto out_unlock;
382 }
383
384 /* page is wholly or partially inside EOF */
385 if (((page->index + 1) << PAGE_SHIFT) > size)
386 length = size & ~PAGE_MASK;
387 else
388 length = PAGE_SIZE;
389
390 offset = page_offset(page);
391 while (length > 0) {
392 ret = iomap_apply(inode, offset, length, IOMAP_WRITE,
393 ops, page, iomap_page_mkwrite_actor);
394 if (unlikely(ret <= 0))
395 goto out_unlock;
396 offset += ret;
397 length -= ret;
398 }
399
400 set_page_dirty(page);
401 wait_for_stable_page(page);
402 return 0;
403out_unlock:
404 unlock_page(page);
405 return ret;
406}
407EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
408
409struct fiemap_ctx {
410 struct fiemap_extent_info *fi;
411 struct iomap prev;
412};
413
414static int iomap_to_fiemap(struct fiemap_extent_info *fi,
415 struct iomap *iomap, u32 flags)
416{
417 switch (iomap->type) {
418 case IOMAP_HOLE:
419 /* skip holes */
420 return 0;
421 case IOMAP_DELALLOC:
422 flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
423 break;
424 case IOMAP_UNWRITTEN:
425 flags |= FIEMAP_EXTENT_UNWRITTEN;
426 break;
427 case IOMAP_MAPPED:
428 break;
429 }
430
431 return fiemap_fill_next_extent(fi, iomap->offset,
432 iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0,
433 iomap->length, flags | FIEMAP_EXTENT_MERGED);
434
435}
436
437static loff_t
438iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
439 struct iomap *iomap)
440{
441 struct fiemap_ctx *ctx = data;
442 loff_t ret = length;
443
444 if (iomap->type == IOMAP_HOLE)
445 return length;
446
447 ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
448 ctx->prev = *iomap;
449 switch (ret) {
450 case 0: /* success */
451 return length;
452 case 1: /* extent array full */
453 return 0;
454 default:
455 return ret;
456 }
457}
458
459int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
460 loff_t start, loff_t len, struct iomap_ops *ops)
461{
462 struct fiemap_ctx ctx;
463 loff_t ret;
464
465 memset(&ctx, 0, sizeof(ctx));
466 ctx.fi = fi;
467 ctx.prev.type = IOMAP_HOLE;
468
469 ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
470 if (ret)
471 return ret;
472
473 ret = filemap_write_and_wait(inode->i_mapping);
474 if (ret)
475 return ret;
476
477 while (len > 0) {
478 ret = iomap_apply(inode, start, len, 0, ops, &ctx,
479 iomap_fiemap_actor);
480 if (ret < 0)
481 return ret;
482 if (ret == 0)
483 break;
484
485 start += ret;
486 len -= ret;
487 }
488
489 if (ctx.prev.type != IOMAP_HOLE) {
490 ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
491 if (ret < 0)
492 return ret;
493 }
494
495 return 0;
496}
497EXPORT_SYMBOL_GPL(iomap_fiemap);
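A filesystem that supplies an iomap_ops table then calls these exported helpers from its own file and inode operations. A minimal sketch follows, again using the invented "foofs" and omitting most error handling; only iomap_file_buffered_write() and iomap_fiemap() and their argument lists are taken from the code above:

/* Illustrative wiring only; foofs and foofs_iomap_ops are hypothetical. */
static ssize_t
foofs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode	*inode = file_inode(iocb->ki_filp);
	ssize_t		ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = iomap_file_buffered_write(iocb, from, &foofs_iomap_ops);
	if (ret > 0)
		iocb->ki_pos += ret;
	inode_unlock(inode);

	return ret;
}

static int
foofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
	     u64 start, u64 len)
{
	/* iomap_fiemap() flushes dirty data and walks the mappings itself. */
	return iomap_fiemap(inode, fieinfo, start, len, &foofs_iomap_ops);
}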
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 31f3df193bdb..ad2c05e80a83 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2014-2016 Christoph Hellwig.
  */
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/genhd.h>
 #include <linux/slab.h>
 #include <linux/pr.h>
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 6c3b316f932e..4ebaaf4b8d8a 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sunrpc/svc.h>
 #include <linux/exportfs.h>
+#include <linux/iomap.h>
 #include <linux/nfs4.h>
 
 #include "nfsd.h"
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 5d47b4df61ea..35faf128f36d 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -4,6 +4,7 @@ config XFS_FS
 	depends on (64BIT || LBDAF)
 	select EXPORTFS
 	select LIBCRC32C
+	select FS_IOMAP
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a708e38b494c..88c26b827a2d 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -84,7 +84,7 @@ xfs_alloc_lookup_ge(
84 * Lookup the first record less than or equal to [bno, len] 84 * Lookup the first record less than or equal to [bno, len]
85 * in the btree given by cur. 85 * in the btree given by cur.
86 */ 86 */
87int /* error */ 87static int /* error */
88xfs_alloc_lookup_le( 88xfs_alloc_lookup_le(
89 struct xfs_btree_cur *cur, /* btree cursor */ 89 struct xfs_btree_cur *cur, /* btree cursor */
90 xfs_agblock_t bno, /* starting block of extent */ 90 xfs_agblock_t bno, /* starting block of extent */
@@ -1839,19 +1839,8 @@ void
1839xfs_alloc_compute_maxlevels( 1839xfs_alloc_compute_maxlevels(
1840 xfs_mount_t *mp) /* file system mount structure */ 1840 xfs_mount_t *mp) /* file system mount structure */
1841{ 1841{
1842 int level; 1842 mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_alloc_mnr,
1843 uint maxblocks; 1843 (mp->m_sb.sb_agblocks + 1) / 2);
1844 uint maxleafents;
1845 int minleafrecs;
1846 int minnoderecs;
1847
1848 maxleafents = (mp->m_sb.sb_agblocks + 1) / 2;
1849 minleafrecs = mp->m_alloc_mnr[0];
1850 minnoderecs = mp->m_alloc_mnr[1];
1851 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1852 for (level = 1; maxblocks > 1; level++)
1853 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1854 mp->m_ag_maxlevels = level;
1855} 1844}
1856 1845
1857/* 1846/*
@@ -2658,55 +2647,79 @@ error0:
2658 return error; 2647 return error;
2659} 2648}
2660 2649
2661/* 2650/* Ensure that the freelist is at full capacity. */
2662 * Free an extent. 2651int
2663 * Just break up the extent address and hand off to xfs_free_ag_extent 2652xfs_free_extent_fix_freelist(
2664 * after fixing up the freelist. 2653 struct xfs_trans *tp,
2665 */ 2654 xfs_agnumber_t agno,
2666int /* error */ 2655 struct xfs_buf **agbp)
2667xfs_free_extent(
2668 xfs_trans_t *tp, /* transaction pointer */
2669 xfs_fsblock_t bno, /* starting block number of extent */
2670 xfs_extlen_t len) /* length of extent */
2671{ 2656{
2672 xfs_alloc_arg_t args; 2657 struct xfs_alloc_arg args;
2673 int error; 2658 int error;
2674 2659
2675 ASSERT(len != 0); 2660 memset(&args, 0, sizeof(struct xfs_alloc_arg));
2676 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2677 args.tp = tp; 2661 args.tp = tp;
2678 args.mp = tp->t_mountp; 2662 args.mp = tp->t_mountp;
2663 args.agno = agno;
2679 2664
2680 /* 2665 /*
2681 * validate that the block number is legal - the enables us to detect 2666 * validate that the block number is legal - the enables us to detect
2682 * and handle a silent filesystem corruption rather than crashing. 2667 * and handle a silent filesystem corruption rather than crashing.
2683 */ 2668 */
2684 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2685 if (args.agno >= args.mp->m_sb.sb_agcount) 2669 if (args.agno >= args.mp->m_sb.sb_agcount)
2686 return -EFSCORRUPTED; 2670 return -EFSCORRUPTED;
2687 2671
2688 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2689 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2690 return -EFSCORRUPTED;
2691
2692 args.pag = xfs_perag_get(args.mp, args.agno); 2672 args.pag = xfs_perag_get(args.mp, args.agno);
2693 ASSERT(args.pag); 2673 ASSERT(args.pag);
2694 2674
2695 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 2675 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2696 if (error) 2676 if (error)
2697 goto error0; 2677 goto out;
2678
2679 *agbp = args.agbp;
2680out:
2681 xfs_perag_put(args.pag);
2682 return error;
2683}
2684
2685/*
2686 * Free an extent.
2687 * Just break up the extent address and hand off to xfs_free_ag_extent
2688 * after fixing up the freelist.
2689 */
2690int /* error */
2691xfs_free_extent(
2692 struct xfs_trans *tp, /* transaction pointer */
2693 xfs_fsblock_t bno, /* starting block number of extent */
2694 xfs_extlen_t len) /* length of extent */
2695{
2696 struct xfs_mount *mp = tp->t_mountp;
2697 struct xfs_buf *agbp;
2698 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
2699 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
2700 int error;
2701
2702 ASSERT(len != 0);
2703
2704 error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
2705 if (error)
2706 return error;
2707
2708 XFS_WANT_CORRUPTED_GOTO(mp, agbno < mp->m_sb.sb_agblocks, err);
2698 2709
2699 /* validate the extent size is legal now we have the agf locked */ 2710 /* validate the extent size is legal now we have the agf locked */
2700 if (args.agbno + len > 2711 XFS_WANT_CORRUPTED_GOTO(mp,
2701 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { 2712 agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length),
2702 error = -EFSCORRUPTED; 2713 err);
2703 goto error0;
2704 }
2705 2714
2706 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2715 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, 0);
2707 if (!error) 2716 if (error)
2708 xfs_extent_busy_insert(tp, args.agno, args.agbno, len, 0); 2717 goto err;
2709error0: 2718
2710 xfs_perag_put(args.pag); 2719 xfs_extent_busy_insert(tp, agno, agbno, len, 0);
2720 return 0;
2721
2722err:
2723 xfs_trans_brelse(tp, agbp);
2711 return error; 2724 return error;
2712} 2725}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 135eb3d24db7..cf268b2d0b6c 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -212,13 +212,6 @@ xfs_free_extent(
 	xfs_fsblock_t	bno,	/* starting block number of extent */
 	xfs_extlen_t	len);	/* length of extent */
 
-int				/* error */
-xfs_alloc_lookup_le(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		bno,	/* starting block of extent */
-	xfs_extlen_t		len,	/* length of extent */
-	int			*stat);	/* success/failure */
-
 int				/* error */
 xfs_alloc_lookup_ge(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
@@ -236,5 +229,7 @@ xfs_alloc_get_rec(
 int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
+int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
+		struct xfs_buf **agbp);
 
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 882c8d338891..4f2aed04f827 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -50,7 +50,6 @@ int xfs_attr_shortform_lookup(struct xfs_da_args *args);
 int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
 int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
 int xfs_attr_shortform_remove(struct xfs_da_args *args);
-int xfs_attr_shortform_list(struct xfs_attr_list_context *context);
 int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
 int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes);
 void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
@@ -88,8 +87,6 @@ int xfs_attr3_leaf_toosmall(struct xfs_da_state *state, int *retval);
 void xfs_attr3_leaf_unbalance(struct xfs_da_state *state,
 			struct xfs_da_state_blk *drop_blk,
 			struct xfs_da_state_blk *save_blk);
-int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
-
 /*
  * Utility routines.
  */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 932381caef1b..2f2c85cc8117 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -570,14 +570,12 @@ xfs_bmap_validate_ret(
570 */ 570 */
571void 571void
572xfs_bmap_add_free( 572xfs_bmap_add_free(
573 struct xfs_mount *mp, /* mount point structure */
574 struct xfs_bmap_free *flist, /* list of extents */
573 xfs_fsblock_t bno, /* fs block number of extent */ 575 xfs_fsblock_t bno, /* fs block number of extent */
574 xfs_filblks_t len, /* length of extent */ 576 xfs_filblks_t len) /* length of extent */
575 xfs_bmap_free_t *flist, /* list of extents */
576 xfs_mount_t *mp) /* mount point structure */
577{ 577{
578 xfs_bmap_free_item_t *cur; /* current (next) element */ 578 struct xfs_bmap_free_item *new; /* new element */
579 xfs_bmap_free_item_t *new; /* new element */
580 xfs_bmap_free_item_t *prev; /* previous element */
581#ifdef DEBUG 579#ifdef DEBUG
582 xfs_agnumber_t agno; 580 xfs_agnumber_t agno;
583 xfs_agblock_t agbno; 581 xfs_agblock_t agbno;
@@ -597,17 +595,7 @@ xfs_bmap_add_free(
597 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); 595 new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP);
598 new->xbfi_startblock = bno; 596 new->xbfi_startblock = bno;
599 new->xbfi_blockcount = (xfs_extlen_t)len; 597 new->xbfi_blockcount = (xfs_extlen_t)len;
600 for (prev = NULL, cur = flist->xbf_first; 598 list_add(&new->xbfi_list, &flist->xbf_flist);
601 cur != NULL;
602 prev = cur, cur = cur->xbfi_next) {
603 if (cur->xbfi_startblock >= bno)
604 break;
605 }
606 if (prev)
607 prev->xbfi_next = new;
608 else
609 flist->xbf_first = new;
610 new->xbfi_next = cur;
611 flist->xbf_count++; 599 flist->xbf_count++;
612} 600}
613 601
@@ -617,14 +605,10 @@ xfs_bmap_add_free(
617 */ 605 */
618void 606void
619xfs_bmap_del_free( 607xfs_bmap_del_free(
620 xfs_bmap_free_t *flist, /* free item list header */ 608 struct xfs_bmap_free *flist, /* free item list header */
621 xfs_bmap_free_item_t *prev, /* previous item on list, if any */ 609 struct xfs_bmap_free_item *free) /* list item to be freed */
622 xfs_bmap_free_item_t *free) /* list item to be freed */
623{ 610{
624 if (prev) 611 list_del(&free->xbfi_list);
625 prev->xbfi_next = free->xbfi_next;
626 else
627 flist->xbf_first = free->xbfi_next;
628 flist->xbf_count--; 612 flist->xbf_count--;
629 kmem_zone_free(xfs_bmap_free_item_zone, free); 613 kmem_zone_free(xfs_bmap_free_item_zone, free);
630} 614}
@@ -634,17 +618,16 @@ xfs_bmap_del_free(
634 */ 618 */
635void 619void
636xfs_bmap_cancel( 620xfs_bmap_cancel(
637 xfs_bmap_free_t *flist) /* list of bmap_free_items */ 621 struct xfs_bmap_free *flist) /* list of bmap_free_items */
638{ 622{
639 xfs_bmap_free_item_t *free; /* free list item */ 623 struct xfs_bmap_free_item *free; /* free list item */
640 xfs_bmap_free_item_t *next;
641 624
642 if (flist->xbf_count == 0) 625 if (flist->xbf_count == 0)
643 return; 626 return;
644 ASSERT(flist->xbf_first != NULL); 627 while (!list_empty(&flist->xbf_flist)) {
645 for (free = flist->xbf_first; free; free = next) { 628 free = list_first_entry(&flist->xbf_flist,
646 next = free->xbfi_next; 629 struct xfs_bmap_free_item, xbfi_list);
647 xfs_bmap_del_free(flist, NULL, free); 630 xfs_bmap_del_free(flist, free);
648 } 631 }
649 ASSERT(flist->xbf_count == 0); 632 ASSERT(flist->xbf_count == 0);
650} 633}
@@ -699,7 +682,7 @@ xfs_bmap_btree_to_extents(
699 cblock = XFS_BUF_TO_BLOCK(cbp); 682 cblock = XFS_BUF_TO_BLOCK(cbp);
700 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) 683 if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
701 return error; 684 return error;
702 xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp); 685 xfs_bmap_add_free(mp, cur->bc_private.b.flist, cbno, 1);
703 ip->i_d.di_nblocks--; 686 ip->i_d.di_nblocks--;
704 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); 687 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
705 xfs_trans_binval(tp, cbp); 688 xfs_trans_binval(tp, cbp);
@@ -5073,8 +5056,8 @@ xfs_bmap_del_extent(
5073 * If we need to, add to list of extents to delete. 5056 * If we need to, add to list of extents to delete.
5074 */ 5057 */
5075 if (do_fx) 5058 if (do_fx)
5076 xfs_bmap_add_free(del->br_startblock, del->br_blockcount, flist, 5059 xfs_bmap_add_free(mp, flist, del->br_startblock,
5077 mp); 5060 del->br_blockcount);
5078 /* 5061 /*
5079 * Adjust inode # blocks in the file. 5062 * Adjust inode # blocks in the file.
5080 */ 5063 */
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 423a34e832bd..f1f3ae6c0a3f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -62,12 +62,12 @@ struct xfs_bmalloca {
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
-typedef struct xfs_bmap_free_item
+struct xfs_bmap_free_item
 {
 	xfs_fsblock_t		xbfi_startblock;/* starting fs block number */
 	xfs_extlen_t		xbfi_blockcount;/* number of blocks in extent */
-	struct xfs_bmap_free_item *xbfi_next;	/* link to next entry */
-} xfs_bmap_free_item_t;
+	struct list_head	xbfi_list;
+};
 
 /*
  * Header for free extent list.
@@ -85,7 +85,7 @@ typedef struct xfs_bmap_free_item
  */
 typedef	struct xfs_bmap_free
 {
-	xfs_bmap_free_item_t	*xbf_first;	/* list of to-be-free extents */
+	struct list_head	xbf_flist;	/* list of to-be-free extents */
 	int			xbf_count;	/* count of items on list */
 	int			xbf_low;	/* alloc in low mode */
 } xfs_bmap_free_t;
@@ -141,8 +141,10 @@ static inline int xfs_bmapi_aflag(int w)
 
 static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
 {
-	((flp)->xbf_first = NULL, (flp)->xbf_count = 0, \
-		(flp)->xbf_low = 0, *(fbp) = NULLFSBLOCK);
+	INIT_LIST_HEAD(&flp->xbf_flist);
+	flp->xbf_count = 0;
+	flp->xbf_low = 0;
+	*fbp = NULLFSBLOCK;
 }
 
 /*
@@ -191,8 +193,8 @@ void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
 
 int	xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
 void	xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
-void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
-		struct xfs_bmap_free *flist, struct xfs_mount *mp);
+void	xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_bmap_free *flist,
+		xfs_fsblock_t bno, xfs_filblks_t len);
 void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
 int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
 		struct xfs_inode *ip);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6282f6e708af..db0c71e470c9 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -526,7 +526,7 @@ xfs_bmbt_free_block(
 	struct xfs_trans	*tp = cur->bc_tp;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
 
-	xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+	xfs_bmap_add_free(mp, cur->bc_private.b.flist, fsbno, 1);
 	ip->i_d.di_nblocks--;
 
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 1f88e1ce770f..07eeb0b4ca74 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -543,12 +543,12 @@ xfs_btree_ptr_addr(
543 */ 543 */
544STATIC struct xfs_btree_block * 544STATIC struct xfs_btree_block *
545xfs_btree_get_iroot( 545xfs_btree_get_iroot(
546 struct xfs_btree_cur *cur) 546 struct xfs_btree_cur *cur)
547{ 547{
548 struct xfs_ifork *ifp; 548 struct xfs_ifork *ifp;
549 549
550 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork); 550 ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
551 return (struct xfs_btree_block *)ifp->if_broot; 551 return (struct xfs_btree_block *)ifp->if_broot;
552} 552}
553 553
554/* 554/*
@@ -4152,3 +4152,22 @@ xfs_btree_sblock_verify(
 
 	return true;
 }
+
+/*
+ * Calculate the number of btree levels needed to store a given number of
+ * records in a short-format btree.
+ */
+uint
+xfs_btree_compute_maxlevels(
+	struct xfs_mount	*mp,
+	uint			*limits,
+	unsigned long		len)
+{
+	uint			level;
+	unsigned long		maxblocks;
+
+	maxblocks = (len + limits[0] - 1) / limits[0];
+	for (level = 1; maxblocks > 1; level++)
+		maxblocks = (maxblocks + limits[1] - 1) / limits[1];
+	return level;
+}
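To get a feel for the computation (with invented limits, not values from this patch): taking limits[0] = 37 minimum records per leaf block and limits[1] = 19 per node block, storing len = 100000 records needs ceil(100000/37) = 2703 leaf blocks, which collapse to 143, then 8, then 1 block at successive levels, so xfs_btree_compute_maxlevels() returns 4.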
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 2e874be70209..785a99682159 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -474,5 +474,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
 
 bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
 bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits,
+		unsigned long len);
 
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 097bf7717d80..0f1f165f4048 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -356,7 +356,6 @@ xfs_da3_split(
356 struct xfs_da_state_blk *newblk; 356 struct xfs_da_state_blk *newblk;
357 struct xfs_da_state_blk *addblk; 357 struct xfs_da_state_blk *addblk;
358 struct xfs_da_intnode *node; 358 struct xfs_da_intnode *node;
359 struct xfs_buf *bp;
360 int max; 359 int max;
361 int action = 0; 360 int action = 0;
362 int error; 361 int error;
@@ -397,7 +396,9 @@ xfs_da3_split(
397 break; 396 break;
398 } 397 }
399 /* 398 /*
400 * Entry wouldn't fit, split the leaf again. 399 * Entry wouldn't fit, split the leaf again. The new
400 * extrablk will be consumed by xfs_da3_node_split if
401 * the node is split.
401 */ 402 */
402 state->extravalid = 1; 403 state->extravalid = 1;
403 if (state->inleaf) { 404 if (state->inleaf) {
@@ -446,6 +447,14 @@ xfs_da3_split(
446 return 0; 447 return 0;
447 448
448 /* 449 /*
450 * xfs_da3_node_split() should have consumed any extra blocks we added
451 * during a double leaf split in the attr fork. This is guaranteed as
452 * we can't be here if the attr fork only has a single leaf block.
453 */
454 ASSERT(state->extravalid == 0 ||
455 state->path.blk[max].magic == XFS_DIR2_LEAFN_MAGIC);
456
457 /*
449 * Split the root node. 458 * Split the root node.
450 */ 459 */
451 ASSERT(state->path.active == 0); 460 ASSERT(state->path.active == 0);
@@ -457,43 +466,33 @@ xfs_da3_split(
457 } 466 }
458 467
459 /* 468 /*
460 * Update pointers to the node which used to be block 0 and 469 * Update pointers to the node which used to be block 0 and just got
461 * just got bumped because of the addition of a new root node. 470 * bumped because of the addition of a new root node. Note that the
462 * There might be three blocks involved if a double split occurred, 471 * original block 0 could be at any position in the list of blocks in
463 * and the original block 0 could be at any position in the list. 472 * the tree.
464 * 473 *
465 * Note: the magic numbers and sibling pointers are in the same 474 * Note: the magic numbers and sibling pointers are in the same physical
466 * physical place for both v2 and v3 headers (by design). Hence it 475 * place for both v2 and v3 headers (by design). Hence it doesn't matter
467 * doesn't matter which version of the xfs_da_intnode structure we use 476 * which version of the xfs_da_intnode structure we use here as the
468 * here as the result will be the same using either structure. 477 * result will be the same using either structure.
469 */ 478 */
470 node = oldblk->bp->b_addr; 479 node = oldblk->bp->b_addr;
471 if (node->hdr.info.forw) { 480 if (node->hdr.info.forw) {
472 if (be32_to_cpu(node->hdr.info.forw) == addblk->blkno) { 481 ASSERT(be32_to_cpu(node->hdr.info.forw) == addblk->blkno);
473 bp = addblk->bp; 482 node = addblk->bp->b_addr;
474 } else {
475 ASSERT(state->extravalid);
476 bp = state->extrablk.bp;
477 }
478 node = bp->b_addr;
479 node->hdr.info.back = cpu_to_be32(oldblk->blkno); 483 node->hdr.info.back = cpu_to_be32(oldblk->blkno);
480 xfs_trans_log_buf(state->args->trans, bp, 484 xfs_trans_log_buf(state->args->trans, addblk->bp,
481 XFS_DA_LOGRANGE(node, &node->hdr.info, 485 XFS_DA_LOGRANGE(node, &node->hdr.info,
482 sizeof(node->hdr.info))); 486 sizeof(node->hdr.info)));
483 } 487 }
484 node = oldblk->bp->b_addr; 488 node = oldblk->bp->b_addr;
485 if (node->hdr.info.back) { 489 if (node->hdr.info.back) {
486 if (be32_to_cpu(node->hdr.info.back) == addblk->blkno) { 490 ASSERT(be32_to_cpu(node->hdr.info.back) == addblk->blkno);
487 bp = addblk->bp; 491 node = addblk->bp->b_addr;
488 } else {
489 ASSERT(state->extravalid);
490 bp = state->extrablk.bp;
491 }
492 node = bp->b_addr;
493 node->hdr.info.forw = cpu_to_be32(oldblk->blkno); 492 node->hdr.info.forw = cpu_to_be32(oldblk->blkno);
494 xfs_trans_log_buf(state->args->trans, bp, 493 xfs_trans_log_buf(state->args->trans, addblk->bp,
495 XFS_DA_LOGRANGE(node, &node->hdr.info, 494 XFS_DA_LOGRANGE(node, &node->hdr.info,
496 sizeof(node->hdr.info))); 495 sizeof(node->hdr.info)));
497 } 496 }
498 addblk->bp = NULL; 497 addblk->bp = NULL;
499 return 0; 498 return 0;
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index 9d624a622946..f1e8d4dbb600 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -40,8 +40,7 @@ xfs_dir2_sf_entsize(
40 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ 40 int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
41 41
42 count += len; /* name */ 42 count += len; /* name */
43 count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) : 43 count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */
44 sizeof(xfs_dir2_ino4_t); /* ino # */
45 return count; 44 return count;
46} 45}
47 46
@@ -125,33 +124,33 @@ xfs_dir3_sfe_put_ftype(
125static xfs_ino_t 124static xfs_ino_t
126xfs_dir2_sf_get_ino( 125xfs_dir2_sf_get_ino(
127 struct xfs_dir2_sf_hdr *hdr, 126 struct xfs_dir2_sf_hdr *hdr,
128 xfs_dir2_inou_t *from) 127 __uint8_t *from)
129{ 128{
130 if (hdr->i8count) 129 if (hdr->i8count)
131 return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL; 130 return get_unaligned_be64(from) & 0x00ffffffffffffffULL;
132 else 131 else
133 return get_unaligned_be32(&from->i4.i); 132 return get_unaligned_be32(from);
134} 133}
135 134
136static void 135static void
137xfs_dir2_sf_put_ino( 136xfs_dir2_sf_put_ino(
138 struct xfs_dir2_sf_hdr *hdr, 137 struct xfs_dir2_sf_hdr *hdr,
139 xfs_dir2_inou_t *to, 138 __uint8_t *to,
140 xfs_ino_t ino) 139 xfs_ino_t ino)
141{ 140{
142 ASSERT((ino & 0xff00000000000000ULL) == 0); 141 ASSERT((ino & 0xff00000000000000ULL) == 0);
143 142
144 if (hdr->i8count) 143 if (hdr->i8count)
145 put_unaligned_be64(ino, &to->i8.i); 144 put_unaligned_be64(ino, to);
146 else 145 else
147 put_unaligned_be32(ino, &to->i4.i); 146 put_unaligned_be32(ino, to);
148} 147}
149 148
150static xfs_ino_t 149static xfs_ino_t
151xfs_dir2_sf_get_parent_ino( 150xfs_dir2_sf_get_parent_ino(
152 struct xfs_dir2_sf_hdr *hdr) 151 struct xfs_dir2_sf_hdr *hdr)
153{ 152{
154 return xfs_dir2_sf_get_ino(hdr, &hdr->parent); 153 return xfs_dir2_sf_get_ino(hdr, hdr->parent);
155} 154}
156 155
157static void 156static void
@@ -159,7 +158,7 @@ xfs_dir2_sf_put_parent_ino(
159 struct xfs_dir2_sf_hdr *hdr, 158 struct xfs_dir2_sf_hdr *hdr,
160 xfs_ino_t ino) 159 xfs_ino_t ino)
161{ 160{
162 xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino); 161 xfs_dir2_sf_put_ino(hdr, hdr->parent, ino);
163} 162}
164 163
165/* 164/*
@@ -173,8 +172,7 @@ xfs_dir2_sfe_get_ino(
173 struct xfs_dir2_sf_hdr *hdr, 172 struct xfs_dir2_sf_hdr *hdr,
174 struct xfs_dir2_sf_entry *sfep) 173 struct xfs_dir2_sf_entry *sfep)
175{ 174{
176 return xfs_dir2_sf_get_ino(hdr, 175 return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen]);
177 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
178} 176}
179 177
180static void 178static void
@@ -183,8 +181,7 @@ xfs_dir2_sfe_put_ino(
183 struct xfs_dir2_sf_entry *sfep, 181 struct xfs_dir2_sf_entry *sfep,
184 xfs_ino_t ino) 182 xfs_ino_t ino)
185{ 183{
186 xfs_dir2_sf_put_ino(hdr, 184 xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen], ino);
187 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
188} 185}
189 186
190static xfs_ino_t 187static xfs_ino_t
@@ -192,8 +189,7 @@ xfs_dir3_sfe_get_ino(
192 struct xfs_dir2_sf_hdr *hdr, 189 struct xfs_dir2_sf_hdr *hdr,
193 struct xfs_dir2_sf_entry *sfep) 190 struct xfs_dir2_sf_entry *sfep)
194{ 191{
195 return xfs_dir2_sf_get_ino(hdr, 192 return xfs_dir2_sf_get_ino(hdr, &sfep->name[sfep->namelen + 1]);
196 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
197} 193}
198 194
199static void 195static void
@@ -202,8 +198,7 @@ xfs_dir3_sfe_put_ino(
202 struct xfs_dir2_sf_entry *sfep, 198 struct xfs_dir2_sf_entry *sfep,
203 xfs_ino_t ino) 199 xfs_ino_t ino)
204{ 200{
205 xfs_dir2_sf_put_ino(hdr, 201 xfs_dir2_sf_put_ino(hdr, &sfep->name[sfep->namelen + 1], ino);
206 (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
207} 202}
208 203
209 204
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 8d4d8bce41bf..685f23b67056 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -192,12 +192,6 @@ typedef __uint16_t xfs_dir2_data_off_t;
192typedef uint xfs_dir2_data_aoff_t; /* argument form */ 192typedef uint xfs_dir2_data_aoff_t; /* argument form */
193 193
194/* 194/*
195 * Normalized offset (in a data block) of the entry, really xfs_dir2_data_off_t.
196 * Only need 16 bits, this is the byte offset into the single block form.
197 */
198typedef struct { __uint8_t i[2]; } __arch_pack xfs_dir2_sf_off_t;
199
200/*
201 * Offset in data space of a data entry. 195 * Offset in data space of a data entry.
202 */ 196 */
203typedef __uint32_t xfs_dir2_dataptr_t; 197typedef __uint32_t xfs_dir2_dataptr_t;
@@ -214,22 +208,10 @@ typedef xfs_off_t xfs_dir2_off_t;
214 */ 208 */
215typedef __uint32_t xfs_dir2_db_t; 209typedef __uint32_t xfs_dir2_db_t;
216 210
217/* 211#define XFS_INO32_SIZE 4
218 * Inode number stored as 8 8-bit values. 212#define XFS_INO64_SIZE 8
219 */ 213#define XFS_INO64_DIFF (XFS_INO64_SIZE - XFS_INO32_SIZE)
220typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
221
222/*
223 * Inode number stored as 4 8-bit values.
224 * Works a lot of the time, when all the inode numbers in a directory
225 * fit in 32 bits.
226 */
227typedef struct { __uint8_t i[4]; } xfs_dir2_ino4_t;
228 214
229typedef union {
230 xfs_dir2_ino8_t i8;
231 xfs_dir2_ino4_t i4;
232} xfs_dir2_inou_t;
233#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL) 215#define XFS_DIR2_MAX_SHORT_INUM ((xfs_ino_t)0xffffffffULL)
234 216
235/* 217/*
@@ -246,39 +228,38 @@ typedef union {
246typedef struct xfs_dir2_sf_hdr { 228typedef struct xfs_dir2_sf_hdr {
247 __uint8_t count; /* count of entries */ 229 __uint8_t count; /* count of entries */
248 __uint8_t i8count; /* count of 8-byte inode #s */ 230 __uint8_t i8count; /* count of 8-byte inode #s */
249 xfs_dir2_inou_t parent; /* parent dir inode number */ 231 __uint8_t parent[8]; /* parent dir inode number */
250} __arch_pack xfs_dir2_sf_hdr_t; 232} __packed xfs_dir2_sf_hdr_t;
251 233
252typedef struct xfs_dir2_sf_entry { 234typedef struct xfs_dir2_sf_entry {
253 __u8 namelen; /* actual name length */ 235 __u8 namelen; /* actual name length */
254 xfs_dir2_sf_off_t offset; /* saved offset */ 236 __u8 offset[2]; /* saved offset */
255 __u8 name[]; /* name, variable size */ 237 __u8 name[]; /* name, variable size */
256 /* 238 /*
257 * A single byte containing the file type field follows the inode 239 * A single byte containing the file type field follows the inode
258 * number for version 3 directory entries. 240 * number for version 3 directory entries.
259 * 241 *
260 * A xfs_dir2_ino8_t or xfs_dir2_ino4_t follows here, at a 242 * A 64-bit or 32-bit inode number follows here, at a variable offset
261 * variable offset after the name. 243 * after the name.
262 */ 244 */
263} __arch_pack xfs_dir2_sf_entry_t; 245} xfs_dir2_sf_entry_t;
264 246
265static inline int xfs_dir2_sf_hdr_size(int i8count) 247static inline int xfs_dir2_sf_hdr_size(int i8count)
266{ 248{
267 return sizeof(struct xfs_dir2_sf_hdr) - 249 return sizeof(struct xfs_dir2_sf_hdr) -
268 (i8count == 0) * 250 (i8count == 0) * XFS_INO64_DIFF;
269 (sizeof(xfs_dir2_ino8_t) - sizeof(xfs_dir2_ino4_t));
270} 251}
271 252
272static inline xfs_dir2_data_aoff_t 253static inline xfs_dir2_data_aoff_t
273xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep) 254xfs_dir2_sf_get_offset(xfs_dir2_sf_entry_t *sfep)
274{ 255{
275 return get_unaligned_be16(&sfep->offset.i); 256 return get_unaligned_be16(sfep->offset);
276} 257}
277 258
278static inline void 259static inline void
279xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off) 260xfs_dir2_sf_put_offset(xfs_dir2_sf_entry_t *sfep, xfs_dir2_data_aoff_t off)
280{ 261{
281 put_unaligned_be16(off, &sfep->offset.i); 262 put_unaligned_be16(off, sfep->offset);
282} 263}
283 264
284static inline struct xfs_dir2_sf_entry * 265static inline struct xfs_dir2_sf_entry *
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e5bb9cc3b243..c6809ff41197 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -126,13 +126,12 @@ xfs_dir2_block_sfsize(
126 /* 126 /*
127 * Calculate the new size, see if we should give up yet. 127 * Calculate the new size, see if we should give up yet.
128 */ 128 */
129 size = xfs_dir2_sf_hdr_size(i8count) + /* header */ 129 size = xfs_dir2_sf_hdr_size(i8count) + /* header */
130 count + /* namelen */ 130 count * 3 * sizeof(u8) + /* namelen + offset */
131 count * (uint)sizeof(xfs_dir2_sf_off_t) + /* offset */ 131 namelen + /* name */
132 namelen + /* name */ 132 (i8count ? /* inumber */
133 (i8count ? /* inumber */ 133 count * XFS_INO64_SIZE :
134 (uint)sizeof(xfs_dir2_ino8_t) * count : 134 count * XFS_INO32_SIZE);
135 (uint)sizeof(xfs_dir2_ino4_t) * count);
136 if (size > XFS_IFORK_DSIZE(dp)) 135 if (size > XFS_IFORK_DSIZE(dp))
137 return size; /* size value is a failure */ 136 return size; /* size value is a failure */
138 } 137 }
@@ -319,10 +318,7 @@ xfs_dir2_sf_addname(
319 /* 318 /*
320 * Yes, adjust the inode size. old count + (parent + new) 319 * Yes, adjust the inode size. old count + (parent + new)
321 */ 320 */
322 incr_isize += 321 incr_isize += (sfp->count + 2) * XFS_INO64_DIFF;
323 (sfp->count + 2) *
324 ((uint)sizeof(xfs_dir2_ino8_t) -
325 (uint)sizeof(xfs_dir2_ino4_t));
326 objchange = 1; 322 objchange = 1;
327 } 323 }
328 324
@@ -897,11 +893,7 @@ xfs_dir2_sf_replace(
897 int error; /* error return value */ 893 int error; /* error return value */
898 int newsize; /* new inode size */ 894 int newsize; /* new inode size */
899 895
900 newsize = 896 newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
901 dp->i_df.if_bytes +
902 (sfp->count + 1) *
903 ((uint)sizeof(xfs_dir2_ino8_t) -
904 (uint)sizeof(xfs_dir2_ino4_t));
905 /* 897 /*
906 * Won't fit as shortform, convert to block then do replace. 898 * Won't fit as shortform, convert to block then do replace.
907 */ 899 */
@@ -1022,10 +1014,7 @@ xfs_dir2_sf_toino4(
1022 /* 1014 /*
1023 * Compute the new inode size. 1015 * Compute the new inode size.
1024 */ 1016 */
1025 newsize = 1017 newsize = oldsize - (oldsfp->count + 1) * XFS_INO64_DIFF;
1026 oldsize -
1027 (oldsfp->count + 1) *
1028 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1029 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); 1018 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1030 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); 1019 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1031 /* 1020 /*
@@ -1048,7 +1037,7 @@ xfs_dir2_sf_toino4(
1048 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), 1037 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1049 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { 1038 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1050 sfep->namelen = oldsfep->namelen; 1039 sfep->namelen = oldsfep->namelen;
1051 sfep->offset = oldsfep->offset; 1040 memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
1052 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1041 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1053 dp->d_ops->sf_put_ino(sfp, sfep, 1042 dp->d_ops->sf_put_ino(sfp, sfep,
1054 dp->d_ops->sf_get_ino(oldsfp, oldsfep)); 1043 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
@@ -1098,10 +1087,7 @@ xfs_dir2_sf_toino8(
1098 /* 1087 /*
1099 * Compute the new inode size (nb: entry count + 1 for parent) 1088 * Compute the new inode size (nb: entry count + 1 for parent)
1100 */ 1089 */
1101 newsize = 1090 newsize = oldsize + (oldsfp->count + 1) * XFS_INO64_DIFF;
1102 oldsize +
1103 (oldsfp->count + 1) *
1104 ((uint)sizeof(xfs_dir2_ino8_t) - (uint)sizeof(xfs_dir2_ino4_t));
1105 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK); 1091 xfs_idata_realloc(dp, -oldsize, XFS_DATA_FORK);
1106 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK); 1092 xfs_idata_realloc(dp, newsize, XFS_DATA_FORK);
1107 /* 1093 /*
@@ -1124,7 +1110,7 @@ xfs_dir2_sf_toino8(
1124 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep), 1110 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
1125 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) { 1111 oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
1126 sfep->namelen = oldsfep->namelen; 1112 sfep->namelen = oldsfep->namelen;
1127 sfep->offset = oldsfep->offset; 1113 memcpy(sfep->offset, oldsfep->offset, sizeof(sfep->offset));
1128 memcpy(sfep->name, oldsfep->name, sfep->namelen); 1114 memcpy(sfep->name, oldsfep->name, sfep->namelen);
1129 dp->d_ops->sf_put_ino(sfp, sfep, 1115 dp->d_ops->sf_put_ino(sfp, sfep,
1130 dp->d_ops->sf_get_ino(oldsfp, oldsfep)); 1116 dp->d_ops->sf_get_ino(oldsfp, oldsfep));
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index dc97eb21af07..adb204d40f22 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1435,41 +1435,57 @@ typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
1435 * with the crc feature bit, and all accesses to them must be conditional on 1435 * with the crc feature bit, and all accesses to them must be conditional on
1436 * that flag. 1436 * that flag.
1437 */ 1437 */
1438/* short form block header */
1439struct xfs_btree_block_shdr {
1440 __be32 bb_leftsib;
1441 __be32 bb_rightsib;
1442
1443 __be64 bb_blkno;
1444 __be64 bb_lsn;
1445 uuid_t bb_uuid;
1446 __be32 bb_owner;
1447 __le32 bb_crc;
1448};
1449
1450/* long form block header */
1451struct xfs_btree_block_lhdr {
1452 __be64 bb_leftsib;
1453 __be64 bb_rightsib;
1454
1455 __be64 bb_blkno;
1456 __be64 bb_lsn;
1457 uuid_t bb_uuid;
1458 __be64 bb_owner;
1459 __le32 bb_crc;
1460 __be32 bb_pad; /* padding for alignment */
1461};
1462
1438struct xfs_btree_block { 1463struct xfs_btree_block {
1439 __be32 bb_magic; /* magic number for block type */ 1464 __be32 bb_magic; /* magic number for block type */
1440 __be16 bb_level; /* 0 is a leaf */ 1465 __be16 bb_level; /* 0 is a leaf */
1441 __be16 bb_numrecs; /* current # of data records */ 1466 __be16 bb_numrecs; /* current # of data records */
1442 union { 1467 union {
1443 struct { 1468 struct xfs_btree_block_shdr s;
1444 __be32 bb_leftsib; 1469 struct xfs_btree_block_lhdr l;
1445 __be32 bb_rightsib;
1446
1447 __be64 bb_blkno;
1448 __be64 bb_lsn;
1449 uuid_t bb_uuid;
1450 __be32 bb_owner;
1451 __le32 bb_crc;
1452 } s; /* short form pointers */
1453 struct {
1454 __be64 bb_leftsib;
1455 __be64 bb_rightsib;
1456
1457 __be64 bb_blkno;
1458 __be64 bb_lsn;
1459 uuid_t bb_uuid;
1460 __be64 bb_owner;
1461 __le32 bb_crc;
1462 __be32 bb_pad; /* padding for alignment */
1463 } l; /* long form pointers */
1464 } bb_u; /* rest */ 1470 } bb_u; /* rest */
1465}; 1471};
1466 1472
1467#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */ 1473/* size of a short form block */
1468#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */ 1474#define XFS_BTREE_SBLOCK_LEN \
1475 (offsetof(struct xfs_btree_block, bb_u) + \
1476 offsetof(struct xfs_btree_block_shdr, bb_blkno))
1477/* size of a long form block */
1478#define XFS_BTREE_LBLOCK_LEN \
1479 (offsetof(struct xfs_btree_block, bb_u) + \
1480 offsetof(struct xfs_btree_block_lhdr, bb_blkno))
1469 1481
1470/* sizes of CRC enabled btree blocks */ 1482/* sizes of CRC enabled btree blocks */
1471#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40) 1483#define XFS_BTREE_SBLOCK_CRC_LEN \
1472#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48) 1484 (offsetof(struct xfs_btree_block, bb_u) + \
1485 sizeof(struct xfs_btree_block_shdr))
1486#define XFS_BTREE_LBLOCK_CRC_LEN \
1487 (offsetof(struct xfs_btree_block, bb_u) + \
1488 sizeof(struct xfs_btree_block_lhdr))
1473 1489
1474#define XFS_BTREE_SBLOCK_CRC_OFF \ 1490#define XFS_BTREE_SBLOCK_CRC_OFF \
1475 offsetof(struct xfs_btree_block, bb_u.s.bb_crc) 1491 offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
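
The btree block header split above turns the hard-coded 16/24-byte block lengths (and their +40/+48 CRC variants) into offsetof/sizeof expressions over the new xfs_btree_block_shdr and xfs_btree_block_lhdr structs. A stand-alone sketch that checks the two formulations agree (not part of the patch; __beXX/__leXX are stood in by fixed-width integers, uuid_t by a 16-byte array, and a typical LP64 ABI with natural alignment is assumed):

	#include <assert.h>
	#include <stddef.h>
	#include <stdint.h>

	typedef uint8_t uuid_t[16];

	struct xfs_btree_block_shdr {
		uint32_t bb_leftsib, bb_rightsib;
		uint64_t bb_blkno, bb_lsn;
		uuid_t	 bb_uuid;
		uint32_t bb_owner, bb_crc;
	};

	struct xfs_btree_block_lhdr {
		uint64_t bb_leftsib, bb_rightsib;
		uint64_t bb_blkno, bb_lsn;
		uuid_t	 bb_uuid;
		uint64_t bb_owner;
		uint32_t bb_crc, bb_pad;
	};

	struct xfs_btree_block {
		uint32_t bb_magic;
		uint16_t bb_level, bb_numrecs;
		union {
			struct xfs_btree_block_shdr s;
			struct xfs_btree_block_lhdr l;
		} bb_u;
	};

	int main(void)
	{
		size_t hdr = offsetof(struct xfs_btree_block, bb_u);

		/* non-CRC lengths stop at bb_blkno, matching the old 16/24 constants */
		assert(hdr + offsetof(struct xfs_btree_block_shdr, bb_blkno) == 16);
		assert(hdr + offsetof(struct xfs_btree_block_lhdr, bb_blkno) == 24);

		/* CRC-enabled lengths cover the whole header: old 16+40 and 24+48 */
		assert(hdr + sizeof(struct xfs_btree_block_shdr) == 56);
		assert(hdr + sizeof(struct xfs_btree_block_lhdr) == 72);
		return 0;
	}
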
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index fffe3d01bd9f..f5ec9c5ccae6 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -521,12 +521,8 @@ typedef struct xfs_swapext
521#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection) 521#define XFS_IOC_ERROR_CLEARALL _IOW ('X', 117, struct xfs_error_injection)
522/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */ 522/* XFS_IOC_ATTRCTL_BY_HANDLE -- deprecated 118 */
523 523
524/* XFS_IOC_FREEZE -- FIFREEZE 119 */ 524#define XFS_IOC_FREEZE _IOWR('X', 119, int) /* aka FIFREEZE */
525/* XFS_IOC_THAW -- FITHAW 120 */ 525#define XFS_IOC_THAW _IOWR('X', 120, int) /* aka FITHAW */
526#ifndef FIFREEZE
527#define XFS_IOC_FREEZE _IOWR('X', 119, int)
528#define XFS_IOC_THAW _IOWR('X', 120, int)
529#endif
530 526
531#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq) 527#define XFS_IOC_FSSETDM_BY_HANDLE _IOW ('X', 121, struct xfs_fsop_setdm_handlereq)
532#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq) 528#define XFS_IOC_ATTRLIST_BY_HANDLE _IOW ('X', 122, struct xfs_fsop_attrlist_handlereq)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 22297f9b0fd5..4b1e408169a8 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -1828,9 +1828,8 @@ xfs_difree_inode_chunk(
1828 1828
1829 if (!xfs_inobt_issparse(rec->ir_holemask)) { 1829 if (!xfs_inobt_issparse(rec->ir_holemask)) {
1830 /* not sparse, calculate extent info directly */ 1830 /* not sparse, calculate extent info directly */
1831 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, 1831 xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, sagbno),
1832 XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), 1832 mp->m_ialloc_blks);
1833 mp->m_ialloc_blks, flist, mp);
1834 return; 1833 return;
1835 } 1834 }
1836 1835
@@ -1873,8 +1872,8 @@ xfs_difree_inode_chunk(
1873 1872
1874 ASSERT(agbno % mp->m_sb.sb_spino_align == 0); 1873 ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1875 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); 1874 ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1876 xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, 1875 xfs_bmap_add_free(mp, flist, XFS_AGB_TO_FSB(mp, agno, agbno),
1877 flist, mp); 1876 contigblk);
1878 1877
1879 /* reset range to current bit and carry on... */ 1878 /* reset range to current bit and carry on... */
1880 startidx = endidx = nextbit; 1879 startidx = endidx = nextbit;
@@ -2395,20 +2394,11 @@ void
2395xfs_ialloc_compute_maxlevels( 2394xfs_ialloc_compute_maxlevels(
2396 xfs_mount_t *mp) /* file system mount structure */ 2395 xfs_mount_t *mp) /* file system mount structure */
2397{ 2396{
2398 int level; 2397 uint inodes;
2399 uint maxblocks; 2398
2400 uint maxleafents; 2399 inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2401 int minleafrecs; 2400 mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
2402 int minnoderecs; 2401 inodes);
2403
2404 maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
2405 XFS_INODES_PER_CHUNK_LOG;
2406 minleafrecs = mp->m_inobt_mnr[0];
2407 minnoderecs = mp->m_inobt_mnr[1];
2408 maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
2409 for (level = 1; maxblocks > 1; level++)
2410 maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
2411 mp->m_in_maxlevels = level;
2412} 2402}
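
The open-coded loop removed above is the usual "how many btree levels can this many records need" calculation; the new call defers it to xfs_btree_compute_maxlevels(), which is expected to perform the same arithmetic. A sketch of that computation, lifted from the deleted lines (not part of the patch; the function name is illustrative):

	/* ceil-divide the leaf entries by min records per leaf, then keep
	 * ceil-dividing by min records per node until one block remains */
	unsigned int max_btree_levels(unsigned long long maxleafents,
				      unsigned int minleafrecs,
				      unsigned int minnoderecs)
	{
		unsigned long long maxblocks;
		unsigned int level;

		maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
		for (level = 1; maxblocks > 1; level++)
			maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
		return level;
	}
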
2413 2403
2414/* 2404/*
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 951c044e24e4..e2e1106c9fad 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -70,7 +70,7 @@ const struct xfs_buf_ops xfs_rtbuf_ops = {
70 * Get a buffer for the bitmap or summary file block specified. 70 * Get a buffer for the bitmap or summary file block specified.
71 * The buffer is returned read and locked. 71 * The buffer is returned read and locked.
72 */ 72 */
73int 73static int
74xfs_rtbuf_get( 74xfs_rtbuf_get(
75 xfs_mount_t *mp, /* file system mount structure */ 75 xfs_mount_t *mp, /* file system mount structure */
76 xfs_trans_t *tp, /* transaction pointer */ 76 xfs_trans_t *tp, /* transaction pointer */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 87d2b215cbbd..7575cfc3ad15 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -87,6 +87,12 @@ xfs_find_bdev_for_inode(
87 * We're now finished for good with this page. Update the page state via the 87 * We're now finished for good with this page. Update the page state via the
88 * associated buffer_heads, paying attention to the start and end offsets that 88 * associated buffer_heads, paying attention to the start and end offsets that
89 * we need to process on the page. 89 * we need to process on the page.
90 *
91 * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
92 * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
93 * the page at all, as we may be racing with memory reclaim and it can free both
94 * the bufferhead chain and the page as it will see the page as clean and
95 * unused.
90 */ 96 */
91static void 97static void
92xfs_finish_page_writeback( 98xfs_finish_page_writeback(
@@ -95,8 +101,9 @@ xfs_finish_page_writeback(
95 int error) 101 int error)
96{ 102{
97 unsigned int end = bvec->bv_offset + bvec->bv_len - 1; 103 unsigned int end = bvec->bv_offset + bvec->bv_len - 1;
98 struct buffer_head *head, *bh; 104 struct buffer_head *head, *bh, *next;
99 unsigned int off = 0; 105 unsigned int off = 0;
106 unsigned int bsize;
100 107
101 ASSERT(bvec->bv_offset < PAGE_SIZE); 108 ASSERT(bvec->bv_offset < PAGE_SIZE);
102 ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0); 109 ASSERT((bvec->bv_offset & ((1 << inode->i_blkbits) - 1)) == 0);
@@ -105,15 +112,17 @@ xfs_finish_page_writeback(
105 112
106 bh = head = page_buffers(bvec->bv_page); 113 bh = head = page_buffers(bvec->bv_page);
107 114
115 bsize = bh->b_size;
108 do { 116 do {
117 next = bh->b_this_page;
109 if (off < bvec->bv_offset) 118 if (off < bvec->bv_offset)
110 goto next_bh; 119 goto next_bh;
111 if (off > end) 120 if (off > end)
112 break; 121 break;
113 bh->b_end_io(bh, !error); 122 bh->b_end_io(bh, !error);
114next_bh: 123next_bh:
115 off += bh->b_size; 124 off += bsize;
116 } while ((bh = bh->b_this_page) != head); 125 } while ((bh = next) != head);
117} 126}
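
The "landmine" fix above boils down to a general rule for walking a chain whose completion callback may free the nodes (and, on the last buffer, the page holding them): cache the forward pointer and any other field you still need before invoking the callback. A reduced sketch of the pattern (not part of the patch; struct and function names are illustrative):

	struct chain_node {
		struct chain_node *next;
		void (*end_io)(struct chain_node *node);
	};

	void complete_chain(struct chain_node *head)
	{
		struct chain_node *node = head, *next;

		do {
			next = node->next;	/* node may be freed by end_io() */
			node->end_io(node);
		} while ((node = next) != head);
	}

As in the hunk above, the final comparison only uses the saved pointer value and never dereferences a possibly freed node.
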
118 127
119/* 128/*
@@ -1041,6 +1050,20 @@ xfs_vm_releasepage(
1041 1050
1042 trace_xfs_releasepage(page->mapping->host, page, 0, 0); 1051 trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1043 1052
1053 /*
1054 * mm accommodates an old ext3 case where clean pages might not have had
1055 * the dirty bit cleared. Thus, it can send actual dirty pages to
1056 * ->releasepage() via shrink_active_list(). Conversely,
1057 * block_invalidatepage() can send pages that are still marked dirty
1058 * but otherwise have invalidated buffers.
1059 *
1060 * We've historically freed buffers on the latter. Instead, quietly
1061 * filter out all dirty pages to avoid spurious buffer state warnings.
1062 * This can likely be removed once shrink_active_list() is fixed.
1063 */
1064 if (PageDirty(page))
1065 return 0;
1066
1044 xfs_count_page_state(page, &delalloc, &unwritten); 1067 xfs_count_page_state(page, &delalloc, &unwritten);
1045 1068
1046 if (WARN_ON_ONCE(delalloc)) 1069 if (WARN_ON_ONCE(delalloc))
@@ -1144,6 +1167,8 @@ __xfs_get_blocks(
1144 ssize_t size; 1167 ssize_t size;
1145 int new = 0; 1168 int new = 0;
1146 1169
1170 BUG_ON(create && !direct);
1171
1147 if (XFS_FORCED_SHUTDOWN(mp)) 1172 if (XFS_FORCED_SHUTDOWN(mp))
1148 return -EIO; 1173 return -EIO;
1149 1174
@@ -1151,22 +1176,14 @@ __xfs_get_blocks(
1151 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); 1176 ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
1152 size = bh_result->b_size; 1177 size = bh_result->b_size;
1153 1178
1154 if (!create && direct && offset >= i_size_read(inode)) 1179 if (!create && offset >= i_size_read(inode))
1155 return 0; 1180 return 0;
1156 1181
1157 /* 1182 /*
1158 * Direct I/O is usually done on preallocated files, so try getting 1183 * Direct I/O is usually done on preallocated files, so try getting
1159 * a block mapping without an exclusive lock first. For buffered 1184 * a block mapping without an exclusive lock first.
1160 * writes we already have the exclusive iolock anyway, so avoiding
1161 * a lock roundtrip here by taking the ilock exclusive from the
1162 * beginning is a useful micro optimization.
1163 */ 1185 */
1164 if (create && !direct) { 1186 lockmode = xfs_ilock_data_map_shared(ip);
1165 lockmode = XFS_ILOCK_EXCL;
1166 xfs_ilock(ip, lockmode);
1167 } else {
1168 lockmode = xfs_ilock_data_map_shared(ip);
1169 }
1170 1187
1171 ASSERT(offset <= mp->m_super->s_maxbytes); 1188 ASSERT(offset <= mp->m_super->s_maxbytes);
1172 if (offset + size > mp->m_super->s_maxbytes) 1189 if (offset + size > mp->m_super->s_maxbytes)
@@ -1185,37 +1202,19 @@ __xfs_get_blocks(
1185 (imap.br_startblock == HOLESTARTBLOCK || 1202 (imap.br_startblock == HOLESTARTBLOCK ||
1186 imap.br_startblock == DELAYSTARTBLOCK) || 1203 imap.br_startblock == DELAYSTARTBLOCK) ||
1187 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { 1204 (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1188 if (direct || xfs_get_extsz_hint(ip)) { 1205 /*
1189 /* 1206 * xfs_iomap_write_direct() expects the shared lock. It
1190 * xfs_iomap_write_direct() expects the shared lock. It 1207 * is unlocked on return.
1191 * is unlocked on return. 1208 */
1192 */ 1209 if (lockmode == XFS_ILOCK_EXCL)
1193 if (lockmode == XFS_ILOCK_EXCL) 1210 xfs_ilock_demote(ip, lockmode);
1194 xfs_ilock_demote(ip, lockmode);
1195
1196 error = xfs_iomap_write_direct(ip, offset, size,
1197 &imap, nimaps);
1198 if (error)
1199 return error;
1200 new = 1;
1201 1211
1202 } else { 1212 error = xfs_iomap_write_direct(ip, offset, size,
1203 /* 1213 &imap, nimaps);
1204 * Delalloc reservations do not require a transaction, 1214 if (error)
1205 * we can go on without dropping the lock here. If we 1215 return error;
1206 * are allocating a new delalloc block, make sure that 1216 new = 1;
1207 * we set the new flag so that we mark the buffer new so
1208 * that we know that it is newly allocated if the write
1209 * fails.
1210 */
1211 if (nimaps && imap.br_startblock == HOLESTARTBLOCK)
1212 new = 1;
1213 error = xfs_iomap_write_delay(ip, offset, size, &imap);
1214 if (error)
1215 goto out_unlock;
1216 1217
1217 xfs_iunlock(ip, lockmode);
1218 }
1219 trace_xfs_get_blocks_alloc(ip, offset, size, 1218 trace_xfs_get_blocks_alloc(ip, offset, size,
1220 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN 1219 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1221 : XFS_IO_DELALLOC, &imap); 1220 : XFS_IO_DELALLOC, &imap);
@@ -1236,9 +1235,7 @@ __xfs_get_blocks(
1236 } 1235 }
1237 1236
1238 /* trim mapping down to size requested */ 1237 /* trim mapping down to size requested */
1239 if (direct || size > (1 << inode->i_blkbits)) 1238 xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
1240 xfs_map_trim_size(inode, iblock, bh_result,
1241 &imap, offset, size);
1242 1239
1243 /* 1240 /*
1244 * For unwritten extents do not report a disk address in the buffered 1241 * For unwritten extents do not report a disk address in the buffered
@@ -1251,7 +1248,7 @@ __xfs_get_blocks(
1251 if (ISUNWRITTEN(&imap)) 1248 if (ISUNWRITTEN(&imap))
1252 set_buffer_unwritten(bh_result); 1249 set_buffer_unwritten(bh_result);
1253 /* direct IO needs special help */ 1250 /* direct IO needs special help */
1254 if (create && direct) { 1251 if (create) {
1255 if (dax_fault) 1252 if (dax_fault)
1256 ASSERT(!ISUNWRITTEN(&imap)); 1253 ASSERT(!ISUNWRITTEN(&imap));
1257 else 1254 else
@@ -1280,14 +1277,7 @@ __xfs_get_blocks(
1280 (new || ISUNWRITTEN(&imap)))) 1277 (new || ISUNWRITTEN(&imap))))
1281 set_buffer_new(bh_result); 1278 set_buffer_new(bh_result);
1282 1279
1283 if (imap.br_startblock == DELAYSTARTBLOCK) { 1280 BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK);
1284 BUG_ON(direct);
1285 if (create) {
1286 set_buffer_uptodate(bh_result);
1287 set_buffer_mapped(bh_result);
1288 set_buffer_delay(bh_result);
1289 }
1290 }
1291 1281
1292 return 0; 1282 return 0;
1293 1283
@@ -1337,7 +1327,7 @@ xfs_get_blocks_dax_fault(
1337 * whereas if we have flags set we will always be called in task context 1327 * whereas if we have flags set we will always be called in task context
1338 * (i.e. from a workqueue). 1328 * (i.e. from a workqueue).
1339 */ 1329 */
1340STATIC int 1330int
1341xfs_end_io_direct_write( 1331xfs_end_io_direct_write(
1342 struct kiocb *iocb, 1332 struct kiocb *iocb,
1343 loff_t offset, 1333 loff_t offset,
@@ -1408,234 +1398,10 @@ xfs_vm_direct_IO(
1408 struct kiocb *iocb, 1398 struct kiocb *iocb,
1409 struct iov_iter *iter) 1399 struct iov_iter *iter)
1410{ 1400{
1411 struct inode *inode = iocb->ki_filp->f_mapping->host;
1412 dio_iodone_t *endio = NULL;
1413 int flags = 0;
1414 struct block_device *bdev;
1415
1416 if (iov_iter_rw(iter) == WRITE) {
1417 endio = xfs_end_io_direct_write;
1418 flags = DIO_ASYNC_EXTEND;
1419 }
1420
1421 if (IS_DAX(inode)) {
1422 return dax_do_io(iocb, inode, iter,
1423 xfs_get_blocks_direct, endio, 0);
1424 }
1425
1426 bdev = xfs_find_bdev_for_inode(inode);
1427 return __blockdev_direct_IO(iocb, inode, bdev, iter,
1428 xfs_get_blocks_direct, endio, NULL, flags);
1429}
1430
1431/*
1432 * Punch out the delalloc blocks we have already allocated.
1433 *
1434 * Don't bother with xfs_setattr given that nothing can have made it to disk yet
1435 * as the page is still locked at this point.
1436 */
1437STATIC void
1438xfs_vm_kill_delalloc_range(
1439 struct inode *inode,
1440 loff_t start,
1441 loff_t end)
1442{
1443 struct xfs_inode *ip = XFS_I(inode);
1444 xfs_fileoff_t start_fsb;
1445 xfs_fileoff_t end_fsb;
1446 int error;
1447
1448 start_fsb = XFS_B_TO_FSB(ip->i_mount, start);
1449 end_fsb = XFS_B_TO_FSB(ip->i_mount, end);
1450 if (end_fsb <= start_fsb)
1451 return;
1452
1453 xfs_ilock(ip, XFS_ILOCK_EXCL);
1454 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1455 end_fsb - start_fsb);
1456 if (error) {
1457 /* something screwed, just bail */
1458 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1459 xfs_alert(ip->i_mount,
1460 "xfs_vm_write_failed: unable to clean up ino %lld",
1461 ip->i_ino);
1462 }
1463 }
1464 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1465}
1466
1467STATIC void
1468xfs_vm_write_failed(
1469 struct inode *inode,
1470 struct page *page,
1471 loff_t pos,
1472 unsigned len)
1473{
1474 loff_t block_offset;
1475 loff_t block_start;
1476 loff_t block_end;
1477 loff_t from = pos & (PAGE_SIZE - 1);
1478 loff_t to = from + len;
1479 struct buffer_head *bh, *head;
1480 struct xfs_mount *mp = XFS_I(inode)->i_mount;
1481
1482 /* 1401 /*
1483 * The request pos offset might be 32 or 64 bit, this is all fine 1402 * We just need the method present so that open/fcntl allow direct I/O.
1484 * on 64-bit platform. However, for 64-bit pos request on 32-bit
1485 * platform, the high 32-bit will be masked off if we evaluate the
1486 * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is
1487 * 0xfffff000 as an unsigned long, hence the result is incorrect
1488 * which could cause the following ASSERT failed in most cases.
1489 * In order to avoid this, we can evaluate the block_offset of the
1490 * start of the page by using shifts rather than masks the mismatch
1491 * problem.
1492 */ 1403 */
1493 block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; 1404 return -EINVAL;
1494
1495 ASSERT(block_offset + from == pos);
1496
1497 head = page_buffers(page);
1498 block_start = 0;
1499 for (bh = head; bh != head || !block_start;
1500 bh = bh->b_this_page, block_start = block_end,
1501 block_offset += bh->b_size) {
1502 block_end = block_start + bh->b_size;
1503
1504 /* skip buffers before the write */
1505 if (block_end <= from)
1506 continue;
1507
1508 /* if the buffer is after the write, we're done */
1509 if (block_start >= to)
1510 break;
1511
1512 /*
1513 * Process delalloc and unwritten buffers beyond EOF. We can
1514 * encounter unwritten buffers in the event that a file has
1515 * post-EOF unwritten extents and an extending write happens to
1516 * fail (e.g., an unaligned write that also involves a delalloc
1517 * to the same page).
1518 */
1519 if (!buffer_delay(bh) && !buffer_unwritten(bh))
1520 continue;
1521
1522 if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
1523 block_offset < i_size_read(inode))
1524 continue;
1525
1526 if (buffer_delay(bh))
1527 xfs_vm_kill_delalloc_range(inode, block_offset,
1528 block_offset + bh->b_size);
1529
1530 /*
1531 * This buffer does not contain data anymore. make sure anyone
1532 * who finds it knows that for certain.
1533 */
1534 clear_buffer_delay(bh);
1535 clear_buffer_uptodate(bh);
1536 clear_buffer_mapped(bh);
1537 clear_buffer_new(bh);
1538 clear_buffer_dirty(bh);
1539 clear_buffer_unwritten(bh);
1540 }
1541
1542}
1543
1544/*
1545 * This used to call block_write_begin(), but it unlocks and releases the page
1546 * on error, and we need that page to be able to punch stale delalloc blocks out
1547 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
1548 * the appropriate point.
1549 */
1550STATIC int
1551xfs_vm_write_begin(
1552 struct file *file,
1553 struct address_space *mapping,
1554 loff_t pos,
1555 unsigned len,
1556 unsigned flags,
1557 struct page **pagep,
1558 void **fsdata)
1559{
1560 pgoff_t index = pos >> PAGE_SHIFT;
1561 struct page *page;
1562 int status;
1563 struct xfs_mount *mp = XFS_I(mapping->host)->i_mount;
1564
1565 ASSERT(len <= PAGE_SIZE);
1566
1567 page = grab_cache_page_write_begin(mapping, index, flags);
1568 if (!page)
1569 return -ENOMEM;
1570
1571 status = __block_write_begin(page, pos, len, xfs_get_blocks);
1572 if (xfs_mp_fail_writes(mp))
1573 status = -EIO;
1574 if (unlikely(status)) {
1575 struct inode *inode = mapping->host;
1576 size_t isize = i_size_read(inode);
1577
1578 xfs_vm_write_failed(inode, page, pos, len);
1579 unlock_page(page);
1580
1581 /*
1582 * If the write is beyond EOF, we only want to kill blocks
1583 * allocated in this write, not blocks that were previously
1584 * written successfully.
1585 */
1586 if (xfs_mp_fail_writes(mp))
1587 isize = 0;
1588 if (pos + len > isize) {
1589 ssize_t start = max_t(ssize_t, pos, isize);
1590
1591 truncate_pagecache_range(inode, start, pos + len);
1592 }
1593
1594 put_page(page);
1595 page = NULL;
1596 }
1597
1598 *pagep = page;
1599 return status;
1600}
1601
1602/*
1603 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
1604 * this specific write because they will never be written. Previous writes
1605 * beyond EOF where block allocation succeeded do not need to be trashed, so
1606 * only new blocks from this write should be trashed. For blocks within
1607 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
1608 * written with all the other valid data.
1609 */
1610STATIC int
1611xfs_vm_write_end(
1612 struct file *file,
1613 struct address_space *mapping,
1614 loff_t pos,
1615 unsigned len,
1616 unsigned copied,
1617 struct page *page,
1618 void *fsdata)
1619{
1620 int ret;
1621
1622 ASSERT(len <= PAGE_SIZE);
1623
1624 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
1625 if (unlikely(ret < len)) {
1626 struct inode *inode = mapping->host;
1627 size_t isize = i_size_read(inode);
1628 loff_t to = pos + len;
1629
1630 if (to > isize) {
1631 /* only kill blocks in this write beyond EOF */
1632 if (pos > isize)
1633 isize = pos;
1634 xfs_vm_kill_delalloc_range(inode, isize, to);
1635 truncate_pagecache_range(inode, isize, to);
1636 }
1637 }
1638 return ret;
1639} 1405}
1640 1406
1641STATIC sector_t 1407STATIC sector_t
@@ -1748,8 +1514,6 @@ const struct address_space_operations xfs_address_space_operations = {
1748 .set_page_dirty = xfs_vm_set_page_dirty, 1514 .set_page_dirty = xfs_vm_set_page_dirty,
1749 .releasepage = xfs_vm_releasepage, 1515 .releasepage = xfs_vm_releasepage,
1750 .invalidatepage = xfs_vm_invalidatepage, 1516 .invalidatepage = xfs_vm_invalidatepage,
1751 .write_begin = xfs_vm_write_begin,
1752 .write_end = xfs_vm_write_end,
1753 .bmap = xfs_vm_bmap, 1517 .bmap = xfs_vm_bmap,
1754 .direct_IO = xfs_vm_direct_IO, 1518 .direct_IO = xfs_vm_direct_IO,
1755 .migratepage = buffer_migrate_page, 1519 .migratepage = buffer_migrate_page,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 814aab790713..bf2d9a141a73 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -60,6 +60,9 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
60int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, 60int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
61 struct buffer_head *map_bh, int create); 61 struct buffer_head *map_bh, int create);
62 62
63int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset,
64 ssize_t size, void *private);
65
63extern void xfs_count_page_state(struct page *, int *, int *); 66extern void xfs_count_page_state(struct page *, int *, int *);
64extern struct block_device *xfs_find_bdev_for_inode(struct inode *); 67extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
65 68
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 55d214981ed2..be0b79d8900f 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -322,7 +322,7 @@ xfs_attr3_node_inactive(
322 * Recurse (gasp!) through the attribute nodes until we find leaves. 322 * Recurse (gasp!) through the attribute nodes until we find leaves.
323 * We're doing a depth-first traversal in order to invalidate everything. 323 * We're doing a depth-first traversal in order to invalidate everything.
324 */ 324 */
325int 325static int
326xfs_attr3_root_inactive( 326xfs_attr3_root_inactive(
327 struct xfs_trans **trans, 327 struct xfs_trans **trans,
328 struct xfs_inode *dp) 328 struct xfs_inode *dp)
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index d25f26b22ac9..25e76cd6c053 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -65,7 +65,7 @@ xfs_attr_shortform_compare(const void *a, const void *b)
65 * we have to calculate each entries' hashvalue and sort them before 65 * we have to calculate each entries' hashvalue and sort them before
66 * we can begin returning them to the user. 66 * we can begin returning them to the user.
67 */ 67 */
68int 68static int
69xfs_attr_shortform_list(xfs_attr_list_context_t *context) 69xfs_attr_shortform_list(xfs_attr_list_context_t *context)
70{ 70{
71 attrlist_cursor_kern_t *cursor; 71 attrlist_cursor_kern_t *cursor;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 586bb64e674b..cd4a850564f2 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -79,6 +79,23 @@ xfs_zero_extent(
79 GFP_NOFS, true); 79 GFP_NOFS, true);
80} 80}
81 81
82/* Sort bmap items by AG. */
83static int
84xfs_bmap_free_list_cmp(
85 void *priv,
86 struct list_head *a,
87 struct list_head *b)
88{
89 struct xfs_mount *mp = priv;
90 struct xfs_bmap_free_item *ra;
91 struct xfs_bmap_free_item *rb;
92
93 ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
94 rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
95 return XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
96 XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
97}
98
82/* 99/*
83 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi 100 * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
84 * caller. Frees all the extents that need freeing, which must be done 101 * caller. Frees all the extents that need freeing, which must be done
@@ -99,14 +116,15 @@ xfs_bmap_finish(
99 int error; /* error return value */ 116 int error; /* error return value */
100 int committed;/* xact committed or not */ 117 int committed;/* xact committed or not */
101 struct xfs_bmap_free_item *free; /* free extent item */ 118 struct xfs_bmap_free_item *free; /* free extent item */
102 struct xfs_bmap_free_item *next; /* next item on free list */
103 119
104 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); 120 ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
105 if (flist->xbf_count == 0) 121 if (flist->xbf_count == 0)
106 return 0; 122 return 0;
107 123
124 list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
125
108 efi = xfs_trans_get_efi(*tp, flist->xbf_count); 126 efi = xfs_trans_get_efi(*tp, flist->xbf_count);
109 for (free = flist->xbf_first; free; free = free->xbfi_next) 127 list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
110 xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, 128 xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
111 free->xbfi_blockcount); 129 free->xbfi_blockcount);
112 130
@@ -125,9 +143,7 @@ xfs_bmap_finish(
125 if (committed) { 143 if (committed) {
126 xfs_efi_release(efi); 144 xfs_efi_release(efi);
127 xfs_force_shutdown((*tp)->t_mountp, 145 xfs_force_shutdown((*tp)->t_mountp,
128 (error == -EFSCORRUPTED) ? 146 SHUTDOWN_META_IO_ERROR);
129 SHUTDOWN_CORRUPT_INCORE :
130 SHUTDOWN_META_IO_ERROR);
131 } 147 }
132 return error; 148 return error;
133 } 149 }
@@ -138,15 +154,15 @@ xfs_bmap_finish(
138 * on error. 154 * on error.
139 */ 155 */
140 efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); 156 efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
141 for (free = flist->xbf_first; free != NULL; free = next) { 157 while (!list_empty(&flist->xbf_flist)) {
142 next = free->xbfi_next; 158 free = list_first_entry(&flist->xbf_flist,
143 159 struct xfs_bmap_free_item, xbfi_list);
144 error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock, 160 error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
145 free->xbfi_blockcount); 161 free->xbfi_blockcount);
146 if (error) 162 if (error)
147 return error; 163 return error;
148 164
149 xfs_bmap_del_free(flist, NULL, free); 165 xfs_bmap_del_free(flist, free);
150 } 166 }
151 167
152 return 0; 168 return 0;
@@ -409,7 +425,7 @@ xfs_bmap_count_tree(
409/* 425/*
410 * Count fsblocks of the given fork. 426 * Count fsblocks of the given fork.
411 */ 427 */
412int /* error */ 428static int /* error */
413xfs_bmap_count_blocks( 429xfs_bmap_count_blocks(
414 xfs_trans_t *tp, /* transaction pointer */ 430 xfs_trans_t *tp, /* transaction pointer */
415 xfs_inode_t *ip, /* incore inode */ 431 xfs_inode_t *ip, /* incore inode */
@@ -799,7 +815,7 @@ xfs_bmap_punch_delalloc_range(
799 if (error) 815 if (error)
800 break; 816 break;
801 817
802 ASSERT(!flist.xbf_count && !flist.xbf_first); 818 ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
803next_block: 819next_block:
804 start_fsb++; 820 start_fsb++;
805 remaining--; 821 remaining--;
@@ -1089,99 +1105,120 @@ error1: /* Just cancel transaction */
1089 return error; 1105 return error;
1090} 1106}
1091 1107
1092/* 1108static int
1093 * Zero file bytes between startoff and endoff inclusive. 1109xfs_unmap_extent(
1094 * The iolock is held exclusive and no blocks are buffered. 1110 struct xfs_inode *ip,
1095 * 1111 xfs_fileoff_t startoffset_fsb,
1096 * This function is used by xfs_free_file_space() to zero 1112 xfs_filblks_t len_fsb,
1097 * partial blocks when the range to free is not block aligned. 1113 int *done)
1098 * When unreserving space with boundaries that are not block
1099 * aligned we round up the start and round down the end
1100 * boundaries and then use this function to zero the parts of
1101 * the blocks that got dropped during the rounding.
1102 */
1103STATIC int
1104xfs_zero_remaining_bytes(
1105 xfs_inode_t *ip,
1106 xfs_off_t startoff,
1107 xfs_off_t endoff)
1108{ 1114{
1109 xfs_bmbt_irec_t imap; 1115 struct xfs_mount *mp = ip->i_mount;
1110 xfs_fileoff_t offset_fsb; 1116 struct xfs_trans *tp;
1111 xfs_off_t lastoffset; 1117 struct xfs_bmap_free free_list;
1112 xfs_off_t offset; 1118 xfs_fsblock_t firstfsb;
1113 xfs_buf_t *bp; 1119 uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1114 xfs_mount_t *mp = ip->i_mount; 1120 int error;
1115 int nimap;
1116 int error = 0;
1117 1121
1118 /* 1122 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1119 * Avoid doing I/O beyond eof - it's not necessary 1123 if (error) {
1120 * since nothing can read beyond eof. The space will 1124 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1121 * be zeroed when the file is extended anyway. 1125 return error;
1122 */ 1126 }
1123 if (startoff >= XFS_ISIZE(ip))
1124 return 0;
1125 1127
1126 if (endoff > XFS_ISIZE(ip)) 1128 xfs_ilock(ip, XFS_ILOCK_EXCL);
1127 endoff = XFS_ISIZE(ip); 1129 error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
1130 ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
1131 if (error)
1132 goto out_trans_cancel;
1128 1133
1129 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 1134 xfs_trans_ijoin(tp, ip, 0);
1130 uint lock_mode;
1131 1135
1132 offset_fsb = XFS_B_TO_FSBT(mp, offset); 1136 xfs_bmap_init(&free_list, &firstfsb);
1133 nimap = 1; 1137 error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
1138 &free_list, done);
1139 if (error)
1140 goto out_bmap_cancel;
1134 1141
1135 lock_mode = xfs_ilock_data_map_shared(ip); 1142 error = xfs_bmap_finish(&tp, &free_list, NULL);
1136 error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0); 1143 if (error)
1137 xfs_iunlock(ip, lock_mode); 1144 goto out_bmap_cancel;
1138 1145
1139 if (error || nimap < 1) 1146 error = xfs_trans_commit(tp);
1140 break; 1147out_unlock:
1141 ASSERT(imap.br_blockcount >= 1); 1148 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1142 ASSERT(imap.br_startoff == offset_fsb); 1149 return error;
1143 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1144 1150
1145 if (imap.br_startblock == HOLESTARTBLOCK || 1151out_bmap_cancel:
1146 imap.br_state == XFS_EXT_UNWRITTEN) { 1152 xfs_bmap_cancel(&free_list);
1147 /* skip the entire extent */ 1153out_trans_cancel:
1148 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1154 xfs_trans_cancel(tp);
1149 imap.br_blockcount) - 1; 1155 goto out_unlock;
1150 continue; 1156}
1151 }
1152 1157
1153 lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; 1158static int
1154 if (lastoffset > endoff) 1159xfs_adjust_extent_unmap_boundaries(
1155 lastoffset = endoff; 1160 struct xfs_inode *ip,
1161 xfs_fileoff_t *startoffset_fsb,
1162 xfs_fileoff_t *endoffset_fsb)
1163{
1164 struct xfs_mount *mp = ip->i_mount;
1165 struct xfs_bmbt_irec imap;
1166 int nimap, error;
1167 xfs_extlen_t mod = 0;
1156 1168
1157 /* DAX can just zero the backing device directly */ 1169 nimap = 1;
1158 if (IS_DAX(VFS_I(ip))) { 1170 error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
1159 error = dax_zero_page_range(VFS_I(ip), offset, 1171 if (error)
1160 lastoffset - offset + 1, 1172 return error;
1161 xfs_get_blocks_direct);
1162 if (error)
1163 return error;
1164 continue;
1165 }
1166 1173
1167 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? 1174 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1168 mp->m_rtdev_targp : mp->m_ddev_targp, 1175 xfs_daddr_t block;
1169 xfs_fsb_to_db(ip, imap.br_startblock),
1170 BTOBB(mp->m_sb.sb_blocksize),
1171 0, &bp, NULL);
1172 if (error)
1173 return error;
1174 1176
1175 memset(bp->b_addr + 1177 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1176 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 1178 block = imap.br_startblock;
1177 0, lastoffset - offset + 1); 1179 mod = do_div(block, mp->m_sb.sb_rextsize);
1180 if (mod)
1181 *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1182 }
1178 1183
1179 error = xfs_bwrite(bp); 1184 nimap = 1;
1180 xfs_buf_relse(bp); 1185 error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
1181 if (error) 1186 if (error)
1182 return error; 1187 return error;
1188
1189 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1190 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1191 mod++;
1192 if (mod && mod != mp->m_sb.sb_rextsize)
1193 *endoffset_fsb -= mod;
1183 } 1194 }
1184 return error; 1195
1196 return 0;
1197}
1198
1199static int
1200xfs_flush_unmap_range(
1201 struct xfs_inode *ip,
1202 xfs_off_t offset,
1203 xfs_off_t len)
1204{
1205 struct xfs_mount *mp = ip->i_mount;
1206 struct inode *inode = VFS_I(ip);
1207 xfs_off_t rounding, start, end;
1208 int error;
1209
1210 /* wait for the completion of any pending DIOs */
1211 inode_dio_wait(inode);
1212
1213 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
1214 start = round_down(offset, rounding);
1215 end = round_up(offset + len, rounding) - 1;
1216
1217 error = filemap_write_and_wait_range(inode->i_mapping, start, end);
1218 if (error)
1219 return error;
1220 truncate_pagecache_range(inode, start, end);
1221 return 0;
1185} 1222}
1186 1223
1187int 1224int
@@ -1190,24 +1227,10 @@ xfs_free_file_space(
1190 xfs_off_t offset, 1227 xfs_off_t offset,
1191 xfs_off_t len) 1228 xfs_off_t len)
1192{ 1229{
1193 int done; 1230 struct xfs_mount *mp = ip->i_mount;
1194 xfs_fileoff_t endoffset_fsb;
1195 int error;
1196 xfs_fsblock_t firstfsb;
1197 xfs_bmap_free_t free_list;
1198 xfs_bmbt_irec_t imap;
1199 xfs_off_t ioffset;
1200 xfs_off_t iendoffset;
1201 xfs_extlen_t mod=0;
1202 xfs_mount_t *mp;
1203 int nimap;
1204 uint resblks;
1205 xfs_off_t rounding;
1206 int rt;
1207 xfs_fileoff_t startoffset_fsb; 1231 xfs_fileoff_t startoffset_fsb;
1208 xfs_trans_t *tp; 1232 xfs_fileoff_t endoffset_fsb;
1209 1233 int done = 0, error;
1210 mp = ip->i_mount;
1211 1234
1212 trace_xfs_free_file_space(ip); 1235 trace_xfs_free_file_space(ip);
1213 1236
@@ -1215,135 +1238,45 @@ xfs_free_file_space(
1215 if (error) 1238 if (error)
1216 return error; 1239 return error;
1217 1240
1218 error = 0;
1219 if (len <= 0) /* if nothing being freed */ 1241 if (len <= 0) /* if nothing being freed */
1220 return error; 1242 return 0;
1221 rt = XFS_IS_REALTIME_INODE(ip);
1222 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1223 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1224
1225 /* wait for the completion of any pending DIOs */
1226 inode_dio_wait(VFS_I(ip));
1227 1243
1228 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE); 1244 error = xfs_flush_unmap_range(ip, offset, len);
1229 ioffset = round_down(offset, rounding);
1230 iendoffset = round_up(offset + len, rounding) - 1;
1231 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
1232 iendoffset);
1233 if (error) 1245 if (error)
1234 goto out; 1246 return error;
1235 truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); 1247
1248 startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1249 endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1236 1250
1237 /* 1251 /*
1238 * Need to zero the stuff we're not freeing, on disk. 1252 * Need to zero the stuff we're not freeing, on disk. If it's a RT file
1239 * If it's a realtime file & can't use unwritten extents then we 1253 * and we can't use unwritten extents then we actually need to ensure
 1240 * actually need to zero the extent edges. Otherwise xfs_bunmapi 1254 * to zero the whole extent, otherwise we just need to take care of block
1241 * will take care of it for us. 1255 * boundaries, and xfs_bunmapi will handle the rest.
1242 */ 1256 */
1243 if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) { 1257 if (XFS_IS_REALTIME_INODE(ip) &&
1244 nimap = 1; 1258 !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1245 error = xfs_bmapi_read(ip, startoffset_fsb, 1, 1259 error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
1246 &imap, &nimap, 0); 1260 &endoffset_fsb);
1247 if (error)
1248 goto out;
1249 ASSERT(nimap == 0 || nimap == 1);
1250 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1251 xfs_daddr_t block;
1252
1253 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1254 block = imap.br_startblock;
1255 mod = do_div(block, mp->m_sb.sb_rextsize);
1256 if (mod)
1257 startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1258 }
1259 nimap = 1;
1260 error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1261 &imap, &nimap, 0);
1262 if (error) 1261 if (error)
1263 goto out; 1262 return error;
1264 ASSERT(nimap == 0 || nimap == 1);
1265 if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1266 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1267 mod++;
1268 if (mod && (mod != mp->m_sb.sb_rextsize))
1269 endoffset_fsb -= mod;
1270 }
1271 }
1272 if ((done = (endoffset_fsb <= startoffset_fsb)))
1273 /*
1274 * One contiguous piece to clear
1275 */
1276 error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1277 else {
1278 /*
1279 * Some full blocks, possibly two pieces to clear
1280 */
1281 if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1282 error = xfs_zero_remaining_bytes(ip, offset,
1283 XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1284 if (!error &&
1285 XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1286 error = xfs_zero_remaining_bytes(ip,
1287 XFS_FSB_TO_B(mp, endoffset_fsb),
1288 offset + len - 1);
1289 } 1263 }
1290 1264
1291 /* 1265 if (endoffset_fsb > startoffset_fsb) {
1292 * free file space until done or until there is an error 1266 while (!done) {
1293 */ 1267 error = xfs_unmap_extent(ip, startoffset_fsb,
1294 resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); 1268 endoffset_fsb - startoffset_fsb, &done);
1295 while (!error && !done) { 1269 if (error)
1296 1270 return error;
1297 /*
1298 * allocate and setup the transaction. Allow this
1299 * transaction to dip into the reserve blocks to ensure
1300 * the freeing of the space succeeds at ENOSPC.
1301 */
1302 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
1303 &tp);
1304 if (error) {
1305 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1306 break;
1307 } 1271 }
1308 xfs_ilock(ip, XFS_ILOCK_EXCL);
1309 error = xfs_trans_reserve_quota(tp, mp,
1310 ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1311 resblks, 0, XFS_QMOPT_RES_REGBLKS);
1312 if (error)
1313 goto error1;
1314
1315 xfs_trans_ijoin(tp, ip, 0);
1316
1317 /*
1318 * issue the bunmapi() call to free the blocks
1319 */
1320 xfs_bmap_init(&free_list, &firstfsb);
1321 error = xfs_bunmapi(tp, ip, startoffset_fsb,
1322 endoffset_fsb - startoffset_fsb,
1323 0, 2, &firstfsb, &free_list, &done);
1324 if (error)
1325 goto error0;
1326
1327 /*
1328 * complete the transaction
1329 */
1330 error = xfs_bmap_finish(&tp, &free_list, NULL);
1331 if (error)
1332 goto error0;
1333
1334 error = xfs_trans_commit(tp);
1335 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1336 } 1272 }
1337 1273
1338 out: 1274 /*
 1339 return error; 1275 * Now that we've unmapped all full blocks we'll have to zero out any
1340 1276 * partial block at the beginning and/or end. xfs_zero_range is
1341 error0: 1277 * smart enough to skip any holes, including those we just created.
1342 xfs_bmap_cancel(&free_list); 1278 */
1343 error1: 1279 return xfs_zero_range(ip, offset, len, NULL);
1344 xfs_trans_cancel(tp);
1345 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1346 goto out;
1347} 1280}
1348 1281
1349/* 1282/*
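
The rewritten xfs_free_file_space() above now only unmaps whole blocks (the start offset rounded up with XFS_B_TO_FSB, the end truncated with XFS_B_TO_FSBT) and leaves the partial edges to xfs_zero_range(). A worked example of that split, assuming a 4096-byte block size (not part of the patch):

	#include <stdio.h>

	#define BLKSZ 4096ULL

	int main(void)
	{
		unsigned long long offset = 6000, len = 10000;	/* bytes 6000..15999 */
		unsigned long long start_fsb = (offset + BLKSZ - 1) / BLKSZ;	/* 2 */
		unsigned long long end_fsb   = (offset + len) / BLKSZ;		/* 3 */

		printf("unmap whole blocks [%llu, %llu)\n", start_fsb, end_fsb);
		printf("zero partial edges [%llu, %llu) and [%llu, %llu)\n",
		       offset, start_fsb * BLKSZ, end_fsb * BLKSZ, offset + len);
		return 0;
	}

Here blocks 2..2 (bytes 8192..12287) are unmapped, while bytes 6000..8191 and 12288..15999 are zeroed in place.
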
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index af97d9a1dfb4..f20071432ca6 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -31,8 +31,6 @@ struct xfs_bmalloca;
31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); 31int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff, 32int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
33 int whichfork, int *eof); 33 int whichfork, int *eof);
34int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
35 int whichfork, int *count);
36int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, 34int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
37 xfs_fileoff_t start_fsb, xfs_fileoff_t length); 35 xfs_fileoff_t start_fsb, xfs_fileoff_t length);
38 36
@@ -43,7 +41,6 @@ int xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
43 41
44/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */ 42/* functions in xfs_bmap.c that are only needed by xfs_bmap_util.c */
45void xfs_bmap_del_free(struct xfs_bmap_free *flist, 43void xfs_bmap_del_free(struct xfs_bmap_free *flist,
46 struct xfs_bmap_free_item *prev,
47 struct xfs_bmap_free_item *free); 44 struct xfs_bmap_free_item *free);
48int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp, 45int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
49 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz, 46 struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index a87a0d5477bd..47a318ce82e0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,6 +80,47 @@ xfs_buf_vmap_len(
80} 80}
81 81
82/* 82/*
83 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
84 * this buffer. The count is incremented once per buffer (per hold cycle)
85 * because the corresponding decrement is deferred to buffer release. Buffers
86 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 87 * tracking adds unnecessary overhead. This is used for synchronization purposes
88 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
89 * in-flight buffers.
90 *
91 * Buffers that are never released (e.g., superblock, iclog buffers) must set
92 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
93 * never reaches zero and unmount hangs indefinitely.
94 */
95static inline void
96xfs_buf_ioacct_inc(
97 struct xfs_buf *bp)
98{
99 if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
100 return;
101
102 ASSERT(bp->b_flags & XBF_ASYNC);
103 bp->b_flags |= _XBF_IN_FLIGHT;
104 percpu_counter_inc(&bp->b_target->bt_io_count);
105}
106
107/*
108 * Clear the in-flight state on a buffer about to be released to the LRU or
109 * freed and unaccount from the buftarg.
110 */
111static inline void
112xfs_buf_ioacct_dec(
113 struct xfs_buf *bp)
114{
115 if (!(bp->b_flags & _XBF_IN_FLIGHT))
116 return;
117
118 ASSERT(bp->b_flags & XBF_ASYNC);
119 bp->b_flags &= ~_XBF_IN_FLIGHT;
120 percpu_counter_dec(&bp->b_target->bt_io_count);
121}
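
The two helpers above implement a "count once per hold cycle" scheme: a flag bit on the buffer guards the shared buftarg counter, so repeated I/O submissions of the same held buffer do not inflate it. Reduced to its essentials (not part of the patch; the percpu counter is stood in by a plain integer):

	#define IN_FLIGHT	0x1
	#define NO_IOACCT	0x2

	struct buf {
		unsigned int flags;
	};

	static long io_count;	/* stand-in for the buftarg percpu counter */

	void ioacct_inc(struct buf *bp)
	{
		if (bp->flags & (NO_IOACCT | IN_FLIGHT))
			return;		/* never-released buf, or already counted */
		bp->flags |= IN_FLIGHT;
		io_count++;
	}

	void ioacct_dec(struct buf *bp)
	{
		if (!(bp->flags & IN_FLIGHT))
			return;
		bp->flags &= ~IN_FLIGHT;
		io_count--;
	}
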
122
123/*
83 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 124 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
84 * b_lru_ref count so that the buffer is freed immediately when the buffer 125 * b_lru_ref count so that the buffer is freed immediately when the buffer
85 * reference count falls to zero. If the buffer is already on the LRU, we need 126 * reference count falls to zero. If the buffer is already on the LRU, we need
@@ -102,6 +143,14 @@ xfs_buf_stale(
102 */ 143 */
103 bp->b_flags &= ~_XBF_DELWRI_Q; 144 bp->b_flags &= ~_XBF_DELWRI_Q;
104 145
146 /*
147 * Once the buffer is marked stale and unlocked, a subsequent lookup
148 * could reset b_flags. There is no guarantee that the buffer is
149 * unaccounted (released to LRU) before that occurs. Drop in-flight
150 * status now to preserve accounting consistency.
151 */
152 xfs_buf_ioacct_dec(bp);
153
105 spin_lock(&bp->b_lock); 154 spin_lock(&bp->b_lock);
106 atomic_set(&bp->b_lru_ref, 0); 155 atomic_set(&bp->b_lru_ref, 0);
107 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 156 if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
@@ -815,7 +864,8 @@ xfs_buf_get_uncached(
815 struct xfs_buf *bp; 864 struct xfs_buf *bp;
816 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 865 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
817 866
818 bp = _xfs_buf_alloc(target, &map, 1, 0); 867 /* flags might contain irrelevant bits, pass only what we care about */
868 bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
819 if (unlikely(bp == NULL)) 869 if (unlikely(bp == NULL))
820 goto fail; 870 goto fail;
821 871
@@ -866,63 +916,85 @@ xfs_buf_hold(
866} 916}
867 917
868/* 918/*
869 * Releases a hold on the specified buffer. If the 919 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
870 * the hold count is 1, calls xfs_buf_free. 920 * placed on LRU or freed (depending on b_lru_ref).
871 */ 921 */
872void 922void
873xfs_buf_rele( 923xfs_buf_rele(
874 xfs_buf_t *bp) 924 xfs_buf_t *bp)
875{ 925{
876 struct xfs_perag *pag = bp->b_pag; 926 struct xfs_perag *pag = bp->b_pag;
927 bool release;
928 bool freebuf = false;
877 929
878 trace_xfs_buf_rele(bp, _RET_IP_); 930 trace_xfs_buf_rele(bp, _RET_IP_);
879 931
880 if (!pag) { 932 if (!pag) {
881 ASSERT(list_empty(&bp->b_lru)); 933 ASSERT(list_empty(&bp->b_lru));
882 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); 934 ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
883 if (atomic_dec_and_test(&bp->b_hold)) 935 if (atomic_dec_and_test(&bp->b_hold)) {
936 xfs_buf_ioacct_dec(bp);
884 xfs_buf_free(bp); 937 xfs_buf_free(bp);
938 }
885 return; 939 return;
886 } 940 }
887 941
888 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); 942 ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
889 943
890 ASSERT(atomic_read(&bp->b_hold) > 0); 944 ASSERT(atomic_read(&bp->b_hold) > 0);
891 if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
892 spin_lock(&bp->b_lock);
893 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
894 /*
895 * If the buffer is added to the LRU take a new
896 * reference to the buffer for the LRU and clear the
897 * (now stale) dispose list state flag
898 */
899 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
900 bp->b_state &= ~XFS_BSTATE_DISPOSE;
901 atomic_inc(&bp->b_hold);
902 }
903 spin_unlock(&bp->b_lock);
904 spin_unlock(&pag->pag_buf_lock);
905 } else {
906 /*
907 * most of the time buffers will already be removed from
908 * the LRU, so optimise that case by checking for the
909 * XFS_BSTATE_DISPOSE flag indicating the last list the
910 * buffer was on was the disposal list
911 */
912 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
913 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
914 } else {
915 ASSERT(list_empty(&bp->b_lru));
916 }
917 spin_unlock(&bp->b_lock);
918 945
919 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 946 release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
920 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 947 spin_lock(&bp->b_lock);
921 spin_unlock(&pag->pag_buf_lock); 948 if (!release) {
922 xfs_perag_put(pag); 949 /*
923 xfs_buf_free(bp); 950 * Drop the in-flight state if the buffer is already on the LRU
951 * and it holds the only reference. This is racy because we
952 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
953 * ensures the decrement occurs only once per-buf.
954 */
955 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
956 xfs_buf_ioacct_dec(bp);
957 goto out_unlock;
958 }
959
960 /* the last reference has been dropped ... */
961 xfs_buf_ioacct_dec(bp);
962 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
963 /*
964 * If the buffer is added to the LRU take a new reference to the
965 * buffer for the LRU and clear the (now stale) dispose list
966 * state flag
967 */
968 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
969 bp->b_state &= ~XFS_BSTATE_DISPOSE;
970 atomic_inc(&bp->b_hold);
971 }
972 spin_unlock(&pag->pag_buf_lock);
973 } else {
974 /*
975 * most of the time buffers will already be removed from the
976 * LRU, so optimise that case by checking for the
977 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
978 * was on was the disposal list
979 */
980 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
981 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
982 } else {
983 ASSERT(list_empty(&bp->b_lru));
924 } 984 }
985
986 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
987 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
988 spin_unlock(&pag->pag_buf_lock);
989 xfs_perag_put(pag);
990 freebuf = true;
925 } 991 }
992
993out_unlock:
994 spin_unlock(&bp->b_lock);
995
996 if (freebuf)
997 xfs_buf_free(bp);
926} 998}
927 999
928 1000
@@ -944,10 +1016,12 @@ xfs_buf_trylock(
944 int locked; 1016 int locked;
945 1017
946 locked = down_trylock(&bp->b_sema) == 0; 1018 locked = down_trylock(&bp->b_sema) == 0;
947 if (locked) 1019 if (locked) {
948 XB_SET_OWNER(bp); 1020 XB_SET_OWNER(bp);
949 1021 trace_xfs_buf_trylock(bp, _RET_IP_);
950 trace_xfs_buf_trylock(bp, _RET_IP_); 1022 } else {
1023 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1024 }
951 return locked; 1025 return locked;
952} 1026}
953 1027
@@ -1339,6 +1413,7 @@ xfs_buf_submit(
1339 * xfs_buf_ioend too early. 1413 * xfs_buf_ioend too early.
1340 */ 1414 */
1341 atomic_set(&bp->b_io_remaining, 1); 1415 atomic_set(&bp->b_io_remaining, 1);
1416 xfs_buf_ioacct_inc(bp);
1342 _xfs_buf_ioapply(bp); 1417 _xfs_buf_ioapply(bp);
1343 1418
1344 /* 1419 /*
@@ -1524,13 +1599,19 @@ xfs_wait_buftarg(
1524 int loop = 0; 1599 int loop = 0;
1525 1600
1526 /* 1601 /*
1527 * We need to flush the buffer workqueue to ensure that all IO 1602 * First wait on the buftarg I/O count for all in-flight buffers to be
1528 * completion processing is 100% done. Just waiting on buffer locks is 1603 * released. This is critical as new buffers do not make the LRU until
1529 * not sufficient for async IO as the reference count held over IO is 1604 * they are released.
1530 * not released until after the buffer lock is dropped. Hence we need to 1605 *
1531 * ensure here that all reference counts have been dropped before we 1606 * Next, flush the buffer workqueue to ensure all completion processing
1532 * start walking the LRU list. 1607 * has finished. Just waiting on buffer locks is not sufficient for
1608 * async IO as the reference count held over IO is not released until
1609 * after the buffer lock is dropped. Hence we need to ensure here that
1610 * all reference counts have been dropped before we start walking the
1611 * LRU list.
1533 */ 1612 */
1613 while (percpu_counter_sum(&btp->bt_io_count))
1614 delay(100);
1534 drain_workqueue(btp->bt_mount->m_buf_workqueue); 1615 drain_workqueue(btp->bt_mount->m_buf_workqueue);
1535 1616
1536 /* loop until there is nothing left on the lru list. */ 1617 /* loop until there is nothing left on the lru list. */
@@ -1627,6 +1708,8 @@ xfs_free_buftarg(
1627 struct xfs_buftarg *btp) 1708 struct xfs_buftarg *btp)
1628{ 1709{
1629 unregister_shrinker(&btp->bt_shrinker); 1710 unregister_shrinker(&btp->bt_shrinker);
1711 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
1712 percpu_counter_destroy(&btp->bt_io_count);
1630 list_lru_destroy(&btp->bt_lru); 1713 list_lru_destroy(&btp->bt_lru);
1631 1714
1632 if (mp->m_flags & XFS_MOUNT_BARRIER) 1715 if (mp->m_flags & XFS_MOUNT_BARRIER)
@@ -1691,6 +1774,9 @@ xfs_alloc_buftarg(
1691 if (list_lru_init(&btp->bt_lru)) 1774 if (list_lru_init(&btp->bt_lru))
1692 goto error; 1775 goto error;
1693 1776
1777 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
1778 goto error;
1779
1694 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; 1780 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
1695 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; 1781 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
1696 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1782 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
@@ -1774,18 +1860,33 @@ xfs_buf_cmp(
1774 return 0; 1860 return 0;
1775} 1861}
1776 1862
1863/*
1864 * submit buffers for write.
1865 *
1866 * When we have a large buffer list, we do not want to hold all the buffers
1867 * locked while we block on the request queue waiting for IO dispatch. To avoid
1868 * this problem, we lock and submit buffers in groups of 50, thereby minimising
1869 * the lock hold times for lists which may contain thousands of objects.
1870 *
1871 * To do this, we sort the buffer list before we walk the list to lock and
1872 * submit buffers, and we plug and unplug around each group of buffers we
1873 * submit.
1874 */
1777static int 1875static int
1778__xfs_buf_delwri_submit( 1876xfs_buf_delwri_submit_buffers(
1779 struct list_head *buffer_list, 1877 struct list_head *buffer_list,
1780 struct list_head *io_list, 1878 struct list_head *wait_list)
1781 bool wait)
1782{ 1879{
1783 struct blk_plug plug;
1784 struct xfs_buf *bp, *n; 1880 struct xfs_buf *bp, *n;
1881 LIST_HEAD (submit_list);
1785 int pinned = 0; 1882 int pinned = 0;
1883 struct blk_plug plug;
1786 1884
1885 list_sort(NULL, buffer_list, xfs_buf_cmp);
1886
1887 blk_start_plug(&plug);
1787 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 1888 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1788 if (!wait) { 1889 if (!wait_list) {
1789 if (xfs_buf_ispinned(bp)) { 1890 if (xfs_buf_ispinned(bp)) {
1790 pinned++; 1891 pinned++;
1791 continue; 1892 continue;
@@ -1808,25 +1909,21 @@ __xfs_buf_delwri_submit(
1808 continue; 1909 continue;
1809 } 1910 }
1810 1911
1811 list_move_tail(&bp->b_list, io_list);
1812 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1912 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1813 }
1814
1815 list_sort(NULL, io_list, xfs_buf_cmp);
1816
1817 blk_start_plug(&plug);
1818 list_for_each_entry_safe(bp, n, io_list, b_list) {
1819 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
1820 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1821 1913
1822 /* 1914 /*
1823 * we do all Io submission async. This means if we need to wait 1915 * We do all IO submission async. This means if we need
1824 * for IO completion we need to take an extra reference so the 1916 * to wait for IO completion we need to take an extra
1825 * buffer is still valid on the other side. 1917 * reference so the buffer is still valid on the other
1918 * side. We need to move the buffer onto the io_list
1919 * at this point so the caller can still access it.
1826 */ 1920 */
1827 if (wait) 1921 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
1922 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1923 if (wait_list) {
1828 xfs_buf_hold(bp); 1924 xfs_buf_hold(bp);
1829 else 1925 list_move_tail(&bp->b_list, wait_list);
1926 } else
1830 list_del_init(&bp->b_list); 1927 list_del_init(&bp->b_list);
1831 1928
1832 xfs_buf_submit(bp); 1929 xfs_buf_submit(bp);
@@ -1849,8 +1946,7 @@ int
1849xfs_buf_delwri_submit_nowait( 1946xfs_buf_delwri_submit_nowait(
1850 struct list_head *buffer_list) 1947 struct list_head *buffer_list)
1851{ 1948{
1852 LIST_HEAD (io_list); 1949 return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
1853 return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1854} 1950}
1855 1951
1856/* 1952/*
@@ -1865,15 +1961,15 @@ int
1865xfs_buf_delwri_submit( 1961xfs_buf_delwri_submit(
1866 struct list_head *buffer_list) 1962 struct list_head *buffer_list)
1867{ 1963{
1868 LIST_HEAD (io_list); 1964 LIST_HEAD (wait_list);
1869 int error = 0, error2; 1965 int error = 0, error2;
1870 struct xfs_buf *bp; 1966 struct xfs_buf *bp;
1871 1967
1872 __xfs_buf_delwri_submit(buffer_list, &io_list, true); 1968 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
1873 1969
1874 /* Wait for IO to complete. */ 1970 /* Wait for IO to complete. */
1875 while (!list_empty(&io_list)) { 1971 while (!list_empty(&wait_list)) {
1876 bp = list_first_entry(&io_list, struct xfs_buf, b_list); 1972 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1877 1973
1878 list_del_init(&bp->b_list); 1974 list_del_init(&bp->b_list);
1879 1975
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8bfb974f0772..1c2e52b2d926 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -43,6 +43,7 @@ typedef enum {
43#define XBF_READ (1 << 0) /* buffer intended for reading from device */ 43#define XBF_READ (1 << 0) /* buffer intended for reading from device */
44#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */ 44#define XBF_WRITE (1 << 1) /* buffer intended for writing to device */
45#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */ 45#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
46#define XBF_NO_IOACCT (1 << 3) /* bypass I/O accounting (non-LRU bufs) */
46#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ 47#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
47#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 48#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
48#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */ 49#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
@@ -62,6 +63,7 @@ typedef enum {
62#define _XBF_KMEM (1 << 21)/* backed by heap memory */ 63#define _XBF_KMEM (1 << 21)/* backed by heap memory */
63#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */ 64#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
64#define _XBF_COMPOUND (1 << 23)/* compound buffer */ 65#define _XBF_COMPOUND (1 << 23)/* compound buffer */
66#define _XBF_IN_FLIGHT (1 << 25) /* I/O in flight, for accounting purposes */
65 67
66typedef unsigned int xfs_buf_flags_t; 68typedef unsigned int xfs_buf_flags_t;
67 69
@@ -81,7 +83,8 @@ typedef unsigned int xfs_buf_flags_t;
81 { _XBF_PAGES, "PAGES" }, \ 83 { _XBF_PAGES, "PAGES" }, \
82 { _XBF_KMEM, "KMEM" }, \ 84 { _XBF_KMEM, "KMEM" }, \
83 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 85 { _XBF_DELWRI_Q, "DELWRI_Q" }, \
84 { _XBF_COMPOUND, "COMPOUND" } 86 { _XBF_COMPOUND, "COMPOUND" }, \
87 { _XBF_IN_FLIGHT, "IN_FLIGHT" }
85 88
86 89
87/* 90/*
@@ -115,6 +118,8 @@ typedef struct xfs_buftarg {
115 /* LRU control structures */ 118 /* LRU control structures */
116 struct shrinker bt_shrinker; 119 struct shrinker bt_shrinker;
117 struct list_lru bt_lru; 120 struct list_lru bt_lru;
121
122 struct percpu_counter bt_io_count;
118} xfs_buftarg_t; 123} xfs_buftarg_t;
119 124
120struct xfs_buf; 125struct xfs_buf;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 34257992934c..e455f9098d49 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -359,7 +359,7 @@ xfs_buf_item_format(
359 for (i = 0; i < bip->bli_format_count; i++) { 359 for (i = 0; i < bip->bli_format_count; i++) {
360 xfs_buf_item_format_segment(bip, lv, &vecp, offset, 360 xfs_buf_item_format_segment(bip, lv, &vecp, offset,
361 &bip->bli_formats[i]); 361 &bip->bli_formats[i]);
362 offset += bp->b_maps[i].bm_len; 362 offset += BBTOB(bp->b_maps[i].bm_len);
363 } 363 }
364 364
365 /* 365 /*
@@ -915,20 +915,28 @@ xfs_buf_item_log(
915 for (i = 0; i < bip->bli_format_count; i++) { 915 for (i = 0; i < bip->bli_format_count; i++) {
916 if (start > last) 916 if (start > last)
917 break; 917 break;
918 end = start + BBTOB(bp->b_maps[i].bm_len); 918 end = start + BBTOB(bp->b_maps[i].bm_len) - 1;
919
920 /* skip to the map that includes the first byte to log */
919 if (first > end) { 921 if (first > end) {
920 start += BBTOB(bp->b_maps[i].bm_len); 922 start += BBTOB(bp->b_maps[i].bm_len);
921 continue; 923 continue;
922 } 924 }
925
926 /*
927 * Trim the range to this segment and mark it in the bitmap.
928 * Note that we must convert buffer offsets to segment relative
929 * offsets (e.g., the first byte of each segment is byte 0 of
930 * that segment).
931 */
923 if (first < start) 932 if (first < start)
924 first = start; 933 first = start;
925 if (end > last) 934 if (end > last)
926 end = last; 935 end = last;
927 936 xfs_buf_item_log_segment(first - start, end - start,
928 xfs_buf_item_log_segment(first, end,
929 &bip->bli_formats[i].blf_data_map[0]); 937 &bip->bli_formats[i].blf_data_map[0]);
930 938
931 start += bp->b_maps[i].bm_len; 939 start += BBTOB(bp->b_maps[i].bm_len);
932 } 940 }
933} 941}
934 942
@@ -949,6 +957,7 @@ xfs_buf_item_free(
949 xfs_buf_log_item_t *bip) 957 xfs_buf_log_item_t *bip)
950{ 958{
951 xfs_buf_item_free_format(bip); 959 xfs_buf_item_free_format(bip);
960 kmem_free(bip->bli_item.li_lv_shadow);
952 kmem_zone_free(xfs_buf_item_zone, bip); 961 kmem_zone_free(xfs_buf_item_zone, bip);
953} 962}
954 963
@@ -1073,6 +1082,8 @@ xfs_buf_iodone_callback_error(
1073 trace_xfs_buf_item_iodone_async(bp, _RET_IP_); 1082 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1074 ASSERT(bp->b_iodone != NULL); 1083 ASSERT(bp->b_iodone != NULL);
1075 1084
1085 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1086
1076 /* 1087 /*
1077 * If the write was asynchronous then no one will be looking for the 1088 * If the write was asynchronous then no one will be looking for the
1078 * error. If this is the first failure of this type, clear the error 1089 * error. If this is the first failure of this type, clear the error
@@ -1080,13 +1091,12 @@ xfs_buf_iodone_callback_error(
1080 * async write failure at least once, but we also need to set the buffer 1091 * async write failure at least once, but we also need to set the buffer
1081 * up to behave correctly now for repeated failures. 1092 * up to behave correctly now for repeated failures.
1082 */ 1093 */
1083 if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL)) || 1094 if (!(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL)) ||
1084 bp->b_last_error != bp->b_error) { 1095 bp->b_last_error != bp->b_error) {
1085 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | 1096 bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL);
1086 XBF_DONE | XBF_WRITE_FAIL);
1087 bp->b_last_error = bp->b_error; 1097 bp->b_last_error = bp->b_error;
1088 bp->b_retries = 0; 1098 if (cfg->retry_timeout && !bp->b_first_retry_time)
1089 bp->b_first_retry_time = jiffies; 1099 bp->b_first_retry_time = jiffies;
1090 1100
1091 xfs_buf_ioerror(bp, 0); 1101 xfs_buf_ioerror(bp, 0);
1092 xfs_buf_submit(bp); 1102 xfs_buf_submit(bp);
@@ -1097,7 +1107,6 @@ xfs_buf_iodone_callback_error(
1097 * Repeated failure on an async write. Take action according to the 1107 * Repeated failure on an async write. Take action according to the
1098 * error configuration we have been set up to use. 1108 * error configuration we have been set up to use.
1099 */ 1109 */
1100 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1101 1110
1102 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1111 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1103 ++bp->b_retries > cfg->max_retries) 1112 ++bp->b_retries > cfg->max_retries)
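
The xfs_buf_item_log hunk above fixes two related issues: map lengths are kept in 512-byte basic blocks and must be converted to bytes (BBTOB) before being compared with the byte range to log, and the range handed to each segment must be made relative to that segment's start. The standalone sketch below walks two 4 KiB segments with made-up numbers to show the clamping and offset conversion; it is an illustration of the arithmetic, not the kernel code.

/*
 * Worked example of the segment walk: a byte range of 5000..6000 falls
 * entirely in the second 4 KiB segment and is logged there as the
 * segment-relative range 904..1904.
 */
#include <stdio.h>
#include <stdint.h>

#define BBTOB(bb) ((uint64_t)(bb) << 9)         /* basic blocks (512 bytes) to bytes */

static void log_segment(int seg, uint64_t first, uint64_t last)
{
        printf("segment %d: mark bytes %llu..%llu\n", seg,
               (unsigned long long)first, (unsigned long long)last);
}

int main(void)
{
        uint64_t seg_len_bb[] = { 8, 8 };       /* two segments of 8 basic blocks each */
        uint64_t first = 5000, last = 6000;     /* byte range to log */
        uint64_t start = 0;

        for (int i = 0; i < 2; i++) {
                uint64_t end = start + BBTOB(seg_len_bb[i]) - 1;  /* inclusive end */

                if (first > end) {              /* skip segments before the range */
                        start += BBTOB(seg_len_bb[i]);
                        continue;
                }
                uint64_t f = first < start ? start : first;
                uint64_t l = last > end ? end : last;

                log_segment(i, f - start, l - start);   /* segment-relative offsets */
                start += BBTOB(seg_len_bb[i]);
        }
        return 0;
}
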
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index e0646659ce16..ccb0811963b2 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -74,6 +74,7 @@ xfs_qm_dqdestroy(
74{ 74{
75 ASSERT(list_empty(&dqp->q_lru)); 75 ASSERT(list_empty(&dqp->q_lru));
76 76
77 kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
77 mutex_destroy(&dqp->q_qlock); 78 mutex_destroy(&dqp->q_qlock);
78 79
79 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); 80 XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 814cff94e78f..2c7a1629e064 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -370,6 +370,8 @@ xfs_qm_qoffend_logitem_committed(
370 spin_lock(&ailp->xa_lock); 370 spin_lock(&ailp->xa_lock);
371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR); 371 xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
372 372
373 kmem_free(qfs->qql_item.li_lv_shadow);
374 kmem_free(lip->li_lv_shadow);
373 kmem_free(qfs); 375 kmem_free(qfs);
374 kmem_free(qfe); 376 kmem_free(qfe);
375 return (xfs_lsn_t)-1; 377 return (xfs_lsn_t)-1;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 88693a98fac5..ed7ee4e8af73 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -55,12 +55,15 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
55} 55}
56 56
57int 57int
58xfs_errortag_add(int error_tag, xfs_mount_t *mp) 58xfs_errortag_add(unsigned int error_tag, xfs_mount_t *mp)
59{ 59{
60 int i; 60 int i;
61 int len; 61 int len;
62 int64_t fsid; 62 int64_t fsid;
63 63
64 if (error_tag >= XFS_ERRTAG_MAX)
65 return -EINVAL;
66
64 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)); 67 memcpy(&fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t));
65 68
66 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 69 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 4ed3042a0f16..2e4f67f68856 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -128,7 +128,7 @@ extern int xfs_error_test(int, int *, char *, int, char *, unsigned long);
128 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \ 128 xfs_error_test((tag), (mp)->m_fixedfsid, "expr", __LINE__, __FILE__, \
129 (rf)))) 129 (rf))))
130 130
131extern int xfs_errortag_add(int error_tag, struct xfs_mount *mp); 131extern int xfs_errortag_add(unsigned int error_tag, struct xfs_mount *mp);
132extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud); 132extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
133#else 133#else
134#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr) 134#define XFS_TEST_ERROR(expr, mp, tag, rf) (expr)
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 4aa0153214f9..ab779460ecbf 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,6 +40,7 @@ void
40xfs_efi_item_free( 40xfs_efi_item_free(
41 struct xfs_efi_log_item *efip) 41 struct xfs_efi_log_item *efip)
42{ 42{
43 kmem_free(efip->efi_item.li_lv_shadow);
43 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) 44 if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
44 kmem_free(efip); 45 kmem_free(efip);
45 else 46 else
@@ -300,6 +301,7 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
300STATIC void 301STATIC void
301xfs_efd_item_free(struct xfs_efd_log_item *efdp) 302xfs_efd_item_free(struct xfs_efd_log_item *efdp)
302{ 303{
304 kmem_free(efdp->efd_item.li_lv_shadow);
303 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) 305 if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
304 kmem_free(efdp); 306 kmem_free(efdp);
305 else 307 else
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1b3dc9dd8861..ed95e5bb04e6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
37#include "xfs_log.h" 37#include "xfs_log.h"
38#include "xfs_icache.h" 38#include "xfs_icache.h"
39#include "xfs_pnfs.h" 39#include "xfs_pnfs.h"
40#include "xfs_iomap.h"
40 41
41#include <linux/dcache.h> 42#include <linux/dcache.h>
42#include <linux/falloc.h> 43#include <linux/falloc.h>
@@ -80,61 +81,17 @@ xfs_rw_ilock_demote(
80} 81}
81 82
82/* 83/*
83 * xfs_iozero clears the specified range supplied via the page cache (except in 84 * Clear the specified ranges to zero through either the pagecache or DAX.
84 * the DAX case). Writes through the page cache will allocate blocks over holes, 85 * Holes and unwritten extents will be left as-is as they already are zeroed.
85 * though the callers usually map the holes first and avoid them. If a block is
86 * not completely zeroed, then it will be read from disk before being partially
87 * zeroed.
88 *
89 * In the DAX case, we can just directly write to the underlying pages. This
90 * will not allocate blocks, but will avoid holes and unwritten extents and so
91 * not do unnecessary work.
92 */ 86 */
93int 87int
94xfs_iozero( 88xfs_zero_range(
95 struct xfs_inode *ip, /* inode */ 89 struct xfs_inode *ip,
96 loff_t pos, /* offset in file */ 90 xfs_off_t pos,
97 size_t count) /* size of data to zero */ 91 xfs_off_t count,
92 bool *did_zero)
98{ 93{
99 struct page *page; 94 return iomap_zero_range(VFS_I(ip), pos, count, NULL, &xfs_iomap_ops);
100 struct address_space *mapping;
101 int status = 0;
102
103
104 mapping = VFS_I(ip)->i_mapping;
105 do {
106 unsigned offset, bytes;
107 void *fsdata;
108
109 offset = (pos & (PAGE_SIZE -1)); /* Within page */
110 bytes = PAGE_SIZE - offset;
111 if (bytes > count)
112 bytes = count;
113
114 if (IS_DAX(VFS_I(ip))) {
115 status = dax_zero_page_range(VFS_I(ip), pos, bytes,
116 xfs_get_blocks_direct);
117 if (status)
118 break;
119 } else {
120 status = pagecache_write_begin(NULL, mapping, pos, bytes,
121 AOP_FLAG_UNINTERRUPTIBLE,
122 &page, &fsdata);
123 if (status)
124 break;
125
126 zero_user(page, offset, bytes);
127
128 status = pagecache_write_end(NULL, mapping, pos, bytes,
129 bytes, page, fsdata);
130 WARN_ON(status <= 0); /* can't return less than zero! */
131 status = 0;
132 }
133 pos += bytes;
134 count -= bytes;
135 } while (count);
136
137 return status;
138} 95}
139 96
140int 97int
@@ -282,48 +239,35 @@ xfs_file_fsync(
282} 239}
283 240
284STATIC ssize_t 241STATIC ssize_t
285xfs_file_read_iter( 242xfs_file_dio_aio_read(
286 struct kiocb *iocb, 243 struct kiocb *iocb,
287 struct iov_iter *to) 244 struct iov_iter *to)
288{ 245{
289 struct file *file = iocb->ki_filp; 246 struct address_space *mapping = iocb->ki_filp->f_mapping;
290 struct inode *inode = file->f_mapping->host; 247 struct inode *inode = mapping->host;
291 struct xfs_inode *ip = XFS_I(inode); 248 struct xfs_inode *ip = XFS_I(inode);
292 struct xfs_mount *mp = ip->i_mount; 249 loff_t isize = i_size_read(inode);
293 size_t size = iov_iter_count(to); 250 size_t count = iov_iter_count(to);
251 struct iov_iter data;
252 struct xfs_buftarg *target;
294 ssize_t ret = 0; 253 ssize_t ret = 0;
295 int ioflags = 0;
296 xfs_fsize_t n;
297 loff_t pos = iocb->ki_pos;
298 254
299 XFS_STATS_INC(mp, xs_read_calls); 255 trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
300
301 if (unlikely(iocb->ki_flags & IOCB_DIRECT))
302 ioflags |= XFS_IO_ISDIRECT;
303 if (file->f_mode & FMODE_NOCMTIME)
304 ioflags |= XFS_IO_INVIS;
305
306 if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
307 xfs_buftarg_t *target =
308 XFS_IS_REALTIME_INODE(ip) ?
309 mp->m_rtdev_targp : mp->m_ddev_targp;
310 /* DIO must be aligned to device logical sector size */
311 if ((pos | size) & target->bt_logical_sectormask) {
312 if (pos == i_size_read(inode))
313 return 0;
314 return -EINVAL;
315 }
316 }
317 256
318 n = mp->m_super->s_maxbytes - pos; 257 if (!count)
319 if (n <= 0 || size == 0) 258 return 0; /* skip atime */
320 return 0;
321 259
322 if (n < size) 260 if (XFS_IS_REALTIME_INODE(ip))
323 size = n; 261 target = ip->i_mount->m_rtdev_targp;
262 else
263 target = ip->i_mount->m_ddev_targp;
324 264
325 if (XFS_FORCED_SHUTDOWN(mp)) 265 /* DIO must be aligned to device logical sector size */
326 return -EIO; 266 if ((iocb->ki_pos | count) & target->bt_logical_sectormask) {
267 if (iocb->ki_pos == isize)
268 return 0;
269 return -EINVAL;
270 }
327 271
328 /* 272 /*
329 * Locking is a bit tricky here. If we take an exclusive lock for direct 273 * Locking is a bit tricky here. If we take an exclusive lock for direct
@@ -336,7 +280,7 @@ xfs_file_read_iter(
336 * serialisation. 280 * serialisation.
337 */ 281 */
338 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); 282 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
339 if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) { 283 if (mapping->nrpages) {
340 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); 284 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
341 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); 285 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
342 286
@@ -351,8 +295,8 @@ xfs_file_read_iter(
351 * flush and reduce the chances of repeated iolock cycles going 295 * flush and reduce the chances of repeated iolock cycles going
352 * forward. 296 * forward.
353 */ 297 */
354 if (inode->i_mapping->nrpages) { 298 if (mapping->nrpages) {
355 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); 299 ret = filemap_write_and_wait(mapping);
356 if (ret) { 300 if (ret) {
357 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); 301 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
358 return ret; 302 return ret;
@@ -363,20 +307,95 @@ xfs_file_read_iter(
363 * we fail to invalidate a page, but this should never 307 * we fail to invalidate a page, but this should never
364 * happen on XFS. Warn if it does fail. 308 * happen on XFS. Warn if it does fail.
365 */ 309 */
366 ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping); 310 ret = invalidate_inode_pages2(mapping);
367 WARN_ON_ONCE(ret); 311 WARN_ON_ONCE(ret);
368 ret = 0; 312 ret = 0;
369 } 313 }
370 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); 314 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
371 } 315 }
372 316
373 trace_xfs_file_read(ip, size, pos, ioflags); 317 data = *to;
318 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
319 xfs_get_blocks_direct, NULL, NULL, 0);
320 if (ret > 0) {
321 iocb->ki_pos += ret;
322 iov_iter_advance(to, ret);
323 }
324 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
374 325
326 file_accessed(iocb->ki_filp);
327 return ret;
328}
329
330static noinline ssize_t
331xfs_file_dax_read(
332 struct kiocb *iocb,
333 struct iov_iter *to)
334{
335 struct address_space *mapping = iocb->ki_filp->f_mapping;
336 struct inode *inode = mapping->host;
337 struct xfs_inode *ip = XFS_I(inode);
338 struct iov_iter data = *to;
339 size_t count = iov_iter_count(to);
340 ssize_t ret = 0;
341
342 trace_xfs_file_dax_read(ip, count, iocb->ki_pos);
343
344 if (!count)
345 return 0; /* skip atime */
346
347 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
348 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0);
349 if (ret > 0) {
350 iocb->ki_pos += ret;
351 iov_iter_advance(to, ret);
352 }
353 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
354
355 file_accessed(iocb->ki_filp);
356 return ret;
357}
358
359STATIC ssize_t
360xfs_file_buffered_aio_read(
361 struct kiocb *iocb,
362 struct iov_iter *to)
363{
364 struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp));
365 ssize_t ret;
366
367 trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);
368
369 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
375 ret = generic_file_read_iter(iocb, to); 370 ret = generic_file_read_iter(iocb, to);
371 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
372
373 return ret;
374}
375
376STATIC ssize_t
377xfs_file_read_iter(
378 struct kiocb *iocb,
379 struct iov_iter *to)
380{
381 struct inode *inode = file_inode(iocb->ki_filp);
382 struct xfs_mount *mp = XFS_I(inode)->i_mount;
383 ssize_t ret = 0;
384
385 XFS_STATS_INC(mp, xs_read_calls);
386
387 if (XFS_FORCED_SHUTDOWN(mp))
388 return -EIO;
389
390 if (IS_DAX(inode))
391 ret = xfs_file_dax_read(iocb, to);
392 else if (iocb->ki_flags & IOCB_DIRECT)
393 ret = xfs_file_dio_aio_read(iocb, to);
394 else
395 ret = xfs_file_buffered_aio_read(iocb, to);
396
376 if (ret > 0) 397 if (ret > 0)
377 XFS_STATS_ADD(mp, xs_read_bytes, ret); 398 XFS_STATS_ADD(mp, xs_read_bytes, ret);
378
379 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
380 return ret; 399 return ret;
381} 400}
382 401
@@ -389,18 +408,14 @@ xfs_file_splice_read(
389 unsigned int flags) 408 unsigned int flags)
390{ 409{
391 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host); 410 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
392 int ioflags = 0;
393 ssize_t ret; 411 ssize_t ret;
394 412
395 XFS_STATS_INC(ip->i_mount, xs_read_calls); 413 XFS_STATS_INC(ip->i_mount, xs_read_calls);
396 414
397 if (infilp->f_mode & FMODE_NOCMTIME)
398 ioflags |= XFS_IO_INVIS;
399
400 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 415 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
401 return -EIO; 416 return -EIO;
402 417
403 trace_xfs_file_splice_read(ip, count, *ppos, ioflags); 418 trace_xfs_file_splice_read(ip, count, *ppos);
404 419
405 /* 420 /*
406 * DAX inodes cannot use the page cache for splice, so we have to push 421
@@ -424,49 +439,6 @@ out:
424} 439}
425 440
426/* 441/*
427 * This routine is called to handle zeroing any space in the last block of the
428 * file that is beyond the EOF. We do this since the size is being increased
429 * without writing anything to that block and we don't want to read the
430 * garbage on the disk.
431 */
432STATIC int /* error (positive) */
433xfs_zero_last_block(
434 struct xfs_inode *ip,
435 xfs_fsize_t offset,
436 xfs_fsize_t isize,
437 bool *did_zeroing)
438{
439 struct xfs_mount *mp = ip->i_mount;
440 xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize);
441 int zero_offset = XFS_B_FSB_OFFSET(mp, isize);
442 int zero_len;
443 int nimaps = 1;
444 int error = 0;
445 struct xfs_bmbt_irec imap;
446
447 xfs_ilock(ip, XFS_ILOCK_EXCL);
448 error = xfs_bmapi_read(ip, last_fsb, 1, &imap, &nimaps, 0);
449 xfs_iunlock(ip, XFS_ILOCK_EXCL);
450 if (error)
451 return error;
452
453 ASSERT(nimaps > 0);
454
455 /*
456 * If the block underlying isize is just a hole, then there
457 * is nothing to zero.
458 */
459 if (imap.br_startblock == HOLESTARTBLOCK)
460 return 0;
461
462 zero_len = mp->m_sb.sb_blocksize - zero_offset;
463 if (isize + zero_len > offset)
464 zero_len = offset - isize;
465 *did_zeroing = true;
466 return xfs_iozero(ip, isize, zero_len);
467}
468
469/*
470 * Zero any on disk space between the current EOF and the new, larger EOF. 442 * Zero any on disk space between the current EOF and the new, larger EOF.
471 * 443 *
472 * This handles the normal case of zeroing the remainder of the last block in 444 * This handles the normal case of zeroing the remainder of the last block in
@@ -484,94 +456,11 @@ xfs_zero_eof(
484 xfs_fsize_t isize, /* current inode size */ 456 xfs_fsize_t isize, /* current inode size */
485 bool *did_zeroing) 457 bool *did_zeroing)
486{ 458{
487 struct xfs_mount *mp = ip->i_mount;
488 xfs_fileoff_t start_zero_fsb;
489 xfs_fileoff_t end_zero_fsb;
490 xfs_fileoff_t zero_count_fsb;
491 xfs_fileoff_t last_fsb;
492 xfs_fileoff_t zero_off;
493 xfs_fsize_t zero_len;
494 int nimaps;
495 int error = 0;
496 struct xfs_bmbt_irec imap;
497
498 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 459 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
499 ASSERT(offset > isize); 460 ASSERT(offset > isize);
500 461
501 trace_xfs_zero_eof(ip, isize, offset - isize); 462 trace_xfs_zero_eof(ip, isize, offset - isize);
502 463 return xfs_zero_range(ip, isize, offset - isize, did_zeroing);
503 /*
504 * First handle zeroing the block on which isize resides.
505 *
506 * We only zero a part of that block so it is handled specially.
507 */
508 if (XFS_B_FSB_OFFSET(mp, isize) != 0) {
509 error = xfs_zero_last_block(ip, offset, isize, did_zeroing);
510 if (error)
511 return error;
512 }
513
514 /*
515 * Calculate the range between the new size and the old where blocks
516 * needing to be zeroed may exist.
517 *
518 * To get the block where the last byte in the file currently resides,
519 * we need to subtract one from the size and truncate back to a block
520 * boundary. We subtract 1 in case the size is exactly on a block
521 * boundary.
522 */
523 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
524 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
525 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
526 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
527 if (last_fsb == end_zero_fsb) {
528 /*
529 * The size was only incremented on its last block.
530 * We took care of that above, so just return.
531 */
532 return 0;
533 }
534
535 ASSERT(start_zero_fsb <= end_zero_fsb);
536 while (start_zero_fsb <= end_zero_fsb) {
537 nimaps = 1;
538 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
539
540 xfs_ilock(ip, XFS_ILOCK_EXCL);
541 error = xfs_bmapi_read(ip, start_zero_fsb, zero_count_fsb,
542 &imap, &nimaps, 0);
543 xfs_iunlock(ip, XFS_ILOCK_EXCL);
544 if (error)
545 return error;
546
547 ASSERT(nimaps > 0);
548
549 if (imap.br_state == XFS_EXT_UNWRITTEN ||
550 imap.br_startblock == HOLESTARTBLOCK) {
551 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
552 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
553 continue;
554 }
555
556 /*
557 * There are blocks we need to zero.
558 */
559 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
560 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
561
562 if ((zero_off + zero_len) > offset)
563 zero_len = offset - zero_off;
564
565 error = xfs_iozero(ip, zero_off, zero_len);
566 if (error)
567 return error;
568
569 *did_zeroing = true;
570 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
571 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
572 }
573
574 return 0;
575} 464}
576 465
577/* 466/*
@@ -722,8 +611,7 @@ xfs_file_dio_aio_write(
722 mp->m_rtdev_targp : mp->m_ddev_targp; 611 mp->m_rtdev_targp : mp->m_ddev_targp;
723 612
724 /* DIO must be aligned to device logical sector size */ 613 /* DIO must be aligned to device logical sector size */
725 if (!IS_DAX(inode) && 614 if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
726 ((iocb->ki_pos | count) & target->bt_logical_sectormask))
727 return -EINVAL; 615 return -EINVAL;
728 616
729 /* "unaligned" here means not aligned to a filesystem block */ 617 /* "unaligned" here means not aligned to a filesystem block */
@@ -762,7 +650,7 @@ xfs_file_dio_aio_write(
762 end = iocb->ki_pos + count - 1; 650 end = iocb->ki_pos + count - 1;
763 651
764 /* 652 /*
765 * See xfs_file_read_iter() for why we do a full-file flush here. 653 * See xfs_file_dio_aio_read() for why we do a full-file flush here.
766 */ 654 */
767 if (mapping->nrpages) { 655 if (mapping->nrpages) {
768 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping); 656 ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -789,10 +677,12 @@ xfs_file_dio_aio_write(
789 iolock = XFS_IOLOCK_SHARED; 677 iolock = XFS_IOLOCK_SHARED;
790 } 678 }
791 679
792 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); 680 trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
793 681
794 data = *from; 682 data = *from;
795 ret = mapping->a_ops->direct_IO(iocb, &data); 683 ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
684 xfs_get_blocks_direct, xfs_end_io_direct_write,
685 NULL, DIO_ASYNC_EXTEND);
796 686
797 /* see generic_file_direct_write() for why this is necessary */ 687 /* see generic_file_direct_write() for why this is necessary */
798 if (mapping->nrpages) { 688 if (mapping->nrpages) {
@@ -809,10 +699,70 @@ out:
809 xfs_rw_iunlock(ip, iolock); 699 xfs_rw_iunlock(ip, iolock);
810 700
811 /* 701 /*
812 * No fallback to buffered IO on errors for XFS. DAX can result in 702 * No fallback to buffered IO on errors for XFS, direct IO will either
813 * partial writes, but direct IO will either complete fully or fail. 703 * complete fully or fail.
814 */ 704 */
815 ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); 705 ASSERT(ret < 0 || ret == count);
706 return ret;
707}
708
709static noinline ssize_t
710xfs_file_dax_write(
711 struct kiocb *iocb,
712 struct iov_iter *from)
713{
714 struct address_space *mapping = iocb->ki_filp->f_mapping;
715 struct inode *inode = mapping->host;
716 struct xfs_inode *ip = XFS_I(inode);
717 struct xfs_mount *mp = ip->i_mount;
718 ssize_t ret = 0;
719 int unaligned_io = 0;
720 int iolock;
721 struct iov_iter data;
722
723 /* "unaligned" here means not aligned to a filesystem block */
724 if ((iocb->ki_pos & mp->m_blockmask) ||
725 ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) {
726 unaligned_io = 1;
727 iolock = XFS_IOLOCK_EXCL;
728 } else if (mapping->nrpages) {
729 iolock = XFS_IOLOCK_EXCL;
730 } else {
731 iolock = XFS_IOLOCK_SHARED;
732 }
733 xfs_rw_ilock(ip, iolock);
734
735 ret = xfs_file_aio_write_checks(iocb, from, &iolock);
736 if (ret)
737 goto out;
738
739 /*
740 * Yes, even DAX files can have page cache attached to them: A zeroed
741 * page is inserted into the pagecache when we have to serve a write
742 * fault on a hole. It should never be dirtied and can simply be
743 * dropped from the pagecache once we get real data for the page.
744 */
745 if (mapping->nrpages) {
746 ret = invalidate_inode_pages2(mapping);
747 WARN_ON_ONCE(ret);
748 }
749
750 if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) {
751 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
752 iolock = XFS_IOLOCK_SHARED;
753 }
754
755 trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos);
756
757 data = *from;
758 ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct,
759 xfs_end_io_direct_write, 0);
760 if (ret > 0) {
761 iocb->ki_pos += ret;
762 iov_iter_advance(from, ret);
763 }
764out:
765 xfs_rw_iunlock(ip, iolock);
816 return ret; 766 return ret;
817} 767}
818 768
@@ -839,9 +789,8 @@ xfs_file_buffered_aio_write(
839 current->backing_dev_info = inode_to_bdi(inode); 789 current->backing_dev_info = inode_to_bdi(inode);
840 790
841write_retry: 791write_retry:
842 trace_xfs_file_buffered_write(ip, iov_iter_count(from), 792 trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
843 iocb->ki_pos, 0); 793 ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
844 ret = generic_perform_write(file, from, iocb->ki_pos);
845 if (likely(ret >= 0)) 794 if (likely(ret >= 0))
846 iocb->ki_pos += ret; 795 iocb->ki_pos += ret;
847 796
@@ -895,7 +844,9 @@ xfs_file_write_iter(
895 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 844 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
896 return -EIO; 845 return -EIO;
897 846
898 if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) 847 if (IS_DAX(inode))
848 ret = xfs_file_dax_write(iocb, from);
849 else if (iocb->ki_flags & IOCB_DIRECT)
899 ret = xfs_file_dio_aio_write(iocb, from); 850 ret = xfs_file_dio_aio_write(iocb, from);
900 else 851 else
901 ret = xfs_file_buffered_aio_write(iocb, from); 852 ret = xfs_file_buffered_aio_write(iocb, from);
@@ -1553,7 +1504,7 @@ xfs_filemap_page_mkwrite(
1553 if (IS_DAX(inode)) { 1504 if (IS_DAX(inode)) {
1554 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); 1505 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
1555 } else { 1506 } else {
1556 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); 1507 ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
1557 ret = block_page_mkwrite_return(ret); 1508 ret = block_page_mkwrite_return(ret);
1558 } 1509 }
1559 1510
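
The split read and write paths above both reject misaligned direct I/O with the same bit trick: for a power-of-two logical sector size, OR-ing the file position and the length and masking with (sector size - 1) is non-zero whenever either value is misaligned. The small sketch below demonstrates the check with made-up values; the sector size and function name are illustrative, not taken from the patch.

/*
 * (pos | count) & (sector_size - 1) is zero only if both pos and count are
 * multiples of sector_size (sector_size must be a power of two).
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool dio_aligned(uint64_t pos, uint64_t count, uint64_t sector_size)
{
        uint64_t mask = sector_size - 1;        /* e.g. 512 -> 0x1ff */

        return ((pos | count) & mask) == 0;
}

int main(void)
{
        printf("%d\n", dio_aligned(4096, 8192, 512));   /* 1: both aligned     */
        printf("%d\n", dio_aligned(4096, 100,  512));   /* 0: length unaligned */
        printf("%d\n", dio_aligned(100,  512,  512));   /* 0: offset unaligned */
        return 0;
}
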
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b4d75825ae37..7191c3878b4a 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -667,8 +667,11 @@ xfs_reserve_blocks(
667 __uint64_t *inval, 667 __uint64_t *inval,
668 xfs_fsop_resblks_t *outval) 668 xfs_fsop_resblks_t *outval)
669{ 669{
670 __int64_t lcounter, delta, fdblks_delta; 670 __int64_t lcounter, delta;
671 __int64_t fdblks_delta = 0;
671 __uint64_t request; 672 __uint64_t request;
673 __int64_t free;
674 int error = 0;
672 675
673 /* If inval is null, report current values and return */ 676 /* If inval is null, report current values and return */
674 if (inval == (__uint64_t *)NULL) { 677 if (inval == (__uint64_t *)NULL) {
@@ -682,24 +685,23 @@ xfs_reserve_blocks(
682 request = *inval; 685 request = *inval;
683 686
684 /* 687 /*
685 * With per-cpu counters, this becomes an interesting 688 * With per-cpu counters, this becomes an interesting problem. We need
686 * problem. We need to work out if we are freeing or allocating 689 * to work out if we are freeing or allocating blocks first, then we can
687 * blocks first, then we can do the modification as necessary. 690 * do the modification as necessary.
688 * 691 *
689 * We do this under the m_sb_lock so that if we are near 692 * We do this under the m_sb_lock so that if we are near ENOSPC, we will
690 * ENOSPC, we will hold out any changes while we work out 693 * hold out any changes while we work out what to do. This means that
691 * what to do. This means that the amount of free space can 694 * the amount of free space can change while we do this, so we need to
692 * change while we do this, so we need to retry if we end up 695 * retry if we end up trying to reserve more space than is available.
693 * trying to reserve more space than is available.
694 */ 696 */
695retry:
696 spin_lock(&mp->m_sb_lock); 697 spin_lock(&mp->m_sb_lock);
697 698
698 /* 699 /*
699 * If our previous reservation was larger than the current value, 700 * If our previous reservation was larger than the current value,
700 * then move any unused blocks back to the free pool. 701 * then move any unused blocks back to the free pool. Modify the resblks
702 * counters directly since we shouldn't have any problems unreserving
703 * space.
701 */ 704 */
702 fdblks_delta = 0;
703 if (mp->m_resblks > request) { 705 if (mp->m_resblks > request) {
704 lcounter = mp->m_resblks_avail - request; 706 lcounter = mp->m_resblks_avail - request;
705 if (lcounter > 0) { /* release unused blocks */ 707 if (lcounter > 0) { /* release unused blocks */
@@ -707,54 +709,67 @@ retry:
707 mp->m_resblks_avail -= lcounter; 709 mp->m_resblks_avail -= lcounter;
708 } 710 }
709 mp->m_resblks = request; 711 mp->m_resblks = request;
710 } else { 712 if (fdblks_delta) {
711 __int64_t free; 713 spin_unlock(&mp->m_sb_lock);
714 error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
715 spin_lock(&mp->m_sb_lock);
716 }
717
718 goto out;
719 }
712 720
721 /*
722 * If the request is larger than the current reservation, reserve the
723 * blocks before we update the reserve counters. Sample m_fdblocks and
724 * perform a partial reservation if the request exceeds free space.
725 */
726 error = -ENOSPC;
727 do {
713 free = percpu_counter_sum(&mp->m_fdblocks) - 728 free = percpu_counter_sum(&mp->m_fdblocks) -
714 XFS_ALLOC_SET_ASIDE(mp); 729 XFS_ALLOC_SET_ASIDE(mp);
715 if (!free) 730 if (!free)
716 goto out; /* ENOSPC and fdblks_delta = 0 */ 731 break;
717 732
718 delta = request - mp->m_resblks; 733 delta = request - mp->m_resblks;
719 lcounter = free - delta; 734 lcounter = free - delta;
720 if (lcounter < 0) { 735 if (lcounter < 0)
721 /* We can't satisfy the request, just get what we can */ 736 /* We can't satisfy the request, just get what we can */
722 mp->m_resblks += free; 737 fdblks_delta = free;
723 mp->m_resblks_avail += free; 738 else
724 fdblks_delta = -free; 739 fdblks_delta = delta;
725 } else {
726 fdblks_delta = -delta;
727 mp->m_resblks = request;
728 mp->m_resblks_avail += delta;
729 }
730 }
731out:
732 if (outval) {
733 outval->resblks = mp->m_resblks;
734 outval->resblks_avail = mp->m_resblks_avail;
735 }
736 spin_unlock(&mp->m_sb_lock);
737 740
738 if (fdblks_delta) {
739 /* 741 /*
740 * If we are putting blocks back here, m_resblks_avail is 742 * We'll either succeed in getting space from the free block
741 * already at its max so this will put it in the free pool. 743 * count or we'll get an ENOSPC. If we get an ENOSPC, it means
742 * 744 * things changed while we were calculating fdblks_delta and so
743 * If we need space, we'll either succeed in getting it 745 * we should try again to see if there is anything left to
744 * from the free block count or we'll get an enospc. If 746 * reserve.
745 * we get a ENOSPC, it means things changed while we were
746 * calculating fdblks_delta and so we should try again to
747 * see if there is anything left to reserve.
748 * 747 *
749 * Don't set the reserved flag here - we don't want to reserve 748 * Don't set the reserved flag here - we don't want to reserve
750 * the extra reserve blocks from the reserve..... 749 * the extra reserve blocks from the reserve.....
751 */ 750 */
752 int error; 751 spin_unlock(&mp->m_sb_lock);
753 error = xfs_mod_fdblocks(mp, fdblks_delta, 0); 752 error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
754 if (error == -ENOSPC) 753 spin_lock(&mp->m_sb_lock);
755 goto retry; 754 } while (error == -ENOSPC);
755
756 /*
757 * Update the reserve counters if blocks have been successfully
758 * allocated.
759 */
760 if (!error && fdblks_delta) {
761 mp->m_resblks += fdblks_delta;
762 mp->m_resblks_avail += fdblks_delta;
756 } 763 }
757 return 0; 764
765out:
766 if (outval) {
767 outval->resblks = mp->m_resblks;
768 outval->resblks_avail = mp->m_resblks_avail;
769 }
770
771 spin_unlock(&mp->m_sb_lock);
772 return error;
758} 773}
759 774
760int 775int
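
The xfs_reserve_blocks rework above samples the summed per-cpu free-space counter, computes a delta, drops the lock to attempt the modification, and retries on ENOSPC because the free space may have changed in the meantime. The user-space sketch below shows that sample-then-take-then-retry shape; take_blocks() is a made-up stand-in for xfs_mod_fdblocks(), and the single atomic stands in for the per-cpu counter.

/*
 * Sample the (racy) free-space count, clamp the request to what looked
 * available, attempt to take it, and retry if the attempt races and fails.
 */
#include <stdio.h>
#include <stdatomic.h>
#include <errno.h>

static atomic_long free_blocks = 1000;

static int take_blocks(long delta)              /* fails rather than going negative */
{
        long old = atomic_load(&free_blocks);

        do {
                if (old < delta)
                        return -ENOSPC;
        } while (!atomic_compare_exchange_weak(&free_blocks, &old, old - delta));
        return 0;
}

int main(void)
{
        long request = 800, reserved = 0;
        int error = 0;

        do {
                long avail = atomic_load(&free_blocks); /* racy sample */
                long delta = request - reserved;

                if (!avail)
                        break;                          /* truly out of space */
                if (delta > avail)
                        delta = avail;                  /* partial reservation */

                error = take_blocks(delta);             /* may race and fail */
                if (!error)
                        reserved += delta;
        } while (error == -ENOSPC);

        printf("reserved %ld of %ld blocks\n", reserved, request);
        return 0;
}
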
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 99ee6eee5e0b..fb39a66914dd 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -765,7 +765,7 @@ restart:
765 * Background scanning to trim post-EOF preallocated space. This is queued 765 * Background scanning to trim post-EOF preallocated space. This is queued
766 * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 766 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
767 */ 767 */
768STATIC void 768void
769xfs_queue_eofblocks( 769xfs_queue_eofblocks(
770 struct xfs_mount *mp) 770 struct xfs_mount *mp)
771{ 771{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 62f1f91c32cb..05bac99bef75 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -68,6 +68,7 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
68int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 68int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
69int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip); 69int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
70void xfs_eofblocks_worker(struct work_struct *); 70void xfs_eofblocks_worker(struct work_struct *);
71void xfs_queue_eofblocks(struct xfs_mount *);
71 72
72int xfs_inode_ag_iterator(struct xfs_mount *mp, 73int xfs_inode_ag_iterator(struct xfs_mount *mp,
73 int (*execute)(struct xfs_inode *ip, int flags, void *args), 74 int (*execute)(struct xfs_inode *ip, int flags, void *args),
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ee6799e0476f..8825bcfd314c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -431,7 +431,7 @@ xfs_lock_inumorder(int lock_mode, int subclass)
431 * lock more than one at a time, lockdep will report false positives saying we 431 * lock more than one at a time, lockdep will report false positives saying we
432 * have violated locking orders. 432 * have violated locking orders.
433 */ 433 */
434void 434static void
435xfs_lock_inodes( 435xfs_lock_inodes(
436 xfs_inode_t **ips, 436 xfs_inode_t **ips,
437 int inodes, 437 int inodes,
@@ -667,14 +667,6 @@ xfs_ip2xflags(
667 return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); 667 return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
668} 668}
669 669
670uint
671xfs_dic2xflags(
672 struct xfs_dinode *dip)
673{
674 return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
675 be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
676}
677
678/* 670/*
679 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match 671 * Lookups up an inode from "name". If ci_name is not NULL, then a CI match
680 * is allowed, otherwise it has to be an exact match. If a CI match is found, 672 * is allowed, otherwise it has to be an exact match. If a CI match is found,
@@ -748,7 +740,7 @@ out_unlock:
748 * are not linked into the directory structure - they are attached 740 * are not linked into the directory structure - they are attached
749 * directly to the superblock - and so have no parent. 741 * directly to the superblock - and so have no parent.
750 */ 742 */
751int 743static int
752xfs_ialloc( 744xfs_ialloc(
753 xfs_trans_t *tp, 745 xfs_trans_t *tp,
754 xfs_inode_t *pip, 746 xfs_inode_t *pip,
@@ -1085,7 +1077,7 @@ xfs_dir_ialloc(
1085 * link count to go to zero, move the inode to AGI unlinked list so that it can 1077 * link count to go to zero, move the inode to AGI unlinked list so that it can
1086 * be freed when the last active reference goes away via xfs_inactive(). 1078 * be freed when the last active reference goes away via xfs_inactive().
1087 */ 1079 */
1088int /* error */ 1080static int /* error */
1089xfs_droplink( 1081xfs_droplink(
1090 xfs_trans_t *tp, 1082 xfs_trans_t *tp,
1091 xfs_inode_t *ip) 1083 xfs_inode_t *ip)
@@ -1104,7 +1096,7 @@ xfs_droplink(
1104/* 1096/*
1105 * Increment the link count on an inode & log the change. 1097 * Increment the link count on an inode & log the change.
1106 */ 1098 */
1107int 1099static int
1108xfs_bumplink( 1100xfs_bumplink(
1109 xfs_trans_t *tp, 1101 xfs_trans_t *tp,
1110 xfs_inode_t *ip) 1102 xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index e52d7c7aeb5b..8eb78ec4a6e2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -395,12 +395,8 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
395int xfs_isilocked(xfs_inode_t *, uint); 395int xfs_isilocked(xfs_inode_t *, uint);
396uint xfs_ilock_data_map_shared(struct xfs_inode *); 396uint xfs_ilock_data_map_shared(struct xfs_inode *);
397uint xfs_ilock_attr_map_shared(struct xfs_inode *); 397uint xfs_ilock_attr_map_shared(struct xfs_inode *);
398int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
399 xfs_nlink_t, xfs_dev_t, prid_t, int,
400 struct xfs_buf **, xfs_inode_t **);
401 398
402uint xfs_ip2xflags(struct xfs_inode *); 399uint xfs_ip2xflags(struct xfs_inode *);
403uint xfs_dic2xflags(struct xfs_dinode *);
404int xfs_ifree(struct xfs_trans *, xfs_inode_t *, 400int xfs_ifree(struct xfs_trans *, xfs_inode_t *,
405 struct xfs_bmap_free *); 401 struct xfs_bmap_free *);
406int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, 402int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *,
@@ -411,7 +407,6 @@ void xfs_iunpin_wait(xfs_inode_t *);
411#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) 407#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
412 408
413int xfs_iflush(struct xfs_inode *, struct xfs_buf **); 409int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
414void xfs_lock_inodes(xfs_inode_t **, int, uint);
415void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 410void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
416 411
417xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); 412xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
@@ -419,8 +414,6 @@ xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip);
419int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, 414int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
420 xfs_nlink_t, xfs_dev_t, prid_t, int, 415 xfs_nlink_t, xfs_dev_t, prid_t, int,
421 struct xfs_inode **, int *); 416 struct xfs_inode **, int *);
422int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
423int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
424 417
425/* from xfs_file.c */ 418/* from xfs_file.c */
426enum xfs_prealloc_flags { 419enum xfs_prealloc_flags {
@@ -434,7 +427,8 @@ int xfs_update_prealloc_flags(struct xfs_inode *ip,
434 enum xfs_prealloc_flags flags); 427 enum xfs_prealloc_flags flags);
435int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, 428int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset,
436 xfs_fsize_t isize, bool *did_zeroing); 429 xfs_fsize_t isize, bool *did_zeroing);
437int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); 430int xfs_zero_range(struct xfs_inode *ip, xfs_off_t pos, xfs_off_t count,
431 bool *did_zero);
438loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start, 432loff_t __xfs_seek_hole_data(struct inode *inode, loff_t start,
439 loff_t eof, int whence); 433 loff_t eof, int whence);
440 434
@@ -479,14 +473,4 @@ do { \
479 473
480extern struct kmem_zone *xfs_inode_zone; 474extern struct kmem_zone *xfs_inode_zone;
481 475
482/*
483 * Flags for read/write calls
484 */
485#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
486#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
487
488#define XFS_IO_FLAGS \
489 { XFS_IO_ISDIRECT, "DIRECT" }, \
490 { XFS_IO_INVIS, "INVIS"}
491
492#endif /* __XFS_INODE_H__ */ 476#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a1b07612224c..892c2aced207 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -651,6 +651,7 @@ void
651xfs_inode_item_destroy( 651xfs_inode_item_destroy(
652 xfs_inode_t *ip) 652 xfs_inode_t *ip)
653{ 653{
654 kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
654 kmem_zone_free(xfs_ili_zone, ip->i_itemp); 655 kmem_zone_free(xfs_ili_zone, ip->i_itemp);
655} 656}
656 657
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 63a6ff2cfc68..9a7c87809d3b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -595,13 +595,12 @@ xfs_attrmulti_by_handle(
595 595
596int 596int
597xfs_ioc_space( 597xfs_ioc_space(
598 struct xfs_inode *ip,
599 struct inode *inode,
600 struct file *filp, 598 struct file *filp,
601 int ioflags,
602 unsigned int cmd, 599 unsigned int cmd,
603 xfs_flock64_t *bf) 600 xfs_flock64_t *bf)
604{ 601{
602 struct inode *inode = file_inode(filp);
603 struct xfs_inode *ip = XFS_I(inode);
605 struct iattr iattr; 604 struct iattr iattr;
606 enum xfs_prealloc_flags flags = 0; 605 enum xfs_prealloc_flags flags = 0;
607 uint iolock = XFS_IOLOCK_EXCL; 606 uint iolock = XFS_IOLOCK_EXCL;
@@ -626,7 +625,7 @@ xfs_ioc_space(
626 625
627 if (filp->f_flags & O_DSYNC) 626 if (filp->f_flags & O_DSYNC)
628 flags |= XFS_PREALLOC_SYNC; 627 flags |= XFS_PREALLOC_SYNC;
629 if (ioflags & XFS_IO_INVIS) 628 if (filp->f_mode & FMODE_NOCMTIME)
630 flags |= XFS_PREALLOC_INVISIBLE; 629 flags |= XFS_PREALLOC_INVISIBLE;
631 630
632 error = mnt_want_write_file(filp); 631 error = mnt_want_write_file(filp);
@@ -1464,8 +1463,7 @@ xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1464 1463
1465STATIC int 1464STATIC int
1466xfs_ioc_getbmap( 1465xfs_ioc_getbmap(
1467 struct xfs_inode *ip, 1466 struct file *file,
1468 int ioflags,
1469 unsigned int cmd, 1467 unsigned int cmd,
1470 void __user *arg) 1468 void __user *arg)
1471{ 1469{
@@ -1479,10 +1477,10 @@ xfs_ioc_getbmap(
1479 return -EINVAL; 1477 return -EINVAL;
1480 1478
1481 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1479 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1482 if (ioflags & XFS_IO_INVIS) 1480 if (file->f_mode & FMODE_NOCMTIME)
1483 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; 1481 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1484 1482
1485 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, 1483 error = xfs_getbmap(XFS_I(file_inode(file)), &bmx, xfs_getbmap_format,
1486 (__force struct getbmap *)arg+1); 1484 (__force struct getbmap *)arg+1);
1487 if (error) 1485 if (error)
1488 return error; 1486 return error;
@@ -1575,6 +1573,11 @@ xfs_ioc_swapext(
1575 goto out_put_tmp_file; 1573 goto out_put_tmp_file;
1576 } 1574 }
1577 1575
1576 /*
1577 * We need to ensure that the fds passed in point to XFS inodes
1578 * before we cast and access them as XFS structures as we have no
1579 * control over what the user passes us here.
1580 */
1578 if (f.file->f_op != &xfs_file_operations || 1581 if (f.file->f_op != &xfs_file_operations ||
1579 tmp.file->f_op != &xfs_file_operations) { 1582 tmp.file->f_op != &xfs_file_operations) {
1580 error = -EINVAL; 1583 error = -EINVAL;
@@ -1625,12 +1628,8 @@ xfs_file_ioctl(
1625 struct xfs_inode *ip = XFS_I(inode); 1628 struct xfs_inode *ip = XFS_I(inode);
1626 struct xfs_mount *mp = ip->i_mount; 1629 struct xfs_mount *mp = ip->i_mount;
1627 void __user *arg = (void __user *)p; 1630 void __user *arg = (void __user *)p;
1628 int ioflags = 0;
1629 int error; 1631 int error;
1630 1632
1631 if (filp->f_mode & FMODE_NOCMTIME)
1632 ioflags |= XFS_IO_INVIS;
1633
1634 trace_xfs_file_ioctl(ip); 1633 trace_xfs_file_ioctl(ip);
1635 1634
1636 switch (cmd) { 1635 switch (cmd) {
@@ -1649,7 +1648,7 @@ xfs_file_ioctl(
1649 1648
1650 if (copy_from_user(&bf, arg, sizeof(bf))) 1649 if (copy_from_user(&bf, arg, sizeof(bf)))
1651 return -EFAULT; 1650 return -EFAULT;
1652 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 1651 return xfs_ioc_space(filp, cmd, &bf);
1653 } 1652 }
1654 case XFS_IOC_DIOINFO: { 1653 case XFS_IOC_DIOINFO: {
1655 struct dioattr da; 1654 struct dioattr da;
@@ -1708,7 +1707,7 @@ xfs_file_ioctl(
1708 1707
1709 case XFS_IOC_GETBMAP: 1708 case XFS_IOC_GETBMAP:
1710 case XFS_IOC_GETBMAPA: 1709 case XFS_IOC_GETBMAPA:
1711 return xfs_ioc_getbmap(ip, ioflags, cmd, arg); 1710 return xfs_ioc_getbmap(filp, cmd, arg);
1712 1711
1713 case XFS_IOC_GETBMAPX: 1712 case XFS_IOC_GETBMAPX:
1714 return xfs_ioc_getbmapx(ip, arg); 1713 return xfs_ioc_getbmapx(ip, arg);
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index 77c02c7900b6..8b52881bfd90 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -20,10 +20,7 @@
20 20
21extern int 21extern int
22xfs_ioc_space( 22xfs_ioc_space(
23 struct xfs_inode *ip,
24 struct inode *inode,
25 struct file *filp, 23 struct file *filp,
26 int ioflags,
27 unsigned int cmd, 24 unsigned int cmd,
28 xfs_flock64_t *bf); 25 xfs_flock64_t *bf);
29 26
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 1a05d8ae327d..321f57721b92 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -532,12 +532,8 @@ xfs_file_compat_ioctl(
532 struct xfs_inode *ip = XFS_I(inode); 532 struct xfs_inode *ip = XFS_I(inode);
533 struct xfs_mount *mp = ip->i_mount; 533 struct xfs_mount *mp = ip->i_mount;
534 void __user *arg = (void __user *)p; 534 void __user *arg = (void __user *)p;
535 int ioflags = 0;
536 int error; 535 int error;
537 536
538 if (filp->f_mode & FMODE_NOCMTIME)
539 ioflags |= XFS_IO_INVIS;
540
541 trace_xfs_file_compat_ioctl(ip); 537 trace_xfs_file_compat_ioctl(ip);
542 538
543 switch (cmd) { 539 switch (cmd) {
@@ -589,7 +585,7 @@ xfs_file_compat_ioctl(
589 if (xfs_compat_flock64_copyin(&bf, arg)) 585 if (xfs_compat_flock64_copyin(&bf, arg))
590 return -EFAULT; 586 return -EFAULT;
591 cmd = _NATIVE_IOC(cmd, struct xfs_flock64); 587 cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
592 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf); 588 return xfs_ioc_space(filp, cmd, &bf);
593 } 589 }
594 case XFS_IOC_FSGEOMETRY_V1_32: 590 case XFS_IOC_FSGEOMETRY_V1_32:
595 return xfs_compat_ioc_fsgeometry_v1(mp, arg); 591 return xfs_compat_ioc_fsgeometry_v1(mp, arg);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 58391355a44d..620fc9120444 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -15,6 +15,7 @@
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include <linux/iomap.h>
18#include "xfs.h" 19#include "xfs.h"
19#include "xfs_fs.h" 20#include "xfs_fs.h"
20#include "xfs_shared.h" 21#include "xfs_shared.h"
@@ -940,3 +941,173 @@ error_on_bmapi_transaction:
940 xfs_iunlock(ip, XFS_ILOCK_EXCL); 941 xfs_iunlock(ip, XFS_ILOCK_EXCL);
941 return error; 942 return error;
942} 943}
944
945void
946xfs_bmbt_to_iomap(
947 struct xfs_inode *ip,
948 struct iomap *iomap,
949 struct xfs_bmbt_irec *imap)
950{
951 struct xfs_mount *mp = ip->i_mount;
952
953 if (imap->br_startblock == HOLESTARTBLOCK) {
954 iomap->blkno = IOMAP_NULL_BLOCK;
955 iomap->type = IOMAP_HOLE;
956 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
957 iomap->blkno = IOMAP_NULL_BLOCK;
958 iomap->type = IOMAP_DELALLOC;
959 } else {
960 iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
961 if (imap->br_state == XFS_EXT_UNWRITTEN)
962 iomap->type = IOMAP_UNWRITTEN;
963 else
964 iomap->type = IOMAP_MAPPED;
965 }
966 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
967 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
968 iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
969}
970
971static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
972{
973 return !nimaps ||
974 imap->br_startblock == HOLESTARTBLOCK ||
975 imap->br_startblock == DELAYSTARTBLOCK;
976}
977
978static int
979xfs_file_iomap_begin(
980 struct inode *inode,
981 loff_t offset,
982 loff_t length,
983 unsigned flags,
984 struct iomap *iomap)
985{
986 struct xfs_inode *ip = XFS_I(inode);
987 struct xfs_mount *mp = ip->i_mount;
988 struct xfs_bmbt_irec imap;
989 xfs_fileoff_t offset_fsb, end_fsb;
990 int nimaps = 1, error = 0;
991
992 if (XFS_FORCED_SHUTDOWN(mp))
993 return -EIO;
994
995 xfs_ilock(ip, XFS_ILOCK_EXCL);
996
997 ASSERT(offset <= mp->m_super->s_maxbytes);
998 if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
999 length = mp->m_super->s_maxbytes - offset;
1000 offset_fsb = XFS_B_TO_FSBT(mp, offset);
1001 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1002
1003 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
1004 &nimaps, XFS_BMAPI_ENTIRE);
1005 if (error) {
1006 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1007 return error;
1008 }
1009
1010 if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
1011 /*
1012 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
1013	 * pages to keep the chunks of work done here somewhat symmetric
1014 * with the work writeback does. This is a completely arbitrary
1015 * number pulled out of thin air as a best guess for initial
1016 * testing.
1017 *
1018	 * Note that the value needs to be less than 32 bits wide until
1019 * the lower level functions are updated.
1020 */
1021 length = min_t(loff_t, length, 1024 * PAGE_SIZE);
1022 if (xfs_get_extsz_hint(ip)) {
1023 /*
1024 * xfs_iomap_write_direct() expects the shared lock. It
1025 * is unlocked on return.
1026 */
1027 xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
1028 error = xfs_iomap_write_direct(ip, offset, length, &imap,
1029 nimaps);
1030 } else {
1031 error = xfs_iomap_write_delay(ip, offset, length, &imap);
1032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1033 }
1034
1035 if (error)
1036 return error;
1037
1038 trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
1039 xfs_bmbt_to_iomap(ip, iomap, &imap);
1040 } else if (nimaps) {
1041 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1042 trace_xfs_iomap_found(ip, offset, length, 0, &imap);
1043 xfs_bmbt_to_iomap(ip, iomap, &imap);
1044 } else {
1045 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1046 trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
1047 iomap->blkno = IOMAP_NULL_BLOCK;
1048 iomap->type = IOMAP_HOLE;
1049 iomap->offset = offset;
1050 iomap->length = length;
1051 }
1052
1053 return 0;
1054}
1055
1056static int
1057xfs_file_iomap_end_delalloc(
1058 struct xfs_inode *ip,
1059 loff_t offset,
1060 loff_t length,
1061 ssize_t written)
1062{
1063 struct xfs_mount *mp = ip->i_mount;
1064 xfs_fileoff_t start_fsb;
1065 xfs_fileoff_t end_fsb;
1066 int error = 0;
1067
1068 start_fsb = XFS_B_TO_FSB(mp, offset + written);
1069 end_fsb = XFS_B_TO_FSB(mp, offset + length);
1070
1071 /*
1072 * Trim back delalloc blocks if we didn't manage to write the whole
1073 * range reserved.
1074 *
1075 * We don't need to care about racing delalloc as we hold i_mutex
1076 * across the reserve/allocate/unreserve calls. If there are delalloc
1077 * blocks in the range, they are ours.
1078 */
1079 if (start_fsb < end_fsb) {
1080 xfs_ilock(ip, XFS_ILOCK_EXCL);
1081 error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
1082 end_fsb - start_fsb);
1083 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1084
1085 if (error && !XFS_FORCED_SHUTDOWN(mp)) {
1086 xfs_alert(mp, "%s: unable to clean up ino %lld",
1087 __func__, ip->i_ino);
1088 return error;
1089 }
1090 }
1091
1092 return 0;
1093}
1094
1095static int
1096xfs_file_iomap_end(
1097 struct inode *inode,
1098 loff_t offset,
1099 loff_t length,
1100 ssize_t written,
1101 unsigned flags,
1102 struct iomap *iomap)
1103{
1104 if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
1105 return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
1106 length, written);
1107 return 0;
1108}
1109
1110struct iomap_ops xfs_iomap_ops = {
1111 .iomap_begin = xfs_file_iomap_begin,
1112 .iomap_end = xfs_file_iomap_end,
1113};
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 8688e663d744..e066d045e2ff 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -18,6 +18,8 @@
18#ifndef __XFS_IOMAP_H__ 18#ifndef __XFS_IOMAP_H__
19#define __XFS_IOMAP_H__ 19#define __XFS_IOMAP_H__
20 20
21#include <linux/iomap.h>
22
21struct xfs_inode; 23struct xfs_inode;
22struct xfs_bmbt_irec; 24struct xfs_bmbt_irec;
23 25
@@ -29,4 +31,9 @@ int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
29 struct xfs_bmbt_irec *); 31 struct xfs_bmbt_irec *);
30int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); 32int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
31 33
34void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
35 struct xfs_bmbt_irec *);
36
37extern struct iomap_ops xfs_iomap_ops;
38
32#endif /* __XFS_IOMAP_H__*/ 39#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5d4eba6972e..ab820f84ed50 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,12 +38,13 @@
38#include "xfs_dir2.h" 38#include "xfs_dir2.h"
39#include "xfs_trans_space.h" 39#include "xfs_trans_space.h"
40#include "xfs_pnfs.h" 40#include "xfs_pnfs.h"
41#include "xfs_iomap.h"
41 42
42#include <linux/capability.h> 43#include <linux/capability.h>
43#include <linux/xattr.h> 44#include <linux/xattr.h>
44#include <linux/posix_acl.h> 45#include <linux/posix_acl.h>
45#include <linux/security.h> 46#include <linux/security.h>
46#include <linux/fiemap.h> 47#include <linux/iomap.h>
47#include <linux/slab.h> 48#include <linux/slab.h>
48 49
49/* 50/*
@@ -801,20 +802,30 @@ xfs_setattr_size(
801 return error; 802 return error;
802 803
803 /* 804 /*
805 * Wait for all direct I/O to complete.
806 */
807 inode_dio_wait(inode);
808
809 /*
804 * File data changes must be complete before we start the transaction to 810 * File data changes must be complete before we start the transaction to
805 * modify the inode. This needs to be done before joining the inode to 811 * modify the inode. This needs to be done before joining the inode to
806 * the transaction because the inode cannot be unlocked once it is a 812 * the transaction because the inode cannot be unlocked once it is a
807 * part of the transaction. 813 * part of the transaction.
808 * 814 *
809 * Start with zeroing any data block beyond EOF that we may expose on 815 * Start with zeroing any data beyond EOF that we may expose on file
810 * file extension. 816 * extension, or zeroing out the rest of the block on a downward
817 * truncate.
811 */ 818 */
812 if (newsize > oldsize) { 819 if (newsize > oldsize) {
813 error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); 820 error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing);
814 if (error) 821 } else {
815 return error; 822 error = iomap_truncate_page(inode, newsize, &did_zeroing,
823 &xfs_iomap_ops);
816 } 824 }
817 825
826 if (error)
827 return error;
828
818 /* 829 /*
819 * We are going to log the inode size change in this transaction so 830 * We are going to log the inode size change in this transaction so
820 * any previous writes that are beyond the on disk EOF and the new 831 * any previous writes that are beyond the on disk EOF and the new
@@ -823,17 +834,14 @@ xfs_setattr_size(
823 * problem. Note that this includes any block zeroing we did above; 834 * problem. Note that this includes any block zeroing we did above;
824 * otherwise those blocks may not be zeroed after a crash. 835 * otherwise those blocks may not be zeroed after a crash.
825 */ 836 */
826 if (newsize > ip->i_d.di_size && 837 if (did_zeroing ||
827 (oldsize != ip->i_d.di_size || did_zeroing)) { 838 (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) {
828 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 839 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
829 ip->i_d.di_size, newsize); 840 ip->i_d.di_size, newsize);
830 if (error) 841 if (error)
831 return error; 842 return error;
832 } 843 }
833 844
834 /* Now wait for all direct I/O to complete. */
835 inode_dio_wait(inode);
836
837 /* 845 /*
838 * We've already locked out new page faults, so now we can safely remove 846 * We've already locked out new page faults, so now we can safely remove
839 * pages from the page cache knowing they won't get refaulted until we 847 * pages from the page cache knowing they won't get refaulted until we
@@ -851,13 +859,6 @@ xfs_setattr_size(
851 * to hope that the caller sees ENOMEM and retries the truncate 859 * to hope that the caller sees ENOMEM and retries the truncate
852 * operation. 860 * operation.
853 */ 861 */
854 if (IS_DAX(inode))
855 error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct);
856 else
857 error = block_truncate_page(inode->i_mapping, newsize,
858 xfs_get_blocks);
859 if (error)
860 return error;
861 truncate_setsize(inode, newsize); 862 truncate_setsize(inode, newsize);
862 863
863 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); 864 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
@@ -998,51 +999,6 @@ xfs_vn_update_time(
998 return xfs_trans_commit(tp); 999 return xfs_trans_commit(tp);
999} 1000}
1000 1001
1001#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
1002
1003/*
1004 * Call fiemap helper to fill in user data.
1005 * Returns positive errors to xfs_getbmap.
1006 */
1007STATIC int
1008xfs_fiemap_format(
1009 void **arg,
1010 struct getbmapx *bmv,
1011 int *full)
1012{
1013 int error;
1014 struct fiemap_extent_info *fieinfo = *arg;
1015 u32 fiemap_flags = 0;
1016 u64 logical, physical, length;
1017
1018 /* Do nothing for a hole */
1019 if (bmv->bmv_block == -1LL)
1020 return 0;
1021
1022 logical = BBTOB(bmv->bmv_offset);
1023 physical = BBTOB(bmv->bmv_block);
1024 length = BBTOB(bmv->bmv_length);
1025
1026 if (bmv->bmv_oflags & BMV_OF_PREALLOC)
1027 fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
1028 else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
1029 fiemap_flags |= (FIEMAP_EXTENT_DELALLOC |
1030 FIEMAP_EXTENT_UNKNOWN);
1031 physical = 0; /* no block yet */
1032 }
1033 if (bmv->bmv_oflags & BMV_OF_LAST)
1034 fiemap_flags |= FIEMAP_EXTENT_LAST;
1035
1036 error = fiemap_fill_next_extent(fieinfo, logical, physical,
1037 length, fiemap_flags);
1038 if (error > 0) {
1039 error = 0;
1040 *full = 1; /* user array now full */
1041 }
1042
1043 return error;
1044}
1045
1046STATIC int 1002STATIC int
1047xfs_vn_fiemap( 1003xfs_vn_fiemap(
1048 struct inode *inode, 1004 struct inode *inode,
@@ -1050,38 +1006,13 @@ xfs_vn_fiemap(
1050 u64 start, 1006 u64 start,
1051 u64 length) 1007 u64 length)
1052{ 1008{
1053 xfs_inode_t *ip = XFS_I(inode);
1054 struct getbmapx bm;
1055 int error; 1009 int error;
1056 1010
1057 error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS); 1011 xfs_ilock(XFS_I(inode), XFS_IOLOCK_SHARED);
1058 if (error) 1012 error = iomap_fiemap(inode, fieinfo, start, length, &xfs_iomap_ops);
1059 return error; 1013 xfs_iunlock(XFS_I(inode), XFS_IOLOCK_SHARED);
1060
1061 /* Set up bmap header for xfs internal routine */
1062 bm.bmv_offset = BTOBBT(start);
1063 /* Special case for whole file */
1064 if (length == FIEMAP_MAX_OFFSET)
1065 bm.bmv_length = -1LL;
1066 else
1067 bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
1068
1069 /* We add one because in getbmap world count includes the header */
1070 bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
1071 fieinfo->fi_extents_max + 1;
1072 bm.bmv_count = min_t(__s32, bm.bmv_count,
1073 (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
1074 bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
1075 if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
1076 bm.bmv_iflags |= BMV_IF_ATTRFORK;
1077 if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
1078 bm.bmv_iflags |= BMV_IF_DELALLOC;
1079
1080 error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
1081 if (error)
1082 return error;
1083 1014
1084 return 0; 1015 return error;
1085} 1016}
1086 1017
1087STATIC int 1018STATIC int
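With xfs_fiemap_format() removed, translating mapping state into FIEMAP extent flags is now the generic helper's job. Below is a hedged sketch of that translation, inferred from the deleted XFS code above; the exact behaviour lives in fs/iomap.c, which is not part of this hunk.

/*
 * Hedged sketch: the extent-flag translation the removed
 * xfs_fiemap_format() performed and that is now the generic helper's
 * responsibility.  The mapping is inferred from the deleted XFS code
 * above; the generic code works from struct iomap types rather than
 * getbmapx flags.
 */
static u32 example_iomap_type_to_fiemap_flags(int type)
{
	switch (type) {
	case IOMAP_UNWRITTEN:
		/* preallocated but never written */
		return FIEMAP_EXTENT_UNWRITTEN;
	case IOMAP_DELALLOC:
		/* reserved in memory, no physical block yet */
		return FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
	default:
		return 0;
	}
}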
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index a8192dc797dc..b8d64d520e12 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -328,13 +328,6 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
328 return x; 328 return x;
329} 329}
330 330
331/* ARM old ABI has some weird alignment/padding */
332#if defined(__arm__) && !defined(__ARM_EABI__)
333#define __arch_pack __attribute__((packed))
334#else
335#define __arch_pack
336#endif
337
338#define ASSERT_ALWAYS(expr) \ 331#define ASSERT_ALWAYS(expr) \
339 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) 332 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
340 333
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bde02f1fba73..3b74fa011bb1 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -788,7 +788,7 @@ xfs_log_mount_cancel(
788 * As far as I know, there weren't any dependencies on the old behaviour. 788 * As far as I know, there weren't any dependencies on the old behaviour.
789 */ 789 */
790 790
791int 791static int
792xfs_log_unmount_write(xfs_mount_t *mp) 792xfs_log_unmount_write(xfs_mount_t *mp)
793{ 793{
794 struct xlog *log = mp->m_log; 794 struct xlog *log = mp->m_log;
@@ -1036,7 +1036,7 @@ xfs_log_space_wake(
1036 * there's no point in running a dummy transaction at this point because we 1036 * there's no point in running a dummy transaction at this point because we
1037 * can't start trying to idle the log until both the CIL and AIL are empty. 1037 * can't start trying to idle the log until both the CIL and AIL are empty.
1038 */ 1038 */
1039int 1039static int
1040xfs_log_need_covered(xfs_mount_t *mp) 1040xfs_log_need_covered(xfs_mount_t *mp)
1041{ 1041{
1042 struct xlog *log = mp->m_log; 1042 struct xlog *log = mp->m_log;
@@ -1177,7 +1177,7 @@ xlog_space_left(
1177 * The log manager needs its own routine, in order to control what 1177 * The log manager needs its own routine, in order to control what
1178 * happens with the buffer after the write completes. 1178 * happens with the buffer after the write completes.
1179 */ 1179 */
1180void 1180static void
1181xlog_iodone(xfs_buf_t *bp) 1181xlog_iodone(xfs_buf_t *bp)
1182{ 1182{
1183 struct xlog_in_core *iclog = bp->b_fspriv; 1183 struct xlog_in_core *iclog = bp->b_fspriv;
@@ -1302,7 +1302,7 @@ xfs_log_work_queue(
1302 * disk. If there is nothing dirty, then we might need to cover the log to 1302 * disk. If there is nothing dirty, then we might need to cover the log to
1303 * indicate that the filesystem is idle. 1303 * indicate that the filesystem is idle.
1304 */ 1304 */
1305void 1305static void
1306xfs_log_worker( 1306xfs_log_worker(
1307 struct work_struct *work) 1307 struct work_struct *work)
1308{ 1308{
@@ -1415,7 +1415,7 @@ xlog_alloc_log(
1415 */ 1415 */
1416 error = -ENOMEM; 1416 error = -ENOMEM;
1417 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL, 1417 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
1418 BTOBB(log->l_iclog_size), 0); 1418 BTOBB(log->l_iclog_size), XBF_NO_IOACCT);
1419 if (!bp) 1419 if (!bp)
1420 goto out_free_log; 1420 goto out_free_log;
1421 1421
@@ -1454,7 +1454,8 @@ xlog_alloc_log(
1454 prev_iclog = iclog; 1454 prev_iclog = iclog;
1455 1455
1456 bp = xfs_buf_get_uncached(mp->m_logdev_targp, 1456 bp = xfs_buf_get_uncached(mp->m_logdev_targp,
1457 BTOBB(log->l_iclog_size), 0); 1457 BTOBB(log->l_iclog_size),
1458 XBF_NO_IOACCT);
1458 if (!bp) 1459 if (!bp)
1459 goto out_free_iclog; 1460 goto out_free_iclog;
1460 1461
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 80ba0c047090..b5e71072fde5 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -163,12 +163,8 @@ int xfs_log_reserve(struct xfs_mount *mp,
163 __uint8_t clientid, 163 __uint8_t clientid,
164 bool permanent); 164 bool permanent);
165int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); 165int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
166int xfs_log_unmount_write(struct xfs_mount *mp);
167void xfs_log_unmount(struct xfs_mount *mp); 166void xfs_log_unmount(struct xfs_mount *mp);
168int xfs_log_force_umount(struct xfs_mount *mp, int logerror); 167int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
169int xfs_log_need_covered(struct xfs_mount *mp);
170
171void xlog_iodone(struct xfs_buf *);
172 168
173struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); 169struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
174void xfs_log_ticket_put(struct xlog_ticket *ticket); 170void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -178,7 +174,6 @@ void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
178bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); 174bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
179 175
180void xfs_log_work_queue(struct xfs_mount *mp); 176void xfs_log_work_queue(struct xfs_mount *mp);
181void xfs_log_worker(struct work_struct *work);
182void xfs_log_quiesce(struct xfs_mount *mp); 177void xfs_log_quiesce(struct xfs_mount *mp);
183bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); 178bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
184 179
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 5e54e7955ea6..a4ab192e1792 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
78 log->l_cilp->xc_ctx->sequence = 1; 78 log->l_cilp->xc_ctx->sequence = 1;
79} 79}
80 80
81static inline int
82xlog_cil_iovec_space(
83 uint niovecs)
84{
85 return round_up((sizeof(struct xfs_log_vec) +
86 niovecs * sizeof(struct xfs_log_iovec)),
87 sizeof(uint64_t));
88}
89
90/*
91 * Allocate or pin log vector buffers for CIL insertion.
92 *
93 * The CIL currently uses disposable buffers for copying a snapshot of the
94 * modified items into the log during a push. The biggest problem with this is
95 * the requirement to allocate the disposable buffer during the commit if:
96	 * a) it does not exist; or
97 * b) it is too small
98 *
99 * If we do this allocation within xlog_cil_insert_format_items(), it is done
100 * under the xc_ctx_lock, which means that a CIL push cannot occur during
101 * the memory allocation. This means that we have a potential deadlock situation
102 * under low memory conditions when we have lots of dirty metadata pinned in
103 * the CIL and we need a CIL commit to occur to free memory.
104 *
105 * To avoid this, we need to move the memory allocation outside the
106 * xc_ctx_lock, but because the log vector buffers are disposable, that opens
107 * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
108 * vector buffers between the check and the formatting of the item into the
109 * log vector buffer within the xc_ctx_lock.
110 *
111 * Because the log vector buffer needs to be unchanged during the CIL push
112 * process, we cannot share the buffer between the transaction commit (which
113 * modifies the buffer) and the CIL push context that is writing the changes
114 * into the log. This means skipping preallocation of buffer space is
115 * unreliable, but we most definitely do not want to be allocating and freeing
116 * buffers unnecessarily during commits when overwrites can be done safely.
117 *
118 * The simplest solution to this problem is to allocate a shadow buffer when a
119 * log item is committed for the second time, and then to only use this buffer
120 * if necessary. The buffer can remain attached to the log item until such time
121 * it is needed, and this is the buffer that is reallocated to match the size of
122 * the incoming modification. Then during the formatting of the item we can swap
123 * the active buffer with the new one if we can't reuse the existing buffer. We
124 * don't free the old buffer as it may be reused on the next modification if
125 * it's size is right, otherwise we'll free and reallocate it at that point.
126 *
127 * This function builds a vector for the changes in each log item in the
128 * transaction. It then works out the length of the buffer needed for each log
129 * item, allocates them and attaches the vector to the log item in preparation
130 * for the formatting step which occurs under the xc_ctx_lock.
131 *
132 * While this means the memory footprint goes up, it avoids the repeated
133 * alloc/free pattern that repeated modifications of an item would otherwise
134 * cause, and hence minimises the CPU overhead of such behaviour.
135 */
136static void
137xlog_cil_alloc_shadow_bufs(
138 struct xlog *log,
139 struct xfs_trans *tp)
140{
141 struct xfs_log_item_desc *lidp;
142
143 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
144 struct xfs_log_item *lip = lidp->lid_item;
145 struct xfs_log_vec *lv;
146 int niovecs = 0;
147 int nbytes = 0;
148 int buf_size;
149 bool ordered = false;
150
151 /* Skip items which aren't dirty in this transaction. */
152 if (!(lidp->lid_flags & XFS_LID_DIRTY))
153 continue;
154
155 /* get number of vecs and size of data to be stored */
156 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
157
158 /*
159 * Ordered items need to be tracked but we do not wish to write
160 * them. We need a logvec to track the object, but we do not
161 * need an iovec or buffer to be allocated for copying data.
162 */
163 if (niovecs == XFS_LOG_VEC_ORDERED) {
164 ordered = true;
165 niovecs = 0;
166 nbytes = 0;
167 }
168
169 /*
170 * We 64-bit align the length of each iovec so that the start
171 * of the next one is naturally aligned. We'll need to
172 * account for that slack space here. Then round nbytes up
173 * to 64-bit alignment so that the initial buffer alignment is
174 * easy to calculate and verify.
175 */
176 nbytes += niovecs * sizeof(uint64_t);
177 nbytes = round_up(nbytes, sizeof(uint64_t));
178
179 /*
180 * The data buffer needs to start 64-bit aligned, so round up
181 * that space to ensure we can align it appropriately and not
182 * overrun the buffer.
183 */
184 buf_size = nbytes + xlog_cil_iovec_space(niovecs);
185
186 /*
187 * if we have no shadow buffer, or it is too small, we need to
188 * reallocate it.
189 */
190 if (!lip->li_lv_shadow ||
191 buf_size > lip->li_lv_shadow->lv_size) {
192
193 /*
194 * We free and allocate here as a realloc would copy
195	 * unnecessary data. We don't use kmem_zalloc() for the
196 * same reason - we don't need to zero the data area in
197 * the buffer, only the log vector header and the iovec
198 * storage.
199 */
200 kmem_free(lip->li_lv_shadow);
201
202 lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
203 memset(lv, 0, xlog_cil_iovec_space(niovecs));
204
205 lv->lv_item = lip;
206 lv->lv_size = buf_size;
207 if (ordered)
208 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
209 else
210 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
211 lip->li_lv_shadow = lv;
212 } else {
213 /* same or smaller, optimise common overwrite case */
214 lv = lip->li_lv_shadow;
215 if (ordered)
216 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
217 else
218 lv->lv_buf_len = 0;
219 lv->lv_bytes = 0;
220 lv->lv_next = NULL;
221 }
222
223 /* Ensure the lv is set up according to ->iop_size */
224 lv->lv_niovecs = niovecs;
225
226 /* The allocated data region lies beyond the iovec region */
227 lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
228 }
229
230}
231
81/* 232/*
82 * Prepare the log item for insertion into the CIL. Calculate the difference in 233 * Prepare the log item for insertion into the CIL. Calculate the difference in
83 * log space and vectors it will consume, and if it is a new item pin it as 234 * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
100 /* 251 /*
101 * If there is no old LV, this is the first time we've seen the item in 252 * If there is no old LV, this is the first time we've seen the item in
102 * this CIL context and so we need to pin it. If we are replacing the 253 * this CIL context and so we need to pin it. If we are replacing the
103 * old_lv, then remove the space it accounts for and free it. 254 * old_lv, then remove the space it accounts for and make it the shadow
255 * buffer for later freeing. In both cases we are now switching to the
256	 * shadow buffer, so update the pointer to it appropriately.
104 */ 257 */
105 if (!old_lv) 258 if (!old_lv) {
106 lv->lv_item->li_ops->iop_pin(lv->lv_item); 259 lv->lv_item->li_ops->iop_pin(lv->lv_item);
107 else if (old_lv != lv) { 260 lv->lv_item->li_lv_shadow = NULL;
261 } else if (old_lv != lv) {
108 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); 262 ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
109 263
110 *diff_len -= old_lv->lv_bytes; 264 *diff_len -= old_lv->lv_bytes;
111 *diff_iovecs -= old_lv->lv_niovecs; 265 *diff_iovecs -= old_lv->lv_niovecs;
112 kmem_free(old_lv); 266 lv->lv_item->li_lv_shadow = old_lv;
113 } 267 }
114 268
115 /* attach new log vector to log item */ 269 /* attach new log vector to log item */
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
133 * write it out asynchronously without needing to relock the object that was 287 * write it out asynchronously without needing to relock the object that was
134 * modified at the time it gets written into the iclog. 288 * modified at the time it gets written into the iclog.
135 * 289 *
136 * This function builds a vector for the changes in each log item in the 290 * This function takes the prepared log vectors attached to each log item, and
137 * transaction. It then works out the length of the buffer needed for each log 291 * formats the changes into the log vector buffer. The buffer it uses is
138 * item, allocates them and formats the vector for the item into the buffer. 292 * dependent on the current state of the vector in the CIL - the shadow lv is
139 * The buffer is then attached to the log item are then inserted into the 293 * guaranteed to be large enough for the current modification, but we will only
140 * Committed Item List for tracking until the next checkpoint is written out. 294 * use that if we can't reuse the existing lv. If we can't reuse the existing
295	 * lv, then simply swap it out for the shadow lv. We don't free it - that is
296	 * done lazily either by the next modification or the freeing of the log item.
141 * 297 *
142 * We don't set up region headers during this process; we simply copy the 298 * We don't set up region headers during this process; we simply copy the
143 * regions into the flat buffer. We can do this because we still have to do a 299 * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
170 list_for_each_entry(lidp, &tp->t_items, lid_trans) { 326 list_for_each_entry(lidp, &tp->t_items, lid_trans) {
171 struct xfs_log_item *lip = lidp->lid_item; 327 struct xfs_log_item *lip = lidp->lid_item;
172 struct xfs_log_vec *lv; 328 struct xfs_log_vec *lv;
173 struct xfs_log_vec *old_lv; 329 struct xfs_log_vec *old_lv = NULL;
174 int niovecs = 0; 330 struct xfs_log_vec *shadow;
175 int nbytes = 0;
176 int buf_size;
177 bool ordered = false; 331 bool ordered = false;
178 332
179 /* Skip items which aren't dirty in this transaction. */ 333 /* Skip items which aren't dirty in this transaction. */
180 if (!(lidp->lid_flags & XFS_LID_DIRTY)) 334 if (!(lidp->lid_flags & XFS_LID_DIRTY))
181 continue; 335 continue;
182 336
183 /* get number of vecs and size of data to be stored */
184 lip->li_ops->iop_size(lip, &niovecs, &nbytes);
185
186 /* Skip items that do not have any vectors for writing */
187 if (!niovecs)
188 continue;
189
190 /* 337 /*
191 * Ordered items need to be tracked but we do not wish to write 338 * The formatting size information is already attached to
192 * them. We need a logvec to track the object, but we do not 339 * the shadow lv on the log item.
193 * need an iovec or buffer to be allocated for copying data.
194 */ 340 */
195 if (niovecs == XFS_LOG_VEC_ORDERED) { 341 shadow = lip->li_lv_shadow;
342 if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
196 ordered = true; 343 ordered = true;
197 niovecs = 0;
198 nbytes = 0;
199 }
200 344
201 /* 345 /* Skip items that do not have any vectors for writing */
202 * We 64-bit align the length of each iovec so that the start 346 if (!shadow->lv_niovecs && !ordered)
203 * of the next one is naturally aligned. We'll need to 347 continue;
204 * account for that slack space here. Then round nbytes up
205 * to 64-bit alignment so that the initial buffer alignment is
206 * easy to calculate and verify.
207 */
208 nbytes += niovecs * sizeof(uint64_t);
209 nbytes = round_up(nbytes, sizeof(uint64_t));
210
211 /* grab the old item if it exists for reservation accounting */
212 old_lv = lip->li_lv;
213
214 /*
215 * The data buffer needs to start 64-bit aligned, so round up
216 * that space to ensure we can align it appropriately and not
217 * overrun the buffer.
218 */
219 buf_size = nbytes +
220 round_up((sizeof(struct xfs_log_vec) +
221 niovecs * sizeof(struct xfs_log_iovec)),
222 sizeof(uint64_t));
223 348
224 /* compare to existing item size */ 349 /* compare to existing item size */
225 if (lip->li_lv && buf_size <= lip->li_lv->lv_size) { 350 old_lv = lip->li_lv;
351 if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
226 /* same or smaller, optimise common overwrite case */ 352 /* same or smaller, optimise common overwrite case */
227 lv = lip->li_lv; 353 lv = lip->li_lv;
228 lv->lv_next = NULL; 354 lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
236 */ 362 */
237 *diff_iovecs -= lv->lv_niovecs; 363 *diff_iovecs -= lv->lv_niovecs;
238 *diff_len -= lv->lv_bytes; 364 *diff_len -= lv->lv_bytes;
365
366 /* Ensure the lv is set up according to ->iop_size */
367 lv->lv_niovecs = shadow->lv_niovecs;
368
369 /* reset the lv buffer information for new formatting */
370 lv->lv_buf_len = 0;
371 lv->lv_bytes = 0;
372 lv->lv_buf = (char *)lv +
373 xlog_cil_iovec_space(lv->lv_niovecs);
239 } else { 374 } else {
240 /* allocate new data chunk */ 375 /* switch to shadow buffer! */
241 lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS); 376 lv = shadow;
242 lv->lv_item = lip; 377 lv->lv_item = lip;
243 lv->lv_size = buf_size;
244 if (ordered) { 378 if (ordered) {
245 /* track as an ordered logvec */ 379 /* track as an ordered logvec */
246 ASSERT(lip->li_lv == NULL); 380 ASSERT(lip->li_lv == NULL);
247 lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
248 goto insert; 381 goto insert;
249 } 382 }
250 lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
251 } 383 }
252 384
253 /* Ensure the lv is set up according to ->iop_size */
254 lv->lv_niovecs = niovecs;
255
256 /* The allocated data region lies beyond the iovec region */
257 lv->lv_buf_len = 0;
258 lv->lv_bytes = 0;
259 lv->lv_buf = (char *)lv + buf_size - nbytes;
260 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); 385 ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
261
262 lip->li_ops->iop_format(lip, lv); 386 lip->li_ops->iop_format(lip, lv);
263insert: 387insert:
264 ASSERT(lv->lv_buf_len <= nbytes);
265 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs); 388 xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
266 } 389 }
267} 390}
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
783 struct xlog *log = mp->m_log; 906 struct xlog *log = mp->m_log;
784 struct xfs_cil *cil = log->l_cilp; 907 struct xfs_cil *cil = log->l_cilp;
785 908
909 /*
910 * Do all necessary memory allocation before we lock the CIL.
911 * This ensures the allocation does not deadlock with a CIL
912 * push in memory reclaim (e.g. from kswapd).
913 */
914 xlog_cil_alloc_shadow_bufs(log, tp);
915
786 /* lock out background commit */ 916 /* lock out background commit */
787 down_read(&cil->xc_ctx_lock); 917 down_read(&cil->xc_ctx_lock);
788 918
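The shadow-buffer change above is, at its core, a double-buffering pattern: size and allocate a standby buffer with no CIL locks held, then under xc_ctx_lock either reuse the active buffer or swap the standby in. A condensed, hypothetical sketch of that pattern follows (names invented for illustration; error handling omitted because the real code uses a sleeping, non-failing allocation).

/*
 * Hypothetical illustration of the commit-side pattern used above.
 * "item" stands in for xfs_log_item; the real code sizes the buffer
 * from ->iop_size and uses kmem_alloc(KM_SLEEP | KM_NOFS).
 */
struct item {
	void	*active;	/* buffer currently owned by the CIL push */
	void	*shadow;	/* standby buffer owned by the committer */
	size_t	 active_size;
	size_t	 shadow_size;
};

static void example_commit(struct item *it, size_t need,
		struct rw_semaphore *ctx_lock)
{
	/* step 1: allocate outside the lock so reclaim can still push the CIL */
	if (!it->shadow || it->shadow_size < need) {
		kfree(it->shadow);
		it->shadow = kmalloc(need, GFP_NOFS);
		it->shadow_size = need;
	}

	down_read(ctx_lock);
	if (it->active && it->active_size >= need) {
		/* step 2a: common overwrite case, reuse the active buffer */
	} else {
		/*
		 * step 2b: switch to the standby; the old buffer becomes the
		 * standby and is freed or reused on a later modification.
		 */
		swap(it->active, it->shadow);
		swap(it->active_size, it->shadow_size);
	}
	/* ... format the item into it->active ... */
	up_read(ctx_lock);
}

The key property is that the allocation is never made with the ctx lock held, so a CIL push triggered from memory reclaim can always make progress.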
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e39b02351b4a..970c19ba2f56 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -272,13 +272,15 @@ xfs_readsb(
272 buf_ops = NULL; 272 buf_ops = NULL;
273 273
274 /* 274 /*
275 * Allocate a (locked) buffer to hold the superblock. 275 * Allocate a (locked) buffer to hold the superblock. This will be kept
276 * This will be kept around at all times to optimize 276 * around at all times to optimize access to the superblock. Therefore,
277 * access to the superblock. 277 * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count
278 * elevated.
278 */ 279 */
279reread: 280reread:
280 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 281 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
281 BTOBB(sector_size), 0, &bp, buf_ops); 282 BTOBB(sector_size), XBF_NO_IOACCT, &bp,
283 buf_ops);
282 if (error) { 284 if (error) {
283 if (loud) 285 if (loud)
284 xfs_warn(mp, "SB validate failed with error %d.", error); 286 xfs_warn(mp, "SB validate failed with error %d.", error);
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 184c44effdd5..0cc8d8f74356 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -22,6 +22,11 @@
22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \ 22 BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
23 #structname ") is wrong, expected " #size) 23 #structname ") is wrong, expected " #size)
24 24
25#define XFS_CHECK_OFFSET(structname, member, off) \
26 BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
27 "XFS: offsetof(" #structname ", " #member ") is wrong, " \
28 "expected " #off)
29
25static inline void __init 30static inline void __init
26xfs_check_ondisk_structs(void) 31xfs_check_ondisk_structs(void)
27{ 32{
@@ -34,6 +39,8 @@ xfs_check_ondisk_structs(void)
34 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8); 39 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_key, 8);
35 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16); 40 XFS_CHECK_STRUCT_SIZE(struct xfs_bmbt_rec, 16);
36 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4); 41 XFS_CHECK_STRUCT_SIZE(struct xfs_bmdr_block, 4);
42 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_shdr, 48);
43 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block_lhdr, 64);
37 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72); 44 XFS_CHECK_STRUCT_SIZE(struct xfs_btree_block, 72);
38 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176); 45 XFS_CHECK_STRUCT_SIZE(struct xfs_dinode, 176);
39 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104); 46 XFS_CHECK_STRUCT_SIZE(struct xfs_disk_dquot, 104);
@@ -75,27 +82,39 @@ xfs_check_ondisk_structs(void)
75 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12); 82 XFS_CHECK_STRUCT_SIZE(xfs_attr_leaf_name_remote_t, 12);
76 */ 83 */
77 84
85 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, valuelen, 0);
86 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, namelen, 2);
87 XFS_CHECK_OFFSET(xfs_attr_leaf_name_local_t, nameval, 3);
88 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valueblk, 0);
89 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4);
90 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
91 XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
78 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); 92 XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40);
79 XFS_CHECK_STRUCT_SIZE(xfs_attr_shortform_t, 8); 93 XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.totsize, 0);
94 XFS_CHECK_OFFSET(xfs_attr_shortform_t, hdr.count, 2);
95 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].namelen, 4);
96 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].valuelen, 5);
97 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].flags, 6);
98 XFS_CHECK_OFFSET(xfs_attr_shortform_t, list[0].nameval, 7);
80 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12); 99 XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
81 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16); 100 XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
82 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8); 101 XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
83 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16); 102 XFS_CHECK_STRUCT_SIZE(xfs_da_node_hdr_t, 16);
84 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4); 103 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_free_t, 4);
85 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16); 104 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_hdr_t, 16);
86 XFS_CHECK_STRUCT_SIZE(xfs_dir2_data_unused_t, 6); 105 XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, freetag, 0);
106 XFS_CHECK_OFFSET(xfs_dir2_data_unused_t, length, 2);
87 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16); 107 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_hdr_t, 16);
88 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16); 108 XFS_CHECK_STRUCT_SIZE(xfs_dir2_free_t, 16);
89 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino4_t, 4);
90 XFS_CHECK_STRUCT_SIZE(xfs_dir2_ino8_t, 8);
91 XFS_CHECK_STRUCT_SIZE(xfs_dir2_inou_t, 8);
92 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8); 109 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_entry_t, 8);
93 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16); 110 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_hdr_t, 16);
94 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16); 111 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_t, 16);
95 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4); 112 XFS_CHECK_STRUCT_SIZE(xfs_dir2_leaf_tail_t, 4);
96 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3); 113 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_entry_t, 3);
114 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, namelen, 0);
115 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, offset, 1);
116 XFS_CHECK_OFFSET(xfs_dir2_sf_entry_t, name, 3);
97 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10); 117 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_hdr_t, 10);
98 XFS_CHECK_STRUCT_SIZE(xfs_dir2_sf_off_t, 2);
99 118
100 /* log structures */ 119 /* log structures */
101 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24); 120 XFS_CHECK_STRUCT_SIZE(struct xfs_dq_logformat, 24);
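These offset assertions are what allow the old-ABI ARM __arch_pack workaround to be removed in xfs_linux.h above: a compiler that padded the short-form directory or attribute structures differently would now break the build rather than corrupt the on-disk format. The same kind of check can be written with plain C11 _Static_assert; the structure below is an illustrative stand-in for xfs_dir2_sf_entry_t, not the real definition.

/*
 * Illustration only: a compile-time layout check in the spirit of
 * XFS_CHECK_OFFSET, written with C11 _Static_assert.
 */
#include <stddef.h>

struct example_sf_entry {
	unsigned char	namelen;	/* must stay at offset 0 */
	unsigned char	offset[2];	/* must stay at offset 1 */
	unsigned char	name[];		/* must stay at offset 3 */
};

_Static_assert(offsetof(struct example_sf_entry, namelen) == 0,
	       "namelen moved");
_Static_assert(offsetof(struct example_sf_entry, offset) == 1,
	       "offset field padded by the compiler");
_Static_assert(offsetof(struct example_sf_entry, name) == 3,
	       "name padded by the compiler");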
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index d5b756669fb5..0f14b2e4bf6c 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * Copyright (c) 2014 Christoph Hellwig. 2 * Copyright (c) 2014 Christoph Hellwig.
3 */ 3 */
4#include <linux/iomap.h>
4#include "xfs.h" 5#include "xfs.h"
5#include "xfs_format.h" 6#include "xfs_format.h"
6#include "xfs_log_format.h" 7#include "xfs_log_format.h"
@@ -79,32 +80,6 @@ xfs_fs_get_uuid(
79 return 0; 80 return 0;
80} 81}
81 82
82static void
83xfs_bmbt_to_iomap(
84 struct xfs_inode *ip,
85 struct iomap *iomap,
86 struct xfs_bmbt_irec *imap)
87{
88 struct xfs_mount *mp = ip->i_mount;
89
90 if (imap->br_startblock == HOLESTARTBLOCK) {
91 iomap->blkno = IOMAP_NULL_BLOCK;
92 iomap->type = IOMAP_HOLE;
93 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
94 iomap->blkno = IOMAP_NULL_BLOCK;
95 iomap->type = IOMAP_DELALLOC;
96 } else {
97 iomap->blkno =
98 XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
99 if (imap->br_state == XFS_EXT_UNWRITTEN)
100 iomap->type = IOMAP_UNWRITTEN;
101 else
102 iomap->type = IOMAP_MAPPED;
103 }
104 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
105 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
106}
107
108/* 83/*
109 * Get a layout for the pNFS client. 84 * Get a layout for the pNFS client.
110 */ 85 */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 76c0a4a9bb17..355dd9e1cb64 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -98,8 +98,6 @@ xfs_growfs_rt(
98/* 98/*
99 * From xfs_rtbitmap.c 99 * From xfs_rtbitmap.c
100 */ 100 */
101int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
102 xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
103int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp, 101int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
104 xfs_rtblock_t start, xfs_extlen_t len, int val, 102 xfs_rtblock_t start, xfs_extlen_t len, int val,
105 xfs_rtblock_t *new, int *stat); 103 xfs_rtblock_t *new, int *stat);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 11ea5d51db56..0303f1005f88 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -546,7 +546,7 @@ xfs_showargs(
546 546
547 return 0; 547 return 0;
548} 548}
549__uint64_t 549static __uint64_t
550xfs_max_file_offset( 550xfs_max_file_offset(
551 unsigned int blockshift) 551 unsigned int blockshift)
552{ 552{
@@ -1294,6 +1294,7 @@ xfs_fs_remount(
1294 */ 1294 */
1295 xfs_restore_resvblks(mp); 1295 xfs_restore_resvblks(mp);
1296 xfs_log_work_queue(mp); 1296 xfs_log_work_queue(mp);
1297 xfs_queue_eofblocks(mp);
1297 } 1298 }
1298 1299
1299 /* rw -> ro */ 1300 /* rw -> ro */
@@ -1306,6 +1307,13 @@ xfs_fs_remount(
1306 * return it to the same size. 1307 * return it to the same size.
1307 */ 1308 */
1308 xfs_save_resvblks(mp); 1309 xfs_save_resvblks(mp);
1310
1311 /*
1312 * Cancel background eofb scanning so it cannot race with the
1313 * final log force+buftarg wait and deadlock the remount.
1314 */
1315 cancel_delayed_work_sync(&mp->m_eofblocks_work);
1316
1309 xfs_quiesce_attr(mp); 1317 xfs_quiesce_attr(mp);
1310 mp->m_flags |= XFS_MOUNT_RDONLY; 1318 mp->m_flags |= XFS_MOUNT_RDONLY;
1311 } 1319 }
@@ -1565,10 +1573,6 @@ xfs_fs_fill_super(
1565 } 1573 }
1566 } 1574 }
1567 1575
1568 if (xfs_sb_version_hassparseinodes(&mp->m_sb))
1569 xfs_alert(mp,
1570 "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!");
1571
1572 error = xfs_mountfs(mp); 1576 error = xfs_mountfs(mp);
1573 if (error) 1577 if (error)
1574 goto out_filestream_unmount; 1578 goto out_filestream_unmount;
@@ -1692,8 +1696,9 @@ xfs_init_zones(void)
1692 if (!xfs_log_ticket_zone) 1696 if (!xfs_log_ticket_zone)
1693 goto out_free_ioend_bioset; 1697 goto out_free_ioend_bioset;
1694 1698
1695 xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), 1699 xfs_bmap_free_item_zone = kmem_zone_init(
1696 "xfs_bmap_free_item"); 1700 sizeof(struct xfs_bmap_free_item),
1701 "xfs_bmap_free_item");
1697 if (!xfs_bmap_free_item_zone) 1702 if (!xfs_bmap_free_item_zone)
1698 goto out_destroy_log_ticket_zone; 1703 goto out_destroy_log_ticket_zone;
1699 1704
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 2dfb1ce4585f..529bce9fc37e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -61,8 +61,6 @@ struct xfs_mount;
61struct xfs_buftarg; 61struct xfs_buftarg;
62struct block_device; 62struct block_device;
63 63
64extern __uint64_t xfs_max_file_offset(unsigned int);
65
66extern void xfs_flush_inodes(struct xfs_mount *mp); 64extern void xfs_flush_inodes(struct xfs_mount *mp);
67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 65extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
68extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *, 66extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 4c2c55086208..79cfd3fc5324 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -634,6 +634,9 @@ xfs_error_get_cfg(
634{ 634{
635 struct xfs_error_cfg *cfg; 635 struct xfs_error_cfg *cfg;
636 636
637 if (error < 0)
638 error = -error;
639
637 switch (error) { 640 switch (error) {
638 case EIO: 641 case EIO:
639 cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; 642 cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO];
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ea94ee0fe5ea..145169093fe0 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -354,6 +354,7 @@ DEFINE_BUF_EVENT(xfs_buf_submit_wait);
354DEFINE_BUF_EVENT(xfs_buf_bawrite); 354DEFINE_BUF_EVENT(xfs_buf_bawrite);
355DEFINE_BUF_EVENT(xfs_buf_lock); 355DEFINE_BUF_EVENT(xfs_buf_lock);
356DEFINE_BUF_EVENT(xfs_buf_lock_done); 356DEFINE_BUF_EVENT(xfs_buf_lock_done);
357DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
357DEFINE_BUF_EVENT(xfs_buf_trylock); 358DEFINE_BUF_EVENT(xfs_buf_trylock);
358DEFINE_BUF_EVENT(xfs_buf_unlock); 359DEFINE_BUF_EVENT(xfs_buf_unlock);
359DEFINE_BUF_EVENT(xfs_buf_iowait); 360DEFINE_BUF_EVENT(xfs_buf_iowait);
@@ -1134,15 +1135,14 @@ TRACE_EVENT(xfs_log_assign_tail_lsn,
1134) 1135)
1135 1136
1136DECLARE_EVENT_CLASS(xfs_file_class, 1137DECLARE_EVENT_CLASS(xfs_file_class,
1137 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), 1138 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset),
1138 TP_ARGS(ip, count, offset, flags), 1139 TP_ARGS(ip, count, offset),
1139 TP_STRUCT__entry( 1140 TP_STRUCT__entry(
1140 __field(dev_t, dev) 1141 __field(dev_t, dev)
1141 __field(xfs_ino_t, ino) 1142 __field(xfs_ino_t, ino)
1142 __field(xfs_fsize_t, size) 1143 __field(xfs_fsize_t, size)
1143 __field(loff_t, offset) 1144 __field(loff_t, offset)
1144 __field(size_t, count) 1145 __field(size_t, count)
1145 __field(int, flags)
1146 ), 1146 ),
1147 TP_fast_assign( 1147 TP_fast_assign(
1148 __entry->dev = VFS_I(ip)->i_sb->s_dev; 1148 __entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -1150,25 +1150,25 @@ DECLARE_EVENT_CLASS(xfs_file_class,
1150 __entry->size = ip->i_d.di_size; 1150 __entry->size = ip->i_d.di_size;
1151 __entry->offset = offset; 1151 __entry->offset = offset;
1152 __entry->count = count; 1152 __entry->count = count;
1153 __entry->flags = flags;
1154 ), 1153 ),
1155 TP_printk("dev %d:%d ino 0x%llx size 0x%llx " 1154 TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx",
1156 "offset 0x%llx count 0x%zx ioflags %s",
1157 MAJOR(__entry->dev), MINOR(__entry->dev), 1155 MAJOR(__entry->dev), MINOR(__entry->dev),
1158 __entry->ino, 1156 __entry->ino,
1159 __entry->size, 1157 __entry->size,
1160 __entry->offset, 1158 __entry->offset,
1161 __entry->count, 1159 __entry->count)
1162 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
1163) 1160)
1164 1161
1165#define DEFINE_RW_EVENT(name) \ 1162#define DEFINE_RW_EVENT(name) \
1166DEFINE_EVENT(xfs_file_class, name, \ 1163DEFINE_EVENT(xfs_file_class, name, \
1167 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ 1164 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset), \
1168 TP_ARGS(ip, count, offset, flags)) 1165 TP_ARGS(ip, count, offset))
1169DEFINE_RW_EVENT(xfs_file_read); 1166DEFINE_RW_EVENT(xfs_file_buffered_read);
1167DEFINE_RW_EVENT(xfs_file_direct_read);
1168DEFINE_RW_EVENT(xfs_file_dax_read);
1170DEFINE_RW_EVENT(xfs_file_buffered_write); 1169DEFINE_RW_EVENT(xfs_file_buffered_write);
1171DEFINE_RW_EVENT(xfs_file_direct_write); 1170DEFINE_RW_EVENT(xfs_file_direct_write);
1171DEFINE_RW_EVENT(xfs_file_dax_write);
1172DEFINE_RW_EVENT(xfs_file_splice_read); 1172DEFINE_RW_EVENT(xfs_file_splice_read);
1173 1173
1174DECLARE_EVENT_CLASS(xfs_page_class, 1174DECLARE_EVENT_CLASS(xfs_page_class,
@@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
1295DEFINE_IOMAP_EVENT(xfs_get_blocks_found); 1295DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
1296DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); 1296DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
1297DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); 1297DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct);
1298DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
1299DEFINE_IOMAP_EVENT(xfs_iomap_found);
1300DEFINE_IOMAP_EVENT(xfs_iomap_not_found);
1298 1301
1299DECLARE_EVENT_CLASS(xfs_simple_io_class, 1302DECLARE_EVENT_CLASS(xfs_simple_io_class,
1300 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), 1303 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 9a462e892e4f..9b2b9fa89331 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -52,6 +52,7 @@ typedef struct xfs_log_item {
52 /* delayed logging */ 52 /* delayed logging */
53 struct list_head li_cil; /* CIL pointers */ 53 struct list_head li_cil; /* CIL pointers */
54 struct xfs_log_vec *li_lv; /* active log vector */ 54 struct xfs_log_vec *li_lv; /* active log vector */
55 struct xfs_log_vec *li_lv_shadow; /* standby vector */
55 xfs_lsn_t li_seq; /* CIL commit seq */ 56 xfs_lsn_t li_seq; /* CIL commit seq */
56} xfs_log_item_t; 57} xfs_log_item_t;
57 58
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index d8414502edb4..b03c0625fa6e 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -6,6 +6,7 @@
6struct dentry; 6struct dentry;
7struct iattr; 7struct iattr;
8struct inode; 8struct inode;
9struct iomap;
9struct super_block; 10struct super_block;
10struct vfsmount; 11struct vfsmount;
11 12
@@ -187,21 +188,6 @@ struct fid {
187 * get_name is not (which is possibly inconsistent) 188 * get_name is not (which is possibly inconsistent)
188 */ 189 */
189 190
190/* types of block ranges for multipage write mappings. */
191#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
192#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
193#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
194#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
195
196#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
197
198struct iomap {
199 sector_t blkno; /* first sector of mapping */
200 loff_t offset; /* file offset of mapping, bytes */
201 u64 length; /* length of mapping, bytes */
202 int type; /* type of mapping */
203};
204
205struct export_operations { 191struct export_operations {
206 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len, 192 int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
207 struct inode *parent); 193 struct inode *parent);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 000000000000..3267df461012
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,70 @@
1#ifndef LINUX_IOMAP_H
2#define LINUX_IOMAP_H 1
3
4#include <linux/types.h>
5
6struct fiemap_extent_info;
7struct inode;
8struct iov_iter;
9struct kiocb;
10struct vm_area_struct;
11struct vm_fault;
12
13/*
14 * Types of block ranges for iomap mappings:
15 */
16#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
17#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
18#define IOMAP_MAPPED 0x03 /* blocks allocated @blkno */
19#define IOMAP_UNWRITTEN 0x04 /* blocks allocated @blkno in unwritten state */
20
21/*
22 * Magic value for blkno:
23 */
24#define IOMAP_NULL_BLOCK -1LL /* blkno is not valid */
25
26struct iomap {
27 sector_t blkno; /* 1st sector of mapping, 512b units */
28 loff_t offset; /* file offset of mapping, bytes */
29 u64 length; /* length of mapping, bytes */
30 int type; /* type of mapping */
31 struct block_device *bdev; /* block device for I/O */
32};
33
34/*
35 * Flags for iomap_begin / iomap_end. No flag implies a read.
36 */
37#define IOMAP_WRITE (1 << 0)
38#define IOMAP_ZERO (1 << 1)
39
40struct iomap_ops {
41 /*
42 * Return the existing mapping at pos, or reserve space starting at
43 * pos for up to length, as long as we can do it as a single mapping.
44 * The actual length is returned in iomap->length.
45 */
46 int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
47 unsigned flags, struct iomap *iomap);
48
49 /*
50 * Commit and/or unreserve space previous allocated using iomap_begin.
51 * Written indicates the length of the successful write operation which
52 * needs to be commited, while the rest needs to be unreserved.
53 * Written might be zero if no data was written.
54 */
55 int (*iomap_end)(struct inode *inode, loff_t pos, loff_t length,
56 ssize_t written, unsigned flags, struct iomap *iomap);
57};
58
59ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
60 struct iomap_ops *ops);
61int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
62 bool *did_zero, struct iomap_ops *ops);
63int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
64 struct iomap_ops *ops);
65int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
66 struct iomap_ops *ops);
67int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
68 loff_t start, loff_t len, struct iomap_ops *ops);
69
70#endif /* LINUX_IOMAP_H */
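To make the new interface concrete, here is a hedged sketch of the smallest useful iomap_ops a filesystem could provide: every file is assumed to be a single, fully allocated, contiguous extent whose first 512-byte sector is stashed in inode->i_private. No filesystem in this series works this way; the sketch only shows which fields ->iomap_begin is expected to fill in and why ->iomap_end can be omitted when there is nothing to unreserve.

/*
 * Hypothetical filesystem: every file is one contiguous, allocated
 * extent whose first 512-byte sector is stored in inode->i_private.
 * Purely illustrative; only shows the fields iomap_begin must set.
 */
static int example_iomap_begin(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap)
{
	sector_t first = (sector_t)(unsigned long)inode->i_private;

	if (pos >= i_size_read(inode)) {
		/* beyond EOF: report a hole, nothing is allocated here */
		iomap->blkno = IOMAP_NULL_BLOCK;
		iomap->type = IOMAP_HOLE;
	} else {
		/* data maps 1:1 onto disk, so just convert bytes to sectors */
		iomap->blkno = first + (pos >> 9);
		iomap->type = IOMAP_MAPPED;
	}
	iomap->offset = pos;
	iomap->length = length;
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static struct iomap_ops example_iomap_ops = {
	.iomap_begin	= example_iomap_begin,
	/* no .iomap_end: nothing was reserved, nothing to undo */
};

Such a filesystem could then implement ->fiemap as a one-liner around iomap_fiemap(inode, fieinfo, start, len, &example_iomap_ops), exactly as xfs_vn_fiemap now does with xfs_iomap_ops.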