author     Alex Elder <aelder@sgi.com>    2010-03-05 12:45:03 -0500
committer  Alex Elder <aelder@sgi.com>    2010-03-05 12:45:03 -0500
commit     9b1f56d60acfd634728f91f34922066c6f80ede6 (patch)
tree       e8f64d4bea808341f56f41e724c2920ae6b1ed26 /fs
parent     64ba9926759792cf7b95f823402e2781edd1b5d4 (diff)
parent     07000ee686cf19e853fa06f7904eff2cfe230ea3 (diff)
Merge branch 'for-2.6.34-rc1-batch2' into for-linus
Diffstat (limited to 'fs')
-rw-r--r--  fs/xfs/Makefile                |    1
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c    |  221
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c    |  854
-rw-r--r--  fs/xfs/linux-2.6/xfs_iops.c    |   10
-rw-r--r--  fs/xfs/linux-2.6/xfs_linux.h   |    1
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.c     |  796
-rw-r--r--  fs/xfs/linux-2.6/xfs_lrw.h     |   29
-rw-r--r--  fs/xfs/linux-2.6/xfs_sync.c    |   10
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.c   |   16
-rw-r--r--  fs/xfs/linux-2.6/xfs_trace.h   |   22
-rw-r--r--  fs/xfs/xfs_bmap.c              |  220
-rw-r--r--  fs/xfs/xfs_fs.h                |    3
-rw-r--r--  fs/xfs/xfs_iget.c              |   19
-rw-r--r--  fs/xfs/xfs_inode.c             |   68
-rw-r--r--  fs/xfs/xfs_inode.h             |    3
-rw-r--r--  fs/xfs/xfs_inode_item.c        |   18
-rw-r--r--  fs/xfs/xfs_itable.c            |    2
-rw-r--r--  fs/xfs/xfs_log.c               |  106
-rw-r--r--  fs/xfs/xfs_log.h               |   16
-rw-r--r--  fs/xfs/xfs_mount.c             |   69
-rw-r--r--  fs/xfs/xfs_mount.h             |    2
-rw-r--r--  fs/xfs/xfs_trans.c             |    2
-rw-r--r--  fs/xfs/xfs_trans.h             |    2
-rw-r--r--  fs/xfs/xfs_trans_buf.c         |  216
-rw-r--r--  fs/xfs/xfs_vnodeops.c          |  107
-rw-r--r--  fs/xfs/xfs_vnodeops.h          |   15
26 files changed, 1344 insertions, 1484 deletions
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 5c5a366aa332..b4769e40e8bc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -105,7 +105,6 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
105 xfs_globals.o \ 105 xfs_globals.o \
106 xfs_ioctl.o \ 106 xfs_ioctl.o \
107 xfs_iops.o \ 107 xfs_iops.o \
108 xfs_lrw.o \
109 xfs_super.o \ 108 xfs_super.o \
110 xfs_sync.o \ 109 xfs_sync.o \
111 xfs_xattr.o) 110 xfs_xattr.o)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 66abe36c1213..9083357f9e44 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -39,6 +39,7 @@
39#include "xfs_iomap.h" 39#include "xfs_iomap.h"
40#include "xfs_vnodeops.h" 40#include "xfs_vnodeops.h"
41#include "xfs_trace.h" 41#include "xfs_trace.h"
42#include "xfs_bmap.h"
42#include <linux/mpage.h> 43#include <linux/mpage.h>
43#include <linux/pagevec.h> 44#include <linux/pagevec.h>
44#include <linux/writeback.h> 45#include <linux/writeback.h>
@@ -163,14 +164,17 @@ xfs_ioend_new_eof(
163} 164}
164 165
165/* 166/*
166 * Update on-disk file size now that data has been written to disk. 167 * Update on-disk file size now that data has been written to disk. The
167 * The current in-memory file size is i_size. If a write is beyond 168 * current in-memory file size is i_size. If a write is beyond eof i_new_size
168 * eof i_new_size will be the intended file size until i_size is 169 * will be the intended file size until i_size is updated. If this write does
169 * updated. If this write does not extend all the way to the valid 170 * not extend all the way to the valid file size then restrict this update to
170 * file size then restrict this update to the end of the write. 171 * the end of the write.
172 *
173 * This function does not block as blocking on the inode lock in IO completion
 174 * can lead to IO completion order dependency deadlocks. If it can't get the
175 * inode ilock it will return EAGAIN. Callers must handle this.
171 */ 176 */
172 177STATIC int
173STATIC void
174xfs_setfilesize( 178xfs_setfilesize(
175 xfs_ioend_t *ioend) 179 xfs_ioend_t *ioend)
176{ 180{
@@ -181,16 +185,40 @@ xfs_setfilesize(
181 ASSERT(ioend->io_type != IOMAP_READ); 185 ASSERT(ioend->io_type != IOMAP_READ);
182 186
183 if (unlikely(ioend->io_error)) 187 if (unlikely(ioend->io_error))
184 return; 188 return 0;
189
190 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
191 return EAGAIN;
185 192
186 xfs_ilock(ip, XFS_ILOCK_EXCL);
187 isize = xfs_ioend_new_eof(ioend); 193 isize = xfs_ioend_new_eof(ioend);
188 if (isize) { 194 if (isize) {
189 ip->i_d.di_size = isize; 195 ip->i_d.di_size = isize;
190 xfs_mark_inode_dirty_sync(ip); 196 xfs_mark_inode_dirty(ip);
191 } 197 }
192 198
193 xfs_iunlock(ip, XFS_ILOCK_EXCL); 199 xfs_iunlock(ip, XFS_ILOCK_EXCL);
200 return 0;
201}
202
203/*
 204 * Schedule IO completion handling on an xfsdatad if this was
205 * the final hold on this ioend. If we are asked to wait,
206 * flush the workqueue.
207 */
208STATIC void
209xfs_finish_ioend(
210 xfs_ioend_t *ioend,
211 int wait)
212{
213 if (atomic_dec_and_test(&ioend->io_remaining)) {
214 struct workqueue_struct *wq;
215
216 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
217 xfsconvertd_workqueue : xfsdatad_workqueue;
218 queue_work(wq, &ioend->io_work);
219 if (wait)
220 flush_workqueue(wq);
221 }
194} 222}
195 223
196/* 224/*
@@ -198,11 +226,11 @@ xfs_setfilesize(
198 */ 226 */
199STATIC void 227STATIC void
200xfs_end_io( 228xfs_end_io(
201 struct work_struct *work) 229 struct work_struct *work)
202{ 230{
203 xfs_ioend_t *ioend = 231 xfs_ioend_t *ioend = container_of(work, xfs_ioend_t, io_work);
204 container_of(work, xfs_ioend_t, io_work); 232 struct xfs_inode *ip = XFS_I(ioend->io_inode);
205 struct xfs_inode *ip = XFS_I(ioend->io_inode); 233 int error = 0;
206 234
207 /* 235 /*
208 * For unwritten extents we need to issue transactions to convert a 236 * For unwritten extents we need to issue transactions to convert a
@@ -210,7 +238,6 @@ xfs_end_io(
210 */ 238 */
211 if (ioend->io_type == IOMAP_UNWRITTEN && 239 if (ioend->io_type == IOMAP_UNWRITTEN &&
212 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) { 240 likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
213 int error;
214 241
215 error = xfs_iomap_write_unwritten(ip, ioend->io_offset, 242 error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
216 ioend->io_size); 243 ioend->io_size);
@@ -222,30 +249,23 @@ xfs_end_io(
222 * We might have to update the on-disk file size after extending 249 * We might have to update the on-disk file size after extending
223 * writes. 250 * writes.
224 */ 251 */
225 if (ioend->io_type != IOMAP_READ) 252 if (ioend->io_type != IOMAP_READ) {
226 xfs_setfilesize(ioend); 253 error = xfs_setfilesize(ioend);
227 xfs_destroy_ioend(ioend); 254 ASSERT(!error || error == EAGAIN);
228}
229
230/*
231 * Schedule IO completion handling on a xfsdatad if this was
232 * the final hold on this ioend. If we are asked to wait,
233 * flush the workqueue.
234 */
235STATIC void
236xfs_finish_ioend(
237 xfs_ioend_t *ioend,
238 int wait)
239{
240 if (atomic_dec_and_test(&ioend->io_remaining)) {
241 struct workqueue_struct *wq;
242
243 wq = (ioend->io_type == IOMAP_UNWRITTEN) ?
244 xfsconvertd_workqueue : xfsdatad_workqueue;
245 queue_work(wq, &ioend->io_work);
246 if (wait)
247 flush_workqueue(wq);
248 } 255 }
256
257 /*
258 * If we didn't complete processing of the ioend, requeue it to the
259 * tail of the workqueue for another attempt later. Otherwise destroy
260 * it.
261 */
262 if (error == EAGAIN) {
263 atomic_inc(&ioend->io_remaining);
264 xfs_finish_ioend(ioend, 0);
265 /* ensure we don't spin on blocked ioends */
266 delay(1);
267 } else
268 xfs_destroy_ioend(ioend);
249} 269}
250 270
251/* 271/*
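
The hunk above is the behavioural core of the xfs_aops.c change: on ilock
contention the ioend is requeued for a later retry instead of blocking the
worker thread. As a minimal sketch of this trylock-or-requeue idiom, using
hypothetical names (my_item, my_wq, my_completion) rather than XFS code and
assuming the standard workqueue API of this kernel era:

#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct my_item {                        /* hypothetical example type */
	spinlock_t		lock;
	struct work_struct	work;
};

static struct workqueue_struct *my_wq;  /* assumed created elsewhere */

static void my_completion(struct work_struct *work)
{
	struct my_item *item = container_of(work, struct my_item, work);

	if (!spin_trylock(&item->lock)) {
		/* Can't make progress without blocking: requeue the item
		 * and let a later worker retry, instead of sleeping here
		 * and risking completion-order deadlocks. */
		queue_work(my_wq, &item->work);
		return;
	}
	/* ... do the real completion work under the lock ... */
	spin_unlock(&item->lock);
}
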
@@ -341,7 +361,7 @@ xfs_submit_ioend_bio(
341 * but don't update the inode size until I/O completion. 361 * but don't update the inode size until I/O completion.
342 */ 362 */
343 if (xfs_ioend_new_eof(ioend)) 363 if (xfs_ioend_new_eof(ioend))
344 xfs_mark_inode_dirty_sync(XFS_I(ioend->io_inode)); 364 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
345 365
346 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 366 submit_bio(wbc->sync_mode == WB_SYNC_ALL ?
347 WRITE_SYNC_PLUG : WRITE, bio); 367 WRITE_SYNC_PLUG : WRITE, bio);
@@ -874,6 +894,118 @@ xfs_cluster_write(
874 } 894 }
875} 895}
876 896
897STATIC void
898xfs_vm_invalidatepage(
899 struct page *page,
900 unsigned long offset)
901{
902 trace_xfs_invalidatepage(page->mapping->host, page, offset);
903 block_invalidatepage(page, offset);
904}
905
906/*
907 * If the page has delalloc buffers on it, we need to punch them out before we
908 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
909 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
910 * is done on that same region - the delalloc extent is returned when none is
911 * supposed to be there.
912 *
913 * We prevent this by truncating away the delalloc regions on the page before
914 * invalidating it. Because they are delalloc, we can do this without needing a
915 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
916 * truncation without a transaction as there is no space left for block
 917 * reservation (typically why we see an ENOSPC in writeback).
918 *
919 * This is not a performance critical path, so for now just do the punching a
920 * buffer head at a time.
921 */
922STATIC void
923xfs_aops_discard_page(
924 struct page *page)
925{
926 struct inode *inode = page->mapping->host;
927 struct xfs_inode *ip = XFS_I(inode);
928 struct buffer_head *bh, *head;
929 loff_t offset = page_offset(page);
930 ssize_t len = 1 << inode->i_blkbits;
931
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY))
933 goto out_invalidate;
934
935 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
936 "page discard on page %p, inode 0x%llx, offset %llu.",
937 page, ip->i_ino, offset);
938
939 xfs_ilock(ip, XFS_ILOCK_EXCL);
940 bh = head = page_buffers(page);
941 do {
942 int done;
943 xfs_fileoff_t offset_fsb;
944 xfs_bmbt_irec_t imap;
945 int nimaps = 1;
946 int error;
947 xfs_fsblock_t firstblock;
948 xfs_bmap_free_t flist;
949
950 if (!buffer_delay(bh))
951 goto next_buffer;
952
953 offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
954
955 /*
956 * Map the range first and check that it is a delalloc extent
957 * before trying to unmap the range. Otherwise we will be
958 * trying to remove a real extent (which requires a
959 * transaction) or a hole, which is probably a bad idea...
960 */
961 error = xfs_bmapi(NULL, ip, offset_fsb, 1,
962 XFS_BMAPI_ENTIRE, NULL, 0, &imap,
963 &nimaps, NULL, NULL);
964
965 if (error) {
966 /* something screwed, just bail */
967 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
968 "page discard failed delalloc mapping lookup.");
969 break;
970 }
971 if (!nimaps) {
972 /* nothing there */
973 goto next_buffer;
974 }
975 if (imap.br_startblock != DELAYSTARTBLOCK) {
976 /* been converted, ignore */
977 goto next_buffer;
978 }
979 WARN_ON(imap.br_blockcount == 0);
980
981 /*
982 * Note: while we initialise the firstblock/flist pair, they
983 * should never be used because blocks should never be
 984 * allocated or freed for a delalloc extent and hence we don't
 985 * need to cancel or finish them after the xfs_bunmapi() call.
986 */
987 xfs_bmap_init(&flist, &firstblock);
988 error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1, &firstblock,
989 &flist, NULL, &done);
990
991 ASSERT(!flist.xbf_count && !flist.xbf_first);
992 if (error) {
993 /* something screwed, just bail */
994 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
995 "page discard unable to remove delalloc mapping.");
996 break;
997 }
998next_buffer:
999 offset += len;
1000
1001 } while ((bh = bh->b_this_page) != head);
1002
1003 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1004out_invalidate:
1005 xfs_vm_invalidatepage(page, 0);
1006 return;
1007}
1008
877/* 1009/*
878 * Calling this without startio set means we are being asked to make a dirty 1010 * Calling this without startio set means we are being asked to make a dirty
 879 * page ready for freeing its buffers. When called with startio set then 1011
@@ -1125,7 +1257,7 @@ error:
1125 */ 1257 */
1126 if (err != -EAGAIN) { 1258 if (err != -EAGAIN) {
1127 if (!unmapped) 1259 if (!unmapped)
1128 block_invalidatepage(page, 0); 1260 xfs_aops_discard_page(page);
1129 ClearPageUptodate(page); 1261 ClearPageUptodate(page);
1130 } 1262 }
1131 return err; 1263 return err;
@@ -1535,15 +1667,6 @@ xfs_vm_readpages(
1535 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); 1667 return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1536} 1668}
1537 1669
1538STATIC void
1539xfs_vm_invalidatepage(
1540 struct page *page,
1541 unsigned long offset)
1542{
1543 trace_xfs_invalidatepage(page->mapping->host, page, offset);
1544 block_invalidatepage(page, offset);
1545}
1546
1547const struct address_space_operations xfs_address_space_operations = { 1670const struct address_space_operations xfs_address_space_operations = {
1548 .readpage = xfs_vm_readpage, 1671 .readpage = xfs_vm_readpage,
1549 .readpages = xfs_vm_readpages, 1672 .readpages = xfs_vm_readpages,
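
Taken together, the xfs_aops.c changes also harden the failed-writeback
path: xfs_aops_discard_page() punches out delalloc blocks before the page
is invalidated, so a later direct IO read cannot trip over a stale delalloc
mapping. The essential per-block sequence, condensed from the hunk above
with the same XFS calls (ip and offset as in that hunk; declarations
shortened and error reporting elided):

	xfs_fileoff_t	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
	xfs_bmbt_irec_t	imap;
	xfs_fsblock_t	firstblock;
	xfs_bmap_free_t	flist;
	int		nimaps = 1, done, error;

	/* Map the block first and confirm it is still delalloc; unmapping
	 * a real extent here would need a transaction, which may be
	 * impossible to allocate under the ENOSPC conditions that bring
	 * us down this path. */
	error = xfs_bmapi(NULL, ip, offset_fsb, 1, XFS_BMAPI_ENTIRE,
			  NULL, 0, &imap, &nimaps, NULL, NULL);
	if (!error && nimaps && imap.br_startblock == DELAYSTARTBLOCK) {
		xfs_bmap_init(&flist, &firstblock); /* initialised, never used */
		error = xfs_bunmapi(NULL, ip, offset_fsb, 1, 0, 1,
				    &firstblock, &flist, NULL, &done);
	}
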
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e4caeb28ce2e..42dd3bcfba6b 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -16,6 +16,7 @@
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_fs.h"
19#include "xfs_bit.h" 20#include "xfs_bit.h"
20#include "xfs_log.h" 21#include "xfs_log.h"
21#include "xfs_inum.h" 22#include "xfs_inum.h"
@@ -34,52 +35,279 @@
34#include "xfs_dir2_sf.h" 35#include "xfs_dir2_sf.h"
35#include "xfs_dinode.h" 36#include "xfs_dinode.h"
36#include "xfs_inode.h" 37#include "xfs_inode.h"
38#include "xfs_inode_item.h"
39#include "xfs_bmap.h"
37#include "xfs_error.h" 40#include "xfs_error.h"
38#include "xfs_rw.h" 41#include "xfs_rw.h"
39#include "xfs_vnodeops.h" 42#include "xfs_vnodeops.h"
40#include "xfs_da_btree.h" 43#include "xfs_da_btree.h"
41#include "xfs_ioctl.h" 44#include "xfs_ioctl.h"
45#include "xfs_trace.h"
42 46
43#include <linux/dcache.h> 47#include <linux/dcache.h>
44 48
45static const struct vm_operations_struct xfs_file_vm_ops; 49static const struct vm_operations_struct xfs_file_vm_ops;
46 50
47STATIC ssize_t 51/*
48xfs_file_aio_read( 52 * xfs_iozero
49 struct kiocb *iocb, 53 *
 50 const struct iovec *iov, 54 * xfs_iozero clears the specified range of the buffer supplied,
51 unsigned long nr_segs, 55 * and marks all the affected blocks as valid and modified. If
52 loff_t pos) 56 * an affected block is not allocated, it will be allocated. If
57 * an affected block is not completely overwritten, and is not
58 * valid before the operation, it will be read from disk before
59 * being partially zeroed.
60 */
61STATIC int
62xfs_iozero(
63 struct xfs_inode *ip, /* inode */
64 loff_t pos, /* offset in file */
65 size_t count) /* size of data to zero */
53{ 66{
54 struct file *file = iocb->ki_filp; 67 struct page *page;
55 int ioflags = 0; 68 struct address_space *mapping;
69 int status;
56 70
57 BUG_ON(iocb->ki_pos != pos); 71 mapping = VFS_I(ip)->i_mapping;
58 if (unlikely(file->f_flags & O_DIRECT)) 72 do {
59 ioflags |= IO_ISDIRECT; 73 unsigned offset, bytes;
60 if (file->f_mode & FMODE_NOCMTIME) 74 void *fsdata;
61 ioflags |= IO_INVIS; 75
62 return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov, 76 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
63 nr_segs, &iocb->ki_pos, ioflags); 77 bytes = PAGE_CACHE_SIZE - offset;
78 if (bytes > count)
79 bytes = count;
80
81 status = pagecache_write_begin(NULL, mapping, pos, bytes,
82 AOP_FLAG_UNINTERRUPTIBLE,
83 &page, &fsdata);
84 if (status)
85 break;
86
87 zero_user(page, offset, bytes);
88
89 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
90 page, fsdata);
91 WARN_ON(status <= 0); /* can't return less than zero! */
92 pos += bytes;
93 count -= bytes;
94 status = 0;
95 } while (count);
96
97 return (-status);
98}
99
100STATIC int
101xfs_file_fsync(
102 struct file *file,
103 struct dentry *dentry,
104 int datasync)
105{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode);
107 struct xfs_trans *tp;
108 int error = 0;
109 int log_flushed = 0;
110
111 xfs_itrace_entry(ip);
112
113 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
114 return -XFS_ERROR(EIO);
115
116 xfs_iflags_clear(ip, XFS_ITRUNCATED);
117
118 /*
119 * We always need to make sure that the required inode state is safe on
120 * disk. The inode might be clean but we still might need to force the
121 * log because of committed transactions that haven't hit the disk yet.
122 * Likewise, there could be unflushed non-transactional changes to the
123 * inode core that have to go to disk and this requires us to issue
124 * a synchronous transaction to capture these changes correctly.
125 *
126 * This code relies on the assumption that if the i_update_core field
127 * of the inode is clear and the inode is unpinned then it is clean
128 * and no action is required.
129 */
130 xfs_ilock(ip, XFS_ILOCK_SHARED);
131
132 /*
133 * First check if the VFS inode is marked dirty. All the dirtying
 134 * of non-transactional updates now goes through mark_inode_dirty*,
 135 * which allows us to distinguish between pure timestamp updates
 136 * and i_size updates which need to be caught for fdatasync.
 137 * After that, also check for the dirty state in the XFS inode, which
 138 * might get cleared when the inode gets written out via the AIL
139 * or xfs_iflush_cluster.
140 */
141 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
142 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
143 ip->i_update_core) {
144 /*
145 * Kick off a transaction to log the inode core to get the
146 * updates. The sync transaction will also force the log.
147 */
148 xfs_iunlock(ip, XFS_ILOCK_SHARED);
149 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
150 error = xfs_trans_reserve(tp, 0,
151 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
152 if (error) {
153 xfs_trans_cancel(tp, 0);
154 return -error;
155 }
156 xfs_ilock(ip, XFS_ILOCK_EXCL);
157
158 /*
159 * Note - it's possible that we might have pushed ourselves out
160 * of the way during trans_reserve which would flush the inode.
161 * But there's no guarantee that the inode buffer has actually
162 * gone out yet (it's delwri). Plus the buffer could be pinned
163 * anyway if it's part of an inode in another recent
164 * transaction. So we play it safe and fire off the
165 * transaction anyway.
166 */
167 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
168 xfs_trans_ihold(tp, ip);
169 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
170 xfs_trans_set_sync(tp);
171 error = _xfs_trans_commit(tp, 0, &log_flushed);
172
173 xfs_iunlock(ip, XFS_ILOCK_EXCL);
174 } else {
175 /*
176 * Timestamps/size haven't changed since last inode flush or
177 * inode transaction commit. That means either nothing got
178 * written or a transaction committed which caught the updates.
179 * If the latter happened and the transaction hasn't hit the
 180 * disk yet, the inode will still be pinned. If it is,
181 * force the log.
182 */
183 if (xfs_ipincount(ip)) {
184 error = _xfs_log_force_lsn(ip->i_mount,
185 ip->i_itemp->ili_last_lsn,
186 XFS_LOG_SYNC, &log_flushed);
187 }
188 xfs_iunlock(ip, XFS_ILOCK_SHARED);
189 }
190
191 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
192 /*
193 * If the log write didn't issue an ordered tag we need
194 * to flush the disk cache for the data device now.
195 */
196 if (!log_flushed)
197 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
198
199 /*
200 * If this inode is on the RT dev we need to flush that
201 * cache as well.
202 */
203 if (XFS_IS_REALTIME_INODE(ip))
204 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
205 }
206
207 return -error;
64} 208}
65 209
66STATIC ssize_t 210STATIC ssize_t
67xfs_file_aio_write( 211xfs_file_aio_read(
68 struct kiocb *iocb, 212 struct kiocb *iocb,
69 const struct iovec *iov, 213 const struct iovec *iovp,
70 unsigned long nr_segs, 214 unsigned long nr_segs,
71 loff_t pos) 215 loff_t pos)
72{ 216{
73 struct file *file = iocb->ki_filp; 217 struct file *file = iocb->ki_filp;
218 struct inode *inode = file->f_mapping->host;
219 struct xfs_inode *ip = XFS_I(inode);
220 struct xfs_mount *mp = ip->i_mount;
221 size_t size = 0;
222 ssize_t ret = 0;
74 int ioflags = 0; 223 int ioflags = 0;
224 xfs_fsize_t n;
225 unsigned long seg;
226
227 XFS_STATS_INC(xs_read_calls);
75 228
76 BUG_ON(iocb->ki_pos != pos); 229 BUG_ON(iocb->ki_pos != pos);
230
77 if (unlikely(file->f_flags & O_DIRECT)) 231 if (unlikely(file->f_flags & O_DIRECT))
78 ioflags |= IO_ISDIRECT; 232 ioflags |= IO_ISDIRECT;
79 if (file->f_mode & FMODE_NOCMTIME) 233 if (file->f_mode & FMODE_NOCMTIME)
80 ioflags |= IO_INVIS; 234 ioflags |= IO_INVIS;
81 return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs, 235
82 &iocb->ki_pos, ioflags); 236 /* START copy & waste from filemap.c */
237 for (seg = 0; seg < nr_segs; seg++) {
238 const struct iovec *iv = &iovp[seg];
239
240 /*
241 * If any segment has a negative length, or the cumulative
242 * length ever wraps negative then return -EINVAL.
243 */
244 size += iv->iov_len;
245 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
246 return XFS_ERROR(-EINVAL);
247 }
248 /* END copy & waste from filemap.c */
249
250 if (unlikely(ioflags & IO_ISDIRECT)) {
251 xfs_buftarg_t *target =
252 XFS_IS_REALTIME_INODE(ip) ?
253 mp->m_rtdev_targp : mp->m_ddev_targp;
254 if ((iocb->ki_pos & target->bt_smask) ||
255 (size & target->bt_smask)) {
256 if (iocb->ki_pos == ip->i_size)
257 return 0;
258 return -XFS_ERROR(EINVAL);
259 }
260 }
261
262 n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
263 if (n <= 0 || size == 0)
264 return 0;
265
266 if (n < size)
267 size = n;
268
269 if (XFS_FORCED_SHUTDOWN(mp))
270 return -EIO;
271
272 if (unlikely(ioflags & IO_ISDIRECT))
273 mutex_lock(&inode->i_mutex);
274 xfs_ilock(ip, XFS_IOLOCK_SHARED);
275
276 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
277 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
278 int iolock = XFS_IOLOCK_SHARED;
279
280 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, iocb->ki_pos, size,
281 dmflags, &iolock);
282 if (ret) {
283 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
284 if (unlikely(ioflags & IO_ISDIRECT))
285 mutex_unlock(&inode->i_mutex);
286 return ret;
287 }
288 }
289
290 if (unlikely(ioflags & IO_ISDIRECT)) {
291 if (inode->i_mapping->nrpages) {
292 ret = -xfs_flushinval_pages(ip,
293 (iocb->ki_pos & PAGE_CACHE_MASK),
294 -1, FI_REMAPF_LOCKED);
295 }
296 mutex_unlock(&inode->i_mutex);
297 if (ret) {
298 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
299 return ret;
300 }
301 }
302
303 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
304
305 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
306 if (ret > 0)
307 XFS_STATS_ADD(xs_read_bytes, ret);
308
309 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
310 return ret;
83} 311}
84 312
85STATIC ssize_t 313STATIC ssize_t
@@ -87,16 +315,44 @@ xfs_file_splice_read(
87 struct file *infilp, 315 struct file *infilp,
88 loff_t *ppos, 316 loff_t *ppos,
89 struct pipe_inode_info *pipe, 317 struct pipe_inode_info *pipe,
90 size_t len, 318 size_t count,
91 unsigned int flags) 319 unsigned int flags)
92{ 320{
321 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
322 struct xfs_mount *mp = ip->i_mount;
93 int ioflags = 0; 323 int ioflags = 0;
324 ssize_t ret;
325
326 XFS_STATS_INC(xs_read_calls);
94 327
95 if (infilp->f_mode & FMODE_NOCMTIME) 328 if (infilp->f_mode & FMODE_NOCMTIME)
96 ioflags |= IO_INVIS; 329 ioflags |= IO_INVIS;
97 330
98 return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode), 331 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
99 infilp, ppos, pipe, len, flags, ioflags); 332 return -EIO;
333
334 xfs_ilock(ip, XFS_IOLOCK_SHARED);
335
336 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
337 int iolock = XFS_IOLOCK_SHARED;
338 int error;
339
340 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
341 FILP_DELAY_FLAG(infilp), &iolock);
342 if (error) {
343 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
344 return -error;
345 }
346 }
347
348 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
349
350 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
351 if (ret > 0)
352 XFS_STATS_ADD(xs_read_bytes, ret);
353
354 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
355 return ret;
100} 356}
101 357
102STATIC ssize_t 358STATIC ssize_t
@@ -104,16 +360,538 @@ xfs_file_splice_write(
104 struct pipe_inode_info *pipe, 360 struct pipe_inode_info *pipe,
105 struct file *outfilp, 361 struct file *outfilp,
106 loff_t *ppos, 362 loff_t *ppos,
107 size_t len, 363 size_t count,
108 unsigned int flags) 364 unsigned int flags)
109{ 365{
366 struct inode *inode = outfilp->f_mapping->host;
367 struct xfs_inode *ip = XFS_I(inode);
368 struct xfs_mount *mp = ip->i_mount;
369 xfs_fsize_t isize, new_size;
110 int ioflags = 0; 370 int ioflags = 0;
371 ssize_t ret;
372
373 XFS_STATS_INC(xs_write_calls);
111 374
112 if (outfilp->f_mode & FMODE_NOCMTIME) 375 if (outfilp->f_mode & FMODE_NOCMTIME)
113 ioflags |= IO_INVIS; 376 ioflags |= IO_INVIS;
114 377
115 return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode), 378 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
116 pipe, outfilp, ppos, len, flags, ioflags); 379 return -EIO;
380
381 xfs_ilock(ip, XFS_IOLOCK_EXCL);
382
383 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
384 int iolock = XFS_IOLOCK_EXCL;
385 int error;
386
387 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
388 FILP_DELAY_FLAG(outfilp), &iolock);
389 if (error) {
390 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
391 return -error;
392 }
393 }
394
395 new_size = *ppos + count;
396
397 xfs_ilock(ip, XFS_ILOCK_EXCL);
398 if (new_size > ip->i_size)
399 ip->i_new_size = new_size;
400 xfs_iunlock(ip, XFS_ILOCK_EXCL);
401
402 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
403
404 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
405 if (ret > 0)
406 XFS_STATS_ADD(xs_write_bytes, ret);
407
408 isize = i_size_read(inode);
409 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
410 *ppos = isize;
411
412 if (*ppos > ip->i_size) {
413 xfs_ilock(ip, XFS_ILOCK_EXCL);
414 if (*ppos > ip->i_size)
415 ip->i_size = *ppos;
416 xfs_iunlock(ip, XFS_ILOCK_EXCL);
417 }
418
419 if (ip->i_new_size) {
420 xfs_ilock(ip, XFS_ILOCK_EXCL);
421 ip->i_new_size = 0;
422 if (ip->i_d.di_size > ip->i_size)
423 ip->i_d.di_size = ip->i_size;
424 xfs_iunlock(ip, XFS_ILOCK_EXCL);
425 }
426 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
427 return ret;
428}
429
430/*
431 * This routine is called to handle zeroing any space in the last
432 * block of the file that is beyond the EOF. We do this since the
433 * size is being increased without writing anything to that block
434 * and we don't want anyone to read the garbage on the disk.
435 */
436STATIC int /* error (positive) */
437xfs_zero_last_block(
438 xfs_inode_t *ip,
439 xfs_fsize_t offset,
440 xfs_fsize_t isize)
441{
442 xfs_fileoff_t last_fsb;
443 xfs_mount_t *mp = ip->i_mount;
444 int nimaps;
445 int zero_offset;
446 int zero_len;
447 int error = 0;
448 xfs_bmbt_irec_t imap;
449
450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
451
452 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
453 if (zero_offset == 0) {
454 /*
455 * There are no extra bytes in the last block on disk to
456 * zero, so return.
457 */
458 return 0;
459 }
460
461 last_fsb = XFS_B_TO_FSBT(mp, isize);
462 nimaps = 1;
463 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
464 &nimaps, NULL, NULL);
465 if (error) {
466 return error;
467 }
468 ASSERT(nimaps > 0);
469 /*
470 * If the block underlying isize is just a hole, then there
471 * is nothing to zero.
472 */
473 if (imap.br_startblock == HOLESTARTBLOCK) {
474 return 0;
475 }
476 /*
477 * Zero the part of the last block beyond the EOF, and write it
478 * out sync. We need to drop the ilock while we do this so we
479 * don't deadlock when the buffer cache calls back to us.
480 */
481 xfs_iunlock(ip, XFS_ILOCK_EXCL);
482
483 zero_len = mp->m_sb.sb_blocksize - zero_offset;
484 if (isize + zero_len > offset)
485 zero_len = offset - isize;
486 error = xfs_iozero(ip, isize, zero_len);
487
488 xfs_ilock(ip, XFS_ILOCK_EXCL);
489 ASSERT(error >= 0);
490 return error;
491}
492
493/*
494 * Zero any on disk space between the current EOF and the new,
495 * larger EOF. This handles the normal case of zeroing the remainder
496 * of the last block in the file and the unusual case of zeroing blocks
497 * out beyond the size of the file. This second case only happens
498 * with fixed size extents and when the system crashes before the inode
499 * size was updated but after blocks were allocated. If fill is set,
500 * then any holes in the range are filled and zeroed. If not, the holes
501 * are left alone as holes.
502 */
503
504int /* error (positive) */
505xfs_zero_eof(
506 xfs_inode_t *ip,
507 xfs_off_t offset, /* starting I/O offset */
508 xfs_fsize_t isize) /* current inode size */
509{
510 xfs_mount_t *mp = ip->i_mount;
511 xfs_fileoff_t start_zero_fsb;
512 xfs_fileoff_t end_zero_fsb;
513 xfs_fileoff_t zero_count_fsb;
514 xfs_fileoff_t last_fsb;
515 xfs_fileoff_t zero_off;
516 xfs_fsize_t zero_len;
517 int nimaps;
518 int error = 0;
519 xfs_bmbt_irec_t imap;
520
521 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
522 ASSERT(offset > isize);
523
524 /*
525 * First handle zeroing the block on which isize resides.
526 * We only zero a part of that block so it is handled specially.
527 */
528 error = xfs_zero_last_block(ip, offset, isize);
529 if (error) {
530 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
531 return error;
532 }
533
534 /*
535 * Calculate the range between the new size and the old
536 * where blocks needing to be zeroed may exist. To get the
537 * block where the last byte in the file currently resides,
538 * we need to subtract one from the size and truncate back
539 * to a block boundary. We subtract 1 in case the size is
540 * exactly on a block boundary.
541 */
542 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
543 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
544 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
545 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
546 if (last_fsb == end_zero_fsb) {
547 /*
548 * The size was only incremented on its last block.
549 * We took care of that above, so just return.
550 */
551 return 0;
552 }
553
554 ASSERT(start_zero_fsb <= end_zero_fsb);
555 while (start_zero_fsb <= end_zero_fsb) {
556 nimaps = 1;
557 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
558 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
559 0, NULL, 0, &imap, &nimaps, NULL, NULL);
560 if (error) {
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 return error;
563 }
564 ASSERT(nimaps > 0);
565
566 if (imap.br_state == XFS_EXT_UNWRITTEN ||
567 imap.br_startblock == HOLESTARTBLOCK) {
568 /*
569 * This loop handles initializing pages that were
570 * partially initialized by the code below this
571 * loop. It basically zeroes the part of the page
572 * that sits on a hole and sets the page as P_HOLE
573 * and calls remapf if it is a mapped file.
574 */
575 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
576 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
577 continue;
578 }
579
580 /*
581 * There are blocks we need to zero.
582 * Drop the inode lock while we're doing the I/O.
583 * We'll still have the iolock to protect us.
584 */
585 xfs_iunlock(ip, XFS_ILOCK_EXCL);
586
587 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
588 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
589
590 if ((zero_off + zero_len) > offset)
591 zero_len = offset - zero_off;
592
593 error = xfs_iozero(ip, zero_off, zero_len);
594 if (error) {
595 goto out_lock;
596 }
597
598 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
599 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
600
601 xfs_ilock(ip, XFS_ILOCK_EXCL);
602 }
603
604 return 0;
605
606out_lock:
607 xfs_ilock(ip, XFS_ILOCK_EXCL);
608 ASSERT(error >= 0);
609 return error;
610}
611
612STATIC ssize_t
613xfs_file_aio_write(
614 struct kiocb *iocb,
615 const struct iovec *iovp,
616 unsigned long nr_segs,
617 loff_t pos)
618{
619 struct file *file = iocb->ki_filp;
620 struct address_space *mapping = file->f_mapping;
621 struct inode *inode = mapping->host;
622 struct xfs_inode *ip = XFS_I(inode);
623 struct xfs_mount *mp = ip->i_mount;
624 ssize_t ret = 0, error = 0;
625 int ioflags = 0;
626 xfs_fsize_t isize, new_size;
627 int iolock;
628 int eventsent = 0;
629 size_t ocount = 0, count;
630 int need_i_mutex;
631
632 XFS_STATS_INC(xs_write_calls);
633
634 BUG_ON(iocb->ki_pos != pos);
635
636 if (unlikely(file->f_flags & O_DIRECT))
637 ioflags |= IO_ISDIRECT;
638 if (file->f_mode & FMODE_NOCMTIME)
639 ioflags |= IO_INVIS;
640
641 error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
642 if (error)
643 return error;
644
645 count = ocount;
646 if (count == 0)
647 return 0;
648
649 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
650
651 if (XFS_FORCED_SHUTDOWN(mp))
652 return -EIO;
653
654relock:
655 if (ioflags & IO_ISDIRECT) {
656 iolock = XFS_IOLOCK_SHARED;
657 need_i_mutex = 0;
658 } else {
659 iolock = XFS_IOLOCK_EXCL;
660 need_i_mutex = 1;
661 mutex_lock(&inode->i_mutex);
662 }
663
664 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
665
666start:
667 error = -generic_write_checks(file, &pos, &count,
668 S_ISBLK(inode->i_mode));
669 if (error) {
670 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
671 goto out_unlock_mutex;
672 }
673
674 if ((DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) &&
675 !(ioflags & IO_INVIS) && !eventsent)) {
676 int dmflags = FILP_DELAY_FLAG(file);
677
678 if (need_i_mutex)
679 dmflags |= DM_FLAGS_IMUX;
680
681 xfs_iunlock(ip, XFS_ILOCK_EXCL);
682 error = XFS_SEND_DATA(ip->i_mount, DM_EVENT_WRITE, ip,
683 pos, count, dmflags, &iolock);
684 if (error) {
685 goto out_unlock_internal;
686 }
687 xfs_ilock(ip, XFS_ILOCK_EXCL);
688 eventsent = 1;
689
690 /*
691 * The iolock was dropped and reacquired in XFS_SEND_DATA
692 * so we have to recheck the size when appending.
693 * We will only "goto start;" once, since having sent the
694 * event prevents another call to XFS_SEND_DATA, which is
695 * what allows the size to change in the first place.
696 */
697 if ((file->f_flags & O_APPEND) && pos != ip->i_size)
698 goto start;
699 }
700
701 if (ioflags & IO_ISDIRECT) {
702 xfs_buftarg_t *target =
703 XFS_IS_REALTIME_INODE(ip) ?
704 mp->m_rtdev_targp : mp->m_ddev_targp;
705
706 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
707 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
708 return XFS_ERROR(-EINVAL);
709 }
710
711 if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
712 xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
713 iolock = XFS_IOLOCK_EXCL;
714 need_i_mutex = 1;
715 mutex_lock(&inode->i_mutex);
716 xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
717 goto start;
718 }
719 }
720
721 new_size = pos + count;
722 if (new_size > ip->i_size)
723 ip->i_new_size = new_size;
724
725 if (likely(!(ioflags & IO_INVIS)))
726 file_update_time(file);
727
728 /*
729 * If the offset is beyond the size of the file, we have a couple
730 * of things to do. First, if there is already space allocated
731 * we need to either create holes or zero the disk or ...
732 *
733 * If there is a page where the previous size lands, we need
734 * to zero it out up to the new size.
735 */
736
737 if (pos > ip->i_size) {
738 error = xfs_zero_eof(ip, pos, ip->i_size);
739 if (error) {
740 xfs_iunlock(ip, XFS_ILOCK_EXCL);
741 goto out_unlock_internal;
742 }
743 }
744 xfs_iunlock(ip, XFS_ILOCK_EXCL);
745
746 /*
747 * If we're writing the file then make sure to clear the
748 * setuid and setgid bits if the process is not being run
749 * by root. This keeps people from modifying setuid and
750 * setgid binaries.
751 */
752 error = -file_remove_suid(file);
753 if (unlikely(error))
754 goto out_unlock_internal;
755
756 /* We can write back this queue in page reclaim */
757 current->backing_dev_info = mapping->backing_dev_info;
758
759 if ((ioflags & IO_ISDIRECT)) {
760 if (mapping->nrpages) {
761 WARN_ON(need_i_mutex == 0);
762 error = xfs_flushinval_pages(ip,
763 (pos & PAGE_CACHE_MASK),
764 -1, FI_REMAPF_LOCKED);
765 if (error)
766 goto out_unlock_internal;
767 }
768
769 if (need_i_mutex) {
770 /* demote the lock now the cached pages are gone */
771 xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
772 mutex_unlock(&inode->i_mutex);
773
774 iolock = XFS_IOLOCK_SHARED;
775 need_i_mutex = 0;
776 }
777
778 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
779 ret = generic_file_direct_write(iocb, iovp,
780 &nr_segs, pos, &iocb->ki_pos, count, ocount);
781
782 /*
783 * direct-io write to a hole: fall through to buffered I/O
784 * for completing the rest of the request.
785 */
786 if (ret >= 0 && ret != count) {
787 XFS_STATS_ADD(xs_write_bytes, ret);
788
789 pos += ret;
790 count -= ret;
791
792 ioflags &= ~IO_ISDIRECT;
793 xfs_iunlock(ip, iolock);
794 goto relock;
795 }
796 } else {
797 int enospc = 0;
798 ssize_t ret2 = 0;
799
800write_retry:
801 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
802 ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
803 pos, &iocb->ki_pos, count, ret);
804 /*
805 * if we just got an ENOSPC, flush the inode now we
806 * aren't holding any page locks and retry *once*
807 */
808 if (ret2 == -ENOSPC && !enospc) {
809 error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
810 if (error)
811 goto out_unlock_internal;
812 enospc = 1;
813 goto write_retry;
814 }
815 ret = ret2;
816 }
817
818 current->backing_dev_info = NULL;
819
820 isize = i_size_read(inode);
821 if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
822 iocb->ki_pos = isize;
823
824 if (iocb->ki_pos > ip->i_size) {
825 xfs_ilock(ip, XFS_ILOCK_EXCL);
826 if (iocb->ki_pos > ip->i_size)
827 ip->i_size = iocb->ki_pos;
828 xfs_iunlock(ip, XFS_ILOCK_EXCL);
829 }
830
831 if (ret == -ENOSPC &&
832 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
833 xfs_iunlock(ip, iolock);
834 if (need_i_mutex)
835 mutex_unlock(&inode->i_mutex);
836 error = XFS_SEND_NAMESP(ip->i_mount, DM_EVENT_NOSPACE, ip,
837 DM_RIGHT_NULL, ip, DM_RIGHT_NULL, NULL, NULL,
838 0, 0, 0); /* Delay flag intentionally unused */
839 if (need_i_mutex)
840 mutex_lock(&inode->i_mutex);
841 xfs_ilock(ip, iolock);
842 if (error)
843 goto out_unlock_internal;
844 goto start;
845 }
846
847 error = -ret;
848 if (ret <= 0)
849 goto out_unlock_internal;
850
851 XFS_STATS_ADD(xs_write_bytes, ret);
852
853 /* Handle various SYNC-type writes */
854 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
855 loff_t end = pos + ret - 1;
856 int error2;
857
858 xfs_iunlock(ip, iolock);
859 if (need_i_mutex)
860 mutex_unlock(&inode->i_mutex);
861
862 error2 = filemap_write_and_wait_range(mapping, pos, end);
863 if (!error)
864 error = error2;
865 if (need_i_mutex)
866 mutex_lock(&inode->i_mutex);
867 xfs_ilock(ip, iolock);
868
869 error2 = -xfs_file_fsync(file, file->f_path.dentry,
870 (file->f_flags & __O_SYNC) ? 0 : 1);
871 if (!error)
872 error = error2;
873 }
874
875 out_unlock_internal:
876 if (ip->i_new_size) {
877 xfs_ilock(ip, XFS_ILOCK_EXCL);
878 ip->i_new_size = 0;
879 /*
880 * If this was a direct or synchronous I/O that failed (such
881 * as ENOSPC) then part of the I/O may have been written to
 882 * disk before the error occurred. In this case the on-disk
883 * file size may have been adjusted beyond the in-memory file
884 * size and now needs to be truncated back.
885 */
886 if (ip->i_d.di_size > ip->i_size)
887 ip->i_d.di_size = ip->i_size;
888 xfs_iunlock(ip, XFS_ILOCK_EXCL);
889 }
890 xfs_iunlock(ip, iolock);
891 out_unlock_mutex:
892 if (need_i_mutex)
893 mutex_unlock(&inode->i_mutex);
894 return -error;
117} 895}
118 896
119STATIC int 897STATIC int
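
The buffered-write path above also picks up a retry-once-on-ENOSPC idiom:
flush delalloc pages (which can free reserved blocks) and retry the write
exactly once before giving up. A generic sketch of the same shape, hedged
with hypothetical helpers (my_buffered_write, my_flush_pages are not names
from this patch):

	static ssize_t write_with_enospc_retry(struct kiocb *iocb,
					       const struct iovec *iov,
					       unsigned long nr_segs,
					       loff_t pos)
	{
		int tried_flush = 0;
		ssize_t ret;

	retry:
		ret = my_buffered_write(iocb, iov, nr_segs, pos);
		if (ret == -ENOSPC && !tried_flush) {
			/* No page locks are held here, so it is safe to
			 * flush delayed allocations and then retry once. */
			my_flush_pages(iocb->ki_filp->f_mapping);
			tried_flush = 1;
			goto retry;
		}
		return ret;
	}
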
@@ -160,28 +938,6 @@ xfs_file_release(
160 return -xfs_release(XFS_I(inode)); 938 return -xfs_release(XFS_I(inode));
161} 939}
162 940
163/*
164 * We ignore the datasync flag here because a datasync is effectively
165 * identical to an fsync. That is, datasync implies that we need to write
166 * only the metadata needed to be able to access the data that is written
167 * if we crash after the call completes. Hence if we are writing beyond
168 * EOF we have to log the inode size change as well, which makes it a
169 * full fsync. If we don't write beyond EOF, the inode core will be
170 * clean in memory and so we don't need to log the inode, just like
171 * fsync.
172 */
173STATIC int
174xfs_file_fsync(
175 struct file *file,
176 struct dentry *dentry,
177 int datasync)
178{
179 struct xfs_inode *ip = XFS_I(dentry->d_inode);
180
181 xfs_iflags_clear(ip, XFS_ITRUNCATED);
182 return -xfs_fsync(ip);
183}
184
185STATIC int 941STATIC int
186xfs_file_readdir( 942xfs_file_readdir(
187 struct file *filp, 943 struct file *filp,
@@ -203,9 +959,9 @@ xfs_file_readdir(
203 * 959 *
204 * Try to give it an estimate that's good enough, maybe at some 960 * Try to give it an estimate that's good enough, maybe at some
205 * point we can change the ->readdir prototype to include the 961 * point we can change the ->readdir prototype to include the
206 * buffer size. 962 * buffer size. For now we use the current glibc buffer size.
207 */ 963 */
208 bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size); 964 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
209 965
210 error = xfs_readdir(ip, dirent, bufsize, 966 error = xfs_readdir(ip, dirent, bufsize,
211 (xfs_off_t *)&filp->f_pos, filldir); 967 (xfs_off_t *)&filp->f_pos, filldir);
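
The open-coded xfs_file_fsync() above replaces the old xfs_fsync() wrapper,
and its central decision reduces to a small predicate. A hypothetical
condensation for clarity (need_log_inode is not a name from the patch; the
I_DIRTY_* flags and i_update_core are used exactly as in the hunk above):

	/* Log the inode core only when the VFS says something
	 * fdatasync-relevant is dirty and the XFS inode still carries
	 * unlogged core changes. */
	static inline bool need_log_inode(struct inode *inode,
					  struct xfs_inode *ip, int datasync)
	{
		if (!ip->i_update_core)
			return false;	/* core changes already captured */
		if (inode->i_state & I_DIRTY_DATASYNC)
			return true;	/* i_size changed: always log */
		if ((inode->i_state & I_DIRTY_SYNC) && !datasync)
			return true;	/* timestamps only matter for fsync */
		return false;
	}
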
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index e8566bbf0f00..61a99608731e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -91,6 +91,16 @@ xfs_mark_inode_dirty_sync(
91 mark_inode_dirty_sync(inode); 91 mark_inode_dirty_sync(inode);
92} 92}
93 93
94void
95xfs_mark_inode_dirty(
96 xfs_inode_t *ip)
97{
98 struct inode *inode = VFS_I(ip);
99
100 if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
101 mark_inode_dirty(inode);
102}
103
94/* 104/*
95 * Change the requested timestamp in the given inode. 105 * Change the requested timestamp in the given inode.
96 * We don't lock across timestamp updates, and we don't log them but 106 * We don't lock across timestamp updates, and we don't log them but
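
The new xfs_mark_inode_dirty() helper is what ties the aops and fsync
changes together: a size-extending IO now dirties the VFS inode with
I_DIRTY_DATASYNC state (which xfs_file_fsync() checks), while the
I_WILL_FREE|I_FREEING|I_CLEAR test avoids redirtying an inode that is
already being evicted. Its call site from the xfs_aops.c hunk earlier in
this diff:

	/* From xfs_submit_ioend_bio(): flag the size change for fsync,
	 * but defer the on-disk size update itself to IO completion. */
	if (xfs_ioend_new_eof(ioend))
		xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
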
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 5af0c81ca1ae..facfb323a706 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -88,7 +88,6 @@
88#include <xfs_super.h> 88#include <xfs_super.h>
89#include <xfs_globals.h> 89#include <xfs_globals.h>
90#include <xfs_fs_subr.h> 90#include <xfs_fs_subr.h>
91#include <xfs_lrw.h>
92#include <xfs_buf.h> 91#include <xfs_buf.h>
93 92
94/* 93/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
deleted file mode 100644
index eac6f80d786d..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ /dev/null
@@ -1,796 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_dir2.h"
27#include "xfs_alloc.h"
28#include "xfs_dmapi.h"
29#include "xfs_quota.h"
30#include "xfs_mount.h"
31#include "xfs_bmap_btree.h"
32#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
34#include "xfs_dir2_sf.h"
35#include "xfs_attr_sf.h"
36#include "xfs_dinode.h"
37#include "xfs_inode.h"
38#include "xfs_bmap.h"
39#include "xfs_btree.h"
40#include "xfs_ialloc.h"
41#include "xfs_rtalloc.h"
42#include "xfs_error.h"
43#include "xfs_itable.h"
44#include "xfs_rw.h"
45#include "xfs_attr.h"
46#include "xfs_inode_item.h"
47#include "xfs_buf_item.h"
48#include "xfs_utils.h"
49#include "xfs_iomap.h"
50#include "xfs_vnodeops.h"
51#include "xfs_trace.h"
52
53#include <linux/capability.h>
54#include <linux/writeback.h>
55
56
57/*
58 * xfs_iozero
59 *
60 * xfs_iozero clears the specified range of buffer supplied,
61 * and marks all the affected blocks as valid and modified. If
62 * an affected block is not allocated, it will be allocated. If
63 * an affected block is not completely overwritten, and is not
64 * valid before the operation, it will be read from disk before
65 * being partially zeroed.
66 */
67STATIC int
68xfs_iozero(
69 struct xfs_inode *ip, /* inode */
70 loff_t pos, /* offset in file */
71 size_t count) /* size of data to zero */
72{
73 struct page *page;
74 struct address_space *mapping;
75 int status;
76
77 mapping = VFS_I(ip)->i_mapping;
78 do {
79 unsigned offset, bytes;
80 void *fsdata;
81
82 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
83 bytes = PAGE_CACHE_SIZE - offset;
84 if (bytes > count)
85 bytes = count;
86
87 status = pagecache_write_begin(NULL, mapping, pos, bytes,
88 AOP_FLAG_UNINTERRUPTIBLE,
89 &page, &fsdata);
90 if (status)
91 break;
92
93 zero_user(page, offset, bytes);
94
95 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
96 page, fsdata);
97 WARN_ON(status <= 0); /* can't return less than zero! */
98 pos += bytes;
99 count -= bytes;
100 status = 0;
101 } while (count);
102
103 return (-status);
104}
105
106ssize_t /* bytes read, or (-) error */
107xfs_read(
108 xfs_inode_t *ip,
109 struct kiocb *iocb,
110 const struct iovec *iovp,
111 unsigned int segs,
112 loff_t *offset,
113 int ioflags)
114{
115 struct file *file = iocb->ki_filp;
116 struct inode *inode = file->f_mapping->host;
117 xfs_mount_t *mp = ip->i_mount;
118 size_t size = 0;
119 ssize_t ret = 0;
120 xfs_fsize_t n;
121 unsigned long seg;
122
123
124 XFS_STATS_INC(xs_read_calls);
125
126 /* START copy & waste from filemap.c */
127 for (seg = 0; seg < segs; seg++) {
128 const struct iovec *iv = &iovp[seg];
129
130 /*
131 * If any segment has a negative length, or the cumulative
132 * length ever wraps negative then return -EINVAL.
133 */
134 size += iv->iov_len;
135 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
136 return XFS_ERROR(-EINVAL);
137 }
138 /* END copy & waste from filemap.c */
139
140 if (unlikely(ioflags & IO_ISDIRECT)) {
141 xfs_buftarg_t *target =
142 XFS_IS_REALTIME_INODE(ip) ?
143 mp->m_rtdev_targp : mp->m_ddev_targp;
144 if ((*offset & target->bt_smask) ||
145 (size & target->bt_smask)) {
146 if (*offset == ip->i_size) {
147 return (0);
148 }
149 return -XFS_ERROR(EINVAL);
150 }
151 }
152
153 n = XFS_MAXIOFFSET(mp) - *offset;
154 if ((n <= 0) || (size == 0))
155 return 0;
156
157 if (n < size)
158 size = n;
159
160 if (XFS_FORCED_SHUTDOWN(mp))
161 return -EIO;
162
163 if (unlikely(ioflags & IO_ISDIRECT))
164 mutex_lock(&inode->i_mutex);
165 xfs_ilock(ip, XFS_IOLOCK_SHARED);
166
167 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
168 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
169 int iolock = XFS_IOLOCK_SHARED;
170
171 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
172 dmflags, &iolock);
173 if (ret) {
174 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
175 if (unlikely(ioflags & IO_ISDIRECT))
176 mutex_unlock(&inode->i_mutex);
177 return ret;
178 }
179 }
180
181 if (unlikely(ioflags & IO_ISDIRECT)) {
182 if (inode->i_mapping->nrpages)
183 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
184 -1, FI_REMAPF_LOCKED);
185 mutex_unlock(&inode->i_mutex);
186 if (ret) {
187 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
188 return ret;
189 }
190 }
191
192 trace_xfs_file_read(ip, size, *offset, ioflags);
193
194 iocb->ki_pos = *offset;
195 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
196 if (ret > 0)
197 XFS_STATS_ADD(xs_read_bytes, ret);
198
199 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
200 return ret;
201}
202
203ssize_t
204xfs_splice_read(
205 xfs_inode_t *ip,
206 struct file *infilp,
207 loff_t *ppos,
208 struct pipe_inode_info *pipe,
209 size_t count,
210 int flags,
211 int ioflags)
212{
213 xfs_mount_t *mp = ip->i_mount;
214 ssize_t ret;
215
216 XFS_STATS_INC(xs_read_calls);
217 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
218 return -EIO;
219
220 xfs_ilock(ip, XFS_IOLOCK_SHARED);
221
222 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
223 int iolock = XFS_IOLOCK_SHARED;
224 int error;
225
226 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
227 FILP_DELAY_FLAG(infilp), &iolock);
228 if (error) {
229 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
230 return -error;
231 }
232 }
233
234 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
235
236 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
237 if (ret > 0)
238 XFS_STATS_ADD(xs_read_bytes, ret);
239
240 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
241 return ret;
242}
243
244ssize_t
245xfs_splice_write(
246 xfs_inode_t *ip,
247 struct pipe_inode_info *pipe,
248 struct file *outfilp,
249 loff_t *ppos,
250 size_t count,
251 int flags,
252 int ioflags)
253{
254 xfs_mount_t *mp = ip->i_mount;
255 ssize_t ret;
256 struct inode *inode = outfilp->f_mapping->host;
257 xfs_fsize_t isize, new_size;
258
259 XFS_STATS_INC(xs_write_calls);
260 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
261 return -EIO;
262
263 xfs_ilock(ip, XFS_IOLOCK_EXCL);
264
265 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
266 int iolock = XFS_IOLOCK_EXCL;
267 int error;
268
269 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
270 FILP_DELAY_FLAG(outfilp), &iolock);
271 if (error) {
272 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
273 return -error;
274 }
275 }
276
277 new_size = *ppos + count;
278
279 xfs_ilock(ip, XFS_ILOCK_EXCL);
280 if (new_size > ip->i_size)
281 ip->i_new_size = new_size;
282 xfs_iunlock(ip, XFS_ILOCK_EXCL);
283
284 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
285
286 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
287 if (ret > 0)
288 XFS_STATS_ADD(xs_write_bytes, ret);
289
290 isize = i_size_read(inode);
291 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
292 *ppos = isize;
293
294 if (*ppos > ip->i_size) {
295 xfs_ilock(ip, XFS_ILOCK_EXCL);
296 if (*ppos > ip->i_size)
297 ip->i_size = *ppos;
298 xfs_iunlock(ip, XFS_ILOCK_EXCL);
299 }
300
301 if (ip->i_new_size) {
302 xfs_ilock(ip, XFS_ILOCK_EXCL);
303 ip->i_new_size = 0;
304 if (ip->i_d.di_size > ip->i_size)
305 ip->i_d.di_size = ip->i_size;
306 xfs_iunlock(ip, XFS_ILOCK_EXCL);
307 }
308 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
309 return ret;
310}
311
312/*
313 * This routine is called to handle zeroing any space in the last
314 * block of the file that is beyond the EOF. We do this since the
315 * size is being increased without writing anything to that block
316 * and we don't want anyone to read the garbage on the disk.
317 */
318STATIC int /* error (positive) */
319xfs_zero_last_block(
320 xfs_inode_t *ip,
321 xfs_fsize_t offset,
322 xfs_fsize_t isize)
323{
324 xfs_fileoff_t last_fsb;
325 xfs_mount_t *mp = ip->i_mount;
326 int nimaps;
327 int zero_offset;
328 int zero_len;
329 int error = 0;
330 xfs_bmbt_irec_t imap;
331
332 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
333
334 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
335 if (zero_offset == 0) {
336 /*
337 * There are no extra bytes in the last block on disk to
338 * zero, so return.
339 */
340 return 0;
341 }
342
343 last_fsb = XFS_B_TO_FSBT(mp, isize);
344 nimaps = 1;
345 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
346 &nimaps, NULL, NULL);
347 if (error) {
348 return error;
349 }
350 ASSERT(nimaps > 0);
351 /*
352 * If the block underlying isize is just a hole, then there
353 * is nothing to zero.
354 */
355 if (imap.br_startblock == HOLESTARTBLOCK) {
356 return 0;
357 }
358 /*
359 * Zero the part of the last block beyond the EOF, and write it
360 * out sync. We need to drop the ilock while we do this so we
361 * don't deadlock when the buffer cache calls back to us.
362 */
363 xfs_iunlock(ip, XFS_ILOCK_EXCL);
364
365 zero_len = mp->m_sb.sb_blocksize - zero_offset;
366 if (isize + zero_len > offset)
367 zero_len = offset - isize;
368 error = xfs_iozero(ip, isize, zero_len);
369
370 xfs_ilock(ip, XFS_ILOCK_EXCL);
371 ASSERT(error >= 0);
372 return error;
373}
374
375/*
376 * Zero any on disk space between the current EOF and the new,
377 * larger EOF. This handles the normal case of zeroing the remainder
378 * of the last block in the file and the unusual case of zeroing blocks
379 * out beyond the size of the file. This second case only happens
380 * with fixed size extents and when the system crashes before the inode
381 * size was updated but after blocks were allocated. If fill is set,
382 * then any holes in the range are filled and zeroed. If not, the holes
383 * are left alone as holes.
384 */
385
386int /* error (positive) */
387xfs_zero_eof(
388 xfs_inode_t *ip,
389 xfs_off_t offset, /* starting I/O offset */
390 xfs_fsize_t isize) /* current inode size */
391{
392 xfs_mount_t *mp = ip->i_mount;
393 xfs_fileoff_t start_zero_fsb;
394 xfs_fileoff_t end_zero_fsb;
395 xfs_fileoff_t zero_count_fsb;
396 xfs_fileoff_t last_fsb;
397 xfs_fileoff_t zero_off;
398 xfs_fsize_t zero_len;
399 int nimaps;
400 int error = 0;
401 xfs_bmbt_irec_t imap;
402
403 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
404 ASSERT(offset > isize);
405
406 /*
407 * First handle zeroing the block on which isize resides.
408 * We only zero a part of that block so it is handled specially.
409 */
410 error = xfs_zero_last_block(ip, offset, isize);
411 if (error) {
412 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
413 return error;
414 }
415
416 /*
417 * Calculate the range between the new size and the old
418 * where blocks needing to be zeroed may exist. To get the
419 * block where the last byte in the file currently resides,
420 * we need to subtract one from the size and truncate back
421 * to a block boundary. We subtract 1 in case the size is
422 * exactly on a block boundary.
423 */
424 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
425 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
426 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
427 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
428 if (last_fsb == end_zero_fsb) {
429 /*
430 * The size was only incremented on its last block.
431 * We took care of that above, so just return.
432 */
433 return 0;
434 }
435
436 ASSERT(start_zero_fsb <= end_zero_fsb);
437 while (start_zero_fsb <= end_zero_fsb) {
438 nimaps = 1;
439 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
440 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
441 0, NULL, 0, &imap, &nimaps, NULL, NULL);
442 if (error) {
443 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
444 return error;
445 }
446 ASSERT(nimaps > 0);
447
448 if (imap.br_state == XFS_EXT_UNWRITTEN ||
449 imap.br_startblock == HOLESTARTBLOCK) {
450 /*
451 * This loop handles initializing pages that were
452 * partially initialized by the code below this
453 * loop. It basically zeroes the part of the page
454 * that sits on a hole and sets the page as P_HOLE
455 * and calls remapf if it is a mapped file.
456 */
457 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
458 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
459 continue;
460 }
461
462 /*
463 * There are blocks we need to zero.
464 * Drop the inode lock while we're doing the I/O.
465 * We'll still have the iolock to protect us.
466 */
467 xfs_iunlock(ip, XFS_ILOCK_EXCL);
468
469 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
470 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
471
472 if ((zero_off + zero_len) > offset)
473 zero_len = offset - zero_off;
474
475 error = xfs_iozero(ip, zero_off, zero_len);
476 if (error) {
477 goto out_lock;
478 }
479
480 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
481 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
482
483 xfs_ilock(ip, XFS_ILOCK_EXCL);
484 }
485
486 return 0;
487
488out_lock:
489 xfs_ilock(ip, XFS_ILOCK_EXCL);
490 ASSERT(error >= 0);
491 return error;
492}
493
494ssize_t /* bytes written, or (-) error */
495xfs_write(
496 struct xfs_inode *xip,
497 struct kiocb *iocb,
498 const struct iovec *iovp,
499 unsigned int nsegs,
500 loff_t *offset,
501 int ioflags)
502{
503 struct file *file = iocb->ki_filp;
504 struct address_space *mapping = file->f_mapping;
505 struct inode *inode = mapping->host;
506 unsigned long segs = nsegs;
507 xfs_mount_t *mp;
508 ssize_t ret = 0, error = 0;
509 xfs_fsize_t isize, new_size;
510 int iolock;
511 int eventsent = 0;
512 size_t ocount = 0, count;
513 loff_t pos;
514 int need_i_mutex;
515
516 XFS_STATS_INC(xs_write_calls);
517
518 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
519 if (error)
520 return error;
521
522 count = ocount;
523 pos = *offset;
524
525 if (count == 0)
526 return 0;
527
528 mp = xip->i_mount;
529
530 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
531
532 if (XFS_FORCED_SHUTDOWN(mp))
533 return -EIO;
534
535relock:
536 if (ioflags & IO_ISDIRECT) {
537 iolock = XFS_IOLOCK_SHARED;
538 need_i_mutex = 0;
539 } else {
540 iolock = XFS_IOLOCK_EXCL;
541 need_i_mutex = 1;
542 mutex_lock(&inode->i_mutex);
543 }
544
545 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
546
547start:
548 error = -generic_write_checks(file, &pos, &count,
549 S_ISBLK(inode->i_mode));
550 if (error) {
551 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
552 goto out_unlock_mutex;
553 }
554
555 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
556 !(ioflags & IO_INVIS) && !eventsent)) {
557 int dmflags = FILP_DELAY_FLAG(file);
558
559 if (need_i_mutex)
560 dmflags |= DM_FLAGS_IMUX;
561
562 xfs_iunlock(xip, XFS_ILOCK_EXCL);
563 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
564 pos, count, dmflags, &iolock);
565 if (error) {
566 goto out_unlock_internal;
567 }
568 xfs_ilock(xip, XFS_ILOCK_EXCL);
569 eventsent = 1;
570
571 /*
572 * The iolock was dropped and reacquired in XFS_SEND_DATA
573 * so we have to recheck the size when appending.
574 * We will only "goto start;" once, since having sent the
575 * event prevents another call to XFS_SEND_DATA, which is
576 * what allows the size to change in the first place.
577 */
578 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
579 goto start;
580 }
581
582 if (ioflags & IO_ISDIRECT) {
583 xfs_buftarg_t *target =
584 XFS_IS_REALTIME_INODE(xip) ?
585 mp->m_rtdev_targp : mp->m_ddev_targp;
586
587 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
588 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
589 return XFS_ERROR(-EINVAL);
590 }
591
592 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
593 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
594 iolock = XFS_IOLOCK_EXCL;
595 need_i_mutex = 1;
596 mutex_lock(&inode->i_mutex);
597 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
598 goto start;
599 }
600 }
601
602 new_size = pos + count;
603 if (new_size > xip->i_size)
604 xip->i_new_size = new_size;
605
606 if (likely(!(ioflags & IO_INVIS)))
607 file_update_time(file);
608
609	/*
610	 * If the offset is beyond the current size of the file we have
611	 * a couple of things to do.  First, if blocks are already
612	 * allocated in the range between the old EOF and this write,
613	 * they must be zeroed so that stale contents are never exposed.
614	 * Second, if a page straddles the old EOF, the portion of it
615	 * beyond the previous size must be zeroed out to the new size.
616	 */
617
618 if (pos > xip->i_size) {
619 error = xfs_zero_eof(xip, pos, xip->i_size);
620 if (error) {
621 xfs_iunlock(xip, XFS_ILOCK_EXCL);
622 goto out_unlock_internal;
623 }
624 }
625 xfs_iunlock(xip, XFS_ILOCK_EXCL);
626
627 /*
628 * If we're writing the file then make sure to clear the
629 * setuid and setgid bits if the process is not being run
630 * by root. This keeps people from modifying setuid and
631 * setgid binaries.
632 */
633 error = -file_remove_suid(file);
634 if (unlikely(error))
635 goto out_unlock_internal;
636
637 /* We can write back this queue in page reclaim */
638 current->backing_dev_info = mapping->backing_dev_info;
639
640 if ((ioflags & IO_ISDIRECT)) {
641 if (mapping->nrpages) {
642 WARN_ON(need_i_mutex == 0);
643 error = xfs_flushinval_pages(xip,
644 (pos & PAGE_CACHE_MASK),
645 -1, FI_REMAPF_LOCKED);
646 if (error)
647 goto out_unlock_internal;
648 }
649
650 if (need_i_mutex) {
651 /* demote the lock now the cached pages are gone */
652 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
653 mutex_unlock(&inode->i_mutex);
654
655 iolock = XFS_IOLOCK_SHARED;
656 need_i_mutex = 0;
657 }
658
659 trace_xfs_file_direct_write(xip, count, *offset, ioflags);
660 ret = generic_file_direct_write(iocb, iovp,
661 &segs, pos, offset, count, ocount);
662
663 /*
664 * direct-io write to a hole: fall through to buffered I/O
665 * for completing the rest of the request.
666 */
667 if (ret >= 0 && ret != count) {
668 XFS_STATS_ADD(xs_write_bytes, ret);
669
670 pos += ret;
671 count -= ret;
672
673 ioflags &= ~IO_ISDIRECT;
674 xfs_iunlock(xip, iolock);
675 goto relock;
676 }
677 } else {
678 int enospc = 0;
679 ssize_t ret2 = 0;
680
681write_retry:
682 trace_xfs_file_buffered_write(xip, count, *offset, ioflags);
683 ret2 = generic_file_buffered_write(iocb, iovp, segs,
684 pos, offset, count, ret);
685 /*
686 * if we just got an ENOSPC, flush the inode now we
687 * aren't holding any page locks and retry *once*
688 */
689 if (ret2 == -ENOSPC && !enospc) {
690 error = xfs_flush_pages(xip, 0, -1, 0, FI_NONE);
691 if (error)
692 goto out_unlock_internal;
693 enospc = 1;
694 goto write_retry;
695 }
696 ret = ret2;
697 }
698
699 current->backing_dev_info = NULL;
700
701 isize = i_size_read(inode);
702 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
703 *offset = isize;
704
705 if (*offset > xip->i_size) {
706 xfs_ilock(xip, XFS_ILOCK_EXCL);
707 if (*offset > xip->i_size)
708 xip->i_size = *offset;
709 xfs_iunlock(xip, XFS_ILOCK_EXCL);
710 }
711
712 if (ret == -ENOSPC &&
713 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
714 xfs_iunlock(xip, iolock);
715 if (need_i_mutex)
716 mutex_unlock(&inode->i_mutex);
717 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
718 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
719 0, 0, 0); /* Delay flag intentionally unused */
720 if (need_i_mutex)
721 mutex_lock(&inode->i_mutex);
722 xfs_ilock(xip, iolock);
723 if (error)
724 goto out_unlock_internal;
725 goto start;
726 }
727
728 error = -ret;
729 if (ret <= 0)
730 goto out_unlock_internal;
731
732 XFS_STATS_ADD(xs_write_bytes, ret);
733
734 /* Handle various SYNC-type writes */
735 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
736 loff_t end = pos + ret - 1;
737 int error2;
738
739 xfs_iunlock(xip, iolock);
740 if (need_i_mutex)
741 mutex_unlock(&inode->i_mutex);
742
743 error2 = filemap_write_and_wait_range(mapping, pos, end);
744 if (!error)
745 error = error2;
746 if (need_i_mutex)
747 mutex_lock(&inode->i_mutex);
748 xfs_ilock(xip, iolock);
749
750 error2 = xfs_fsync(xip);
751 if (!error)
752 error = error2;
753 }
754
755 out_unlock_internal:
756 if (xip->i_new_size) {
757 xfs_ilock(xip, XFS_ILOCK_EXCL);
758 xip->i_new_size = 0;
759 /*
760 * If this was a direct or synchronous I/O that failed (such
761 * as ENOSPC) then part of the I/O may have been written to
762	 * disk before the error occurred. In this case the on-disk
763 * file size may have been adjusted beyond the in-memory file
764 * size and now needs to be truncated back.
765 */
766 if (xip->i_d.di_size > xip->i_size)
767 xip->i_d.di_size = xip->i_size;
768 xfs_iunlock(xip, XFS_ILOCK_EXCL);
769 }
770 xfs_iunlock(xip, iolock);
771 out_unlock_mutex:
772 if (need_i_mutex)
773 mutex_unlock(&inode->i_mutex);
774 return -error;
775}
776
777/*
778 * If the underlying (data/log/rt) device is readonly, there are some
779 * operations that cannot proceed.
780 */
781int
782xfs_dev_is_read_only(
783 xfs_mount_t *mp,
784 char *message)
785{
786 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
787 xfs_readonly_buftarg(mp->m_logdev_targp) ||
788 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
789 cmn_err(CE_NOTE,
790 "XFS: %s required on read-only device.", message);
791 cmn_err(CE_NOTE,
792 "XFS: write access unavailable, cannot proceed.");
793 return EROFS;
794 }
795 return 0;
796}
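A hypothetical caller sketch (the wrapper function below is illustrative,
not part of this patch): write-requiring operations check the device
state up front and propagate the positive EROFS per XFS convention.

STATIC int
xfs_example_modify(
	struct xfs_mount	*mp)
{
	int			error;

	error = xfs_dev_is_read_only(mp, "example modification");
	if (error)
		return error;		/* EROFS */
	/* ... safe to dirty the filesystem from here on ... */
	return 0;
}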
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
deleted file mode 100644
index 342ae8c0d011..000000000000
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ /dev/null
@@ -1,29 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_LRW_H__
19#define __XFS_LRW_H__
20
21struct xfs_mount;
22struct xfs_inode;
23struct xfs_buf;
24
25extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
26
27extern int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
28
29#endif /* __XFS_LRW_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a9f6d20aff41..05cd85317f6f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -607,7 +607,8 @@ xfssyncd(
607 set_freezable(); 607 set_freezable();
608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); 608 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
609 for (;;) { 609 for (;;) {
610 timeleft = schedule_timeout_interruptible(timeleft); 610 if (list_empty(&mp->m_sync_list))
611 timeleft = schedule_timeout_interruptible(timeleft);
611 /* swsusp */ 612 /* swsusp */
612 try_to_freeze(); 613 try_to_freeze();
613 if (kthread_should_stop() && list_empty(&mp->m_sync_list)) 614 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
@@ -627,8 +628,7 @@ xfssyncd(
627 list_add_tail(&mp->m_sync_work.w_list, 628 list_add_tail(&mp->m_sync_work.w_list,
628 &mp->m_sync_list); 629 &mp->m_sync_list);
629 } 630 }
630 list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list) 631 list_splice_init(&mp->m_sync_list, &tmp);
631 list_move(&work->w_list, &tmp);
632 spin_unlock(&mp->m_sync_lock); 632 spin_unlock(&mp->m_sync_lock);
633 633
634 list_for_each_entry_safe(work, n, &tmp, w_list) { 634 list_for_each_entry_safe(work, n, &tmp, w_list) {
@@ -688,12 +688,12 @@ xfs_inode_set_reclaim_tag(
688 struct xfs_perag *pag; 688 struct xfs_perag *pag;
689 689
690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 690 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
691 read_lock(&pag->pag_ici_lock); 691 write_lock(&pag->pag_ici_lock);
692 spin_lock(&ip->i_flags_lock); 692 spin_lock(&ip->i_flags_lock);
693 __xfs_inode_set_reclaim_tag(pag, ip); 693 __xfs_inode_set_reclaim_tag(pag, ip);
694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 694 __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
695 spin_unlock(&ip->i_flags_lock); 695 spin_unlock(&ip->i_flags_lock);
696 read_unlock(&pag->pag_ici_lock); 696 write_unlock(&pag->pag_ici_lock);
697 xfs_perag_put(pag); 697 xfs_perag_put(pag);
698} 698}
699 699
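The xfssyncd change above replaces a per-entry list_move() walk under
m_sync_lock with a single O(1) splice. A self-contained userspace model of
list_splice_init() semantics (a simplified re-implementation for
illustration, not the kernel header):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

/* move the whole chain in O(1), then reinitialise the source list */
static void list_splice_init(struct list_head *list, struct list_head *head)
{
	if (!list_empty(list)) {
		struct list_head *first = list->next, *last = list->prev;

		first->prev = head;
		last->next = head->next;
		head->next->prev = last;
		head->next = first;
		INIT_LIST_HEAD(list);
	}
}

struct work { struct list_head w_list; int id; };

int main(void)
{
	struct list_head sync_list, tmp;
	struct work w[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
	struct list_head *p;

	INIT_LIST_HEAD(&sync_list);
	INIT_LIST_HEAD(&tmp);
	for (int i = 0; i < 3; i++)
		list_add_tail(&w[i].w_list, &sync_list);

	list_splice_init(&sync_list, &tmp);	/* one shot under the lock */

	printf("sync_list empty: %d\n", list_empty(&sync_list));
	for (p = tmp.next; p != &tmp; p = p->next)
		printf("work %d\n", ((struct work *)p)->id);
	return 0;
}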
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 856eb3c8d605..5a107601e969 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -52,22 +52,6 @@
52#include "quota/xfs_dquot.h" 52#include "quota/xfs_dquot.h"
53 53
54/* 54/*
55 * Format fsblock number into a static buffer & return it.
56 */
57STATIC char *xfs_fmtfsblock(xfs_fsblock_t bno)
58{
59 static char rval[50];
60
61 if (bno == NULLFSBLOCK)
62 sprintf(rval, "NULLFSBLOCK");
63 else if (isnullstartblock(bno))
64 sprintf(rval, "NULLSTARTBLOCK(%lld)", startblockval(bno));
65 else
66 sprintf(rval, "%lld", (xfs_dfsbno_t)bno);
67 return rval;
68}
69
70/*
71 * We include this last to have the helpers above available for the trace 55 * We include this last to have the helpers above available for the trace
72 * event implementations. 56 * event implementations.
73 */ 57 */
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index a4574dcf5065..fcaa62f0799e 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -197,13 +197,13 @@ TRACE_EVENT(xfs_iext_insert,
197 __entry->caller_ip = caller_ip; 197 __entry->caller_ip = caller_ip;
198 ), 198 ),
199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 199 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
200 "offset %lld block %s count %lld flag %d caller %pf", 200 "offset %lld block %lld count %lld flag %d caller %pf",
201 MAJOR(__entry->dev), MINOR(__entry->dev), 201 MAJOR(__entry->dev), MINOR(__entry->dev),
202 __entry->ino, 202 __entry->ino,
203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 203 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
204 (long)__entry->idx, 204 (long)__entry->idx,
205 __entry->startoff, 205 __entry->startoff,
206 xfs_fmtfsblock(__entry->startblock), 206 (__int64_t)__entry->startblock,
207 __entry->blockcount, 207 __entry->blockcount,
208 __entry->state, 208 __entry->state,
209 (char *)__entry->caller_ip) 209 (char *)__entry->caller_ip)
@@ -241,13 +241,13 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
241 __entry->caller_ip = caller_ip; 241 __entry->caller_ip = caller_ip;
242 ), 242 ),
243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " 243 TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
244 "offset %lld block %s count %lld flag %d caller %pf", 244 "offset %lld block %lld count %lld flag %d caller %pf",
245 MAJOR(__entry->dev), MINOR(__entry->dev), 245 MAJOR(__entry->dev), MINOR(__entry->dev),
246 __entry->ino, 246 __entry->ino,
247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), 247 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
248 (long)__entry->idx, 248 (long)__entry->idx,
249 __entry->startoff, 249 __entry->startoff,
250 xfs_fmtfsblock(__entry->startblock), 250 (__int64_t)__entry->startblock,
251 __entry->blockcount, 251 __entry->blockcount,
252 __entry->state, 252 __entry->state,
253 (char *)__entry->caller_ip) 253 (char *)__entry->caller_ip)
@@ -593,7 +593,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
593 TP_ARGS(dqp), 593 TP_ARGS(dqp),
594 TP_STRUCT__entry( 594 TP_STRUCT__entry(
595 __field(dev_t, dev) 595 __field(dev_t, dev)
596 __field(__be32, id) 596 __field(u32, id)
597 __field(unsigned, flags) 597 __field(unsigned, flags)
598 __field(unsigned, nrefs) 598 __field(unsigned, nrefs)
599 __field(unsigned long long, res_bcount) 599 __field(unsigned long long, res_bcount)
@@ -606,7 +606,7 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
606 ), \ 606 ), \
607 TP_fast_assign( 607 TP_fast_assign(
608 __entry->dev = dqp->q_mount->m_super->s_dev; 608 __entry->dev = dqp->q_mount->m_super->s_dev;
609 __entry->id = dqp->q_core.d_id; 609 __entry->id = be32_to_cpu(dqp->q_core.d_id);
610 __entry->flags = dqp->dq_flags; 610 __entry->flags = dqp->dq_flags;
611 __entry->nrefs = dqp->q_nrefs; 611 __entry->nrefs = dqp->q_nrefs;
612 __entry->res_bcount = dqp->q_res_bcount; 612 __entry->res_bcount = dqp->q_res_bcount;
@@ -622,10 +622,10 @@ DECLARE_EVENT_CLASS(xfs_dquot_class,
622 be64_to_cpu(dqp->q_core.d_ino_softlimit); 622 be64_to_cpu(dqp->q_core.d_ino_softlimit);
623 ), 623 ),
624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " 624 TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
625 "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " 625 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
626 "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", 626 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx",
627 MAJOR(__entry->dev), MINOR(__entry->dev), 627 MAJOR(__entry->dev), MINOR(__entry->dev),
628 be32_to_cpu(__entry->id), 628 __entry->id,
629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), 629 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
630 __entry->nrefs, 630 __entry->nrefs,
631 __entry->res_bcount, 631 __entry->res_bcount,
@@ -881,7 +881,7 @@ TRACE_EVENT(name, \
881 ), \ 881 ), \
882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 882 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \
883 "offset 0x%llx count %zd flags %s " \ 883 "offset 0x%llx count %zd flags %s " \
884 "startoff 0x%llx startblock %s blockcount 0x%llx", \ 884 "startoff 0x%llx startblock %lld blockcount 0x%llx", \
885 MAJOR(__entry->dev), MINOR(__entry->dev), \ 885 MAJOR(__entry->dev), MINOR(__entry->dev), \
886 __entry->ino, \ 886 __entry->ino, \
887 __entry->size, \ 887 __entry->size, \
@@ -890,7 +890,7 @@ TRACE_EVENT(name, \
890 __entry->count, \ 890 __entry->count, \
891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 891 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \
892 __entry->startoff, \ 892 __entry->startoff, \
893 xfs_fmtfsblock(__entry->startblock), \ 893 (__int64_t)__entry->startblock, \
894 __entry->blockcount) \ 894 __entry->blockcount) \
895) 895)
896DEFINE_IOMAP_EVENT(xfs_iomap_enter); 896DEFINE_IOMAP_EVENT(xfs_iomap_enter);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 1869fb973819..5c11e4d17010 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2550,22 +2550,134 @@ xfs_bmap_rtalloc(
2550} 2550}
2551 2551
2552STATIC int 2552STATIC int
2553xfs_bmap_btalloc_nullfb(
2554 struct xfs_bmalloca *ap,
2555 struct xfs_alloc_arg *args,
2556 xfs_extlen_t *blen)
2557{
2558 struct xfs_mount *mp = ap->ip->i_mount;
2559 struct xfs_perag *pag;
2560 xfs_agnumber_t ag, startag;
2561 int notinit = 0;
2562 int error;
2563
2564 if (ap->userdata && xfs_inode_is_filestream(ap->ip))
2565 args->type = XFS_ALLOCTYPE_NEAR_BNO;
2566 else
2567 args->type = XFS_ALLOCTYPE_START_BNO;
2568 args->total = ap->total;
2569
2570 /*
2571 * Search for an allocation group with a single extent large enough
2572 * for the request. If one isn't found, then adjust the minimum
2573 * allocation size to the largest space found.
2574 */
2575 startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
2576 if (startag == NULLAGNUMBER)
2577 startag = ag = 0;
2578
2579 pag = xfs_perag_get(mp, ag);
2580 while (*blen < ap->alen) {
2581 if (!pag->pagf_init) {
2582 error = xfs_alloc_pagf_init(mp, args->tp, ag,
2583 XFS_ALLOC_FLAG_TRYLOCK);
2584 if (error) {
2585 xfs_perag_put(pag);
2586 return error;
2587 }
2588 }
2589
2590 /*
2591 * See xfs_alloc_fix_freelist...
2592 */
2593 if (pag->pagf_init) {
2594 xfs_extlen_t longest;
2595 longest = xfs_alloc_longest_free_extent(mp, pag);
2596 if (*blen < longest)
2597 *blen = longest;
2598 } else
2599 notinit = 1;
2600
2601 if (xfs_inode_is_filestream(ap->ip)) {
2602 if (*blen >= ap->alen)
2603 break;
2604
2605 if (ap->userdata) {
2606 /*
2607 * If startag is an invalid AG, we've
2608 * come here once before and
2609 * xfs_filestream_new_ag picked the
2610 * best currently available.
2611 *
2612 * Don't continue looping, since we
2613 * could loop forever.
2614 */
2615 if (startag == NULLAGNUMBER)
2616 break;
2617
2618 error = xfs_filestream_new_ag(ap, &ag);
2619 xfs_perag_put(pag);
2620 if (error)
2621 return error;
2622
2623 /* loop again to set 'blen' */
2624 startag = NULLAGNUMBER;
2625 pag = xfs_perag_get(mp, ag);
2626 continue;
2627 }
2628 }
2629 if (++ag == mp->m_sb.sb_agcount)
2630 ag = 0;
2631 if (ag == startag)
2632 break;
2633 xfs_perag_put(pag);
2634 pag = xfs_perag_get(mp, ag);
2635 }
2636 xfs_perag_put(pag);
2637
2638 /*
2639	 * Since the above loop read the AG freespace data with a
2640	 * trylock, skipped AGs may still have space for this request.
2641 */
2642 if (notinit || *blen < ap->minlen)
2643 args->minlen = ap->minlen;
2644 /*
2645 * If the best seen length is less than the request
2646 * length, use the best as the minimum.
2647 */
2648 else if (*blen < ap->alen)
2649 args->minlen = *blen;
2650 /*
2651 * Otherwise we've seen an extent as big as alen,
2652 * use that as the minimum.
2653 */
2654 else
2655 args->minlen = ap->alen;
2656
2657 /*
2658 * set the failure fallback case to look in the selected
2659 * AG as the stream may have moved.
2660 */
2661 if (xfs_inode_is_filestream(ap->ip))
2662 ap->rval = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2663
2664 return 0;
2665}
2666
2667STATIC int
2553xfs_bmap_btalloc( 2668xfs_bmap_btalloc(
2554 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 2669 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2555{ 2670{
2556 xfs_mount_t *mp; /* mount point structure */ 2671 xfs_mount_t *mp; /* mount point structure */
2557 xfs_alloctype_t atype = 0; /* type for allocation routines */ 2672 xfs_alloctype_t atype = 0; /* type for allocation routines */
2558 xfs_extlen_t align; /* minimum allocation alignment */ 2673 xfs_extlen_t align; /* minimum allocation alignment */
2559 xfs_agnumber_t ag;
2560 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */ 2674 xfs_agnumber_t fb_agno; /* ag number of ap->firstblock */
2561 xfs_agnumber_t startag; 2675 xfs_agnumber_t ag;
2562 xfs_alloc_arg_t args; 2676 xfs_alloc_arg_t args;
2563 xfs_extlen_t blen; 2677 xfs_extlen_t blen;
2564 xfs_extlen_t nextminlen = 0; 2678 xfs_extlen_t nextminlen = 0;
2565 xfs_perag_t *pag;
2566 int nullfb; /* true if ap->firstblock isn't set */ 2679 int nullfb; /* true if ap->firstblock isn't set */
2567 int isaligned; 2680 int isaligned;
2568 int notinit;
2569 int tryagain; 2681 int tryagain;
2570 int error; 2682 int error;
2571 2683
@@ -2612,103 +2724,9 @@ xfs_bmap_btalloc(
2612 args.firstblock = ap->firstblock; 2724 args.firstblock = ap->firstblock;
2613 blen = 0; 2725 blen = 0;
2614 if (nullfb) { 2726 if (nullfb) {
2615 if (ap->userdata && xfs_inode_is_filestream(ap->ip)) 2727 error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
2616 args.type = XFS_ALLOCTYPE_NEAR_BNO; 2728 if (error)
2617 else 2729 return error;
2618 args.type = XFS_ALLOCTYPE_START_BNO;
2619 args.total = ap->total;
2620
2621 /*
2622 * Search for an allocation group with a single extent
2623 * large enough for the request.
2624 *
2625 * If one isn't found, then adjust the minimum allocation
2626 * size to the largest space found.
2627 */
2628 startag = ag = XFS_FSB_TO_AGNO(mp, args.fsbno);
2629 if (startag == NULLAGNUMBER)
2630 startag = ag = 0;
2631 notinit = 0;
2632 pag = xfs_perag_get(mp, ag);
2633 while (blen < ap->alen) {
2634 if (!pag->pagf_init &&
2635 (error = xfs_alloc_pagf_init(mp, args.tp,
2636 ag, XFS_ALLOC_FLAG_TRYLOCK))) {
2637 xfs_perag_put(pag);
2638 return error;
2639 }
2640 /*
2641 * See xfs_alloc_fix_freelist...
2642 */
2643 if (pag->pagf_init) {
2644 xfs_extlen_t longest;
2645 longest = xfs_alloc_longest_free_extent(mp, pag);
2646 if (blen < longest)
2647 blen = longest;
2648 } else
2649 notinit = 1;
2650
2651 if (xfs_inode_is_filestream(ap->ip)) {
2652 if (blen >= ap->alen)
2653 break;
2654
2655 if (ap->userdata) {
2656 /*
2657 * If startag is an invalid AG, we've
2658 * come here once before and
2659 * xfs_filestream_new_ag picked the
2660 * best currently available.
2661 *
2662 * Don't continue looping, since we
2663 * could loop forever.
2664 */
2665 if (startag == NULLAGNUMBER)
2666 break;
2667
2668 error = xfs_filestream_new_ag(ap, &ag);
2669 xfs_perag_put(pag);
2670 if (error)
2671 return error;
2672
2673 /* loop again to set 'blen'*/
2674 startag = NULLAGNUMBER;
2675 pag = xfs_perag_get(mp, ag);
2676 continue;
2677 }
2678 }
2679 if (++ag == mp->m_sb.sb_agcount)
2680 ag = 0;
2681 if (ag == startag)
2682 break;
2683 xfs_perag_put(pag);
2684 pag = xfs_perag_get(mp, ag);
2685 }
2686 xfs_perag_put(pag);
2687 /*
2688 * Since the above loop did a BUF_TRYLOCK, it is
2689 * possible that there is space for this request.
2690 */
2691 if (notinit || blen < ap->minlen)
2692 args.minlen = ap->minlen;
2693 /*
2694 * If the best seen length is less than the request
2695 * length, use the best as the minimum.
2696 */
2697 else if (blen < ap->alen)
2698 args.minlen = blen;
2699 /*
2700 * Otherwise we've seen an extent as big as alen,
2701 * use that as the minimum.
2702 */
2703 else
2704 args.minlen = ap->alen;
2705
2706 /*
2707 * set the failure fallback case to look in the selected
2708 * AG as the stream may have moved.
2709 */
2710 if (xfs_inode_is_filestream(ap->ip))
2711 ap->rval = args.fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
2712 } else if (ap->low) { 2730 } else if (ap->low) {
2713 if (xfs_inode_is_filestream(ap->ip)) 2731 if (xfs_inode_is_filestream(ap->ip))
2714 args.type = XFS_ALLOCTYPE_FIRST_AG; 2732 args.type = XFS_ALLOCTYPE_FIRST_AG;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index f52ac276277e..7cf7220e7d5f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -292,7 +292,8 @@ typedef struct xfs_bstat {
292 __s32 bs_extents; /* number of extents */ 292 __s32 bs_extents; /* number of extents */
293 __u32 bs_gen; /* generation count */ 293 __u32 bs_gen; /* generation count */
294 __u16 bs_projid; /* project id */ 294 __u16 bs_projid; /* project id */
295 unsigned char bs_pad[14]; /* pad space, unused */ 295 __u16 bs_forkoff; /* inode fork offset in bytes */
296 unsigned char bs_pad[12]; /* pad space, unused */
296 __u32 bs_dmevmask; /* DMIG event mask */ 297 __u32 bs_dmevmask; /* DMIG event mask */
297 __u16 bs_dmstate; /* DMIG state info */ 298 __u16 bs_dmstate; /* DMIG state info */
298 __u16 bs_aextents; /* attribute number of extents */ 299 __u16 bs_aextents; /* attribute number of extents */
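The new bs_forkoff field is carved out of the existing padding, so the
ioctl structure size is unchanged. A runnable check of that invariant on a
reduced hypothetical struct containing only the affected tail members:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

struct old_tail { uint16_t bs_projid; unsigned char bs_pad[14]; };
struct new_tail { uint16_t bs_projid; uint16_t bs_forkoff;
		  unsigned char bs_pad[12]; };

int main(void)
{
	/* 2 bytes move from the pad into the new __u16 field */
	assert(sizeof(struct old_tail) == sizeof(struct new_tail));
	printf("tail size unchanged: %zu bytes\n", sizeof(struct new_tail));
	return 0;
}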
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e281eb4a1c49..6845db90818f 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -190,13 +190,12 @@ xfs_iget_cache_hit(
190 trace_xfs_iget_reclaim(ip); 190 trace_xfs_iget_reclaim(ip);
191 191
192 /* 192 /*
193 * We need to set XFS_INEW atomically with clearing the 193 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
194 * reclaimable tag so that we do have an indicator of the 194 * from stomping over us while we recycle the inode. We can't
195 * inode still being initialized. 195 * clear the radix tree reclaimable tag yet as it requires
196 * pag_ici_lock to be held exclusive.
196 */ 197 */
197 ip->i_flags |= XFS_INEW; 198 ip->i_flags |= XFS_IRECLAIM;
198 ip->i_flags &= ~XFS_IRECLAIMABLE;
199 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
200 199
201 spin_unlock(&ip->i_flags_lock); 200 spin_unlock(&ip->i_flags_lock);
202 read_unlock(&pag->pag_ici_lock); 201 read_unlock(&pag->pag_ici_lock);
@@ -216,7 +215,15 @@ xfs_iget_cache_hit(
216 trace_xfs_iget_reclaim(ip); 215 trace_xfs_iget_reclaim(ip);
217 goto out_error; 216 goto out_error;
218 } 217 }
218
219 write_lock(&pag->pag_ici_lock);
220 spin_lock(&ip->i_flags_lock);
221 ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
222 ip->i_flags |= XFS_INEW;
223 __xfs_inode_clear_reclaim_tag(mp, pag, ip);
219 inode->i_state = I_NEW; 224 inode->i_state = I_NEW;
225 spin_unlock(&ip->i_flags_lock);
226 write_unlock(&pag->pag_ici_lock);
220 } else { 227 } else {
221 /* If the VFS inode is being torn down, pause and try again. */ 228 /* If the VFS inode is being torn down, pause and try again. */
222 if (!igrab(inode)) { 229 if (!igrab(inode)) {
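Read together, the two hunks above form a two-phase recycle protocol. A
schematic of the resulting ordering, in comment form; all flags and helpers
named here appear in the diff itself:

/*
 * Phase 1, under i_flags_lock only:
 *	ip->i_flags |= XFS_IRECLAIM;	(fends off xfs_reclaim_inode)
 *
 * ...drop the locks and re-initialise the VFS inode (may sleep)...
 *
 * Phase 2, with pag_ici_lock held exclusive, which tag clearing needs:
 *	ip->i_flags &= ~(XFS_IRECLAIMABLE | XFS_IRECLAIM);
 *	ip->i_flags |= XFS_INEW;
 *	__xfs_inode_clear_reclaim_tag(mp, pag, ip);
 */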
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index fa31360046d4..0ffd56447045 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2439,75 +2439,31 @@ xfs_idestroy_fork(
2439} 2439}
2440 2440
2441/* 2441/*
2442 * Increment the pin count of the given buffer. 2442 * This is called to unpin an inode. The caller must have the inode locked
2443 * This value is protected by ipinlock spinlock in the mount structure. 2443 * in at least shared mode so that the buffer cannot be subsequently pinned
2444 * once someone is waiting for it to be unpinned.
2444 */ 2445 */
2445void 2446static void
2446xfs_ipin( 2447xfs_iunpin_nowait(
2447 xfs_inode_t *ip) 2448 struct xfs_inode *ip)
2448{
2449 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2450
2451 atomic_inc(&ip->i_pincount);
2452}
2453
2454/*
2455 * Decrement the pin count of the given inode, and wake up
2456 * anyone in xfs_iwait_unpin() if the count goes to 0. The
2457 * inode must have been previously pinned with a call to xfs_ipin().
2458 */
2459void
2460xfs_iunpin(
2461 xfs_inode_t *ip)
2462{
2463 ASSERT(atomic_read(&ip->i_pincount) > 0);
2464
2465 if (atomic_dec_and_test(&ip->i_pincount))
2466 wake_up(&ip->i_ipin_wait);
2467}
2468
2469/*
2470 * This is called to unpin an inode. It can be directed to wait or to return
2471 * immediately without waiting for the inode to be unpinned. The caller must
2472 * have the inode locked in at least shared mode so that the buffer cannot be
2473 * subsequently pinned once someone is waiting for it to be unpinned.
2474 */
2475STATIC void
2476__xfs_iunpin_wait(
2477 xfs_inode_t *ip,
2478 int wait)
2479{ 2449{
2480 xfs_inode_log_item_t *iip = ip->i_itemp;
2481
2482 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); 2450 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2483 if (atomic_read(&ip->i_pincount) == 0)
2484 return;
2485 2451
2486 /* Give the log a push to start the unpinning I/O */ 2452 /* Give the log a push to start the unpinning I/O */
2487 if (iip && iip->ili_last_lsn) 2453 xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2488 xfs_log_force_lsn(ip->i_mount, iip->ili_last_lsn, 0);
2489 else
2490 xfs_log_force(ip->i_mount, 0);
2491 2454
2492 if (wait)
2493 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2494} 2455}
2495 2456
2496void 2457void
2497xfs_iunpin_wait( 2458xfs_iunpin_wait(
2498 xfs_inode_t *ip) 2459 struct xfs_inode *ip)
2499{ 2460{
2500 __xfs_iunpin_wait(ip, 1); 2461 if (xfs_ipincount(ip)) {
2501} 2462 xfs_iunpin_nowait(ip);
2502 2463 wait_event(ip->i_ipin_wait, (xfs_ipincount(ip) == 0));
2503static inline void 2464 }
2504xfs_iunpin_nowait(
2505 xfs_inode_t *ip)
2506{
2507 __xfs_iunpin_wait(ip, 0);
2508} 2465}
2509 2466
2510
2511/* 2467/*
2512 * xfs_iextents_copy() 2468 * xfs_iextents_copy()
2513 * 2469 *
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6c912b027596..9965e40a4615 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -471,8 +471,6 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
471int xfs_iunlink(struct xfs_trans *, xfs_inode_t *); 471int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
472 472
473void xfs_iext_realloc(xfs_inode_t *, int, int); 473void xfs_iext_realloc(xfs_inode_t *, int, int);
474void xfs_ipin(xfs_inode_t *);
475void xfs_iunpin(xfs_inode_t *);
476void xfs_iunpin_wait(xfs_inode_t *); 474void xfs_iunpin_wait(xfs_inode_t *);
477int xfs_iflush(xfs_inode_t *, uint); 475int xfs_iflush(xfs_inode_t *, uint);
478void xfs_ichgtime(xfs_inode_t *, int); 476void xfs_ichgtime(xfs_inode_t *, int);
@@ -480,6 +478,7 @@ void xfs_lock_inodes(xfs_inode_t **, int, uint);
480void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 478void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
481 479
482void xfs_synchronize_times(xfs_inode_t *); 480void xfs_synchronize_times(xfs_inode_t *);
481void xfs_mark_inode_dirty(xfs_inode_t *);
483void xfs_mark_inode_dirty_sync(xfs_inode_t *); 482void xfs_mark_inode_dirty_sync(xfs_inode_t *);
484 483
485#define IHOLD(ip) \ 484#define IHOLD(ip) \
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d4dc063111f8..7bfea8540159 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -535,23 +535,23 @@ xfs_inode_item_format(
535 535
536/* 536/*
537 * This is called to pin the inode associated with the inode log 537 * This is called to pin the inode associated with the inode log
538 * item in memory so it cannot be written out. Do this by calling 538 * item in memory so it cannot be written out.
539 * xfs_ipin() to bump the pin count in the inode while holding the
540 * inode pin lock.
541 */ 539 */
542STATIC void 540STATIC void
543xfs_inode_item_pin( 541xfs_inode_item_pin(
544 xfs_inode_log_item_t *iip) 542 xfs_inode_log_item_t *iip)
545{ 543{
546 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); 544 ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL));
547 xfs_ipin(iip->ili_inode); 545
546 atomic_inc(&iip->ili_inode->i_pincount);
548} 547}
549 548
550 549
551/* 550/*
552 * This is called to unpin the inode associated with the inode log 551 * This is called to unpin the inode associated with the inode log
553 * item which was previously pinned with a call to xfs_inode_item_pin(). 552 * item which was previously pinned with a call to xfs_inode_item_pin().
554 * Just call xfs_iunpin() on the inode to do this. 553 *
554 * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
555 */ 555 */
556/* ARGSUSED */ 556/* ARGSUSED */
557STATIC void 557STATIC void
@@ -559,7 +559,11 @@ xfs_inode_item_unpin(
559 xfs_inode_log_item_t *iip, 559 xfs_inode_log_item_t *iip,
560 int stale) 560 int stale)
561{ 561{
562 xfs_iunpin(iip->ili_inode); 562 struct xfs_inode *ip = iip->ili_inode;
563
564 ASSERT(atomic_read(&ip->i_pincount) > 0);
565 if (atomic_dec_and_test(&ip->i_pincount))
566 wake_up(&ip->i_ipin_wait);
563} 567}
564 568
565/* ARGSUSED */ 569/* ARGSUSED */
@@ -568,7 +572,7 @@ xfs_inode_item_unpin_remove(
568 xfs_inode_log_item_t *iip, 572 xfs_inode_log_item_t *iip,
569 xfs_trans_t *tp) 573 xfs_trans_t *tp)
570{ 574{
571 xfs_iunpin(iip->ili_inode); 575 xfs_inode_item_unpin(iip, 0);
572} 576}
573 577
574/* 578/*
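The pin/unpin pair above reduces to a bare reference count with a wakeup at
zero. A runnable userspace model of just the counting protocol (C11 atomics
stand in for the kernel's atomic_t; the wake-up is modelled as a printf):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int pincount;

static void item_pin(void)		/* xfs_inode_item_pin() analogue */
{
	atomic_fetch_add(&pincount, 1);
}

static void item_unpin(void)		/* xfs_inode_item_unpin() analogue */
{
	/* fetch_sub returning 1 means this call dropped the last pin */
	if (atomic_fetch_sub(&pincount, 1) == 1)
		printf("pincount reached zero: wake i_ipin_wait sleepers\n");
}

int main(void)
{
	item_pin();
	item_pin();
	item_unpin();	/* still pinned, no wakeup */
	item_unpin();	/* last pin gone, wakeup fires */
	return 0;
}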
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 3af02314c605..b1b801e4a28e 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -106,6 +106,7 @@ xfs_bulkstat_one_iget(
106 buf->bs_dmevmask = dic->di_dmevmask; 106 buf->bs_dmevmask = dic->di_dmevmask;
107 buf->bs_dmstate = dic->di_dmstate; 107 buf->bs_dmstate = dic->di_dmstate;
108 buf->bs_aextents = dic->di_anextents; 108 buf->bs_aextents = dic->di_anextents;
109 buf->bs_forkoff = XFS_IFORK_BOFF(ip);
109 110
110 switch (dic->di_format) { 111 switch (dic->di_format) {
111 case XFS_DINODE_FMT_DEV: 112 case XFS_DINODE_FMT_DEV:
@@ -176,6 +177,7 @@ xfs_bulkstat_one_dinode(
176 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask); 177 buf->bs_dmevmask = be32_to_cpu(dic->di_dmevmask);
177 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate); 178 buf->bs_dmstate = be16_to_cpu(dic->di_dmstate);
178 buf->bs_aextents = be16_to_cpu(dic->di_anextents); 179 buf->bs_aextents = be16_to_cpu(dic->di_anextents);
180 buf->bs_forkoff = XFS_DFORK_BOFF(dic);
179 181
180 switch (dic->di_format) { 182 switch (dic->di_format) {
181 case XFS_DINODE_FMT_DEV: 183 case XFS_DINODE_FMT_DEV:
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4f16be4b6ee5..e8fba92d7cd9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -60,7 +60,7 @@ STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 60STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
61STATIC void xlog_dealloc_log(xlog_t *log); 61STATIC void xlog_dealloc_log(xlog_t *log);
62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], 62STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[],
63 int nentries, xfs_log_ticket_t tic, 63 int nentries, struct xlog_ticket *tic,
64 xfs_lsn_t *start_lsn, 64 xfs_lsn_t *start_lsn,
65 xlog_in_core_t **commit_iclog, 65 xlog_in_core_t **commit_iclog,
66 uint flags); 66 uint flags);
@@ -243,14 +243,14 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
243 * out when the next write occurs. 243 * out when the next write occurs.
244 */ 244 */
245xfs_lsn_t 245xfs_lsn_t
246xfs_log_done(xfs_mount_t *mp, 246xfs_log_done(
247 xfs_log_ticket_t xtic, 247 struct xfs_mount *mp,
248 void **iclog, 248 struct xlog_ticket *ticket,
249 uint flags) 249 struct xlog_in_core **iclog,
250 uint flags)
250{ 251{
251 xlog_t *log = mp->m_log; 252 struct log *log = mp->m_log;
252 xlog_ticket_t *ticket = (xfs_log_ticket_t) xtic; 253 xfs_lsn_t lsn = 0;
253 xfs_lsn_t lsn = 0;
254 254
255 if (XLOG_FORCED_SHUTDOWN(log) || 255 if (XLOG_FORCED_SHUTDOWN(log) ||
256 /* 256 /*
@@ -258,8 +258,7 @@ xfs_log_done(xfs_mount_t *mp,
258 * If we get an error, just continue and give back the log ticket. 258 * If we get an error, just continue and give back the log ticket.
259 */ 259 */
260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) && 260 (((ticket->t_flags & XLOG_TIC_INITED) == 0) &&
261 (xlog_commit_record(mp, ticket, 261 (xlog_commit_record(mp, ticket, iclog, &lsn)))) {
262 (xlog_in_core_t **)iclog, &lsn)))) {
263 lsn = (xfs_lsn_t) -1; 262 lsn = (xfs_lsn_t) -1;
264 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { 263 if (ticket->t_flags & XLOG_TIC_PERM_RESERV) {
265 flags |= XFS_LOG_REL_PERM_RESERV; 264 flags |= XFS_LOG_REL_PERM_RESERV;
@@ -289,7 +288,7 @@ xfs_log_done(xfs_mount_t *mp,
289 } 288 }
290 289
291 return lsn; 290 return lsn;
292} /* xfs_log_done */ 291}
293 292
294/* 293/*
295 * Attaches a new iclog I/O completion callback routine during 294 * Attaches a new iclog I/O completion callback routine during
@@ -298,11 +297,11 @@ xfs_log_done(xfs_mount_t *mp,
298 * executing the callback at an appropriate time. 297 * executing the callback at an appropriate time.
299 */ 298 */
300int 299int
301xfs_log_notify(xfs_mount_t *mp, /* mount of partition */ 300xfs_log_notify(
302 void *iclog_hndl, /* iclog to hang callback off */ 301 struct xfs_mount *mp,
303 xfs_log_callback_t *cb) 302 struct xlog_in_core *iclog,
303 xfs_log_callback_t *cb)
304{ 304{
305 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
306 int abortflg; 305 int abortflg;
307 306
308 spin_lock(&iclog->ic_callback_lock); 307 spin_lock(&iclog->ic_callback_lock);
@@ -316,16 +315,14 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
316 } 315 }
317 spin_unlock(&iclog->ic_callback_lock); 316 spin_unlock(&iclog->ic_callback_lock);
318 return abortflg; 317 return abortflg;
319} /* xfs_log_notify */ 318}
320 319
321int 320int
322xfs_log_release_iclog(xfs_mount_t *mp, 321xfs_log_release_iclog(
323 void *iclog_hndl) 322 struct xfs_mount *mp,
323 struct xlog_in_core *iclog)
324{ 324{
325 xlog_t *log = mp->m_log; 325 if (xlog_state_release_iclog(mp->m_log, iclog)) {
326 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
327
328 if (xlog_state_release_iclog(log, iclog)) {
329 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 326 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
330 return EIO; 327 return EIO;
331 } 328 }
@@ -344,17 +341,18 @@ xfs_log_release_iclog(xfs_mount_t *mp,
344 * reservation, we prevent over allocation problems. 341 * reservation, we prevent over allocation problems.
345 */ 342 */
346int 343int
347xfs_log_reserve(xfs_mount_t *mp, 344xfs_log_reserve(
348 int unit_bytes, 345 struct xfs_mount *mp,
349 int cnt, 346 int unit_bytes,
350 xfs_log_ticket_t *ticket, 347 int cnt,
351 __uint8_t client, 348 struct xlog_ticket **ticket,
352 uint flags, 349 __uint8_t client,
353 uint t_type) 350 uint flags,
351 uint t_type)
354{ 352{
355 xlog_t *log = mp->m_log; 353 struct log *log = mp->m_log;
356 xlog_ticket_t *internal_ticket; 354 struct xlog_ticket *internal_ticket;
357 int retval = 0; 355 int retval = 0;
358 356
359 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 357 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
360 ASSERT((flags & XFS_LOG_NOSLEEP) == 0); 358 ASSERT((flags & XFS_LOG_NOSLEEP) == 0);
@@ -367,7 +365,7 @@ xfs_log_reserve(xfs_mount_t *mp,
367 365
368 if (*ticket != NULL) { 366 if (*ticket != NULL) {
369 ASSERT(flags & XFS_LOG_PERM_RESERV); 367 ASSERT(flags & XFS_LOG_PERM_RESERV);
370 internal_ticket = (xlog_ticket_t *)*ticket; 368 internal_ticket = *ticket;
371 369
372 trace_xfs_log_reserve(log, internal_ticket); 370 trace_xfs_log_reserve(log, internal_ticket);
373 371
@@ -519,7 +517,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
519 xlog_in_core_t *first_iclog; 517 xlog_in_core_t *first_iclog;
520#endif 518#endif
521 xfs_log_iovec_t reg[1]; 519 xfs_log_iovec_t reg[1];
522 xfs_log_ticket_t tic = NULL; 520 xlog_ticket_t *tic = NULL;
523 xfs_lsn_t lsn; 521 xfs_lsn_t lsn;
524 int error; 522 int error;
525 523
@@ -656,24 +654,24 @@ xfs_log_unmount(xfs_mount_t *mp)
656 * transaction occur with one call to xfs_log_write(). 654 * transaction occur with one call to xfs_log_write().
657 */ 655 */
658int 656int
659xfs_log_write(xfs_mount_t * mp, 657xfs_log_write(
660 xfs_log_iovec_t reg[], 658 struct xfs_mount *mp,
661 int nentries, 659 struct xfs_log_iovec reg[],
662 xfs_log_ticket_t tic, 660 int nentries,
663 xfs_lsn_t *start_lsn) 661 struct xlog_ticket *tic,
662 xfs_lsn_t *start_lsn)
664{ 663{
665 int error; 664 struct log *log = mp->m_log;
666 xlog_t *log = mp->m_log; 665 int error;
667 666
668 if (XLOG_FORCED_SHUTDOWN(log)) 667 if (XLOG_FORCED_SHUTDOWN(log))
669 return XFS_ERROR(EIO); 668 return XFS_ERROR(EIO);
670 669
671 if ((error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0))) { 670 error = xlog_write(mp, reg, nentries, tic, start_lsn, NULL, 0);
671 if (error)
672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 672 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
673 }
674 return error; 673 return error;
675} /* xfs_log_write */ 674}
676
677 675
678void 676void
679xfs_log_move_tail(xfs_mount_t *mp, 677xfs_log_move_tail(xfs_mount_t *mp,
@@ -1642,16 +1640,16 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1642 * bytes have been written out. 1640 * bytes have been written out.
1643 */ 1641 */
1644STATIC int 1642STATIC int
1645xlog_write(xfs_mount_t * mp, 1643xlog_write(
1646 xfs_log_iovec_t reg[], 1644 struct xfs_mount *mp,
1647 int nentries, 1645 struct xfs_log_iovec reg[],
1648 xfs_log_ticket_t tic, 1646 int nentries,
1649 xfs_lsn_t *start_lsn, 1647 struct xlog_ticket *ticket,
1650 xlog_in_core_t **commit_iclog, 1648 xfs_lsn_t *start_lsn,
1651 uint flags) 1649 struct xlog_in_core **commit_iclog,
1650 uint flags)
1652{ 1651{
1653 xlog_t *log = mp->m_log; 1652 xlog_t *log = mp->m_log;
1654 xlog_ticket_t *ticket = (xlog_ticket_t *)tic;
1655 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ 1653 xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */
1656 xlog_op_header_t *logop_head; /* ptr to log operation header */ 1654 xlog_op_header_t *logop_head; /* ptr to log operation header */
1657 __psint_t ptr; /* copy address into data region */ 1655 __psint_t ptr; /* copy address into data region */
@@ -1765,7 +1763,7 @@ xlog_write(xfs_mount_t * mp,
1765 default: 1763 default:
1766 xfs_fs_cmn_err(CE_WARN, mp, 1764 xfs_fs_cmn_err(CE_WARN, mp,
1767 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1765 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1768 logop_head->oh_clientid, tic); 1766 logop_head->oh_clientid, ticket);
1769 return XFS_ERROR(EIO); 1767 return XFS_ERROR(EIO);
1770 } 1768 }
1771 1769
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 7074be9d13e9..97a24c7795a4 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -110,8 +110,6 @@ typedef struct xfs_log_iovec {
110 uint i_type; /* type of region */ 110 uint i_type; /* type of region */
111} xfs_log_iovec_t; 111} xfs_log_iovec_t;
112 112
113typedef void* xfs_log_ticket_t;
114
115/* 113/*
116 * Structure used to pass callback function and the function's argument 114 * Structure used to pass callback function and the function's argument
117 * to the log manager. 115 * to the log manager.
@@ -126,10 +124,12 @@ typedef struct xfs_log_callback {
126#ifdef __KERNEL__ 124#ifdef __KERNEL__
127/* Log manager interfaces */ 125/* Log manager interfaces */
128struct xfs_mount; 126struct xfs_mount;
127struct xlog_in_core;
129struct xlog_ticket; 128struct xlog_ticket;
129
130xfs_lsn_t xfs_log_done(struct xfs_mount *mp, 130xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
131 xfs_log_ticket_t ticket, 131 struct xlog_ticket *ticket,
132 void **iclog, 132 struct xlog_in_core **iclog,
133 uint flags); 133 uint flags);
134int _xfs_log_force(struct xfs_mount *mp, 134int _xfs_log_force(struct xfs_mount *mp,
135 uint flags, 135 uint flags,
@@ -151,21 +151,21 @@ int xfs_log_mount_finish(struct xfs_mount *mp);
151void xfs_log_move_tail(struct xfs_mount *mp, 151void xfs_log_move_tail(struct xfs_mount *mp,
152 xfs_lsn_t tail_lsn); 152 xfs_lsn_t tail_lsn);
153int xfs_log_notify(struct xfs_mount *mp, 153int xfs_log_notify(struct xfs_mount *mp,
154 void *iclog, 154 struct xlog_in_core *iclog,
155 xfs_log_callback_t *callback_entry); 155 xfs_log_callback_t *callback_entry);
156int xfs_log_release_iclog(struct xfs_mount *mp, 156int xfs_log_release_iclog(struct xfs_mount *mp,
157 void *iclog_hndl); 157 struct xlog_in_core *iclog);
158int xfs_log_reserve(struct xfs_mount *mp, 158int xfs_log_reserve(struct xfs_mount *mp,
159 int length, 159 int length,
160 int count, 160 int count,
161 xfs_log_ticket_t *ticket, 161 struct xlog_ticket **ticket,
162 __uint8_t clientid, 162 __uint8_t clientid,
163 uint flags, 163 uint flags,
164 uint t_type); 164 uint t_type);
165int xfs_log_write(struct xfs_mount *mp, 165int xfs_log_write(struct xfs_mount *mp,
166 xfs_log_iovec_t region[], 166 xfs_log_iovec_t region[],
167 int nentries, 167 int nentries,
168 xfs_log_ticket_t ticket, 168 struct xlog_ticket *ticket,
169 xfs_lsn_t *start_lsn); 169 xfs_lsn_t *start_lsn);
170int xfs_log_unmount_write(struct xfs_mount *mp); 170int xfs_log_unmount_write(struct xfs_mount *mp);
171void xfs_log_unmount(struct xfs_mount *mp); 171void xfs_log_unmount(struct xfs_mount *mp);
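The theme of the xfs_log changes is replacing the void* xfs_log_ticket_t
handle with a forward-declared struct pointer: still opaque to callers, but
type-checked. A runnable toy showing what the compiler now catches (the
struct contents here are invented purely for the demo):

#include <stdio.h>

struct xlog_ticket { int t_unit_res; };		/* demo layout only */
struct xlog_in_core { int ic_size; };		/* demo layout only */

static void log_done(struct xlog_ticket *ticket)
{
	printf("ticket unit_res %d\n", ticket->t_unit_res);
}

int main(void)
{
	struct xlog_ticket tic = { 128 };
	struct xlog_in_core iclog = { 32768 };

	log_done(&tic);		/* fine */
	/* log_done(&iclog) is now a compile error; with a void* handle
	 * this mix-up compiled silently and corrupted state at run time
	 */
	(void)iclog;
	return 0;
}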
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6afaaeb2950a..e79b56b4bca6 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1097,13 +1097,15 @@ xfs_default_resblks(xfs_mount_t *mp)
1097 __uint64_t resblks; 1097 __uint64_t resblks;
1098 1098
1099 /* 1099 /*
1100 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. 1100 * We default to 5% or 8192 fsbs of space reserved, whichever is
1101 * This may drive us straight to ENOSPC on mount, but that implies 1101 * smaller. This is intended to cover concurrent allocation
1102 * we were already there on the last unmount. Warn if this occurs. 1102 * transactions when we initially hit ENOSPC. These each require a 4
1103 * block reservation. Hence by default we cover roughly 2000 concurrent
1104 * allocation reservations.
1103 */ 1105 */
1104 resblks = mp->m_sb.sb_dblocks; 1106 resblks = mp->m_sb.sb_dblocks;
1105 do_div(resblks, 20); 1107 do_div(resblks, 20);
1106 resblks = min_t(__uint64_t, resblks, 1024); 1108 resblks = min_t(__uint64_t, resblks, 8192);
1107 return resblks; 1109 return resblks;
1108} 1110}
1109 1111
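The new default works out to min(5% of data blocks, 8192 fsbs); at four
blocks per allocation transaction that is roughly 2000 reservations.
Runnable arithmetic check (the block counts are example inputs):

#include <stdio.h>
#include <stdint.h>

static uint64_t default_resblks(uint64_t dblocks)
{
	uint64_t resblks = dblocks / 20;	/* 5%, the do_div(..., 20) */

	return resblks < 8192 ? resblks : 8192;
}

int main(void)
{
	/* 1 TiB of 4 KiB blocks: capped at 8192 (~2048 4-block trans) */
	printf("%llu\n", (unsigned long long)default_resblks(1ULL << 28));
	/* tiny fs, 100000 blocks: the 5% rule wins */
	printf("%llu\n", (unsigned long long)default_resblks(100000ULL));
	return 0;
}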
@@ -1417,6 +1419,9 @@ xfs_mountfs(
1417 * when at ENOSPC. This is needed for operations like create with 1419 * when at ENOSPC. This is needed for operations like create with
1418 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations 1420 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
1419 * are not allowed to use this reserved space. 1421 * are not allowed to use this reserved space.
1422 *
1423 * This may drive us straight to ENOSPC on mount, but that implies
1424 * we were already there on the last unmount. Warn if this occurs.
1420 */ 1425 */
1421 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 1426 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
1422 resblks = xfs_default_resblks(mp); 1427 resblks = xfs_default_resblks(mp);
@@ -1725,26 +1730,30 @@ xfs_mod_incore_sb_unlocked(
1725 lcounter += rem; 1730 lcounter += rem;
1726 } 1731 }
1727 } else { /* Taking blocks away */ 1732 } else { /* Taking blocks away */
1728
1729 lcounter += delta; 1733 lcounter += delta;
1734 if (lcounter >= 0) {
1735 mp->m_sb.sb_fdblocks = lcounter +
1736 XFS_ALLOC_SET_ASIDE(mp);
1737 return 0;
1738 }
1730 1739
1731 /* 1740 /*
1732 * If were out of blocks, use any available reserved blocks if 1741 * We are out of blocks, use any available reserved
1733 * were allowed to. 1742 * blocks if we're allowed to.
1734 */ 1743 */
1744 if (!rsvd)
1745 return XFS_ERROR(ENOSPC);
1735 1746
1736 if (lcounter < 0) { 1747 lcounter = (long long)mp->m_resblks_avail + delta;
1737 if (rsvd) { 1748 if (lcounter >= 0) {
1738 lcounter = (long long)mp->m_resblks_avail + delta; 1749 mp->m_resblks_avail = lcounter;
1739 if (lcounter < 0) { 1750 return 0;
1740 return XFS_ERROR(ENOSPC);
1741 }
1742 mp->m_resblks_avail = lcounter;
1743 return 0;
1744 } else { /* not reserved */
1745 return XFS_ERROR(ENOSPC);
1746 }
1747 } 1751 }
1752 printk_once(KERN_WARNING
1753 "Filesystem \"%s\": reserve blocks depleted! "
1754 "Consider increasing reserve pool size.",
1755 mp->m_fsname);
1756 return XFS_ERROR(ENOSPC);
1748 } 1757 }
1749 1758
1750 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); 1759 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -2052,6 +2061,26 @@ xfs_mount_log_sb(
2052 return error; 2061 return error;
2053} 2062}
2054 2063
2064/*
2065 * If the underlying (data/log/rt) device is readonly, there are some
2066 * operations that cannot proceed.
2067 */
2068int
2069xfs_dev_is_read_only(
2070 struct xfs_mount *mp,
2071 char *message)
2072{
2073 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2074 xfs_readonly_buftarg(mp->m_logdev_targp) ||
2075 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2076 cmn_err(CE_NOTE,
2077 "XFS: %s required on read-only device.", message);
2078 cmn_err(CE_NOTE,
2079 "XFS: write access unavailable, cannot proceed.");
2080 return EROFS;
2081 }
2082 return 0;
2083}
2055 2084
2056#ifdef HAVE_PERCPU_SB 2085#ifdef HAVE_PERCPU_SB
2057/* 2086/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 14dafd608230..4fa0bc7b983e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -436,6 +436,8 @@ extern void xfs_freesb(xfs_mount_t *);
436extern int xfs_fs_writable(xfs_mount_t *); 436extern int xfs_fs_writable(xfs_mount_t *);
437extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); 437extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
438 438
439extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
440
439extern int xfs_dmops_get(struct xfs_mount *); 441extern int xfs_dmops_get(struct xfs_mount *);
440extern void xfs_dmops_put(struct xfs_mount *); 442extern void xfs_dmops_put(struct xfs_mount *);
441 443
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be942d4e3324..f73e358bae8d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -796,7 +796,7 @@ _xfs_trans_commit(
796 int sync; 796 int sync;
797#define XFS_TRANS_LOGVEC_COUNT 16 797#define XFS_TRANS_LOGVEC_COUNT 16
798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT]; 798 xfs_log_iovec_t log_vector_fast[XFS_TRANS_LOGVEC_COUNT];
799 void *commit_iclog; 799 struct xlog_in_core *commit_iclog;
800 int shutdown; 800 int shutdown;
801 801
802 commit_lsn = -1; 802 commit_lsn = -1;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c93e3a102857..79c8bab9dfff 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -910,7 +910,7 @@ typedef struct xfs_trans {
910 unsigned int t_blk_res_used; /* # of resvd blocks used */ 910 unsigned int t_blk_res_used; /* # of resvd blocks used */
911 unsigned int t_rtx_res; /* # of rt extents resvd */ 911 unsigned int t_rtx_res; /* # of rt extents resvd */
912 unsigned int t_rtx_res_used; /* # of resvd rt extents used */ 912 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
913 xfs_log_ticket_t t_ticket; /* log mgr ticket */ 913 struct xlog_ticket *t_ticket; /* log mgr ticket */
914 xfs_lsn_t t_lsn; /* log seq num of start of 914 xfs_lsn_t t_lsn; /* log seq num of start of
915 * transaction. */ 915 * transaction. */
916 xfs_lsn_t t_commit_lsn; /* log seq num of end of 916 xfs_lsn_t t_commit_lsn; /* log seq num of end of
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 5ffd544434eb..fb586360d1c9 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -46,6 +46,65 @@ STATIC xfs_buf_t *xfs_trans_buf_item_match(xfs_trans_t *, xfs_buftarg_t *,
46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *, 46STATIC xfs_buf_t *xfs_trans_buf_item_match_all(xfs_trans_t *, xfs_buftarg_t *,
47 xfs_daddr_t, int); 47 xfs_daddr_t, int);
48 48
49/*
50 * Add the locked buffer to the transaction.
51 *
52 * The buffer must be locked, and it cannot be associated with any
53 * transaction.
54 *
55 * If the buffer does not yet have a buf log item associated with it,
56 * then allocate one for it. Then add the buf item to the transaction.
57 */
58STATIC void
59_xfs_trans_bjoin(
60 struct xfs_trans *tp,
61 struct xfs_buf *bp,
62 int reset_recur)
63{
64 struct xfs_buf_log_item *bip;
65
66 ASSERT(XFS_BUF_ISBUSY(bp));
67 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
68
69 /*
70 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
71 * it doesn't have one yet, then allocate one and initialize it.
72 * The checks to see if one is there are in xfs_buf_item_init().
73 */
74 xfs_buf_item_init(bp, tp->t_mountp);
75 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
76 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
77 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
78 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
79 if (reset_recur)
80 bip->bli_recur = 0;
81
82 /*
83 * Take a reference for this transaction on the buf item.
84 */
85 atomic_inc(&bip->bli_refcount);
86
87 /*
88 * Get a log_item_desc to point at the new item.
89 */
90 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
91
92 /*
93 * Initialize b_fsprivate2 so we can find it with incore_match()
94 * in xfs_trans_get_buf() and friends above.
95 */
96 XFS_BUF_SET_FSPRIVATE2(bp, tp);
97
98}
99
100void
101xfs_trans_bjoin(
102 struct xfs_trans *tp,
103 struct xfs_buf *bp)
104{
105 _xfs_trans_bjoin(tp, bp, 0);
106 trace_xfs_trans_bjoin(bp->b_fspriv);
107}
49 108
50/* 109/*
51 * Get and lock the buffer for the caller if it is not already 110 * Get and lock the buffer for the caller if it is not already
@@ -132,40 +191,8 @@ xfs_trans_get_buf(xfs_trans_t *tp,
132 191
133 ASSERT(!XFS_BUF_GETERROR(bp)); 192 ASSERT(!XFS_BUF_GETERROR(bp));
134 193
135 /* 194 _xfs_trans_bjoin(tp, bp, 1);
136 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 195 trace_xfs_trans_get_buf(bp->b_fspriv);
137 * it doesn't have one yet, then allocate one and initialize it.
138 * The checks to see if one is there are in xfs_buf_item_init().
139 */
140 xfs_buf_item_init(bp, tp->t_mountp);
141
142 /*
143 * Set the recursion count for the buffer within this transaction
144 * to 0.
145 */
146 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
147 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
148 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
149 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
150 bip->bli_recur = 0;
151
152 /*
153 * Take a reference for this transaction on the buf item.
154 */
155 atomic_inc(&bip->bli_refcount);
156
157 /*
158 * Get a log_item_desc to point at the new item.
159 */
160 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
161
162 /*
163 * Initialize b_fsprivate2 so we can find it with incore_match()
164 * above.
165 */
166 XFS_BUF_SET_FSPRIVATE2(bp, tp);
167
168 trace_xfs_trans_get_buf(bip);
169 return (bp); 196 return (bp);
170} 197}
171 198
@@ -210,44 +237,11 @@ xfs_trans_getsb(xfs_trans_t *tp,
210 } 237 }
211 238
212 bp = xfs_getsb(mp, flags); 239 bp = xfs_getsb(mp, flags);
213 if (bp == NULL) { 240 if (bp == NULL)
214 return NULL; 241 return NULL;
215 }
216
217 /*
218 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
219 * it doesn't have one yet, then allocate one and initialize it.
220 * The checks to see if one is there are in xfs_buf_item_init().
221 */
222 xfs_buf_item_init(bp, mp);
223
224 /*
225 * Set the recursion count for the buffer within this transaction
226 * to 0.
227 */
228 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
229 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
230 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
231 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
232 bip->bli_recur = 0;
233
234 /*
235 * Take a reference for this transaction on the buf item.
236 */
237 atomic_inc(&bip->bli_refcount);
238
239 /*
240 * Get a log_item_desc to point at the new item.
241 */
242 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
243
244 /*
245 * Initialize b_fsprivate2 so we can find it with incore_match()
246 * above.
247 */
248 XFS_BUF_SET_FSPRIVATE2(bp, tp);
249 242
250 trace_xfs_trans_getsb(bip); 243 _xfs_trans_bjoin(tp, bp, 1);
244 trace_xfs_trans_getsb(bp->b_fspriv);
251 return (bp); 245 return (bp);
252} 246}
253 247
@@ -425,40 +419,9 @@ xfs_trans_read_buf(
425 if (XFS_FORCED_SHUTDOWN(mp)) 419 if (XFS_FORCED_SHUTDOWN(mp))
426 goto shutdown_abort; 420 goto shutdown_abort;
427 421
428 /* 422 _xfs_trans_bjoin(tp, bp, 1);
429 * The xfs_buf_log_item pointer is stored in b_fsprivate. If 423 trace_xfs_trans_read_buf(bp->b_fspriv);
430 * it doesn't have one yet, then allocate one and initialize it.
431 * The checks to see if one is there are in xfs_buf_item_init().
432 */
433 xfs_buf_item_init(bp, tp->t_mountp);
434 424
435 /*
436 * Set the recursion count for the buffer within this transaction
437 * to 0.
438 */
439 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
440 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
441 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
442 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
443 bip->bli_recur = 0;
444
445 /*
446 * Take a reference for this transaction on the buf item.
447 */
448 atomic_inc(&bip->bli_refcount);
449
450 /*
451 * Get a log_item_desc to point at the new item.
452 */
453 (void) xfs_trans_add_item(tp, (xfs_log_item_t*)bip);
454
455 /*
456 * Initialize b_fsprivate2 so we can find it with incore_match()
457 * above.
458 */
459 XFS_BUF_SET_FSPRIVATE2(bp, tp);
460
461 trace_xfs_trans_read_buf(bip);
462 *bpp = bp; 425 *bpp = bp;
463 return 0; 426 return 0;
464 427
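The contract at this call site is unchanged: on success the buffer is already joined to the transaction when *bpp is set, and the shutdown_abort path returns without a joined buffer. A hedged caller sketch against the 2.6.33-era signature (variable names hypothetical):

    xfs_buf_t       *bp;
    int             error;

    /* read one fsb-sized buffer into the transaction */
    error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                               blkno, XFS_FSB_TO_BB(mp, 1), 0, &bp);
    if (error)
            return error;           /* no buffer is held on failure */

    /* bp is locked and joined: modify it, then log the dirty range */
    xfs_trans_log_buf(tp, bp, 0, XFS_BUF_COUNT(bp) - 1);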
@@ -623,53 +586,6 @@ xfs_trans_brelse(xfs_trans_t *tp,
623} 586}
624 587
625/* 588/*
626 * Add the locked buffer to the transaction.
627 * The buffer must be locked, and it cannot be associated with any
628 * transaction.
629 *
630 * If the buffer does not yet have a buf log item associated with it,
631 * then allocate one for it. Then add the buf item to the transaction.
632 */
633void
634xfs_trans_bjoin(xfs_trans_t *tp,
635 xfs_buf_t *bp)
636{
637 xfs_buf_log_item_t *bip;
638
639 ASSERT(XFS_BUF_ISBUSY(bp));
640 ASSERT(XFS_BUF_FSPRIVATE2(bp, void *) == NULL);
641
642 /*
643 * The xfs_buf_log_item pointer is stored in b_fsprivate. If
644 * it doesn't have one yet, then allocate one and initialize it.
645 * The checks to see if one is there are in xfs_buf_item_init().
646 */
647 xfs_buf_item_init(bp, tp->t_mountp);
648 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
649 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
650 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
651 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
652
653 /*
654 * Take a reference for this transaction on the buf item.
655 */
656 atomic_inc(&bip->bli_refcount);
657
658 /*
659 * Get a log_item_desc to point at the new item.
660 */
661 (void) xfs_trans_add_item(tp, (xfs_log_item_t *)bip);
662
663 /*
664 * Initialize b_fsprivate2 so we can find it with incore_match()
665 * in xfs_trans_get_buf() and friends above.
666 */
667 XFS_BUF_SET_FSPRIVATE2(bp, tp);
668
669 trace_xfs_trans_bjoin(bip);
670}
671
672/*
673 * Mark the buffer as not needing to be unlocked when the buf item's 589 * Mark the buffer as not needing to be unlocked when the buf item's
674 * IOP_UNLOCK() routine is called. The buffer must already be locked 590 * IOP_UNLOCK() routine is called. The buffer must already be locked
675 * and associated with the given transaction. 591 * and associated with the given transaction.
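Worth noting: the removed public xfs_trans_bjoin() never reset bli_recur, while all three converted call sites above pass 1 as the third argument. The surviving wrapper therefore plausibly reduces to the following sketch (its body is outside this hunk):

    void
    xfs_trans_bjoin(
            xfs_trans_t     *tp,
            xfs_buf_t       *bp)
    {
            _xfs_trans_bjoin(tp, bp, 0);
            trace_xfs_trans_bjoin(bp->b_fspriv);
    }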
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index ddd2c5d1b854..9d376be0ea38 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -584,113 +584,6 @@ xfs_readlink(
584} 584}
585 585
586/* 586/*
587 * xfs_fsync
588 *
589 * This is called to sync the inode and its data out to disk. We need to hold
590 * the I/O lock while flushing the data, and the inode lock while flushing the
591 * inode. The inode lock CANNOT be held while flushing the data, so we
592 * acquire it after we're done with that.
593 */
594int
595xfs_fsync(
596 xfs_inode_t *ip)
597{
598 xfs_trans_t *tp;
599 int error = 0;
600 int log_flushed = 0;
601
602 xfs_itrace_entry(ip);
603
604 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
605 return XFS_ERROR(EIO);
606
607 /*
608 * We always need to make sure that the required inode state is safe on
609 * disk. The inode might be clean but we still might need to force the
610 * log because of committed transactions that haven't hit the disk yet.
611 * Likewise, there could be unflushed non-transactional changes to the
612 * inode core that have to go to disk and this requires us to issue
613 * a synchronous transaction to capture these changes correctly.
614 *
615 * This code relies on the assumption that if the update_* fields
616 * of the inode are clear and the inode is unpinned then it is clean
617 * and no action is required.
618 */
619 xfs_ilock(ip, XFS_ILOCK_SHARED);
620
621 if (!ip->i_update_core) {
622 /*
623 * Timestamps/size haven't changed since last inode flush or
624 * inode transaction commit. That means either nothing got
625 * written or a transaction committed which caught the updates.
626 * If the latter happened and the transaction hasn't hit the
627 * disk yet, the inode will still be pinned. If it is,
628 * force the log.
629 */
630 xfs_iunlock(ip, XFS_ILOCK_SHARED);
631 if (xfs_ipincount(ip)) {
632 if (ip->i_itemp->ili_last_lsn) {
633 error = _xfs_log_force_lsn(ip->i_mount,
634 ip->i_itemp->ili_last_lsn,
635 XFS_LOG_SYNC, &log_flushed);
636 } else {
637 error = _xfs_log_force(ip->i_mount,
638 XFS_LOG_SYNC, &log_flushed);
639 }
640 }
641 } else {
642 /*
643 * Kick off a transaction to log the inode core to get the
644 * updates. The sync transaction will also force the log.
645 */
646 xfs_iunlock(ip, XFS_ILOCK_SHARED);
647 tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
648 error = xfs_trans_reserve(tp, 0,
649 XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
650 if (error) {
651 xfs_trans_cancel(tp, 0);
652 return error;
653 }
654 xfs_ilock(ip, XFS_ILOCK_EXCL);
655
656 /*
657 * Note - it's possible that we might have pushed ourselves out
658 * of the way during trans_reserve which would flush the inode.
659 * But there's no guarantee that the inode buffer has actually
660 * gone out yet (it's delwri). Plus the buffer could be pinned
661 * anyway if it's part of an inode in another recent
662 * transaction. So we play it safe and fire off the
663 * transaction anyway.
664 */
665 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
666 xfs_trans_ihold(tp, ip);
667 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
668 xfs_trans_set_sync(tp);
669 error = _xfs_trans_commit(tp, 0, &log_flushed);
670
671 xfs_iunlock(ip, XFS_ILOCK_EXCL);
672 }
673
674 if (ip->i_mount->m_flags & XFS_MOUNT_BARRIER) {
675 /*
676 * If the log write didn't issue an ordered tag we need
677 * to flush the disk cache for the data device now.
678 */
679 if (!log_flushed)
680 xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);
681
682 /*
683 * If this inode is on the RT dev we need to flush that
684 * cache as well.
685 */
686 if (XFS_IS_REALTIME_INODE(ip))
687 xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp);
688 }
689
690 return error;
691}
692
693/*
694 * Flags for xfs_free_eofblocks 587 * Flags for xfs_free_eofblocks
695 */ 588 */
696#define XFS_FREE_EOF_TRYLOCK (1<<0) 589#define XFS_FREE_EOF_TRYLOCK (1<<0)
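All of xfs_fsync() leaves xfs_vnodeops.c, and per the diffstat xfs_file.c grows by roughly the amount xfs_lrw.c and this function shed, so the logic now sits behind the VFS ->fsync entry point there. A compressed sketch of the likely result — assuming the pre-existing xfs_file_fsync() name and the 2.6.33-era ->fsync prototype, and dropping the realtime-device cache flush for brevity:

    STATIC int
    xfs_file_fsync(
            struct file             *file,
            struct dentry           *dentry,
            int                     datasync)
    {
            struct xfs_inode        *ip = XFS_I(dentry->d_inode);
            struct xfs_trans        *tp;
            int                     error = 0;
            int                     log_flushed = 0;

            if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                    return -XFS_ERROR(EIO);

            xfs_ilock(ip, XFS_ILOCK_SHARED);
            if (!ip->i_update_core) {
                    /* Clean core: only a pinned inode needs a log force. */
                    xfs_iunlock(ip, XFS_ILOCK_SHARED);
                    if (xfs_ipincount(ip)) {
                            if (ip->i_itemp->ili_last_lsn)
                                    error = _xfs_log_force_lsn(ip->i_mount,
                                                    ip->i_itemp->ili_last_lsn,
                                                    XFS_LOG_SYNC, &log_flushed);
                            else
                                    error = _xfs_log_force(ip->i_mount,
                                                    XFS_LOG_SYNC, &log_flushed);
                    }
            } else {
                    /* Dirty core: capture it with a sync transaction. */
                    xfs_iunlock(ip, XFS_ILOCK_SHARED);
                    tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
                    error = xfs_trans_reserve(tp, 0,
                                    XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
                    if (error) {
                            xfs_trans_cancel(tp, 0);
                            return -error;
                    }
                    xfs_ilock(ip, XFS_ILOCK_EXCL);
                    xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
                    xfs_trans_ihold(tp, ip);
                    xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
                    xfs_trans_set_sync(tp);
                    error = _xfs_trans_commit(tp, 0, &log_flushed);
                    xfs_iunlock(ip, XFS_ILOCK_EXCL);
            }

            /* No ordered log write: flush the data device cache by hand. */
            if ((ip->i_mount->m_flags & XFS_MOUNT_BARRIER) && !log_flushed)
                    xfs_blkdev_issue_flush(ip->i_mount->m_ddev_targp);

            return -error;
    }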
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 774f40729ca1..d8dfa8d0dadd 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -21,7 +21,6 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 21#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
22 22
23int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
24int xfs_fsync(struct xfs_inode *ip);
25int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);
26int xfs_inactive(struct xfs_inode *ip); 25int xfs_inactive(struct xfs_inode *ip);
27int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, 26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
@@ -50,18 +49,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
50int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); 49int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
51int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, 50int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
52 int flags, struct attrlist_cursor_kern *cursor); 51 int flags, struct attrlist_cursor_kern *cursor);
53ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
54 const struct iovec *iovp, unsigned int segs,
55 loff_t *offset, int ioflags);
56ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp,
57 loff_t *ppos, struct pipe_inode_info *pipe, size_t count,
58 int flags, int ioflags);
59ssize_t xfs_splice_write(struct xfs_inode *ip,
60 struct pipe_inode_info *pipe, struct file *outfilp,
61 loff_t *ppos, size_t count, int flags, int ioflags);
62ssize_t xfs_write(struct xfs_inode *xip, struct kiocb *iocb,
63 const struct iovec *iovp, unsigned int nsegs,
64 loff_t *offset, int ioflags);
65int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, 52int xfs_bmap(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
66 int flags, struct xfs_iomap *iomapp, int *niomaps); 53 int flags, struct xfs_iomap *iomapp, int *niomaps);
67void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, 54void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
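These read/write/splice entry points lose their xfs_vnodeops.h prototypes because, with xfs_lrw.c dropped from the Makefile, their bodies now live directly in the file_operations methods in xfs_file.c. A sketch of the resulting wiring — method names assumed from the pre-existing wrappers; the actual table is outside this diff:

    const struct file_operations xfs_file_operations = {
            .aio_read       = xfs_file_aio_read,    /* was xfs_read() */
            .aio_write      = xfs_file_aio_write,   /* was xfs_write() */
            .splice_read    = xfs_file_splice_read,
            .splice_write   = xfs_file_splice_write,
            /* open, fsync, mmap, etc. omitted in this sketch */
    };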
@@ -72,4 +59,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
72 xfs_off_t last, uint64_t flags, int fiopt); 59 xfs_off_t last, uint64_t flags, int fiopt);
73int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); 60int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
74 61
62int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
63
75#endif /* _XFS_VNODEOPS_H */ 64#endif /* _XFS_VNODEOPS_H */
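The newly exported xfs_zero_eof() (prototype above) takes over from the formerly xfs_lrw.c-private helper. A hedged caller sketch for a write that starts beyond the current in-core size, so stale blocks never become visible — variable names and the out_unlock label are hypothetical:

    /* zero the gap between the old EOF and the start of this write */
    if (offset > ip->i_size) {
            error = xfs_zero_eof(ip, offset, ip->i_size);
            if (error)
                    goto out_unlock;
    }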