aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_file.c
diff options
context:
space:
mode:
author: Christoph Hellwig <hch@infradead.org>, 2011-08-12 17:21:35 -0400
committer: Alex Elder <aelder@sgi.com>, 2011-08-12 17:21:35 -0400
commit: c59d87c460767bc35dafd490139d3cfe78fb8da4 (patch)
tree: 2aad8261f86488e501d9645bd35d1398906da46d /fs/xfs/xfs_file.c
parent: 06f8e2d6754dc631732415b741b5aa58a0f7133f (diff)
xfs: remove subdirectories
Use the move from Linux 2.6 to Linux 3.x as an excuse to kill the annoying subdirectories in the XFS source code. Besides the large number of file renames, the only changes are to the Makefile, to a few files that included headers with the subdirectory prefix, and to the binary sysctl compat code that includes a header under fs/xfs/ from kernel/.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_file.c')
-rw-r--r--fs/xfs/xfs_file.c1096
1 files changed, 1096 insertions, 0 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
new file mode 100644
index 000000000000..7f7b42469ea7
--- /dev/null
+++ b/fs/xfs/xfs_file.c
@@ -0,0 +1,1096 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_bit.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_sb.h"
24#include "xfs_ag.h"
25#include "xfs_trans.h"
26#include "xfs_mount.h"
27#include "xfs_bmap_btree.h"
28#include "xfs_alloc.h"
29#include "xfs_dinode.h"
30#include "xfs_inode.h"
31#include "xfs_inode_item.h"
32#include "xfs_bmap.h"
33#include "xfs_error.h"
34#include "xfs_vnodeops.h"
35#include "xfs_da_btree.h"
36#include "xfs_ioctl.h"
37#include "xfs_trace.h"
38
39#include <linux/dcache.h>
40#include <linux/falloc.h>
41
/*
 * Forward declaration: the table is defined at the end of this file and is
 * installed on each mapping by xfs_file_mmap().
 */
static const struct vm_operations_struct xfs_file_vm_ops;
43
44/*
45 * Locking primitives for read and write IO paths to ensure we consistently use
46 * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
47 */
48static inline void
49xfs_rw_ilock(
50 struct xfs_inode *ip,
51 int type)
52{
53 if (type & XFS_IOLOCK_EXCL)
54 mutex_lock(&VFS_I(ip)->i_mutex);
55 xfs_ilock(ip, type);
56}
57
58static inline void
59xfs_rw_iunlock(
60 struct xfs_inode *ip,
61 int type)
62{
63 xfs_iunlock(ip, type);
64 if (type & XFS_IOLOCK_EXCL)
65 mutex_unlock(&VFS_I(ip)->i_mutex);
66}
67
68static inline void
69xfs_rw_ilock_demote(
70 struct xfs_inode *ip,
71 int type)
72{
73 xfs_ilock_demote(ip, type);
74 if (type & XFS_IOLOCK_EXCL)
75 mutex_unlock(&VFS_I(ip)->i_mutex);
76}
77
78/*
79 * xfs_iozero
80 *
81 * xfs_iozero clears the specified range of buffer supplied,
82 * and marks all the affected blocks as valid and modified. If
83 * an affected block is not allocated, it will be allocated. If
84 * an affected block is not completely overwritten, and is not
85 * valid before the operation, it will be read from disk before
86 * being partially zeroed.
87 */
88STATIC int
89xfs_iozero(
90 struct xfs_inode *ip, /* inode */
91 loff_t pos, /* offset in file */
92 size_t count) /* size of data to zero */
93{
94 struct page *page;
95 struct address_space *mapping;
96 int status;
97
98 mapping = VFS_I(ip)->i_mapping;
99 do {
100 unsigned offset, bytes;
101 void *fsdata;
102
103 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
104 bytes = PAGE_CACHE_SIZE - offset;
105 if (bytes > count)
106 bytes = count;
107
108 status = pagecache_write_begin(NULL, mapping, pos, bytes,
109 AOP_FLAG_UNINTERRUPTIBLE,
110 &page, &fsdata);
111 if (status)
112 break;
113
114 zero_user(page, offset, bytes);
115
116 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
117 page, fsdata);
118 WARN_ON(status <= 0); /* can't return less than zero! */
119 pos += bytes;
120 count -= bytes;
121 status = 0;
122 } while (count);
123
124 return (-status);
125}
126
127STATIC int
128xfs_file_fsync(
129 struct file *file,
130 loff_t start,
131 loff_t end,
132 int datasync)
133{
134 struct inode *inode = file->f_mapping->host;
135 struct xfs_inode *ip = XFS_I(inode);
136 struct xfs_mount *mp = ip->i_mount;
137 struct xfs_trans *tp;
138 int error = 0;
139 int log_flushed = 0;
140
141 trace_xfs_file_fsync(ip);
142
143 error = filemap_write_and_wait_range(inode->i_mapping, start, end);
144 if (error)
145 return error;
146
147 if (XFS_FORCED_SHUTDOWN(mp))
148 return -XFS_ERROR(EIO);
149
150 xfs_iflags_clear(ip, XFS_ITRUNCATED);
151
152 xfs_ilock(ip, XFS_IOLOCK_SHARED);
153 xfs_ioend_wait(ip);
154 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
155
156 if (mp->m_flags & XFS_MOUNT_BARRIER) {
157 /*
158 * If we have an RT and/or log subvolume we need to make sure
159 * to flush the write cache the device used for file data
160 * first. This is to ensure newly written file data make
161 * it to disk before logging the new inode size in case of
162 * an extending write.
163 */
164 if (XFS_IS_REALTIME_INODE(ip))
165 xfs_blkdev_issue_flush(mp->m_rtdev_targp);
166 else if (mp->m_logdev_targp != mp->m_ddev_targp)
167 xfs_blkdev_issue_flush(mp->m_ddev_targp);
168 }
169
170 /*
171 * We always need to make sure that the required inode state is safe on
172 * disk. The inode might be clean but we still might need to force the
173 * log because of committed transactions that haven't hit the disk yet.
174 * Likewise, there could be unflushed non-transactional changes to the
175 * inode core that have to go to disk and this requires us to issue
176 * a synchronous transaction to capture these changes correctly.
177 *
178 * This code relies on the assumption that if the i_update_core field
179 * of the inode is clear and the inode is unpinned then it is clean
180 * and no action is required.
181 */
182 xfs_ilock(ip, XFS_ILOCK_SHARED);
183
184 /*
185 * First check if the VFS inode is marked dirty. All the dirtying
186 * of non-transactional updates no goes through mark_inode_dirty*,
187 * which allows us to distinguish beteeen pure timestamp updates
188 * and i_size updates which need to be caught for fdatasync.
189 * After that also theck for the dirty state in the XFS inode, which
190 * might gets cleared when the inode gets written out via the AIL
191 * or xfs_iflush_cluster.
192 */
193 if (((inode->i_state & I_DIRTY_DATASYNC) ||
194 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
195 ip->i_update_core) {
196 /*
197 * Kick off a transaction to log the inode core to get the
198 * updates. The sync transaction will also force the log.
199 */
200 xfs_iunlock(ip, XFS_ILOCK_SHARED);
201 tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
202 error = xfs_trans_reserve(tp, 0,
203 XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
204 if (error) {
205 xfs_trans_cancel(tp, 0);
206 return -error;
207 }
208 xfs_ilock(ip, XFS_ILOCK_EXCL);
209
210 /*
211 * Note - it's possible that we might have pushed ourselves out
212 * of the way during trans_reserve which would flush the inode.
213 * But there's no guarantee that the inode buffer has actually
214 * gone out yet (it's delwri). Plus the buffer could be pinned
215 * anyway if it's part of an inode in another recent
216 * transaction. So we play it safe and fire off the
217 * transaction anyway.
218 */
219 xfs_trans_ijoin(tp, ip);
220 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
221 xfs_trans_set_sync(tp);
222 error = _xfs_trans_commit(tp, 0, &log_flushed);
223
224 xfs_iunlock(ip, XFS_ILOCK_EXCL);
225 } else {
226 /*
227 * Timestamps/size haven't changed since last inode flush or
228 * inode transaction commit. That means either nothing got
229 * written or a transaction committed which caught the updates.
230 * If the latter happened and the transaction hasn't hit the
231 * disk yet, the inode will be still be pinned. If it is,
232 * force the log.
233 */
234 if (xfs_ipincount(ip)) {
235 error = _xfs_log_force_lsn(mp,
236 ip->i_itemp->ili_last_lsn,
237 XFS_LOG_SYNC, &log_flushed);
238 }
239 xfs_iunlock(ip, XFS_ILOCK_SHARED);
240 }
241
242 /*
243 * If we only have a single device, and the log force about was
244 * a no-op we might have to flush the data device cache here.
245 * This can only happen for fdatasync/O_DSYNC if we were overwriting
246 * an already allocated file and thus do not have any metadata to
247 * commit.
248 */
249 if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
250 mp->m_logdev_targp == mp->m_ddev_targp &&
251 !XFS_IS_REALTIME_INODE(ip) &&
252 !log_flushed)
253 xfs_blkdev_issue_flush(mp->m_ddev_targp);
254
255 return -error;
256}
257
258STATIC ssize_t
259xfs_file_aio_read(
260 struct kiocb *iocb,
261 const struct iovec *iovp,
262 unsigned long nr_segs,
263 loff_t pos)
264{
265 struct file *file = iocb->ki_filp;
266 struct inode *inode = file->f_mapping->host;
267 struct xfs_inode *ip = XFS_I(inode);
268 struct xfs_mount *mp = ip->i_mount;
269 size_t size = 0;
270 ssize_t ret = 0;
271 int ioflags = 0;
272 xfs_fsize_t n;
273 unsigned long seg;
274
275 XFS_STATS_INC(xs_read_calls);
276
277 BUG_ON(iocb->ki_pos != pos);
278
279 if (unlikely(file->f_flags & O_DIRECT))
280 ioflags |= IO_ISDIRECT;
281 if (file->f_mode & FMODE_NOCMTIME)
282 ioflags |= IO_INVIS;
283
284 /* START copy & waste from filemap.c */
285 for (seg = 0; seg < nr_segs; seg++) {
286 const struct iovec *iv = &iovp[seg];
287
288 /*
289 * If any segment has a negative length, or the cumulative
290 * length ever wraps negative then return -EINVAL.
291 */
292 size += iv->iov_len;
293 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
294 return XFS_ERROR(-EINVAL);
295 }
296 /* END copy & waste from filemap.c */
297
298 if (unlikely(ioflags & IO_ISDIRECT)) {
299 xfs_buftarg_t *target =
300 XFS_IS_REALTIME_INODE(ip) ?
301 mp->m_rtdev_targp : mp->m_ddev_targp;
302 if ((iocb->ki_pos & target->bt_smask) ||
303 (size & target->bt_smask)) {
304 if (iocb->ki_pos == ip->i_size)
305 return 0;
306 return -XFS_ERROR(EINVAL);
307 }
308 }
309
310 n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
311 if (n <= 0 || size == 0)
312 return 0;
313
314 if (n < size)
315 size = n;
316
317 if (XFS_FORCED_SHUTDOWN(mp))
318 return -EIO;
319
320 if (unlikely(ioflags & IO_ISDIRECT)) {
321 xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
322
323 if (inode->i_mapping->nrpages) {
324 ret = -xfs_flushinval_pages(ip,
325 (iocb->ki_pos & PAGE_CACHE_MASK),
326 -1, FI_REMAPF_LOCKED);
327 if (ret) {
328 xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
329 return ret;
330 }
331 }
332 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
333 } else
334 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
335
336 trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
337
338 ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
339 if (ret > 0)
340 XFS_STATS_ADD(xs_read_bytes, ret);
341
342 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
343 return ret;
344}
345
346STATIC ssize_t
347xfs_file_splice_read(
348 struct file *infilp,
349 loff_t *ppos,
350 struct pipe_inode_info *pipe,
351 size_t count,
352 unsigned int flags)
353{
354 struct xfs_inode *ip = XFS_I(infilp->f_mapping->host);
355 int ioflags = 0;
356 ssize_t ret;
357
358 XFS_STATS_INC(xs_read_calls);
359
360 if (infilp->f_mode & FMODE_NOCMTIME)
361 ioflags |= IO_INVIS;
362
363 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
364 return -EIO;
365
366 xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
367
368 trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
369
370 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
371 if (ret > 0)
372 XFS_STATS_ADD(xs_read_bytes, ret);
373
374 xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
375 return ret;
376}
377
378STATIC void
379xfs_aio_write_isize_update(
380 struct inode *inode,
381 loff_t *ppos,
382 ssize_t bytes_written)
383{
384 struct xfs_inode *ip = XFS_I(inode);
385 xfs_fsize_t isize = i_size_read(inode);
386
387 if (bytes_written > 0)
388 XFS_STATS_ADD(xs_write_bytes, bytes_written);
389
390 if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
391 *ppos > isize))
392 *ppos = isize;
393
394 if (*ppos > ip->i_size) {
395 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
396 if (*ppos > ip->i_size)
397 ip->i_size = *ppos;
398 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
399 }
400}
401
402/*
403 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
404 * part of the I/O may have been written to disk before the error occurred. In
405 * this case the on-disk file size may have been adjusted beyond the in-memory
406 * file size and now needs to be truncated back.
407 */
408STATIC void
409xfs_aio_write_newsize_update(
410 struct xfs_inode *ip)
411{
412 if (ip->i_new_size) {
413 xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
414 ip->i_new_size = 0;
415 if (ip->i_d.di_size > ip->i_size)
416 ip->i_d.di_size = ip->i_size;
417 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
418 }
419}
420
421/*
422 * xfs_file_splice_write() does not use xfs_rw_ilock() because
423 * generic_file_splice_write() takes the i_mutex itself. This, in theory,
424 * couuld cause lock inversions between the aio_write path and the splice path
425 * if someone is doing concurrent splice(2) based writes and write(2) based
426 * writes to the same inode. The only real way to fix this is to re-implement
427 * the generic code here with correct locking orders.
428 */
429STATIC ssize_t
430xfs_file_splice_write(
431 struct pipe_inode_info *pipe,
432 struct file *outfilp,
433 loff_t *ppos,
434 size_t count,
435 unsigned int flags)
436{
437 struct inode *inode = outfilp->f_mapping->host;
438 struct xfs_inode *ip = XFS_I(inode);
439 xfs_fsize_t new_size;
440 int ioflags = 0;
441 ssize_t ret;
442
443 XFS_STATS_INC(xs_write_calls);
444
445 if (outfilp->f_mode & FMODE_NOCMTIME)
446 ioflags |= IO_INVIS;
447
448 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
449 return -EIO;
450
451 xfs_ilock(ip, XFS_IOLOCK_EXCL);
452
453 new_size = *ppos + count;
454
455 xfs_ilock(ip, XFS_ILOCK_EXCL);
456 if (new_size > ip->i_size)
457 ip->i_new_size = new_size;
458 xfs_iunlock(ip, XFS_ILOCK_EXCL);
459
460 trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
461
462 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
463
464 xfs_aio_write_isize_update(inode, ppos, ret);
465 xfs_aio_write_newsize_update(ip);
466 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
467 return ret;
468}
469
470/*
471 * This routine is called to handle zeroing any space in the last
472 * block of the file that is beyond the EOF. We do this since the
473 * size is being increased without writing anything to that block
474 * and we don't want anyone to read the garbage on the disk.
475 */
476STATIC int /* error (positive) */
477xfs_zero_last_block(
478 xfs_inode_t *ip,
479 xfs_fsize_t offset,
480 xfs_fsize_t isize)
481{
482 xfs_fileoff_t last_fsb;
483 xfs_mount_t *mp = ip->i_mount;
484 int nimaps;
485 int zero_offset;
486 int zero_len;
487 int error = 0;
488 xfs_bmbt_irec_t imap;
489
490 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
491
492 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
493 if (zero_offset == 0) {
494 /*
495 * There are no extra bytes in the last block on disk to
496 * zero, so return.
497 */
498 return 0;
499 }
500
501 last_fsb = XFS_B_TO_FSBT(mp, isize);
502 nimaps = 1;
503 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
504 &nimaps, NULL);
505 if (error) {
506 return error;
507 }
508 ASSERT(nimaps > 0);
509 /*
510 * If the block underlying isize is just a hole, then there
511 * is nothing to zero.
512 */
513 if (imap.br_startblock == HOLESTARTBLOCK) {
514 return 0;
515 }
516 /*
517 * Zero the part of the last block beyond the EOF, and write it
518 * out sync. We need to drop the ilock while we do this so we
519 * don't deadlock when the buffer cache calls back to us.
520 */
521 xfs_iunlock(ip, XFS_ILOCK_EXCL);
522
523 zero_len = mp->m_sb.sb_blocksize - zero_offset;
524 if (isize + zero_len > offset)
525 zero_len = offset - isize;
526 error = xfs_iozero(ip, isize, zero_len);
527
528 xfs_ilock(ip, XFS_ILOCK_EXCL);
529 ASSERT(error >= 0);
530 return error;
531}
532
533/*
534 * Zero any on disk space between the current EOF and the new,
535 * larger EOF. This handles the normal case of zeroing the remainder
536 * of the last block in the file and the unusual case of zeroing blocks
537 * out beyond the size of the file. This second case only happens
538 * with fixed size extents and when the system crashes before the inode
539 * size was updated but after blocks were allocated. If fill is set,
540 * then any holes in the range are filled and zeroed. If not, the holes
541 * are left alone as holes.
542 */
543
544int /* error (positive) */
545xfs_zero_eof(
546 xfs_inode_t *ip,
547 xfs_off_t offset, /* starting I/O offset */
548 xfs_fsize_t isize) /* current inode size */
549{
550 xfs_mount_t *mp = ip->i_mount;
551 xfs_fileoff_t start_zero_fsb;
552 xfs_fileoff_t end_zero_fsb;
553 xfs_fileoff_t zero_count_fsb;
554 xfs_fileoff_t last_fsb;
555 xfs_fileoff_t zero_off;
556 xfs_fsize_t zero_len;
557 int nimaps;
558 int error = 0;
559 xfs_bmbt_irec_t imap;
560
561 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
562 ASSERT(offset > isize);
563
564 /*
565 * First handle zeroing the block on which isize resides.
566 * We only zero a part of that block so it is handled specially.
567 */
568 error = xfs_zero_last_block(ip, offset, isize);
569 if (error) {
570 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
571 return error;
572 }
573
574 /*
575 * Calculate the range between the new size and the old
576 * where blocks needing to be zeroed may exist. To get the
577 * block where the last byte in the file currently resides,
578 * we need to subtract one from the size and truncate back
579 * to a block boundary. We subtract 1 in case the size is
580 * exactly on a block boundary.
581 */
582 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
583 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
584 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
585 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
586 if (last_fsb == end_zero_fsb) {
587 /*
588 * The size was only incremented on its last block.
589 * We took care of that above, so just return.
590 */
591 return 0;
592 }
593
594 ASSERT(start_zero_fsb <= end_zero_fsb);
595 while (start_zero_fsb <= end_zero_fsb) {
596 nimaps = 1;
597 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
598 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
599 0, NULL, 0, &imap, &nimaps, NULL);
600 if (error) {
601 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
602 return error;
603 }
604 ASSERT(nimaps > 0);
605
606 if (imap.br_state == XFS_EXT_UNWRITTEN ||
607 imap.br_startblock == HOLESTARTBLOCK) {
608 /*
609 * This loop handles initializing pages that were
610 * partially initialized by the code below this
611 * loop. It basically zeroes the part of the page
612 * that sits on a hole and sets the page as P_HOLE
613 * and calls remapf if it is a mapped file.
614 */
615 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
616 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
617 continue;
618 }
619
620 /*
621 * There are blocks we need to zero.
622 * Drop the inode lock while we're doing the I/O.
623 * We'll still have the iolock to protect us.
624 */
625 xfs_iunlock(ip, XFS_ILOCK_EXCL);
626
627 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
628 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
629
630 if ((zero_off + zero_len) > offset)
631 zero_len = offset - zero_off;
632
633 error = xfs_iozero(ip, zero_off, zero_len);
634 if (error) {
635 goto out_lock;
636 }
637
638 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
639 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
640
641 xfs_ilock(ip, XFS_ILOCK_EXCL);
642 }
643
644 return 0;
645
646out_lock:
647 xfs_ilock(ip, XFS_ILOCK_EXCL);
648 ASSERT(error >= 0);
649 return error;
650}
651
652/*
653 * Common pre-write limit and setup checks.
654 *
655 * Returns with iolock held according to @iolock.
656 */
657STATIC ssize_t
658xfs_file_aio_write_checks(
659 struct file *file,
660 loff_t *pos,
661 size_t *count,
662 int *iolock)
663{
664 struct inode *inode = file->f_mapping->host;
665 struct xfs_inode *ip = XFS_I(inode);
666 xfs_fsize_t new_size;
667 int error = 0;
668
669 error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
670 if (error) {
671 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
672 *iolock = 0;
673 return error;
674 }
675
676 new_size = *pos + *count;
677 if (new_size > ip->i_size)
678 ip->i_new_size = new_size;
679
680 if (likely(!(file->f_mode & FMODE_NOCMTIME)))
681 file_update_time(file);
682
683 /*
684 * If the offset is beyond the size of the file, we need to zero any
685 * blocks that fall between the existing EOF and the start of this
686 * write.
687 */
688 if (*pos > ip->i_size)
689 error = -xfs_zero_eof(ip, *pos, ip->i_size);
690
691 xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
692 if (error)
693 return error;
694
695 /*
696 * If we're writing the file then make sure to clear the setuid and
697 * setgid bits if the process is not being run by root. This keeps
698 * people from modifying setuid and setgid binaries.
699 */
700 return file_remove_suid(file);
701
702}
703
704/*
705 * xfs_file_dio_aio_write - handle direct IO writes
706 *
707 * Lock the inode appropriately to prepare for and issue a direct IO write.
708 * By separating it from the buffered write path we remove all the tricky to
709 * follow locking changes and looping.
710 *
711 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
712 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
713 * pages are flushed out.
714 *
715 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
716 * allowing them to be done in parallel with reads and other direct IO writes.
717 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
718 * needs to do sub-block zeroing and that requires serialisation against other
719 * direct IOs to the same block. In this case we need to serialise the
720 * submission of the unaligned IOs so that we don't get racing block zeroing in
721 * the dio layer. To avoid the problem with aio, we also need to wait for
722 * outstanding IOs to complete so that unwritten extent conversion is completed
723 * before we try to map the overlapping block. This is currently implemented by
724 * hitting it with a big hammer (i.e. xfs_ioend_wait()).
725 *
726 * Returns with locks held indicated by @iolock and errors indicated by
727 * negative return values.
728 */
729STATIC ssize_t
730xfs_file_dio_aio_write(
731 struct kiocb *iocb,
732 const struct iovec *iovp,
733 unsigned long nr_segs,
734 loff_t pos,
735 size_t ocount,
736 int *iolock)
737{
738 struct file *file = iocb->ki_filp;
739 struct address_space *mapping = file->f_mapping;
740 struct inode *inode = mapping->host;
741 struct xfs_inode *ip = XFS_I(inode);
742 struct xfs_mount *mp = ip->i_mount;
743 ssize_t ret = 0;
744 size_t count = ocount;
745 int unaligned_io = 0;
746 struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
747 mp->m_rtdev_targp : mp->m_ddev_targp;
748
749 *iolock = 0;
750 if ((pos & target->bt_smask) || (count & target->bt_smask))
751 return -XFS_ERROR(EINVAL);
752
753 if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
754 unaligned_io = 1;
755
756 if (unaligned_io || mapping->nrpages || pos > ip->i_size)
757 *iolock = XFS_IOLOCK_EXCL;
758 else
759 *iolock = XFS_IOLOCK_SHARED;
760 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
761
762 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
763 if (ret)
764 return ret;
765
766 if (mapping->nrpages) {
767 WARN_ON(*iolock != XFS_IOLOCK_EXCL);
768 ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
769 FI_REMAPF_LOCKED);
770 if (ret)
771 return ret;
772 }
773
774 /*
775 * If we are doing unaligned IO, wait for all other IO to drain,
776 * otherwise demote the lock if we had to flush cached pages
777 */
778 if (unaligned_io)
779 xfs_ioend_wait(ip);
780 else if (*iolock == XFS_IOLOCK_EXCL) {
781 xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
782 *iolock = XFS_IOLOCK_SHARED;
783 }
784
785 trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
786 ret = generic_file_direct_write(iocb, iovp,
787 &nr_segs, pos, &iocb->ki_pos, count, ocount);
788
789 /* No fallback to buffered IO on errors for XFS. */
790 ASSERT(ret < 0 || ret == count);
791 return ret;
792}
793
794STATIC ssize_t
795xfs_file_buffered_aio_write(
796 struct kiocb *iocb,
797 const struct iovec *iovp,
798 unsigned long nr_segs,
799 loff_t pos,
800 size_t ocount,
801 int *iolock)
802{
803 struct file *file = iocb->ki_filp;
804 struct address_space *mapping = file->f_mapping;
805 struct inode *inode = mapping->host;
806 struct xfs_inode *ip = XFS_I(inode);
807 ssize_t ret;
808 int enospc = 0;
809 size_t count = ocount;
810
811 *iolock = XFS_IOLOCK_EXCL;
812 xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
813
814 ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
815 if (ret)
816 return ret;
817
818 /* We can write back this queue in page reclaim */
819 current->backing_dev_info = mapping->backing_dev_info;
820
821write_retry:
822 trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
823 ret = generic_file_buffered_write(iocb, iovp, nr_segs,
824 pos, &iocb->ki_pos, count, ret);
825 /*
826 * if we just got an ENOSPC, flush the inode now we aren't holding any
827 * page locks and retry *once*
828 */
829 if (ret == -ENOSPC && !enospc) {
830 ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
831 if (ret)
832 return ret;
833 enospc = 1;
834 goto write_retry;
835 }
836 current->backing_dev_info = NULL;
837 return ret;
838}
839
840STATIC ssize_t
841xfs_file_aio_write(
842 struct kiocb *iocb,
843 const struct iovec *iovp,
844 unsigned long nr_segs,
845 loff_t pos)
846{
847 struct file *file = iocb->ki_filp;
848 struct address_space *mapping = file->f_mapping;
849 struct inode *inode = mapping->host;
850 struct xfs_inode *ip = XFS_I(inode);
851 ssize_t ret;
852 int iolock;
853 size_t ocount = 0;
854
855 XFS_STATS_INC(xs_write_calls);
856
857 BUG_ON(iocb->ki_pos != pos);
858
859 ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
860 if (ret)
861 return ret;
862
863 if (ocount == 0)
864 return 0;
865
866 xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
867
868 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
869 return -EIO;
870
871 if (unlikely(file->f_flags & O_DIRECT))
872 ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
873 ocount, &iolock);
874 else
875 ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
876 ocount, &iolock);
877
878 xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
879
880 if (ret <= 0)
881 goto out_unlock;
882
883 /* Handle various SYNC-type writes */
884 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
885 loff_t end = pos + ret - 1;
886 int error;
887
888 xfs_rw_iunlock(ip, iolock);
889 error = xfs_file_fsync(file, pos, end,
890 (file->f_flags & __O_SYNC) ? 0 : 1);
891 xfs_rw_ilock(ip, iolock);
892 if (error)
893 ret = error;
894 }
895
896out_unlock:
897 xfs_aio_write_newsize_update(ip);
898 xfs_rw_iunlock(ip, iolock);
899 return ret;
900}
901
902STATIC long
903xfs_file_fallocate(
904 struct file *file,
905 int mode,
906 loff_t offset,
907 loff_t len)
908{
909 struct inode *inode = file->f_path.dentry->d_inode;
910 long error;
911 loff_t new_size = 0;
912 xfs_flock64_t bf;
913 xfs_inode_t *ip = XFS_I(inode);
914 int cmd = XFS_IOC_RESVSP;
915 int attr_flags = XFS_ATTR_NOLOCK;
916
917 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
918 return -EOPNOTSUPP;
919
920 bf.l_whence = 0;
921 bf.l_start = offset;
922 bf.l_len = len;
923
924 xfs_ilock(ip, XFS_IOLOCK_EXCL);
925
926 if (mode & FALLOC_FL_PUNCH_HOLE)
927 cmd = XFS_IOC_UNRESVSP;
928
929 /* check the new inode size is valid before allocating */
930 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
931 offset + len > i_size_read(inode)) {
932 new_size = offset + len;
933 error = inode_newsize_ok(inode, new_size);
934 if (error)
935 goto out_unlock;
936 }
937
938 if (file->f_flags & O_DSYNC)
939 attr_flags |= XFS_ATTR_SYNC;
940
941 error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
942 if (error)
943 goto out_unlock;
944
945 /* Change file size if needed */
946 if (new_size) {
947 struct iattr iattr;
948
949 iattr.ia_valid = ATTR_SIZE;
950 iattr.ia_size = new_size;
951 error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
952 }
953
954out_unlock:
955 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
956 return error;
957}
958
959
960STATIC int
961xfs_file_open(
962 struct inode *inode,
963 struct file *file)
964{
965 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
966 return -EFBIG;
967 if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
968 return -EIO;
969 return 0;
970}
971
972STATIC int
973xfs_dir_open(
974 struct inode *inode,
975 struct file *file)
976{
977 struct xfs_inode *ip = XFS_I(inode);
978 int mode;
979 int error;
980
981 error = xfs_file_open(inode, file);
982 if (error)
983 return error;
984
985 /*
986 * If there are any blocks, read-ahead block 0 as we're almost
987 * certain to have the next operation be a read there.
988 */
989 mode = xfs_ilock_map_shared(ip);
990 if (ip->i_d.di_nextents > 0)
991 xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
992 xfs_iunlock(ip, mode);
993 return 0;
994}
995
996STATIC int
997xfs_file_release(
998 struct inode *inode,
999 struct file *filp)
1000{
1001 return -xfs_release(XFS_I(inode));
1002}
1003
1004STATIC int
1005xfs_file_readdir(
1006 struct file *filp,
1007 void *dirent,
1008 filldir_t filldir)
1009{
1010 struct inode *inode = filp->f_path.dentry->d_inode;
1011 xfs_inode_t *ip = XFS_I(inode);
1012 int error;
1013 size_t bufsize;
1014
1015 /*
1016 * The Linux API doesn't pass down the total size of the buffer
1017 * we read into down to the filesystem. With the filldir concept
1018 * it's not needed for correct information, but the XFS dir2 leaf
1019 * code wants an estimate of the buffer size to calculate it's
1020 * readahead window and size the buffers used for mapping to
1021 * physical blocks.
1022 *
1023 * Try to give it an estimate that's good enough, maybe at some
1024 * point we can change the ->readdir prototype to include the
1025 * buffer size. For now we use the current glibc buffer size.
1026 */
1027 bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
1028
1029 error = xfs_readdir(ip, dirent, bufsize,
1030 (xfs_off_t *)&filp->f_pos, filldir);
1031 if (error)
1032 return -error;
1033 return 0;
1034}
1035
1036STATIC int
1037xfs_file_mmap(
1038 struct file *filp,
1039 struct vm_area_struct *vma)
1040{
1041 vma->vm_ops = &xfs_file_vm_ops;
1042 vma->vm_flags |= VM_CAN_NONLINEAR;
1043
1044 file_accessed(filp);
1045 return 0;
1046}
1047
1048/*
1049 * mmap()d file has taken write protection fault and is being made
1050 * writable. We can set the page state up correctly for a writable
1051 * page, which means we can do correct delalloc accounting (ENOSPC
1052 * checking!) and unwritten extent mapping.
1053 */
1054STATIC int
1055xfs_vm_page_mkwrite(
1056 struct vm_area_struct *vma,
1057 struct vm_fault *vmf)
1058{
1059 return block_page_mkwrite(vma, vmf, xfs_get_blocks);
1060}
1061
1062const struct file_operations xfs_file_operations = {
1063 .llseek = generic_file_llseek,
1064 .read = do_sync_read,
1065 .write = do_sync_write,
1066 .aio_read = xfs_file_aio_read,
1067 .aio_write = xfs_file_aio_write,
1068 .splice_read = xfs_file_splice_read,
1069 .splice_write = xfs_file_splice_write,
1070 .unlocked_ioctl = xfs_file_ioctl,
1071#ifdef CONFIG_COMPAT
1072 .compat_ioctl = xfs_file_compat_ioctl,
1073#endif
1074 .mmap = xfs_file_mmap,
1075 .open = xfs_file_open,
1076 .release = xfs_file_release,
1077 .fsync = xfs_file_fsync,
1078 .fallocate = xfs_file_fallocate,
1079};
1080
1081const struct file_operations xfs_dir_file_operations = {
1082 .open = xfs_dir_open,
1083 .read = generic_read_dir,
1084 .readdir = xfs_file_readdir,
1085 .llseek = generic_file_llseek,
1086 .unlocked_ioctl = xfs_file_ioctl,
1087#ifdef CONFIG_COMPAT
1088 .compat_ioctl = xfs_file_compat_ioctl,
1089#endif
1090 .fsync = xfs_file_fsync,
1091};
1092
1093static const struct vm_operations_struct xfs_file_vm_ops = {
1094 .fault = filemap_fault,
1095 .page_mkwrite = xfs_vm_page_mkwrite,
1096};