Merge branch 'for-linus' of git://oss.sgi.com/xfs/xfs

* 'for-linus' of git://oss.sgi.com/xfs/xfs:
  xfs: prevent NMI timeouts in cmn_err
  xfs: Add log level to assertion printk
  xfs: fix an assignment within an ASSERT()
  xfs: fix error handling for synchronous writes
  xfs: add FITRIM support
  xfs: ensure log covering transactions are synchronous
  xfs: serialise unaligned direct IOs
  xfs: factor common write setup code
  xfs: split buffered IO write path from xfs_file_aio_write
  xfs: split direct IO write path from xfs_file_aio_write
  xfs: introduce xfs_rw_lock() helpers for locking the inode
  xfs: factor post-write newsize updates
  xfs: factor common post-write isize handling code
  xfs: ensure sync write errors are returned
author: Linus Torvalds <torvalds@linux-foundation.org> 2011-01-14 18:24:17 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2011-01-14 18:24:17 -0500
commit: 7cb3920a6529df7f54487abe973b903b8239e901 (patch)
tree: d36da0fe62adfa3e2e46a485e0bdb06019b2e560 /fs/xfs/linux-2.6
parent: ad56cbf0fa6c09350c738ec59a3361f2e4ab4bc7 (diff)
parent: 73efe4a4ddf8eb2b1cc7039e8a66a23a424961af (diff)
10 files changed, 573 insertions, 247 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 92f1f2acc6ab..ac1c7e8378dd 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -896,7 +896,6 @@ xfs_buf_rele(
        trace_xfs_buf_rele(bp, _RET_IP_);
        if (!pag) {
-                ASSERT(!bp->b_relse);
                ASSERT(list_empty(&bp->b_lru));
                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
                if (atomic_dec_and_test(&bp->b_hold))
@@ -908,11 +907,7 @@ xfs_buf_rele(
        ASSERT(atomic_read(&bp->b_hold) > 0);
        if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-                if (bp->b_relse) {
+                if (!(bp->b_flags & XBF_STALE) &&
-                        atomic_inc(&bp->b_hold);
-                        spin_unlock(&pag->pag_buf_lock);
-                        bp->b_relse(bp);
-                } else if (!(bp->b_flags & XBF_STALE) &&
                           atomic_read(&bp->b_lru_ref)) {
                        xfs_buf_lru_add(bp);
                        spin_unlock(&pag->pag_buf_lock);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a76c2428faff..cbe65950e524 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -152,8 +152,6 @@ typedef struct xfs_buftarg {
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
-typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
 #define XB_PAGES        2
@@ -183,7 +181,6 @@ typedef struct xfs_buf {
        void                    *b_addr;        /* virtual address of buffer */
        struct work_struct      b_iodone_work;
        xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
-        xfs_buf_relse_t         b_relse;        /* releasing function */
        struct completion       b_iowait;       /* queue for I/O waiters */
        void                    *b_fspriv;
        void                    *b_fspriv2;
@@ -323,7 +320,6 @@ void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_FSPRIVATE2(bp, type)            ((type)(bp)->b_fspriv2)
 #define XFS_BUF_SET_FSPRIVATE2(bp, val)         ((bp)->b_fspriv2 = (void*)(val))
 #define XFS_BUF_SET_START(bp)                   do { } while (0)
-#define XFS_BUF_SET_BRELSE_FUNC(bp, func)       ((bp)->b_relse = (func))
 #define XFS_BUF_PTR(bp)                 (xfs_caddr_t)((bp)->b_addr)
 #define XFS_BUF_SET_PTR(bp, val, cnt)   xfs_buf_associate_memory(bp, val, cnt)
@@ -360,8 +356,7 @@ xfs_buf_set_ref(
 static inline void xfs_buf_relse(xfs_buf_t *bp)
 {
-        if (!bp->b_relse)
+        xfs_buf_unlock(bp);
-                xfs_buf_unlock(bp);
        xfs_buf_rele(bp);
 }
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
new file mode 100644
index 000000000000..05201ae719e5
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+STATIC int
+xfs_trim_extents(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          agno,
+        xfs_fsblock_t           start,
+        xfs_fsblock_t           len,
+        xfs_fsblock_t           minlen,
+        __uint64_t              *blocks_trimmed)
+{
+        struct block_device     *bdev = mp->m_ddev_targp->bt_bdev;
+        struct xfs_btree_cur    *cur;
+        struct xfs_buf          *agbp;
+        struct xfs_perag        *pag;
+        int                     error;
+        int                     i;
+        pag = xfs_perag_get(mp, agno);
+        error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+        if (error || !agbp)
+                goto out_put_perag;
+        cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+        /*
+         * Force out the log.  This means any transactions that might have freed
+         * space before we took the AGF buffer lock are now on disk, and the
+         * volatile disk cache is flushed.
+         */
+        xfs_log_force(mp, XFS_LOG_SYNC);
+        /*
+         * Look up the longest extent recorded in the AGF and start with it.
+         */
+        error = xfs_alloc_lookup_le(cur, 0,
+                                    XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+        if (error)
+                goto out_del_cursor;
+        /*
+         * Loop until we are done with all extents that are large
+         * enough to be worth discarding.
+         */
+        while (i) {
+                xfs_agblock_t fbno;
+                xfs_extlen_t flen;
+                error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+                if (error)
+                        goto out_del_cursor;
+                XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+                ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+                /*
+                 * Too small?  Give up.
+                 */
+                if (flen < minlen) {
+                        trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+                        goto out_del_cursor;
+                }
+                /*
+                 * If the extent is entirely outside of the range we are
+                 * supposed to discard, skip it.  Do not bother to trim
+                 * down partially overlapping ranges for now.
+                 */
+                if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+                    XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+                        trace_xfs_discard_exclude(mp, agno, fbno, flen);
+                        goto next_extent;
+                }
+                /*
+                 * If any blocks in the range are still busy, skip the
+                 * discard and try again the next time.
+                 */
+                if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+                        trace_xfs_discard_busy(mp, agno, fbno, flen);
+                        goto next_extent;
+                }
+                trace_xfs_discard_extent(mp, agno, fbno, flen);
+                error = -blkdev_issue_discard(bdev,
+                                XFS_AGB_TO_DADDR(mp, agno, fbno),
+                                XFS_FSB_TO_BB(mp, flen),
+                                GFP_NOFS, 0);
+                if (error)
+                        goto out_del_cursor;
+                *blocks_trimmed += flen;
+next_extent:
+                error = xfs_btree_decrement(cur, 0, &i);
+                if (error)
+                        goto out_del_cursor;
+        }
+out_del_cursor:
+        xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+        xfs_buf_relse(agbp);
+out_put_perag:
+        xfs_perag_put(pag);
+        return error;
+}
+int
+xfs_ioc_trim(
+        struct xfs_mount                *mp,
+        struct fstrim_range __user      *urange)
+{
+        struct request_queue    *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+        unsigned int            granularity = q->limits.discard_granularity;
+        struct fstrim_range     range;
+        xfs_fsblock_t           start, len, minlen;
+        xfs_agnumber_t          start_agno, end_agno, agno;
+        __uint64_t              blocks_trimmed = 0;
+        int                     error, last_error = 0;
+        if (!capable(CAP_SYS_ADMIN))
+                return -XFS_ERROR(EPERM);
+        if (copy_from_user(&range, urange, sizeof(range)))
+                return -XFS_ERROR(EFAULT);
+        /*
+         * Truncating down the len isn't actually quite correct, but using
+         * XFS_B_TO_FSB would mean we trivially get overflows for values
+         * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
+         * used by the fstrim application.  In the end it really doesn't
+         * matter as trimming blocks is an advisory interface.
+         */
+        start = XFS_B_TO_FSBT(mp, range.start);
+        len = XFS_B_TO_FSBT(mp, range.len);
+        minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+        start_agno = XFS_FSB_TO_AGNO(mp, start);
+        if (start_agno >= mp->m_sb.sb_agcount)
+                return -XFS_ERROR(EINVAL);
+        end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+        if (end_agno >= mp->m_sb.sb_agcount)
+                end_agno = mp->m_sb.sb_agcount - 1;
+        for (agno = start_agno; agno <= end_agno; agno++) {
+                error = -xfs_trim_extents(mp, agno, start, len, minlen,
+                                          &blocks_trimmed);
+                if (error)
+                        last_error = error;
+        }
+        if (last_error)
+                return last_error;
+        range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+        if (copy_to_user(urange, &range, sizeof(range)))
+                return -XFS_ERROR(EFAULT);
+        return 0;
+}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
new file mode 100644
index 000000000000..e82b6dd3e127
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_discard.h
@@ -0,0 +1,8 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+struct fstrim_range;
+extern int      xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index ba8ad422a165..ef51eb43e137 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -41,6 +41,40 @@
 static const struct vm_operations_struct xfs_file_vm_ops;
 /*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+        struct xfs_inode        *ip,
+        int                     type)
+{
+        if (type & XFS_IOLOCK_EXCL)
+                mutex_lock(&VFS_I(ip)->i_mutex);
+        xfs_ilock(ip, type);
+}
+static inline void
+xfs_rw_iunlock(
+        struct xfs_inode        *ip,
+        int                     type)
+{
+        xfs_iunlock(ip, type);
+        if (type & XFS_IOLOCK_EXCL)
+                mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+static inline void
+xfs_rw_ilock_demote(
+        struct xfs_inode        *ip,
+        int                     type)
+{
+        xfs_ilock_demote(ip, type);
+        if (type & XFS_IOLOCK_EXCL)
+                mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+/*
 *      xfs_iozero
 *
 *      xfs_iozero clears the specified range of buffer supplied,
@@ -262,22 +296,21 @@ xfs_file_aio_read(
        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;
-        if (unlikely(ioflags & IO_ISDIRECT))
-                mutex_lock(&inode->i_mutex);
-        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (unlikely(ioflags & IO_ISDIRECT)) {
+                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
                if (inode->i_mapping->nrpages) {
                        ret = -xfs_flushinval_pages(ip,
                                        (iocb->ki_pos & PAGE_CACHE_MASK),
                                        -1, FI_REMAPF_LOCKED);
+                        if (ret) {
+                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+                                return ret;
+                        }
                }
-                mutex_unlock(&inode->i_mutex);
+                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-                if (ret) {
+        } else
-                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+                xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-                        return ret;
-                }
-        }
        trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
@@ -285,7 +318,7 @@ xfs_file_aio_read(
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
-        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
@@ -309,7 +342,7 @@ xfs_file_splice_read(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
-        xfs_ilock(ip, XFS_IOLOCK_SHARED);
+        xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
@@ -317,10 +350,61 @@ xfs_file_splice_read(
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
-        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
+STATIC void
+xfs_aio_write_isize_update(
+        struct inode    *inode,
+        loff_t          *ppos,
+        ssize_t         bytes_written)
+{
+        struct xfs_inode        *ip = XFS_I(inode);
+        xfs_fsize_t             isize = i_size_read(inode);
+        if (bytes_written > 0)
+                XFS_STATS_ADD(xs_write_bytes, bytes_written);
+        if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+                                        *ppos > isize))
+                *ppos = isize;
+        if (*ppos > ip->i_size) {
+                xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+                if (*ppos > ip->i_size)
+                        ip->i_size = *ppos;
+                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+        }
+}
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occurred.  In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+        struct xfs_inode        *ip)
+{
+        if (ip->i_new_size) {
+                xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+                ip->i_new_size = 0;
+                if (ip->i_d.di_size > ip->i_size)
+                        ip->i_d.di_size = ip->i_size;
+                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+        }
+}
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * could cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
 STATIC ssize_t
 xfs_file_splice_write(
        struct pipe_inode_info  *pipe,
@@ -331,7 +415,7 @@ xfs_file_splice_write(
 {
        struct inode            *inode = outfilp->f_mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-        xfs_fsize_t             isize, new_size;
+        xfs_fsize_t             new_size;
        int                     ioflags = 0;
        ssize_t                 ret;
@@ -355,27 +439,9 @@ xfs_file_splice_write(
        trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
        ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-        if (ret > 0)
-                XFS_STATS_ADD(xs_write_bytes, ret);
-        isize = i_size_read(inode);
-        if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
-                *ppos = isize;
-        if (*ppos > ip->i_size) {
+        xfs_aio_write_isize_update(inode, ppos, ret);
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_aio_write_newsize_update(ip);
-                if (*ppos > ip->i_size)
-                        ip->i_size = *ppos;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        }
-        if (ip->i_new_size) {
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                ip->i_new_size = 0;
-                if (ip->i_d.di_size > ip->i_size)
-                        ip->i_d.di_size = ip->i_size;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        }
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return ret;
 }
@@ -562,245 +628,258 @@ out_lock:
        return error;
 }
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
 STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_aio_write_checks(
-        struct kiocb            *iocb,
+        struct file             *file,
-        const struct iovec      *iovp,
+        loff_t                  *pos,
-        unsigned long           nr_segs,
+        size_t                  *count,
-        loff_t                  pos)
+        int                     *iolock)
 {
-        struct file             *file = iocb->ki_filp;
+        struct inode            *inode = file->f_mapping->host;
-        struct address_space    *mapping = file->f_mapping;
-        struct inode            *inode = mapping->host;
        struct xfs_inode        *ip = XFS_I(inode);
-        struct xfs_mount        *mp = ip->i_mount;
+        xfs_fsize_t             new_size;
-        ssize_t                 ret = 0, error = 0;
+        int                     error = 0;
-        int                     ioflags = 0;
-        xfs_fsize_t             isize, new_size;
-        int                     iolock;
-        size_t                  ocount = 0, count;
-        int                     need_i_mutex;
-        XFS_STATS_INC(xs_write_calls);
+        error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+        if (error) {
+                xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+                *iolock = 0;
+                return error;
+        }
-        BUG_ON(iocb->ki_pos != pos);
+        new_size = *pos + *count;
+        if (new_size > ip->i_size)
+                ip->i_new_size = new_size;
-        if (unlikely(file->f_flags & O_DIRECT))
+        if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-                ioflags |= IO_ISDIRECT;
+                file_update_time(file);
-        if (file->f_mode & FMODE_NOCMTIME)
-                ioflags |= IO_INVIS;
-        error = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+        /*
+         * If the offset is beyond the size of the file, we need to zero any
+         * blocks that fall between the existing EOF and the start of this
+         * write.
+         */
+        if (*pos > ip->i_size)
+                error = -xfs_zero_eof(ip, *pos, ip->i_size);
+        xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
        if (error)
                return error;
-        count = ocount;
+        /*
-        if (count == 0)
+         * If we're writing the file then make sure to clear the setuid and
-                return 0;
+         * setgid bits if the process is not being run by root.  This keeps
+         * people from modifying setuid and setgid binaries.
-        xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
+         */
+        return file_remove_suid(file);
-        if (XFS_FORCED_SHUTDOWN(mp))
-                return -EIO;
-relock:
-        if (ioflags & IO_ISDIRECT) {
-                iolock = XFS_IOLOCK_SHARED;
-                need_i_mutex = 0;
-        } else {
-                iolock = XFS_IOLOCK_EXCL;
-                need_i_mutex = 1;
-                mutex_lock(&inode->i_mutex);
-        }
-        xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
+}
-start:
+/*
-        error = -generic_write_checks(file, &pos, &count,
+ * xfs_file_dio_aio_write - handle direct IO writes
-                                        S_ISBLK(inode->i_mode));
+ *
-        if (error) {
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
-                xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+ * By separating it from the buffered write path we remove all the tricky to
-                goto out_unlock_mutex;
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+        struct kiocb            *iocb,
+        const struct iovec      *iovp,
+        unsigned long           nr_segs,
+        loff_t                  pos,
+        size_t                  ocount,
+        int                     *iolock)
+{
+        struct file             *file = iocb->ki_filp;
+        struct address_space    *mapping = file->f_mapping;
+        struct inode            *inode = mapping->host;
+        struct xfs_inode        *ip = XFS_I(inode);
+        struct xfs_mount        *mp = ip->i_mount;
+        ssize_t                 ret = 0;
+        size_t                  count = ocount;
+        int                     unaligned_io = 0;
+        struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
+                                        mp->m_rtdev_targp : mp->m_ddev_targp;
+        *iolock = 0;
+        if ((pos & target->bt_smask) || (count & target->bt_smask))
+                return -XFS_ERROR(EINVAL);
+        if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+                unaligned_io = 1;
+        if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+                *iolock = XFS_IOLOCK_EXCL;
+        else
+                *iolock = XFS_IOLOCK_SHARED;
+        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+        ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+        if (ret)
+                return ret;
+        if (mapping->nrpages) {
+                WARN_ON(*iolock != XFS_IOLOCK_EXCL);
+                ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+                                                        FI_REMAPF_LOCKED);
+                if (ret)
+                        return ret;
        }
-        if (ioflags & IO_ISDIRECT) {
+        /*
-                xfs_buftarg_t   *target =
+         * If we are doing unaligned IO, wait for all other IO to drain,
-                        XFS_IS_REALTIME_INODE(ip) ?
+         * otherwise demote the lock if we had to flush cached pages
-                                mp->m_rtdev_targp : mp->m_ddev_targp;
+         */
+        if (unaligned_io)
-                if ((pos & target->bt_smask) || (count & target->bt_smask)) {
+                xfs_ioend_wait(ip);
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
+        else if (*iolock == XFS_IOLOCK_EXCL) {
-                        return XFS_ERROR(-EINVAL);
+                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-                }
+                *iolock = XFS_IOLOCK_SHARED;
-                if (!need_i_mutex && (mapping->nrpages || pos > ip->i_size)) {
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL|iolock);
-                        iolock = XFS_IOLOCK_EXCL;
-                        need_i_mutex = 1;
-                        mutex_lock(&inode->i_mutex);
-                        xfs_ilock(ip, XFS_ILOCK_EXCL|iolock);
-                        goto start;
-                }
        }
-        new_size = pos + count;
+        trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-        if (new_size > ip->i_size)
+        ret = generic_file_direct_write(iocb, iovp,
-                ip->i_new_size = new_size;
+                        &nr_segs, pos, &iocb->ki_pos, count, ocount);
-        if (likely(!(ioflags & IO_INVIS)))
+        /* No fallback to buffered IO on errors for XFS. */
-                file_update_time(file);
+        ASSERT(ret < 0 || ret == count);
+        return ret;
+}
-        /*
+STATIC ssize_t
-         * If the offset is beyond the size of the file, we have a couple
+xfs_file_buffered_aio_write(
-         * of things to do. First, if there is already space allocated
+        struct kiocb            *iocb,
-         * we need to either create holes or zero the disk or ...
+        const struct iovec      *iovp,
-         *
+        unsigned long           nr_segs,
-         * If there is a page where the previous size lands, we need
+        loff_t                  pos,
-         * to zero it out up to the new size.
+        size_t                  ocount,
-         */
+        int                     *iolock)
+{
+        struct file             *file = iocb->ki_filp;
+        struct address_space    *mapping = file->f_mapping;
+        struct inode            *inode = mapping->host;
+        struct xfs_inode        *ip = XFS_I(inode);
+        ssize_t                 ret;
+        int                     enospc = 0;
+        size_t                  count = ocount;
-        if (pos > ip->i_size) {
+        *iolock = XFS_IOLOCK_EXCL;
-                error = xfs_zero_eof(ip, pos, ip->i_size);
+        xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
-                if (error) {
-                        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                        goto out_unlock_internal;
-                }
-        }
-        xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        /*
+        ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
-         * If we're writing the file then make sure to clear the
+        if (ret)
-         * setuid and setgid bits if the process is not being run
+                return ret;
-         * by root.  This keeps people from modifying setuid and
-         * setgid binaries.
-         */
-        error = -file_remove_suid(file);
-        if (unlikely(error))
-                goto out_unlock_internal;
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = mapping->backing_dev_info;
-        if ((ioflags & IO_ISDIRECT)) {
+write_retry:
-                if (mapping->nrpages) {
+        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
-                        WARN_ON(need_i_mutex == 0);
+        ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-                        error = xfs_flushinval_pages(ip,
+                        pos, &iocb->ki_pos, count, ret);
-                                        (pos & PAGE_CACHE_MASK),
+        /*
-                                        -1, FI_REMAPF_LOCKED);
+         * if we just got an ENOSPC, flush the inode now we aren't holding any
-                        if (error)
+         * page locks and retry *once*
-                                goto out_unlock_internal;
+         */
-                }
+        if (ret == -ENOSPC && !enospc) {
+                ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-                if (need_i_mutex) {
+                if (ret)
-                        /* demote the lock now the cached pages are gone */
+                        return ret;
-                        xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+                enospc = 1;
-                        mutex_unlock(&inode->i_mutex);
+                goto write_retry;
+        }
+        current->backing_dev_info = NULL;
+        return ret;
+}
-                        iolock = XFS_IOLOCK_SHARED;
+STATIC ssize_t
-                        need_i_mutex = 0;
+xfs_file_aio_write(
-                }
+        struct kiocb            *iocb,
+        const struct iovec      *iovp,
+        unsigned long           nr_segs,
+        loff_t                  pos)
+{
+        struct file             *file = iocb->ki_filp;
+        struct address_space    *mapping = file->f_mapping;
+        struct inode            *inode = mapping->host;
+        struct xfs_inode        *ip = XFS_I(inode);
+        ssize_t                 ret;
+        int                     iolock;
+        size_t                  ocount = 0;
-                trace_xfs_file_direct_write(ip, count, iocb->ki_pos, ioflags);
+        XFS_STATS_INC(xs_write_calls);
-                ret = generic_file_direct_write(iocb, iovp,
-                                &nr_segs, pos, &iocb->ki_pos, count, ocount);
-                /*
+        BUG_ON(iocb->ki_pos != pos);
-                 * direct-io write to a hole: fall through to buffered I/O
-                 * for completing the rest of the request.
-                 */
-                if (ret >= 0 && ret != count) {
-                        XFS_STATS_ADD(xs_write_bytes, ret);
-                        pos += ret;
+        ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
-                        count -= ret;
+        if (ret)
+                return ret;
-                        ioflags &= ~IO_ISDIRECT;
+        if (ocount == 0)
-                        xfs_iunlock(ip, iolock);
+                return 0;
-                        goto relock;
-                }
-        } else {
-                int enospc = 0;
-                ssize_t ret2 = 0;
-write_retry:
+        xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
-                trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, ioflags);
-                ret2 = generic_file_buffered_write(iocb, iovp, nr_segs,
-                                pos, &iocb->ki_pos, count, ret);
-                /*
-                 * if we just got an ENOSPC, flush the inode now we
-                 * aren't holding any page locks and retry *once*
-                 */
-                if (ret2 == -ENOSPC && !enospc) {
-                        error = xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-                        if (error)
-                                goto out_unlock_internal;
-                        enospc = 1;
-                        goto write_retry;
-                }
-                ret = ret2;
-        }
-        current->backing_dev_info = NULL;
+        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+                return -EIO;
-        isize = i_size_read(inode);
+        if (unlikely(file->f_flags & O_DIRECT))
-        if (unlikely(ret < 0 && ret != -EFAULT && iocb->ki_pos > isize))
+                ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-                iocb->ki_pos = isize;
+                                                ocount, &iolock);
+        else
+                ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+                                                ocount, &iolock);
-        if (iocb->ki_pos > ip->i_size) {
+        xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
-                if (iocb->ki_pos > ip->i_size)
-                        ip->i_size = iocb->ki_pos;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        }
-        error = -ret;
        if (ret <= 0)
-                goto out_unlock_internal;
+                goto out_unlock;
-        XFS_STATS_ADD(xs_write_bytes, ret);
        /* Handle various SYNC-type writes */
        if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
                loff_t end = pos + ret - 1;
-                int error2;
+                int error, error2;
-                xfs_iunlock(ip, iolock);
-                if (need_i_mutex)
-                        mutex_unlock(&inode->i_mutex);
-                error2 = filemap_write_and_wait_range(mapping, pos, end);
+                xfs_rw_iunlock(ip, iolock);
-                if (!error)
+                error = filemap_write_and_wait_range(mapping, pos, end);
-                        error = error2;
+                xfs_rw_ilock(ip, iolock);
-                if (need_i_mutex)
-                        mutex_lock(&inode->i_mutex);
-                xfs_ilock(ip, iolock);
                error2 = -xfs_file_fsync(file,
                                         (file->f_flags & __O_SYNC) ? 0 : 1);
-                if (!error)
+                if (error)
-                        error = error2;
+                        ret = error;
+                else if (error2)
+                        ret = error2;
        }
- out_unlock_internal:
+out_unlock:
-        if (ip->i_new_size) {
+        xfs_aio_write_newsize_update(ip);
-                xfs_ilock(ip, XFS_ILOCK_EXCL);
+        xfs_rw_iunlock(ip, iolock);
-                ip->i_new_size = 0;
+        return ret;
-                /*
-                 * If this was a direct or synchronous I/O that failed (such
-                 * as ENOSPC) then part of the I/O may have been written to
-                 * disk before the error occured.  In this case the on-disk
-                 * file size may have been adjusted beyond the in-memory file
-                 * size and now needs to be truncated back.
-                 */
-                if (ip->i_d.di_size > ip->i_size)
-                        ip->i_d.di_size = ip->i_size;
-                xfs_iunlock(ip, XFS_ILOCK_EXCL);
-        }
-        xfs_iunlock(ip, iolock);
- out_unlock_mutex:
-        if (need_i_mutex)
-                mutex_unlock(&inode->i_mutex);
-        return -error;
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index ad442d9e392e..b06ede1d0bed 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -39,6 +39,7 @@
 #include "xfs_dfrag.h"
 #include "xfs_fsops.h"
 #include "xfs_vnodeops.h"
+#include "xfs_discard.h"
 #include "xfs_quota.h"
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
@@ -1294,6 +1295,8 @@ xfs_file_ioctl(
        trace_xfs_file_ioctl(ip);
        switch (cmd) {
+        case FITRIM:
+                return xfs_ioc_trim(mp, arg);
        case XFS_IOC_ALLOCSP:
        case XFS_IOC_FREESP:
        case XFS_IOC_RESVSP:
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bd07f7339366..9731898083ae 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1414,7 +1414,7 @@ xfs_fs_freeze(
        xfs_save_resvblks(mp);
        xfs_quiesce_attr(mp);
-        return -xfs_fs_log_dummy(mp, SYNC_WAIT);
+        return -xfs_fs_log_dummy(mp);
 }
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index a02480de9759..e22f0057d21f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -362,7 +362,7 @@ xfs_quiesce_data(
        /* mark the log as covered if needed */
        if (xfs_log_need_covered(mp))
-                error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
+                error2 = xfs_fs_log_dummy(mp);
        /* flush data-only devices */
        if (mp->m_rtdev_targp)
@@ -503,13 +503,14 @@ xfs_sync_worker(
        int             error;
        if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-                xfs_log_force(mp, 0);
-                xfs_reclaim_inodes(mp, 0);
                /* dgc: errors ignored here */
-                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
                if (mp->m_super->s_frozen == SB_UNFROZEN &&
                    xfs_log_need_covered(mp))
-                        error = xfs_fs_log_dummy(mp, 0);
+                        error = xfs_fs_log_dummy(mp);
+                else
+                        xfs_log_force(mp, 0);
+                xfs_reclaim_inodes(mp, 0);
+                error = xfs_qm_sync(mp, SYNC_TRYLOCK);
        }
        mp->m_sync_seq++;
        wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7bb5092d6ae4..ee3cee097e7e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -18,6 +18,7 @@
 #include "xfs.h"
 #include <linux/sysctl.h>
 #include <linux/proc_fs.h>
+#include "xfs_error.h"
 static struct ctl_table_header *xfs_table_header;
@@ -51,6 +52,26 @@ xfs_stats_clear_proc_handler(
        return ret;
 }
+STATIC int
+xfs_panic_mask_proc_handler(
+        ctl_table       *ctl,
+        int             write,
+        void            __user *buffer,
+        size_t          *lenp,
+        loff_t          *ppos)
+{
+        int             ret, *valp = ctl->data;
+        ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+        if (!ret && write) {
+                xfs_panic_mask = *valp;
+#ifdef DEBUG
+                xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+        }
+        return ret;
+}
 #endif /* CONFIG_PROC_FS */
 static ctl_table xfs_table[] = {
@@ -77,7 +98,7 @@ static ctl_table xfs_table[] = {
                .data           = &xfs_params.panic_mask.val,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-                .proc_handler   = proc_dointvec_minmax,
+                .proc_handler   = xfs_panic_mask_proc_handler,
                .extra1         = &xfs_params.panic_mask.min,
                .extra2         = &xfs_params.panic_mask.max
        },
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 647af2a2e7aa..2d0bcb479075 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1759,6 +1759,39 @@ DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
 DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+DECLARE_EVENT_CLASS(xfs_discard_class,
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                 xfs_agblock_t agbno, xfs_extlen_t len),
+        TP_ARGS(mp, agno, agbno, len),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(xfs_agnumber_t, agno)
+                __field(xfs_agblock_t, agbno)
+                __field(xfs_extlen_t, len)
+        ),
+        TP_fast_assign(
+                __entry->dev = mp->m_super->s_dev;
+                __entry->agno = agno;
+                __entry->agbno = agbno;
+                __entry->len = len;
+        ),
+        TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->agno,
+                  __entry->agbno,
+                  __entry->len)
+)
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                 xfs_agblock_t agbno, xfs_extlen_t len), \
+        TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
 #endif /* _TRACE_XFS_H */
 #undef TRACE_INCLUDE_PATH
author	Linus Torvalds <torvalds@linux-foundation.org>	2011-01-14 18:24:17 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-01-14 18:24:17 -0500
commit	7cb3920a6529df7f54487abe973b903b8239e901 (patch)
tree	d36da0fe62adfa3e2e46a485e0bdb06019b2e560 /fs/xfs/linux-2.6
parent	ad56cbf0fa6c09350c738ec59a3361f2e4ab4bc7 (diff)
parent	73efe4a4ddf8eb2b1cc7039e8a66a23a424961af (diff)