8 files changed, 288 insertions, 7 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3af4cfb5654c..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
         */
        struct list_head delalloc_inodes;
+        /*
+         * list for tracking inodes that must be sent to disk before a
+         * rename or truncate commit
+         */
+        struct list_head ordered_operations;
        /* the space_info for where this inode's data allocations are done */
        struct btrfs_space_info *space_info;
@@ -122,6 +128,18 @@ struct btrfs_inode {
         */
        u64 last_unlink_trans;
+        /*
+         * ordered_data_close is set by truncate when a file that used
+         * to have good data has been truncated to zero.  When it is set
+         * the btrfs file release call will add this inode to the
+         * ordered operations list so that we make sure to flush out any
+         * new data the application may have written before commit.
+         *
+         * yes, it's silly to have a single bitflag, but we might grow more
+         * of these.
+         */
+        unsigned ordered_data_close:1;
        struct inode vfs_inode;
 };
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2737facbd341..f48905ee5240 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 #define BTRFS_MAX_LEVEL 8
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -727,6 +734,15 @@ struct btrfs_fs_info {
        struct mutex volume_mutex;
        struct mutex tree_reloc_mutex;
+        /*
+         * this protects the ordered operations list only while we are
+         * processing all of the entries on it.  This way we make
+         * sure the commit code doesn't find the list temporarily empty
+         * because another function happens to be doing non-waiting preflush
+         * before jumping into the main commit.
+         */
+        struct mutex ordered_operations_mutex;
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
@@ -741,10 +757,29 @@ struct btrfs_fs_info {
         * ordered extents
         */
        spinlock_t ordered_extent_lock;
+        /*
+         * all of the data=ordered extents pending writeback
+         * these can span multiple transactions and basically include
+         * every dirty data page that isn't from nodatacow
+         */
        struct list_head ordered_extents;
+        /*
+         * all of the inodes that have delalloc bytes.  It is possible for
+         * this list to be empty even when there is still dirty data=ordered
+         * extents waiting to finish IO.
+         */
        struct list_head delalloc_inodes;
        /*
+         * special rename and truncate targets that must be on disk before
+         * we're allowed to commit.  This is basically the ext3 style
+         * data=ordered list.
+         */
+        struct list_head ordered_operations;
+        /*
         * there is a pool of worker threads for checksumming during writes
         * and a pool for checksumming after reads.  This is because readers
         * can run with FS locks held, and the writers may be waiting for
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9244cd7313d4..1747dfd18654 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1572,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->hashers);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+        INIT_LIST_HEAD(&fs_info->ordered_operations);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->new_trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
@@ -1643,6 +1644,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        insert_inode_hash(fs_info->btree_inode);
        mutex_init(&fs_info->trans_mutex);
+        mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->drop_mutex);
        mutex_init(&fs_info->pinned_mutex);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32d10a617613..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1161,6 +1161,20 @@ out_nolock:
                page_cache_release(pinned[1]);
        *ppos = pos;
+        /*
+         * we want to make sure fsync finds this change
+         * but we haven't joined a transaction running right now.
+         *
+         * Later on, someone is sure to update the inode and get the
+         * real transid recorded.
+         *
+         * We set last_trans now to the fs_info generation + 1,
+         * this will either be one more than the running transaction
+         * or the generation used for the next transaction if there isn't
+         * one running right now.
+         */
+        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        if (num_written > 0 && will_write) {
                struct btrfs_trans_handle *trans;
@@ -1194,6 +1208,18 @@ out_nolock:
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+        /*
+         * ordered_data_close is set by setattr when we are about to truncate
+         * a file from a non-zero size to a zero size.  This tries to
+         * flush down new bytes that may have been written if the
+         * application were using truncate to replace a file in place.
+         */
+        if (BTRFS_I(inode)->ordered_data_close) {
+                BTRFS_I(inode)->ordered_data_close = 0;
+                btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+                        filemap_flush(inode->i_mapping);
+        }
        if (filp->private_data)
                btrfs_ioctl_trans_end(filp);
        return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index bffd79faffb5..1cff528d5b51 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2907,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
        if (err)
                return err;
-        if (S_ISREG(inode->i_mode) &&
+        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-            attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
+                if (attr->ia_size > inode->i_size) {
-                err = btrfs_cont_expand(inode, attr->ia_size);
+                        err = btrfs_cont_expand(inode, attr->ia_size);
-                if (err)
+                        if (err)
-                        return err;
+                                return err;
+                } else if (inode->i_size > 0 &&
+                           attr->ia_size == 0) {
+                        /* we're truncating a file that used to have good
+                         * data down to zero.  Make sure it gets into
+                         * the ordered flush list so that any new writes
+                         * get down to disk quickly.
+                         */
+                        BTRFS_I(inode)->ordered_data_close = 1;
+                }
        }
        err = inode_setattr(inode, attr);
@@ -3050,6 +3060,7 @@ static noinline void init_btrfs_i(struct inode *inode)
        extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                             inode->i_mapping, GFP_NOFS);
        INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+        INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
        btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
        mutex_init(&BTRFS_I(inode)->extent_mutex);
        mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -4419,6 +4430,8 @@ again:
        }
        ClearPageChecked(page);
        set_page_dirty(page);
+        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 out_unlock:
@@ -4444,6 +4457,27 @@ static void btrfs_truncate(struct inode *inode)
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        trans = btrfs_start_transaction(root, 1);
+        /*
+         * setattr is responsible for setting the ordered_data_close flag,
+         * but that is only tested during the last file release.  That
+         * could happen well after the next commit, leaving a great big
+         * window where new writes may get lost if someone chooses to write
+         * to this file after truncating to zero
+         *
+         * The inode doesn't have any dirty data here, and so if we commit
+         * this is a noop.  If someone immediately starts writing to the inode
+         * it is very likely we'll catch some of their writes in this
+         * transaction, and the commit will find this file on the ordered
+         * data list with good things to send down.
+         *
+         * This is a best effort solution, there is still a window where
+         * using truncate to replace the contents of the file will
+         * end up with a zero length file after a crash.
+         */
+        if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+                btrfs_add_ordered_operation(trans, root, inode);
        btrfs_set_trans_block_group(trans, inode);
        btrfs_i_size_write(inode, inode->i_size);
@@ -4520,12 +4554,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->i_acl = BTRFS_ACL_NOT_CACHED;
        ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
        INIT_LIST_HEAD(&ei->i_orphan);
+        INIT_LIST_HEAD(&ei->ordered_operations);
        return &ei->vfs_inode;
 }
 void btrfs_destroy_inode(struct inode *inode)
 {
        struct btrfs_ordered_extent *ordered;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
@@ -4536,13 +4573,24 @@ void btrfs_destroy_inode(struct inode *inode)
            BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
                posix_acl_release(BTRFS_I(inode)->i_default_acl);
-        spin_lock(&BTRFS_I(inode)->root->list_lock);
+        /*
+         * Make sure we're properly removed from the ordered operation
+         * lists.
+         */
+        smp_mb();
+        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+                spin_lock(&root->fs_info->ordered_extent_lock);
+                list_del_init(&BTRFS_I(inode)->ordered_operations);
+                spin_unlock(&root->fs_info->ordered_extent_lock);
+        }
+        spin_lock(&root->list_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
                       " list\n", inode->i_ino);
                dump_stack();
        }
-        spin_unlock(&BTRFS_I(inode)->root->list_lock);
+        spin_unlock(&root->list_lock);
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4667,9 +4715,28 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (ret)
                goto out_unlock;
+        /*
+         * we're using rename to replace one file with another,
+         * and the replacement file is large.  Start IO on it now so
+         * we don't add too much work to the end of the transaction
+         */
+        if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+            new_inode->i_size &&
+            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+                filemap_flush(old_inode->i_mapping);
        trans = btrfs_start_transaction(root, 1);
        /*
+         * make sure the inode gets flushed if it is replacing
+         * something.
+         */
+        if (new_inode && new_inode->i_size &&
+            old_inode && S_ISREG(old_inode->i_mode)) {
+                btrfs_add_ordered_operation(trans, root, old_inode);
+        }
+        /*
         * this is an ugly little race, but the rename is required to make
         * sure that if we crash, the inode is either at the old name
         * or the new one.  pinning the log transaction lets us make sure
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
+        /*
+         * we have no more ordered extents for this inode and
+         * no dirty pages.  We can safely remove it from the
+         * list of ordered operations
+         */
+        if (RB_EMPTY_ROOT(&tree->tree) &&
+            !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+                list_del_init(&BTRFS_I(inode)->ordered_operations);
+        }
        spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 }
 /*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list.  These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io.  When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+        struct btrfs_inode *btrfs_inode;
+        struct inode *inode;
+        struct list_head splice;  /* private list the pending inodes move to */
+        INIT_LIST_HEAD(&splice);
+        mutex_lock(&root->fs_info->ordered_operations_mutex);
+        spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+        list_splice_init(&root->fs_info->ordered_operations, &splice);
+        while (!list_empty(&splice)) {
+                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                   ordered_operations);
+                inode = &btrfs_inode->vfs_inode;
+                list_del_init(&btrfs_inode->ordered_operations);
+                /*
+                 * the inode may be getting freed (in sys_unlink path).
+                 */
+                inode = igrab(inode);  /* NULL result: inode is skipped */
+                if (!wait && inode) {  /* flush-only pass: keep it queued */
+                        list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                              &root->fs_info->ordered_operations);
+                }
+                spin_unlock(&root->fs_info->ordered_extent_lock);
+                if (inode) {
+                        if (wait)  /* wait for all the io */
+                                btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                        else  /* only start the io */
+                                filemap_flush(inode->i_mapping);
+                        iput(inode);  /* pairs with igrab() above */
+                }
+                cond_resched();
+                spin_lock(&root->fs_info->ordered_extent_lock);
+        }
+        if (wait && !list_empty(&root->fs_info->ordered_operations))
+                goto again;  /* list is non-empty again: run another pass */
+        spin_unlock(&root->fs_info->ordered_extent_lock);
+        mutex_unlock(&root->fs_info->ordered_operations_mutex);
+        return 0;  /* no error is ever reported to the caller */
+}
+/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
        return ret;
 }
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct inode *inode)
+{
+        u64 last_mod;  /* newer of the inode's generation and last_trans */
+        last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+        /*
+         * if this file hasn't been changed since the last transaction
+         * commit, we can safely return without doing anything
+         */
+        if (last_mod < root->fs_info->last_trans_committed)
+                return 0;
+        /*
+         * the running transaction is already blocked.  Skip the list and
+         * wait on the whole file's ordered range right here instead
+         */
+        if (trans && root->fs_info->running_transaction->blocked) {
+                btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                return 0;
+        }
+        spin_lock(&root->fs_info->ordered_extent_lock);
+        if (list_empty(&BTRFS_I(inode)->ordered_operations)) {  /* not queued */
+                list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                              &root->fs_info->ordered_operations);
+        }
+        spin_unlock(&root->fs_info->ordered_extent_lock);
+        return 0;  /* every path returns 0 */
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
                           loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9c8f158dd2db..664782c6a2df 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -975,6 +975,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        int should_grow = 0;
        unsigned long now = get_seconds();
+        btrfs_run_ordered_operations(root, 0);
        /* make a pass through all the delayed refs we have so far
         * any runnings procs may add more while we are here
         */
@@ -1056,6 +1058,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        BUG_ON(ret);
                }
+                /*
+                 * rename doesn't use btrfs_join_transaction, so, once we
+                 * set the transaction to blocked above, we aren't going
+                 * to get any new ordered operations.  We can safely run
+                 * it here and know for sure that nothing new will be added
+                 * to the list
+                 */
+                btrfs_run_ordered_operations(root, 1);
                smp_mb();
                if (cur_trans->num_writers > 1 || should_grow)
                        schedule_timeout(timeout);