author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /fs/btrfs/file.c
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c

Diffstat (limited to 'fs/btrfs/file.c')
-rw-r--r--	fs/btrfs/file.c	937
1 file changed, 700 insertions(+), 237 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df082..fa4ef18b66b1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/mpage.h>
+#include <linux/falloc.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
@@ -39,16 +40,274 @@
 #include "locking.h"
 #include "compat.h"
 
+/*
+ * when auto defrag is enabled we
+ * queue up these defrag structs to remember which
+ * inodes need defragging passes
+ */
+struct inode_defrag {
+	struct rb_node rb_node;
+	/* objectid */
+	u64 ino;
+	/*
+	 * transid where the defrag was added, we search for
+	 * extents newer than this
+	 */
+	u64 transid;
+
+	/* root objectid */
+	u64 root;
+
+	/* last offset we were able to defrag */
+	u64 last_offset;
+
+	/* if we've wrapped around back to zero once already */
+	int cycled;
+};
+
+/* pop a record for an inode into the defrag tree.  The lock
+ * must be held already
+ *
+ * If you're inserting a record for an older transid than an
+ * existing record, the transid already in the tree is lowered
+ *
+ * If an existing record is found the defrag item you
+ * pass in is freed
+ */
+static int __btrfs_add_inode_defrag(struct inode *inode,
+				    struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *entry;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+
+	p = &root->fs_info->defrag_inodes.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+		if (defrag->ino < entry->ino)
+			p = &parent->rb_left;
+		else if (defrag->ino > entry->ino)
+			p = &parent->rb_right;
+		else {
+			/* if we're reinserting an entry for
+			 * an old defrag run, make sure to
+			 * lower the transid of our existing record
+			 */
+			if (defrag->transid < entry->transid)
+				entry->transid = defrag->transid;
+			if (defrag->last_offset > entry->last_offset)
+				entry->last_offset = defrag->last_offset;
+			goto exists;
+		}
+	}
+	BTRFS_I(inode)->in_defrag = 1;
+	rb_link_node(&defrag->rb_node, parent, p);
+	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
+	return 0;
+
+exists:
+	kfree(defrag);
+	return 0;
+
+}
+
+/*
+ * insert a defrag record for this inode if auto defrag is
+ * enabled
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct inode_defrag *defrag;
+	int ret = 0;
+	u64 transid;
+
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (btrfs_fs_closing(root->fs_info))
+		return 0;
+
+	if (BTRFS_I(inode)->in_defrag)
+		return 0;
+
+	if (trans)
+		transid = trans->transid;
+	else
+		transid = BTRFS_I(inode)->root->last_trans;
+
+	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+	if (!defrag)
+		return -ENOMEM;
+
+	defrag->ino = btrfs_ino(inode);
+	defrag->transid = transid;
+	defrag->root = root->root_key.objectid;
+
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	if (!BTRFS_I(inode)->in_defrag)
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	return ret;
+}
+
+/*
+ * must be called with the defrag_inodes lock held
+ */
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+					     struct rb_node **next)
+{
+	struct inode_defrag *entry = NULL;
+	struct rb_node *p;
+	struct rb_node *parent = NULL;
+
+	p = info->defrag_inodes.rb_node;
+	while (p) {
+		parent = p;
+		entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+		if (ino < entry->ino)
+			p = parent->rb_left;
+		else if (ino > entry->ino)
+			p = parent->rb_right;
+		else
+			return entry;
+	}
+
+	if (next) {
+		while (parent && ino > entry->ino) {
+			parent = rb_next(parent);
+			entry = rb_entry(parent, struct inode_defrag, rb_node);
+		}
+		*next = parent;
+	}
+	return NULL;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	struct btrfs_root *inode_root;
+	struct inode *inode;
+	struct rb_node *n;
+	struct btrfs_key key;
+	struct btrfs_ioctl_defrag_range_args range;
+	u64 first_ino = 0;
+	int num_defrag;
+	int defrag_batch = 1024;
+
+	memset(&range, 0, sizeof(range));
+	range.len = (u64)-1;
+
+	atomic_inc(&fs_info->defrag_running);
+	spin_lock(&fs_info->defrag_inodes_lock);
+	while(1) {
+		n = NULL;
+
+		/* find an inode to defrag */
+		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+		if (!defrag) {
+			if (n)
+				defrag = rb_entry(n, struct inode_defrag, rb_node);
+			else if (first_ino) {
+				first_ino = 0;
+				continue;
+			} else {
+				break;
+			}
+		}
+
+		/* remove it from the rbtree */
+		first_ino = defrag->ino + 1;
+		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
+
+		if (btrfs_fs_closing(fs_info))
+			goto next_free;
+
+		spin_unlock(&fs_info->defrag_inodes_lock);
+
+		/* get the inode */
+		key.objectid = defrag->root;
+		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+		key.offset = (u64)-1;
+		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+		if (IS_ERR(inode_root))
+			goto next;
+
+		key.objectid = defrag->ino;
+		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+		key.offset = 0;
+
+		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+		if (IS_ERR(inode))
+			goto next;
+
+		/* do a chunk of defrag */
+		BTRFS_I(inode)->in_defrag = 0;
+		range.start = defrag->last_offset;
+		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+					       defrag_batch);
+		/*
+		 * if we filled the whole defrag batch, there
+		 * must be more work to do.  Queue this defrag
+		 * again
+		 */
+		if (num_defrag == defrag_batch) {
+			defrag->last_offset = range.start;
+			__btrfs_add_inode_defrag(inode, defrag);
+			/*
+			 * we don't want to kfree defrag, we added it back to
+			 * the rbtree
+			 */
+			defrag = NULL;
+		} else if (defrag->last_offset && !defrag->cycled) {
+			/*
+			 * we didn't fill our defrag batch, but
+			 * we didn't start at zero.  Make sure we loop
+			 * around to the start of the file.
+			 */
+			defrag->last_offset = 0;
+			defrag->cycled = 1;
+			__btrfs_add_inode_defrag(inode, defrag);
+			defrag = NULL;
+		}
+
+		iput(inode);
+next:
+		spin_lock(&fs_info->defrag_inodes_lock);
+next_free:
+		kfree(defrag);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+
+	atomic_dec(&fs_info->defrag_running);
+
+	/*
+	 * during unmount, we use the transaction_wait queue to
+	 * wait for the defragger to stop
+	 */
+	wake_up(&fs_info->transaction_wait);
+	return 0;
+}
+
 
 /* simple helper to fault in pages and copy. This should go away
  * and be replaced with calls into generic code.
  */
 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
-					 int write_bytes,
+					 size_t write_bytes,
 					 struct page **prepared_pages,
 					 struct iov_iter *i)
 {
-	size_t copied;
+	size_t copied = 0;
+	size_t total_copied = 0;
 	int pg = 0;
 	int offset = pos & (PAGE_CACHE_SIZE - 1);
 
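
Note: __btrfs_add_inode_defrag() above is an insert-or-merge walk over an rbtree keyed by inode number — a duplicate record is never linked in; instead the existing node absorbs the lower transid and the higher last_offset, and the fresh allocation is freed. A minimal userspace sketch of the same policy, with hypothetical names and a plain unbalanced binary tree standing in for the kernel rbtree:

	#include <stdio.h>
	#include <stdlib.h>

	/* toy stand-in for struct inode_defrag; no rebalancing,
	 * unlike rb_insert_color() in the real tree */
	struct defrag {
		unsigned long long ino;
		unsigned long long transid;
		unsigned long long last_offset;
		struct defrag *left, *right;
	};

	/* insert d, or merge it into an existing node for the same ino */
	static void add_defrag(struct defrag **root, struct defrag *d)
	{
		struct defrag **p = root;

		while (*p) {
			if (d->ino < (*p)->ino)
				p = &(*p)->left;
			else if (d->ino > (*p)->ino)
				p = &(*p)->right;
			else {
				/* same rules as the patch: keep the oldest
				 * transid and the furthest defrag progress */
				if (d->transid < (*p)->transid)
					(*p)->transid = d->transid;
				if (d->last_offset > (*p)->last_offset)
					(*p)->last_offset = d->last_offset;
				free(d);
				return;
			}
		}
		d->left = d->right = NULL;
		*p = d;
	}

	static struct defrag *make(unsigned long long ino,
				   unsigned long long transid,
				   unsigned long long off)
	{
		struct defrag *d = calloc(1, sizeof(*d));
		d->ino = ino;
		d->transid = transid;
		d->last_offset = off;
		return d;
	}

	int main(void)
	{
		struct defrag *root = NULL;

		add_defrag(&root, make(257, 10, 0));
		add_defrag(&root, make(257, 7, 4096)); /* merges */
		printf("ino %llu transid %llu last_offset %llu\n",
		       root->ino, root->transid, root->last_offset);
		return 0;
	}
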
@@ -56,23 +315,38 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 		size_t count = min_t(size_t,
 				     PAGE_CACHE_SIZE - offset, write_bytes);
 		struct page *page = prepared_pages[pg];
-again:
-		if (unlikely(iov_iter_fault_in_readable(i, count)))
-			return -EFAULT;
-
-		/* Copy data from userspace to the current page */
-		copied = iov_iter_copy_from_user(page, i, offset, count);
+		/*
+		 * Copy data from userspace to the current page
+		 *
+		 * Disable pagefault to avoid recursive lock since
+		 * the pages are already locked
+		 */
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
+		pagefault_enable();
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
+
+		/*
+		 * if we get a partial write, we can end up with
+		 * partially up to date pages.  These add
+		 * a lot of complexity, so make sure they don't
+		 * happen by forcing this copy to be retried.
+		 *
+		 * The rest of the btrfs_file_write code will fall
+		 * back to page at a time copies after we return 0.
+		 */
+		if (!PageUptodate(page) && copied < count)
+			copied = 0;
+
 		iov_iter_advance(i, copied);
 		write_bytes -= copied;
+		total_copied += copied;
 
-		if (unlikely(copied == 0)) {
-			count = min_t(size_t, PAGE_CACHE_SIZE - offset,
-				      iov_iter_single_seg_count(i));
-			goto again;
-		}
+		/* Return to btrfs_file_aio_write to fault page */
+		if (unlikely(copied == 0))
+			break;
 
 		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
 			offset += copied;
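
Note: the rewritten copy loop follows a fixed pattern — fault the user buffer in before the pages are locked, copy with page faults disabled via iov_iter_copy_from_user_atomic(), and treat a short copy as a signal to retry with a smaller batch rather than faulting while holding page locks. A rough userspace simulation of that control flow; flaky_copy() is an invented stand-in for the atomic copy:

	#include <stdio.h>

	#define PAGE_SIZE 4096

	/* stand-in for iov_iter_copy_from_user_atomic(): copies up to
	 * 'want' bytes but "faults" (returns short) on its first call */
	static size_t flaky_copy(size_t want)
	{
		static int faulted;

		if (!faulted) {
			faulted = 1;
			return want / 2;	/* partial copy */
		}
		return want;
	}

	int main(void)
	{
		size_t remaining = 3 * PAGE_SIZE;
		size_t batch = 2 * PAGE_SIZE;	/* like nrptrs pages at a time */

		while (remaining > 0) {
			size_t want = remaining < batch ? remaining : batch;
			size_t copied = flaky_copy(want);

			if (copied < want)
				batch = PAGE_SIZE; /* fall back to one page */
			if (copied == 0)
				break;	/* caller re-faults pages, retries */
			remaining -= copied;
			printf("copied %zu, %zu left, batch now %zu\n",
			       copied, remaining, batch);
		}
		return 0;
	}
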
@@ -81,18 +355,16 @@ again:
 			offset = 0;
 		}
 	}
-	return 0;
+	return total_copied;
 }
 
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
+void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
-		if (!pages[i])
-			break;
 		/* page checked is some magic around finding pages that
 		 * have been modified without going through btrfs_set_page_dirty
 		 * clear it here
@@ -112,17 +384,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
  * this also makes the decision about creating an inline extent vs
  * doing real data extents, marking pages dirty and delalloc as required.
  */
-static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root,
-					    struct file *file,
-					    struct page **pages,
-					    size_t num_pages,
-					    loff_t pos,
-					    size_t write_bytes)
+int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
+		      struct page **pages, size_t num_pages,
+		      loff_t pos, size_t write_bytes,
+		      struct extent_state **cached)
 {
 	int err = 0;
 	int i;
-	struct inode *inode = fdentry(file)->d_inode;
 	u64 num_bytes;
 	u64 start_pos;
 	u64 end_of_last_block;
@@ -135,8 +403,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 
 	end_of_last_block = start_pos + num_bytes - 1;
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
-					NULL);
-	BUG_ON(err);
+					cached);
+	if (err)
+		return err;
 
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
@@ -144,13 +413,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		ClearPageChecked(p);
 		set_page_dirty(p);
 	}
-	if (end_pos > isize) {
+
+	/*
+	 * we've only changed i_size in ram, and we haven't updated
+	 * the disk i_size.  There is no need to log the inode
+	 * at this time.
+	 */
+	if (end_pos > isize)
 		i_size_write(inode, end_pos);
-	/* we've only changed i_size in ram, and we haven't updated
-	 * the disk i_size.  There is no need to log the inode
-	 * at this time.
-	 */
-	}
 	return 0;
 }
 
@@ -178,9 +448,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 	}
 	while (1) {
 		if (!split)
-			split = alloc_extent_map(GFP_NOFS);
+			split = alloc_extent_map();
 		if (!split2)
-			split2 = alloc_extent_map(GFP_NOFS);
+			split2 = alloc_extent_map();
+		BUG_ON(!split || !split2);
 
 		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
@@ -220,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 
 			split->bdev = em->bdev;
 			split->flags = flags;
+			split->compress_type = em->compress_type;
 			ret = add_extent_mapping(em_tree, split);
 			BUG_ON(ret);
 			free_extent_map(split);
@@ -234,6 +506,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			split->len = em->start + em->len - (start + len);
 			split->bdev = em->bdev;
 			split->flags = flags;
+			split->compress_type = em->compress_type;
 
 			if (compressed) {
 				split->block_len = em->block_len;
@@ -282,6 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key new_key;
+	u64 ino = btrfs_ino(inode);
 	u64 search_start = start;
 	u64 disk_bytenr = 0;
 	u64 num_bytes = 0;
@@ -302,14 +576,14 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 
 	while (1) {
 		recow = 0;
-		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+		ret = btrfs_lookup_file_extent(trans, root, path, ino,
 					       search_start, -1);
 		if (ret < 0)
 			break;
 		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
 			leaf = path->nodes[0];
 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
-			if (key.objectid == inode->i_ino &&
+			if (key.objectid == ino &&
 			    key.type == BTRFS_EXTENT_DATA_KEY)
 				path->slots[0]--;
 		}
@@ -330,7 +604,7 @@ next_slot:
 		}
 
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.objectid > inode->i_ino ||
+		if (key.objectid > ino ||
 		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
 			break;
 
@@ -360,7 +634,7 @@ next_slot:
 
 		search_start = max(key.offset, start);
 		if (recow) {
-			btrfs_release_path(root, path);
+			btrfs_release_path(path);
 			continue;
 		}
 
@@ -377,7 +651,7 @@ next_slot:
 			ret = btrfs_duplicate_item(trans, root, path,
 						   &new_key);
 			if (ret == -EAGAIN) {
-				btrfs_release_path(root, path);
+				btrfs_release_path(path);
 				continue;
 			}
 			if (ret < 0)
@@ -500,7 +774,7 @@ next_slot:
 			del_nr = 0;
 			del_slot = 0;
 
-			btrfs_release_path(root, path);
+			btrfs_release_path(path);
 			continue;
 		}
 
@@ -576,6 +850,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	int del_slot = 0;
 	int recow;
 	int ret;
+	u64 ino = btrfs_ino(inode);
 
 	btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
@@ -584,18 +859,19 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 again:
 	recow = 0;
 	split = start;
-	key.objectid = inode->i_ino;
+	key.objectid = ino;
 	key.type = BTRFS_EXTENT_DATA_KEY;
 	key.offset = split;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
 	if (ret > 0 && path->slots[0] > 0)
 		path->slots[0]--;
 
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-	BUG_ON(key.objectid != inode->i_ino ||
-	       key.type != BTRFS_EXTENT_DATA_KEY);
+	BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
 	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
@@ -612,7 +888,7 @@ again:
 		other_start = 0;
 		other_end = start;
 		if (extent_mergeable(leaf, path->slots[0] - 1,
-				     inode->i_ino, bytenr, orig_offset,
+				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			new_key.offset = end;
 			btrfs_set_item_key_safe(trans, root, path, &new_key);
@@ -635,7 +911,7 @@ again:
 		other_start = end;
 		other_end = 0;
 		if (extent_mergeable(leaf, path->slots[0] + 1,
-				     inode->i_ino, bytenr, orig_offset,
+				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
@@ -663,7 +939,7 @@ again:
 		new_key.offset = split;
 		ret = btrfs_duplicate_item(trans, root, path, &new_key);
 		if (ret == -EAGAIN) {
-			btrfs_release_path(root, path);
+			btrfs_release_path(path);
 			goto again;
 		}
 		BUG_ON(ret < 0);
@@ -684,7 +960,7 @@ again:
 
 		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 					   root->root_key.objectid,
-					   inode->i_ino, orig_offset);
+					   ino, orig_offset);
 		BUG_ON(ret);
 
 		if (split == start) {
@@ -700,10 +976,10 @@ again:
 		other_start = end;
 		other_end = 0;
 		if (extent_mergeable(leaf, path->slots[0] + 1,
-				     inode->i_ino, bytenr, orig_offset,
+				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			if (recow) {
-				btrfs_release_path(root, path);
+				btrfs_release_path(path);
 				goto again;
 			}
 			extent_end = other_end;
@@ -711,16 +987,16 @@ again:
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 						0, root->root_key.objectid,
-						inode->i_ino, orig_offset);
+						ino, orig_offset);
 			BUG_ON(ret);
 		}
 		other_start = 0;
 		other_end = start;
 		if (extent_mergeable(leaf, path->slots[0] - 1,
-				     inode->i_ino, bytenr, orig_offset,
+				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			if (recow) {
-				btrfs_release_path(root, path);
+				btrfs_release_path(path);
 				goto again;
 			}
 			key.offset = other_start;
@@ -728,7 +1004,7 @@ again:
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 						0, root->root_key.objectid,
-						inode->i_ino, orig_offset);
+						ino, orig_offset);
 			BUG_ON(ret);
 		}
 		if (del_nr == 0) {
@@ -755,6 +1031,27 @@ out:
 }
 
 /*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos)
+{
+	int ret = 0;
+
+	if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+		ret = btrfs_readpage(NULL, page);
+		if (ret)
+			return ret;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
+/*
  * this gets pages into the page cache and locks them down, it also properly
  * waits for data=ordered extents to finish before allowing the pages to be
  * modified.
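
Note: prepare_uptodate_page() reads a page before the write only when the write boundary falls inside it and the page is not already up to date; pages that will be fully overwritten are never read. A small standalone demo of the underlying boundary test (this generalizes the kernel's check, which only probes the first and last page of the range):

	#include <stdio.h>

	#define PAGE_SIZE 4096ULL

	/* a page needs a read-before-write only if the write leaves part
	 * of it stale, i.e. the write starts or ends inside the page */
	static int needs_read(unsigned long long pos, unsigned long long len,
			      unsigned long long page_index)
	{
		unsigned long long start = page_index * PAGE_SIZE;
		unsigned long long end = start + PAGE_SIZE;

		if (pos > start && pos < end)		/* begins mid-page */
			return 1;
		if (pos + len > start && pos + len < end) /* ends mid-page */
			return 1;
		return 0;
	}

	int main(void)
	{
		/* write of 10000 bytes at offset 100: pages 0..2 touched */
		unsigned long long pos = 100, len = 10000, i;

		for (i = pos / PAGE_SIZE; i <= (pos + len - 1) / PAGE_SIZE; i++)
			printf("page %llu: %s\n", i,
			       needs_read(pos, len, i) ?
			       "read before write" : "fully overwritten");
		return 0;
	}
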
@@ -769,6 +1066,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
 	int err = 0;
+	int faili = 0;
 	u64 start_pos;
 	u64 last_pos;
 
@@ -776,21 +1074,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
 	if (start_pos > inode->i_size) {
-		err = btrfs_cont_expand(inode, start_pos);
+		err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
 		if (err)
 			return err;
 	}
 
-	memset(pages, 0, num_pages * sizeof(struct page *));
 again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
+			faili = i - 1;
 			err = -ENOMEM;
-			BUG_ON(1);
+			goto fail;
+		}
+
+		if (i == 0)
+			err = prepare_uptodate_page(pages[i], pos);
+		if (i == num_pages - 1)
+			err = prepare_uptodate_page(pages[i],
+						    pos + write_bytes);
+		if (err) {
+			page_cache_release(pages[i]);
+			faili = i - 1;
+			goto fail;
 		}
 		wait_on_page_writeback(pages[i]);
 	}
+	err = 0;
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
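
Note: the new fail: path keeps a cursor (faili) pointing at the last page that was successfully locked, so the error path unlocks and releases exactly the pages acquired so far, in reverse order. The same acquire-all-or-unwind shape as a standalone sketch with an invented acquire() helper:

	#include <stdio.h>
	#include <stdlib.h>

	#define N 8

	/* pretend acquisition that fails on the 6th resource */
	static void *acquire(int i)
	{
		return (i == 5) ? NULL : malloc(16);
	}

	static int acquire_all(void *res[N])
	{
		int i, faili = -1;

		for (i = 0; i < N; i++) {
			res[i] = acquire(i);
			if (!res[i]) {
				faili = i - 1;	/* last slot that succeeded */
				goto fail;
			}
		}
		return 0;
	fail:
		while (faili >= 0) {	/* unwind in reverse order */
			free(res[faili]);
			res[faili] = NULL;
			faili--;
		}
		return -1;
	}

	int main(void)
	{
		void *res[N];

		printf("acquire_all: %d\n", acquire_all(res));
		return 0;
	}
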
@@ -830,199 +1140,264 @@ again:
 		WARN_ON(!PageLocked(pages[i]));
 	}
 	return 0;
+fail:
+	while (faili >= 0) {
+		unlock_page(pages[faili]);
+		page_cache_release(pages[faili]);
+		faili--;
+	}
+	return err;
+
 }
 
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
-				    const struct iovec *iov,
-				    unsigned long nr_segs, loff_t pos)
+static noinline ssize_t __btrfs_buffered_write(struct file *file,
+					       struct iov_iter *i,
+					       loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *pinned[2];
 	struct page **pages = NULL;
-	struct iov_iter i;
-	loff_t *ppos = &iocb->ki_pos;
-	loff_t start_pos;
-	ssize_t num_written = 0;
-	ssize_t err = 0;
-	size_t count;
-	size_t ocount;
-	int ret = 0;
-	int nrptrs;
 	unsigned long first_index;
 	unsigned long last_index;
-	int will_write;
-	int buffered = 0;
+	size_t num_written = 0;
+	int nrptrs;
+	int ret = 0;
 
-	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
-		      (file->f_flags & O_DIRECT));
+	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
+		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+		     (sizeof(struct page *)));
+	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
 
-	pinned[0] = NULL;
-	pinned[1] = NULL;
+	first_index = pos >> PAGE_CACHE_SHIFT;
+	last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
 
-	start_pos = pos;
+	while (iov_iter_count(i) > 0) {
+		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+		size_t write_bytes = min(iov_iter_count(i),
+					 nrptrs * (size_t)PAGE_CACHE_SIZE -
+					 offset);
+		size_t num_pages = (write_bytes + offset +
+				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		size_t dirty_pages;
+		size_t copied;
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+		WARN_ON(num_pages > nrptrs);
 
-	mutex_lock(&inode->i_mutex);
+		/*
+		 * Fault pages before locking them in prepare_pages
+		 * to avoid recursive lock
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
+			ret = -EFAULT;
+			break;
+		}
 
-	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-	if (err)
-		goto out;
-	count = ocount;
+		ret = btrfs_delalloc_reserve_space(inode,
+					num_pages << PAGE_CACHE_SHIFT);
+		if (ret)
+			break;
 
-	current->backing_dev_info = inode->i_mapping->backing_dev_info;
-	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
-	if (err)
-		goto out;
+		/*
+		 * This is going to setup the pages array with the number of
+		 * pages we want, so we don't really need to worry about the
+		 * contents of pages from loop to loop
+		 */
+		ret = prepare_pages(root, file, pages, num_pages,
+				    pos, first_index, last_index,
+				    write_bytes);
+		if (ret) {
+			btrfs_delalloc_release_space(inode,
+					num_pages << PAGE_CACHE_SHIFT);
+			break;
+		}
 
-	if (count == 0)
-		goto out;
+		copied = btrfs_copy_from_user(pos, num_pages,
+					   write_bytes, pages, i);
 
-	err = file_remove_suid(file);
-	if (err)
-		goto out;
+		/*
+		 * if we have trouble faulting in the pages, fall
+		 * back to one page at a time
+		 */
+		if (copied < write_bytes)
+			nrptrs = 1;
 
-	file_update_time(file);
-	BTRFS_I(inode)->sequence++;
+		if (copied == 0)
+			dirty_pages = 0;
+		else
+			dirty_pages = (copied + offset +
+				       PAGE_CACHE_SIZE - 1) >>
+				       PAGE_CACHE_SHIFT;
 
-	if (unlikely(file->f_flags & O_DIRECT)) {
-		num_written = generic_file_direct_write(iocb, iov, &nr_segs,
-							pos, ppos, count,
-							ocount);
 		/*
-		 * the generic O_DIRECT will update in-memory i_size after the
-		 * DIOs are done.  But our endio handlers that update the on
-		 * disk i_size never update past the in memory i_size.  So we
-		 * need one more update here to catch any additions to the
-		 * file
+		 * If we had a short copy we need to release the excess delalloc
+		 * bytes we reserved.  We need to increment outstanding_extents
+		 * because btrfs_delalloc_release_space will decrement it, but
+		 * we still have an outstanding extent for the chunk we actually
+		 * managed to copy.
 		 */
-		if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
-			btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-			mark_inode_dirty(inode);
+		if (num_pages > dirty_pages) {
+			if (copied > 0)
+				atomic_inc(
+					&BTRFS_I(inode)->outstanding_extents);
+			btrfs_delalloc_release_space(inode,
+					(num_pages - dirty_pages) <<
+					PAGE_CACHE_SHIFT);
 		}
 
-		if (num_written < 0) {
-			ret = num_written;
-			num_written = 0;
-			goto out;
-		} else if (num_written == count) {
-			/* pick up pos changes done by the generic code */
-			pos = *ppos;
-			goto out;
+		if (copied > 0) {
+			ret = btrfs_dirty_pages(root, inode, pages,
+						dirty_pages, pos, copied,
+						NULL);
+			if (ret) {
+				btrfs_delalloc_release_space(inode,
+					dirty_pages << PAGE_CACHE_SHIFT);
+				btrfs_drop_pages(pages, num_pages);
+				break;
+			}
 		}
-		/*
-		 * We are going to do buffered for the rest of the range, so we
-		 * need to make sure to invalidate the buffered pages when we're
-		 * done.
-		 */
-		buffered = 1;
-		pos += num_written;
+
+		btrfs_drop_pages(pages, num_pages);
+
+		cond_resched();
+
+		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
+						   dirty_pages);
+		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+			btrfs_btree_balance_dirty(root, 1);
+		btrfs_throttle(root);
+
+		pos += copied;
+		num_written += copied;
 	}
 
-	iov_iter_init(&i, iov, nr_segs, count, num_written);
-	nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
-		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
-		     (sizeof(struct page *)));
-	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
+	kfree(pages);
 
-	/* generic_write_checks can change our pos */
-	start_pos = pos;
+	return num_written ? num_written : ret;
+}
 
-	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
+static ssize_t __btrfs_direct_write(struct kiocb *iocb,
+				    const struct iovec *iov,
+				    unsigned long nr_segs, loff_t pos,
+				    loff_t *ppos, size_t count, size_t ocount)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = fdentry(file)->d_inode;
+	struct iov_iter i;
+	ssize_t written;
+	ssize_t written_buffered;
+	loff_t endbyte;
+	int err;
+
+	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
+					    count, ocount);
 
 	/*
-	 * there are lots of better ways to do this, but this code
-	 * makes sure the first and last page in the file range are
-	 * up to date and ready for cow
+	 * the generic O_DIRECT will update in-memory i_size after the
+	 * DIOs are done.  But our endio handlers that update the on
+	 * disk i_size never update past the in memory i_size.  So we
+	 * need one more update here to catch any additions to the
+	 * file
 	 */
-	if ((pos & (PAGE_CACHE_SIZE - 1))) {
-		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
-		if (!PageUptodate(pinned[0])) {
-			ret = btrfs_readpage(NULL, pinned[0]);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[0]);
-		} else {
-			unlock_page(pinned[0]);
-		}
+	if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+		mark_inode_dirty(inode);
 	}
-	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
-		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
-		if (!PageUptodate(pinned[1])) {
-			ret = btrfs_readpage(NULL, pinned[1]);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[1]);
-		} else {
-			unlock_page(pinned[1]);
-		}
+
+	if (written < 0 || written == count)
+		return written;
+
+	pos += written;
+	count -= written;
+	iov_iter_init(&i, iov, nr_segs, count, written);
+	written_buffered = __btrfs_buffered_write(file, &i, pos);
+	if (written_buffered < 0) {
+		err = written_buffered;
+		goto out;
 	}
+	endbyte = pos + written_buffered - 1;
+	err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+	if (err)
+		goto out;
+	written += written_buffered;
+	*ppos = pos + written_buffered;
+	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
+				 endbyte >> PAGE_CACHE_SHIFT);
+out:
+	return written ? written : err;
+}
 
-	while (iov_iter_count(&i) > 0) {
-		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(iov_iter_count(&i),
-					 nrptrs * (size_t)PAGE_CACHE_SIZE -
-					 offset);
-		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
-					PAGE_CACHE_SHIFT;
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+				    const struct iovec *iov,
+				    unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	loff_t *ppos = &iocb->ki_pos;
+	ssize_t num_written = 0;
+	ssize_t err = 0;
+	size_t count, ocount;
 
-		WARN_ON(num_pages > nrptrs);
-		memset(pages, 0, sizeof(struct page *) * nrptrs);
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 
-		ret = btrfs_delalloc_reserve_space(inode, write_bytes);
-		if (ret)
-			goto out;
+	mutex_lock(&inode->i_mutex);
 
-		ret = prepare_pages(root, file, pages, num_pages,
-				    pos, first_index, last_index,
-				    write_bytes);
-		if (ret) {
-			btrfs_delalloc_release_space(inode, write_bytes);
-			goto out;
-		}
+	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+	if (err) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
+	count = ocount;
 
-		ret = btrfs_copy_from_user(pos, num_pages,
-					   write_bytes, pages, &i);
-		if (ret == 0) {
-			dirty_and_release_pages(NULL, root, file, pages,
-						num_pages, pos, write_bytes);
-		}
+	current->backing_dev_info = inode->i_mapping->backing_dev_info;
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
 
-		btrfs_drop_pages(pages, num_pages);
-		if (ret) {
-			btrfs_delalloc_release_space(inode, write_bytes);
-			goto out;
-		}
+	if (count == 0) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
 
-		if (will_write) {
-			filemap_fdatawrite_range(inode->i_mapping, pos,
-						 pos + write_bytes - 1);
-		} else {
-			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-							   num_pages);
-			if (num_pages <
-			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-				btrfs_btree_balance_dirty(root, 1);
-			btrfs_throttle(root);
-		}
+	err = file_remove_suid(file);
+	if (err) {
+		mutex_unlock(&inode->i_mutex);
+		goto out;
+	}
 
-		pos += write_bytes;
-		num_written += write_bytes;
+	/*
+	 * If BTRFS flips readonly due to some impossible error
+	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
+	 * although we have opened a file as writable, we have
+	 * to stop this write operation to ensure FS consistency.
+	 */
+	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+		mutex_unlock(&inode->i_mutex);
+		err = -EROFS;
+		goto out;
+	}
 
-		cond_resched();
+	file_update_time(file);
+	BTRFS_I(inode)->sequence++;
+
+	if (unlikely(file->f_flags & O_DIRECT)) {
+		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
+						   pos, ppos, count, ocount);
+	} else {
+		struct iov_iter i;
+
+		iov_iter_init(&i, iov, nr_segs, count, num_written);
+
+		num_written = __btrfs_buffered_write(file, &i, pos);
+		if (num_written > 0)
+			*ppos = pos + num_written;
 	}
-out:
-	mutex_unlock(&inode->i_mutex);
-	if (ret)
-		err = ret;
 
-	kfree(pages);
-	if (pinned[0])
-		page_cache_release(pinned[0]);
-	if (pinned[1])
-		page_cache_release(pinned[1]);
-	*ppos = pos;
+	mutex_unlock(&inode->i_mutex);
 
 	/*
 	 * we want to make sure fsync finds this change
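
Note: __btrfs_buffered_write() above reserves delalloc space for num_pages whole pages up front, then, after a possibly short copy, keeps only ceil((copied + offset) / PAGE_SIZE) pages dirty and hands back the reservation for the rest. The rounding in isolation (arithmetic sketch only; the outstanding_extents bookkeeping is btrfs-specific):

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	int main(void)
	{
		unsigned long pos = 5000;	/* write position */
		unsigned long write_bytes = 12000;
		unsigned long copied = 7000;	/* short copy: a fault hit */

		unsigned long offset = pos & (PAGE_SIZE - 1);
		unsigned long num_pages =
			(write_bytes + offset + PAGE_SIZE - 1) / PAGE_SIZE;
		unsigned long dirty_pages = copied ?
			(copied + offset + PAGE_SIZE - 1) / PAGE_SIZE : 0;

		/* prints: reserved 4 pages, dirtied 2, release 2 */
		printf("reserved %lu pages, dirtied %lu, release %lu\n",
		       num_pages, dirty_pages, num_pages - dirty_pages);
		return 0;
	}
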
@@ -1037,36 +1412,12 @@ out:
 	 * one running right now.
 	 */
 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
-
-	if (num_written > 0 && will_write) {
-		struct btrfs_trans_handle *trans;
-
-		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
-		if (err)
+	if (num_written > 0 || num_written == -EIOCBQUEUED) {
+		err = generic_write_sync(file, pos, num_written);
+		if (err < 0 && num_written > 0)
 			num_written = err;
-
-		if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-			trans = btrfs_start_transaction(root, 0);
-			ret = btrfs_log_dentry_safe(trans, root,
-						    file->f_dentry);
-			if (ret == 0) {
-				ret = btrfs_sync_log(trans, root);
-				if (ret == 0)
-					btrfs_end_transaction(trans, root);
-				else
-					btrfs_commit_transaction(trans, root);
-			} else if (ret != BTRFS_NO_LOG_SYNC) {
-				btrfs_commit_transaction(trans, root);
-			} else {
-				btrfs_end_transaction(trans, root);
-			}
-		}
-		if (file->f_flags & O_DIRECT && buffered) {
-			invalidate_mapping_pages(inode->i_mapping,
-			      start_pos >> PAGE_CACHE_SHIFT,
-			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
-		}
 	}
+out:
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
 }
@@ -1109,6 +1460,7 @@ int btrfs_sync_file(struct file *file, int datasync)
 	int ret = 0;
 	struct btrfs_trans_handle *trans;
 
+	trace_btrfs_sync_file(file, datasync);
 
 	/* we wait first, since the writeback may change the inode */
 	root->log_batch++;
@@ -1128,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync)
 	 * the current transaction, we can bail out now without any
 	 * syncing
 	 */
-	mutex_lock(&root->fs_info->trans_mutex);
+	smp_mb();
 	if (BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
-		mutex_unlock(&root->fs_info->trans_mutex);
 		goto out;
 	}
-	mutex_unlock(&root->fs_info->trans_mutex);
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
@@ -1202,6 +1552,118 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 	return 0;
 }
 
+static long btrfs_fallocate(struct file *file, int mode,
+			    loff_t offset, loff_t len)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct extent_state *cached_state = NULL;
+	u64 cur_offset;
+	u64 last_byte;
+	u64 alloc_start;
+	u64 alloc_end;
+	u64 alloc_hint = 0;
+	u64 locked_end;
+	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
+	struct extent_map *em;
+	int ret;
+
+	alloc_start = offset & ~mask;
+	alloc_end = (offset + len + mask) & ~mask;
+
+	/* We only support the FALLOC_FL_KEEP_SIZE mode */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
+	mutex_lock(&inode->i_mutex);
+	ret = inode_newsize_ok(inode, alloc_end);
+	if (ret)
+		goto out;
+
+	if (alloc_start > inode->i_size) {
+		ret = btrfs_cont_expand(inode, i_size_read(inode),
+					alloc_start);
+		if (ret)
+			goto out;
+	}
+
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+	if (ret)
+		goto out;
+
+	locked_end = alloc_end - 1;
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		/* the extent lock is ordered inside the running
+		 * transaction
+		 */
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
+				 locked_end, 0, &cached_state, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    alloc_end - 1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > alloc_start &&
+		    ordered->file_offset < alloc_end) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     alloc_start, locked_end,
+					     &cached_state, GFP_NOFS);
+			/*
+			 * we can't wait on the range with the transaction
+			 * running or with the extent lock held
+			 */
+			btrfs_wait_ordered_range(inode, alloc_start,
+						 alloc_end - alloc_start);
+		} else {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+	}
+
+	cur_offset = alloc_start;
+	while (1) {
+		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
+				      alloc_end - cur_offset, 0);
+		BUG_ON(IS_ERR_OR_NULL(em));
+		last_byte = min(extent_map_end(em), alloc_end);
+		last_byte = (last_byte + mask) & ~mask;
+		if (em->block_start == EXTENT_MAP_HOLE ||
+		    (cur_offset >= inode->i_size &&
+		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
+							last_byte - cur_offset,
+							1 << inode->i_blkbits,
+							offset + len,
+							&alloc_hint);
+			if (ret < 0) {
+				free_extent_map(em);
+				break;
+			}
+		}
+		free_extent_map(em);
+
+		cur_offset = last_byte;
+		if (cur_offset >= alloc_end) {
+			ret = 0;
+			break;
+		}
+	}
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
+			     &cached_state, GFP_NOFS);
+
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+out:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+
 const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
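
Note: btrfs_fallocate() rounds the requested range outward to sector boundaries with mask arithmetic that assumes sectorsize is a power of two — the start is rounded down, the end rounded up. The same arithmetic as a tiny demo:

	#include <stdio.h>

	int main(void)
	{
		unsigned long long sectorsize = 4096; /* power of two */
		unsigned long long mask = sectorsize - 1;
		unsigned long long offset = 5000, len = 3000;

		unsigned long long alloc_start = offset & ~mask; /* down */
		unsigned long long alloc_end =
			(offset + len + mask) & ~mask;		 /* up */

		/* prints: [5000, 8000) -> [4096, 8192) */
		printf("[%llu, %llu) -> [%llu, %llu)\n",
		       offset, offset + len, alloc_start, alloc_end);
		return 0;
	}
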
@@ -1213,6 +1675,7 @@ const struct file_operations btrfs_file_operations = {
 	.open		= generic_file_open,
 	.release	= btrfs_release_file,
 	.fsync		= btrfs_sync_file,
+	.fallocate	= btrfs_fallocate,
 	.unlocked_ioctl	= btrfs_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,