Merge tag 'nfs-rdma-for-4.1-1' of git://git.linux-nfs.org/projects/anna/nfs-rdma

NFS: NFSoRDMA Client Changes

This patch series creates an operation vector for each of the different memory registration modes. This should make it easier to one day increase credit limit, rsize, and wsize.

Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
author: Trond Myklebust <trond.myklebust@primarydata.com> 2015-04-23 15:16:37 -0400
committer: Trond Myklebust <trond.myklebust@primarydata.com> 2015-04-23 15:16:37 -0400
commit: f139b6c676c7e49b66016b28bf3f8ec5c54be891 (patch)
tree: 742f00e431dded1daf642b44f4c199b318f255dc /fs
parent: 21330b667070fd64b2340d8d31c1b0800df78ec8 (diff)
parent: d654788e98f74f2df8dfc6079fa314938f739486 (diff)
20 files changed, 439 insertions, 85 deletions
diff --git a/fs/affs/file.c b/fs/affs/file.c
index d2468bf95669..a91795e01a7f 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -699,8 +699,10 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
        boff = tmp % bsize;
        if (boff) {
                bh = affs_bread_ino(inode, bidx, 0);
-                if (IS_ERR(bh))
+                if (IS_ERR(bh)) {
-                        return PTR_ERR(bh);
+                        written = PTR_ERR(bh);
+                        goto err_first_bh;
+                }
                tmp = min(bsize - boff, to - from);
                BUG_ON(boff + tmp > bsize || tmp > bsize);
                memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
@@ -712,14 +714,16 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
                bidx++;
        } else if (bidx) {
                bh = affs_bread_ino(inode, bidx - 1, 0);
-                if (IS_ERR(bh))
+                if (IS_ERR(bh)) {
-                        return PTR_ERR(bh);
+                        written = PTR_ERR(bh);
+                        goto err_first_bh;
+                }
        }
        while (from + bsize <= to) {
                prev_bh = bh;
                bh = affs_getemptyblk_ino(inode, bidx);
                if (IS_ERR(bh))
-                        goto out;
+                        goto err_bh;
                memcpy(AFFS_DATA(bh), data + from, bsize);
                if (buffer_new(bh)) {
                        AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
@@ -751,7 +755,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
                prev_bh = bh;
                bh = affs_bread_ino(inode, bidx, 1);
                if (IS_ERR(bh))
-                        goto out;
+                        goto err_bh;
                tmp = min(bsize, to - from);
                BUG_ON(tmp > bsize);
                memcpy(AFFS_DATA(bh), data + from, tmp);
@@ -790,12 +794,13 @@ done:
        if (tmp > inode->i_size)
                inode->i_size = AFFS_I(inode)->mmu_private = tmp;
+err_first_bh:
        unlock_page(page);
        page_cache_release(page);
        return written;
-out:
+err_bh:
        bh = prev_bh;
        if (!written)
                written = PTR_ERR(bh);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84c3b00f3de8..f9c89cae39ee 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3387,6 +3387,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root);
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root);
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
@@ -3909,6 +3911,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
                                    loff_t actual_len, u64 *alloc_hint);
 int btrfs_inode_check_errors(struct inode *inode);
 extern const struct dentry_operations btrfs_dentry_operations;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
+#endif
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f79f38542a73..639f2663ed3f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3921,7 +3921,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
        }
        if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
                        + sizeof(struct btrfs_chunk)) {
-                printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+                printk(KERN_ERR "BTRFS: system chunk array too small %u < %zu\n",
                                btrfs_super_sys_array_size(sb),
                                sizeof(struct btrfs_disk_key)
                                + sizeof(struct btrfs_chunk));
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6f080451fcb1..8b353ad02f03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3325,6 +3325,32 @@ out:
        return ret;
 }
+int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root)
+{
+        struct btrfs_block_group_cache *cache, *tmp;
+        struct btrfs_transaction *cur_trans = trans->transaction;
+        struct btrfs_path *path;
+        if (list_empty(&cur_trans->dirty_bgs) ||
+            !btrfs_test_opt(root, SPACE_CACHE))
+                return 0;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        /* Could add new block groups, use _safe just in case */
+        list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
+                                 dirty_list) {
+                if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+                        cache_save_setup(cache, trans, path);
+        }
+        btrfs_free_path(path);
+        return 0;
+}
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root)
 {
@@ -5110,7 +5136,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        num_bytes = ALIGN(num_bytes, root->sectorsize);
        spin_lock(&BTRFS_I(inode)->lock);
-        BTRFS_I(inode)->outstanding_extents++;
+        nr_extents = (unsigned)div64_u64(num_bytes +
+                                         BTRFS_MAX_EXTENT_SIZE - 1,
+                                         BTRFS_MAX_EXTENT_SIZE);
+        BTRFS_I(inode)->outstanding_extents += nr_extents;
+        nr_extents = 0;
        if (BTRFS_I(inode)->outstanding_extents >
            BTRFS_I(inode)->reserved_extents)
@@ -5255,6 +5285,9 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
+        if (btrfs_test_is_dummy_root(root))
+                return;
        trace_btrfs_space_reservation(root->fs_info, "delalloc",
                                      btrfs_ino(inode), to_free, 0);
        if (root->fs_info->quota_enabled) {
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c7233ff1d533..d688cfe5d496 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4968,6 +4968,12 @@ static int release_extent_buffer(struct extent_buffer *eb)
                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_page(eb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+                if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
+                        __free_extent_buffer(eb);
+                        return 1;
+                }
+#endif
                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
                return 1;
        }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index da828cf5e8f8..d2e732d7af52 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -108,6 +108,13 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 static int btrfs_dirty_inode(struct inode *inode);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode)
+{
+        BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+}
+#endif
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
                                     struct inode *inode,  struct inode *dir,
                                     const struct qstr *qstr)
@@ -1542,30 +1549,17 @@ static void btrfs_split_extent_hook(struct inode *inode,
                u64 new_size;
                /*
-                 * We need the largest size of the remaining extent to see if we
+                 * See the explanation in btrfs_merge_extent_hook, the same
-                 * need to add a new outstanding extent.  Think of the following
+                 * applies here, just in reverse.
-                 * case
-                 *
-                 * [MEAX_EXTENT_SIZEx2 - 4k][4k]
-                 *
-                 * The new_size would just be 4k and we'd think we had enough
-                 * outstanding extents for this if we only took one side of the
-                 * split, same goes for the other direction.  We need to see if
-                 * the larger size still is the same amount of extents as the
-                 * original size, because if it is we need to add a new
-                 * outstanding extent.  But if we split up and the larger size
-                 * is less than the original then we are good to go since we've
-                 * already accounted for the extra extent in our original
-                 * accounting.
                 */
                new_size = orig->end - split + 1;
-                if ((split - orig->start) > new_size)
+                num_extents = div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-                        new_size = split - orig->start;
-                num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
                                        BTRFS_MAX_EXTENT_SIZE);
-                if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                new_size = split - orig->start;
-                              BTRFS_MAX_EXTENT_SIZE) < num_extents)
+                num_extents += div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                        BTRFS_MAX_EXTENT_SIZE);
+                if (div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+                              BTRFS_MAX_EXTENT_SIZE) >= num_extents)
                        return;
        }
@@ -1591,8 +1585,10 @@ static void btrfs_merge_extent_hook(struct inode *inode,
        if (!(other->state & EXTENT_DELALLOC))
                return;
-        old_size = other->end - other->start + 1;
+        if (new->start > other->start)
-        new_size = old_size + (new->end - new->start + 1);
+                new_size = new->end - other->start + 1;
+        else
+                new_size = other->end - new->start + 1;
        /* we're not bigger than the max, unreserve the space and go */
        if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
@@ -1603,13 +1599,32 @@ static void btrfs_merge_extent_hook(struct inode *inode,
        }
        /*
-         * If we grew by another max_extent, just return, we want to keep that
+         * We have to add up either side to figure out how many extents were
-         * reserved amount.
+         * accounted for before we merged into one big extent.  If the number of
+         * extents we accounted for is <= the amount we need for the new range
+         * then we can return, otherwise drop.  Think of it like this
+         *
+         * [ 4k][MAX_SIZE]
+         *
+         * So we've grown the extent by a MAX_SIZE extent, this would mean we
+         * need 2 outstanding extents, on one side we have 1 and the other side
+         * we have 1 so they are == and we can return.  But in this case
+         *
+         * [MAX_SIZE+4k][MAX_SIZE+4k]
+         *
+         * Each range on their own accounts for 2 extents, but merged together
+         * they are only 3 extents worth of accounting, so we need to drop in
+         * this case.
         */
+        old_size = other->end - other->start + 1;
        num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
                                BTRFS_MAX_EXTENT_SIZE);
+        old_size = new->end - new->start + 1;
+        num_extents += div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+                                 BTRFS_MAX_EXTENT_SIZE);
        if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
-                      BTRFS_MAX_EXTENT_SIZE) > num_extents)
+                      BTRFS_MAX_EXTENT_SIZE) >= num_extents)
                return;
        spin_lock(&BTRFS_I(inode)->lock);
@@ -1686,6 +1701,10 @@ static void btrfs_set_bit_hook(struct inode *inode,
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
+                /* For sanity tests */
+                if (btrfs_test_is_dummy_root(root))
+                        return;
                __percpu_counter_add(&root->fs_info->delalloc_bytes, len,
                                     root->fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
@@ -1741,6 +1760,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
                    root != root->fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len);
+                /* For sanity tests. */
+                if (btrfs_test_is_dummy_root(root))
+                        return;
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
                        btrfs_free_reserved_data_space(inode, len);
@@ -7213,7 +7236,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
-        u64 orig_len = len;
+        u64 *outstanding_extents = NULL;
        int unlock_bits = EXTENT_LOCKED;
        int ret = 0;
@@ -7225,6 +7248,16 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        lockstart = start;
        lockend = start + len - 1;
+        if (current->journal_info) {
+                /*
+                 * Need to pull our outstanding extents and set journal_info to NULL so
+                 * that anything that needs to check if there's a transction doesn't get
+                 * confused.
+                 */
+                outstanding_extents = current->journal_info;
+                current->journal_info = NULL;
+        }
        /*
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered.
@@ -7348,11 +7381,20 @@ unlock:
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
-                if (len < orig_len) {
+                /*
+                 * If we have an outstanding_extents count still set then we're
+                 * within our reservation, otherwise we need to adjust our inode
+                 * counter appropriately.
+                 */
+                if (*outstanding_extents) {
+                        (*outstanding_extents)--;
+                } else {
                        spin_lock(&BTRFS_I(inode)->lock);
                        BTRFS_I(inode)->outstanding_extents++;
                        spin_unlock(&BTRFS_I(inode)->lock);
                }
+                current->journal_info = outstanding_extents;
                btrfs_free_reserved_data_space(inode, len);
        }
@@ -7376,6 +7418,8 @@ unlock:
 unlock_err:
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+        if (outstanding_extents)
+                current->journal_info = outstanding_extents;
        return ret;
 }
@@ -8075,6 +8119,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
+        u64 outstanding_extents = 0;
        size_t count = 0;
        int flags = 0;
        bool wakeup = true;
@@ -8112,6 +8157,16 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                ret = btrfs_delalloc_reserve_space(inode, count);
                if (ret)
                        goto out;
+                outstanding_extents = div64_u64(count +
+                                                BTRFS_MAX_EXTENT_SIZE - 1,
+                                                BTRFS_MAX_EXTENT_SIZE);
+                /*
+                 * We need to know how many extents we reserved so that we can
+                 * do the accounting properly if we go over the number we
+                 * originally calculated.  Abuse current->journal_info for this.
+                 */
+                current->journal_info = &outstanding_extents;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
                inode_dio_done(inode);
@@ -8124,6 +8179,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        iter, offset, btrfs_get_blocks_direct, NULL,
                        btrfs_submit_direct, flags);
        if (rw & WRITE) {
+                current->journal_info = NULL;
                if (ret < 0 && ret != -EIOCBQUEUED)
                        btrfs_delalloc_release_space(inode, count);
                else if (ret >= 0 && (size_t)ret < count)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 97159a8e91d4..058c79eecbfb 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1259,7 +1259,7 @@ static int comp_oper(struct btrfs_qgroup_operation *oper1,
        if (oper1->seq < oper2->seq)
                return -1;
        if (oper1->seq > oper2->seq)
-                return -1;
+                return 1;
        if (oper1->ref_root < oper2->ref_root)
                return -1;
        if (oper1->ref_root > oper2->ref_root)
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index a116b55ce788..054fc0d97131 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -911,6 +911,197 @@ out:
        return ret;
 }
+static int test_extent_accounting(void)
+{
+        struct inode *inode = NULL;
+        struct btrfs_root *root = NULL;
+        int ret = -ENOMEM;
+        inode = btrfs_new_test_inode();
+        if (!inode) {
+                test_msg("Couldn't allocate inode\n");
+                return ret;
+        }
+        root = btrfs_alloc_dummy_root();
+        if (IS_ERR(root)) {
+                test_msg("Couldn't allocate root\n");
+                goto out;
+        }
+        root->fs_info = btrfs_alloc_dummy_fs_info();
+        if (!root->fs_info) {
+                test_msg("Couldn't allocate dummy fs info\n");
+                goto out;
+        }
+        BTRFS_I(inode)->root = root;
+        btrfs_test_inode_set_ops(inode);
+        /* [BTRFS_MAX_EXTENT_SIZE] */
+        BTRFS_I(inode)->outstanding_extents++;
+        ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1,
+                                        NULL);
+        if (ret) {
+                test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 1) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 1, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /* [BTRFS_MAX_EXTENT_SIZE][4k] */
+        BTRFS_I(inode)->outstanding_extents++;
+        ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE,
+                                        BTRFS_MAX_EXTENT_SIZE + 4095, NULL);
+        if (ret) {
+                test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 2) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 2, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /* [BTRFS_MAX_EXTENT_SIZE/2][4K HOLE][the rest] */
+        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+                               BTRFS_MAX_EXTENT_SIZE >> 1,
+                               (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+                               EXTENT_DELALLOC | EXTENT_DIRTY |
+                               EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
+                               NULL, GFP_NOFS);
+        if (ret) {
+                test_msg("clear_extent_bit returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 2) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 2, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /* [BTRFS_MAX_EXTENT_SIZE][4K] */
+        BTRFS_I(inode)->outstanding_extents++;
+        ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1,
+                                        (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
+                                        NULL);
+        if (ret) {
+                test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 2) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 2, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /*
+         * [BTRFS_MAX_EXTENT_SIZE+4K][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4K]
+         *
+         * I'm artificially adding 2 to outstanding_extents because in the
+         * buffered IO case we'd add things up as we go, but I don't feel like
+         * doing that here, this isn't the interesting case we want to test.
+         */
+        BTRFS_I(inode)->outstanding_extents += 2;
+        ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 8192,
+                                        (BTRFS_MAX_EXTENT_SIZE << 1) + 12287,
+                                        NULL);
+        if (ret) {
+                test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 4) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 4, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /* [BTRFS_MAX_EXTENT_SIZE+4k][4k][BTRFS_MAX_EXTENT_SIZE+4k] */
+        BTRFS_I(inode)->outstanding_extents++;
+        ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+                                        BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+        if (ret) {
+                test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 3) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 3, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */
+        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree,
+                               BTRFS_MAX_EXTENT_SIZE+4096,
+                               BTRFS_MAX_EXTENT_SIZE+8191,
+                               EXTENT_DIRTY | EXTENT_DELALLOC |
+                               EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+                               NULL, GFP_NOFS);
+        if (ret) {
+                test_msg("clear_extent_bit returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 4) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 4, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /*
+         * Refill the hole again just for good measure, because I thought it
+         * might fail and I'd rather satisfy my paranoia at this point.
+         */
+        BTRFS_I(inode)->outstanding_extents++;
+        ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE+4096,
+                                        BTRFS_MAX_EXTENT_SIZE+8191, NULL);
+        if (ret) {
+                test_msg("btrfs_set_extent_delalloc returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents != 3) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 3, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        /* Empty */
+        ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+                               EXTENT_DIRTY | EXTENT_DELALLOC |
+                               EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+                               NULL, GFP_NOFS);
+        if (ret) {
+                test_msg("clear_extent_bit returned %d\n", ret);
+                goto out;
+        }
+        if (BTRFS_I(inode)->outstanding_extents) {
+                ret = -EINVAL;
+                test_msg("Miscount, wanted 0, got %u\n",
+                         BTRFS_I(inode)->outstanding_extents);
+                goto out;
+        }
+        ret = 0;
+out:
+        if (ret)
+                clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+                                 EXTENT_DIRTY | EXTENT_DELALLOC |
+                                 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
+                                 NULL, GFP_NOFS);
+        iput(inode);
+        btrfs_free_dummy_root(root);
+        return ret;
+}
 int btrfs_test_inodes(void)
 {
        int ret;
@@ -924,5 +1115,9 @@ int btrfs_test_inodes(void)
        if (ret)
                return ret;
        test_msg("Running hole first btrfs_get_extent test\n");
-        return test_hole_first();
+        ret = test_hole_first();
+        if (ret)
+                return ret;
+        test_msg("Running outstanding_extents tests\n");
+        return test_extent_accounting();
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 88e51aded6bd..8be4278e25e8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1023,17 +1023,13 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        u64 old_root_bytenr;
        u64 old_root_used;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
-        bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
        old_root_used = btrfs_root_used(&root->root_item);
-        btrfs_write_dirty_block_groups(trans, root);
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
                if (old_root_bytenr == root->node->start &&
-                    old_root_used == btrfs_root_used(&root->root_item) &&
+                    old_root_used == btrfs_root_used(&root->root_item))
-                    (!extent_root ||
-                     list_empty(&trans->transaction->dirty_bgs)))
                        break;
                btrfs_set_root_node(&root->root_item, root->node);
@@ -1044,14 +1040,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                        return ret;
                old_root_used = btrfs_root_used(&root->root_item);
-                if (extent_root) {
-                        ret = btrfs_write_dirty_block_groups(trans, root);
-                        if (ret)
-                                return ret;
-                }
-                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-                if (ret)
-                        return ret;
        }
        return 0;
@@ -1068,6 +1056,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
        struct list_head *next;
        struct extent_buffer *eb;
        int ret;
@@ -1095,11 +1084,15 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
+        ret = btrfs_setup_space_cache(trans, root);
+        if (ret)
+                return ret;
        /* run_qgroups might have added some more refs */
        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
        if (ret)
                return ret;
+again:
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
@@ -1112,8 +1105,23 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
                ret = update_cowonly_root(trans, root);
                if (ret)
                        return ret;
+                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+                if (ret)
+                        return ret;
        }
+        while (!list_empty(dirty_bgs)) {
+                ret = btrfs_write_dirty_block_groups(trans, root);
+                if (ret)
+                        return ret;
+                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+                if (ret)
+                        return ret;
+        }
+        if (!list_empty(&fs_info->dirty_cowonly_roots))
+                goto again;
        list_add_tail(&fs_info->extent_root->dirty_list,
                      &trans->transaction->switch_commits);
        btrfs_after_dev_replace_commit(fs_info);
@@ -1811,6 +1819,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                wait_for_commit(root, cur_trans);
+                if (unlikely(cur_trans->aborted))
+                        ret = cur_trans->aborted;
                btrfs_put_transaction(cur_trans);
                return ret;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ed19a7d622fa..39706c57ad3c 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -890,8 +890,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
        newpage = buf->page;
-        if (WARN_ON(!PageUptodate(newpage)))
+        if (!PageUptodate(newpage))
-                return -EIO;
+                SetPageUptodate(newpage);
        ClearPageMappedToDisk(newpage);
@@ -1353,6 +1353,17 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
        return err;
 }
+static int fuse_dev_open(struct inode *inode, struct file *file)
+{
+        /*
+         * The fuse device's file's private_data is used to hold
+         * the fuse_conn(ection) when it is mounted, and is used to
+         * keep track of whether the file has been mounted already.
+         */
+        file->private_data = NULL;
+        return 0;
+}
 static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
                              unsigned long nr_segs, loff_t pos)
 {
@@ -1797,6 +1808,9 @@ copy_finish:
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
                       unsigned int size, struct fuse_copy_state *cs)
 {
+        /* Don't try to move pages (yet) */
+        cs->move_pages = 0;
        switch (code) {
        case FUSE_NOTIFY_POLL:
                return fuse_notify_poll(fc, size, cs);
@@ -2217,6 +2231,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on)
 const struct file_operations fuse_dev_operations = {
        .owner          = THIS_MODULE,
+        .open           = fuse_dev_open,
        .llseek         = no_llseek,
        .read           = do_sync_read,
        .aio_read       = fuse_dev_read,
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index 6e560d56094b..754fdf8c6356 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -131,13 +131,16 @@ skip:
        hfs_bnode_write(node, entry, data_off + key_len, entry_len);
        hfs_bnode_dump(node);
-        if (new_node) {
+        /*
-                /* update parent key if we inserted a key
+         * update parent key if we inserted a key
-                 * at the start of the first node
+         * at the start of the node and it is not the new node
-                 */
+         */
-                if (!rec && new_node != node)
+        if (!rec && new_node != node) {
-                        hfs_brec_update_parent(fd);
+                hfs_bnode_read_key(node, fd->search_key, data_off + size);
+                hfs_brec_update_parent(fd);
+        }
+        if (new_node) {
                hfs_bnode_put(fd->bnode);
                if (!new_node->parent) {
                        hfs_btree_inc_height(tree);
@@ -168,9 +171,6 @@ skip:
                goto again;
        }
-        if (!rec)
-                hfs_brec_update_parent(fd);
        return 0;
 }
@@ -370,6 +370,8 @@ again:
        if (IS_ERR(parent))
                return PTR_ERR(parent);
        __hfs_brec_find(parent, fd, hfs_find_rec_by_key);
+        if (fd->record < 0)
+                return -ENOENT;
        hfs_bnode_dump(parent);
        rec = fd->record;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index b684e8a132e6..2bacb9988566 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -207,6 +207,7 @@ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
                goto out_free;
        }
+        of->event = atomic_read(&of->kn->attr.open->event);
        ops = kernfs_ops(of->kn);
        if (ops->read)
                len = ops->read(of, buf, len, *ppos);
diff --git a/fs/locks.c b/fs/locks.c
index f1bad681fc1c..528fedfda15e 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1728,7 +1728,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
                        break;
                }
        }
-        trace_generic_delete_lease(inode, fl);
+        trace_generic_delete_lease(inode, victim);
        if (victim)
                error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 3c1bfa155571..1028a0629543 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -587,8 +587,6 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
        rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
-        nfsd4_cb_layout_fail(ls);
        printk(KERN_WARNING
                "nfsd: client %s failed to respond to layout recall. "
                "  Fencing..\n", addr_str);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b9f99b..0c3f303baf32 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
                                             struct the_nilfs *nilfs)
 {
        struct nilfs_inode_info *ii, *n;
+        int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE);
        int defer_iput = false;
        spin_lock(&nilfs->ns_inode_lock);
@@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
                brelse(ii->i_bh);
                ii->i_bh = NULL;
                list_del_init(&ii->i_dirty);
-                if (!ii->vfs_inode.i_nlink) {
+                if (!ii->vfs_inode.i_nlink || during_mount) {
                        /*
-                         * Defer calling iput() to avoid a deadlock
+                         * Defer calling iput() to avoid deadlocks if
-                         * over I_SYNC flag for inodes with i_nlink == 0
+                         * i_nlink == 0 or mount is not yet finished.
                         */
                        list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
                        defer_iput = true;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9a66ff79ff27..d2f97ecca6a5 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -143,7 +143,8 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
            !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
                return false;
-        if (event_mask & marks_mask & ~marks_ignored_mask)
+        if (event_mask & FAN_ALL_OUTGOING_EVENTS & marks_mask &
+                                 ~marks_ignored_mask)
                return true;
        return false;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 8490c64d34fe..460c6c37e683 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -502,7 +502,7 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
 static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
 {
-        if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+        if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
                return 1;
        return 0;
 }
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 20e37a3ed26f..db64ce2d4667 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -102,11 +102,11 @@
                                         | OCFS2_FEATURE_INCOMPAT_INDEXED_DIRS \
                                         | OCFS2_FEATURE_INCOMPAT_REFCOUNT_TREE \
                                         | OCFS2_FEATURE_INCOMPAT_DISCONTIG_BG  \
-                                         | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
+                                         | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO \
+                                         | OCFS2_FEATURE_INCOMPAT_APPEND_DIO)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP    (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
                                         | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
-                                         | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
+                                         | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
-                                         | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
 /*
 * Heartbeat-only devices are missing journals and other files.  The
@@ -179,6 +179,11 @@
 #define OCFS2_FEATURE_INCOMPAT_CLUSTERINFO      0x4000
 /*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_INCOMPAT_APPEND_DIO       0x8000
+/*
 * backup superblock flag is used to indicate that this volume
 * has backup superblocks.
 */
@@ -200,10 +205,6 @@
 #define OCFS2_FEATURE_RO_COMPAT_USRQUOTA        0x0002
 #define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA        0x0004
-/*
- * Append Direct IO support
- */
-#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO      0x0008
 /* The byte offset of the first backup block will be 1G.
 * The following will be 4G, 16G, 64G, 256G and 1T.
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index b90952f528b1..5f0d1993e6e3 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -529,8 +529,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data)
 {
        struct ovl_fs *ufs = sb->s_fs_info;
-        if (!(*flags & MS_RDONLY) &&
+        if (!(*flags & MS_RDONLY) && !ufs->upper_mnt)
-            (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY)))
                return -EROFS;
        return 0;
@@ -615,9 +614,19 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
                        break;
                default:
+                        pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
                        return -EINVAL;
                }
        }
+        /* Workdir is useless in non-upper mount */
+        if (!config->upperdir && config->workdir) {
+                pr_info("overlayfs: option \"workdir=%s\" is useless in a non-upper mount, ignore\n",
+                        config->workdir);
+                kfree(config->workdir);
+                config->workdir = NULL;
+        }
        return 0;
 }
@@ -837,7 +846,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_stack_depth = 0;
        if (ufs->config.upperdir) {
-                /* FIXME: workdir is not needed for a R/O mount */
                if (!ufs->config.workdir) {
                        pr_err("overlayfs: missing 'workdir'\n");
                        goto out_free_config;
@@ -847,6 +855,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
                if (err)
                        goto out_free_config;
+                /* Upper fs should not be r/o */
+                if (upperpath.mnt->mnt_sb->s_flags & MS_RDONLY) {
+                        pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n");
+                        err = -EINVAL;
+                        goto out_put_upperpath;
+                }
                err = ovl_mount_dir(ufs->config.workdir, &workpath);
                if (err)
                        goto out_put_upperpath;
@@ -869,8 +884,14 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
        err = -EINVAL;
        stacklen = ovl_split_lowerdirs(lowertmp);
-        if (stacklen > OVL_MAX_STACK)
+        if (stacklen > OVL_MAX_STACK) {
+                pr_err("overlayfs: too many lower directries, limit is %d\n",
+                       OVL_MAX_STACK);
                goto out_free_lowertmp;
+        } else if (!ufs->config.upperdir && stacklen == 1) {
+                pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n");
+                goto out_free_lowertmp;
+        }
        stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
        if (!stack)
@@ -932,8 +953,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
                ufs->numlower++;
        }
-        /* If the upper fs is r/o or nonexistent, we mark overlayfs r/o too */
+        /* If the upper fs is nonexistent, we mark overlayfs r/o too */
-        if (!ufs->upper_mnt || (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY))
+        if (!ufs->upper_mnt)
                sb->s_flags |= MS_RDONLY;
        sb->s_d_op = &ovl_dentry_operations;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 956b75d61809..6dee68d013ff 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1325,6 +1325,9 @@ out:
 static int pagemap_open(struct inode *inode, struct file *file)
 {
+        /* do not disclose physical addresses: attack vector */
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
        pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
                        "to stop being page-shift some time soon. See the "
                        "linux/Documentation/vm/pagemap.txt for details.\n");
author	Trond Myklebust <trond.myklebust@primarydata.com>	2015-04-23 15:16:37 -0400
committer	Trond Myklebust <trond.myklebust@primarydata.com>	2015-04-23 15:16:37 -0400
commit	f139b6c676c7e49b66016b28bf3f8ec5c54be891 (patch)
tree	742f00e431dded1daf642b44f4c199b318f255dc /fs
parent	21330b667070fd64b2340d8d31c1b0800df78ec8 (diff)
parent	d654788e98f74f2df8dfc6079fa314938f739486 (diff)