59 files changed, 820 insertions, 431 deletions
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f1313..e92f229e3c6e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
        if (IS_ERR(bprm.file))
                return res;
+        bprm.cred = prepare_exec_creds();
+        res = -ENOMEM;
+        if (!bprm.cred)
+                goto out;
        res = prepare_binprm(&bprm);
        if (res <= (unsigned long)-4096)
                res = load_flat_file(&bprm, libs, id, NULL);
-        if (bprm.file) {
-                allow_write_access(bprm.file);
+        abort_creds(bprm.cred);
-                fput(bprm.file);
-                bprm.file = NULL;
+out:
-        }
+        allow_write_access(bprm.file);
+        fput(bprm.file);
        return(res);
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index dc84daee6bc4..72a2b9c28e9f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -265,10 +265,6 @@ static int caching_kthread(void *data)
        atomic_inc(&block_group->space_info->caching_threads);
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
-again:
-        /* need to make sure the commit_root doesn't disappear */
-        down_read(&fs_info->extent_commit_sem);
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@ -282,6 +278,10 @@ again:
        key.objectid = last;
        key.offset = 0;
        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+again:
+        /* need to make sure the commit_root doesn't disappear */
+        down_read(&fs_info->extent_commit_sem);
        ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto err;
@@ -304,6 +304,19 @@ again:
                        if (need_resched() ||
                            btrfs_transaction_in_commit(fs_info)) {
+                                leaf = path->nodes[0];
+                                /* this shouldn't happen, but if the
+                                 * leaf is empty just move on.
+                                 */
+                                if (btrfs_header_nritems(leaf) == 0)
+                                        break;
+                                /*
+                                 * we need to copy the key out so that
+                                 * we are sure the next search advances
+                                 * us forward in the btree.
+                                 */
+                                btrfs_item_key_to_cpu(leaf, &key, 0);
                                btrfs_release_path(fs_info->extent_root, path);
                                up_read(&fs_info->extent_commit_sem);
                                schedule_timeout(1);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index af99b78b288e..5edcee3a617f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -414,11 +414,29 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
                              u64 *offset, u64 *bytes)
 {
        u64 end;
+        u64 search_start, search_bytes;
+        int ret;
 again:
        end = bitmap_info->offset +
                (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
+        /*
+         * XXX - this can go away after a few releases.
+         *
+         * since the only user of btrfs_remove_free_space is the tree logging
+         * stuff, and the only way to test that is under crash conditions, we
+         * want to have this debug stuff here just in case something's not
+         * working.  Search the bitmap for the space we are trying to use to
+         * make sure it's actually there.  If it's not there then we need to
+         * stop because something has gone wrong.
+         */
+        search_start = *offset;
+        search_bytes = *bytes;
+        ret = search_bitmap(block_group, bitmap_info, &search_start,
+                            &search_bytes);
+        BUG_ON(ret < 0 || search_start != *offset);
        if (*offset > bitmap_info->offset && *offset + *bytes > end) {
                bitmap_clear_bits(block_group, bitmap_info, *offset,
                                  end - *offset + 1);
@@ -430,6 +448,7 @@ again:
        }
        if (*bytes) {
+                struct rb_node *next = rb_next(&bitmap_info->offset_index);
                if (!bitmap_info->bytes) {
                        unlink_free_space(block_group, bitmap_info);
                        kfree(bitmap_info->bitmap);
@@ -438,16 +457,36 @@ again:
                        recalculate_thresholds(block_group);
                }
-                bitmap_info = tree_search_offset(block_group,
+                /*
-                                                 offset_to_bitmap(block_group,
+                 * no entry after this bitmap, but we still have bytes to
-                                                                  *offset),
+                 * remove, so something has gone wrong.
-                                                 1, 0);
+                 */
-                if (!bitmap_info)
+                if (!next)
                        return -EINVAL;
+                bitmap_info = rb_entry(next, struct btrfs_free_space,
+                                       offset_index);
+                /*
+                 * if the next entry isn't a bitmap we need to return to let the
+                 * extent stuff do its work.
+                 */
                if (!bitmap_info->bitmap)
                        return -EAGAIN;
+                /*
+                 * Ok the next item is a bitmap, but it may not actually hold
+                 * the information for the rest of this free space stuff, so
+                 * look for it, and if we don't find it return so we can try
+                 * everything over again.
+                 */
+                search_start = *offset;
+                search_bytes = *bytes;
+                ret = search_bitmap(block_group, bitmap_info, &search_start,
+                                    &search_bytes);
+                if (ret < 0 || search_start != *offset)
+                        return -EAGAIN;
                goto again;
        } else if (!bitmap_info->bytes) {
                unlink_free_space(block_group, bitmap_info);
@@ -644,8 +683,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 again:
        info = tree_search_offset(block_group, offset, 0, 0);
        if (!info) {
-                WARN_ON(1);
+                /*
-                goto out_lock;
+                 * oops didn't find an extent that matched the space we wanted
+                 * to remove, look for a bitmap instead
+                 */
+                info = tree_search_offset(block_group,
+                                          offset_to_bitmap(block_group, offset),
+                                          1, 0);
+                if (!info) {
+                        WARN_ON(1);
+                        goto out_lock;
+                }
        }
        if (info->bytes < bytes && rb_next(&info->offset_index)) {
@@ -957,8 +1005,15 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
        if (cluster->block_group != block_group)
                goto out;
-        entry = tree_search_offset(block_group, search_start, 0, 0);
+        /*
+         * search_start is the beginning of the bitmap, but at some point it may
+         * be a good idea to point to the actual start of the free area in the
+         * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
+         * to 1 to make sure we get the bitmap entry
+         */
+        entry = tree_search_offset(block_group,
+                                   offset_to_bitmap(block_group, search_start),
+                                   1, 0);
        if (!entry || !entry->bitmap)
                goto out;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 56fe83fa60c4..272b9b2bea86 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4785,8 +4785,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         * and the replacement file is large.  Start IO on it now so
         * we don't add too much work to the end of the transaction
         */
-        if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+        if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
-            new_inode->i_size &&
            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                filemap_flush(old_inode->i_mapping);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e71264d1c2c9..c04f7f212602 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2553,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
        last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
        /* make sure the dirty trick played by the caller work */
-        ret = invalidate_inode_pages2_range(inode->i_mapping,
+        while (1) {
-                                            first_index, last_index);
+                ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                                    first_index, last_index);
+                if (ret != -EBUSY)
+                        break;
+                schedule_timeout(HZ/10);
+        }
        if (ret)
                goto out_unlock;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index ecfbce836d32..3e2b90eaa239 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
        *total_in = 0;
        workspace = find_zlib_workspace();
-        if (!workspace)
+        if (IS_ERR(workspace))
                return -1;
        if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
        char *kaddr;
        workspace = find_zlib_workspace();
-        if (!workspace)
+        if (IS_ERR(workspace))
                return -ENOMEM;
        data_in = kmap(pages_in[page_in_index]);
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
                return -ENOMEM;
        workspace = find_zlib_workspace();
-        if (!workspace)
+        if (IS_ERR(workspace))
                return -ENOMEM;
        workspace->inf_strm.next_in = data_in;
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 92888aa90749..e85b1e4389e0 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,10 @@
+Version 1.60
+-------------
+Fix memory leak in reconnect.  Fix oops in DFS mount error path.
+Set s_maxbytes to a smaller value (the max that vfs can handle) so that
+sendfile will now work over cifs mounts again.  Add noforcegid
+and noforceuid mount parameters.
 Version 1.59
 ------------
 Client uses server inode numbers (which are persistent) rather than
diff --git a/fs/cifs/README b/fs/cifs/README
index ad92921dbde4..79c1a93400be 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,11 +262,11 @@ A partial list of the supported mount options follows:
                mount.  
  domain        Set the SMB/CIFS workgroup name prepended to the
                username during CIFS session establishment
-  forceuid      Set the default uid for inodes based on the uid
+  forceuid      Set the default uid for inodes to the uid
-                passed in. For mounts to servers
+                passed in on mount. For mounts to servers
                which do support the CIFS Unix extensions, such as a
                properly configured Samba server, the server provides
-                the uid, gid and mode so this parameter should  not be
+                the uid, gid and mode so this parameter should not be
                specified unless the server and clients uid and gid
                numbering differ.  If the server and client are in the
                same domain (e.g. running winbind or nss_ldap) and
@@ -278,11 +278,7 @@ A partial list of the supported mount options follows:
                of existing files will be the uid (gid) of the person
                who executed the mount (root, except when mount.cifs
                is configured setuid for user mounts) unless the "uid=" 
-                (gid) mount option is specified.  For the uid (gid) of newly
+                (gid) mount option is specified. Also note that permission
-                created files and directories, ie files created since 
-                the last mount of the server share, the expected uid 
-                (gid) is cached as long as the inode remains in 
-                memory on the client.   Also note that permission
                checks (authorization checks) on accesses to a file occur
                at the server, but there are cases in which an administrator
                may want to restrict at the client as well.  For those
@@ -290,12 +286,15 @@ A partial list of the supported mount options follows:
                (such as Windows), permissions can also be checked at the
                client, and a crude form of client side permission checking 
                can be enabled by specifying file_mode and dir_mode on 
-                the client.  Note that the mount.cifs helper must be
+                the client.  (default)
-                at version 1.10 or higher to support specifying the uid
+  forcegid      (similar to above but for the groupid instead of uid) (default)
-                (or gid) in non-numeric form.
+  noforceuid    Fill in file owner information (uid) by requesting it from
-  forcegid      (similar to above but for the groupid instead of uid)
+                the server if possible. With this option, the value given in
+                the uid= option (on mount) will only be used if the server
+                can not support returning uids on inodes.
+  noforcegid    (similar to above but for the group owner, gid, instead of uid)
  uid           Set the default uid for inodes, and indicate to the
-                cifs kernel driver which local user mounted . If the server
+                cifs kernel driver which local user mounted. If the server
                supports the unix extensions the default uid is
                not used to fill in the owner fields of inodes (files)
                unless the "forceuid" parameter is specified.
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 3bb11be8b6a8..606912d8f2a8 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void)
 * i.e. strips from UNC trailing path that is not part of share
 * name and fixup missing '\' in the begining of DFS node refferal
 * if neccessary.
- * Returns pointer to share name on success or NULL on error.
+ * Returns pointer to share name on success or ERR_PTR on error.
 * Caller is responsible for freeing returned string.
 */
 static char *cifs_get_share_name(const char *node_name)
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name)
        UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */,
                         GFP_KERNEL);
        if (!UNC)
-                return NULL;
+                return ERR_PTR(-ENOMEM);
        /* get share name and server name */
        if (node_name[1] != '\\') {
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name)
                cERROR(1, ("%s: no server name end in node name: %s",
                        __func__, node_name));
                kfree(UNC);
-                return NULL;
+                return ERR_PTR(-EINVAL);
        }
        /* find sharename end */
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
                return ERR_PTR(-EINVAL);
        *devname = cifs_get_share_name(ref->node_name);
+        if (IS_ERR(*devname)) {
+                rc = PTR_ERR(*devname);
+                *devname = NULL;
+                goto compose_mount_options_err;
+        }
        rc = dns_resolve_server_name_to_ip(*devname, &srvIP);
        if (rc != 0) {
                cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d",
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 60e3c4253de0..714a542cbafc 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes,
        int maxwords = maxbytes / 2;
        char tmp[NLS_MAX_CHARSET_SIZE];
-        for (i = 0; from[i] && i < maxwords; i++) {
+        for (i = 0; i < maxwords && from[i]; i++) {
                charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp,
                                             NLS_MAX_CHARSET_SIZE);
                if (charlen > 0)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 44f30504b82d..84b75253b05a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -376,10 +376,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
        seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
                seq_printf(s, ",forceuid");
+        else
+                seq_printf(s, ",noforceuid");
        seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
                seq_printf(s, ",forcegid");
+        else
+                seq_printf(s, ",noforcegid");
        cifs_show_address(s, tcon->ses->server);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index fc44d316d0bb..1f3345d7fa79 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname,
        char *data;
        unsigned int  temp_len, i, j;
        char separator[2];
+        short int override_uid = -1;
+        short int override_gid = -1;
+        bool uid_specified = false;
+        bool gid_specified = false;
        separator[0] = ',';
        separator[1] = 0;
@@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname,
                                                    "too long.\n");
                                return 1;
                        }
-                } else if (strnicmp(data, "uid", 3) == 0) {
+                } else if (!strnicmp(data, "uid", 3) && value && *value) {
-                        if (value && *value)
+                        vol->linux_uid = simple_strtoul(value, &value, 0);
-                                vol->linux_uid =
+                        uid_specified = true;
-                                        simple_strtoul(value, &value, 0);
+                } else if (!strnicmp(data, "forceuid", 8)) {
-                } else if (strnicmp(data, "forceuid", 8) == 0) {
+                        override_uid = 1;
-                                vol->override_uid = 1;
+                } else if (!strnicmp(data, "noforceuid", 10)) {
-                } else if (strnicmp(data, "gid", 3) == 0) {
+                        override_uid = 0;
-                        if (value && *value)
+                } else if (!strnicmp(data, "gid", 3) && value && *value) {
-                                vol->linux_gid =
+                        vol->linux_gid = simple_strtoul(value, &value, 0);
-                                        simple_strtoul(value, &value, 0);
+                        gid_specified = true;
-                } else if (strnicmp(data, "forcegid", 8) == 0) {
+                } else if (!strnicmp(data, "forcegid", 8)) {
-                                vol->override_gid = 1;
+                        override_gid = 1;
+                } else if (!strnicmp(data, "noforcegid", 10)) {
+                        override_gid = 0;
                } else if (strnicmp(data, "file_mode", 4) == 0) {
                        if (value && *value) {
                                vol->file_mode =
@@ -1355,6 +1361,18 @@ cifs_parse_mount_options(char *options, const char *devname,
        if (vol->UNCip == NULL)
                vol->UNCip = &vol->UNC[2];
+        if (uid_specified)
+                vol->override_uid = override_uid;
+        else if (override_uid == 1)
+                printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
+                                   "specified with no uid= option.\n");
+        if (gid_specified)
+                vol->override_gid = override_gid;
+        else if (override_gid == 1)
+                printk(KERN_NOTICE "CIFS: ignoring forcegid mount option "
+                                   "specified with no gid= option.\n");
        return 0;
 }
@@ -2544,11 +2562,20 @@ remote_path_check:
                        if (mount_data != mount_data_global)
                                kfree(mount_data);
                        mount_data = cifs_compose_mount_options(
                                        cifs_sb->mountdata, full_path + 1,
                                        referrals, &fake_devname);
-                        kfree(fake_devname);
                        free_dfs_info_array(referrals, num_referrals);
+                        kfree(fake_devname);
+                        kfree(full_path);
+                        if (IS_ERR(mount_data)) {
+                                rc = PTR_ERR(mount_data);
+                                mount_data = NULL;
+                                goto mount_fail_check;
+                        }
                        if (tcon)
                                cifs_put_tcon(tcon);
@@ -2556,8 +2583,6 @@ remote_path_check:
                                cifs_put_smb_ses(pSesInfo);
                        cleanup_volume_info(&volume_info);
-                        FreeXid(xid);
-                        kfree(full_path);
                        referral_walks_count++;
                        goto try_mount_again;
                }
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index f28f070a60fc..f91fd51b32e3 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1905,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX)
 COMPATIBLE_IOCTL(FIOASYNC)
 COMPATIBLE_IOCTL(FIONBIO)
 COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
+COMPATIBLE_IOCTL(FS_IOC_FIEMAP)
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 23419dc3027b..a7cbfbd340c7 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -386,16 +386,16 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
-GDLM_ATTR(proto_name,     0444, proto_name_show,        NULL);
+GDLM_ATTR(proto_name,           0444, proto_name_show,          NULL);
-GDLM_ATTR(block,          0644, block_show,             block_store);
+GDLM_ATTR(block,                0644, block_show,               block_store);
-GDLM_ATTR(withdraw,       0644, withdraw_show,          withdraw_store);
+GDLM_ATTR(withdraw,             0644, withdraw_show,            withdraw_store);
-GDLM_ATTR(id,             0444, lkid_show,              NULL);
+GDLM_ATTR(id,                   0444, lkid_show,                NULL);
-GDLM_ATTR(jid,            0444, jid_show,               NULL);
+GDLM_ATTR(jid,                  0444, jid_show,                 NULL);
-GDLM_ATTR(first,          0444, lkfirst_show,           NULL);
+GDLM_ATTR(first,                0444, lkfirst_show,             NULL);
-GDLM_ATTR(first_done,     0444, first_done_show,        NULL);
+GDLM_ATTR(first_done,           0444, first_done_show,          NULL);
-GDLM_ATTR(recover,        0200, NULL,                   recover_store);
+GDLM_ATTR(recover,              0600, NULL,                     recover_store);
-GDLM_ATTR(recover_done,   0444, recover_done_show,      NULL);
+GDLM_ATTR(recover_done,         0444, recover_done_show,        NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show,    NULL);
+GDLM_ATTR(recover_status,       0444, recover_status_show,      NULL);
 static struct attribute *lock_module_attrs[] = {
        &gdlm_attr_proto_name.attr,
diff --git a/fs/inode.c b/fs/inode.c
index 901bad1e5f12..ae7b67e48661 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode)
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 */
-struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
+int inode_init_always(struct super_block *sb, struct inode *inode)
 {
        static const struct address_space_operations empty_aops;
        static struct inode_operations empty_iops;
        static const struct file_operations empty_fops;
        struct address_space *const mapping = &inode->i_data;
        inode->i_sb = sb;
@@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        inode->dirtied_when = 0;
        if (security_inode_alloc(inode))
-                goto out_free_inode;
+                goto out;
        /* allocate and initialize an i_integrity */
        if (ima_inode_alloc(inode))
@@ -198,16 +197,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
        inode->i_fsnotify_mask = 0;
 #endif
-        return inode;
+        return 0;
 out_free_security:
        security_inode_free(inode);
-out_free_inode:
+out:
-        if (inode->i_sb->s_op->destroy_inode)
+        return -ENOMEM;
-                inode->i_sb->s_op->destroy_inode(inode);
-        else
-                kmem_cache_free(inode_cachep, (inode));
-        return NULL;
 }
 EXPORT_SYMBOL(inode_init_always);
@@ -220,12 +215,21 @@ static struct inode *alloc_inode(struct super_block *sb)
        else
                inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
-        if (inode)
+        if (!inode)
-                return inode_init_always(sb, inode);
+                return NULL;
-        return NULL;
+        if (unlikely(inode_init_always(sb, inode))) {
+                if (inode->i_sb->s_op->destroy_inode)
+                        inode->i_sb->s_op->destroy_inode(inode);
+                else
+                        kmem_cache_free(inode_cachep, inode);
+                return NULL;
+        }
+        return inode;
 }
-void destroy_inode(struct inode *inode)
+void __destroy_inode(struct inode *inode)
 {
        BUG_ON(inode_has_buffers(inode));
        ima_inode_free(inode);
@@ -237,13 +241,17 @@ void destroy_inode(struct inode *inode)
        if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
                posix_acl_release(inode->i_default_acl);
 #endif
+}
+EXPORT_SYMBOL(__destroy_inode);
+void destroy_inode(struct inode *inode)
+{
+        __destroy_inode(inode);
        if (inode->i_sb->s_op->destroy_inode)
                inode->i_sb->s_op->destroy_inode(inode);
        else
                kmem_cache_free(inode_cachep, (inode));
 }
-EXPORT_SYMBOL(destroy_inode);
 /*
 * These are initializations that only need to be done
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5edc2bf20581..23c947539864 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg)
        kunmap(pg);
        D2(printk(KERN_DEBUG "readpage finished\n"));
-        return 0;
+        return ret;
 }
 int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg)
diff --git a/fs/namespace.c b/fs/namespace.c
index 277c28a63ead..7230787d18b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -316,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
 */
 int mnt_want_write_file(struct file *file)
 {
-        if (!(file->f_mode & FMODE_WRITE))
+        struct inode *inode = file->f_dentry->d_inode;
+        if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
                return mnt_want_write(file->f_path.mnt);
        else
                return mnt_clone_write(file->f_path.mnt);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 489fc01a3204..e4e089a8f294 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata)
        if (put_dreq(dreq))
                nfs_direct_complete(dreq);
-        nfs_readdata_release(calldata);
+        nfs_readdata_free(data);
 }
 static const struct rpc_call_ops nfs_read_direct_ops = {
@@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
                                        data->npages, 1, 0, data->pagevec, NULL);
                up_read(&current->mm->mmap_sem);
                if (result < 0) {
-                        nfs_readdata_release(data);
+                        nfs_readdata_free(data);
                        break;
                }
                if ((unsigned)result < data->npages) {
                        bytes = result * PAGE_SIZE;
                        if (bytes <= pgbase) {
                                nfs_direct_release_pages(data->pagevec, result);
-                                nfs_readdata_release(data);
+                                nfs_readdata_free(data);
                                break;
                        }
                        bytes -= pgbase;
@@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq,
                data->inode = inode;
                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
-                data->args.context = get_nfs_open_context(ctx);
+                data->args.context = ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
@@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
                struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages);
                list_del(&data->pages);
                nfs_direct_release_pages(data->pagevec, data->npages);
-                nfs_writedata_release(data);
+                nfs_writedata_free(data);
        }
 }
@@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata)
        dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
        nfs_direct_write_complete(dreq, data->inode);
-        nfs_commitdata_release(calldata);
+        nfs_commit_free(data);
 }
 static const struct rpc_call_ops nfs_commit_direct_ops = {
@@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
        data->args.fh = NFS_FH(data->inode);
        data->args.offset = 0;
        data->args.count = 0;
-        data->args.context = get_nfs_open_context(dreq->ctx);
+        data->args.context = dreq->ctx;
        data->res.count = 0;
        data->res.fattr = &data->fattr;
        data->res.verf = &data->verf;
@@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
                                        data->npages, 0, 0, data->pagevec, NULL);
                up_read(&current->mm->mmap_sem);
                if (result < 0) {
-                        nfs_writedata_release(data);
+                        nfs_writedata_free(data);
                        break;
                }
                if ((unsigned)result < data->npages) {
                        bytes = result * PAGE_SIZE;
                        if (bytes <= pgbase) {
                                nfs_direct_release_pages(data->pagevec, result);
-                                nfs_writedata_release(data);
+                                nfs_writedata_free(data);
                                break;
                        }
                        bytes -= pgbase;
@@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
                data->inode = inode;
                data->cred = msg.rpc_cred;
                data->args.fh = NFS_FH(inode);
-                data->args.context = get_nfs_open_context(ctx);
+                data->args.context = ctx;
                data->args.offset = pos;
                data->args.pgbase = pgbase;
                data->args.pages = data->pagevec;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 73ea5e8d66ce..12c9e66d3f1d 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
        return p;
 }
-static void nfs_readdata_free(struct nfs_read_data *p)
+void nfs_readdata_free(struct nfs_read_data *p)
 {
        if (p && (p->pagevec != &p->page_array[0]))
                kfree(p->pagevec);
        mempool_free(p, nfs_rdata_mempool);
 }
-void nfs_readdata_release(void *data)
+static void nfs_readdata_release(struct nfs_read_data *rdata)
 {
-        struct nfs_read_data *rdata = data;
        put_nfs_open_context(rdata->args.context);
        nfs_readdata_free(rdata);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 0a0a2ff767c3..a34fae21fe10 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -87,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
        return p;
 }
-static void nfs_writedata_free(struct nfs_write_data *p)
+void nfs_writedata_free(struct nfs_write_data *p)
 {
        if (p && (p->pagevec != &p->page_array[0]))
                kfree(p->pagevec);
        mempool_free(p, nfs_wdata_mempool);
 }
-void nfs_writedata_release(void *data)
+static void nfs_writedata_release(struct nfs_write_data *wdata)
 {
-        struct nfs_write_data *wdata = data;
        put_nfs_open_context(wdata->args.context);
        nfs_writedata_free(wdata);
 }
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 3d3ddb3f5177..2dfd47714ae5 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -412,8 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
                return 0; /* Do not request flush for shadow page cache */
        if (!sb) {
                writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
-                if (!writer)
+                if (!writer) {
+                        nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
                        return -EROFS;
+                }
                sb = writer->s_super;
        }
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 8b5e4778cf28..51ff3d0a4ee2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1859,12 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err)
        if (!page)
                return;
-        if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page))
+        if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
                /*
                 * For b-tree node pages, this function may be called twice
                 * or more because they might be split in a segment.
                 */
+                if (PageDirty(page)) {
+                        /*
+                         * For pages holding split b-tree node buffers, the
+                         * dirty flags on the buffers may be cleared separately.
+                         * In that case the page is redirtied on behalf of the
+                         * remaining buffers, and that dirty state must be
+                         * cancelled once all the buffers have been cleaned.
+                         */
+                        lock_page(page);
+                        if (nilfs_page_buffers_clean(page))
+                                __nilfs_clear_page_dirty(page);
+                        unlock_page(page);
+                }
                return;
+        }
        __nilfs_end_page_io(page, err);
 }
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 47cd258fd24d..5dcbafe72d71 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
        event_priv->wd = wd;
        ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
-        /* EEXIST is not an error */
+        if (ret) {
-        if (ret == -EEXIST)
-                ret = 0;
-        /* did event_priv get attached? */
-        if (list_empty(&fsn_event_priv->event_list))
                inotify_free_event_priv(fsn_event_priv);
+                /* EEXIST says we tail matched, EOVERFLOW isn't something
+                 * to report up the stack. */
+                if ((ret == -EEXIST) ||
+                    (ret == -EOVERFLOW))
+                        ret = 0;
+        }
        /*
         * If we hold the entry until after the event is on the queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f30d9bbc2e1b..dc32ed8323ba 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -386,6 +386,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
        struct fsnotify_event *ignored_event;
        struct inotify_event_private_data *event_priv;
        struct fsnotify_event_private_data *fsn_event_priv;
+        int ret;
        ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
                                              FSNOTIFY_EVENT_NONE, NULL, 0,
@@ -404,10 +405,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
        fsn_event_priv->group = group;
        event_priv->wd = ientry->wd;
-        fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
+        ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
+        if (ret)
-        /* did the private data get added? */
-        if (list_empty(&fsn_event_priv->event_list))
                inotify_free_event_priv(fsn_event_priv);
 skip_send_ignore:
@@ -568,7 +567,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
        spin_lock_init(&group->inotify_data.idr_lock);
        idr_init(&group->inotify_data.idr);
-        group->inotify_data.last_wd = 0;
+        group->inotify_data.last_wd = 1;
        group->inotify_data.user = user;
        group->inotify_data.fa = NULL;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 521368574e97..3816d5750dd5 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -153,6 +153,10 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
                                return true;
                        break;
                case (FSNOTIFY_EVENT_NONE):
+                        if (old->mask & FS_Q_OVERFLOW)
+                                return true;
+                        else if (old->mask & FS_IN_IGNORED)
+                                return false;
                        return false;
                };
        }
@@ -171,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
        struct list_head *list = &group->notification_list;
        struct fsnotify_event_holder *last_holder;
        struct fsnotify_event *last_event;
+        int ret = 0;
-        /* easy to tell if priv was attached to the event */
-        INIT_LIST_HEAD(&priv->event_list);
        /*
         * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -194,6 +196,7 @@ alloc_holder:
        if (group->q_len >= group->max_events) {
                event = &q_overflow_event;
+                ret = -EOVERFLOW;
                /* sorry, no private data on the overflow event */
                priv = NULL;
        }
@@ -235,7 +238,7 @@ alloc_holder:
        mutex_unlock(&group->notification_mutex);
        wake_up(&group->notification_waitq);
-        return 0;
+        return ret;
 }
 /*
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9edcde4974aa..f9a3e8942669 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
         * immediately to their right.
         */
        left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
-        if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+        if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
+                BUG_ON(right_child_el->l_tree_depth);
                BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
                left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
        }
@@ -2476,15 +2477,37 @@ out_ret_path:
        return ret;
 }
-static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
-                                      struct ocfs2_path *path)
+                                     int subtree_index, struct ocfs2_path *path)
 {
-        int i, idx;
+        int i, idx, ret;
        struct ocfs2_extent_rec *rec;
        struct ocfs2_extent_list *el;
        struct ocfs2_extent_block *eb;
        u32 range;
+        /*
+         * In the normal tree rotation process, we never touch the tree
+         * branch above subtree_index, and ocfs2_extend_rotate_transaction
+         * doesn't reserve credits for it either.
+         *
+         * But we do have a special case here which updates the rightmost
+         * records for all the bh in the path, so we have to reserve extra
+         * credits and get journal access to them.
+         */
+        ret = ocfs2_extend_trans(handle,
+                                 handle->h_buffer_credits + subtree_index);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_journal_access_path(inode, handle, path);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
        /* Path should always be rightmost. */
        eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
        BUG_ON(eb->h_next_leaf_blk != 0ULL);
@@ -2505,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
                ocfs2_journal_dirty(handle, path->p_node[i].bh);
        }
+out:
+        return ret;
 }
 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
@@ -2717,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
        if (del_right_subtree) {
                ocfs2_unlink_subtree(inode, handle, left_path, right_path,
                                     subtree_index, dealloc);
-                ocfs2_update_edge_lengths(inode, handle, left_path);
+                ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+                                                left_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
@@ -3034,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
                ocfs2_unlink_subtree(inode, handle, left_path, path,
                                     subtree_index, dealloc);
-                ocfs2_update_edge_lengths(inode, handle, left_path);
+                ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
+                                                left_path);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
                eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
                ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index b2c52b3a1484..b401654011a2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
                             (unsigned long long)OCFS2_I(inode)->ip_blkno);
                        mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
                        dump_stack();
+                        goto bail;
                }
                past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
@@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc {
         */
        unsigned        c_new;
        unsigned        c_unwritten;
+        unsigned        c_needs_zero;
 };
-static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
-{
-        return d->c_new || d->c_unwritten;
-}
 struct ocfs2_write_ctxt {
        /* Logical cluster position / len of write */
        u32                             w_cpos;
        u32                             w_clen;
+        /* First cluster allocated in a nonsparse extend */
+        u32                             w_first_new_cpos;
        struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
        /*
@@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
                return -ENOMEM;
        wc->w_cpos = pos >> osb->s_clustersize_bits;
+        wc->w_first_new_cpos = UINT_MAX;
        cend = (pos + len - 1) >> osb->s_clustersize_bits;
        wc->w_clen = cend - wc->w_cpos + 1;
        get_bh(di_bh);
@@ -1217,20 +1218,18 @@ out:
 */
 static int ocfs2_write_cluster(struct address_space *mapping,
                               u32 phys, unsigned int unwritten,
+                               unsigned int should_zero,
                               struct ocfs2_alloc_context *data_ac,
                               struct ocfs2_alloc_context *meta_ac,
                               struct ocfs2_write_ctxt *wc, u32 cpos,
                               loff_t user_pos, unsigned user_len)
 {
-        int ret, i, new, should_zero = 0;
+        int ret, i, new;
        u64 v_blkno, p_blkno;
        struct inode *inode = mapping->host;
        struct ocfs2_extent_tree et;
        new = phys == 0 ? 1 : 0;
-        if (new || unwritten)
-                should_zero = 1;
        if (new) {
                u32 tmp_pos;
@@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping,
                if (tmpret) {
                        mlog_errno(tmpret);
                        if (ret == 0)
-                                tmpret = ret;
+                                ret = tmpret;
                }
        }
@@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
                        local_len = osb->s_clustersize - cluster_off;
                ret = ocfs2_write_cluster(mapping, desc->c_phys,
-                                          desc->c_unwritten, data_ac, meta_ac,
+                                          desc->c_unwritten,
+                                          desc->c_needs_zero,
+                                          data_ac, meta_ac,
                                          wc, desc->c_cpos, pos, local_len);
                if (ret) {
                        mlog_errno(ret);
@@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
                 * newly allocated cluster.
                 */
                desc = &wc->w_desc[0];
-                if (ocfs2_should_zero_cluster(desc))
+                if (desc->c_needs_zero)
                        ocfs2_figure_cluster_boundaries(osb,
                                                        desc->c_cpos,
                                                        &wc->w_target_from,
                                                        NULL);
                desc = &wc->w_desc[wc->w_clen - 1];
-                if (ocfs2_should_zero_cluster(desc))
+                if (desc->c_needs_zero)
                        ocfs2_figure_cluster_boundaries(osb,
                                                        desc->c_cpos,
                                                        NULL,
@@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode,
                        phys++;
                }
+                /*
+                 * If w_first_new_cpos is < UINT_MAX, we have a non-sparse
+                 * file that got extended.  w_first_new_cpos tells us
+                 * where the newly allocated clusters are so we can
+                 * zero them.
+                 */
+                if (desc->c_cpos >= wc->w_first_new_cpos) {
+                        BUG_ON(phys == 0);
+                        desc->c_needs_zero = 1;
+                }
                desc->c_phys = phys;
                if (phys == 0) {
                        desc->c_new = 1;
+                        desc->c_needs_zero = 1;
                        *clusters_to_alloc = *clusters_to_alloc + 1;
                }
-                if (ext_flags & OCFS2_EXT_UNWRITTEN)
+                if (ext_flags & OCFS2_EXT_UNWRITTEN) {
                        desc->c_unwritten = 1;
+                        desc->c_needs_zero = 1;
+                }
                num_clusters--;
        }
@@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
        if (newsize <= i_size_read(inode))
                return 0;
-        ret = ocfs2_extend_no_holes(inode, newsize, newsize - len);
+        ret = ocfs2_extend_no_holes(inode, newsize, pos);
        if (ret)
                mlog_errno(ret);
+        wc->w_first_new_cpos =
+                ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode));
        return ret;
 }
@@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
                             struct page **pagep, void **fsdata,
                             struct buffer_head *di_bh, struct page *mmap_page)
 {
-        int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+        int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS;
        unsigned int clusters_to_alloc, extents_to_split;
        struct ocfs2_write_ctxt *wc;
        struct inode *inode = mapping->host;
@@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
        }
-        ocfs2_set_target_boundaries(osb, wc, pos, len,
+        /*
-                                    clusters_to_alloc + extents_to_split);
+         * We have to zero sparse allocated clusters, unwritten extent clusters,
+         * and non-sparse clusters we just extended.  For non-sparse writes,
+         * we know zeros will only be needed in the first and/or last cluster.
+         */
+        if (clusters_to_alloc || extents_to_split ||
+            wc->w_desc[0].c_needs_zero ||
+            wc->w_desc[wc->w_clen - 1].c_needs_zero)
+                cluster_of_pages = 1;
+        else
+                cluster_of_pages = 0;
+        ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages);
        handle = ocfs2_start_trans(osb, credits);
        if (IS_ERR(handle)) {
@@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
         * extent.
         */
        ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-                                         clusters_to_alloc + extents_to_split,
+                                         cluster_of_pages, mmap_page);
-                                         mmap_page);
        if (ret) {
                mlog_errno(ret);
                goto out_quota;
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b574431a031d..2f28b7de2c8d 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -310,22 +310,19 @@ out_attach:
        return ret;
 }
-static DEFINE_SPINLOCK(dentry_list_lock);
+DEFINE_SPINLOCK(dentry_list_lock);
 /* We limit the number of dentry locks to drop in one go. We have
 * this limit so that we don't starve other users of ocfs2_wq. */
 #define DL_INODE_DROP_COUNT 64
 /* Drop inode references from dentry locks */
-void ocfs2_drop_dl_inodes(struct work_struct *work)
+static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
 {
-        struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-                                               dentry_lock_work);
        struct ocfs2_dentry_lock *dl;
-        int drop_count = DL_INODE_DROP_COUNT;
        spin_lock(&dentry_list_lock);
-        while (osb->dentry_lock_list && drop_count--) {
+        while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
                dl = osb->dentry_lock_list;
                osb->dentry_lock_list = dl->dl_next;
                spin_unlock(&dentry_list_lock);
@@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work)
                kfree(dl);
                spin_lock(&dentry_list_lock);
        }
-        if (osb->dentry_lock_list)
+        spin_unlock(&dentry_list_lock);
+}
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+        struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+                                               dentry_lock_work);
+        __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
+        /*
+         * Don't queue dropping if umount is in progress. We flush the
+         * list in ocfs2_dismount_volume
+         */
+        spin_lock(&dentry_list_lock);
+        if (osb->dentry_lock_list &&
+            !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
                queue_work(ocfs2_wq, &osb->dentry_lock_work);
        spin_unlock(&dentry_list_lock);
 }
+/* Drop every entry on the dentry lock list, with no per-call limit */
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
+{
+        __ocfs2_drop_dl_inodes(osb, -1);
+}
 /*
 * ocfs2_dentry_iput() and friends.
 *
@@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
        /* We leave dropping of inode reference to ocfs2_wq as that can
         * possibly lead to inode deletion which gets tricky */
        spin_lock(&dentry_list_lock);
-        if (!osb->dentry_lock_list)
+        if (!osb->dentry_lock_list &&
+            !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
                queue_work(ocfs2_wq, &osb->dentry_lock_work);
        dl->dl_next = osb->dentry_lock_list;
        osb->dentry_lock_list = dl;
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index faa12e75f98d..f5dd1789acf1 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock {
 int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
                             u64 parent_blkno);
+extern spinlock_t dentry_list_lock;
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
                           struct ocfs2_dentry_lock *dl);
 void ocfs2_drop_dl_inodes(struct work_struct *work);
+void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
                                      int skip_unhashed);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index d07ddbe4b283..81eff8e58322 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock)
                     lock->ast_pending, lock->ml.type);
                BUG();
        }
-        BUG_ON(!list_empty(&lock->ast_list));
        if (lock->ast_pending)
                mlog(0, "lock has an ast getting flushed right now\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index bcb9260c3735..43e6e3280569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
        mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n",
             dlm->name, res->lockname.len, res->lockname.name,
-             orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery",
+             orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery",
             send_to);
        /* send it */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 62442e413a00..aa501d3f93f1 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1851,6 +1851,7 @@ relock:
                if (ret)
                        goto out_dio;
+                count = ocount;
                ret = generic_write_checks(file, ppos, &count,
                                           S_ISBLK(inode->i_mode));
                if (ret)
@@ -1918,8 +1919,10 @@ out_sems:
        mutex_unlock(&inode->i_mutex);
+        if (written)
+                ret = written;
        mlog_exit(ret);
-        return written ? written : ret;
+        return ret;
 }
 static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f033760ecbea..c48b93ac6b65 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb)
        os->os_osb = osb;
        os->os_count = 0;
        os->os_seqno = 0;
-        os->os_scantime = CURRENT_TIME;
        mutex_init(&os->os_lock);
        INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work);
+}
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
+{
+        struct ocfs2_orphan_scan *os;
+        os = &osb->osb_orphan_scan;
+        os->os_scantime = CURRENT_TIME;
        if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
                atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
        else {
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 5432c7f79cc6..2c3222aec622 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
 /* Exported only for the journal struct init code in super.c. Do not call. */
 void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
+void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
 void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
@@ -329,20 +330,27 @@ int                  ocfs2_journal_dirty(handle_t *handle,
 /* extended attribute block update */
 #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+/* Update of a single quota block */
+#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1
 /* global quotafile inode update, data block */
-#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \
+                                   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
+#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS
 /*
 * The two writes below can accidentally see global info dirty due
 * to set_info() quotactl so make them prepared for the writes.
 */
 /* quota data block, global info */
 /* Write to local quota file */
-#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+                              OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 /* global quota data block, local quota data block, global quota inode,
 * global quota info */
-#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \
+                             2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS)
 static inline int ocfs2_quota_trans_credits(struct super_block *sb)
 {
@@ -355,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb)
        return credits;
 }
-/* Number of credits needed for removing quota structure from file */
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
-/* Number of credits needed for initialization of new quota structure */
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
 /* group extend. inode update and last group update. */
 #define OCFS2_GROUP_EXTEND_CREDITS      (OCFS2_INODE_UPDATE_CREDITS + 1)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c9345ebb8493..39e1d5a39505 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -224,10 +224,12 @@ enum ocfs2_mount_options
        OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
 };
-#define OCFS2_OSB_SOFT_RO       0x0001
+#define OCFS2_OSB_SOFT_RO                       0x0001
-#define OCFS2_OSB_HARD_RO       0x0002
+#define OCFS2_OSB_HARD_RO                       0x0002
-#define OCFS2_OSB_ERROR_FS      0x0004
+#define OCFS2_OSB_ERROR_FS                      0x0004
-#define OCFS2_DEFAULT_ATIME_QUANTUM     60
+#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED        0x0008
+#define OCFS2_DEFAULT_ATIME_QUANTUM             60
 struct ocfs2_journal;
 struct ocfs2_slot_info;
@@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
        spin_unlock(&osb->osb_lock);
 }
+static inline unsigned long  ocfs2_test_osb_flag(struct ocfs2_super *osb,
+                                                 unsigned long flag)
+{
+        unsigned long ret;
+        spin_lock(&osb->osb_lock);
+        ret = osb->osb_flags & flag;
+        spin_unlock(&osb->osb_lock);
+        return ret;
+}
 static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
                                     int hard)
 {
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 7365e2e08706..3fb96fcd4c81 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo {
        unsigned int dqi_chunks;        /* Number of chunks in local quota file */
        unsigned int dqi_blocks;        /* Number of blocks allocated for local quota file */
        unsigned int dqi_syncms;        /* How often should we sync with other nodes */
-        unsigned int dqi_syncjiff;      /* Precomputed dqi_syncms in jiffies */
        struct list_head dqi_chunk;     /* List of chunks */
        struct inode *dqi_gqinode;      /* Global quota file inode */
        struct ocfs2_lock_res dqi_gqlock;       /* Lock protecting quota information structure */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index edfa60cd155c..bf7742d0ee3b 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -69,6 +69,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
        d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
        d->dqb_btime = cpu_to_le64(m->dqb_btime);
        d->dqb_itime = cpu_to_le64(m->dqb_itime);
+        d->dqb_pad1 = d->dqb_pad2 = 0;
 }
 static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
@@ -211,14 +212,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
        mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
        if (gqinode->i_size < off + len) {
-                down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+                loff_t rounded_end =
-                err = ocfs2_extend_no_holes(gqinode, off + len, off);
+                                ocfs2_align_bytes_to_blocks(sb, off + len);
-                up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
-                if (err < 0)
+                /* Space is already allocated in ocfs2_global_read_dquot() */
-                        goto out;
                err = ocfs2_simple_size_update(gqinode,
                                               oinfo->dqi_gqi_bh,
-                                               off + len);
+                                               rounded_end);
                if (err < 0)
                        goto out;
                new = 1;
@@ -234,7 +234,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type,
        }
        if (err) {
                mlog_errno(err);
-                return err;
+                goto out;
        }
        lock_buffer(bh);
        if (new)
@@ -342,7 +342,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
        info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
        info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
        oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
-        oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
        oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
        oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
        oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
@@ -352,7 +351,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
        oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
        INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
        queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-                           oinfo->dqi_syncjiff);
+                           msecs_to_jiffies(oinfo->dqi_syncms));
 out_err:
        mlog_exit(status);
@@ -402,13 +401,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
        return err;
 }
+static int ocfs2_global_qinit_alloc(struct super_block *sb, int type)
+{
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+        /*
+         * Number of blocks the global quota file may have to grow by when
+         * a dquot without an on-disk entry is initialized.
+         * We may need to allocate tree blocks and a leaf block but not the
+         * root block, hence the quota tree depth is returned as-is.
+         * ocfs2_global_read_dquot() shifts this count by s_blocksize_bits
+         * to size its ocfs2_extend_no_holes() call.
+         */
+        return oinfo->dqi_gi.dqi_qtree_depth;
+}
+static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type)
+{
+        /*
+         * We modify all the allocated blocks, tree root, and info block:
+         * that is the ocfs2_global_qinit_alloc() block count plus two,
+         * each charged at OCFS2_QUOTA_BLOCK_UPDATE_CREDITS. The result is
+         * the credit count ocfs2_global_read_dquot() passes to
+         * ocfs2_start_trans().
+         */
+        return (ocfs2_global_qinit_alloc(sb, type) + 2) *
+                        OCFS2_QUOTA_BLOCK_UPDATE_CREDITS;
+}
 /* Read in information from global quota file and acquire a reference to it.
 * dquot_acquire() has already started the transaction and locked quota file */
 int ocfs2_global_read_dquot(struct dquot *dquot)
 {
        int err, err2, ex = 0;
-        struct ocfs2_mem_dqinfo *info =
+        struct super_block *sb = dquot->dq_sb;
-                        sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+        int type = dquot->dq_type;
+        struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        struct inode *gqinode = info->dqi_gqinode;
+        int need_alloc = ocfs2_global_qinit_alloc(sb, type);
+        handle_t *handle = NULL;
        err = ocfs2_qinfo_lock(info, 0);
        if (err < 0)
@@ -419,14 +441,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot)
        OCFS2_DQUOT(dquot)->dq_use_count++;
        OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
        OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+        ocfs2_qinfo_unlock(info, 0);
        if (!dquot->dq_off) {   /* No real quota entry? */
-                /* Upgrade to exclusive lock for allocation */
-                ocfs2_qinfo_unlock(info, 0);
-                err = ocfs2_qinfo_lock(info, 1);
-                if (err < 0)
-                        goto out_qlock;
                ex = 1;
+                /*
+                 * Add blocks to quota file before we start a transaction since
+                 * locking allocators ranks above a transaction start
+                 */
+                WARN_ON(journal_current_handle());
+                down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+                err = ocfs2_extend_no_holes(gqinode,
+                        gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
+                        gqinode->i_size);
+                up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+                if (err < 0)
+                        goto out;
        }
+        handle = ocfs2_start_trans(osb,
+                                   ocfs2_calc_global_qinit_credits(sb, type));
+        if (IS_ERR(handle)) {
+                err = PTR_ERR(handle);
+                goto out;
+        }
+        err = ocfs2_qinfo_lock(info, ex);
+        if (err < 0)
+                goto out_trans;
        err = qtree_write_dquot(&info->dqi_gi, dquot);
        if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
                err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
@@ -438,6 +479,9 @@ out_qlock:
                ocfs2_qinfo_unlock(info, 1);
        else
                ocfs2_qinfo_unlock(info, 0);
+out_trans:
+        if (handle)
+                ocfs2_commit_trans(osb, handle);
 out:
        if (err < 0)
                mlog_errno(err);
@@ -607,7 +651,7 @@ static void qsync_work_fn(struct work_struct *work)
        dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
        queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
-                           oinfo->dqi_syncjiff);
+                           msecs_to_jiffies(oinfo->dqi_syncms));
 }
 /*
@@ -635,20 +679,18 @@ out:
        return status;
 }
-int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
 {
-        struct ocfs2_mem_dqinfo *oinfo;
+        struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
-        int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+        /*
-                                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+         * We modify tree, leaf block, global info, local chunk header,
+         * global and local inode; OCFS2_QINFO_WRITE_CREDITS already
-        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+         * accounts for inode update
-                return 0;
+         */
+        return (oinfo->dqi_gi.dqi_qtree_depth + 2) *
-        oinfo = sb_dqinfo(sb, type)->dqi_priv;
+               OCFS2_QUOTA_BLOCK_UPDATE_CREDITS +
-        /* We modify tree, leaf block, global info, local chunk header,
+               OCFS2_QINFO_WRITE_CREDITS +
-         * global and local inode */
+               OCFS2_INODE_UPDATE_CREDITS;
-        return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
-               2 * OCFS2_INODE_UPDATE_CREDITS;
 }
 static int ocfs2_release_dquot(struct dquot *dquot)
@@ -680,33 +722,10 @@ out:
        return status;
 }
-int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
-{
-        struct ocfs2_mem_dqinfo *oinfo;
-        int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
-                                    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
-        struct ocfs2_dinode *lfe, *gfe;
-        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
-                return 0;
-        oinfo = sb_dqinfo(sb, type)->dqi_priv;
-        gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data;
-        lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data;
-        /* We can extend local file + global file. In local file we
-         * can modify info, chunk header block and dquot block. In
-         * global file we can modify info, tree and leaf block */
-        return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) +
-               ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) +
-               3 + oinfo->dqi_gi.dqi_qtree_depth + 2;
-}
 static int ocfs2_acquire_dquot(struct dquot *dquot)
 {
-        handle_t *handle;
        struct ocfs2_mem_dqinfo *oinfo =
                        sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
-        struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
        int status = 0;
        mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
@@ -715,16 +734,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
        status = ocfs2_lock_global_qf(oinfo, 1);
        if (status < 0)
                goto out;
-        handle = ocfs2_start_trans(osb,
-                ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
-        if (IS_ERR(handle)) {
-                status = PTR_ERR(handle);
-                mlog_errno(status);
-                goto out_ilock;
-        }
        status = dquot_acquire(dquot);
-        ocfs2_commit_trans(osb, handle);
-out_ilock:
        ocfs2_unlock_global_qf(oinfo, 1);
 out:
        mlog_exit(status);
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 5a460fa82553..bdb09cb6e1fe 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -20,6 +20,7 @@
 #include "sysfile.h"
 #include "dlmglue.h"
 #include "quota.h"
+#include "uptodate.h"
 /* Number of local quota structures per block */
 static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
        handle_t *handle;
        int status;
-        handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+        handle = ocfs2_start_trans(OCFS2_SB(sb),
+                                   OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
@@ -610,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
                        goto out_bh;
                /* Mark quota file as clean if we are recovering quota file of
                 * some other node. */
-                handle = ocfs2_start_trans(osb, 1);
+                handle = ocfs2_start_trans(osb,
+                                           OCFS2_LOCAL_QINFO_WRITE_CREDITS);
                if (IS_ERR(handle)) {
                        status = PTR_ERR(handle);
                        mlog_errno(status);
@@ -940,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
        struct ocfs2_local_disk_chunk *dchunk;
        int status;
        handle_t *handle;
-        struct buffer_head *bh = NULL;
+        struct buffer_head *bh = NULL, *dbh = NULL;
        u64 p_blkno;
        /* We are protected by dqio_sem so no locking needed */
@@ -964,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
                mlog_errno(status);
                goto out;
        }
+        /* Local quota info and two new blocks we initialize */
+        handle = ocfs2_start_trans(OCFS2_SB(sb),
+                        OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+                        2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                goto out;
+        }
+        /* Initialize chunk header */
        down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
        status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
                                             &p_blkno, NULL, NULL);
        up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
        if (status < 0) {
                mlog_errno(status);
-                goto out;
+                goto out_trans;
        }
        bh = sb_getblk(sb, p_blkno);
        if (!bh) {
                status = -ENOMEM;
                mlog_errno(status);
-                goto out;
+                goto out_trans;
        }
        dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+        ocfs2_set_new_buffer_uptodate(lqinode, bh);
-        handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
-        if (IS_ERR(handle)) {
-                status = PTR_ERR(handle);
-                mlog_errno(status);
-                goto out;
-        }
        status = ocfs2_journal_access_dq(handle, lqinode, bh,
-                                         OCFS2_JOURNAL_ACCESS_WRITE);
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
        if (status < 0) {
                mlog_errno(status);
                goto out_trans;
@@ -999,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
        memset(dchunk->dqc_bitmap, 0,
               sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
               OCFS2_QBLK_RESERVED_SPACE);
-        set_buffer_uptodate(bh);
        unlock_buffer(bh);
        status = ocfs2_journal_dirty(handle, bh);
        if (status < 0) {
@@ -1007,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
                goto out_trans;
        }
+        /* Initialize new block with structures */
+        down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1,
+                                             &p_blkno, NULL, NULL);
+        up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        dbh = sb_getblk(sb, p_blkno);
+        if (!dbh) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto out_trans;
+        }
+        ocfs2_set_new_buffer_uptodate(lqinode, dbh);
+        status = ocfs2_journal_access_dq(handle, lqinode, dbh,
+                                         OCFS2_JOURNAL_ACCESS_CREATE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        lock_buffer(dbh);
+        memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE);
+        unlock_buffer(dbh);
+        status = ocfs2_journal_dirty(handle, dbh);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        /* Update local quotafile info */
        oinfo->dqi_blocks += 2;
        oinfo->dqi_chunks++;
        status = ocfs2_local_write_info(sb, type);
@@ -1031,6 +1068,7 @@ out_trans:
        ocfs2_commit_trans(OCFS2_SB(sb), handle);
 out:
        brelse(bh);
+        brelse(dbh);
        kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
        return ERR_PTR(status);
 }
@@ -1048,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
        struct ocfs2_local_disk_chunk *dchunk;
        int epb = ol_quota_entries_per_block(sb);
        unsigned int chunk_blocks;
+        struct buffer_head *bh;
+        u64 p_blkno;
        int status;
        handle_t *handle;
@@ -1075,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
                mlog_errno(status);
                goto out;
        }
-        handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+        /* Get buffer from the just added block */
+        down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+                                             &p_blkno, NULL, NULL);
+        up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out;
+        }
+        bh = sb_getblk(sb, p_blkno);
+        if (!bh) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto out;
+        }
+        ocfs2_set_new_buffer_uptodate(lqinode, bh);
+        /* Local quota info, chunk header and the new block we initialize */
+        handle = ocfs2_start_trans(OCFS2_SB(sb),
+                        OCFS2_LOCAL_QINFO_WRITE_CREDITS +
+                        2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS);
        if (IS_ERR(handle)) {
                status = PTR_ERR(handle);
                mlog_errno(status);
                goto out;
        }
+        /* Zero created block */
+        status = ocfs2_journal_access_dq(handle, lqinode, bh,
+                                 OCFS2_JOURNAL_ACCESS_CREATE);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        lock_buffer(bh);
+        memset(bh->b_data, 0, sb->s_blocksize);
+        unlock_buffer(bh);
+        status = ocfs2_journal_dirty(handle, bh);
+        if (status < 0) {
+                mlog_errno(status);
+                goto out_trans;
+        }
+        /* Update chunk header */
        status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
                                 OCFS2_JOURNAL_ACCESS_WRITE);
        if (status < 0) {
@@ -1097,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
                mlog_errno(status);
                goto out_trans;
        }
+        /* Update file header */
        oinfo->dqi_blocks++;
        status = ocfs2_local_write_info(sb, type);
        if (status < 0) {
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 3f661376a2de..e49c41050264 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -17,6 +17,7 @@
 * General Public License for more details.
 */
+#include <linux/kernel.h>
 #include <linux/crc32.h>
 #include <linux/module.h>
@@ -153,7 +154,7 @@ static int status_map[] = {
 static int dlm_status_to_errno(enum dlm_status status)
 {
-        BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
+        BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map));
        return status_map[status];
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 7efb349fb9bd..b0ee0fdf799a 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
                }
                di = (struct ocfs2_dinode *) (*bh)->b_data;
                memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats));
+                spin_lock_init(&stats->b_lock);
                status = ocfs2_verify_volume(di, *bh, blksize, stats);
                if (status >= 0)
                        goto bail;
@@ -1182,7 +1183,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        wake_up(&osb->osb_mount_event);
        /* Start this when the mount is almost sure of being successful */
-        ocfs2_orphan_scan_init(osb);
+        ocfs2_orphan_scan_start(osb);
        mlog_exit(status);
        return status;
@@ -1213,14 +1214,27 @@ static int ocfs2_get_sb(struct file_system_type *fs_type,
                           mnt);
 }
+/*
+ * ->kill_sb callback for ocfs2: stop the dentry-lock drop work from being
+ * queued or run any further, then let kill_block_super() tear down the
+ * superblock.
+ */
+static void ocfs2_kill_sb(struct super_block *sb)
+{
+        struct ocfs2_super *osb = OCFS2_SB(sb);
+        /*
+         * ->kill_sb is also invoked when a mount attempt fails, and by
+         * then the ocfs2 private superblock may not exist (any more), so
+         * only touch it when it is there.
+         */
+        if (osb) {
+                /* Prevent further queueing of inode drop events */
+                spin_lock(&dentry_list_lock);
+                ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
+                spin_unlock(&dentry_list_lock);
+                /* Wait for work to finish and/or remove it */
+                cancel_work_sync(&osb->dentry_lock_work);
+        }
+        /* Generic block-device superblock teardown runs in every case */
+        kill_block_super(sb);
+}
 static struct file_system_type ocfs2_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ocfs2",
        .get_sb         = ocfs2_get_sb, /* is this called when we mount
                                        * the fs? */
-        .kill_sb        = kill_block_super, /* set to the generic one
+        .kill_sb        = ocfs2_kill_sb,
-                                             * right now, but do we
-                                             * need to change that? */
        .fs_flags       = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
        .next           = NULL
 };
@@ -1819,6 +1833,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
        debugfs_remove(osb->osb_ctxt);
+        /*
+         * Flush inode dropping work queue so that deletes are
+         * performed while the filesystem is still working
+         */
+        ocfs2_drop_all_dl_inodes(osb);
        /* Orphan scan should be stopped as early as possible */
        ocfs2_orphan_scan_stop(osb);
@@ -1981,6 +2001,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
        snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
                 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+        ocfs2_orphan_scan_init(osb);
        status = ocfs2_recovery_init(osb);
        if (status) {
                mlog(ML_ERROR, "Unable to initialize recovery state\n");
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ba320e250747..d1a27cda984f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode,
        struct ocfs2_xattr_block *xb;
        struct ocfs2_xattr_value_root *xv;
        size_t size;
-        int ret = -ENODATA, name_offset, name_len, block_off, i;
+        int ret = -ENODATA, name_offset, name_len, i;
+        int uninitialized_var(block_off);
        xs->bucket = ocfs2_xattr_bucket_new(inode);
        if (!xs->bucket) {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3ce5ae9e3d2d..175db258942f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task)
 struct mm_struct *mm_for_maps(struct task_struct *task)
 {
-        struct mm_struct *mm = get_task_mm(task);
+        struct mm_struct *mm;
-        if (!mm)
+        if (mutex_lock_killable(&task->cred_guard_mutex))
                return NULL;
-        down_read(&mm->mmap_sem);
-        task_lock(task);
+        mm = get_task_mm(task);
-        if (task->mm != mm)
+        if (mm && mm != current->mm &&
-                goto out;
+                        !ptrace_may_access(task, PTRACE_MODE_READ)) {
-        if (task->mm != current->mm &&
+                mmput(mm);
-            __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
+                mm = NULL;
-                goto out;
+        }
-        task_unlock(task);
+        mutex_unlock(&task->cred_guard_mutex);
        return mm;
-out:
-        task_unlock(task);
-        up_read(&mm->mmap_sem);
-        mmput(mm);
-        return NULL;
 }
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6f61b7cc32e0..9bd8be1d235c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
        mm = mm_for_maps(priv->task);
        if (!mm)
                return NULL;
+        down_read(&mm->mmap_sem);
        tail_vma = get_gate_vma(priv->task);
        priv->tail_vma = tail_vma;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 64a72e2e7650..8f5c05d3dbd3 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
                priv->task = NULL;
                return NULL;
        }
+        down_read(&mm->mmap_sem);
        /* start from the Nth VMA */
        for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
diff --git a/fs/select.c b/fs/select.c
index d870237e42c7..8084834e123e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq)
 {
        init_poll_funcptr(&pwq->pt, __pollwait);
        pwq->polling_task = current;
+        pwq->triggered = 0;
        pwq->error = 0;
        pwq->table = NULL;
        pwq->inline_index = 0;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 0c93c7ef3d18..965df1227d64 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -770,7 +770,7 @@ xfs_buf_associate_memory(
        bp->b_pages = NULL;
        bp->b_addr = mem;
-        rval = _xfs_buf_get_pages(bp, page_count, 0);
+        rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
        if (rval)
                return rval;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index b619d6b8ca43..98ef624d9baf 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -708,6 +708,16 @@ xfs_reclaim_inode(
        return 0;
 }
+/*
+ * Set XFS_ICI_RECLAIM_TAG in the per-AG inode radix tree (pag_ici_root)
+ * at the index XFS_INO_TO_AGINO() yields for this inode. No locks are
+ * taken here; xfs_inode_set_reclaim_tag() calls this with pag_ici_lock
+ * read-held and the inode's i_flags_lock held.
+ */
+void
+__xfs_inode_set_reclaim_tag(
+        struct xfs_perag        *pag,
+        struct xfs_inode        *ip)
+{
+        radix_tree_tag_set(&pag->pag_ici_root,
+                           XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+                           XFS_ICI_RECLAIM_TAG);
+}
 /*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
        read_lock(&pag->pag_ici_lock);
        spin_lock(&ip->i_flags_lock);
-        radix_tree_tag_set(&pag->pag_ici_root,
+        __xfs_inode_set_reclaim_tag(pag, ip);
-                        XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
        __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
        spin_unlock(&ip->i_flags_lock);
        read_unlock(&pag->pag_ici_lock);
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 2a10301c99c7..59120602588a 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
 void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
 void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
                                struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index db15feb906ff..4ece1906bd41 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
                        dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
                        blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
                        error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno,
-                                             blkcnt, XFS_BUF_LOCK, &bp);
+                                             blkcnt,
+                                             XFS_BUF_LOCK | XBF_DONT_BLOCK,
+                                             &bp);
                        if (error)
                                return(error);
@@ -2141,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
                dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
                blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-                bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno,
+                bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt,
-                                                        blkcnt, XFS_BUF_LOCK);
+                                       XFS_BUF_LOCK | XBF_DONT_BLOCK);
                ASSERT(bp);
                ASSERT(!XFS_BUF_GETERROR(bp));
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 7928b9983c1d..8ee5b5a76a2a 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -6009,7 +6009,7 @@ xfs_getbmap(
         */
        error = ENOMEM;
        subnex = 16;
-        map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL);
+        map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
        if (!map)
                goto out_unlock_ilock;
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e9df99574829..26717388acf5 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -120,8 +120,8 @@ xfs_btree_check_sblock(
                        XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
                if (bp)
                        xfs_buftrace("SBTREE ERROR", bp);
-                XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW,
+                XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
-                                 cur->bc_mp);
+                        XFS_ERRLEVEL_LOW, cur->bc_mp, block);
                return XFS_ERROR(EFSCORRUPTED);
        }
        return 0;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9ff6e57a5075..2847bbc1c534 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone;		/* dabuf zone */
 xfs_da_state_t *
 xfs_da_state_alloc(void)
 {
-        return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP);
+        return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS);
 }
 /*
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra)
        int             off;
        if (nbuf == 1)
-                dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP);
+                dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS);
        else
-                dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP);
+                dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS);
        dabuf->dirty = 0;
 #ifdef XFS_DABUF_DEBUG
        dabuf->ra = ra;
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index c657bec6d951..bb1d58eb3982 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result(
                                        !(args->op_flags & XFS_DA_OP_CILOOKUP))
                return EEXIST;
-        args->value = kmem_alloc(len, KM_MAYFAIL);
+        args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
        if (!args->value)
                return ENOMEM;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cbd451bb4848..2d0b3e1da9e6 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -167,17 +167,25 @@ xfs_growfs_data_private(
        new = nb - mp->m_sb.sb_dblocks;
        oagcount = mp->m_sb.sb_agcount;
        if (nagcount > oagcount) {
+                void *new_perag, *old_perag;
                xfs_filestream_flush(mp);
+                new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount,
+                                        KM_MAYFAIL);
+                if (!new_perag)
+                        return XFS_ERROR(ENOMEM);
                down_write(&mp->m_peraglock);
-                mp->m_perag = kmem_realloc(mp->m_perag,
+                memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
-                        sizeof(xfs_perag_t) * nagcount,
+                old_perag = mp->m_perag;
-                        sizeof(xfs_perag_t) * oagcount,
+                mp->m_perag = new_perag;
-                        KM_SLEEP);
-                memset(&mp->m_perag[oagcount], 0,
-                        (nagcount - oagcount) * sizeof(xfs_perag_t));
                mp->m_flags |= XFS_MOUNT_32BITINODES;
                nagimax = xfs_initialize_perag(mp, nagcount);
                up_write(&mp->m_peraglock);
+                kmem_free(old_perag);
        }
        tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS);
        tp->t_flags |= XFS_TRANS_RESERVE;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 5fcec6f020a7..ecbf8b4d2e2e 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -64,6 +64,10 @@ xfs_inode_alloc(
        ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
        if (!ip)
                return NULL;
+        if (inode_init_always(mp->m_super, VFS_I(ip))) {
+                kmem_zone_free(xfs_inode_zone, ip);
+                return NULL;
+        }
        ASSERT(atomic_read(&ip->i_iocount) == 0);
        ASSERT(atomic_read(&ip->i_pincount) == 0);
@@ -105,17 +109,6 @@ xfs_inode_alloc(
 #ifdef XFS_DIR2_TRACE
        ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
 #endif
-        /*
-        * Now initialise the VFS inode. We do this after the xfs_inode
-        * initialisation as internal failures will result in ->destroy_inode
-        * being called and that will pass down through the reclaim path and
-        * free the XFS inode. This path requires the XFS inode to already be
-        * initialised. Hence if this call fails, the xfs_inode has already
-        * been freed and we should not reference it at all in the error
-        * handling.
-        */
-        if (!inode_init_always(mp->m_super, VFS_I(ip)))
-                return NULL;
        /* prevent anyone from using this yet */
        VFS_I(ip)->i_state = I_NEW|I_LOCK;
@@ -123,6 +116,71 @@ xfs_inode_alloc(
        return ip;
 }
+STATIC void
+xfs_inode_free(
+        struct xfs_inode        *ip)
+{
+        switch (ip->i_d.di_mode & S_IFMT) {
+        case S_IFREG:
+        case S_IFDIR:
+        case S_IFLNK:
+                xfs_idestroy_fork(ip, XFS_DATA_FORK);
+                break;
+        }
+        if (ip->i_afp)
+                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+#ifdef XFS_INODE_TRACE
+        ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+        ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+        ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+        ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+        ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+        ktrace_free(ip->i_dir_trace);
+#endif
+        if (ip->i_itemp) {
+                /*
+                 * Only if we are shutting down the fs will we see an
+                 * inode still in the AIL. If it is there, we should remove
+                 * it to prevent a use-after-free from occurring.
+                 */
+                xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
+                struct xfs_ail  *ailp = lip->li_ailp;
+                ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+                                       XFS_FORCED_SHUTDOWN(ip->i_mount));
+                if (lip->li_flags & XFS_LI_IN_AIL) {
+                        spin_lock(&ailp->xa_lock);
+                        if (lip->li_flags & XFS_LI_IN_AIL)
+                                xfs_trans_ail_delete(ailp, lip);
+                        else
+                                spin_unlock(&ailp->xa_lock);
+                }
+                xfs_inode_item_destroy(ip);
+                ip->i_itemp = NULL;
+        }
+        /* asserts to verify all state is correct here */
+        ASSERT(atomic_read(&ip->i_iocount) == 0);
+        ASSERT(atomic_read(&ip->i_pincount) == 0);
+        ASSERT(!spin_is_locked(&ip->i_flags_lock));
+        ASSERT(completion_done(&ip->i_flush));
+        kmem_zone_free(xfs_inode_zone, ip);
+}
 /*
 * Check the validity of the inode we just found it the cache
 */
@@ -133,80 +191,82 @@ xfs_iget_cache_hit(
        int                     flags,
        int                     lock_flags) __releases(pag->pag_ici_lock)
 {
+        struct inode            *inode = VFS_I(ip);
        struct xfs_mount        *mp = ip->i_mount;
-        int                     error = EAGAIN;
+        int                     error;
+        spin_lock(&ip->i_flags_lock);
        /*
-         * If INEW is set this inode is being set up
+         * If we are racing with another cache hit that is currently
-         * If IRECLAIM is set this inode is being torn down
+         * instantiating this inode or currently recycling it out of
-         * Pause and try again.
+         * reclaimable state, wait for the initialisation to complete
+         * before continuing.
+         *
+         * XXX(hch): eventually we should do something equivalent to
+         *           wait_on_inode to wait for these flags to be cleared
+         *           instead of polling for it.
         */
-        if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+        if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
                XFS_STATS_INC(xs_ig_frecycle);
+                error = EAGAIN;
                goto out_error;
        }
-        /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
+        /*
-        if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+         * If lookup is racing with unlink return an error immediately.
+         */
-                /*
+        if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-                 * If lookup is racing with unlink, then we should return an
+                error = ENOENT;
-                 * error immediately so we don't remove it from the reclaim
+                goto out_error;
-                 * list and potentially leak the inode.
+        }
-                 */
-                if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-                        error = ENOENT;
-                        goto out_error;
-                }
+        /*
+         * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+         * Need to carefully get it back into useable state.
+         */
+        if (ip->i_flags & XFS_IRECLAIMABLE) {
                xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
                /*
-                 * We need to re-initialise the VFS inode as it has been
+                 * We need to set XFS_INEW atomically with clearing the
-                 * 'freed' by the VFS. Do this here so we can deal with
+                 * reclaimable tag so that we do have an indicator of the
-                 * errors cleanly, then tag it so it can be set up correctly
+                 * inode still being initialized.
-                 * later.
                 */
-                if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+                ip->i_flags |= XFS_INEW;
-                        error = ENOMEM;
+                ip->i_flags &= ~XFS_IRECLAIMABLE;
-                        goto out_error;
+                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
-                }
-                /*
+                spin_unlock(&ip->i_flags_lock);
-                 * We must set the XFS_INEW flag before clearing the
+                read_unlock(&pag->pag_ici_lock);
-                 * XFS_IRECLAIMABLE flag so that if a racing lookup does
-                 * not find the XFS_IRECLAIMABLE above but has the igrab()
-                 * below succeed we can safely check XFS_INEW to detect
-                 * that this inode is still being initialised.
-                 */
-                xfs_iflags_set(ip, XFS_INEW);
-                xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
-                /* clear the radix tree reclaim flag as well. */
+                error = -inode_init_always(mp->m_super, inode);
-                __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+                if (error) {
-        } else if (!igrab(VFS_I(ip))) {
+                        /*
+                         * Re-initializing the inode failed, and we are in deep
+                         * trouble.  Try to re-add it to the reclaim list.
+                         */
+                        read_lock(&pag->pag_ici_lock);
+                        spin_lock(&ip->i_flags_lock);
+                        ip->i_flags &= ~XFS_INEW;
+                        ip->i_flags |= XFS_IRECLAIMABLE;
+                        __xfs_inode_set_reclaim_tag(pag, ip);
+                        goto out_error;
+                }
+                inode->i_state = I_LOCK|I_NEW;
+        } else {
                /* If the VFS inode is being torn down, pause and try again. */
-                XFS_STATS_INC(xs_ig_frecycle);
+                if (!igrab(inode)) {
-                goto out_error;
+                        error = EAGAIN;
-        } else if (xfs_iflags_test(ip, XFS_INEW)) {
+                        goto out_error;
-                /*
+                }
-                 * We are racing with another cache hit that is
-                 * currently recycling this inode out of the XFS_IRECLAIMABLE
-                 * state. Wait for the initialisation to complete before
-                 * continuing.
-                 */
-                wait_on_inode(VFS_I(ip));
-        }
-        if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+                /* We've got a live one. */
-                error = ENOENT;
+                spin_unlock(&ip->i_flags_lock);
-                iput(VFS_I(ip));
+                read_unlock(&pag->pag_ici_lock);
-                goto out_error;
        }
-        /* We've got a live one. */
-        read_unlock(&pag->pag_ici_lock);
        if (lock_flags != 0)
                xfs_ilock(ip, lock_flags);
@@ -216,6 +276,7 @@ xfs_iget_cache_hit(
        return 0;
 out_error:
+        spin_unlock(&ip->i_flags_lock);
        read_unlock(&pag->pag_ici_lock);
        return error;
 }
@@ -299,7 +360,8 @@ out_preload_end:
        if (lock_flags)
                xfs_iunlock(ip, lock_flags);
 out_destroy:
-        xfs_destroy_inode(ip);
+        __destroy_inode(VFS_I(ip));
+        xfs_inode_free(ip);
        return error;
 }
@@ -504,62 +566,7 @@ xfs_ireclaim(
        xfs_qm_dqdetach(ip);
        xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-        switch (ip->i_d.di_mode & S_IFMT) {
+        xfs_inode_free(ip);
-        case S_IFREG:
-        case S_IFDIR:
-        case S_IFLNK:
-                xfs_idestroy_fork(ip, XFS_DATA_FORK);
-                break;
-        }
-        if (ip->i_afp)
-                xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-#ifdef XFS_INODE_TRACE
-        ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
-        ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BTREE_TRACE
-        ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
-        ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
-        ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
-        ktrace_free(ip->i_dir_trace);
-#endif
-        if (ip->i_itemp) {
-                /*
-                 * Only if we are shutting down the fs will we see an
-                 * inode still in the AIL. If it is there, we should remove
-                 * it to prevent a use-after-free from occurring.
-                 */
-                xfs_log_item_t  *lip = &ip->i_itemp->ili_item;
-                struct xfs_ail  *ailp = lip->li_ailp;
-                ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
-                                       XFS_FORCED_SHUTDOWN(ip->i_mount));
-                if (lip->li_flags & XFS_LI_IN_AIL) {
-                        spin_lock(&ailp->xa_lock);
-                        if (lip->li_flags & XFS_LI_IN_AIL)
-                                xfs_trans_ail_delete(ailp, lip);
-                        else
-                                spin_unlock(&ailp->xa_lock);
-                }
-                xfs_inode_item_destroy(ip);
-                ip->i_itemp = NULL;
-        }
-        /* asserts to verify all state is correct here */
-        ASSERT(atomic_read(&ip->i_iocount) == 0);
-        ASSERT(atomic_read(&ip->i_pincount) == 0);
-        ASSERT(!spin_is_locked(&ip->i_flags_lock));
-        ASSERT(completion_done(&ip->i_flush));
-        kmem_zone_free(xfs_inode_zone, ip);
 }
 /*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1f22d65fed0a..da428b3fe0f5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -343,6 +343,16 @@ xfs_iformat(
                return XFS_ERROR(EFSCORRUPTED);
        }
+        if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
+                     !ip->i_mount->m_rtdev_targp)) {
+                xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
+                        "corrupt dinode %Lu, has realtime flag set.",
+                        ip->i_ino);
+                XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
+                                     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+                return XFS_ERROR(EFSCORRUPTED);
+        }
        switch (ip->i_d.di_mode & S_IFMT) {
        case S_IFIFO:
        case S_IFCHR:
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1804f866a71d..65f24a3cc992 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -310,23 +310,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 }
 /*
- * Get rid of a partially initialized inode.
- *
- * We have to go through destroy_inode to make sure allocations
- * from init_inode_always like the security data are undone.
- *
- * We mark the inode bad so that it takes the short cut in
- * the reclaim path instead of going through the flush path
- * which doesn't make sense for an inode that has never seen the
- * light of day.
- */
-static inline void xfs_destroy_inode(struct xfs_inode *ip)
-{
-        make_bad_inode(VFS_I(ip));
-        return destroy_inode(VFS_I(ip));
-}
-/*
 * i_flags helper functions
 */
 static inline void
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3750f04ede0b..9dbdff3ea484 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3180,7 +3180,7 @@ try_again:
 STATIC void
 xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
 {
-        ASSERT(spin_is_locked(&log->l_icloglock));
+        assert_spin_locked(&log->l_icloglock);
        if (iclog->ic_state == XLOG_STATE_ACTIVE) {
                xlog_state_switch_iclogs(log, iclog, 0);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c4eca5ed5dab..492d75bae2bf 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -538,7 +538,9 @@ xfs_readlink_bmap(
                d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
                byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-                bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+                bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
+                                        XBF_LOCK | XBF_MAPPED |
+                                        XBF_DONT_BLOCK);
                error = XFS_BUF_GETERROR(bp);
                if (error) {
                        xfs_ioerror_alert("xfs_readlink",