43 files changed, 677 insertions, 278 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 3f00a9faabcb..530581628311 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -325,8 +325,8 @@ config FS_POSIX_ACL
 source "fs/xfs/Kconfig"
 config OCFS2_FS
-        tristate "OCFS2 file system support (EXPERIMENTAL)"
+        tristate "OCFS2 file system support"
-        depends on NET && SYSFS && EXPERIMENTAL
+        depends on NET && SYSFS
        select CONFIGFS_FS
        select JBD
        select CRC32
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index df025453dd97..816e8ef64560 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -86,6 +86,32 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare
        return sd;
 }
+/*
+ * configfs_dirent_exists - look for a child of parent_sd named "new"
+ *
+ * Walks parent_sd->s_children and compares the name of every child that
+ * has a non-NULL s_element (as reported by configfs_get_name()) against
+ * "new" with strcmp().  Children without an s_element are not compared.
+ *
+ * Return -EEXIST if there is already a configfs element with the same
+ * name for the same parent, 0 if no such element was found.
+ *
+ * called with parent inode's i_mutex held
+ */
+int configfs_dirent_exists(struct configfs_dirent *parent_sd,
+                           const unsigned char *new)
+{
+        struct configfs_dirent * sd;
+        list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
+                if (sd->s_element) {    /* only named elements take part in the check */
+                        const unsigned char *existing = configfs_get_name(sd);
+                        if (strcmp(existing, new))      /* non-zero: names differ, keep looking */
+                                continue;
+                        else
+                                return -EEXIST;
+                }
+        }
+        return 0;       /* no child carries this name */
+}
 int configfs_make_dirent(struct configfs_dirent * parent_sd,
                         struct dentry * dentry, void * element,
                         umode_t mode, int type)
@@ -136,8 +162,10 @@ static int create_dir(struct config_item * k, struct dentry * p,
        int error;
        umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO;
-        error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+        error = configfs_dirent_exists(p->d_fsdata, d->d_name.name);
-                                     CONFIGFS_DIR);
+        if (!error)
+                error = configfs_make_dirent(p->d_fsdata, d, k, mode,
+                                             CONFIGFS_DIR);
        if (!error) {
                error = configfs_create(d, mode, init_dir);
                if (!error) {
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index d4870432ecfc..b1981d0e95ad 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -539,7 +539,6 @@ unsigned long ext2_count_free (struct buffer_head * map, unsigned int numchars)
 #endif  /*  EXT2FS_DEBUG  */
-/* Superblock must be locked */
 unsigned long ext2_count_free_blocks (struct super_block * sb)
 {
        struct ext2_group_desc * desc;
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index de85c61c58c5..695f69ccf908 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -637,7 +637,6 @@ fail:
        return ERR_PTR(err);
 }
-/* Superblock must be locked */
 unsigned long ext2_count_free_inodes (struct super_block * sb)
 {
        struct ext2_group_desc *desc;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 681dea8f9532..4286ff6330b6 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -251,6 +251,44 @@ static struct super_operations ext2_sops = {
 #endif
 };
+static struct dentry *ext2_get_dentry(struct super_block *sb, void *vobjp)
+{       /* export op: turn an (inode number, generation) pair into a dentry */
+        __u32 *objp = vobjp;            /* vobjp is read as an array of 32-bit words */
+        unsigned long ino = objp[0];    /* word 0: inode number */
+        __u32 generation = objp[1];     /* word 1: generation; 0 skips the generation test below */
+        struct inode *inode;
+        struct dentry *result;
+        if (ino < EXT2_FIRST_INO(sb) && ino != EXT2_ROOT_INO)
+                return ERR_PTR(-ESTALE);
+        if (ino > le32_to_cpu(EXT2_SB(sb)->s_es->s_inodes_count))
+                return ERR_PTR(-ESTALE);
+        /* The two checks above reject inode numbers below
+         * EXT2_FIRST_INO(sb) (unless it is EXT2_ROOT_INO) and numbers
+         * above the superblock's s_inodes_count with -ESTALE.
+         *
+         * iget isn't really right if the inode is currently unallocated!!
+         * ext2_read_inode currently does appropriate checks, but
+         * it might be "neater" to call ext2_get_inode first and check
+         * if the inode is valid.....
+         */
+        inode = iget(sb, ino);
+        if (inode == NULL)
+                return ERR_PTR(-ENOMEM);
+        if (is_bad_inode(inode) ||
+            (generation && inode->i_generation != generation)) {
+                /* we didn't find the right inode..
+                 * drop the reference taken by iget() before bailing out
+                 */
+                iput(inode);
+                return ERR_PTR(-ESTALE);
+        }
+        /* now to find a dentry.
+         * If possible, get a well-connected one
+         *
+         * If d_alloc_anon() returns NULL the inode reference is dropped
+         * and -ENOMEM is returned; otherwise the dentry is handed back.
+         */
+        result = d_alloc_anon(inode);
+        if (!result) {
+                iput(inode);
+                return ERR_PTR(-ENOMEM);
+        }
+        return result;
+}
 /* Yes, most of these are left as NULL!!
 * A NULL value implies the default, which works with ext2-like file
 * systems, but can be improved upon.
@@ -258,6 +296,7 @@ static struct super_operations ext2_sops = {
 */
 static struct export_operations ext2_export_ops = {
        .get_parent = ext2_get_parent,
+        .get_dentry = ext2_get_dentry,
 };
 static unsigned long get_sb_block(void **data)
@@ -1044,7 +1083,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
        unsigned long overhead;
        int i;
-        lock_super(sb);
        if (test_opt (sb, MINIX_DF))
                overhead = 0;
        else {
@@ -1085,7 +1123,6 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf)
        buf->f_files = le32_to_cpu(sbi->s_es->s_inodes_count);
        buf->f_ffree = ext2_count_free_inodes (sb);
        buf->f_namelen = EXT2_NAME_LEN;
-        unlock_super(sb);
        return 0;
 }
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index c5ee9f0691e3..84be02e93652 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -925,7 +925,7 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode,
        set_buffer_new(bh_result);
 got_it:
        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
-        if (blocks_to_boundary == 0)
+        if (count > blocks_to_boundary)
                set_buffer_boundary(bh_result);
        err = count;
        /* Clean up and exit */
@@ -1009,11 +1009,14 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
        buffer_trace_init(&dummy.b_history);
        err = ext3_get_blocks_handle(handle, inode, block, 1,
                                        &dummy, create, 1);
-        if (err == 1) {
+        /*
+         * ext3_get_blocks_handle() returns number of blocks
+         * mapped. 0 in case of a HOLE.
+         */
+        if (err > 0) {
+                if (err > 1)
+                        WARN_ON(1);
                err = 0;
-        } else if (err >= 0) {
-                WARN_ON(1);
-                err = -EIO;
        }
        *errp = err;
        if (!err && buffer_mapped(&dummy)) {
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 813d589cc6c0..3559086eee5f 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -554,6 +554,47 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
        return 0;
 }
+static struct dentry *ext3_get_dentry(struct super_block *sb, void *vobjp)
+{       /* export op: turn an (inode number, generation) pair into a dentry */
+        __u32 *objp = vobjp;            /* vobjp is read as an array of 32-bit words */
+        unsigned long ino = objp[0];    /* word 0: inode number */
+        __u32 generation = objp[1];     /* word 1: generation; 0 skips the generation test below */
+        struct inode *inode;
+        struct dentry *result;
+        if (ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO)
+                return ERR_PTR(-ESTALE);
+        if (ino > le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count))
+                return ERR_PTR(-ESTALE);
+        /* The two checks above reject inode numbers below
+         * EXT3_FIRST_INO(sb) (unless it is EXT3_ROOT_INO) and numbers
+         * above the superblock's s_inodes_count with -ESTALE.
+         *
+         * iget isn't really right if the inode is currently unallocated!!
+         *
+         * ext3_read_inode will return a bad_inode if the inode had been
+         * deleted, so we should be safe.
+         *
+         * Currently we don't know the generation for parent directory, so
+         * a generation of 0 means "accept any"
+         */
+        inode = iget(sb, ino);
+        if (inode == NULL)
+                return ERR_PTR(-ENOMEM);
+        if (is_bad_inode(inode) ||
+            (generation && inode->i_generation != generation)) {
+                iput(inode);    /* drop the reference taken by iget() */
+                return ERR_PTR(-ESTALE);
+        }
+        /* now to find a dentry.
+         * If possible, get a well-connected one
+         *
+         * If d_alloc_anon() returns NULL the inode reference is dropped
+         * and -ENOMEM is returned; otherwise the dentry is handed back.
+         */
+        result = d_alloc_anon(inode);
+        if (!result) {
+                iput(inode);
+                return ERR_PTR(-ENOMEM);
+        }
+        return result;
+}
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -622,6 +663,7 @@ static struct super_operations ext3_sops = {
 static struct export_operations ext3_export_ops = {
        .get_parent = ext3_get_parent,
+        .get_dentry = ext3_get_dentry,
 };
 enum {
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 2e0cc8e00b85..3a566077ac95 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -41,11 +41,7 @@ struct jffs2_inode_info {
        uint16_t flags;
        uint8_t usercompr;
-#if !defined (__ECOS)
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,2)
        struct inode vfs_inode;
-#endif
-#endif
 #ifdef CONFIG_JFFS2_FS_POSIX_ACL
        struct posix_acl *i_acl_access;
        struct posix_acl *i_acl_default;
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 7675b33396c7..5a6b4d64206c 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -21,6 +21,9 @@
 #include <linux/pagemap.h>
 #include "nodelist.h"
+static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c,
+                                     struct jffs2_node_frag *this);
 void jffs2_add_fd_to_list(struct jffs2_sb_info *c, struct jffs2_full_dirent *new, struct jffs2_full_dirent **list)
 {
        struct jffs2_full_dirent **prev = list;
@@ -87,7 +90,8 @@ void jffs2_truncate_fragtree(struct jffs2_sb_info *c, struct rb_root *list, uint
        }
 }
-void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, struct jffs2_node_frag *this)
+static void jffs2_obsolete_node_frag(struct jffs2_sb_info *c,
+                                     struct jffs2_node_frag *this)
 {
        if (this->node) {
                this->node->frags--;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index cae92c14116d..0ddfd70307fb 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -334,7 +334,6 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete);
 struct rb_node *rb_next(struct rb_node *);
 struct rb_node *rb_prev(struct rb_node *);
 void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root);
-void jffs2_obsolete_node_frag(struct jffs2_sb_info *c, struct jffs2_node_frag *this);
 int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn);
 void jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size);
 int jffs2_add_older_frag_to_fragtree(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_tmp_dnode_info *tn);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index c19bd476e8ec..e52cef526d90 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -252,6 +252,11 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs,
        union jffs2_node_union *node;
        struct jffs2_eraseblock *jeb;
+        if (c->summary->sum_size == JFFS2_SUMMARY_NOSUM_SIZE) {
+                dbg_summary("Summary is disabled for this jeb! Skipping summary info!\n");
+                return 0;
+        }
        node = invecs[0].iov_base;
        jeb = &c->blocks[ofs / c->sector_size];
        ofs -= jeb->offset;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 25bc1ae08648..4da09ce1d1f5 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1215,7 +1215,6 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt
        rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE);
        if (rc) {
                JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen);
-                rc = rc ? rc : -EBADFD;
                goto out;
        }
        rc = save_xattr_datum(c, xd);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index fecd3b095deb..76ca1cbc38f9 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -100,25 +100,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
        return atomic_dec_and_test(&dreq->io_count);
 }
-/*
- * "size" is never larger than rsize or wsize.
- */
-static inline int nfs_direct_count_pages(unsigned long user_addr, size_t size)
-{
-        int page_count;
-        page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        page_count -= user_addr >> PAGE_SHIFT;
-        BUG_ON(page_count < 0);
-        return page_count;
-}
-static inline unsigned int nfs_max_pages(unsigned int size)
-{
-        return (size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-}
 /**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
@@ -276,28 +257,24 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
        struct nfs_open_context *ctx = dreq->ctx;
        struct inode *inode = ctx->dentry->d_inode;
        size_t rsize = NFS_SERVER(inode)->rsize;
-        unsigned int rpages = nfs_max_pages(rsize);
        unsigned int pgbase;
        int result;
        ssize_t started = 0;
        get_dreq(dreq);
-        pgbase = user_addr & ~PAGE_MASK;
        do {
                struct nfs_read_data *data;
                size_t bytes;
+                pgbase = user_addr & ~PAGE_MASK;
+                bytes = min(rsize,count);
                result = -ENOMEM;
-                data = nfs_readdata_alloc(rpages);
+                data = nfs_readdata_alloc(pgbase + bytes);
                if (unlikely(!data))
                        break;
-                bytes = rsize;
-                if (count < rsize)
-                        bytes = count;
-                data->npages = nfs_direct_count_pages(user_addr, bytes);
                down_read(&current->mm->mmap_sem);
                result = get_user_pages(current, current->mm, user_addr,
                                        data->npages, 1, 0, data->pagevec, NULL);
@@ -344,8 +321,10 @@ static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq, unsigned lo
                started += bytes;
                user_addr += bytes;
                pos += bytes;
+                /* FIXME: Remove this unnecessary math from final patch */
                pgbase += bytes;
                pgbase &= ~PAGE_MASK;
+                BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
                count -= bytes;
        } while (count != 0);
@@ -524,7 +503,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
 {
-        dreq->commit_data = nfs_commit_alloc(0);
+        dreq->commit_data = nfs_commit_alloc();
        if (dreq->commit_data != NULL)
                dreq->commit_data->req = (struct nfs_page *) dreq;
 }
@@ -605,28 +584,24 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
        struct nfs_open_context *ctx = dreq->ctx;
        struct inode *inode = ctx->dentry->d_inode;
        size_t wsize = NFS_SERVER(inode)->wsize;
-        unsigned int wpages = nfs_max_pages(wsize);
        unsigned int pgbase;
        int result;
        ssize_t started = 0;
        get_dreq(dreq);
-        pgbase = user_addr & ~PAGE_MASK;
        do {
                struct nfs_write_data *data;
                size_t bytes;
+                pgbase = user_addr & ~PAGE_MASK;
+                bytes = min(wsize,count);
                result = -ENOMEM;
-                data = nfs_writedata_alloc(wpages);
+                data = nfs_writedata_alloc(pgbase + bytes);
                if (unlikely(!data))
                        break;
-                bytes = wsize;
-                if (count < wsize)
-                        bytes = count;
-                data->npages = nfs_direct_count_pages(user_addr, bytes);
                down_read(&current->mm->mmap_sem);
                result = get_user_pages(current, current->mm, user_addr,
                                        data->npages, 0, 0, data->pagevec, NULL);
@@ -676,8 +651,11 @@ static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq, unsigned l
                started += bytes;
                user_addr += bytes;
                pos += bytes;
+                /* FIXME: Remove this useless math from the final patch */
                pgbase += bytes;
                pgbase &= ~PAGE_MASK;
+                BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
                count -= bytes;
        } while (count != 0);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 153898e1331f..b14145b7b87f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -970,7 +970,7 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
        status = -ENOMEM;
        opendata = nfs4_opendata_alloc(dentry, sp, flags, sattr);
        if (opendata == NULL)
-                goto err_put_state_owner;
+                goto err_release_rwsem;
        status = _nfs4_proc_open(opendata);
        if (status != 0)
@@ -989,11 +989,11 @@ static int _nfs4_do_open(struct inode *dir, struct dentry *dentry, int flags, st
        return 0;
 err_opendata_free:
        nfs4_opendata_free(opendata);
+err_release_rwsem:
+        up_read(&clp->cl_sem);
 err_put_state_owner:
        nfs4_put_state_owner(sp);
 out_err:
-        /* Note: clp->cl_sem must be released before nfs4_put_open_state()! */
-        up_read(&clp->cl_sem);
        *res = NULL;
        return status;
 }
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index da9cf11c326f..f0aff824a291 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -43,13 +43,15 @@ static mempool_t *nfs_rdata_mempool;
 #define MIN_POOL_READ   (32)
-struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
+struct nfs_read_data *nfs_readdata_alloc(size_t len)
 {
+        unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
+                p->npages = pagecount;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -140,7 +142,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
        int             result;
        struct nfs_read_data *rdata;
-        rdata = nfs_readdata_alloc(1);
+        rdata = nfs_readdata_alloc(count);
        if (!rdata)
                return -ENOMEM;
@@ -202,9 +204,11 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
        NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
        spin_unlock(&inode->i_lock);
-        nfs_readpage_truncate_uninitialised_page(rdata);
+        if (rdata->res.eof || rdata->res.count == rdata->args.count) {
-        if (rdata->res.eof || rdata->res.count == rdata->args.count)
                SetPageUptodate(page);
+                if (rdata->res.eof && count != 0)
+                        memclear_highpage_flush(page, rdata->args.pgbase, count);
+        }
        result = 0;
 io_error:
@@ -336,25 +340,25 @@ static int nfs_pagein_multi(struct list_head *head, struct inode *inode)
        struct nfs_page *req = nfs_list_entry(head->next);
        struct page *page = req->wb_page;
        struct nfs_read_data *data;
-        unsigned int rsize = NFS_SERVER(inode)->rsize;
+        size_t rsize = NFS_SERVER(inode)->rsize, nbytes;
-        unsigned int nbytes, offset;
+        unsigned int offset;
        int requests = 0;
        LIST_HEAD(list);
        nfs_list_remove_request(req);
        nbytes = req->wb_bytes;
-        for(;;) {
+        do {
-                data = nfs_readdata_alloc(1);
+                size_t len = min(nbytes,rsize);
+                data = nfs_readdata_alloc(len);
                if (!data)
                        goto out_bad;
                INIT_LIST_HEAD(&data->pages);
                list_add(&data->pages, &list);
                requests++;
-                if (nbytes <= rsize)
+                nbytes -= len;
-                        break;
+        } while(nbytes != 0);
-                nbytes -= rsize;
-        }
        atomic_set(&req->wb_complete, requests);
        ClearPageError(page);
@@ -402,7 +406,7 @@ static int nfs_pagein_one(struct list_head *head, struct inode *inode)
        if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
                return nfs_pagein_multi(head, inode);
-        data = nfs_readdata_alloc(NFS_SERVER(inode)->rpages);
+        data = nfs_readdata_alloc(NFS_SERVER(inode)->rsize);
        if (!data)
                goto out_bad;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 50774991f8d5..7084ac9a6455 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -90,22 +90,13 @@ static mempool_t *nfs_commit_mempool;
 static DECLARE_WAIT_QUEUE_HEAD(nfs_write_congestion);
-struct nfs_write_data *nfs_commit_alloc(unsigned int pagecount)
+struct nfs_write_data *nfs_commit_alloc(void)
 {
        struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, SLAB_NOFS);
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
-                if (pagecount <= ARRAY_SIZE(p->page_array))
-                        p->pagevec = p->page_array;
-                else {
-                        p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
-                        if (!p->pagevec) {
-                                mempool_free(p, nfs_commit_mempool);
-                                p = NULL;
-                        }
-                }
        }
        return p;
 }
@@ -117,13 +108,15 @@ void nfs_commit_free(struct nfs_write_data *p)
        mempool_free(p, nfs_commit_mempool);
 }
-struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
+struct nfs_write_data *nfs_writedata_alloc(size_t len)
 {
+        unsigned int pagecount = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
        if (p) {
                memset(p, 0, sizeof(*p));
                INIT_LIST_HEAD(&p->pages);
+                p->npages = pagecount;
                if (pagecount <= ARRAY_SIZE(p->page_array))
                        p->pagevec = p->page_array;
                else {
@@ -208,7 +201,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
        int             result, written = 0;
        struct nfs_write_data *wdata;
-        wdata = nfs_writedata_alloc(1);
+        wdata = nfs_writedata_alloc(wsize);
        if (!wdata)
                return -ENOMEM;
@@ -597,8 +590,8 @@ static void nfs_cancel_commit_list(struct list_head *head)
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
                nfs_inode_remove_request(req);
-                nfs_clear_page_writeback(req);
                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+                nfs_clear_page_writeback(req);
        }
 }
@@ -999,24 +992,24 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, int how)
        struct nfs_page *req = nfs_list_entry(head->next);
        struct page *page = req->wb_page;
        struct nfs_write_data *data;
-        unsigned int wsize = NFS_SERVER(inode)->wsize;
+        size_t wsize = NFS_SERVER(inode)->wsize, nbytes;
-        unsigned int nbytes, offset;
+        unsigned int offset;
        int requests = 0;
        LIST_HEAD(list);
        nfs_list_remove_request(req);
        nbytes = req->wb_bytes;
-        for (;;) {
+        do {
-                data = nfs_writedata_alloc(1);
+                size_t len = min(nbytes, wsize);
+                data = nfs_writedata_alloc(len);
                if (!data)
                        goto out_bad;
                list_add(&data->pages, &list);
                requests++;
-                if (nbytes <= wsize)
+                nbytes -= len;
-                        break;
+        } while (nbytes != 0);
-                nbytes -= wsize;
-        }
        atomic_set(&req->wb_complete, requests);
        ClearPageError(page);
@@ -1070,7 +1063,7 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, int how)
        struct nfs_write_data   *data;
        unsigned int            count;
-        data = nfs_writedata_alloc(NFS_SERVER(inode)->wpages);
+        data = nfs_writedata_alloc(NFS_SERVER(inode)->wsize);
        if (!data)
                goto out_bad;
@@ -1378,7 +1371,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
        struct nfs_write_data   *data;
        struct nfs_page         *req;
-        data = nfs_commit_alloc(NFS_SERVER(inode)->wpages);
+        data = nfs_commit_alloc();
        if (!data)
                goto out_bad;
@@ -1393,8 +1386,8 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
                req = nfs_list_entry(head->next);
                nfs_list_remove_request(req);
                nfs_mark_request_commit(req);
-                nfs_clear_page_writeback(req);
                dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
+                nfs_clear_page_writeback(req);
        }
        return -ENOMEM;
 }
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 06da7506363c..e35d7e52fdeb 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -33,7 +33,7 @@
 *
 */
+#include <linux/err.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
 #include <linux/nfs4.h>
@@ -87,34 +87,35 @@ int
 nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
 {
+        /*
+         * Compute the md5 digest of clname->data (clname->len bytes) and
+         * pass it to md5_to_hex() to fill dname.
+         *
+         * Returns nfs_ok on success.  Returns nfserr_resource if the
+         * hash transform or the digest buffer cannot be allocated, or if
+         * the digest computation itself fails.
+         */
         struct xdr_netobj cksum;
-        struct crypto_tfm *tfm;
+        struct hash_desc desc;
         struct scatterlist sg[1];
         int status = nfserr_resource;
         dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
                         clname->len, clname->data);
-        tfm = crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP);
+        desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-        if (tfm == NULL)
+        desc.tfm = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
-                goto out;
+        if (IS_ERR(desc.tfm))
-        cksum.len = crypto_tfm_alg_digestsize(tfm);
+                goto out_no_tfm;
+        cksum.len = crypto_hash_digestsize(desc.tfm);
         cksum.data = kmalloc(cksum.len, GFP_KERNEL);
         if (cksum.data == NULL)
                 goto out;
-        crypto_digest_init(tfm);
         sg[0].page = virt_to_page(clname->data);
         sg[0].offset = offset_in_page(clname->data);
         sg[0].length = clname->len;
-        crypto_digest_update(tfm, sg, 1);
+        if (crypto_hash_digest(&desc, sg, sg->length, cksum.data))
-        crypto_digest_final(tfm, cksum.data);
+                goto out;
         md5_to_hex(dname, cksum.data);
-        kfree(cksum.data);
         status = nfs_ok;
 out:
+        /*
+         * Free the digest buffer on every path that reaches here, so a
+         * failing crypto_hash_digest() no longer leaks it.  On the
+         * kmalloc-failure path cksum.data is NULL, which kfree() accepts.
+         */
+        kfree(cksum.data);
-        crypto_free_tfm(tfm);
+        crypto_free_hash(desc.tfm);
+out_no_tfm:
         return status;
 }
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 7d3be845a614..9fb8132f19b0 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -16,6 +16,7 @@ ocfs2-objs := \
        file.o                  \
        heartbeat.o             \
        inode.o                 \
+        ioctl.o                 \
        journal.o               \
        localalloc.o            \
        mmap.o                  \
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index edaab05a93e0..f43bc5f18a35 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1717,17 +1717,29 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
                        ocfs2_remove_from_cache(inode, eb_bh);
-                        BUG_ON(eb->h_suballoc_slot);
                        BUG_ON(el->l_recs[0].e_clusters);
                        BUG_ON(el->l_recs[0].e_cpos);
                        BUG_ON(el->l_recs[0].e_blkno);
-                        status = ocfs2_free_extent_block(handle,
+                        if (eb->h_suballoc_slot == 0) {
-                                                         tc->tc_ext_alloc_inode,
+                                /*
-                                                         tc->tc_ext_alloc_bh,
+                                 * This code only understands how to
-                                                         eb);
+                                 * lock the suballocator in slot 0,
-                        if (status < 0) {
+                                 * which is fine because allocation is
-                                mlog_errno(status);
+                                 * only ever done out of that
-                                goto bail;
+                                 * suballocator too. A future version
+                                 * might change that however, so avoid
+                                 * a free if we don't know how to
+                                 * handle it. This way an fs incompat
+                                 * bit will not be necessary.
+                                 */
+                                status = ocfs2_free_extent_block(handle,
+                                                                 tc->tc_ext_alloc_inode,
+                                                                 tc->tc_ext_alloc_bh,
+                                                                 eb);
+                                if (status < 0) {
+                                        mlog_errno(status);
+                                        goto bail;
+                                }
                        }
                }
                brelse(eb_bh);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f1d1c342ce01..3d7c082a8f58 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -391,31 +391,28 @@ out:
 static int ocfs2_commit_write(struct file *file, struct page *page,
                              unsigned from, unsigned to)
 {
-        int ret, extending = 0, locklevel = 0;
+        int ret;
-        loff_t new_i_size;
        struct buffer_head *di_bh = NULL;
        struct inode *inode = page->mapping->host;
        struct ocfs2_journal_handle *handle = NULL;
+        struct ocfs2_dinode *di;
        mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
        /* NOTE: ocfs2_file_aio_write has ensured that it's safe for
-         * us to sample inode->i_size here without the metadata lock:
+         * us to continue here without rechecking the I/O against
+         * changed inode values.
         *
         * 1) We're currently holding the inode alloc lock, so no
         *    nodes can change it underneath us.
         *
         * 2) We've had to take the metadata lock at least once
-         *    already to check for extending writes, hence insuring
+         *    already to check for extending writes, suid removal, etc.
-         *    that our current copy is also up to date.
+         *    The meta data update code then ensures that we don't get a
+         *    stale inode allocation image (i_size, i_clusters, etc).
         */
-        new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-        if (new_i_size > i_size_read(inode)) {
-                extending = 1;
-                locklevel = 1;
-        }
-        ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, locklevel, page);
+        ret = ocfs2_meta_lock_with_page(inode, NULL, &di_bh, 1, page);
        if (ret != 0) {
                mlog_errno(ret);
                goto out;
@@ -427,23 +424,20 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                goto out_unlock_meta;
        }
-        if (extending) {
+        handle = ocfs2_start_walk_page_trans(inode, page, from, to);
-                handle = ocfs2_start_walk_page_trans(inode, page, from, to);
+        if (IS_ERR(handle)) {
-                if (IS_ERR(handle)) {
+                ret = PTR_ERR(handle);
-                        ret = PTR_ERR(handle);
+                goto out_unlock_data;
-                        handle = NULL;
+        }
-                        goto out_unlock_data;
-                }
-                /* Mark our buffer early. We'd rather catch this error up here
+        /* Mark our buffer early. We'd rather catch this error up here
-                 * as opposed to after a successful commit_write which would
+         * as opposed to after a successful commit_write which would
-                 * require us to set back inode->i_size. */
+         * require us to set back inode->i_size. */
-                ret = ocfs2_journal_access(handle, inode, di_bh,
+        ret = ocfs2_journal_access(handle, inode, di_bh,
-                                           OCFS2_JOURNAL_ACCESS_WRITE);
+                                   OCFS2_JOURNAL_ACCESS_WRITE);
-                if (ret < 0) {
+        if (ret < 0) {
-                        mlog_errno(ret);
+                mlog_errno(ret);
-                        goto out_commit;
+                goto out_commit;
-                }
        }
        /* might update i_size */
@@ -453,37 +447,28 @@ static int ocfs2_commit_write(struct file *file, struct page *page,
                goto out_commit;
        }
-        if (extending) {
+        di = (struct ocfs2_dinode *)di_bh->b_data;
-                loff_t size = (u64) i_size_read(inode);
-                struct ocfs2_dinode *di =
-                        (struct ocfs2_dinode *)di_bh->b_data;
-                /* ocfs2_mark_inode_dirty is too heavy to use here. */
+        /* ocfs2_mark_inode_dirty() is too heavy to use here. */
-                inode->i_blocks = ocfs2_align_bytes_to_sectors(size);
+        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-                inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+        di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+        di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-                di->i_size = cpu_to_le64(size);
+        inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
-                di->i_ctime = di->i_mtime = 
+        di->i_size = cpu_to_le64((u64)i_size_read(inode));
-                                cpu_to_le64(inode->i_mtime.tv_sec);
-                di->i_ctime_nsec = di->i_mtime_nsec = 
-                                cpu_to_le32(inode->i_mtime.tv_nsec);
-                ret = ocfs2_journal_dirty(handle, di_bh);
+        ret = ocfs2_journal_dirty(handle, di_bh);
-                if (ret < 0) {
+        if (ret < 0) {
-                        mlog_errno(ret);
+                mlog_errno(ret);
-                        goto out_commit;
+                goto out_commit;
-                }
        }
-        BUG_ON(extending && (i_size_read(inode) != new_i_size));
 out_commit:
-        if (handle)
+        ocfs2_commit_trans(handle);
-                ocfs2_commit_trans(handle);
 out_unlock_data:
        ocfs2_data_unlock(inode, 1);
 out_unlock_meta:
-        ocfs2_meta_unlock(inode, locklevel);
+        ocfs2_meta_unlock(inode, 1);
 out:
        if (di_bh)
                brelse(di_bh);
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 9a24adf9be6e..c9037414f4f6 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -100,6 +100,9 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
        mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
                   (unsigned long long)block, nr, flags, inode);
+        BUG_ON((flags & OCFS2_BH_READAHEAD) &&
+               (!inode || !(flags & OCFS2_BH_CACHED)));
        if (osb == NULL || osb->sb == NULL || bhs == NULL) {
                status = -EINVAL;
                mlog_errno(status);
@@ -140,6 +143,30 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                bh = bhs[i];
                ignore_cache = 0;
+                /* There are three read-ahead cases here which we need to
+                 * be concerned with. All three assume a buffer has
+                 * previously been submitted with OCFS2_BH_READAHEAD
+                 * and it hasn't yet completed I/O.
+                 *
+                 * 1) The current request is sync to disk. This rarely
+                 *    happens these days, and never when performance
+                 *    matters - the code can just wait on the buffer
+                 *    lock and re-submit.
+                 *
+                 * 2) The current request is cached, but not
+                 *    readahead. ocfs2_buffer_uptodate() will return
+                 *    false anyway, so we'll wind up waiting on the
+                 *    buffer lock to do I/O. We re-check the request
+                 *    after getting the lock to avoid a re-submit.
+                 *
+                 * 3) The current request is readahead (and so must
+                 *    also be a caching one). We short circuit if the
+                 *    buffer is locked (under I/O) and if it's in the
+                 *    uptodate cache. The re-check from #2 catches the
+                 *    case that the previous read-ahead completes just
+                 *    before our is-it-in-flight check.
+                 */
                if (flags & OCFS2_BH_CACHED &&
                    !ocfs2_buffer_uptodate(inode, bh)) {
                        mlog(ML_UPTODATE,
@@ -169,6 +196,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                continue;
                        }
+                        /* A read-ahead request was made - if the
+                         * buffer is already under read-ahead from a
+                         * previously submitted request then we are
+                         * done here. */
+                        if ((flags & OCFS2_BH_READAHEAD)
+                            && ocfs2_buffer_read_ahead(inode, bh))
+                                continue;
                        lock_buffer(bh);
                        if (buffer_jbd(bh)) {
 #ifdef CATCH_BH_JBD_RACES
@@ -181,13 +216,22 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
                                continue;
 #endif
                        }
+                        /* Re-check ocfs2_buffer_uptodate() as a
+                         * previously read-ahead buffer may have
+                         * completed I/O while we were waiting for the
+                         * buffer lock. */
+                        if ((flags & OCFS2_BH_CACHED)
+                            && !(flags & OCFS2_BH_READAHEAD)
+                            && ocfs2_buffer_uptodate(inode, bh)) {
+                                unlock_buffer(bh);
+                                continue;
+                        }
                        clear_buffer_uptodate(bh);
                        get_bh(bh); /* for end_buffer_read_sync() */
                        bh->b_end_io = end_buffer_read_sync;
-                        if (flags & OCFS2_BH_READAHEAD)
+                        submit_bh(READ, bh);
-                                submit_bh(READA, bh);
-                        else
-                                submit_bh(READ, bh);
                        continue;
                }
        }
@@ -197,34 +241,39 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
        for (i = (nr - 1); i >= 0; i--) {
                bh = bhs[i];
-                /* We know this can't have changed as we hold the
+                if (!(flags & OCFS2_BH_READAHEAD)) {
-                 * inode sem. Avoid doing any work on the bh if the
+                        /* We know this can't have changed as we hold the
-                 * journal has it. */
+                         * inode sem. Avoid doing any work on the bh if the
-                if (!buffer_jbd(bh))
+                         * journal has it. */
-                        wait_on_buffer(bh);
+                        if (!buffer_jbd(bh))
+                                wait_on_buffer(bh);
-                if (!buffer_uptodate(bh)) {
-                        /* Status won't be cleared from here on out,
+                        if (!buffer_uptodate(bh)) {
-                         * so we can safely record this and loop back
+                                /* Status won't be cleared from here on out,
-                         * to cleanup the other buffers. Don't need to
+                                 * so we can safely record this and loop back
-                         * remove the clustered uptodate information
+                                 * to cleanup the other buffers. Don't need to
-                         * for this bh as it's not marked locally
+                                 * remove the clustered uptodate information
-                         * uptodate. */
+                                 * for this bh as it's not marked locally
-                        status = -EIO;
+                                 * uptodate. */
-                        brelse(bh);
+                                status = -EIO;
-                        bhs[i] = NULL;
+                                brelse(bh);
-                        continue;
+                                bhs[i] = NULL;
+                                continue;
+                        }
                }
+                /* Always set the buffer in the cache, even if it was
+                 * a forced read, or read-ahead which hasn't yet
+                 * completed. */
                if (inode)
                        ocfs2_set_buffer_uptodate(inode, bh);
        }
        if (inode)
                mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
-        mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n", 
+        mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
             (unsigned long long)block, nr,
-             (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+             (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
 bail:
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 6ecb90937b68..6cc20930fac3 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -49,7 +49,7 @@ int ocfs2_read_blocks(struct ocfs2_super          *osb,
 #define OCFS2_BH_CACHED            1
-#define OCFS2_BH_READAHEAD         8    /* use this to pass READA down to submit_bh */
+#define OCFS2_BH_READAHEAD         8
 static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
                                   struct buffer_head **bh, int flags,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 504595d6cf65..305cba3681fe 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -320,8 +320,12 @@ static int compute_max_sectors(struct block_device *bdev)
                max_pages = q->max_hw_segments;
        max_pages--; /* Handle I/Os that straddle a page */
-        max_sectors = max_pages << (PAGE_SHIFT - 9);
+        if (max_pages) {
+                max_sectors = max_pages << (PAGE_SHIFT - 9);
+        } else {
+                /* Only one page (or less) fits in a BIO; use the queue limit. */
+                max_sectors = q->max_sectors;
+        }
        /* Why is fls() 1-based???? */
        pow_two_sectors = 1 << (fls(max_sectors) - 1);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 3d494d1a5f36..04e01915b86e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -74,14 +74,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
        int error = 0;
-        unsigned long offset, blk;
+        unsigned long offset, blk, last_ra_blk = 0;
-        int i, num, stored;
+        int i, stored;
        struct buffer_head * bh, * tmp;
        struct ocfs2_dir_entry * de;
        int err;
        struct inode *inode = filp->f_dentry->d_inode;
        struct super_block * sb = inode->i_sb;
-        int have_disk_lock = 0;
+        unsigned int ra_sectors = 16;
        mlog_entry("dirino=%llu\n",
                   (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -95,9 +95,8 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
                        mlog_errno(error);
                /* we haven't got any yet, so propagate the error. */
                stored = error;
-                goto bail;
+                goto bail_nolock;
        }
-        have_disk_lock = 1;
        offset = filp->f_pos & (sb->s_blocksize - 1);
@@ -113,16 +112,21 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
                        continue;
                }
-                /*
+                /* The idea here is to begin with 8k read-ahead and to stay
-                 * Do the readahead (8k)
+                 * 4k ahead of our current position.
-                 */
+                 *
-                if (!offset) {
+                 * TODO: Use the pagecache for this. We just need to
-                        for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+                 * make sure it's cluster-safe... */
+                if (!last_ra_blk
+                    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
+                        for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
                             i > 0; i--) {
                                tmp = ocfs2_bread(inode, ++blk, &err, 1);
                                if (tmp)
                                        brelse(tmp);
                        }
+                        last_ra_blk = blk;
+                        ra_sectors = 8;
                }
 revalidate:
@@ -194,9 +198,9 @@ revalidate:
        stored = 0;
 bail:
-        if (have_disk_lock)
+        ocfs2_meta_unlock(inode, 0);
-                ocfs2_meta_unlock(inode, 0);
+bail_nolock:
        mlog_exit(stored);
        return stored;
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 42775e2bbe2c..f13a4bac41f0 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -367,12 +367,10 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data)
                        goto do_ast;
        }
-        mlog(ML_ERROR, "got %sast for unknown lock!  cookie=%u:%llu, "
+        mlog(0, "got %sast for unknown lock!  cookie=%u:%llu, "
-                       "name=%.*s, namelen=%u\n", 
+             "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", 
-                       past->type == DLM_AST ? "" : "b", 
+             dlm_get_lock_cookie_node(cookie), dlm_get_lock_cookie_seq(cookie),
-                       dlm_get_lock_cookie_node(cookie),
+             locklen, name, locklen);
-                       dlm_get_lock_cookie_seq(cookie),
-                       locklen, name, locklen);
        ret = DLM_NORMAL;
 unlock_out:
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 762eb1fbb34d..151b41781eab 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1330,6 +1330,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
                cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
        lvb->lvb_imtime_packed =
                cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
+        lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
        mlog_meta_lvb(0, lockres);
@@ -1360,6 +1361,9 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
        oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
        i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
+        oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
+        ocfs2_set_inode_flags(inode);
        /* fast-symlinks are a special case */
        if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
                inode->i_blocks = 0;
@@ -2899,8 +2903,9 @@ void ocfs2_dump_meta_lvb_info(u64 level,
             be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
             be16_to_cpu(lvb->lvb_imode));
        mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
-             "mtime_packed 0x%llx\n", be16_to_cpu(lvb->lvb_inlink),
+             "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
             (long long)be64_to_cpu(lvb->lvb_iatime_packed),
             (long long)be64_to_cpu(lvb->lvb_ictime_packed),
-             (long long)be64_to_cpu(lvb->lvb_imtime_packed));
+             (long long)be64_to_cpu(lvb->lvb_imtime_packed),
+             be32_to_cpu(lvb->lvb_iattr));
 }
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 8f2d1db2d9ea..243ae862ece5 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -27,7 +27,7 @@
 #ifndef DLMGLUE_H
 #define DLMGLUE_H
-#define OCFS2_LVB_VERSION 2
+#define OCFS2_LVB_VERSION 3
 struct ocfs2_meta_lvb {
        __be32       lvb_version;
@@ -40,7 +40,8 @@ struct ocfs2_meta_lvb {
        __be64       lvb_isize;
        __be16       lvb_imode;
        __be16       lvb_inlink;
-        __be32       lvb_reserved[3];
+        __be32       lvb_iattr;
+        __be32       lvb_reserved[2];
 };
 /* ocfs2_meta_lock_full() and ocfs2_data_lock_full() 'arg_flags' flags */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a9559c874530..2bbfa17090cf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -44,6 +44,7 @@
 #include "file.h"
 #include "sysfile.h"
 #include "inode.h"
+#include "ioctl.h"
 #include "journal.h"
 #include "mmap.h"
 #include "suballoc.h"
@@ -1227,10 +1228,12 @@ const struct file_operations ocfs2_fops = {
        .open           = ocfs2_file_open,
        .aio_read       = ocfs2_file_aio_read,
        .aio_write      = ocfs2_file_aio_write,
+        .ioctl          = ocfs2_ioctl,
 };
 const struct file_operations ocfs2_dops = {
        .read           = generic_read_dir,
        .readdir        = ocfs2_readdir,
        .fsync          = ocfs2_sync_file,
+        .ioctl          = ocfs2_ioctl,
 };
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 327a5b7b86ed..7bcf69154592 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -71,6 +71,26 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
                                    struct inode *inode,
                                    struct buffer_head *fe_bh);
+/*
+ * Mirror the OCFS2 attribute bits held in ip_attr into the VFS inode's
+ * i_flags: the five S_* bits handled here are cleared first, then each
+ * one is set again if its OCFS2_*_FL counterpart is present.
+ */
+void ocfs2_set_inode_flags(struct inode *inode)
+{
+        static const struct {
+                unsigned int attr;      /* OCFS2_*_FL bit in ip_attr */
+                unsigned int vfs;       /* matching S_* bit in i_flags */
+        } map[] = {
+                { OCFS2_IMMUTABLE_FL,   S_IMMUTABLE },
+                { OCFS2_SYNC_FL,        S_SYNC },
+                { OCFS2_APPEND_FL,      S_APPEND },
+                { OCFS2_NOATIME_FL,     S_NOATIME },
+                { OCFS2_DIRSYNC_FL,     S_DIRSYNC },
+        };
+        unsigned int attrs = OCFS2_I(inode)->ip_attr;
+        unsigned int i;
+        inode->i_flags &= ~(S_IMMUTABLE | S_SYNC | S_APPEND |
+                            S_NOATIME | S_DIRSYNC);
+        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
+                if (attrs & map[i].attr)
+                        inode->i_flags |= map[i].vfs;
+        }
+}
 struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
                                     u64 blkno,
                                     int delete_vote)
@@ -260,7 +280,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                inode->i_blocks =
                        ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
        inode->i_mapping->a_ops = &ocfs2_aops;
-        inode->i_flags |= S_NOATIME;
        inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
        inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
        inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -276,6 +295,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
        OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
+        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
        if (create_ino)
                inode->i_ino = ino_from_blkno(inode->i_sb,
@@ -330,6 +350,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres,
                                  OCFS2_LOCK_TYPE_DATA, inode);
+        ocfs2_set_inode_flags(inode);
+        inode->i_flags |= S_NOATIME;
        status = 0;
 bail:
        mlog_exit(status);
@@ -1027,12 +1050,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
        u64 p_blkno;
        int readflags = OCFS2_BH_CACHED;
-#if 0
-        /* only turn this on if we know we can deal with read_block
-         * returning nothing */
        if (reada)
                readflags |= OCFS2_BH_READAHEAD;
-#endif
        if (((u64)block << inode->i_sb->s_blocksize_bits) >=
            i_size_read(inode)) {
@@ -1131,6 +1150,7 @@ int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
        spin_lock(&OCFS2_I(inode)->ip_lock);
        fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+        fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr);
        spin_unlock(&OCFS2_I(inode)->ip_lock);
        fe->i_size = cpu_to_le64(i_size_read(inode));
@@ -1169,6 +1189,8 @@ void ocfs2_refresh_inode(struct inode *inode,
        spin_lock(&OCFS2_I(inode)->ip_lock);
        OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+        OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+        ocfs2_set_inode_flags(inode);
        i_size_write(inode, le64_to_cpu(fe->i_size));
        inode->i_nlink = le16_to_cpu(fe->i_links_count);
        inode->i_uid = le32_to_cpu(fe->i_uid);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 35140f6cf840..4d1e53992566 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -56,6 +56,7 @@ struct ocfs2_inode_info
        struct ocfs2_journal_handle     *ip_handle;
        u32                             ip_flags; /* see below */
+        u32                             ip_attr; /* inode attributes */
        /* protected by recovery_lock. */
        struct inode                    *ip_next_orphan;
@@ -142,4 +143,6 @@ int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle,
 int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
 int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+void ocfs2_set_inode_flags(struct inode *inode);
 #endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
new file mode 100644
index 000000000000..3663cef80689
--- /dev/null
+++ b/fs/ocfs2/ioctl.c
@@ -0,0 +1,136 @@
+/*
+ * linux/fs/ocfs2/ioctl.c
+ *
+ * Copyright (C) 2006 Herbert Poetzl
+ * adapted from Remy Card's ext2/ioctl.c
+ */
+#include <linux/fs.h>
+#include <linux/mount.h>
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "inode.h"
+#include "journal.h"
+#include "ocfs2_fs.h"
+#include "ioctl.h"
+#include <linux/ext2_fs.h>
+/*
+ * Copy the inode's attribute word (ip_attr) into *flags while holding
+ * the meta lock at level 0.
+ *
+ * Returns the ocfs2_meta_lock() status; on a negative status the error
+ * is logged and *flags is not written.
+ */
+static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
+{
+        int ret = ocfs2_meta_lock(inode, NULL, NULL, 0);
+        if (ret >= 0) {
+                *flags = OCFS2_I(inode)->ip_attr;
+                ocfs2_meta_unlock(inode, 0);
+                mlog_exit(ret);
+                return ret;
+        }
+        mlog_errno(ret);
+        return ret;
+}
+/*
+ * Replace the bits of the inode's attribute word selected by 'mask' with
+ * the corresponding bits of 'flags', refresh the VFS i_flags and journal
+ * the updated inode.
+ *
+ * Runs with inode->i_mutex and the meta lock (level 1) held for the
+ * whole update.
+ *
+ * Returns the ocfs2_mark_inode_dirty() status once the update has been
+ * attempted, or a negative errno from an earlier step:
+ *   -EROFS   the inode is read-only,
+ *   -EACCES  the caller is neither the owner nor CAP_FOWNER capable,
+ *   -EPERM   IMMUTABLE/APPEND would change (or the inode is already
+ *            immutable) and the caller lacks CAP_LINUX_IMMUTABLE,
+ *   or the error from taking the meta lock / starting the transaction.
+ */
+static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
+                                unsigned mask)
+{
+        struct ocfs2_inode_info *ocfs2_inode = OCFS2_I(inode);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_journal_handle *handle = NULL;
+        struct buffer_head *bh = NULL;
+        unsigned oldflags;
+        int status;
+        mutex_lock(&inode->i_mutex);
+        status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+        if (status < 0) {
+                mlog_errno(status);
+                goto bail;
+        }
+        status = -EROFS;
+        if (IS_RDONLY(inode))
+                goto bail_unlock;
+        status = -EACCES;
+        if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+                goto bail_unlock;
+        /* DIRSYNC is only meaningful on directories. */
+        if (!S_ISDIR(inode->i_mode))
+                flags &= ~OCFS2_DIRSYNC_FL;
+        /* Merge: take 'mask' bits from the caller, keep the rest as-is. */
+        oldflags = ocfs2_inode->ip_attr;
+        flags = flags & mask;
+        flags |= oldflags & ~mask;
+        /*
+         * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+         * the relevant capability.
+         *
+         * This check must happen before the transaction is started:
+         * bail_unlock does not commit the handle, so failing here with
+         * a live handle would leak it.
+         */
+        status = -EPERM;
+        if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
+                (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
+                if (!capable(CAP_LINUX_IMMUTABLE))
+                        goto bail_unlock;
+        }
+        handle = ocfs2_start_trans(osb, NULL, OCFS2_INODE_UPDATE_CREDITS);
+        if (IS_ERR(handle)) {
+                status = PTR_ERR(handle);
+                mlog_errno(status);
+                goto bail_unlock;
+        }
+        ocfs2_inode->ip_attr = flags;
+        ocfs2_set_inode_flags(inode);
+        status = ocfs2_mark_inode_dirty(handle, inode, bh);
+        if (status < 0)
+                mlog_errno(status);
+        /* Every path that started the transaction ends it here. */
+        ocfs2_commit_trans(handle);
+bail_unlock:
+        ocfs2_meta_unlock(inode, 1);
+bail:
+        mutex_unlock(&inode->i_mutex);
+        if (bh)
+                brelse(bh);
+        mlog_exit(status);
+        return status;
+}
+/*
+ * ioctl entry point for OCFS2 files and directories.
+ *
+ * OCFS2_IOC_GETFLAGS copies the inode attributes, masked with
+ * OCFS2_FL_VISIBLE, to the user pointer in 'arg'.
+ * OCFS2_IOC_SETFLAGS reads an attribute word from 'arg' and applies the
+ * OCFS2_FL_MODIFIABLE bits via ocfs2_set_inode_attr().
+ * Any other command yields -ENOTTY.
+ */
+int ocfs2_ioctl(struct inode * inode, struct file * filp,
+        unsigned int cmd, unsigned long arg)
+{
+        int __user *uarg = (int __user *) arg;
+        unsigned int attr;
+        int err;
+        if (cmd == OCFS2_IOC_GETFLAGS) {
+                err = ocfs2_get_inode_attr(inode, &attr);
+                if (err < 0)
+                        return err;
+                attr &= OCFS2_FL_VISIBLE;
+                return put_user(attr, uarg);
+        }
+        if (cmd == OCFS2_IOC_SETFLAGS) {
+                if (get_user(attr, uarg))
+                        return -EFAULT;
+                return ocfs2_set_inode_attr(inode, attr,
+                        OCFS2_FL_MODIFIABLE);
+        }
+        return -ENOTTY;
+}
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
new file mode 100644
index 000000000000..4a7c82931dba
--- /dev/null
+++ b/fs/ocfs2/ioctl.h
@@ -0,0 +1,16 @@
+/*
+ * ioctl.h
+ *
+ * Function prototypes
+ *
+ * Copyright (C) 2006 Herbert Poetzl
+ *
+ */
+#ifndef OCFS2_IOCTL_H
+#define OCFS2_IOCTL_H
+int ocfs2_ioctl(struct inode * inode, struct file * filp,
+        unsigned int cmd, unsigned long arg);
+#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 0673862c8bdd..0d3e939b1f56 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -56,6 +56,7 @@
 #include "journal.h"
 #include "namei.h"
 #include "suballoc.h"
+#include "super.h"
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
@@ -310,13 +311,6 @@ static int ocfs2_mknod(struct inode *dir,
        /* get our super block */
        osb = OCFS2_SB(dir->i_sb);
-        if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
-                mlog(ML_ERROR, "inode %llu has i_nlink of %u\n",
-                     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir->i_nlink);
-                status = -EMLINK;
-                goto leave;
-        }
        handle = ocfs2_alloc_handle(osb);
        if (handle == NULL) {
                status = -ENOMEM;
@@ -331,6 +325,11 @@ static int ocfs2_mknod(struct inode *dir,
                goto leave;
        }
+        if (S_ISDIR(mode) && (dir->i_nlink >= OCFS2_LINK_MAX)) {
+                status = -EMLINK;
+                goto leave;
+        }
        dirfe = (struct ocfs2_dinode *) parent_fe_bh->b_data;
        if (!dirfe->i_links_count) {
                /* can't make a file in a deleted directory. */
@@ -643,11 +642,6 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto bail;
        }
-        if (inode->i_nlink >= OCFS2_LINK_MAX) {
-                err = -EMLINK;
-                goto bail;
-        }
        handle = ocfs2_alloc_handle(osb);
        if (handle == NULL) {
                err = -ENOMEM;
@@ -661,6 +655,11 @@ static int ocfs2_link(struct dentry *old_dentry,
                goto bail;
        }
+        if (!dir->i_nlink) {
+                err = -ENOENT;
+                goto bail;
+        }
        err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
                                        dentry->d_name.len);
        if (err)
@@ -1964,13 +1963,8 @@ restart:
                                }
                                num++;
-                                /* XXX: questionable readahead stuff here */
                                bh = ocfs2_bread(dir, b++, &err, 1);
                                bh_use[ra_max] = bh;
-#if 0           // ???
-                                if (bh)
-                                        ll_rw_block(READ, 1, &bh);
-#endif
                        }
                }
                if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -1978,6 +1972,10 @@ restart:
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        /* read error, skip block & hope for the best */
+                        ocfs2_error(dir->i_sb, "reading directory %llu, "
+                                    "offset %lu\n",
+                                    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+                                    block);
                        brelse(bh);
                        goto next;
                }
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c5b1ac547c15..3330a5dc6be2 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -114,6 +114,26 @@
 #define OCFS2_CHAIN_FL          (0x00000400)    /* Chain allocator */
 #define OCFS2_DEALLOC_FL        (0x00000800)    /* Truncate log */
+/* Inode attributes, keep in sync with EXT2 */
+#define OCFS2_SECRM_FL          (0x00000001)    /* Secure deletion */
+#define OCFS2_UNRM_FL           (0x00000002)    /* Undelete */
+#define OCFS2_COMPR_FL          (0x00000004)    /* Compress file */
+#define OCFS2_SYNC_FL           (0x00000008)    /* Synchronous updates */
+#define OCFS2_IMMUTABLE_FL      (0x00000010)    /* Immutable file */
+#define OCFS2_APPEND_FL         (0x00000020)    /* writes to file may only append */
+#define OCFS2_NODUMP_FL         (0x00000040)    /* do not dump file */
+#define OCFS2_NOATIME_FL        (0x00000080)    /* do not update atime */
+#define OCFS2_DIRSYNC_FL        (0x00010000)    /* dirsync behaviour (directories only) */
+#define OCFS2_FL_VISIBLE        (0x000100FF)    /* User visible flags */
+#define OCFS2_FL_MODIFIABLE     (0x000100FF)    /* User modifiable flags */
+/*
+ * ioctl commands
+ */
+#define OCFS2_IOC_GETFLAGS      _IOR('f', 1, long)
+#define OCFS2_IOC_SETFLAGS      _IOW('f', 2, long)
 /*
 * Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
 */
@@ -399,7 +419,9 @@ struct ocfs2_dinode {
        __le32 i_atime_nsec;
        __le32 i_ctime_nsec;
        __le32 i_mtime_nsec;
-/*70*/  __le64 i_reserved1[9];
+        __le32 i_attr;
+        __le32 i_reserved1;
+/*70*/  __le64 i_reserved2[8];
 /*B8*/  union {
                __le64 i_pad1;          /* Generic way to refer to this
                                           64bit union */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index b8a00a793326..9707ed7a3206 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -206,7 +206,10 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
 }
 /* Warning: even if it returns true, this does *not* guarantee that
- * the block is stored in our inode metadata cache. */
+ * the block is stored in our inode metadata cache.
+ *
+ * This can be called under lock_buffer().
+ */
 int ocfs2_buffer_uptodate(struct inode *inode,
                          struct buffer_head *bh)
 {
@@ -226,6 +229,16 @@ int ocfs2_buffer_uptodate(struct inode *inode,
        return ocfs2_buffer_cached(OCFS2_I(inode), bh);
 }
+/*
+ * Determine whether a buffer is currently out on a read-ahead request.
+ * ip_io_sem should be held to serialize submitters with the logic here.
+ *
+ * Returns non-zero only if bh is locked (buffer_locked()) and
+ * ocfs2_buffer_cached() reports it for this inode; otherwise returns 0.
+ * NOTE(review): presumably a locked-and-cached buffer means the read was
+ * submitted but has not completed yet -- confirm against the submitter.
+ */
+int ocfs2_buffer_read_ahead(struct inode *inode,
+                            struct buffer_head *bh)
+{
+        return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
+}
 /* Requires ip_lock */
 static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
                                     sector_t block)
@@ -403,7 +416,11 @@ out_free:
 *
 * Note that this function may actually fail to insert the block if
 * memory cannot be allocated. This is not fatal however (but may
- * result in a performance penalty) */
+ * result in a performance penalty)
+ *
+ * Readahead buffers can be passed in here before the I/O request is
+ * completed.
+ */
 void ocfs2_set_buffer_uptodate(struct inode *inode,
                               struct buffer_head *bh)
 {
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index 01cd32d26b06..2e73206059a8 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,5 +40,7 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
                                   struct buffer_head *bh);
 void ocfs2_remove_from_cache(struct inode *inode,
                             struct buffer_head *bh);
+int ocfs2_buffer_read_ahead(struct inode *inode,
+                            struct buffer_head *bh);
 #endif /* OCFS2_UPTODATE_H */
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index c40f81ba9b13..34dcb43a7837 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1390,11 +1390,19 @@ xfs_vm_direct_IO(
        iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
-        ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
+        if (rw == WRITE) {
-                iomap.iomap_target->bt_bdev,
+                ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
-                iov, offset, nr_segs,
+                        iomap.iomap_target->bt_bdev,
-                xfs_get_blocks_direct,
+                        iov, offset, nr_segs,
-                xfs_end_io_direct);
+                        xfs_get_blocks_direct,
+                        xfs_end_io_direct);
+        } else {
+                ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
+                        iomap.iomap_target->bt_bdev,
+                        iov, offset, nr_segs,
+                        xfs_get_blocks_direct,
+                        xfs_end_io_direct);
+        }
        if (unlikely(ret <= 0 && iocb->private))
                xfs_destroy_ioend(iocb->private);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 5d9cfd91ad08..ee788b1cb364 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -264,7 +264,9 @@ xfs_read(
                                        dmflags, &locktype);
                if (ret) {
                        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-                        goto unlock_mutex;
+                        if (unlikely(ioflags & IO_ISDIRECT))
+                                mutex_unlock(&inode->i_mutex);
+                        return ret;
                }
        }
@@ -272,6 +274,9 @@ xfs_read(
                bhv_vop_flushinval_pages(vp, ctooff(offtoct(*offset)),
                                                -1, FI_REMAPF_LOCKED);
+        if (unlikely(ioflags & IO_ISDIRECT))
+                mutex_unlock(&inode->i_mutex);
        xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
                                (void *)iovp, segs, *offset, ioflags);
        ret = __generic_file_aio_read(iocb, iovp, segs, offset);
@@ -281,10 +286,6 @@ xfs_read(
                XFS_STATS_ADD(xs_read_bytes, ret);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-unlock_mutex:
-        if (unlikely(ioflags & IO_ISDIRECT))
-                mutex_unlock(&inode->i_mutex);
        return ret;
 }
@@ -390,6 +391,8 @@ xfs_splice_write(
        xfs_inode_t             *ip = XFS_BHVTOI(bdp);
        xfs_mount_t             *mp = ip->i_mount;
        ssize_t                 ret;
+        struct inode            *inode = outfilp->f_mapping->host;
+        xfs_fsize_t             isize;
        XFS_STATS_INC(xs_write_calls);
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -416,6 +419,20 @@ xfs_splice_write(
        if (ret > 0)
                XFS_STATS_ADD(xs_write_bytes, ret);
+        isize = i_size_read(inode);
+        if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
+                *ppos = isize;
+        if (*ppos > ip->i_d.di_size) {
+                xfs_ilock(ip, XFS_ILOCK_EXCL);
+                if (*ppos > ip->i_d.di_size) {
+                        ip->i_d.di_size = *ppos;
+                        i_size_write(inode, *ppos);
+                        ip->i_update_core = 1;
+                        ip->i_update_size = 1;
+                }
+                xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        }
        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
        return ret;
 }
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index f137856c3261..db8872be8c87 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -203,7 +203,7 @@ xfs_qm_statvfs(
        if (error || !vnode)
                return error;
-        mp = XFS_BHVTOM(bhv);
+        mp = xfs_vfstom(bhvtovfs(bhv));
        ip = xfs_vtoi(vnode);
        if (!(ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 650591f999ae..5a4256120ccc 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -44,6 +44,26 @@ typedef enum xfs_alloctype
 #define XFS_ALLOC_FLAG_FREEING  0x00000002  /* indicate caller is freeing extents*/
 /*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering that the minimum number of
+ * needed freelist blocks is 4 fsbs _per AG_ and that a potential split of
+ * a file's bmap btree requires 1 fsb, we set the number of set-aside
+ * blocks to 4 + 4*agcount.
+ */
+#define XFS_ALLOC_SET_ASIDE(mp)  (4 + ((mp)->m_sb.sb_agcount * 4))
+/*
 * Argument structure for xfs_alloc routines.
 * This is turned into a structure to avoid having 20 arguments passed
 * down several levels of the stack.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 077629bab532..c064e72ada9e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -462,7 +462,7 @@ xfs_fs_counts(
        xfs_icsb_sync_counters_lazy(mp);
        s = XFS_SB_LOCK(mp);
-        cnt->freedata = mp->m_sb.sb_fdblocks;
+        cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
        cnt->freertx = mp->m_sb.sb_frextents;
        cnt->freeino = mp->m_sb.sb_ifree;
        cnt->allocino = mp->m_sb.sb_icount;
@@ -519,15 +519,19 @@ xfs_reserve_blocks(
                }
                mp->m_resblks = request;
        } else {
+                __int64_t       free;
+                free =  mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
                delta = request - mp->m_resblks;
-                lcounter = mp->m_sb.sb_fdblocks - delta;
+                lcounter = free - delta;
                if (lcounter < 0) {
                        /* We can't satisfy the request, just get what we can */
-                        mp->m_resblks += mp->m_sb.sb_fdblocks;
+                        mp->m_resblks += free;
-                        mp->m_resblks_avail += mp->m_sb.sb_fdblocks;
+                        mp->m_resblks_avail += free;
-                        mp->m_sb.sb_fdblocks = 0;
+                        mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp);
                } else {
-                        mp->m_sb.sb_fdblocks = lcounter;
+                        mp->m_sb.sb_fdblocks =
+                                lcounter + XFS_ALLOC_SET_ASIDE(mp);
                        mp->m_resblks = request;
                        mp->m_resblks_avail += delta;
                }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 4be5c0b2d296..9dfae18d995f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1243,24 +1243,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
        xfs_trans_log_buf(tp, bp, first, last);
 }
-/*
- * In order to avoid ENOSPC-related deadlock caused by
- * out-of-order locking of AGF buffer (PV 947395), we place
- * constraints on the relationship among actual allocations for
- * data blocks, freelist blocks, and potential file data bmap
- * btree blocks. However, these restrictions may result in no
- * actual space allocated for a delayed extent, for example, a data
- * block in a certain AG is allocated but there is no additional
- * block for the additional bmap btree block due to a split of the
- * bmap btree of the file. The result of this may lead to an
- * infinite loop in xfssyncd when the file gets flushed to disk and
- * all delayed extents need to be actually allocated. To get around
- * this, we explicitly set aside a few blocks which will not be
- * reserved in delayed allocation. Considering the minimum number of
- * needed freelist blocks is 4 fsbs, a potential split of file's bmap
- * btree requires 1 fsb, so we set the number of set-aside blocks to 8.
-*/
-#define SET_ASIDE_BLOCKS 8
 /*
 * xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
@@ -1306,7 +1288,8 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
                return 0;
        case XFS_SBS_FDBLOCKS:
-                lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
+                lcounter = (long long)
+                        mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
                res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
                if (delta > 0) {                /* Putting blocks back */
@@ -1340,7 +1323,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
                        }
                }
-                mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
+                mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
                return 0;
        case XFS_SBS_FREXTENTS:
                lcounter = (long long)mp->m_sb.sb_frextents;
@@ -2021,7 +2004,8 @@ xfs_icsb_sync_counters_lazy(
 * when we get near ENOSPC.
 */
 #define XFS_ICSB_INO_CNTR_REENABLE      64
-#define XFS_ICSB_FDBLK_CNTR_REENABLE    512
+#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \
+                (512 + XFS_ALLOC_SET_ASIDE(mp))
 STATIC void
 xfs_icsb_balance_counter(
        xfs_mount_t     *mp,
@@ -2055,7 +2039,7 @@ xfs_icsb_balance_counter(
        case XFS_SBS_FDBLOCKS:
                count = mp->m_sb.sb_fdblocks;
                resid = do_div(count, weight);
-                if (count < XFS_ICSB_FDBLK_CNTR_REENABLE)
+                if (count < XFS_ICSB_FDBLK_CNTR_REENABLE(mp))
                        goto out;
                break;
        default:
@@ -2110,11 +2094,11 @@ again:
        case XFS_SBS_FDBLOCKS:
                BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
-                lcounter = icsbp->icsb_fdblocks;
+                lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
                lcounter += delta;
                if (unlikely(lcounter < 0))
                        goto slow_path;
-                icsbp->icsb_fdblocks = lcounter;
+                icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
                break;
        default:
                BUG();
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index b427d220a169..a34796e57afb 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -811,7 +811,8 @@ xfs_statvfs(
        statp->f_bsize = sbp->sb_blocksize;
        lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
        statp->f_blocks = sbp->sb_dblocks - lsize;
-        statp->f_bfree = statp->f_bavail = sbp->sb_fdblocks;
+        statp->f_bfree = statp->f_bavail =
+                                sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
        fakeinos = statp->f_bfree << sbp->sb_inopblog;
 #if XFS_BIG_INUMS
        fakeinos += mp->m_inoadd;