242 files changed, 6708 insertions, 4051 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index f8fccaaad628..64d44efad7a5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -6,10 +6,6 @@ menu "File systems"
 if BLOCK
-config FS_JOURNAL_INFO
-        bool
-        default n
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
diff --git a/fs/aio.c b/fs/aio.c
index c30dfc006108..1cf12b3dd83a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -711,10 +711,8 @@ static ssize_t aio_run_iocb(struct kiocb *iocb)
         */
        ret = retry(iocb);
-        if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED) {
+        if (ret != -EIOCBRETRY && ret != -EIOCBQUEUED)
-                BUG_ON(!list_empty(&iocb->ki_wait.task_list));
                aio_complete(iocb, ret, 0);
-        }
 out:
        spin_lock_irq(&ctx->ctx_lock);
@@ -866,13 +864,6 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
        unsigned long flags;
        int run = 0;
-        /* We're supposed to be the only path putting the iocb back on the run
-         * list.  If we find that the iocb is *back* on a wait queue already
-         * than retry has happened before we could queue the iocb.  This also
-         * means that the retry could have completed and freed our iocb, no
-         * good. */
-        BUG_ON((!list_empty(&iocb->ki_wait.task_list)));
        spin_lock_irqsave(&ctx->ctx_lock, flags);
        /* set this inside the lock so that we can't race with aio_run_iocb()
         * testing it and putting the iocb on the run list under the lock */
@@ -886,7 +877,7 @@ static void try_queue_kicked_iocb(struct kiocb *iocb)
 /*
 * kick_iocb:
 *      Called typically from a wait queue callback context
- *      (aio_wake_function) to trigger a retry of the iocb.
+ *      to trigger a retry of the iocb.
 *      The retry is usually executed by aio workqueue
 *      threads (See aio_kick_handler).
 */
@@ -1520,31 +1511,6 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
        return 0;
 }
-/*
- * aio_wake_function:
- *      wait queue callback function for aio notification,
- *      Simply triggers a retry of the operation via kick_iocb.
- *
- *      This callback is specified in the wait queue entry in
- *      a kiocb.
- *
- * Note:
- * This routine is executed with the wait queue lock held.
- * Since kick_iocb acquires iocb->ctx->ctx_lock, it nests
- * the ioctx lock inside the wait queue lock. This is safe
- * because this callback isn't used for wait queues which
- * are nested inside ioctx lock (i.e. ctx->wait)
- */
-static int aio_wake_function(wait_queue_t *wait, unsigned mode,
-                             int sync, void *key)
-{
-        struct kiocb *iocb = container_of(wait, struct kiocb, ki_wait);
-        list_del_init(&wait->task_list);
-        kick_iocb(iocb);
-        return 1;
-}
 static void aio_batch_add(struct address_space *mapping,
                          struct hlist_head *batch_hash)
 {
@@ -1642,8 +1608,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
        req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
        req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
        req->ki_opcode = iocb->aio_lio_opcode;
-        init_waitqueue_func_entry(&req->ki_wait, aio_wake_function);
-        INIT_LIST_HEAD(&req->ki_wait.task_list);
        ret = aio_setup_iocb(req);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 2ca7a7cafdbf..9f0bf13291e5 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -35,14 +35,13 @@ static int anon_inodefs_get_sb(struct file_system_type *fs_type, int flags,
                             mnt);
 }
-static int anon_inodefs_delete_dentry(struct dentry *dentry)
+/*
+ * anon_inodefs_dname() is called from d_path().
+ */
+static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
 {
-        /*
+        return dynamic_dname(dentry, buffer, buflen, "anon_inode:%s",
-         * We faked vfs to believe the dentry was hashed when we created it.
+                                dentry->d_name.name);
-         * Now we restore the flag so that dput() will work correctly.
-         */
-        dentry->d_flags |= DCACHE_UNHASHED;
-        return 1;
 }
 static struct file_system_type anon_inode_fs_type = {
@@ -51,7 +50,7 @@ static struct file_system_type anon_inode_fs_type = {
        .kill_sb        = kill_anon_super,
 };
 static const struct dentry_operations anon_inodefs_dentry_operations = {
-        .d_delete       = anon_inodefs_delete_dentry,
+        .d_dname        = anon_inodefs_dname,
 };
 /*
@@ -88,7 +87,7 @@ struct file *anon_inode_getfile(const char *name,
                                void *priv, int flags)
 {
        struct qstr this;
-        struct dentry *dentry;
+        struct path path;
        struct file *file;
        int error;
@@ -106,10 +105,11 @@ struct file *anon_inode_getfile(const char *name,
        this.name = name;
        this.len = strlen(name);
        this.hash = 0;
-        dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
+        path.dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
-        if (!dentry)
+        if (!path.dentry)
                goto err_module;
+        path.mnt = mntget(anon_inode_mnt);
        /*
         * We know the anon_inode inode count is always greater than zero,
         * so we can avoid doing an igrab() and we can use an open-coded
@@ -117,27 +117,24 @@ struct file *anon_inode_getfile(const char *name,
         */
        atomic_inc(&anon_inode_inode->i_count);
-        dentry->d_op = &anon_inodefs_dentry_operations;
+        path.dentry->d_op = &anon_inodefs_dentry_operations;
-        /* Do not publish this dentry inside the global dentry hash table */
+        d_instantiate(path.dentry, anon_inode_inode);
-        dentry->d_flags &= ~DCACHE_UNHASHED;
-        d_instantiate(dentry, anon_inode_inode);
        error = -ENFILE;
-        file = alloc_file(anon_inode_mnt, dentry,
+        file = alloc_file(&path, OPEN_FMODE(flags), fops);
-                          FMODE_READ | FMODE_WRITE, fops);
        if (!file)
                goto err_dput;
        file->f_mapping = anon_inode_inode->i_mapping;
        file->f_pos = 0;
-        file->f_flags = O_RDWR | (flags & O_NONBLOCK);
+        file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
        file->f_version = 0;
        file->private_data = priv;
        return file;
 err_dput:
-        dput(dentry);
+        path_put(&path);
 err_module:
        module_put(fops->owner);
        return ERR_PTR(error);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 8f7cdde41733..0118d67221b2 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -60,6 +60,11 @@ do {							\
                current->pid, __func__, ##args);        \
 } while (0)
+/*
+ * Bookkeeping record linked on an autofs_info's rehash_list; it names
+ * the task that queued it (see autofs4_add_rehash_entry() in root.c).
+ */
+struct rehash_entry {
+        struct task_struct *task;       /* set to "current" when queued */
+        struct list_head list;          /* link in autofs_info.rehash_list */
+};
 /* Unified info structure.  This is pointed to by both the dentry and
   inode structures.  Each file in the filesystem has an instance of this
   structure.  It holds a reference to the dentry, so dentries are never
@@ -75,6 +80,9 @@ struct autofs_info {
        struct completion expire_complete;
        struct list_head active;
+        int active_count;
+        struct list_head rehash_list;
        struct list_head expiring;
        struct autofs_sb_info *sbi;
@@ -95,6 +103,8 @@ struct autofs_info {
 #define AUTOFS_INF_EXPIRING     (1<<0) /* dentry is in the process of expiring */
 #define AUTOFS_INF_MOUNTPOINT   (1<<1) /* mountpoint status for direct expire */
+#define AUTOFS_INF_PENDING      (1<<2) /* dentry pending mount */
+#define AUTOFS_INF_REHASH       (1<<3) /* dentry in transit to ->lookup() */
 struct autofs_wait_queue {
        wait_queue_head_t queue;
@@ -161,7 +171,7 @@ static inline int autofs4_ispending(struct dentry *dentry)
 {
        struct autofs_info *inf = autofs4_dentry_ino(dentry);
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+        if (inf->flags & AUTOFS_INF_PENDING)
                return 1;
        if (inf->flags & AUTOFS_INF_EXPIRING)
@@ -264,5 +274,31 @@ out:
        return ret;
 }
+/*
+ * Link the dentry's autofs info onto the super block's expiring list.
+ * Nothing is done when the dentry carries no autofs info or when the
+ * info's ->expiring link is already in use. The list is modified
+ * under sbi->lookup_lock.
+ */
+static inline void autofs4_add_expiring(struct dentry *dentry)
+{
+        struct autofs_sb_info *sb_info = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *info = autofs4_dentry_ino(dentry);
+        if (!info)
+                return;
+        spin_lock(&sb_info->lookup_lock);
+        if (list_empty(&info->expiring))
+                list_add(&info->expiring, &sb_info->expiring_list);
+        spin_unlock(&sb_info->lookup_lock);
+}
+/*
+ * Unlink the dentry's autofs info from the expiring list it is on and
+ * reinitialise the link. Nothing is done when the dentry carries no
+ * autofs info or when the ->expiring link is already empty. The list
+ * is modified under sbi->lookup_lock.
+ */
+static inline void autofs4_del_expiring(struct dentry *dentry)
+{
+        struct autofs_sb_info *sb_info = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *info = autofs4_dentry_ino(dentry);
+        if (!info)
+                return;
+        spin_lock(&sb_info->lookup_lock);
+        if (!list_empty(&info->expiring))
+                list_del_init(&info->expiring);
+        spin_unlock(&sb_info->lookup_lock);
+}
 void autofs4_dentry_release(struct dentry *);
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3da18d453488..74bc9aa6df31 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -27,7 +27,7 @@ static inline int autofs4_can_expire(struct dentry *dentry,
                return 0;
        /* No point expiring a pending mount */
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING)
+        if (ino->flags & AUTOFS_INF_PENDING)
                return 0;
        if (!do_now) {
@@ -279,6 +279,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
                        root->d_mounted--;
                }
                ino->flags |= AUTOFS_INF_EXPIRING;
+                autofs4_add_expiring(root);
                init_completion(&ino->expire_complete);
                spin_unlock(&sbi->fs_lock);
                return root;
@@ -406,6 +407,7 @@ found:
                expired, (int)expired->d_name.len, expired->d_name.name);
        ino = autofs4_dentry_ino(expired);
        ino->flags |= AUTOFS_INF_EXPIRING;
+        autofs4_add_expiring(expired);
        init_completion(&ino->expire_complete);
        spin_unlock(&sbi->fs_lock);
        spin_lock(&dcache_lock);
@@ -433,7 +435,7 @@ int autofs4_expire_wait(struct dentry *dentry)
                DPRINTK("expire done status=%d", status);
-                if (d_unhashed(dentry))
+                if (d_unhashed(dentry) && IS_DEADDIR(dentry->d_inode))
                        return -EAGAIN;
                return status;
@@ -473,6 +475,7 @@ int autofs4_expire_run(struct super_block *sb,
        spin_lock(&sbi->fs_lock);
        ino = autofs4_dentry_ino(dentry);
        ino->flags &= ~AUTOFS_INF_EXPIRING;
+        autofs4_del_expiring(dentry);
        complete_all(&ino->expire_complete);
        spin_unlock(&sbi->fs_lock);
@@ -503,6 +506,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
                        ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
                }
                ino->flags &= ~AUTOFS_INF_EXPIRING;
+                autofs4_del_expiring(dentry);
                complete_all(&ino->expire_complete);
                spin_unlock(&sbi->fs_lock);
                dput(dentry);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 69c8142da838..d0a3de247458 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -49,6 +49,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino,
                ino->dentry = NULL;
                ino->size = 0;
                INIT_LIST_HEAD(&ino->active);
+                INIT_LIST_HEAD(&ino->rehash_list);
+                ino->active_count = 0;
                INIT_LIST_HEAD(&ino->expiring);
                atomic_set(&ino->count, 0);
        }
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index b96a3c57359d..30cc9ddf4b70 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -72,6 +72,139 @@ const struct inode_operations autofs4_dir_inode_operations = {
        .rmdir          = autofs4_dir_rmdir,
 };
+/*
+ * Take one "active" reference on the dentry's autofs info. When the
+ * count was zero and the info is not yet linked, it is added to the
+ * super block's active list. The count is bumped on every call.
+ * Nothing is done when the dentry carries no autofs info.
+ */
+static void autofs4_add_active(struct dentry *dentry)
+{
+        struct autofs_sb_info *sb_info = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *info = autofs4_dentry_ino(dentry);
+        if (!info)
+                return;
+        spin_lock(&sb_info->lookup_lock);
+        /* Only the first user links the info onto the active list */
+        if (info->active_count == 0 && list_empty(&info->active))
+                list_add(&info->active, &sb_info->active_list);
+        info->active_count++;
+        spin_unlock(&sb_info->lookup_lock);
+}
+/*
+ * Drop one "active" reference on the dentry's autofs info. When the
+ * count reaches zero and the info is still linked, it is removed from
+ * the active list. Nothing is done when the dentry carries no autofs
+ * info.
+ */
+static void autofs4_del_active(struct dentry *dentry)
+{
+        struct autofs_sb_info *sb_info = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *info = autofs4_dentry_ino(dentry);
+        if (!info)
+                return;
+        spin_lock(&sb_info->lookup_lock);
+        info->active_count--;
+        /* The last user takes the info off the active list */
+        if (info->active_count == 0 && !list_empty(&info->active))
+                list_del_init(&info->active);
+        spin_unlock(&sb_info->lookup_lock);
+}
+/*
+ * Stamp @entry with the current task and link it onto @ino's rehash
+ * list. No locking is done here; the caller visible in this file,
+ * autofs4_revalidate_drop(), calls it with sbi->lookup_lock held.
+ */
+static void autofs4_add_rehash_entry(struct autofs_info *ino,
+                                     struct rehash_entry *entry)
+{
+        struct list_head *link = &entry->list;
+        entry->task = current;
+        INIT_LIST_HEAD(link);
+        list_add(link, &ino->rehash_list);
+}
+/*
+ * Remove and free the first entry on @ino's rehash list that belongs
+ * to the current task. At most one entry is removed; nothing happens
+ * when the task has no entry. No locking is done here.
+ */
+static void autofs4_remove_rehash_entry(struct autofs_info *ino)
+{
+        struct rehash_entry *pos;
+        list_for_each_entry(pos, &ino->rehash_list, list) {
+                if (pos->task != current)
+                        continue;
+                /* No _safe iterator needed: the walk stops right here */
+                list_del(&pos->list);
+                kfree(pos);
+                break;
+        }
+}
+/*
+ * Cancel a pending rehash for @ino: clear AUTOFS_INF_REHASH, free every
+ * entry on the rehash list and drop one reference on ino->dentry.
+ * Returns without doing anything when the flag is not set.
+ *
+ * Takes sbi->fs_lock and then sbi->lookup_lock, so the caller must hold
+ * neither.
+ */
+static void autofs4_remove_rehash_entrys(struct autofs_info *ino)
+{
+        struct autofs_sb_info *sbi = ino->sbi;
+        struct rehash_entry *entry, *next;
+        struct list_head *head;
+        spin_lock(&sbi->fs_lock);
+        spin_lock(&sbi->lookup_lock);
+        /* No rehash pending: nothing to free and no reference to drop */
+        if (!(ino->flags & AUTOFS_INF_REHASH)) {
+                spin_unlock(&sbi->lookup_lock);
+                spin_unlock(&sbi->fs_lock);
+                return;
+        }
+        ino->flags &= ~AUTOFS_INF_REHASH;
+        head = &ino->rehash_list;
+        list_for_each_entry_safe(entry, next, head, list) {
+                list_del(&entry->list);
+                kfree(entry);
+        }
+        spin_unlock(&sbi->lookup_lock);
+        spin_unlock(&sbi->fs_lock);
+        /*
+         * Called only after both spinlocks are released.
+         * NOTE(review): presumably balances the dget() taken in
+         * autofs4_revalidate_drop() when the flag was set - confirm.
+         */
+        dput(ino->dentry);
+        return;
+}
+/*
+ * Queue @entry on the dentry's rehash list, make sure the dentry's
+ * autofs info is on the active list, and, if no rehash is pending yet,
+ * set AUTOFS_INF_REHASH, take a dentry reference and unhash the dentry.
+ *
+ * @entry is linked onto the list, so the caller must not free it
+ * afterwards. The callers visible in autofs4_revalidate() invoke this
+ * with sbi->fs_lock and dcache_lock held.
+ */
+static void autofs4_revalidate_drop(struct dentry *dentry,
+                                    struct rehash_entry *entry)
+{
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        /*
+         * Add to the active list so we can pick this up in
+         * ->lookup(). Also add an entry to the rehash list so
+         * we know when there are no dentries in flight, and
+         * therefore when we can rehash the dentry.
+         */
+        spin_lock(&sbi->lookup_lock);
+        if (list_empty(&ino->active))
+                list_add(&ino->active, &sbi->active_list);
+        autofs4_add_rehash_entry(ino, entry);
+        spin_unlock(&sbi->lookup_lock);
+        /* Only the first dropper takes the reference and unhashes */
+        if (!(ino->flags & AUTOFS_INF_REHASH)) {
+                ino->flags |= AUTOFS_INF_REHASH;
+                dget(dentry);
+                spin_lock(&dentry->d_lock);
+                __d_drop(dentry);
+                spin_unlock(&dentry->d_lock);
+        }
+        return;
+}
+/*
+ * Remove the current task's rehash entry for @dentry and, once the
+ * rehash list is empty, clear AUTOFS_INF_REHASH, rehash the dentry and
+ * drop one reference on ino->dentry. Does nothing when no rehash is
+ * pending.
+ *
+ * NOTE(review): ino->flags is tested and cleared here without taking
+ * sbi->fs_lock; confirm what the callers hold.
+ */
+static void autofs4_revalidate_rehash(struct dentry *dentry)
+{
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        if (ino->flags & AUTOFS_INF_REHASH) {
+                spin_lock(&sbi->lookup_lock);
+                autofs4_remove_rehash_entry(ino);
+                if (list_empty(&ino->rehash_list)) {
+                        /* Last entry gone: unlock first, then rehash */
+                        spin_unlock(&sbi->lookup_lock);
+                        ino->flags &= ~AUTOFS_INF_REHASH;
+                        d_rehash(dentry);
+                        dput(ino->dentry);
+                } else
+                        spin_unlock(&sbi->lookup_lock);
+        }
+        return;
+}
+/*
+ * Tell whether lookup flags ask for a mount to be triggered: returns 1
+ * when any TRIGGER_FLAGS or TRIGGER_INTENTS bit is set in @flags,
+ * otherwise 0.
+ */
+static unsigned int autofs4_need_mount(unsigned int flags)
+{
+        return (flags & (TRIGGER_FLAGS | TRIGGER_INTENTS)) ? 1 : 0;
+}
 static int autofs4_dir_open(struct inode *inode, struct file *file)
 {
        struct dentry *dentry = file->f_path.dentry;
@@ -93,7 +226,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
         * it.
         */
        spin_lock(&dcache_lock);
-        if (!d_mountpoint(dentry) && __simple_empty(dentry)) {
+        if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
                spin_unlock(&dcache_lock);
                return -ENOENT;
        }
@@ -103,7 +236,7 @@ out:
        return dcache_dir_open(inode, file);
 }
-static int try_to_fill_dentry(struct dentry *dentry, int flags)
+static int try_to_fill_dentry(struct dentry *dentry)
 {
        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
        struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -116,55 +249,17 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
         * Wait for a pending mount, triggering one if there
         * isn't one already
         */
-        if (dentry->d_inode == NULL) {
+        DPRINTK("waiting for mount name=%.*s",
-                DPRINTK("waiting for mount name=%.*s",
+                 dentry->d_name.len, dentry->d_name.name);
-                         dentry->d_name.len, dentry->d_name.name);
-                status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-                DPRINTK("mount done status=%d", status);
-                /* Turn this into a real negative dentry? */
-                if (status == -ENOENT) {
-                        spin_lock(&dentry->d_lock);
-                        dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-                        spin_unlock(&dentry->d_lock);
-                        return status;
-                } else if (status) {
-                        /* Return a negative dentry, but leave it "pending" */
-                        return status;
-                }
-        /* Trigger mount for path component or follow link */
-        } else if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
-                        flags & (TRIGGER_FLAGS | TRIGGER_INTENTS) ||
-                        current->link_count) {
-                DPRINTK("waiting for mount name=%.*s",
-                        dentry->d_name.len, dentry->d_name.name);
-                spin_lock(&dentry->d_lock);
-                dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-                spin_unlock(&dentry->d_lock);
-                status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-                DPRINTK("mount done status=%d", status);
+        status = autofs4_wait(sbi, dentry, NFY_MOUNT);
-                if (status) {
+        DPRINTK("mount done status=%d", status);
-                        spin_lock(&dentry->d_lock);
-                        dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
-                        spin_unlock(&dentry->d_lock);
-                        return status;
-                }
-        }
-        /* Initialize expiry counter after successful mount */
-        if (ino)
-                ino->last_used = jiffies;
-        spin_lock(&dentry->d_lock);
+        /* Update expiry counter */
-        dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
+        ino->last_used = jiffies;
-        spin_unlock(&dentry->d_lock);
-        return 0;
+        return status;
 }
 /* For autofs direct mounts the follow link triggers the mount */
@@ -202,27 +297,39 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
        autofs4_expire_wait(dentry);
        /* We trigger a mount for almost all flags */
-        lookup_type = nd->flags & (TRIGGER_FLAGS | TRIGGER_INTENTS);
+        lookup_type = autofs4_need_mount(nd->flags);
-        if (!(lookup_type || dentry->d_flags & DCACHE_AUTOFS_PENDING))
+        spin_lock(&sbi->fs_lock);
+        spin_lock(&dcache_lock);
+        if (!(lookup_type || ino->flags & AUTOFS_INF_PENDING)) {
+                spin_unlock(&dcache_lock);
+                spin_unlock(&sbi->fs_lock);
                goto follow;
+        }
        /*
         * If the dentry contains directories then it is an autofs
         * multi-mount with no root mount offset. So don't try to
         * mount it again.
         */
-        spin_lock(&dcache_lock);
+        if (ino->flags & AUTOFS_INF_PENDING ||
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING ||
+            (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs))) {
-            (!d_mountpoint(dentry) && __simple_empty(dentry))) {
+                ino->flags |= AUTOFS_INF_PENDING;
                spin_unlock(&dcache_lock);
+                spin_unlock(&sbi->fs_lock);
+                status = try_to_fill_dentry(dentry);
+                spin_lock(&sbi->fs_lock);
+                ino->flags &= ~AUTOFS_INF_PENDING;
+                spin_unlock(&sbi->fs_lock);
-                status = try_to_fill_dentry(dentry, 0);
                if (status)
                        goto out_error;
                goto follow;
        }
        spin_unlock(&dcache_lock);
+        spin_unlock(&sbi->fs_lock);
 follow:
        /*
         * If there is no root mount it must be an autofs
@@ -254,18 +361,47 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *dir = dentry->d_parent->d_inode;
        struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
-        int oz_mode = autofs4_oz_mode(sbi);
+        struct autofs_info *ino = autofs4_dentry_ino(dentry);
+        struct rehash_entry *entry;
        int flags = nd ? nd->flags : 0;
-        int status = 1;
+        unsigned int mutex_aquired;
+        DPRINTK("name = %.*s oz_mode = %d",
+                dentry->d_name.len, dentry->d_name.name, oz_mode);
+        /* Daemon never causes a mount to trigger */
+        if (autofs4_oz_mode(sbi))
+                return 1;
+        entry = kmalloc(sizeof(struct rehash_entry), GFP_KERNEL);
+        if (!entry)
+                return -ENOMEM;
+        mutex_aquired = mutex_trylock(&dir->i_mutex);
-        /* Pending dentry */
        spin_lock(&sbi->fs_lock);
+        spin_lock(&dcache_lock);
+        /* Pending dentry */
        if (autofs4_ispending(dentry)) {
-                /* The daemon never causes a mount to trigger */
+                int status;
-                spin_unlock(&sbi->fs_lock);
-                if (oz_mode)
+                /*
-                        return 1;
+                 * We can only unhash and send this to ->lookup() if
+                 * the directory mutex is held over d_revalidate() and
+                 * ->lookup(). This prevents the VFS from incorrectly
+                 * seeing the dentry as non-existent.
+                 */
+                ino->flags |= AUTOFS_INF_PENDING;
+                if (!mutex_aquired) {
+                        autofs4_revalidate_drop(dentry, entry);
+                        spin_unlock(&dcache_lock);
+                        spin_unlock(&sbi->fs_lock);
+                        return 0;
+                }
+                spin_unlock(&dcache_lock);
+                spin_unlock(&sbi->fs_lock);
+                mutex_unlock(&dir->i_mutex);
+                kfree(entry);
                /*
                 * If the directory has gone away due to an expire
@@ -279,46 +415,82 @@ static int autofs4_revalidate(struct dentry *dentry, struct nameidata *nd)
                 * A zero status is success otherwise we have a
                 * negative error code.
                 */
-                status = try_to_fill_dentry(dentry, flags);
+                status = try_to_fill_dentry(dentry);
+                spin_lock(&sbi->fs_lock);
+                ino->flags &= ~AUTOFS_INF_PENDING;
+                spin_unlock(&sbi->fs_lock);
                if (status == 0)
                        return 1;
                return status;
        }
-        spin_unlock(&sbi->fs_lock);
-        /* Negative dentry.. invalidate if "old" */
-        if (dentry->d_inode == NULL)
-                return 0;
        /* Check for a non-mountpoint directory with no contents */
-        spin_lock(&dcache_lock);
        if (S_ISDIR(dentry->d_inode->i_mode) &&
-            !d_mountpoint(dentry) && 
+            !d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-            __simple_empty(dentry)) {
                DPRINTK("dentry=%p %.*s, emptydir",
                         dentry, dentry->d_name.len, dentry->d_name.name);
-                spin_unlock(&dcache_lock);
-                /* The daemon never causes a mount to trigger */
+                if (autofs4_need_mount(flags) || current->link_count) {
-                if (oz_mode)
+                        int status;
-                        return 1;
-                /*
+                        /*
-                 * A zero status is success otherwise we have a
+                         * We can only unhash and send this to ->lookup() if
-                 * negative error code.
+                         * the directory mutex is held over d_revalidate() and
-                 */
+                         * ->lookup(). This prevents the VFS from incorrectly
-                status = try_to_fill_dentry(dentry, flags);
+                         * seeing the dentry as non-existent.
-                if (status == 0)
+                         */
-                        return 1;
+                        ino->flags |= AUTOFS_INF_PENDING;
+                        if (!mutex_aquired) {
+                                autofs4_revalidate_drop(dentry, entry);
+                                spin_unlock(&dcache_lock);
+                                spin_unlock(&sbi->fs_lock);
+                                return 0;
+                        }
+                        spin_unlock(&dcache_lock);
+                        spin_unlock(&sbi->fs_lock);
+                        mutex_unlock(&dir->i_mutex);
+                        kfree(entry);
-                return status;
+                        /*
+                         * A zero status is success otherwise we have a
+                         * negative error code.
+                         */
+                        status = try_to_fill_dentry(dentry);
+                        spin_lock(&sbi->fs_lock);
+                        ino->flags &= ~AUTOFS_INF_PENDING;
+                        spin_unlock(&sbi->fs_lock);
+                        if (status == 0)
+                                return 1;
+                        return status;
+                }
        }
        spin_unlock(&dcache_lock);
+        spin_unlock(&sbi->fs_lock);
+        if (mutex_aquired)
+                mutex_unlock(&dir->i_mutex);
+        kfree(entry);
        return 1;
 }
+/*
+ * Unlink and free every entry on @inf's rehash list. No locking is
+ * done here; the call site in autofs4_dentry_release() sits just ahead
+ * of spin_unlock(&sbi->lookup_lock), so presumably that lock is held.
+ */
+static void autofs4_free_rehash_entrys(struct autofs_info *inf)
+{
+        struct rehash_entry *pos, *tmp;
+        list_for_each_entry_safe(pos, tmp, &inf->rehash_list, list) {
+                list_del(&pos->list);
+                kfree(pos);
+        }
+}
 void autofs4_dentry_release(struct dentry *de)
 {
        struct autofs_info *inf;
@@ -337,6 +509,8 @@ void autofs4_dentry_release(struct dentry *de)
                                list_del(&inf->active);
                        if (!list_empty(&inf->expiring))
                                list_del(&inf->expiring);
+                        if (!list_empty(&inf->rehash_list))
+                                autofs4_free_rehash_entrys(inf);
                        spin_unlock(&sbi->lookup_lock);
                }
@@ -359,35 +533,52 @@ static const struct dentry_operations autofs4_dentry_operations = {
        .d_release      = autofs4_dentry_release,
 };
-static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 {
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        struct dentry *parent = dentry->d_parent;
+        struct qstr *name = &dentry->d_name;
        unsigned int len = name->len;
        unsigned int hash = name->hash;
        const unsigned char *str = name->name;
        struct list_head *p, *head;
+restart:
        spin_lock(&dcache_lock);
        spin_lock(&sbi->lookup_lock);
        head = &sbi->active_list;
        list_for_each(p, head) {
                struct autofs_info *ino;
-                struct dentry *dentry;
+                struct dentry *active;
                struct qstr *qstr;
                ino = list_entry(p, struct autofs_info, active);
-                dentry = ino->dentry;
+                active = ino->dentry;
-                spin_lock(&dentry->d_lock);
+                spin_lock(&active->d_lock);
                /* Already gone? */
-                if (atomic_read(&dentry->d_count) == 0)
+                if (atomic_read(&active->d_count) == 0)
                        goto next;
-                qstr = &dentry->d_name;
+                if (active->d_inode && IS_DEADDIR(active->d_inode)) {
+                        if (!list_empty(&ino->rehash_list)) {
+                                dget(active);
+                                spin_unlock(&active->d_lock);
+                                spin_unlock(&sbi->lookup_lock);
+                                spin_unlock(&dcache_lock);
+                                autofs4_remove_rehash_entrys(ino);
+                                dput(active);
+                                goto restart;
+                        }
+                        goto next;
+                }
+                qstr = &active->d_name;
-                if (dentry->d_name.hash != hash)
+                if (active->d_name.hash != hash)
                        goto next;
-                if (dentry->d_parent != parent)
+                if (active->d_parent != parent)
                        goto next;
                if (qstr->len != len)
@@ -395,15 +586,13 @@ static struct dentry *autofs4_lookup_active(struct autofs_sb_info *sbi, struct d
                if (memcmp(qstr->name, str, len))
                        goto next;
-                if (d_unhashed(dentry)) {
+                dget(active);
-                        dget(dentry);
+                spin_unlock(&active->d_lock);
-                        spin_unlock(&dentry->d_lock);
+                spin_unlock(&sbi->lookup_lock);
-                        spin_unlock(&sbi->lookup_lock);
+                spin_unlock(&dcache_lock);
-                        spin_unlock(&dcache_lock);
+                return active;
-                        return dentry;
-                }
 next:
-                spin_unlock(&dentry->d_lock);
+                spin_unlock(&active->d_lock);
        }
        spin_unlock(&sbi->lookup_lock);
        spin_unlock(&dcache_lock);
@@ -411,8 +600,11 @@ next:
        return NULL;
 }
-static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct dentry *parent, struct qstr *name)
+static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 {
+        struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+        struct dentry *parent = dentry->d_parent;
+        struct qstr *name = &dentry->d_name;
        unsigned int len = name->len;
        unsigned int hash = name->hash;
        const unsigned char *str = name->name;
@@ -423,23 +615,23 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
        head = &sbi->expiring_list;
        list_for_each(p, head) {
                struct autofs_info *ino;
-                struct dentry *dentry;
+                struct dentry *expiring;
                struct qstr *qstr;
                ino = list_entry(p, struct autofs_info, expiring);
-                dentry = ino->dentry;
+                expiring = ino->dentry;
-                spin_lock(&dentry->d_lock);
+                spin_lock(&expiring->d_lock);
                /* Bad luck, we've already been dentry_iput */
-                if (!dentry->d_inode)
+                if (!expiring->d_inode)
                        goto next;
-                qstr = &dentry->d_name;
+                qstr = &expiring->d_name;
-                if (dentry->d_name.hash != hash)
+                if (expiring->d_name.hash != hash)
                        goto next;
-                if (dentry->d_parent != parent)
+                if (expiring->d_parent != parent)
                        goto next;
                if (qstr->len != len)
@@ -447,15 +639,13 @@ static struct dentry *autofs4_lookup_expiring(struct autofs_sb_info *sbi, struct
                if (memcmp(qstr->name, str, len))
                        goto next;
-                if (d_unhashed(dentry)) {
+                dget(expiring);
-                        dget(dentry);
+                spin_unlock(&expiring->d_lock);
-                        spin_unlock(&dentry->d_lock);
+                spin_unlock(&sbi->lookup_lock);
-                        spin_unlock(&sbi->lookup_lock);
+                spin_unlock(&dcache_lock);
-                        spin_unlock(&dcache_lock);
+                return expiring;
-                        return dentry;
-                }
 next:
-                spin_unlock(&dentry->d_lock);
+                spin_unlock(&expiring->d_lock);
        }
        spin_unlock(&sbi->lookup_lock);
        spin_unlock(&dcache_lock);
@@ -463,13 +653,56 @@ next:
        return NULL;
 }
+static struct autofs_info *init_new_dentry(struct autofs_sb_info *sbi,
+                                           struct dentry *dentry, int oz_mode)
+{
+        struct autofs_info *ino;
+        /*
+         * Mark the dentry incomplete but don't hash it. We do this
+         * to serialize our inode creation operations (symlink and
+         * mkdir) which prevents deadlock during the callback to
+         * the daemon. Subsequent user space lookups for the same
+         * dentry are placed on the wait queue while the daemon
+         * itself is allowed passage unrestricted so the create
+         * operation itself can then hash the dentry. Finally,
+         * we check for the hashed dentry and return the newly
+         * hashed dentry.
+         */
+        dentry->d_op = &autofs4_root_dentry_operations;
+        /*
+         * And we need to ensure that the same dentry is used for
+         * all following lookup calls until it is hashed so that
+         * the dentry flags are persistent throughout the request.
+         */
+        ino = autofs4_init_ino(NULL, sbi, 0555);
+        if (!ino)
+                return ERR_PTR(-ENOMEM);
+        dentry->d_fsdata = ino;
+        ino->dentry = dentry;
+        /*
+         * Only set the mount pending flag for new dentries not created
+         * by the daemon.
+         */
+        if (!oz_mode)
+                ino->flags |= AUTOFS_INF_PENDING;
+        d_instantiate(dentry, NULL);
+        return ino;
+}
 /* Lookups in the root directory */
 static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
        struct autofs_sb_info *sbi;
        struct autofs_info *ino;
-        struct dentry *expiring, *unhashed;
+        struct dentry *expiring, *active;
        int oz_mode;
+        int status = 0;
        DPRINTK("name = %.*s",
                dentry->d_name.len, dentry->d_name.name);
@@ -484,123 +717,100 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
        DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
                 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
-        unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
+        spin_lock(&sbi->fs_lock);
-        if (unhashed)
+        active = autofs4_lookup_active(dentry);
-                dentry = unhashed;
+        if (active) {
-        else {
+                dentry = active;
-                /*
+                ino = autofs4_dentry_ino(dentry);
-                 * Mark the dentry incomplete but don't hash it. We do this
+                /* If this came from revalidate, rehash it */
-                 * to serialize our inode creation operations (symlink and
+                autofs4_revalidate_rehash(dentry);
-                 * mkdir) which prevents deadlock during the callback to
+                spin_unlock(&sbi->fs_lock);
-                 * the daemon. Subsequent user space lookups for the same
+        } else {
-                 * dentry are placed on the wait queue while the daemon
+                spin_unlock(&sbi->fs_lock);
-                 * itself is allowed passage unresticted so the create
+                ino = init_new_dentry(sbi, dentry, oz_mode);
-                 * operation itself can then hash the dentry. Finally,
+                if (IS_ERR(ino))
-                 * we check for the hashed dentry and return the newly
+                        return (struct dentry *) ino;
-                 * hashed dentry.
-                 */
-                dentry->d_op = &autofs4_root_dentry_operations;
-                /*
-                 * And we need to ensure that the same dentry is used for
-                 * all following lookup calls until it is hashed so that
-                 * the dentry flags are persistent throughout the request.
-                 */
-                ino = autofs4_init_ino(NULL, sbi, 0555);
-                if (!ino)
-                        return ERR_PTR(-ENOMEM);
-                dentry->d_fsdata = ino;
-                ino->dentry = dentry;
-                spin_lock(&sbi->lookup_lock);
-                list_add(&ino->active, &sbi->active_list);
-                spin_unlock(&sbi->lookup_lock);
-                d_instantiate(dentry, NULL);
        }
+        autofs4_add_active(dentry);
        if (!oz_mode) {
+                expiring = autofs4_lookup_expiring(dentry);
                mutex_unlock(&dir->i_mutex);
-                expiring = autofs4_lookup_expiring(sbi,
-                                                   dentry->d_parent,
-                                                   &dentry->d_name);
                if (expiring) {
                        /*
                         * If we are racing with expire the request might not
                         * be quite complete but the directory has been removed
                         * so it must have been successful, so just wait for it.
                         */
-                        ino = autofs4_dentry_ino(expiring);
                        autofs4_expire_wait(expiring);
-                        spin_lock(&sbi->lookup_lock);
-                        if (!list_empty(&ino->expiring))
-                                list_del_init(&ino->expiring);
-                        spin_unlock(&sbi->lookup_lock);
                        dput(expiring);
                }
+                status = try_to_fill_dentry(dentry);
-                spin_lock(&dentry->d_lock);
-                dentry->d_flags |= DCACHE_AUTOFS_PENDING;
-                spin_unlock(&dentry->d_lock);
-                if (dentry->d_op && dentry->d_op->d_revalidate)
-                        (dentry->d_op->d_revalidate)(dentry, nd);
                mutex_lock(&dir->i_mutex);
+                spin_lock(&sbi->fs_lock);
+                ino->flags &= ~AUTOFS_INF_PENDING;
+                spin_unlock(&sbi->fs_lock);
        }
+        autofs4_del_active(dentry);
        /*
-         * If we are still pending, check if we had to handle
+         * If we had a mount fail, check if we had to handle
         * a signal. If so we can force a restart..
         */
-        if (dentry->d_flags & DCACHE_AUTOFS_PENDING) {
+        if (status) {
                /* See if we were interrupted */
                if (signal_pending(current)) {
                        sigset_t *sigset = &current->pending.signal;
                        if (sigismember (sigset, SIGKILL) ||
                            sigismember (sigset, SIGQUIT) ||
                            sigismember (sigset, SIGINT)) {
-                            if (unhashed)
+                            if (active)
-                                dput(unhashed);
+                                dput(active);
                            return ERR_PTR(-ERESTARTNOINTR);
                        }
                }
-                if (!oz_mode) {
+        }
-                        spin_lock(&dentry->d_lock);
-                        dentry->d_flags &= ~DCACHE_AUTOFS_PENDING;
+        /*
-                        spin_unlock(&dentry->d_lock);
+         * User space can (and has done in the past) remove and re-create
+         * this directory during the callback. This can leave us with an
+         * unhashed dentry, but a successful mount!  So we need to
+         * perform another cached lookup in case the dentry now exists.
+         */
+        if (!oz_mode && !have_submounts(dentry)) {
+                struct dentry *new;
+                new = d_lookup(dentry->d_parent, &dentry->d_name);
+                if (new) {
+                        if (active)
+                                dput(active);
+                        return new;
+                } else {
+                        if (!status)
+                                status = -ENOENT;
                }
        }
        /*
-         * If this dentry is unhashed, then we shouldn't honour this
+         * If we had a mount failure, return status to user space.
-         * lookup.  Returning ENOENT here doesn't do the right thing
+         * If the mount succeeded and we used a dentry from the active queue
-         * for all system calls, but it should be OK for the operations
+         * return it.
-         * we permit from an autofs.
         */
-        if (!oz_mode && d_unhashed(dentry)) {
+        if (status) {
+                dentry = ERR_PTR(status);
+                if (active)
+                        dput(active);
+                return dentry;
+        } else {
                /*
-                 * A user space application can (and has done in the past)
+                 * Valid successful mount, return active dentry or NULL
-                 * remove and re-create this directory during the callback.
+                 * for a new dentry.
-                 * This can leave us with an unhashed dentry, but a
-                 * successful mount!  So we need to perform another
-                 * cached lookup in case the dentry now exists.
                 */
-                struct dentry *parent = dentry->d_parent;
+                if (active)
-                struct dentry *new = d_lookup(parent, &dentry->d_name);
+                        return active;
-                if (new != NULL)
-                        dentry = new;
-                else
-                        dentry = ERR_PTR(-ENOENT);
-                if (unhashed)
-                        dput(unhashed);
-                return dentry;
        }
-        if (unhashed)
-                return unhashed;
        return NULL;
 }
@@ -624,11 +834,6 @@ static int autofs4_dir_symlink(struct inode *dir,
        if (!ino)
                return -ENOMEM;
-        spin_lock(&sbi->lookup_lock);
-        if (!list_empty(&ino->active))
-                list_del_init(&ino->active);
-        spin_unlock(&sbi->lookup_lock);
        ino->size = strlen(symname);
        cp = kmalloc(ino->size + 1, GFP_KERNEL);
        if (!cp) {
@@ -705,10 +910,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
        dir->i_mtime = CURRENT_TIME;
        spin_lock(&dcache_lock);
-        spin_lock(&sbi->lookup_lock);
-        if (list_empty(&ino->expiring))
-                list_add(&ino->expiring, &sbi->expiring_list);
-        spin_unlock(&sbi->lookup_lock);
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
@@ -734,10 +935,6 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
                spin_unlock(&dcache_lock);
                return -ENOTEMPTY;
        }
-        spin_lock(&sbi->lookup_lock);
-        if (list_empty(&ino->expiring))
-                list_add(&ino->expiring, &sbi->expiring_list);
-        spin_unlock(&sbi->lookup_lock);
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
@@ -775,11 +972,6 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        if (!ino)
                return -ENOMEM;
-        spin_lock(&sbi->lookup_lock);
-        if (!list_empty(&ino->active))
-                list_del_init(&ino->active);
-        spin_unlock(&sbi->lookup_lock);
        inode = autofs4_get_inode(dir->i_sb, ino);
        if (!inode) {
                if (!dentry->d_fsdata)
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index b639dcf7c778..346b69405363 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -32,7 +32,7 @@
 static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+static int aout_core_dump(struct coredump_params *cprm);
 static struct linux_binfmt aout_format = {
        .module         = THIS_MODULE,
@@ -89,8 +89,9 @@ if (file->f_op->llseek) { \
 * dumping of the process results in another error..
 */
-static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
+static int aout_core_dump(struct coredump_params *cprm)
 {
+        struct file *file = cprm->file;
        mm_segment_t fs;
        int has_dumped = 0;
        unsigned long dump_start, dump_size;
@@ -108,16 +109,16 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
        current->flags |= PF_DUMPCORE;
        strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
        dump.u_ar0 = offsetof(struct user, regs);
-        dump.signal = signr;
+        dump.signal = cprm->signr;
-        aout_dump_thread(regs, &dump);
+        aout_dump_thread(cprm->regs, &dump);
 /* If the size of the dump file exceeds the rlimit, then see what would happen
   if we wrote the stack, but not the data area.  */
-        if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
+        if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit)
                dump.u_dsize = 0;
 /* Make sure we have enough room to write the stack and data areas. */
-        if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
+        if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit)
                dump.u_ssize = 0;
 /* make sure we actually have a data and stack area to dump */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d15ea1790bfb..edd90c49003c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,8 +44,8 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
 * If we don't support core dumping, then supply a NULL so we
 * don't even try.
 */
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
-static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+static int elf_core_dump(struct coredump_params *cprm);
 #else
 #define elf_core_dump   NULL
 #endif
@@ -1101,12 +1101,7 @@ out:
        return error;
 }
-/*
+#ifdef CONFIG_ELF_CORE
- * Note that some platforms still use traditional core dumps and not
- * the ELF core dump.  Each platform can select it as appropriate.
- */
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
 /*
 * ELF core dumper
 *
@@ -1277,8 +1272,9 @@ static int writenote(struct memelfnote *men, struct file *file,
 }
 #undef DUMP_WRITE
-#define DUMP_WRITE(addr, nr)    \
+#define DUMP_WRITE(addr, nr)                            \
-        if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
+        if ((size += (nr)) > cprm->limit ||             \
+            !dump_write(cprm->file, (addr), (nr)))      \
                goto end_coredump;
 static void fill_elf_header(struct elfhdr *elf, int segs,
@@ -1906,7 +1902,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma,
 * and then they are actually written out.  If we run out of core limit
 * we just truncate.
 */
-static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
+static int elf_core_dump(struct coredump_params *cprm)
 {
        int has_dumped = 0;
        mm_segment_t fs;
@@ -1952,7 +1948,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
         * notes.  This also sets up the file header.
         */
        if (!fill_note_info(elf, segs + 1, /* including notes section */
-                            &info, signr, regs))
+                            &info, cprm->signr, cprm->regs))
                goto cleanup;
        has_dumped = 1;
@@ -2014,14 +2010,14 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
 #endif
        /* write out the notes section */
-        if (!write_note_info(&info, file, &foffset))
+        if (!write_note_info(&info, cprm->file, &foffset))
                goto end_coredump;
-        if (elf_coredump_extra_notes_write(file, &foffset))
+        if (elf_coredump_extra_notes_write(cprm->file, &foffset))
                goto end_coredump;
        /* Align to page */
-        if (!dump_seek(file, dataoff - foffset))
+        if (!dump_seek(cprm->file, dataoff - foffset))
                goto end_coredump;
        for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -2038,12 +2034,13 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
                        page = get_dump_page(addr);
                        if (page) {
                                void *kaddr = kmap(page);
-                                stop = ((size += PAGE_SIZE) > limit) ||
+                                stop = ((size += PAGE_SIZE) > cprm->limit) ||
-                                        !dump_write(file, kaddr, PAGE_SIZE);
+                                        !dump_write(cprm->file, kaddr,
+                                                    PAGE_SIZE);
                                kunmap(page);
                                page_cache_release(page);
                        } else
-                                stop = !dump_seek(file, PAGE_SIZE);
+                                stop = !dump_seek(cprm->file, PAGE_SIZE);
                        if (stop)
                                goto end_coredump;
                }
@@ -2063,7 +2060,7 @@ out:
        return has_dumped;
 }
-#endif          /* USE_ELF_CORE_DUMP */
+#endif          /* CONFIG_ELF_CORE */
 static int __init init_elf_binfmt(void)
 {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 79d2b1aa389f..c57d9ce5ff7e 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -75,14 +75,14 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
 static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *,
                                             struct file *, struct mm_struct *);
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
-static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit);
+static int elf_fdpic_core_dump(struct coredump_params *cprm);
 #endif
 static struct linux_binfmt elf_fdpic_format = {
        .module         = THIS_MODULE,
        .load_binary    = load_elf_fdpic_binary,
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
        .core_dump      = elf_fdpic_core_dump,
 #endif
        .min_coredump   = ELF_EXEC_PAGESIZE,
@@ -171,6 +171,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 #ifdef ELF_FDPIC_PLAT_INIT
        unsigned long dynaddr;
 #endif
+#ifndef CONFIG_MMU
+        unsigned long stack_prot;
+#endif
        struct file *interpreter = NULL; /* to shut gcc up */
        char *interpreter_name = NULL;
        int executable_stack;
@@ -316,6 +319,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
         * defunct, deceased, etc. after this point we have to exit via
         * error_kill */
        set_personality(PER_LINUX_FDPIC);
+        if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
+                current->personality |= READ_IMPLIES_EXEC;
        set_binfmt(&elf_fdpic_format);
        current->mm->start_code = 0;
@@ -377,9 +382,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
        if (stack_size < PAGE_SIZE * 2)
                stack_size = PAGE_SIZE * 2;
+        stack_prot = PROT_READ | PROT_WRITE;
+        if (executable_stack == EXSTACK_ENABLE_X ||
+            (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
+                stack_prot |= PROT_EXEC;
        down_write(&current->mm->mmap_sem);
-        current->mm->start_brk = do_mmap(NULL, 0, stack_size,
+        current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
-                                         PROT_READ | PROT_WRITE | PROT_EXEC,
                                         MAP_PRIVATE | MAP_ANONYMOUS |
                                         MAP_UNINITIALIZED | MAP_GROWSDOWN,
                                         0);
@@ -1201,7 +1210,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 *
 * Modelled on fs/binfmt_elf.c core dumper
 */
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 /*
 * These are the only things you should do on a core-file: use only these
@@ -1326,8 +1335,9 @@ static int writenote(struct memelfnote *men, struct file *file)
 #undef DUMP_WRITE
 #undef DUMP_SEEK
-#define DUMP_WRITE(addr, nr)    \
+#define DUMP_WRITE(addr, nr)                            \
-        if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \
+        if ((size += (nr)) > cprm->limit ||             \
+            !dump_write(cprm->file, (addr), (nr)))      \
                goto end_coredump;
 static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
@@ -1582,8 +1592,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size,
 * and then they are actually written out.  If we run out of core limit
 * we just truncate.
 */
-static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
+static int elf_fdpic_core_dump(struct coredump_params *cprm)
-                               struct file *file, unsigned long limit)
 {
 #define NUM_NOTES       6
        int has_dumped = 0;
@@ -1642,7 +1651,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
                goto cleanup;
 #endif
-        if (signr) {
+        if (cprm->signr) {
                struct core_thread *ct;
                struct elf_thread_status *tmp;
@@ -1661,14 +1670,14 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
                        int sz;
                        tmp = list_entry(t, struct elf_thread_status, list);
-                        sz = elf_dump_thread_status(signr, tmp);
+                        sz = elf_dump_thread_status(cprm->signr, tmp);
                        thread_status_size += sz;
                }
        }
        /* now collect the dump for the current */
-        fill_prstatus(prstatus, current, signr);
+        fill_prstatus(prstatus, current, cprm->signr);
-        elf_core_copy_regs(&prstatus->pr_reg, regs);
+        elf_core_copy_regs(&prstatus->pr_reg, cprm->regs);
        segs = current->mm->map_count;
 #ifdef ELF_CORE_EXTRA_PHDRS
@@ -1703,7 +1712,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
        /* Try to dump the FPU. */
        if ((prstatus->pr_fpvalid =
-             elf_core_copy_task_fpregs(current, regs, fpu)))
+             elf_core_copy_task_fpregs(current, cprm->regs, fpu)))
                fill_note(notes + numnote++,
                          "CORE", NT_PRFPREG, sizeof(*fpu), fpu);
 #ifdef ELF_CORE_COPY_XFPREGS
@@ -1774,7 +1783,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
        /* write out the notes section */
        for (i = 0; i < numnote; i++)
-                if (!writenote(notes + i, file))
+                if (!writenote(notes + i, cprm->file))
                        goto end_coredump;
        /* write out the thread status notes section */
@@ -1783,25 +1792,26 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs,
                                list_entry(t, struct elf_thread_status, list);
                for (i = 0; i < tmp->num_notes; i++)
-                        if (!writenote(&tmp->notes[i], file))
+                        if (!writenote(&tmp->notes[i], cprm->file))
                                goto end_coredump;
        }
-        if (!dump_seek(file, dataoff))
+        if (!dump_seek(cprm->file, dataoff))
                goto end_coredump;
-        if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0)
+        if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
+                                    mm_flags) < 0)
                goto end_coredump;
 #ifdef ELF_CORE_WRITE_EXTRA_DATA
        ELF_CORE_WRITE_EXTRA_DATA;
 #endif
-        if (file->f_pos != offset) {
+        if (cprm->file->f_pos != offset) {
                /* Sanity check */
                printk(KERN_WARNING
                       "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n",
-                       file->f_pos, offset);
+                       cprm->file->f_pos, offset);
        }
 end_coredump:
@@ -1826,4 +1836,4 @@ cleanup:
 #undef NUM_NOTES
 }
-#endif          /* USE_ELF_CORE_DUMP */
+#endif          /* CONFIG_ELF_CORE */
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index a2796651e756..d4a00ea1054c 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p);
 #endif
 static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
-static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit);
+static int flat_core_dump(struct coredump_params *cprm);
 static struct linux_binfmt flat_format = {
        .module         = THIS_MODULE,
@@ -102,10 +102,10 @@ static struct linux_binfmt flat_format = {
 * Currently only a stub-function.
 */
-static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit)
+static int flat_core_dump(struct coredump_params *cprm)
 {
        printk("Process %s:%d received signr %d and should have core dumped\n",
-                        current->comm, current->pid, (int) signr);
+                        current->comm, current->pid, (int) cprm->signr);
        return(1);
 }
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index eff74b9c9e77..2a9b5330cc5e 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -43,7 +43,7 @@ static int load_som_library(struct file *);
 * don't even try.
 */
 #if 0
-static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit);
+static int som_core_dump(struct coredump_params *cprm);
 #else
 #define som_core_dump   NULL
 #endif
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 402afe0a0bfb..7bb3c020e570 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,7 +4,6 @@ config BTRFS_FS
        select LIBCRC32C
        select ZLIB_INFLATE
        select ZLIB_DEFLATE
-        select FS_JOURNAL_INFO
        help
          Btrfs is a new filesystem with extents, writable snapshotting,
          support for multiple devices and many more features.
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 361604244271..2e9e69987a82 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -73,13 +73,13 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
        return acl;
 }
-static int btrfs_xattr_get_acl(struct inode *inode, int type,
+static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
-                               void *value, size_t size)
+                void *value, size_t size, int type)
 {
        struct posix_acl *acl;
        int ret = 0;
-        acl = btrfs_get_acl(inode, type);
+        acl = btrfs_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
@@ -94,7 +94,8 @@ static int btrfs_xattr_get_acl(struct inode *inode, int type,
 /*
 * Needs to be called with fs_mutex held
 */
-static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+static int btrfs_set_acl(struct btrfs_trans_handle *trans,
+                         struct inode *inode, struct posix_acl *acl, int type)
 {
        int ret, size = 0;
        const char *name;
@@ -140,8 +141,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
                        goto out;
        }
-        ret = __btrfs_setxattr(inode, name, value, size, 0);
+        ret = __btrfs_setxattr(trans, inode, name, value, size, 0);
 out:
        kfree(value);
@@ -151,10 +151,10 @@ out:
        return ret;
 }
-static int btrfs_xattr_set_acl(struct inode *inode, int type,
+static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
-                               const void *value, size_t size)
+                const void *value, size_t size, int flags, int type)
 {
-        int ret = 0;
+        int ret;
        struct posix_acl *acl = NULL;
        if (value) {
@@ -167,38 +167,13 @@ static int btrfs_xattr_set_acl(struct inode *inode, int type,
                }
        }
-        ret = btrfs_set_acl(inode, acl, type);
+        ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
        posix_acl_release(acl);
        return ret;
 }
-static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name,
-                                      void *value, size_t size)
-{
-        return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name,
-                                      const void *value, size_t size, int flags)
-{
-        return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name,
-                                       void *value, size_t size)
-{
-        return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name,
-                               const void *value, size_t size, int flags)
-{
-        return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
 int btrfs_check_acl(struct inode *inode, int mask)
 {
        struct posix_acl *acl;
@@ -221,7 +196,8 @@ int btrfs_check_acl(struct inode *inode, int mask)
 * stuff has been fixed to work with that.  If the locking stuff changes, we
 * need to re-evaluate the acl locking stuff.
 */
-int btrfs_init_acl(struct inode *inode, struct inode *dir)
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+                   struct inode *inode, struct inode *dir)
 {
        struct posix_acl *acl = NULL;
        int ret = 0;
@@ -246,7 +222,8 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
                mode_t mode;
                if (S_ISDIR(inode->i_mode)) {
-                        ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT);
+                        ret = btrfs_set_acl(trans, inode, acl,
+                                            ACL_TYPE_DEFAULT);
                        if (ret)
                                goto failed;
                }
@@ -261,7 +238,7 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
                        inode->i_mode = mode;
                        if (ret > 0) {
                                /* we need an acl */
-                                ret = btrfs_set_acl(inode, clone,
+                                ret = btrfs_set_acl(trans, inode, clone,
                                                    ACL_TYPE_ACCESS);
                        }
                }
@@ -294,7 +271,7 @@ int btrfs_acl_chmod(struct inode *inode)
        ret = posix_acl_chmod_masq(clone, inode->i_mode);
        if (!ret)
-                ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS);
+                ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
        posix_acl_release(clone);
@@ -303,14 +280,16 @@ int btrfs_acl_chmod(struct inode *inode)
 struct xattr_handler btrfs_xattr_acl_default_handler = {
        .prefix = POSIX_ACL_XATTR_DEFAULT,
-        .get    = btrfs_xattr_acl_default_get,
+        .flags  = ACL_TYPE_DEFAULT,
-        .set    = btrfs_xattr_acl_default_set,
+        .get    = btrfs_xattr_acl_get,
+        .set    = btrfs_xattr_acl_set,
 };
 struct xattr_handler btrfs_xattr_acl_access_handler = {
        .prefix = POSIX_ACL_XATTR_ACCESS,
-        .get    = btrfs_xattr_acl_access_get,
+        .flags  = ACL_TYPE_ACCESS,
-        .set    = btrfs_xattr_acl_access_set,
+        .get    = btrfs_xattr_acl_get,
+        .set    = btrfs_xattr_acl_set,
 };
 #else /* CONFIG_BTRFS_FS_POSIX_ACL */
@@ -320,7 +299,8 @@ int btrfs_acl_chmod(struct inode *inode)
        return 0;
 }
-int btrfs_init_acl(struct inode *inode, struct inode *dir)
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+                   struct inode *inode, struct inode *dir)
 {
        return 0;
 }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f6783a42f010..3f1f50d9d916 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,9 +44,6 @@ struct btrfs_inode {
         */
        struct extent_io_tree io_failure_tree;
-        /* held while inesrting or deleting extents from files */
-        struct mutex extent_mutex;
        /* held while logging the inode in tree-log.c */
        struct mutex log_mutex;
@@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 {
-        inode->i_size = size;
+        i_size_write(inode, size);
        BTRFS_I(inode)->disk_i_size = size;
 }
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ec96f3a6d536..c4bc570a396e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -37,6 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *src_buf);
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int level, int slot);
+static int setup_items_for_insert(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root, struct btrfs_path *path,
+                        struct btrfs_key *cpu_key, u32 *data_size,
+                        u32 total_data, u32 total_size, int nr);
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -451,9 +456,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                extent_buffer_get(cow);
                spin_unlock(&root->node_lock);
-                btrfs_free_extent(trans, root, buf->start, buf->len,
+                btrfs_free_tree_block(trans, root, buf->start, buf->len,
-                                  parent_start, root->root_key.objectid,
+                                parent_start, root->root_key.objectid, level);
-                                  level, 0);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
@@ -468,9 +472,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
-                btrfs_free_extent(trans, root, buf->start, buf->len,
+                btrfs_free_tree_block(trans, root, buf->start, buf->len,
-                                  parent_start, root->root_key.objectid,
+                                parent_start, root->root_key.objectid, level);
-                                  level, 0);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
@@ -1030,8 +1033,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                btrfs_tree_unlock(mid);
                /* once for the path */
                free_extent_buffer(mid);
-                ret = btrfs_free_extent(trans, root, mid->start, mid->len,
+                ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
-                                        0, root->root_key.objectid, level, 1);
+                                            0, root->root_key.objectid, level);
                /* once for the root ptr */
                free_extent_buffer(mid);
                return ret;
@@ -1095,10 +1098,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                                       1);
                        if (wret)
                                ret = wret;
-                        wret = btrfs_free_extent(trans, root, bytenr,
+                        wret = btrfs_free_tree_block(trans, root,
-                                                 blocksize, 0,
+                                                     bytenr, blocksize, 0,
-                                                 root->root_key.objectid,
+                                                     root->root_key.objectid,
-                                                 level, 0);
+                                                     level);
                        if (wret)
                                ret = wret;
                } else {
@@ -1143,9 +1146,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                wret = del_ptr(trans, root, path, level + 1, pslot);
                if (wret)
                        ret = wret;
-                wret = btrfs_free_extent(trans, root, bytenr, blocksize,
+                wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
-                                         0, root->root_key.objectid,
+                                         0, root->root_key.objectid, level);
-                                         level, 0);
                if (wret)
                        ret = wret;
        } else {
@@ -2997,75 +2999,85 @@ again:
        return ret;
 }
-/*
+static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
- * This function splits a single item into two items,
+                                         struct btrfs_root *root,
- * giving 'new_key' to the new item and splitting the
+                                         struct btrfs_path *path, int ins_len)
- * old one at split_offset (from the start of the item).
- *
- * The path may be released by this operation.  After
- * the split, the path is pointing to the old item.  The
- * new item is going to be in the same node as the old one.
- *
- * Note, the item being split must be smaller enough to live alone on
- * a tree block with room for one extra struct btrfs_item
- *
- * This allows us to split the item in place, keeping a lock on the
- * leaf the entire time.
- */
-int btrfs_split_item(struct btrfs_trans_handle *trans,
-                     struct btrfs_root *root,
-                     struct btrfs_path *path,
-                     struct btrfs_key *new_key,
-                     unsigned long split_offset)
 {
-        u32 item_size;
+        struct btrfs_key key;
        struct extent_buffer *leaf;
-        struct btrfs_key orig_key;
+        struct btrfs_file_extent_item *fi;
-        struct btrfs_item *item;
+        u64 extent_len = 0;
-        struct btrfs_item *new_item;
+        u32 item_size;
-        int ret = 0;
+        int ret;
-        int slot;
-        u32 nritems;
-        u32 orig_offset;
-        struct btrfs_disk_key disk_key;
-        char *buf;
        leaf = path->nodes[0];
-        btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]);
+        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-        if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item))
-                goto split;
+        BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
+               key.type != BTRFS_EXTENT_CSUM_KEY);
+        if (btrfs_leaf_free_space(root, leaf) >= ins_len)
+                return 0;
        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        if (key.type == BTRFS_EXTENT_DATA_KEY) {
+                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+        }
        btrfs_release_path(root, path);
-        path->search_for_split = 1;
        path->keep_locks = 1;
+        path->search_for_split = 1;
-        ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1);
+        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
        path->search_for_split = 0;
+        if (ret < 0)
+                goto err;
+        ret = -EAGAIN;
+        leaf = path->nodes[0];
        /* if our item isn't there or got smaller, return now */
-        if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0],
+        if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
-                                                        path->slots[0])) {
+                goto err;
-                path->keep_locks = 0;
-                return -EAGAIN;
+        if (key.type == BTRFS_EXTENT_DATA_KEY) {
+                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+                if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
+                        goto err;
        }
        btrfs_set_path_blocking(path);
-        ret = split_leaf(trans, root, &orig_key, path,
+        ret = split_leaf(trans, root, &key, path, ins_len, 1);
-                         sizeof(struct btrfs_item), 1);
-        path->keep_locks = 0;
        BUG_ON(ret);
+        path->keep_locks = 0;
        btrfs_unlock_up_safe(path, 1);
+        return 0;
+err:
+        path->keep_locks = 0;
+        return ret;
+}
+static noinline int split_item(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct btrfs_path *path,
+                               struct btrfs_key *new_key,
+                               unsigned long split_offset)
+{
+        struct extent_buffer *leaf;
+        struct btrfs_item *item;
+        struct btrfs_item *new_item;
+        int slot;
+        char *buf;
+        u32 nritems;
+        u32 item_size;
+        u32 orig_offset;
+        struct btrfs_disk_key disk_key;
        leaf = path->nodes[0];
        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-split:
-        /*
-         * make sure any changes to the path from split_leaf leave it
-         * in a blocking state
-         */
        btrfs_set_path_blocking(path);
        item = btrfs_item_nr(leaf, path->slots[0]);
@@ -3073,19 +3085,19 @@ split:
        item_size = btrfs_item_size(leaf, item);
        buf = kmalloc(item_size, GFP_NOFS);
+        if (!buf)
+                return -ENOMEM;
        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
                            path->slots[0]), item_size);
-        slot = path->slots[0] + 1;
-        leaf = path->nodes[0];
+        slot = path->slots[0] + 1;
        nritems = btrfs_header_nritems(leaf);
        if (slot != nritems) {
                /* shift the items */
                memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
-                              btrfs_item_nr_offset(slot),
+                                btrfs_item_nr_offset(slot),
-                              (nritems - slot) * sizeof(struct btrfs_item));
+                                (nritems - slot) * sizeof(struct btrfs_item));
        }
        btrfs_cpu_key_to_disk(&disk_key, new_key);
@@ -3113,16 +3125,81 @@ split:
                            item_size - split_offset);
        btrfs_mark_buffer_dirty(leaf);
-        ret = 0;
+        BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
-        if (btrfs_leaf_free_space(root, leaf) < 0) {
-                btrfs_print_leaf(root, leaf);
-                BUG();
-        }
        kfree(buf);
+        return 0;
+}
+/*
+ * This function splits a single item into two items,
+ * giving 'new_key' to the new item and splitting the
+ * old one at split_offset (from the start of the item).
+ *
+ * The path may be released by this operation.  After
+ * the split, the path is pointing to the old item.  The
+ * new item is going to be in the same node as the old one.
+ *
+ * Note, the item being split must be small enough to live alone on
+ * a tree block with room for one extra struct btrfs_item
+ *
+ * This allows us to split the item in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_split_item(struct btrfs_trans_handle *trans,
+                     struct btrfs_root *root,
+                     struct btrfs_path *path,
+                     struct btrfs_key *new_key,
+                     unsigned long split_offset)
+{
+        int ret;
+        ret = setup_leaf_for_split(trans, root, path,
+                                   sizeof(struct btrfs_item));
+        if (ret)
+                return ret;
+        ret = split_item(trans, root, path, new_key, split_offset);
        return ret;
 }
 /*
+ * This function duplicates an item, giving 'new_key' to the new item.
+ * It guarantees both items live in the same tree leaf and the new item
+ * is contiguous with the original item.
+ *
+ * This allows us to split a file extent in place, keeping a lock on the
+ * leaf the entire time.
+ */
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path,
+                         struct btrfs_key *new_key)
+{
+        struct extent_buffer *leaf;
+        int ret;
+        u32 item_size;
+        leaf = path->nodes[0];
+        item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+        ret = setup_leaf_for_split(trans, root, path,
+                                   item_size + sizeof(struct btrfs_item));
+        if (ret)
+                return ret;
+        path->slots[0]++;
+        ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
+                                     item_size, item_size +
+                                     sizeof(struct btrfs_item), 1);
+        BUG_ON(ret);
+        leaf = path->nodes[0];
+        memcpy_extent_buffer(leaf,
+                             btrfs_item_ptr_offset(leaf, path->slots[0]),
+                             btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
+                             item_size);
+        return 0;
+}
+/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
@@ -3714,8 +3791,8 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
         */
        btrfs_unlock_up_safe(path, 0);
-        ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
+        ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
-                                0, root->root_key.objectid, 0, 0);
+                                    0, root->root_key.objectid, 0);
        return ret;
 }
 /*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 444b3e9b92a4..9f806dd04c27 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -310,6 +310,9 @@ struct btrfs_header {
 #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
                                        sizeof(struct btrfs_item) - \
                                        sizeof(struct btrfs_file_extent_item))
+#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
+                                 sizeof(struct btrfs_item) -\
+                                 sizeof(struct btrfs_dir_item))
 /*
@@ -859,8 +862,9 @@ struct btrfs_fs_info {
        struct mutex ordered_operations_mutex;
        struct rw_semaphore extent_commit_sem;
-        struct rw_semaphore subvol_sem;
+        struct rw_semaphore cleanup_work_sem;
+        struct rw_semaphore subvol_sem;
        struct srcu_struct subvol_srcu;
        struct list_head trans_list;
@@ -868,6 +872,9 @@ struct btrfs_fs_info {
        struct list_head dead_roots;
        struct list_head caching_block_groups;
+        spinlock_t delayed_iput_lock;
+        struct list_head delayed_iputs;
        atomic_t nr_async_submits;
        atomic_t async_submit_draining;
        atomic_t nr_async_bios;
@@ -1034,12 +1041,12 @@ struct btrfs_root {
        int ref_cows;
        int track_dirty;
        int in_radix;
+        int clean_orphans;
        u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
        struct btrfs_key defrag_max;
        int defrag_running;
-        int defrag_level;
        char *name;
        int in_sysfs;
@@ -1975,6 +1982,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
                                        u64 hint, u64 empty_size);
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          u64 bytenr, u32 blocksize,
+                          u64 parent, u64 root_objectid, int level);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -2089,6 +2100,10 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
                     struct btrfs_path *path,
                     struct btrfs_key *new_key,
                     unsigned long split_offset);
+int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root,
+                         struct btrfs_path *path,
+                         struct btrfs_key *new_key);
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, struct btrfs_path *p, int
                      ins_len, int cow);
@@ -2196,9 +2211,10 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
                              struct btrfs_path *path,
                              struct btrfs_dir_item *di);
 int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, const char *name,
+                            struct btrfs_root *root,
-                            u16 name_len, const void *data, u16 data_len,
+                            struct btrfs_path *path, u64 objectid,
-                            u64 dir);
+                            const char *name, u16 name_len,
+                            const void *data, u16 data_len);
 struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path, u64 dir,
@@ -2292,7 +2308,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               struct inode *inode, u64 new_size,
                               u32 min_type);
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
 int btrfs_writepages(struct address_space *mapping,
                     struct writeback_control *wbc);
@@ -2332,6 +2348,8 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_add_delayed_iput(struct inode *inode);
+void btrfs_run_delayed_iputs(struct btrfs_root *root);
 extern const struct dentry_operations btrfs_dentry_operations;
 /* ioctl.c */
@@ -2345,12 +2363,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                            int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
 extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-                       struct btrfs_root *root, struct inode *inode,
+                       u64 start, u64 end, u64 *hint_byte, int drop_cache);
-                       u64 start, u64 end, u64 locked_end,
-                       u64 inline_limit, u64 *hint_block, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
                              struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
@@ -2380,7 +2395,8 @@ int btrfs_check_acl(struct inode *inode, int mask);
 #else
 #define btrfs_check_acl NULL
 #endif
-int btrfs_init_acl(struct inode *inode, struct inode *dir);
+int btrfs_init_acl(struct btrfs_trans_handle *trans,
+                   struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 /* relocation.c */
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f3a6075519cc..e9103b3baa49 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
 * into the tree
 */
 int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, const char *name,
+                            struct btrfs_root *root,
-                            u16 name_len, const void *data, u16 data_len,
+                            struct btrfs_path *path, u64 objectid,
-                            u64 dir)
+                            const char *name, u16 name_len,
+                            const void *data, u16 data_len)
 {
        int ret = 0;
-        struct btrfs_path *path;
        struct btrfs_dir_item *dir_item;
        unsigned long name_ptr, data_ptr;
        struct btrfs_key key, location;
@@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
        struct extent_buffer *leaf;
        u32 data_size;
-        key.objectid = dir;
+        BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
+        key.objectid = objectid;
        btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
        key.offset = btrfs_name_hash(name, name_len);
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
-        if (name_len + data_len + sizeof(struct btrfs_dir_item) >
-            BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item))
-                return -ENOSPC;
        data_size = sizeof(*dir_item) + name_len + data_len;
        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
@@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
        write_extent_buffer(leaf, data, data_ptr, data_len);
        btrfs_mark_buffer_dirty(path->nodes[0]);
-        btrfs_free_path(path);
        return ret;
 }
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 02b6afbd7450..009e3bd18f23 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->stripesize = stripesize;
        root->ref_cows = 0;
        root->track_dirty = 0;
+        root->in_radix = 0;
+        root->clean_orphans = 0;
        root->fs_info = fs_info;
        root->objectid = objectid;
@@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->defrag_trans_start = fs_info->generation;
        init_completion(&root->kobj_unregister);
        root->defrag_running = 0;
-        root->defrag_level = 0;
        root->root_key.objectid = objectid;
        root->anon_super.s_root = NULL;
        root->anon_super.s_dev = 0;
@@ -980,12 +981,12 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
        while (1) {
                ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-                                    0, &start, &end, EXTENT_DIRTY);
+                                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
                if (ret)
                        break;
-                clear_extent_dirty(&log_root_tree->dirty_log_pages,
+                clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-                                   start, end, GFP_NOFS);
+                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
        }
        eb = fs_info->log_root_tree->node;
@@ -1210,8 +1211,10 @@ again:
        ret = radix_tree_insert(&fs_info->fs_roots_radix,
                                (unsigned long)root->root_key.objectid,
                                root);
-        if (ret == 0)
+        if (ret == 0) {
                root->in_radix = 1;
+                root->clean_orphans = 1;
+        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
        radix_tree_preload_end();
        if (ret) {
@@ -1225,10 +1228,6 @@ again:
        ret = btrfs_find_dead_roots(fs_info->tree_root,
                                    root->root_key.objectid);
        WARN_ON(ret);
-        if (!(fs_info->sb->s_flags & MS_RDONLY))
-                btrfs_orphan_cleanup(root);
        return root;
 fail:
        free_fs_root(root);
@@ -1477,6 +1476,7 @@ static int cleaner_kthread(void *arg)
                if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
                    mutex_trylock(&root->fs_info->cleaner_mutex)) {
+                        btrfs_run_delayed_iputs(root);
                        btrfs_clean_old_snapshots(root);
                        mutex_unlock(&root->fs_info->cleaner_mutex);
                }
@@ -1606,6 +1606,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
        INIT_LIST_HEAD(&fs_info->trans_list);
        INIT_LIST_HEAD(&fs_info->dead_roots);
+        INIT_LIST_HEAD(&fs_info->delayed_iputs);
        INIT_LIST_HEAD(&fs_info->hashers);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
        INIT_LIST_HEAD(&fs_info->ordered_operations);
@@ -1614,6 +1615,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->new_trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
        spin_lock_init(&fs_info->fs_roots_radix_lock);
+        spin_lock_init(&fs_info->delayed_iput_lock);
        init_completion(&fs_info->kobj_unregister);
        fs_info->tree_root = tree_root;
@@ -1689,6 +1691,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        mutex_init(&fs_info->cleaner_mutex);
        mutex_init(&fs_info->volume_mutex);
        init_rwsem(&fs_info->extent_commit_sem);
+        init_rwsem(&fs_info->cleanup_work_sem);
        init_rwsem(&fs_info->subvol_sem);
        btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
@@ -2386,8 +2389,14 @@ int btrfs_commit_super(struct btrfs_root *root)
        int ret;
        mutex_lock(&root->fs_info->cleaner_mutex);
+        btrfs_run_delayed_iputs(root);
        btrfs_clean_old_snapshots(root);
        mutex_unlock(&root->fs_info->cleaner_mutex);
+        /* wait until ongoing cleanup work is done */
+        down_write(&root->fs_info->cleanup_work_sem);
+        up_write(&root->fs_info->cleanup_work_sem);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
        BUG_ON(ret);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 94627c4cc193..56e50137d0e6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -195,6 +195,14 @@ static int exclude_super_stripes(struct btrfs_root *root,
        int stripe_len;
        int i, nr, ret;
+        if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
+                stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
+                cache->bytes_super += stripe_len;
+                ret = add_excluded_extent(root, cache->key.objectid,
+                                          stripe_len);
+                BUG_ON(ret);
+        }
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                bytenr = btrfs_sb_offset(i);
                ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
@@ -255,7 +263,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
                if (ret)
                        break;
-                if (extent_start == start) {
+                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
@@ -2880,9 +2888,9 @@ static noinline void flush_delalloc_async(struct btrfs_work *work)
        root = async->root;
        info = async->info;
-        btrfs_start_delalloc_inodes(root);
+        btrfs_start_delalloc_inodes(root, 0);
        wake_up(&info->flush_wait);
-        btrfs_wait_ordered_extents(root, 0);
+        btrfs_wait_ordered_extents(root, 0, 0);
        spin_lock(&info->lock);
        info->flushing = 0;
@@ -2956,8 +2964,8 @@ static void flush_delalloc(struct btrfs_root *root,
        return;
 flush:
-        btrfs_start_delalloc_inodes(root);
+        btrfs_start_delalloc_inodes(root, 0);
-        btrfs_wait_ordered_extents(root, 0);
+        btrfs_wait_ordered_extents(root, 0, 0);
        spin_lock(&info->lock);
        info->flushing = 0;
@@ -3454,14 +3462,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
        else
                old_val -= num_bytes;
        btrfs_set_super_bytes_used(&info->super_copy, old_val);
-        /* block accounting for root item */
-        old_val = btrfs_root_used(&root->root_item);
-        if (alloc)
-                old_val += num_bytes;
-        else
-                old_val -= num_bytes;
-        btrfs_set_root_used(&root->root_item, old_val);
        spin_unlock(&info->delalloc_lock);
        while (total) {
@@ -4049,6 +4049,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
        return ret;
 }
+int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          u64 bytenr, u32 blocksize,
+                          u64 parent, u64 root_objectid, int level)
+{
+        u64 used;
+        spin_lock(&root->node_lock);
+        used = btrfs_root_used(&root->root_item) - blocksize;
+        btrfs_set_root_used(&root->root_item, used);
+        spin_unlock(&root->node_lock);
+        return btrfs_free_extent(trans, root, bytenr, blocksize,
+                                 parent, root_objectid, level, 0);
+}
 static u64 stripe_align(struct btrfs_root *root, u64 val)
 {
        u64 mask = ((u64)root->stripesize - 1);
@@ -4578,7 +4593,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
        u64 search_start = 0;
-        struct btrfs_fs_info *info = root->fs_info;
        data = btrfs_get_alloc_profile(root, data);
 again:
@@ -4586,17 +4600,9 @@ again:
         * the only place that sets empty_size is btrfs_realloc_node, which
         * is not called recursively on allocations
         */
-        if (empty_size || root->ref_cows) {
+        if (empty_size || root->ref_cows)
-                if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
-                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                                     2 * 1024 * 1024,
-                                     BTRFS_BLOCK_GROUP_METADATA |
-                                     (info->metadata_alloc_profile &
-                                      info->avail_metadata_alloc_bits), 0);
-                }
                ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                     num_bytes + 2 * 1024 * 1024, data, 0);
-        }
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -4897,6 +4903,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
                                        extent_op);
                BUG_ON(ret);
        }
+        if (root_objectid == root->root_key.objectid) {
+                u64 used;
+                spin_lock(&root->node_lock);
+                used = btrfs_root_used(&root->root_item) + num_bytes;
+                btrfs_set_root_used(&root->root_item, used);
+                spin_unlock(&root->node_lock);
+        }
        return ret;
 }
@@ -4919,8 +4933,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        btrfs_set_buffer_uptodate(buf);
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-                set_extent_dirty(&root->dirty_log_pages, buf->start,
+                /*
-                         buf->start + buf->len - 1, GFP_NOFS);
+                 * we allow two log transactions at a time, use different
+                 * EXTENT bits to differentiate dirty pages.
+                 */
+                if (root->log_transid % 2 == 0)
+                        set_extent_dirty(&root->dirty_log_pages, buf->start,
+                                        buf->start + buf->len - 1, GFP_NOFS);
+                else
+                        set_extent_new(&root->dirty_log_pages, buf->start,
+                                        buf->start + buf->len - 1, GFP_NOFS);
        } else {
                set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 77f759302e12..feaa13b105d9 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                }
                flags = em->flags;
                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
-                        if (em->start <= start &&
+                        if (testend && em->start + em->len >= start + len) {
-                            (!testend || em->start + em->len >= start + len)) {
                                free_extent_map(em);
                                write_unlock(&em_tree->lock);
                                break;
                        }
-                        if (start < em->start) {
+                        start = em->start + em->len;
-                                len = em->start - start;
+                        if (testend)
-                        } else {
                                len = start + len - (em->start + em->len);
-                                start = em->start + em->len;
-                        }
                        free_extent_map(em);
                        write_unlock(&em_tree->lock);
                        continue;
@@ -265,319 +261,247 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
- *
- * inline_limit is used to tell this code which offsets in the file to keep
- * if they contain inline extents.
 */
-noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-                       struct btrfs_root *root, struct inode *inode,
+                       u64 start, u64 end, u64 *hint_byte, int drop_cache)
-                       u64 start, u64 end, u64 locked_end,
-                       u64 inline_limit, u64 *hint_byte, int drop_cache)
 {
-        u64 extent_end = 0;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
-        u64 search_start = start;
-        u64 ram_bytes = 0;
-        u64 disk_bytenr = 0;
-        u64 orig_locked_end = locked_end;
-        u8 compression;
-        u8 encryption;
-        u16 other_encoding = 0;
        struct extent_buffer *leaf;
-        struct btrfs_file_extent_item *extent;
+        struct btrfs_file_extent_item *fi;
        struct btrfs_path *path;
        struct btrfs_key key;
-        struct btrfs_file_extent_item old;
+        struct btrfs_key new_key;
-        int keep;
+        u64 search_start = start;
-        int slot;
+        u64 disk_bytenr = 0;
-        int bookend;
+        u64 num_bytes = 0;
-        int found_type = 0;
+        u64 extent_offset = 0;
-        int found_extent;
+        u64 extent_end = 0;
-        int found_inline;
+        int del_nr = 0;
+        int del_slot = 0;
+        int extent_type;
        int recow;
        int ret;
-        inline_limit = 0;
        if (drop_cache)
                btrfs_drop_extent_cache(inode, start, end - 1, 0);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        while (1) {
                recow = 0;
-                btrfs_release_path(root, path);
                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
                                               search_start, -1);
                if (ret < 0)
-                        goto out;
+                        break;
-                if (ret > 0) {
+                if (ret > 0 && path->slots[0] > 0 && search_start == start) {
-                        if (path->slots[0] == 0) {
+                        leaf = path->nodes[0];
-                                ret = 0;
+                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
-                                goto out;
+                        if (key.objectid == inode->i_ino &&
-                        }
+                            key.type == BTRFS_EXTENT_DATA_KEY)
-                        path->slots[0]--;
+                                path->slots[0]--;
                }
+                ret = 0;
 next_slot:
-                keep = 0;
-                bookend = 0;
-                found_extent = 0;
-                found_inline = 0;
-                compression = 0;
-                encryption = 0;
-                extent = NULL;
                leaf = path->nodes[0];
-                slot = path->slots[0];
+                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
-                ret = 0;
+                        BUG_ON(del_nr > 0);
-                btrfs_item_key_to_cpu(leaf, &key, slot);
+                        ret = btrfs_next_leaf(root, path);
-                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
+                        if (ret < 0)
-                    key.offset >= end) {
+                                break;
-                        goto out;
+                        if (ret > 0) {
-                }
+                                ret = 0;
-                if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
+                                break;
-                    key.objectid != inode->i_ino) {
-                        goto out;
-                }
-                if (recow) {
-                        search_start = max(key.offset, start);
-                        continue;
-                }
-                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
-                        extent = btrfs_item_ptr(leaf, slot,
-                                                struct btrfs_file_extent_item);
-                        found_type = btrfs_file_extent_type(leaf, extent);
-                        compression = btrfs_file_extent_compression(leaf,
-                                                                    extent);
-                        encryption = btrfs_file_extent_encryption(leaf,
-                                                                  extent);
-                        other_encoding = btrfs_file_extent_other_encoding(leaf,
-                                                                  extent);
-                        if (found_type == BTRFS_FILE_EXTENT_REG ||
-                            found_type == BTRFS_FILE_EXTENT_PREALLOC) {
-                                extent_end =
-                                     btrfs_file_extent_disk_bytenr(leaf,
-                                                                   extent);
-                                if (extent_end)
-                                        *hint_byte = extent_end;
-                                extent_end = key.offset +
-                                     btrfs_file_extent_num_bytes(leaf, extent);
-                                ram_bytes = btrfs_file_extent_ram_bytes(leaf,
-                                                                extent);
-                                found_extent = 1;
-                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
-                                found_inline = 1;
-                                extent_end = key.offset +
-                                     btrfs_file_extent_inline_len(leaf, extent);
                        }
+                        leaf = path->nodes[0];
+                        recow = 1;
+                }
+                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+                if (key.objectid > inode->i_ino ||
+                    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+                        break;
+                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+                extent_type = btrfs_file_extent_type(leaf, fi);
+                if (extent_type == BTRFS_FILE_EXTENT_REG ||
+                    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+                        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+                        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+                        extent_offset = btrfs_file_extent_offset(leaf, fi);
+                        extent_end = key.offset +
+                                btrfs_file_extent_num_bytes(leaf, fi);
+                } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+                        extent_end = key.offset +
+                                btrfs_file_extent_inline_len(leaf, fi);
                } else {
+                        WARN_ON(1);
                        extent_end = search_start;
                }
-                /* we found nothing we can drop */
+                if (extent_end <= search_start) {
-                if ((!found_extent && !found_inline) ||
+                        path->slots[0]++;
-                    search_start >= extent_end) {
-                        int nextret;
-                        u32 nritems;
-                        nritems = btrfs_header_nritems(leaf);
-                        if (slot >= nritems - 1) {
-                                nextret = btrfs_next_leaf(root, path);
-                                if (nextret)
-                                        goto out;
-                                recow = 1;
-                        } else {
-                                path->slots[0]++;
-                        }
                        goto next_slot;
                }
-                if (end <= extent_end && start >= key.offset && found_inline)
+                search_start = max(key.offset, start);
-                        *hint_byte = EXTENT_MAP_INLINE;
+                if (recow) {
+                        btrfs_release_path(root, path);
-                if (found_extent) {
+                        continue;
-                        read_extent_buffer(leaf, &old, (unsigned long)extent,
-                                           sizeof(old));
-                }
-                if (end < extent_end && end >= key.offset) {
-                        bookend = 1;
-                        if (found_inline && start <= key.offset)
-                                keep = 1;
                }
-                if (bookend && found_extent) {
+                /*
-                        if (locked_end < extent_end) {
+                 *     | - range to drop - |
-                                ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+                 *  | -------- extent -------- |
-                                                locked_end, extent_end - 1,
+                 */
-                                                GFP_NOFS);
+                if (start > key.offset && end < extent_end) {
-                                if (!ret) {
+                        BUG_ON(del_nr > 0);
-                                        btrfs_release_path(root, path);
+                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
-                                        lock_extent(&BTRFS_I(inode)->io_tree,
-                                                locked_end, extent_end - 1,
+                        memcpy(&new_key, &key, sizeof(new_key));
-                                                GFP_NOFS);
+                        new_key.offset = start;
-                                        locked_end = extent_end;
+                        ret = btrfs_duplicate_item(trans, root, path,
-                                        continue;
+                                                   &new_key);
-                                }
+                        if (ret == -EAGAIN) {
-                                locked_end = extent_end;
+                                btrfs_release_path(root, path);
+                                continue;
                        }
-                        disk_bytenr = le64_to_cpu(old.disk_bytenr);
+                        if (ret < 0)
-                        if (disk_bytenr != 0) {
+                                break;
+                        leaf = path->nodes[0];
+                        fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
+                                            struct btrfs_file_extent_item);
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        start - key.offset);
+                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_file_extent_item);
+                        extent_offset += start - key.offset;
+                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                        extent_end - start);
+                        btrfs_mark_buffer_dirty(leaf);
+                        if (disk_bytenr > 0) {
                                ret = btrfs_inc_extent_ref(trans, root,
-                                           disk_bytenr,
+                                                disk_bytenr, num_bytes, 0,
-                                           le64_to_cpu(old.disk_num_bytes), 0,
+                                                root->root_key.objectid,
-                                           root->root_key.objectid,
+                                                new_key.objectid,
-                                           key.objectid, key.offset -
+                                                start - extent_offset);
-                                           le64_to_cpu(old.offset));
                                BUG_ON(ret);
+                                *hint_byte = disk_bytenr;
                        }
+                        key.offset = start;
                }
+                /*
+                 *  | ---- range to drop ----- |
+                 *      | -------- extent -------- |
+                 */
+                if (start <= key.offset && end < extent_end) {
+                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
-                if (found_inline) {
+                        memcpy(&new_key, &key, sizeof(new_key));
-                        u64 mask = root->sectorsize - 1;
+                        new_key.offset = end;
-                        search_start = (extent_end + mask) & ~mask;
+                        btrfs_set_item_key_safe(trans, root, path, &new_key);
-                } else
-                        search_start = extent_end;
+                        extent_offset += end - key.offset;
+                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
-                /* truncate existing extent */
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
-                if (start > key.offset) {
+                                                        extent_end - end);
-                        u64 new_num;
+                        btrfs_mark_buffer_dirty(leaf);
-                        u64 old_num;
+                        if (disk_bytenr > 0) {
-                        keep = 1;
+                                inode_sub_bytes(inode, end - key.offset);
-                        WARN_ON(start & (root->sectorsize - 1));
+                                *hint_byte = disk_bytenr;
-                        if (found_extent) {
-                                new_num = start - key.offset;
-                                old_num = btrfs_file_extent_num_bytes(leaf,
-                                                                      extent);
-                                *hint_byte =
-                                        btrfs_file_extent_disk_bytenr(leaf,
-                                                                      extent);
-                                if (btrfs_file_extent_disk_bytenr(leaf,
-                                                                  extent)) {
-                                        inode_sub_bytes(inode, old_num -
-                                                        new_num);
-                                }
-                                btrfs_set_file_extent_num_bytes(leaf,
-                                                        extent, new_num);
-                                btrfs_mark_buffer_dirty(leaf);
-                        } else if (key.offset < inline_limit &&
-                                   (end > extent_end) &&
-                                   (inline_limit < extent_end)) {
-                                u32 new_size;
-                                new_size = btrfs_file_extent_calc_inline_size(
-                                                   inline_limit - key.offset);
-                                inode_sub_bytes(inode, extent_end -
-                                                inline_limit);
-                                btrfs_set_file_extent_ram_bytes(leaf, extent,
-                                                        new_size);
-                                if (!compression && !encryption) {
-                                        btrfs_truncate_item(trans, root, path,
-                                                            new_size, 1);
-                                }
                        }
+                        break;
                }
-                /* delete the entire extent */
-                if (!keep) {
-                        if (found_inline)
-                                inode_sub_bytes(inode, extent_end -
-                                                key.offset);
-                        ret = btrfs_del_item(trans, root, path);
-                        /* TODO update progress marker and return */
-                        BUG_ON(ret);
-                        extent = NULL;
-                        btrfs_release_path(root, path);
-                        /* the extent will be freed later */
-                }
-                if (bookend && found_inline && start <= key.offset) {
-                        u32 new_size;
-                        new_size = btrfs_file_extent_calc_inline_size(
-                                                   extent_end - end);
-                        inode_sub_bytes(inode, end - key.offset);
-                        btrfs_set_file_extent_ram_bytes(leaf, extent,
-                                                        new_size);
-                        if (!compression && !encryption)
-                                ret = btrfs_truncate_item(trans, root, path,
-                                                          new_size, 0);
-                        BUG_ON(ret);
-                }
-                /* create bookend, splitting the extent in two */
-                if (bookend && found_extent) {
-                        struct btrfs_key ins;
-                        ins.objectid = inode->i_ino;
-                        ins.offset = end;
-                        btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
-                        btrfs_release_path(root, path);
+                search_start = extent_end;
-                        path->leave_spinning = 1;
+                /*
-                        ret = btrfs_insert_empty_item(trans, root, path, &ins,
+                 *       | ---- range to drop ----- |
-                                                      sizeof(*extent));
+                 *  | -------- extent -------- |
-                        BUG_ON(ret);
+                 */
+                if (start > key.offset && end >= extent_end) {
+                        BUG_ON(del_nr > 0);
+                        BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
-                        leaf = path->nodes[0];
+                        btrfs_set_file_extent_num_bytes(leaf, fi,
-                        extent = btrfs_item_ptr(leaf, path->slots[0],
+                                                        start - key.offset);
-                                                struct btrfs_file_extent_item);
+                        btrfs_mark_buffer_dirty(leaf);
-                        write_extent_buffer(leaf, &old,
+                        if (disk_bytenr > 0) {
-                                            (unsigned long)extent, sizeof(old));
+                                inode_sub_bytes(inode, extent_end - start);
+                                *hint_byte = disk_bytenr;
-                        btrfs_set_file_extent_compression(leaf, extent,
+                        }
-                                                          compression);
+                        if (end == extent_end)
-                        btrfs_set_file_extent_encryption(leaf, extent,
+                                break;
-                                                         encryption);
-                        btrfs_set_file_extent_other_encoding(leaf, extent,
-                                                             other_encoding);
-                        btrfs_set_file_extent_offset(leaf, extent,
-                                    le64_to_cpu(old.offset) + end - key.offset);
-                        WARN_ON(le64_to_cpu(old.num_bytes) <
-                                (extent_end - end));
-                        btrfs_set_file_extent_num_bytes(leaf, extent,
-                                                        extent_end - end);
-                        /*
+                        path->slots[0]++;
-                         * set the ram bytes to the size of the full extent
+                        goto next_slot;
-                         * before splitting.  This is a worst case flag,
-                         * but its the best we can do because we don't know
-                         * how splitting affects compression
-                         */
-                        btrfs_set_file_extent_ram_bytes(leaf, extent,
-                                                        ram_bytes);
-                        btrfs_set_file_extent_type(leaf, extent, found_type);
-                        btrfs_unlock_up_safe(path, 1);
-                        btrfs_mark_buffer_dirty(path->nodes[0]);
-                        btrfs_set_lock_blocking(path->nodes[0]);
-                        path->leave_spinning = 0;
-                        btrfs_release_path(root, path);
-                        if (disk_bytenr != 0)
-                                inode_add_bytes(inode, extent_end - end);
                }
-                if (found_extent && !keep) {
+                /*
-                        u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);
+                 *  | ---- range to drop ----- |
+                 *    | ------ extent ------ |
+                 */
+                if (start <= key.offset && end >= extent_end) {
+                        if (del_nr == 0) {
+                                del_slot = path->slots[0];
+                                del_nr = 1;
+                        } else {
+                                BUG_ON(del_slot + del_nr != path->slots[0]);
+                                del_nr++;
+                        }
-                        if (old_disk_bytenr != 0) {
+                        if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
                                inode_sub_bytes(inode,
-                                                le64_to_cpu(old.num_bytes));
+                                                extent_end - key.offset);
+                                extent_end = ALIGN(extent_end,
+                                                   root->sectorsize);
+                        } else if (disk_bytenr > 0) {
                                ret = btrfs_free_extent(trans, root,
-                                                old_disk_bytenr,
+                                                disk_bytenr, num_bytes, 0,
-                                                le64_to_cpu(old.disk_num_bytes),
+                                                root->root_key.objectid,
-                                                0, root->root_key.objectid,
                                                key.objectid, key.offset -
-                                                le64_to_cpu(old.offset));
+                                                extent_offset);
                                BUG_ON(ret);
-                                *hint_byte = old_disk_bytenr;
+                                inode_sub_bytes(inode,
+                                                extent_end - key.offset);
+                                *hint_byte = disk_bytenr;
                        }
-                }
-                if (search_start >= end) {
+                        if (end == extent_end)
-                        ret = 0;
+                                break;
-                        goto out;
+                        if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
+                                path->slots[0]++;
+                                goto next_slot;
+                        }
+                        ret = btrfs_del_items(trans, root, path, del_slot,
+                                              del_nr);
+                        BUG_ON(ret);
+                        del_nr = 0;
+                        del_slot = 0;
+                        btrfs_release_path(root, path);
+                        continue;
                }
+                BUG_ON(1);
        }
-out:
-        btrfs_free_path(path);
+        if (del_nr > 0) {
-        if (locked_end > orig_locked_end) {
+                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
-                unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
+                BUG_ON(ret);
-                              locked_end - 1, GFP_NOFS);
        }
+        btrfs_free_path(path);
        return ret;
 }
@@ -620,23 +544,23 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot,
 * two or three.
 */
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
                              struct inode *inode, u64 start, u64 end)
 {
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
+        struct btrfs_key new_key;
        u64 bytenr;
        u64 num_bytes;
        u64 extent_end;
        u64 orig_offset;
        u64 other_start;
        u64 other_end;
-        u64 split = start;
+        u64 split;
-        u64 locked_end = end;
+        int del_nr = 0;
-        int extent_type;
+        int del_slot = 0;
-        int split_end = 1;
        int ret;
        btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -644,12 +568,10 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 again:
+        split = start;
        key.objectid = inode->i_ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
-        if (split == start)
+        key.offset = split;
-                key.offset = split;
-        else
-                key.offset = split - 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0 && path->slots[0] > 0)
@@ -661,8 +583,8 @@ again:
               key.type != BTRFS_EXTENT_DATA_KEY);
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
-        extent_type = btrfs_file_extent_type(leaf, fi);
+        BUG_ON(btrfs_file_extent_type(leaf, fi) !=
-        BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
+               BTRFS_FILE_EXTENT_PREALLOC);
        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
        BUG_ON(key.offset > start || extent_end < end);
@@ -670,150 +592,91 @@ again:
        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
        orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
-        if (key.offset == start)
+        while (start > key.offset || end < extent_end) {
-                split = end;
+                if (key.offset == start)
+                        split = end;
-        if (key.offset == start && extent_end == end) {
-                int del_nr = 0;
+                memcpy(&new_key, &key, sizeof(new_key));
-                int del_slot = 0;
+                new_key.offset = split;
-                other_start = end;
+                ret = btrfs_duplicate_item(trans, root, path, &new_key);
-                other_end = 0;
+                if (ret == -EAGAIN) {
-                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
+                        btrfs_release_path(root, path);
-                                     bytenr, &other_start, &other_end)) {
+                        goto again;
-                        extent_end = other_end;
-                        del_slot = path->slots[0] + 1;
-                        del_nr++;
-                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-                                                0, root->root_key.objectid,
-                                                inode->i_ino, orig_offset);
-                        BUG_ON(ret);
-                }
-                other_start = 0;
-                other_end = start;
-                if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
-                                     bytenr, &other_start, &other_end)) {
-                        key.offset = other_start;
-                        del_slot = path->slots[0];
-                        del_nr++;
-                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-                                                0, root->root_key.objectid,
-                                                inode->i_ino, orig_offset);
-                        BUG_ON(ret);
-                }
-                split_end = 0;
-                if (del_nr == 0) {
-                        btrfs_set_file_extent_type(leaf, fi,
-                                                   BTRFS_FILE_EXTENT_REG);
-                        goto done;
                }
+                BUG_ON(ret < 0);
-                fi = btrfs_item_ptr(leaf, del_slot - 1,
+                leaf = path->nodes[0];
+                fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
                                    struct btrfs_file_extent_item);
-                btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
                btrfs_set_file_extent_num_bytes(leaf, fi,
-                                                extent_end - key.offset);
+                                                split - key.offset);
+                fi = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+                btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
+                btrfs_set_file_extent_num_bytes(leaf, fi,
+                                                extent_end - split);
                btrfs_mark_buffer_dirty(leaf);
-                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+                ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+                                           root->root_key.objectid,
+                                           inode->i_ino, orig_offset);
                BUG_ON(ret);
-                goto release;
-        } else if (split == start) {
-                if (locked_end < extent_end) {
-                        ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
-                                        locked_end, extent_end - 1, GFP_NOFS);
-                        if (!ret) {
-                                btrfs_release_path(root, path);
-                                lock_extent(&BTRFS_I(inode)->io_tree,
-                                        locked_end, extent_end - 1, GFP_NOFS);
-                                locked_end = extent_end;
-                                goto again;
-                        }
-                        locked_end = extent_end;
-                }
-                btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
-        } else  {
-                BUG_ON(key.offset != start);
-                key.offset = split;
-                btrfs_set_file_extent_offset(leaf, fi, key.offset -
-                                             orig_offset);
-                btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
-                btrfs_set_item_key_safe(trans, root, path, &key);
-                extent_end = split;
-        }
-        if (extent_end == end) {
+                if (split == start) {
-                split_end = 0;
+                        key.offset = start;
-                extent_type = BTRFS_FILE_EXTENT_REG;
+                } else {
-        }
+                        BUG_ON(start != key.offset);
-        if (extent_end == end && split == start) {
-                other_start = end;
-                other_end = 0;
-                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
-                                     bytenr, &other_start, &other_end)) {
-                        path->slots[0]++;
-                        fi = btrfs_item_ptr(leaf, path->slots[0],
-                                            struct btrfs_file_extent_item);
-                        key.offset = split;
-                        btrfs_set_item_key_safe(trans, root, path, &key);
-                        btrfs_set_file_extent_offset(leaf, fi, key.offset -
-                                                     orig_offset);
-                        btrfs_set_file_extent_num_bytes(leaf, fi,
-                                                        other_end - split);
-                        goto done;
-                }
-        }
-        if (extent_end == end && split == end) {
-                other_start = 0;
-                other_end = start;
-                if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
-                                     bytenr, &other_start, &other_end)) {
                        path->slots[0]--;
-                        fi = btrfs_item_ptr(leaf, path->slots[0],
+                        extent_end = end;
-                                            struct btrfs_file_extent_item);
-                        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
-                                                        other_start);
-                        goto done;
                }
        }
-        btrfs_mark_buffer_dirty(leaf);
-        ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
-                                   root->root_key.objectid,
-                                   inode->i_ino, orig_offset);
-        BUG_ON(ret);
-        btrfs_release_path(root, path);
-        key.offset = start;
-        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
-        BUG_ON(ret);
-        leaf = path->nodes[0];
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
-        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
-        btrfs_set_file_extent_type(leaf, fi, extent_type);
-        btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
-        btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
-        btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
-        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
-        btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
-        btrfs_set_file_extent_compression(leaf, fi, 0);
-        btrfs_set_file_extent_encryption(leaf, fi, 0);
-        btrfs_set_file_extent_other_encoding(leaf, fi, 0);
-done:
-        btrfs_mark_buffer_dirty(leaf);
-release:
+        other_start = end;
-        btrfs_release_path(root, path);
+        other_end = 0;
-        if (split_end && split == start) {
+        if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
-                split = end;
+                             bytenr, &other_start, &other_end)) {
-                goto again;
+                extent_end = other_end;
+                del_slot = path->slots[0] + 1;
+                del_nr++;
+                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+                                        0, root->root_key.objectid,
+                                        inode->i_ino, orig_offset);
+                BUG_ON(ret);
        }
-        if (locked_end > end) {
+        other_start = 0;
-                unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
+        other_end = start;
-                              GFP_NOFS);
+        if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
+                             bytenr, &other_start, &other_end)) {
+                key.offset = other_start;
+                del_slot = path->slots[0];
+                del_nr++;
+                ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+                                        0, root->root_key.objectid,
+                                        inode->i_ino, orig_offset);
+                BUG_ON(ret);
        }
+        if (del_nr == 0) {
+                btrfs_set_file_extent_type(leaf, fi,
+                                           BTRFS_FILE_EXTENT_REG);
+                btrfs_mark_buffer_dirty(leaf);
+                goto out;
+        }
+        fi = btrfs_item_ptr(leaf, del_slot - 1,
+                            struct btrfs_file_extent_item);
+        btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
+        btrfs_set_file_extent_num_bytes(leaf, fi,
+                                        extent_end - key.offset);
+        btrfs_mark_buffer_dirty(leaf);
+        ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
+        BUG_ON(ret);
+out:
        btrfs_free_path(path);
        return 0;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b3ad168a0bfc..5440bab23635 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,13 +88,14 @@ static noinline int cow_file_range(struct inode *inode,
                                   u64 start, u64 end, int *page_started,
                                   unsigned long *nr_written, int unlock);
-static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
+static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
+                                     struct inode *inode,  struct inode *dir)
 {
        int err;
-        err = btrfs_init_acl(inode, dir);
+        err = btrfs_init_acl(trans, inode, dir);
        if (!err)
-                err = btrfs_xattr_security_init(inode, dir);
+                err = btrfs_xattr_security_init(trans, inode, dir);
        return err;
 }
@@ -188,8 +189,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
+        /*
+         * we're an inline extent, so nobody can
+         * extend the file past i_size without locking
+         * a page we already have locked.
+         *
+         * We must do any isize and inode updates
+         * before we unlock the pages.  Otherwise we
+         * could end up racing with unlink.
+         */
        BTRFS_I(inode)->disk_i_size = inode->i_size;
        btrfs_update_inode(trans, root, inode);
        return 0;
 fail:
        btrfs_free_path(path);
@@ -230,8 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                return 1;
        }
-        ret = btrfs_drop_extents(trans, root, inode, start,
+        ret = btrfs_drop_extents(trans, inode, start, aligned_end,
-                                 aligned_end, aligned_end, start,
                                 &hint_byte, 1);
        BUG_ON(ret);
@@ -416,7 +426,6 @@ again:
                                                    start, end,
                                                    total_compressed, pages);
                }
-                btrfs_end_transaction(trans, root);
                if (ret == 0) {
                        /*
                         * inline extent creation worked, we don't need
@@ -430,9 +439,11 @@ again:
                             EXTENT_CLEAR_DELALLOC |
                             EXTENT_CLEAR_ACCOUNTING |
                             EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
-                        ret = 0;
+                        btrfs_end_transaction(trans, root);
                        goto free_pages_out;
                }
+                btrfs_end_transaction(trans, root);
        }
        if (will_compress) {
@@ -543,7 +554,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
        if (list_empty(&async_cow->extents))
                return 0;
-        trans = btrfs_join_transaction(root, 1);
        while (!list_empty(&async_cow->extents)) {
                async_extent = list_entry(async_cow->extents.next,
@@ -590,19 +600,15 @@ retry:
                lock_extent(io_tree, async_extent->start,
                            async_extent->start + async_extent->ram_size - 1,
                            GFP_NOFS);
-                /*
-                 * here we're doing allocation and writeback of the
-                 * compressed pages
-                 */
-                btrfs_drop_extent_cache(inode, async_extent->start,
-                                        async_extent->start +
-                                        async_extent->ram_size - 1, 0);
+                trans = btrfs_join_transaction(root, 1);
                ret = btrfs_reserve_extent(trans, root,
                                           async_extent->compressed_size,
                                           async_extent->compressed_size,
                                           0, alloc_hint,
                                           (u64)-1, &ins, 1);
+                btrfs_end_transaction(trans, root);
                if (ret) {
                        int i;
                        for (i = 0; i < async_extent->nr_pages; i++) {
@@ -618,6 +624,14 @@ retry:
                        goto retry;
                }
+                /*
+                 * here we're doing allocation and writeback of the
+                 * compressed pages
+                 */
+                btrfs_drop_extent_cache(inode, async_extent->start,
+                                        async_extent->start +
+                                        async_extent->ram_size - 1, 0);
                em = alloc_extent_map(GFP_NOFS);
                em->start = async_extent->start;
                em->len = async_extent->ram_size;
@@ -649,8 +663,6 @@ retry:
                                               BTRFS_ORDERED_COMPRESSED);
                BUG_ON(ret);
-                btrfs_end_transaction(trans, root);
                /*
                 * clear dirty, set writeback and unlock the pages.
                 */
@@ -672,13 +684,11 @@ retry:
                                    async_extent->nr_pages);
                BUG_ON(ret);
-                trans = btrfs_join_transaction(root, 1);
                alloc_hint = ins.objectid + ins.offset;
                kfree(async_extent);
                cond_resched();
        }
-        btrfs_end_transaction(trans, root);
        return 0;
 }
@@ -742,6 +752,7 @@ static noinline int cow_file_range(struct inode *inode,
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);
                        *nr_written = *nr_written +
                             (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
                        *page_started = 1;
@@ -1596,7 +1607,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                       struct inode *inode, u64 file_pos,
                                       u64 disk_bytenr, u64 disk_num_bytes,
                                       u64 num_bytes, u64 ram_bytes,
-                                       u64 locked_end,
                                       u8 compression, u8 encryption,
                                       u16 other_encoding, int extent_type)
 {
@@ -1622,9 +1632,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
         * the caller is expected to unpin it and allow it to be merged
         * with the others.
         */
-        ret = btrfs_drop_extents(trans, root, inode, file_pos,
+        ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
-                                 file_pos + num_bytes, locked_end,
+                                 &hint, 0);
-                                 file_pos, &hint, 0);
        BUG_ON(ret);
        ins.objectid = inode->i_ino;
@@ -1730,23 +1739,32 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                }
        }
-        trans = btrfs_join_transaction(root, 1);
        if (!ordered_extent)
                ordered_extent = btrfs_lookup_ordered_extent(inode, start);
        BUG_ON(!ordered_extent);
-        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
+        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
-                goto nocow;
+                BUG_ON(!list_empty(&ordered_extent->list));
+                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+                if (!ret) {
+                        trans = btrfs_join_transaction(root, 1);
+                        ret = btrfs_update_inode(trans, root, inode);
+                        BUG_ON(ret);
+                        btrfs_end_transaction(trans, root);
+                }
+                goto out;
+        }
        lock_extent(io_tree, ordered_extent->file_offset,
                    ordered_extent->file_offset + ordered_extent->len - 1,
                    GFP_NOFS);
+        trans = btrfs_join_transaction(root, 1);
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                compressed = 1;
        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
                BUG_ON(compressed);
-                ret = btrfs_mark_extent_written(trans, root, inode,
+                ret = btrfs_mark_extent_written(trans, inode,
                                                ordered_extent->file_offset,
                                                ordered_extent->file_offset +
                                                ordered_extent->len);
@@ -1758,8 +1776,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                                ordered_extent->disk_len,
                                                ordered_extent->len,
                                                ordered_extent->len,
-                                                ordered_extent->file_offset +
-                                                ordered_extent->len,
                                                compressed, 0, 0,
                                                BTRFS_FILE_EXTENT_REG);
                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
@@ -1770,22 +1786,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        unlock_extent(io_tree, ordered_extent->file_offset,
                    ordered_extent->file_offset + ordered_extent->len - 1,
                    GFP_NOFS);
-nocow:
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                          &ordered_extent->list);
-        mutex_lock(&BTRFS_I(inode)->extent_mutex);
+        /* this also removes the ordered extent from the tree */
-        btrfs_ordered_update_i_size(inode, ordered_extent);
+        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-        btrfs_update_inode(trans, root, inode);
+        ret = btrfs_update_inode(trans, root, inode);
-        btrfs_remove_ordered_extent(inode, ordered_extent);
+        BUG_ON(ret);
-        mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+        btrfs_end_transaction(trans, root);
+out:
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
        btrfs_put_ordered_extent(ordered_extent);
-        btrfs_end_transaction(trans, root);
        return 0;
 }
@@ -2008,6 +2022,54 @@ zeroit:
        return -EIO;
 }
+struct delayed_iput {
+        struct list_head list;
+        struct inode *inode;
+};
+void btrfs_add_delayed_iput(struct inode *inode)
+{
+        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+        struct delayed_iput *delayed;
+        if (atomic_add_unless(&inode->i_count, -1, 1))
+                return;
+        delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
+        delayed->inode = inode;
+        spin_lock(&fs_info->delayed_iput_lock);
+        list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+        spin_unlock(&fs_info->delayed_iput_lock);
+}
+void btrfs_run_delayed_iputs(struct btrfs_root *root)
+{
+        LIST_HEAD(list);
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct delayed_iput *delayed;
+        int empty;
+        spin_lock(&fs_info->delayed_iput_lock);
+        empty = list_empty(&fs_info->delayed_iputs);
+        spin_unlock(&fs_info->delayed_iput_lock);
+        if (empty)
+                return;
+        down_read(&root->fs_info->cleanup_work_sem);
+        spin_lock(&fs_info->delayed_iput_lock);
+        list_splice_init(&fs_info->delayed_iputs, &list);
+        spin_unlock(&fs_info->delayed_iput_lock);
+        while (!list_empty(&list)) {
+                delayed = list_entry(list.next, struct delayed_iput, list);
+                list_del(&delayed->list);
+                iput(delayed->inode);
+                kfree(delayed);
+        }
+        up_read(&root->fs_info->cleanup_work_sem);
+}
 /*
 * This creates an orphan entry for the given inode in case something goes
 * wrong in the middle of an unlink/truncate.
@@ -2080,16 +2142,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
        struct inode *inode;
        int ret = 0, nr_unlink = 0, nr_truncate = 0;
-        path = btrfs_alloc_path();
+        if (!xchg(&root->clean_orphans, 0))
-        if (!path)
                return;
+        path = btrfs_alloc_path();
+        BUG_ON(!path);
        path->reada = -1;
        key.objectid = BTRFS_ORPHAN_OBJECTID;
        btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
        key.offset = (u64)-1;
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0) {
@@ -2834,37 +2897,40 @@ out:
 * min_type is the minimum key type to truncate down to.  If set to 0, this
 * will kill all the items on this inode, including the INODE_ITEM_KEY.
 */
-noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
+int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
-                                        struct btrfs_root *root,
+                               struct btrfs_root *root,
-                                        struct inode *inode,
+                               struct inode *inode,
-                                        u64 new_size, u32 min_type)
+                               u64 new_size, u32 min_type)
 {
-        int ret;
        struct btrfs_path *path;
-        struct btrfs_key key;
-        struct btrfs_key found_key;
-        u32 found_type = (u8)-1;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *fi;
+        struct btrfs_key key;
+        struct btrfs_key found_key;
        u64 extent_start = 0;
        u64 extent_num_bytes = 0;
        u64 extent_offset = 0;
        u64 item_end = 0;
+        u64 mask = root->sectorsize - 1;
+        u32 found_type = (u8)-1;
        int found_extent;
        int del_item;
        int pending_del_nr = 0;
        int pending_del_slot = 0;
        int extent_type = -1;
        int encoding;
-        u64 mask = root->sectorsize - 1;
+        int ret;
+        int err = 0;
+        BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
        if (root->ref_cows)
                btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
        path = btrfs_alloc_path();
        BUG_ON(!path);
        path->reada = -1;
-        /* FIXME, add redo link to tree so we don't leak on crash */
        key.objectid = inode->i_ino;
        key.offset = (u64)-1;
        key.type = (u8)-1;
@@ -2872,17 +2938,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 search_again:
        path->leave_spinning = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-        if (ret < 0)
+        if (ret < 0) {
-                goto error;
+                err = ret;
+                goto out;
+        }
        if (ret > 0) {
                /* there are no items in the tree for us to truncate, we're
                 * done
                 */
-                if (path->slots[0] == 0) {
+                if (path->slots[0] == 0)
-                        ret = 0;
+                        goto out;
-                        goto error;
-                }
                path->slots[0]--;
        }
@@ -2917,28 +2983,17 @@ search_again:
                        }
                        item_end--;
                }
-                if (item_end < new_size) {
+                if (found_type > min_type) {
-                        if (found_type == BTRFS_DIR_ITEM_KEY)
+                        del_item = 1;
-                                found_type = BTRFS_INODE_ITEM_KEY;
+                } else {
-                        else if (found_type == BTRFS_EXTENT_ITEM_KEY)
+                        if (item_end < new_size)
-                                found_type = BTRFS_EXTENT_DATA_KEY;
-                        else if (found_type == BTRFS_EXTENT_DATA_KEY)
-                                found_type = BTRFS_XATTR_ITEM_KEY;
-                        else if (found_type == BTRFS_XATTR_ITEM_KEY)
-                                found_type = BTRFS_INODE_REF_KEY;
-                        else if (found_type)
-                                found_type--;
-                        else
                                break;
-                        btrfs_set_key_type(&key, found_type);
+                        if (found_key.offset >= new_size)
-                        goto next;
+                                del_item = 1;
+                        else
+                                del_item = 0;
                }
-                if (found_key.offset >= new_size)
-                        del_item = 1;
-                else
-                        del_item = 0;
                found_extent = 0;
                /* FIXME, shrink the extent if the ref count is only 1 */
                if (found_type != BTRFS_EXTENT_DATA_KEY)
                        goto delete;
@@ -3025,42 +3080,36 @@ delete:
                                                inode->i_ino, extent_offset);
                        BUG_ON(ret);
                }
-next:
-                if (path->slots[0] == 0) {
-                        if (pending_del_nr)
-                                goto del_pending;
-                        btrfs_release_path(root, path);
-                        if (found_type == BTRFS_INODE_ITEM_KEY)
-                                break;
-                        goto search_again;
-                }
-                path->slots[0]--;
+                if (found_type == BTRFS_INODE_ITEM_KEY)
-                if (pending_del_nr &&
+                        break;
-                    path->slots[0] + 1 != pending_del_slot) {
-                        struct btrfs_key debug;
+                if (path->slots[0] == 0 ||
-del_pending:
+                    path->slots[0] != pending_del_slot) {
-                        btrfs_item_key_to_cpu(path->nodes[0], &debug,
+                        if (root->ref_cows) {
-                                              pending_del_slot);
+                                err = -EAGAIN;
-                        ret = btrfs_del_items(trans, root, path,
+                                goto out;
-                                              pending_del_slot,
+                        }
-                                              pending_del_nr);
+                        if (pending_del_nr) {
-                        BUG_ON(ret);
+                                ret = btrfs_del_items(trans, root, path,
-                        pending_del_nr = 0;
+                                                pending_del_slot,
+                                                pending_del_nr);
+                                BUG_ON(ret);
+                                pending_del_nr = 0;
+                        }
                        btrfs_release_path(root, path);
-                        if (found_type == BTRFS_INODE_ITEM_KEY)
-                                break;
                        goto search_again;
+                } else {
+                        path->slots[0]--;
                }
        }
-        ret = 0;
+out:
-error:
        if (pending_del_nr) {
                ret = btrfs_del_items(trans, root, path, pending_del_slot,
                                      pending_del_nr);
        }
        btrfs_free_path(path);
-        return ret;
+        return err;
 }
 /*
@@ -3180,10 +3229,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
        if (size <= hole_start)
                return 0;
-        err = btrfs_truncate_page(inode->i_mapping, inode->i_size);
-        if (err)
-                return err;
        while (1) {
                struct btrfs_ordered_extent *ordered;
                btrfs_wait_ordered_range(inode, hole_start,
@@ -3196,9 +3241,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
                btrfs_put_ordered_extent(ordered);
        }
-        trans = btrfs_start_transaction(root, 1);
-        btrfs_set_trans_block_group(trans, inode);
        cur_offset = hole_start;
        while (1) {
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
@@ -3206,40 +3248,120 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
                BUG_ON(IS_ERR(em) || !em);
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask;
-                if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
                        u64 hint_byte = 0;
                        hole_size = last_byte - cur_offset;
-                        err = btrfs_drop_extents(trans, root, inode,
-                                                 cur_offset,
-                                                 cur_offset + hole_size,
-                                                 block_end,
-                                                 cur_offset, &hint_byte, 1);
-                        if (err)
-                                break;
-                        err = btrfs_reserve_metadata_space(root, 1);
+                        err = btrfs_reserve_metadata_space(root, 2);
                        if (err)
                                break;
+                        trans = btrfs_start_transaction(root, 1);
+                        btrfs_set_trans_block_group(trans, inode);
+                        err = btrfs_drop_extents(trans, inode, cur_offset,
+                                                 cur_offset + hole_size,
+                                                 &hint_byte, 1);
+                        BUG_ON(err);
                        err = btrfs_insert_file_extent(trans, root,
                                        inode->i_ino, cur_offset, 0,
                                        0, hole_size, 0, hole_size,
                                        0, 0, 0);
+                        BUG_ON(err);
                        btrfs_drop_extent_cache(inode, hole_start,
                                        last_byte - 1, 0);
-                        btrfs_unreserve_metadata_space(root, 1);
+                        btrfs_end_transaction(trans, root);
+                        btrfs_unreserve_metadata_space(root, 2);
                }
                free_extent_map(em);
                cur_offset = last_byte;
-                if (err || cur_offset >= block_end)
+                if (cur_offset >= block_end)
                        break;
        }
-        btrfs_end_transaction(trans, root);
        unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
        return err;
 }
+static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        unsigned long nr;
+        int ret;
+        if (attr->ia_size == inode->i_size)
+                return 0;
+        if (attr->ia_size > inode->i_size) {
+                unsigned long limit;
+                limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
+                if (attr->ia_size > inode->i_sb->s_maxbytes)
+                        return -EFBIG;
+                if (limit != RLIM_INFINITY && attr->ia_size > limit) {
+                        send_sig(SIGXFSZ, current, 0);
+                        return -EFBIG;
+                }
+        }
+        ret = btrfs_reserve_metadata_space(root, 1);
+        if (ret)
+                return ret;
+        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        ret = btrfs_orphan_add(trans, inode);
+        BUG_ON(ret);
+        nr = trans->blocks_used;
+        btrfs_end_transaction(trans, root);
+        btrfs_unreserve_metadata_space(root, 1);
+        btrfs_btree_balance_dirty(root, nr);
+        if (attr->ia_size > inode->i_size) {
+                ret = btrfs_cont_expand(inode, attr->ia_size);
+                if (ret) {
+                        btrfs_truncate(inode);
+                        return ret;
+                }
+                i_size_write(inode, attr->ia_size);
+                btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+                trans = btrfs_start_transaction(root, 1);
+                btrfs_set_trans_block_group(trans, inode);
+                ret = btrfs_update_inode(trans, root, inode);
+                BUG_ON(ret);
+                if (inode->i_nlink > 0) {
+                        ret = btrfs_orphan_del(trans, inode);
+                        BUG_ON(ret);
+                }
+                nr = trans->blocks_used;
+                btrfs_end_transaction(trans, root);
+                btrfs_btree_balance_dirty(root, nr);
+                return 0;
+        }
+        /*
+         * We're truncating a file that used to have good data down to
+         * zero. Make sure it gets into the ordered flush list so that
+         * any new writes get down to disk quickly.
+         */
+        if (attr->ia_size == 0)
+                BTRFS_I(inode)->ordered_data_close = 1;
+        /* we don't support swapfiles, so vmtruncate shouldn't fail */
+        ret = vmtruncate(inode, attr->ia_size);
+        BUG_ON(ret);
+        return 0;
+}
 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 {
        struct inode *inode = dentry->d_inode;
@@ -3250,23 +3372,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                return err;
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-                if (attr->ia_size > inode->i_size) {
+                err = btrfs_setattr_size(inode, attr);
-                        err = btrfs_cont_expand(inode, attr->ia_size);
+                if (err)
-                        if (err)
+                        return err;
-                                return err;
-                } else if (inode->i_size > 0 &&
-                           attr->ia_size == 0) {
-                        /* we're truncating a file that used to have good
-                         * data down to zero.  Make sure it gets into
-                         * the ordered flush list so that any new writes
-                         * get down to disk quickly.
-                         */
-                        BTRFS_I(inode)->ordered_data_close = 1;
-                }
        }
+        attr->ia_valid &= ~ATTR_SIZE;
-        err = inode_setattr(inode, attr);
+        if (attr->ia_valid)
+                err = inode_setattr(inode, attr);
        if (!err && ((attr->ia_valid & ATTR_MODE)))
                err = btrfs_acl_chmod(inode);
@@ -3287,36 +3400,43 @@ void btrfs_delete_inode(struct inode *inode)
        }
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
+        if (root->fs_info->log_root_recovering) {
+                BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+                goto no_delete;
+        }
        if (inode->i_nlink > 0) {
                BUG_ON(btrfs_root_refs(&root->root_item) != 0);
                goto no_delete;
        }
        btrfs_i_size_write(inode, 0);
-        trans = btrfs_join_transaction(root, 1);
-        btrfs_set_trans_block_group(trans, inode);
+        while (1) {
-        ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
+                trans = btrfs_start_transaction(root, 1);
-        if (ret) {
+                btrfs_set_trans_block_group(trans, inode);
-                btrfs_orphan_del(NULL, inode);
+                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-                goto no_delete_lock;
-        }
-        btrfs_orphan_del(trans, inode);
+                if (ret != -EAGAIN)
+                        break;
-        nr = trans->blocks_used;
+                nr = trans->blocks_used;
-        clear_inode(inode);
+                btrfs_end_transaction(trans, root);
+                trans = NULL;
+                btrfs_btree_balance_dirty(root, nr);
+        }
-        btrfs_end_transaction(trans, root);
+        if (ret == 0) {
-        btrfs_btree_balance_dirty(root, nr);
+                ret = btrfs_orphan_del(trans, inode);
-        return;
+                BUG_ON(ret);
+        }
-no_delete_lock:
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
 no_delete:
        clear_inode(inode);
+        return;
 }
 /*
@@ -3569,7 +3689,6 @@ static noinline void init_btrfs_i(struct inode *inode)
        INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
        RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
        btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-        mutex_init(&BTRFS_I(inode)->extent_mutex);
        mutex_init(&BTRFS_I(inode)->log_mutex);
 }
@@ -3695,6 +3814,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
        }
        srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+        if (root != sub_root) {
+                down_read(&root->fs_info->cleanup_work_sem);
+                if (!(inode->i_sb->s_flags & MS_RDONLY))
+                        btrfs_orphan_cleanup(sub_root);
+                up_read(&root->fs_info->cleanup_work_sem);
+        }
        return inode;
 }
@@ -4219,7 +4345,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(inode))
                goto out_unlock;
-        err = btrfs_init_inode_security(inode, dir);
+        err = btrfs_init_inode_security(trans, inode, dir);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@ -4290,7 +4416,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(inode))
                goto out_unlock;
-        err = btrfs_init_inode_security(inode, dir);
+        err = btrfs_init_inode_security(trans, inode, dir);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@ -4336,6 +4462,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        if (inode->i_nlink == 0)
                return -ENOENT;
+        /* do not allow sys_link calls with other subvols of the same device */
+        if (root->objectid != BTRFS_I(inode)->root->objectid)
+                return -EPERM;
        /*
         * 1 item for inode ref
         * 2 items for dir items
@@ -4423,7 +4553,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        drop_on_err = 1;
-        err = btrfs_init_inode_security(inode, dir);
+        err = btrfs_init_inode_security(trans, inode, dir);
        if (err)
                goto out_fail;
@@ -5074,17 +5204,20 @@ static void btrfs_truncate(struct inode *inode)
        unsigned long nr;
        u64 mask = root->sectorsize - 1;
-        if (!S_ISREG(inode->i_mode))
+        if (!S_ISREG(inode->i_mode)) {
-                return;
+                WARN_ON(1);
-        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
+        }
        ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
        if (ret)
                return;
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
        trans = btrfs_start_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
        /*
         * setattr is responsible for setting the ordered_data_close flag,
@@ -5106,21 +5239,32 @@ static void btrfs_truncate(struct inode *inode)
        if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
                btrfs_add_ordered_operation(trans, root, inode);
-        btrfs_set_trans_block_group(trans, inode);
+        while (1) {
-        btrfs_i_size_write(inode, inode->i_size);
+                ret = btrfs_truncate_inode_items(trans, root, inode,
+                                                 inode->i_size,
+                                                 BTRFS_EXTENT_DATA_KEY);
+                if (ret != -EAGAIN)
+                        break;
-        ret = btrfs_orphan_add(trans, inode);
+                ret = btrfs_update_inode(trans, root, inode);
-        if (ret)
+                BUG_ON(ret);
-                goto out;
-        /* FIXME, add redo link to tree so we don't leak on crash */
+                nr = trans->blocks_used;
-        ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size,
+                btrfs_end_transaction(trans, root);
-                                      BTRFS_EXTENT_DATA_KEY);
+                btrfs_btree_balance_dirty(root, nr);
-        btrfs_update_inode(trans, root, inode);
+                trans = btrfs_start_transaction(root, 1);
+                btrfs_set_trans_block_group(trans, inode);
+        }
-        ret = btrfs_orphan_del(trans, inode);
+        if (ret == 0 && inode->i_nlink > 0) {
+                ret = btrfs_orphan_del(trans, inode);
+                BUG_ON(ret);
+        }
+        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
-out:
        nr = trans->blocks_used;
        ret = btrfs_end_transaction_throttle(trans, root);
        BUG_ON(ret);
@@ -5217,9 +5361,9 @@ void btrfs_destroy_inode(struct inode *inode)
        spin_lock(&root->list_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-                printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
+                printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
-                       " list\n", inode->i_ino);
+                       inode->i_ino);
-                dump_stack();
+                list_del_init(&BTRFS_I(inode)->i_orphan);
        }
        spin_unlock(&root->list_lock);
@@ -5476,7 +5620,7 @@ out_fail:
 * some fairly slow code that needs optimization. This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
        struct list_head *head = &root->fs_info->delalloc_inodes;
        struct btrfs_inode *binode;
@@ -5495,7 +5639,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
                spin_unlock(&root->fs_info->delalloc_lock);
                if (inode) {
                        filemap_flush(inode->i_mapping);
-                        iput(inode);
+                        if (delay_iput)
+                                btrfs_add_delayed_iput(inode);
+                        else
+                                iput(inode);
                }
                cond_resched();
                spin_lock(&root->fs_info->delalloc_lock);
@@ -5569,7 +5716,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(inode))
                goto out_unlock;
-        err = btrfs_init_inode_security(inode, dir);
+        err = btrfs_init_inode_security(trans, inode, dir);
        if (err) {
                drop_inode = 1;
                goto out_unlock;
@@ -5641,10 +5788,10 @@ out_fail:
        return err;
 }
-static int prealloc_file_range(struct btrfs_trans_handle *trans,
+static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
-                               struct inode *inode, u64 start, u64 end,
+                               u64 alloc_hint, int mode)
-                               u64 locked_end, u64 alloc_hint, int mode)
 {
+        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 alloc_size;
@@ -5655,43 +5802,56 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
        while (num_bytes > 0) {
                alloc_size = min(num_bytes, root->fs_info->max_extent);
-                ret = btrfs_reserve_metadata_space(root, 1);
+                trans = btrfs_start_transaction(root, 1);
-                if (ret)
-                        goto out;
                ret = btrfs_reserve_extent(trans, root, alloc_size,
                                           root->sectorsize, 0, alloc_hint,
                                           (u64)-1, &ins, 1);
                if (ret) {
                        WARN_ON(1);
-                        goto out;
+                        goto stop_trans;
+                }
+                ret = btrfs_reserve_metadata_space(root, 3);
+                if (ret) {
+                        btrfs_free_reserved_extent(root, ins.objectid,
+                                                   ins.offset);
+                        goto stop_trans;
                }
                ret = insert_reserved_file_extent(trans, inode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,
-                                                  ins.offset, locked_end,
+                                                  ins.offset, 0, 0, 0,
-                                                  0, 0, 0,
                                                  BTRFS_FILE_EXTENT_PREALLOC);
                BUG_ON(ret);
                btrfs_drop_extent_cache(inode, cur_offset,
                                        cur_offset + ins.offset -1, 0);
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
                alloc_hint = ins.objectid + ins.offset;
-                btrfs_unreserve_metadata_space(root, 1);
-        }
-out:
-        if (cur_offset > start) {
                inode->i_ctime = CURRENT_TIME;
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-                    cur_offset > i_size_read(inode))
+                    cur_offset > inode->i_size) {
-                        btrfs_i_size_write(inode, cur_offset);
+                        i_size_write(inode, cur_offset);
+                        btrfs_ordered_update_i_size(inode, cur_offset, NULL);
+                }
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
+                btrfs_end_transaction(trans, root);
+                btrfs_unreserve_metadata_space(root, 3);
        }
+        return ret;
+stop_trans:
+        btrfs_end_transaction(trans, root);
        return ret;
 }
 static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5705,8 +5865,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        u64 locked_end;
        u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
        struct extent_map *em;
-        struct btrfs_trans_handle *trans;
-        struct btrfs_root *root;
        int ret;
        alloc_start = offset & ~mask;
@@ -5725,9 +5883,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        goto out;
        }
-        root = BTRFS_I(inode)->root;
+        ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
-        ret = btrfs_check_data_free_space(root, inode,
                                          alloc_end - alloc_start);
        if (ret)
                goto out;
@@ -5736,12 +5892,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        while (1) {
                struct btrfs_ordered_extent *ordered;
-                trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
-                if (!trans) {
-                        ret = -EIO;
-                        goto out_free;
-                }
                /* the extent lock is ordered inside the running
                 * transaction
                 */
@@ -5755,8 +5905,6 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent(&BTRFS_I(inode)->io_tree,
                                      alloc_start, locked_end, GFP_NOFS);
-                        btrfs_end_transaction(trans, BTRFS_I(inode)->root);
                        /*
                         * we can't wait on the range with the transaction
                         * running or with the extent lock held
@@ -5777,10 +5925,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                BUG_ON(IS_ERR(em) || !em);
                last_byte = min(extent_map_end(em), alloc_end);
                last_byte = (last_byte + mask) & ~mask;
-                if (em->block_start == EXTENT_MAP_HOLE) {
+                if (em->block_start == EXTENT_MAP_HOLE ||
-                        ret = prealloc_file_range(trans, inode, cur_offset,
+                    (cur_offset >= inode->i_size &&
-                                        last_byte, locked_end + 1,
+                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                                        alloc_hint, mode);
+                        ret = prealloc_file_range(inode,
+                                                  cur_offset, last_byte,
+                                                  alloc_hint, mode);
                        if (ret < 0) {
                                free_extent_map(em);
                                break;
@@ -5799,9 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                      GFP_NOFS);
-        btrfs_end_transaction(trans, BTRFS_I(inode)->root);
+        btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
-out_free:
+                                       alloc_end - alloc_start);
-        btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cdbb054102b9..645a17927a8f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -237,7 +237,6 @@ static noinline int create_subvol(struct btrfs_root *root,
        u64 objectid;
        u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
        u64 index = 0;
-        unsigned long nr = 1;
        /*
         * 1 - inode item
@@ -290,7 +289,7 @@ static noinline int create_subvol(struct btrfs_root *root,
        btrfs_set_root_generation(&root_item, trans->transid);
        btrfs_set_root_level(&root_item, 0);
        btrfs_set_root_refs(&root_item, 1);
-        btrfs_set_root_used(&root_item, 0);
+        btrfs_set_root_used(&root_item, leaf->len);
        btrfs_set_root_last_snapshot(&root_item, 0);
        memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
@@ -342,24 +341,21 @@ static noinline int create_subvol(struct btrfs_root *root,
        d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
-        nr = trans->blocks_used;
        err = btrfs_commit_transaction(trans, root);
        if (err && !ret)
                ret = err;
        btrfs_unreserve_metadata_space(root, 6);
-        btrfs_btree_balance_dirty(root, nr);
        return ret;
 }
 static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
                           char *name, int namelen)
 {
+        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
        struct btrfs_trans_handle *trans;
-        int ret = 0;
+        int ret;
-        int err;
-        unsigned long nr = 0;
        if (!root->ref_cows)
                return -EINVAL;
@@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
         */
        ret = btrfs_reserve_metadata_space(root, 6);
        if (ret)
-                goto fail_unlock;
+                goto fail;
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
        if (!pending_snapshot) {
                ret = -ENOMEM;
                btrfs_unreserve_metadata_space(root, 6);
-                goto fail_unlock;
+                goto fail;
        }
        pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
        if (!pending_snapshot->name) {
                ret = -ENOMEM;
                kfree(pending_snapshot);
                btrfs_unreserve_metadata_space(root, 6);
-                goto fail_unlock;
+                goto fail;
        }
        memcpy(pending_snapshot->name, name, namelen);
        pending_snapshot->name[namelen] = '\0';
@@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        pending_snapshot->root = root;
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
-        err = btrfs_commit_transaction(trans, root);
+        ret = btrfs_commit_transaction(trans, root);
+        BUG_ON(ret);
+        btrfs_unreserve_metadata_space(root, 6);
-fail_unlock:
+        inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
-        btrfs_btree_balance_dirty(root, nr);
+        if (IS_ERR(inode)) {
+                ret = PTR_ERR(inode);
+                goto fail;
+        }
+        BUG_ON(!inode);
+        d_instantiate(dentry, inode);
+        ret = 0;
+fail:
        return ret;
 }
@@ -1027,8 +1032,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
        BUG_ON(!trans);
        /* punch hole in destination first */
-        btrfs_drop_extents(trans, root, inode, off, off + len,
+        btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
-                           off + len, 0, &hint_byte, 1);
        /* clone data */
        key.objectid = src->i_ino;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5799bc46a309..b10a49d4bc6a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 /*
 * remove an ordered extent from the tree.  No references are dropped
- * but, anyone waiting on this extent is woken up.
+ * and you must wake_up entry->wait.  You must hold the tree mutex
+ * while you call this function.
 */
-int btrfs_remove_ordered_extent(struct inode *inode,
+static int __btrfs_remove_ordered_extent(struct inode *inode,
                                struct btrfs_ordered_extent *entry)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        tree = &BTRFS_I(inode)->ordered_tree;
-        mutex_lock(&tree->mutex);
        node = &entry->rb_node;
        rb_erase(node, &tree->tree);
        tree->last = NULL;
@@ -326,16 +326,34 @@ int btrfs_remove_ordered_extent(struct inode *inode,
        }
        spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+        return 0;
+}
+/*
+ * remove an ordered extent from the tree.  No references are dropped
+ * but any waiters are woken.
+ */
+int btrfs_remove_ordered_extent(struct inode *inode,
+                                struct btrfs_ordered_extent *entry)
+{
+        struct btrfs_ordered_inode_tree *tree;
+        int ret;
+        tree = &BTRFS_I(inode)->ordered_tree;
+        mutex_lock(&tree->mutex);
+        ret = __btrfs_remove_ordered_extent(inode, entry);
        mutex_unlock(&tree->mutex);
        wake_up(&entry->wait);
-        return 0;
+        return ret;
 }
 /*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+                               int nocow_only, int delay_iput)
 {
        struct list_head splice;
        struct list_head *cur;
@@ -372,7 +390,10 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
                if (inode) {
                        btrfs_start_ordered_extent(inode, ordered, 1);
                        btrfs_put_ordered_extent(ordered);
-                        iput(inode);
+                        if (delay_iput)
+                                btrfs_add_delayed_iput(inode);
+                        else
+                                iput(inode);
                } else {
                        btrfs_put_ordered_extent(ordered);
                }
@@ -430,7 +451,7 @@ again:
                                btrfs_wait_ordered_range(inode, 0, (u64)-1);
                        else
                                filemap_flush(inode->i_mapping);
-                        iput(inode);
+                        btrfs_add_delayed_iput(inode);
                }
                cond_resched();
@@ -589,7 +610,7 @@ out:
 * After an extent is done, call this to conditionally update the on disk
 * i_size.  i_size is updated to cover any fully written part of the file.
 */
-int btrfs_ordered_update_i_size(struct inode *inode,
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered)
 {
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
@@ -597,18 +618,30 @@ int btrfs_ordered_update_i_size(struct inode *inode,
        u64 disk_i_size;
        u64 new_i_size;
        u64 i_size_test;
+        u64 i_size = i_size_read(inode);
        struct rb_node *node;
+        struct rb_node *prev = NULL;
        struct btrfs_ordered_extent *test;
+        int ret = 1;
+        if (ordered)
+                offset = entry_end(ordered);
        mutex_lock(&tree->mutex);
        disk_i_size = BTRFS_I(inode)->disk_i_size;
+        /* truncate file */
+        if (disk_i_size > i_size) {
+                BTRFS_I(inode)->disk_i_size = i_size;
+                ret = 0;
+                goto out;
+        }
        /*
         * if the disk i_size is already at the inode->i_size, or
         * this ordered extent is inside the disk i_size, we're done
         */
-        if (disk_i_size >= inode->i_size ||
+        if (disk_i_size == i_size || offset <= disk_i_size) {
-            ordered->file_offset + ordered->len <= disk_i_size) {
                goto out;
        }
@@ -616,8 +649,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
         * we can't update the disk_isize if there are delalloc bytes
         * between disk_i_size and  this ordered extent
         */
-        if (test_range_bit(io_tree, disk_i_size,
+        if (test_range_bit(io_tree, disk_i_size, offset - 1,
-                           ordered->file_offset + ordered->len - 1,
                           EXTENT_DELALLOC, 0, NULL)) {
                goto out;
        }
@@ -626,20 +658,32 @@ int btrfs_ordered_update_i_size(struct inode *inode,
         * if we find an ordered extent then we can't update disk i_size
         * yet
         */
-        node = &ordered->rb_node;
+        if (ordered) {
-        while (1) {
+                node = rb_prev(&ordered->rb_node);
-                node = rb_prev(node);
+        } else {
-                if (!node)
+                prev = tree_search(tree, offset);
-                        break;
+                /*
+                 * we insert file extents without involving ordered struct,
+                 * so there should be no ordered struct covering this offset
+                 */
+                if (prev) {
+                        test = rb_entry(prev, struct btrfs_ordered_extent,
+                                        rb_node);
+                        BUG_ON(offset_in_entry(test, offset));
+                }
+                node = prev;
+        }
+        while (node) {
                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
                if (test->file_offset + test->len <= disk_i_size)
                        break;
-                if (test->file_offset >= inode->i_size)
+                if (test->file_offset >= i_size)
                        break;
                if (test->file_offset >= disk_i_size)
                        goto out;
+                node = rb_prev(node);
        }
-        new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode));
+        new_i_size = min_t(u64, offset, i_size);
        /*
         * at this point, we know we can safely update i_size to at least
@@ -647,7 +691,14 @@ int btrfs_ordered_update_i_size(struct inode *inode,
         * walk forward and see if ios from higher up in the file have
         * finished.
         */
-        node = rb_next(&ordered->rb_node);
+        if (ordered) {
+                node = rb_next(&ordered->rb_node);
+        } else {
+                if (prev)
+                        node = rb_next(prev);
+                else
+                        node = rb_first(&tree->tree);
+        }
        i_size_test = 0;
        if (node) {
                /*
@@ -655,10 +706,10 @@ int btrfs_ordered_update_i_size(struct inode *inode,
                 * between our ordered extent and the next one.
                 */
                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-                if (test->file_offset > entry_end(ordered))
+                if (test->file_offset > offset)
                        i_size_test = test->file_offset;
        } else {
-                i_size_test = i_size_read(inode);
+                i_size_test = i_size;
        }
        /*
@@ -667,15 +718,25 @@ int btrfs_ordered_update_i_size(struct inode *inode,
         * are no delalloc bytes in this area, it is safe to update
         * disk_i_size to the end of the region.
         */
-        if (i_size_test > entry_end(ordered) &&
+        if (i_size_test > offset &&
-            !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
+            !test_range_bit(io_tree, offset, i_size_test - 1,
-                           EXTENT_DELALLOC, 0, NULL)) {
+                            EXTENT_DELALLOC, 0, NULL)) {
-                new_i_size = min_t(u64, i_size_test, i_size_read(inode));
+                new_i_size = min_t(u64, i_size_test, i_size);
        }
        BTRFS_I(inode)->disk_i_size = new_i_size;
+        ret = 0;
 out:
+        /*
+         * we need to remove the ordered extent with the tree lock held
+         * so that other people calling this function don't find our fully
+         * processed ordered entry and skip updating the i_size
+         */
+        if (ordered)
+                __btrfs_remove_ordered_extent(inode, ordered);
        mutex_unlock(&tree->mutex);
-        return 0;
+        if (ordered)
+                wake_up(&ordered->wait);
+        return ret;
 }
 /*
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index f82e87488ca8..1fe1282ef47c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -150,12 +150,13 @@ void btrfs_start_ordered_extent(struct inode *inode,
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
-int btrfs_ordered_update_i_size(struct inode *inode,
+int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
 int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
 int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct inode *inode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root,
+                               int nocow_only, int delay_iput);
 #endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cfcc93c93a7b..a9728680eca8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struct btrfs_root *root,
        return 0;
 }
+static void put_inodes(struct list_head *list)
+{
+        struct inodevec *ivec;
+        while (!list_empty(list)) {
+                ivec = list_entry(list->next, struct inodevec, list);
+                list_del(&ivec->list);
+                while (ivec->nr > 0) {
+                        ivec->nr--;
+                        iput(ivec->inode[ivec->nr]);
+                }
+                kfree(ivec);
+        }
+}
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
@@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                btrfs_btree_balance_dirty(root, nr);
+                /*
+                 * put inodes outside transaction, otherwise we may deadlock.
+                 */
+                put_inodes(&inode_list);
                if (replaced && rc->stage == UPDATE_DATA_PTRS)
                        invalidate_extent_cache(root, &key, &next_key);
        }
@@ -1752,19 +1771,7 @@ out:
        btrfs_btree_balance_dirty(root, nr);
-        /*
+        put_inodes(&inode_list);
-         * put inodes while we aren't holding the tree locks
-         */
-        while (!list_empty(&inode_list)) {
-                struct inodevec *ivec;
-                ivec = list_entry(inode_list.next, struct inodevec, list);
-                list_del(&ivec->list);
-                while (ivec->nr > 0) {
-                        ivec->nr--;
-                        iput(ivec->inode[ivec->nr]);
-                }
-                kfree(ivec);
-        }
        if (replaced && rc->stage == UPDATE_DATA_PTRS)
                invalidate_extent_cache(root, &key, &next_key);
@@ -3534,8 +3541,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
               (unsigned long long)rc->block_group->key.objectid,
               (unsigned long long)rc->block_group->flags);
-        btrfs_start_delalloc_inodes(fs_info->tree_root);
+        btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
-        btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+        btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
        while (1) {
                rc->extents_found = 0;
@@ -3755,6 +3762,7 @@ out:
                                       BTRFS_DATA_RELOC_TREE_OBJECTID);
                if (IS_ERR(fs_root))
                        err = PTR_ERR(fs_root);
+                btrfs_orphan_cleanup(fs_root);
        }
        return err;
 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 752a5463bf53..3f9b45704fcd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -128,6 +128,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
        substring_t args[MAX_OPT_ARGS];
        char *p, *num;
        int intarg;
+        int ret = 0;
        if (!options)
                return 0;
@@ -262,12 +263,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_discard:
                        btrfs_set_opt(info->mount_opt, DISCARD);
                        break;
+                case Opt_err:
+                        printk(KERN_INFO "btrfs: unrecognized mount option "
+                               "'%s'\n", p);
+                        ret = -EINVAL;
+                        goto out;
                default:
                        break;
                }
        }
+out:
        kfree(options);
-        return 0;
+        return ret;
 }
 /*
@@ -405,8 +412,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
                return 0;
        }
-        btrfs_start_delalloc_inodes(root);
+        btrfs_start_delalloc_inodes(root, 0);
-        btrfs_wait_ordered_extents(root, 0);
+        btrfs_wait_ordered_extents(root, 0, 0);
        trans = btrfs_start_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
@@ -450,6 +457,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",notreelog");
        if (btrfs_test_opt(root, FLUSHONCOMMIT))
                seq_puts(seq, ",flushoncommit");
+        if (btrfs_test_opt(root, DISCARD))
+                seq_puts(seq, ",discard");
        if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
                seq_puts(seq, ",noacl");
        return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index c207e8c32c9b..b2acc79f1b34 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        memset(trans, 0, sizeof(*trans));
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
+        if (throttle)
+                btrfs_run_delayed_iputs(root);
        return 0;
 }
@@ -354,7 +357,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 * those extents are sent to disk but does not wait on them
 */
 int btrfs_write_marked_extents(struct btrfs_root *root,
-                               struct extent_io_tree *dirty_pages)
+                               struct extent_io_tree *dirty_pages, int mark)
 {
        int ret;
        int err = 0;
@@ -367,7 +370,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
        while (1) {
                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-                                            EXTENT_DIRTY);
+                                            mark);
                if (ret)
                        break;
                while (start <= end) {
@@ -413,7 +416,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 * on all the pages and clear them from the dirty pages state tree
 */
 int btrfs_wait_marked_extents(struct btrfs_root *root,
-                              struct extent_io_tree *dirty_pages)
+                              struct extent_io_tree *dirty_pages, int mark)
 {
        int ret;
        int err = 0;
@@ -425,12 +428,12 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
        unsigned long index;
        while (1) {
-                ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
+                ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-                                            EXTENT_DIRTY);
+                                            mark);
                if (ret)
                        break;
-                clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+                clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
                while (start <= end) {
                        index = start >> PAGE_CACHE_SHIFT;
                        start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
@@ -460,13 +463,13 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
 * those extents are on disk for transaction or log commit
 */
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-                                        struct extent_io_tree *dirty_pages)
+                                struct extent_io_tree *dirty_pages, int mark)
 {
        int ret;
        int ret2;
-        ret = btrfs_write_marked_extents(root, dirty_pages);
+        ret = btrfs_write_marked_extents(root, dirty_pages, mark);
-        ret2 = btrfs_wait_marked_extents(root, dirty_pages);
+        ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
        return ret || ret2;
 }
@@ -479,7 +482,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                return filemap_write_and_wait(btree_inode->i_mapping);
        }
        return btrfs_write_and_wait_marked_extents(root,
-                                           &trans->transaction->dirty_pages);
+                                           &trans->transaction->dirty_pages,
+                                           EXTENT_DIRTY);
 }
 /*
@@ -497,13 +501,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 {
        int ret;
        u64 old_root_bytenr;
+        u64 old_root_used;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
+        old_root_used = btrfs_root_used(&root->root_item);
        btrfs_write_dirty_block_groups(trans, root);
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
-                if (old_root_bytenr == root->node->start)
+                if (old_root_bytenr == root->node->start &&
+                    old_root_used == btrfs_root_used(&root->root_item))
                        break;
                btrfs_set_root_node(&root->root_item, root->node);
@@ -512,6 +519,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                                        &root->root_item);
                BUG_ON(ret);
+                old_root_used = btrfs_root_used(&root->root_item);
                ret = btrfs_write_dirty_block_groups(trans, root);
                BUG_ON(ret);
        }
@@ -795,7 +803,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        memcpy(&pending->root_key, &key, sizeof(key));
 fail:
        kfree(new_root_item);
-        btrfs_unreserve_metadata_space(root, 6);
        return ret;
 }
@@ -807,7 +814,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
        u64 index = 0;
        struct btrfs_trans_handle *trans;
        struct inode *parent_inode;
-        struct inode *inode;
        struct btrfs_root *parent_root;
        parent_inode = pending->dentry->d_parent->d_inode;
@@ -839,8 +845,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
        BUG_ON(ret);
-        inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
-        d_instantiate(pending->dentry, inode);
 fail:
        btrfs_end_transaction(trans, fs_info->fs_root);
        return ret;
@@ -994,11 +998,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                mutex_unlock(&root->fs_info->trans_mutex);
                if (flush_on_commit) {
-                        btrfs_start_delalloc_inodes(root);
+                        btrfs_start_delalloc_inodes(root, 1);
-                        ret = btrfs_wait_ordered_extents(root, 0);
+                        ret = btrfs_wait_ordered_extents(root, 0, 1);
                        BUG_ON(ret);
                } else if (snap_pending) {
-                        ret = btrfs_wait_ordered_extents(root, 1);
+                        ret = btrfs_wait_ordered_extents(root, 0, 1);
                        BUG_ON(ret);
                }
@@ -1116,6 +1120,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                current->journal_info = NULL;
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
+        if (current != root->fs_info->transaction_kthread)
+                btrfs_run_delayed_iputs(root);
        return ret;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index d4e3e7a6938c..93c7ccb33118 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-                                        struct extent_io_tree *dirty_pages);
+                                struct extent_io_tree *dirty_pages, int mark);
 int btrfs_write_marked_extents(struct btrfs_root *root,
-                                        struct extent_io_tree *dirty_pages);
+                                struct extent_io_tree *dirty_pages, int mark);
 int btrfs_wait_marked_extents(struct btrfs_root *root,
-                                        struct extent_io_tree *dirty_pages);
+                                struct extent_io_tree *dirty_pages, int mark);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 741666a7676a..4a9434b622ec 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -542,8 +542,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
        saved_nbytes = inode_get_bytes(inode);
        /* drop any overlapping extents */
-        ret = btrfs_drop_extents(trans, root, inode,
+        ret = btrfs_drop_extents(trans, inode, start, extent_end,
-                         start, extent_end, extent_end, start, &alloc_hint, 1);
+                                 &alloc_hint, 1);
        BUG_ON(ret);
        if (found_type == BTRFS_FILE_EXTENT_REG ||
@@ -930,6 +930,17 @@ out_nowrite:
        return 0;
 }
+static int insert_orphan_item(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root, u64 offset)
+{
+        int ret;
+        ret = btrfs_find_orphan_item(root, offset);
+        if (ret > 0)
+                ret = btrfs_insert_orphan_item(trans, root, offset);
+        return ret;
+}
 /*
 * There are a few corners where the link count of the file can't
 * be properly maintained during replay.  So, instead of adding
@@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
        }
        BTRFS_I(inode)->index_cnt = (u64)-1;
-        if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+        if (inode->i_nlink == 0) {
-                ret = replay_dir_deletes(trans, root, NULL, path,
+                if (S_ISDIR(inode->i_mode)) {
-                                         inode->i_ino, 1);
+                        ret = replay_dir_deletes(trans, root, NULL, path,
+                                                 inode->i_ino, 1);
+                        BUG_ON(ret);
+                }
+                ret = insert_orphan_item(trans, root, inode->i_ino);
                BUG_ON(ret);
        }
        btrfs_free_path(path);
@@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                /* inode keys are done during the first stage */
                if (key.type == BTRFS_INODE_ITEM_KEY &&
                    wc->stage == LOG_WALK_REPLAY_INODES) {
-                        struct inode *inode;
                        struct btrfs_inode_item *inode_item;
                        u32 mode;
@@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                                             eb, i, &key);
                        BUG_ON(ret);
-                        /* for regular files, truncate away
+                        /* for regular files, make sure corresponding
-                         * extents past the new EOF
+                         * orphan item exists. extents past the new EOF
+                         * will be truncated later by orphan cleanup.
                         */
                        if (S_ISREG(mode)) {
-                                inode = read_one_inode(root,
+                                ret = insert_orphan_item(wc->trans, root,
-                                                       key.objectid);
+                                                         key.objectid);
-                                BUG_ON(!inode);
-                                ret = btrfs_truncate_inode_items(wc->trans,
-                                        root, inode, inode->i_size,
-                                        BTRFS_EXTENT_DATA_KEY);
                                BUG_ON(ret);
-                                /* if the nlink count is zero here, the iput
-                                 * will free the inode.  We bump it to make
-                                 * sure it doesn't get freed until the link
-                                 * count fixup is done
-                                 */
-                                if (inode->i_nlink == 0) {
-                                        btrfs_inc_nlink(inode);
-                                        btrfs_update_inode(wc->trans,
-                                                           root, inode);
-                                }
-                                iput(inode);
                        }
                        ret = link_to_fixup_dir(wc->trans, root,
                                                path, key.objectid);
                        BUG_ON(ret);
@@ -1977,10 +1976,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 {
        int index1;
        int index2;
+        int mark;
        int ret;
        struct btrfs_root *log = root->log_root;
        struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
-        u64 log_transid = 0;
+        unsigned long log_transid = 0;
        mutex_lock(&root->log_mutex);
        index1 = root->log_transid % 2;
@@ -2014,24 +2014,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                goto out;
        }
+        log_transid = root->log_transid;
+        if (log_transid % 2 == 0)
+                mark = EXTENT_DIRTY;
+        else
+                mark = EXTENT_NEW;
        /* we start IO on  all the marked extents here, but we don't actually
         * wait for them until later.
         */
-        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages);
+        ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
        BUG_ON(ret);
        btrfs_set_root_node(&log->root_item, log->node);
        root->log_batch = 0;
-        log_transid = root->log_transid;
        root->log_transid++;
        log->log_transid = root->log_transid;
        root->log_start_pid = 0;
        smp_mb();
        /*
-         * log tree has been flushed to disk, new modifications of
+         * IO has been started, blocks of the log tree have WRITTEN flag set
-         * the log will be written to new positions. so it's safe to
+         * in their headers. new modifications of the log will be written to
-         * allow log writers to go in.
+         * new positions. so it's safe to allow log writers to go in.
         */
        mutex_unlock(&root->log_mutex);
@@ -2052,7 +2057,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        index2 = log_root_tree->log_transid % 2;
        if (atomic_read(&log_root_tree->log_commit[index2])) {
-                btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
                wait_log_commit(trans, log_root_tree,
                                log_root_tree->log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
@@ -2072,16 +2077,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * check the full commit flag again
         */
        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
-                btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
                mutex_unlock(&log_root_tree->log_mutex);
                ret = -EAGAIN;
                goto out_wake_log_root;
        }
        ret = btrfs_write_and_wait_marked_extents(log_root_tree,
-                                &log_root_tree->dirty_log_pages);
+                                &log_root_tree->dirty_log_pages,
+                                EXTENT_DIRTY | EXTENT_NEW);
        BUG_ON(ret);
-        btrfs_wait_marked_extents(log, &log->dirty_log_pages);
+        btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
        btrfs_set_super_log_root(&root->fs_info->super_for_commit,
                                log_root_tree->node->start);
@@ -2147,12 +2153,12 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
        while (1) {
                ret = find_first_extent_bit(&log->dirty_log_pages,
-                                    0, &start, &end, EXTENT_DIRTY);
+                                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
                if (ret)
                        break;
-                clear_extent_dirty(&log->dirty_log_pages,
+                clear_extent_bits(&log->dirty_log_pages, start, end,
-                                   start, end, GFP_NOFS);
+                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
        }
        if (log->log_transid > 0) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7eda483d7b5a..198cff28766d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                max_chunk_size = 10 * calc_size;
                min_stripe_size = 64 * 1024 * 1024;
        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-                max_chunk_size = 4 * calc_size;
+                max_chunk_size = 256 * 1024 * 1024;
                min_stripe_size = 32 * 1024 * 1024;
        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
                calc_size = 8 * 1024 * 1024;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index b6dd5967c48a..193b58f7d3f3 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -85,22 +85,23 @@ out:
        return ret;
 }
-int __btrfs_setxattr(struct inode *inode, const char *name,
+static int do_setxattr(struct btrfs_trans_handle *trans,
-                            const void *value, size_t size, int flags)
+                       struct inode *inode, const char *name,
+                       const void *value, size_t size, int flags)
 {
        struct btrfs_dir_item *di;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct btrfs_trans_handle *trans;
        struct btrfs_path *path;
-        int ret = 0, mod = 0;
+        size_t name_len = strlen(name);
+        int ret = 0;
+        if (name_len + size > BTRFS_MAX_XATTR_SIZE(root))
+                return -ENOSPC;
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
-        trans = btrfs_join_transaction(root, 1);
-        btrfs_set_trans_block_group(trans, inode);
        /* first lets see if we already have this xattr */
        di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name,
                                strlen(name), -1);
@@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
                }
                ret = btrfs_delete_one_dir_name(trans, root, path, di);
-                if (ret)
+                BUG_ON(ret);
-                        goto out;
                btrfs_release_path(root, path);
                /* if we don't have a value then we are removing the xattr */
-                if (!value) {
+                if (!value)
-                        mod = 1;
                        goto out;
-                }
        } else {
                btrfs_release_path(root, path);
@@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
        }
        /* ok we have to create a completely new xattr */
-        ret = btrfs_insert_xattr_item(trans, root, name, strlen(name),
+        ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino,
-                                      value, size, inode->i_ino);
+                                      name, name_len, value, size);
+        BUG_ON(ret);
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+int __btrfs_setxattr(struct btrfs_trans_handle *trans,
+                     struct inode *inode, const char *name,
+                     const void *value, size_t size, int flags)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int ret;
+        if (trans)
+                return do_setxattr(trans, inode, name, value, size, flags);
+        ret = btrfs_reserve_metadata_space(root, 2);
        if (ret)
-                goto out;
+                return ret;
-        mod = 1;
-out:
+        trans = btrfs_start_transaction(root, 1);
-        if (mod) {
+        if (!trans) {
-                inode->i_ctime = CURRENT_TIME;
+                ret = -ENOMEM;
-                ret = btrfs_update_inode(trans, root, inode);
+                goto out;
        }
+        btrfs_set_trans_block_group(trans, inode);
-        btrfs_end_transaction(trans, root);
+        ret = do_setxattr(trans, inode, name, value, size, flags);
-        btrfs_free_path(path);
+        if (ret)
+                goto out;
+        inode->i_ctime = CURRENT_TIME;
+        ret = btrfs_update_inode(trans, root, inode);
+        BUG_ON(ret);
+out:
+        btrfs_end_transaction_throttle(trans, root);
+        btrfs_unreserve_metadata_space(root, 2);
        return ret;
 }
@@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
        if (size == 0)
                value = "";  /* empty EA, do not remove */
-        return __btrfs_setxattr(dentry->d_inode, name, value, size, flags);
+        return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size,
+                                flags);
 }
 int btrfs_removexattr(struct dentry *dentry, const char *name)
@@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
        if (!btrfs_is_valid_xattr(name))
                return -EOPNOTSUPP;
-        return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
+        return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
+                                XATTR_REPLACE);
 }
-int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
+int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+                              struct inode *inode, struct inode *dir)
 {
        int err;
        size_t len;
@@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
        } else {
                strcpy(name, XATTR_SECURITY_PREFIX);
                strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-                err = __btrfs_setxattr(inode, name, value, len, 0);
+                err = __btrfs_setxattr(trans, inode, name, value, len, 0);
                kfree(name);
        }
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index c71e9c3cf3f7..721efa0346e0 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr_handlers[];
 extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
                void *buffer, size_t size);
-extern int __btrfs_setxattr(struct inode *inode, const char *name,
+extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
-                const void *value, size_t size, int flags);
+                            struct inode *inode, const char *name,
+                            const void *value, size_t size, int flags);
 extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
                void *buffer, size_t size);
 extern int btrfs_setxattr(struct dentry *dentry, const char *name,
                const void *value, size_t size, int flags);
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
-extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
+extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+                                     struct inode *inode, struct inode *dir);
 #endif /* __XATTR__ */
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 3797e0077b35..2906077ac798 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -84,7 +84,7 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
 static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
 {
        struct cachefiles_object *fsdef;
-        struct nameidata nd;
+        struct path path;
        struct kstatfs stats;
        struct dentry *graveyard, *cachedir, *root;
        const struct cred *saved_cred;
@@ -114,15 +114,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
        _debug("- fsdef %p", fsdef);
        /* look up the directory at the root of the cache */
-        memset(&nd, 0, sizeof(nd));
+        ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path);
-        ret = path_lookup(cache->rootdirname, LOOKUP_DIRECTORY, &nd);
        if (ret < 0)
                goto error_open_root;
-        cache->mnt = mntget(nd.path.mnt);
+        cache->mnt = path.mnt;
-        root = dget(nd.path.dentry);
+        root = path.dentry;
-        path_put(&nd.path);
        /* check parameters */
        ret = -EOPNOTSUPP;
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index a6c8c6fe8df9..1d8332563863 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -11,7 +11,6 @@
 #include <linux/mount.h>
 #include <linux/file.h>
-#include <linux/ima.h>
 #include "internal.h"
 /*
@@ -923,7 +922,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
        } else {
-                ima_counts_get(file);
                ret = -EIO;
                if (file->f_op->write) {
                        pos = (loff_t) page->index << PAGE_SHIFT;
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 094ea65afc85..7b2600b380d7 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -5,7 +5,9 @@ have duplicated data). Fix oops in cifs_lookup. Workaround problem
 mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
 Disable use of server inode numbers when server only
 partially supports them (e.g. for one server querying inode numbers on
-FindFirst fails but QPathInfo queries works).
+FindFirst fails but QPathInfo queries work). Fix oops with dfs in
+cifs_put_smb_ses. Fix mmap to work on directio mounts (needed e.g.
+for OpenOffice on a forcedirectio mount).
 Version 1.60
 -------------
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 29f1da761bbf..8c6a03627176 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -758,7 +758,7 @@ const struct file_operations cifs_file_ops = {
 };
 const struct file_operations cifs_file_direct_ops = {
-        /* no mmap, no aio, no readv -
+        /* no aio, no readv -
           BB reevaluate whether they can be done with directio, no cache */
        .read = cifs_user_read,
        .write = cifs_user_write,
@@ -767,6 +767,7 @@ const struct file_operations cifs_file_direct_ops = {
        .lock = cifs_lock,
        .fsync = cifs_fsync,
        .flush = cifs_flush,
+        .mmap = cifs_file_mmap,
        .splice_read = generic_file_splice_read,
 #ifdef CONFIG_CIFS_POSIX
        .unlocked_ioctl  = cifs_ioctl,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 63ea83ff687f..3bbcaa716b3c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2287,12 +2287,12 @@ int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
                char *mount_data_global, const char *devname)
 {
-        int rc = 0;
+        int rc;
        int xid;
        struct smb_vol *volume_info;
-        struct cifsSesInfo *pSesInfo = NULL;
+        struct cifsSesInfo *pSesInfo;
-        struct cifsTconInfo *tcon = NULL;
+        struct cifsTconInfo *tcon;
-        struct TCP_Server_Info *srvTcp = NULL;
+        struct TCP_Server_Info *srvTcp;
        char   *full_path;
        char *mount_data = mount_data_global;
 #ifdef CONFIG_CIFS_DFS_UPCALL
@@ -2301,6 +2301,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
        int referral_walks_count = 0;
 try_mount_again:
 #endif
+        rc = 0;
+        tcon = NULL;
+        pSesInfo = NULL;
+        srvTcp = NULL;
        full_path = NULL;
        xid = GetXid();
@@ -2597,6 +2601,7 @@ remote_path_check:
                        cleanup_volume_info(&volume_info);
                        referral_walks_count++;
+                        FreeXid(xid);
                        goto try_mount_again;
                }
 #else /* No DFS support, return error on mount */
diff --git a/fs/cifs/export.c b/fs/cifs/export.c
index 75949d6a5f1b..6177f7cca16a 100644
--- a/fs/cifs/export.c
+++ b/fs/cifs/export.c
@@ -24,7 +24,7 @@
 */
 /*
-  * See Documentation/filesystems/Exporting
+  * See Documentation/filesystems/nfs/Exporting
  * and examples in fs/exportfs
  *
  * Since cifs is a network file system, an "fsid" must be included for
diff --git a/fs/compat.c b/fs/compat.c
index 6c19040ffeef..00d90c2e66f0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -38,8 +38,6 @@
 #include <linux/dirent.h>
 #include <linux/fsnotify.h>
 #include <linux/highuid.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/personality.h>
 #include <linux/rwsem.h>
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 14cbc831422a..332dd00f0894 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1600,8 +1600,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
        case KDSKBMETA:
        case KDSKBLED:
        case KDSETLED:
-        /* SG stuff */
-        case SG_SET_TRANSFORM:
        /* AUTOFS */
        case AUTOFS_IOC_READY:
        case AUTOFS_IOC_FAIL:
diff --git a/fs/dcache.c b/fs/dcache.c
index a100fa35a48f..953173a293a9 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -978,6 +978,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
        q.hash = full_name_hash(q.name, q.len);
        return d_alloc(parent, &q);
 }
+EXPORT_SYMBOL(d_alloc_name);
 /* the caller must hold dcache_lock */
 static void __d_instantiate(struct dentry *dentry, struct inode *inode)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b912270942fa..e82adc2debb7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -53,13 +53,6 @@
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
- *
- * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems.
- * This determines whether we need to do the fancy locking which prevents
- * direct-IO from being able to read uninitialised disk blocks.  If its zero
- * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_mutex is
- * not held for the entire direct write (taken briefly, initially, during a
- * direct read though, but its never held for the duration of a direct-IO).
 */
 struct dio {
@@ -68,7 +61,7 @@ struct dio {
        struct inode *inode;
        int rw;
        loff_t i_size;                  /* i_size when submitted */
-        int lock_type;                  /* doesn't change */
+        int flags;                      /* doesn't change */
        unsigned blkbits;               /* doesn't change */
        unsigned blkfactor;             /* When we're using an alignment which
                                           is finer than the filesystem's soft
@@ -104,6 +97,18 @@ struct dio {
        unsigned cur_page_len;          /* Nr of bytes at cur_page_offset */
        sector_t cur_page_block;        /* Where it starts */
+        /* BIO completion state */
+        spinlock_t bio_lock;            /* protects BIO fields below */
+        unsigned long refcount;         /* direct_io_worker() and bios */
+        struct bio *bio_list;           /* singly linked via bi_private */
+        struct task_struct *waiter;     /* waiting task (NULL if none) */
+        /* AIO related stuff */
+        struct kiocb *iocb;             /* kiocb */
+        int is_async;                   /* is IO async ? */
+        int io_error;                   /* IO error in completion path */
+        ssize_t result;                 /* IO result */
        /*
         * Page fetching state. These variables belong to dio_refill_pages().
         */
@@ -115,22 +120,16 @@ struct dio {
         * Page queue.  These variables belong to dio_refill_pages() and
         * dio_get_page().
         */
-        struct page *pages[DIO_PAGES];  /* page buffer */
        unsigned head;                  /* next page to process */
        unsigned tail;                  /* last valid page + 1 */
        int page_errors;                /* errno from get_user_pages() */
-        /* BIO completion state */
+        /*
-        spinlock_t bio_lock;            /* protects BIO fields below */
+         * pages[] (and any fields placed after it) are not zeroed out at
-        unsigned long refcount;         /* direct_io_worker() and bios */
+         * allocation time.  Don't add new fields after pages[] unless you
-        struct bio *bio_list;           /* singly linked via bi_private */
+         * wish that they not be zeroed.
-        struct task_struct *waiter;     /* waiting task (NULL if none) */
+         */
+        struct page *pages[DIO_PAGES];  /* page buffer */
-        /* AIO related stuff */
-        struct kiocb *iocb;             /* kiocb */
-        int is_async;                   /* is IO async ? */
-        int io_error;                   /* IO error in completion path */
-        ssize_t result;                 /* IO result */
 };
 /*
@@ -240,7 +239,8 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
        if (dio->end_io && dio->result)
                dio->end_io(dio->iocb, offset, transferred,
                            dio->map_bh.b_private);
-        if (dio->lock_type == DIO_LOCKING)
+        if (dio->flags & DIO_LOCKING)
                /* lockdep: non-owner release */
                up_read_non_owner(&dio->inode->i_alloc_sem);
@@ -515,21 +515,24 @@ static int get_more_blocks(struct dio *dio)
                map_bh->b_state = 0;
                map_bh->b_size = fs_count << dio->inode->i_blkbits;
+                /*
+                 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
+                 * forbid block creations: only overwrites are permitted.
+                 * We will return early to the caller once we see an
+                 * unmapped buffer head returned, and the caller will fall
+                 * back to buffered I/O.
+                 *
+                 * Otherwise the decision is left to the get_blocks method,
+                 * which may decide to handle it or also return an unmapped
+                 * buffer head.
+                 */
                create = dio->rw & WRITE;
-                if (dio->lock_type == DIO_LOCKING) {
+                if (dio->flags & DIO_SKIP_HOLES) {
                        if (dio->block_in_file < (i_size_read(dio->inode) >>
                                                        dio->blkbits))
                                create = 0;
-                } else if (dio->lock_type == DIO_NO_LOCKING) {
-                        create = 0;
                }
-                /*
-                 * For writes inside i_size we forbid block creations: only
-                 * overwrites are permitted.  We fall back to buffered writes
-                 * at a higher level for inside-i_size block-instantiating
-                 * writes.
-                 */
                ret = (*dio->get_block)(dio->inode, fs_startblk,
                                                map_bh, create);
        }
@@ -1039,7 +1042,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         * we can let i_mutex go now that its achieved its purpose
         * of protecting us from looking up uninitialized blocks.
         */
-        if ((rw == READ) && (dio->lock_type == DIO_LOCKING))
+        if (rw == READ && (dio->flags & DIO_LOCKING))
                mutex_unlock(&dio->inode->i_mutex);
        /*
@@ -1086,30 +1089,28 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 /*
 * This is a library function for use by filesystem drivers.
- * The locking rules are governed by the dio_lock_type parameter.
 *
- * DIO_NO_LOCKING (no locking, for raw block device access)
+ * The locking rules are governed by the flags parameter:
- * For writes, i_mutex is not held on entry; it is never taken.
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
 *
- * DIO_LOCKING (simple locking for regular files)
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- * For writes we are called under i_mutex and return with i_mutex held, even
+ *    internal locking but rather rely on the filesystem to synchronize
- * though it is internally dropped.
+ *    direct I/O reads/writes versus each other and truncate.
- * For reads, i_mutex is not held on entry, but it is taken and dropped before
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
- * returning.
+ *    entry and are never taken.
- *
- * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of
- *      uninitialised data, allowing parallel direct readers and writers)
- * For writes we are called without i_mutex, return without it, never touch it.
- * For reads we are called under i_mutex and return with i_mutex held, even
- * though it may be internally dropped.
- *
- * Additional i_alloc_sem locking requirements described inline below.
 */
 ssize_t
 __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        struct block_device *bdev, const struct iovec *iov, loff_t offset, 
        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-        int dio_lock_type)
+        int flags)
 {
        int seg;
        size_t size;
@@ -1120,8 +1121,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        ssize_t retval = -EINVAL;
        loff_t end = offset;
        struct dio *dio;
-        int release_i_mutex = 0;
-        int acquire_i_mutex = 0;
        if (rw & WRITE)
                rw = WRITE_ODIRECT_PLUG;
@@ -1151,48 +1150,41 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                }
        }
-        dio = kzalloc(sizeof(*dio), GFP_KERNEL);
+        dio = kmalloc(sizeof(*dio), GFP_KERNEL);
        retval = -ENOMEM;
        if (!dio)
                goto out;
        /*
-         * For block device access DIO_NO_LOCKING is used,
+         * Believe it or not, zeroing out the page array caused a .5%
-         *      neither readers nor writers do any locking at all
+         * performance regression in a database benchmark.  So, we take
-         * For regular files using DIO_LOCKING,
+         * care to only zero out what's needed.
-         *      readers need to grab i_mutex and i_alloc_sem
-         *      writers need to grab i_alloc_sem only (i_mutex is already held)
-         * For regular files using DIO_OWN_LOCKING,
-         *      neither readers nor writers take any locks here
         */
-        dio->lock_type = dio_lock_type;
+        memset(dio, 0, offsetof(struct dio, pages));
-        if (dio_lock_type != DIO_NO_LOCKING) {
+        dio->flags = flags;
+        if (dio->flags & DIO_LOCKING) {
                /* watch out for a 0 len io from a tricksy fs */
                if (rw == READ && end > offset) {
-                        struct address_space *mapping;
+                        struct address_space *mapping =
+                                        iocb->ki_filp->f_mapping;
-                        mapping = iocb->ki_filp->f_mapping;
+                        /* will be released by direct_io_worker */
-                        if (dio_lock_type != DIO_OWN_LOCKING) {
+                        mutex_lock(&inode->i_mutex);
-                                mutex_lock(&inode->i_mutex);
-                                release_i_mutex = 1;
-                        }
                        retval = filemap_write_and_wait_range(mapping, offset,
                                                              end - 1);
                        if (retval) {
+                                mutex_unlock(&inode->i_mutex);
                                kfree(dio);
                                goto out;
                        }
-                        if (dio_lock_type == DIO_OWN_LOCKING) {
-                                mutex_unlock(&inode->i_mutex);
-                                acquire_i_mutex = 1;
-                        }
                }
-                if (dio_lock_type == DIO_LOCKING)
+                /*
-                        /* lockdep: not the owner will release it */
+                 * Will be released at I/O completion, possibly in a
-                        down_read_non_owner(&inode->i_alloc_sem);
+                 * different thread.
+                 */
+                down_read_non_owner(&inode->i_alloc_sem);
        }
        /*
@@ -1210,24 +1202,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        /*
         * In case of error extending write may have instantiated a few
         * blocks outside i_size. Trim these off again for DIO_LOCKING.
-         * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by
+         *
-         * it's own meaner.
+         * NOTE: filesystems with their own locking have to handle this
+         * on their own.
         */
-        if (unlikely(retval < 0 && (rw & WRITE))) {
+        if (flags & DIO_LOCKING) {
-                loff_t isize = i_size_read(inode);
+                if (unlikely((rw & WRITE) && retval < 0)) {
+                        loff_t isize = i_size_read(inode);
-                if (end > isize && dio_lock_type == DIO_LOCKING)
+                        if (end > isize)
-                        vmtruncate(inode, isize);
+                                vmtruncate(inode, isize);
+                }
        }
-        if (rw == READ && dio_lock_type == DIO_LOCKING)
-                release_i_mutex = 0;
 out:
-        if (release_i_mutex)
-                mutex_unlock(&inode->i_mutex);
-        else if (acquire_i_mutex)
-                mutex_lock(&inode->i_mutex);
        return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 2dda5ade75bc..8f006a0d6076 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -62,7 +62,7 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
                struct inode *lower_inode =
                        ecryptfs_inode_to_lower(dentry->d_inode);
-                fsstack_copy_attr_all(dentry->d_inode, lower_inode, NULL);
+                fsstack_copy_attr_all(dentry->d_inode, lower_inode);
        }
 out:
        return rc;
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 056fed62d0de..429ca0b3ba08 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -626,9 +626,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        lower_new_dir_dentry->d_inode, lower_new_dentry);
        if (rc)
                goto out_lock;
-        fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode, NULL);
+        fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode);
        if (new_dir != old_dir)
-                fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode, NULL);
+                fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode);
 out_lock:
        unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
        dput(lower_new_dentry->d_parent);
@@ -967,7 +967,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
        rc = notify_change(lower_dentry, ia);
        mutex_unlock(&lower_dentry->d_inode->i_mutex);
 out:
-        fsstack_copy_attr_all(inode, lower_inode, NULL);
+        fsstack_copy_attr_all(inode, lower_inode);
        return rc;
 }
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index c6ac85d6c701..567bc4b9f70a 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -35,7 +35,6 @@
 #include <linux/key.h>
 #include <linux/parser.h>
 #include <linux/fs_stack.h>
-#include <linux/ima.h>
 #include "ecryptfs_kernel.h"
 /**
@@ -119,7 +118,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
        const struct cred *cred = current_cred();
        struct ecryptfs_inode_info *inode_info =
                ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
-        int opened_lower_file = 0;
        int rc = 0;
        mutex_lock(&inode_info->lower_file_mutex);
@@ -136,12 +134,9 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
                               "for lower_dentry [0x%p] and lower_mnt [0x%p]; "
                               "rc = [%d]\n", lower_dentry, lower_mnt, rc);
                        inode_info->lower_file = NULL;
-                } else
+                }
-                        opened_lower_file = 1;
        }
        mutex_unlock(&inode_info->lower_file_mutex);
-        if (opened_lower_file)
-                ima_counts_get(inode_info->lower_file);
        return rc;
 }
@@ -194,7 +189,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry,
                init_special_inode(inode, lower_inode->i_mode,
                                   lower_inode->i_rdev);
        dentry->d_op = &ecryptfs_dops;
-        fsstack_copy_attr_all(inode, lower_inode, NULL);
+        fsstack_copy_attr_all(inode, lower_inode);
        /* This size will be overwritten for real files w/ headers and
         * other metadata */
        fsstack_copy_inode_size(inode, lower_inode);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8b47e4200e65..d26402ff06ea 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -339,7 +339,7 @@ struct file *eventfd_file_create(unsigned int count, int flags)
        ctx->flags = flags;
        file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx,
-                                  flags & EFD_SHARED_FCNTL_FLAGS);
+                                  O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS));
        if (IS_ERR(file))
                eventfd_free_ctx(ctx);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 366c503f9657..bd056a5b4efc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1206,7 +1206,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
         * a file structure and a free file descriptor.
         */
        error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
-                                 flags & O_CLOEXEC);
+                                 O_RDWR | (flags & O_CLOEXEC));
        if (error < 0)
                ep_free(ep);
diff --git a/fs/exec.c b/fs/exec.c
index 623a5cc3076a..632b02e34ec7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -826,7 +826,9 @@ static int de_thread(struct task_struct *tsk)
                attach_pid(tsk, PIDTYPE_PID,  task_pid(leader));
                transfer_pid(leader, tsk, PIDTYPE_PGID);
                transfer_pid(leader, tsk, PIDTYPE_SID);
                list_replace_rcu(&leader->tasks, &tsk->tasks);
+                list_replace_init(&leader->sibling, &tsk->sibling);
                tsk->group_leader = tsk;
                leader->group_leader = tsk;
@@ -1761,17 +1763,20 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        struct mm_struct *mm = current->mm;
        struct linux_binfmt * binfmt;
        struct inode * inode;
-        struct file * file;
        const struct cred *old_cred;
        struct cred *cred;
        int retval = 0;
        int flag = 0;
        int ispipe = 0;
-        unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur;
        char **helper_argv = NULL;
        int helper_argc = 0;
        int dump_count = 0;
        static atomic_t core_dump_count = ATOMIC_INIT(0);
+        struct coredump_params cprm = {
+                .signr = signr,
+                .regs = regs,
+                .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur,
+        };
        audit_core_dumps(signr);
@@ -1827,15 +1832,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        ispipe = format_corename(corename, signr);
        unlock_kernel();
-        if ((!ispipe) && (core_limit < binfmt->min_coredump))
+        if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
                goto fail_unlock;
        if (ispipe) {
-                if (core_limit == 0) {
+                if (cprm.limit == 0) {
                        /*
                         * Normally core limits are irrelevant to pipes, since
                         * we're not writing to the file system, but we use
-                         * core_limit of 0 here as a speacial value. Any
+                         * cprm.limit of 0 here as a special value. Any
                         * non-zero limit gets set to RLIM_INFINITY below, but
                         * a limit of 0 skips the dump.  This is a consistent
                         * way to catch recursive crashes.  We can still crash
@@ -1868,25 +1873,25 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                        goto fail_dropcount;
                }
-                core_limit = RLIM_INFINITY;
+                cprm.limit = RLIM_INFINITY;
                /* SIGPIPE can happen, but it's just never processed */
                if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
-                                &file)) {
+                                &cprm.file)) {
                        printk(KERN_INFO "Core dump to %s pipe failed\n",
                               corename);
                        goto fail_dropcount;
                }
        } else
-                file = filp_open(corename,
+                cprm.file = filp_open(corename,
                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
                                 0600);
-        if (IS_ERR(file))
+        if (IS_ERR(cprm.file))
                goto fail_dropcount;
-        inode = file->f_path.dentry->d_inode;
+        inode = cprm.file->f_path.dentry->d_inode;
        if (inode->i_nlink > 1)
                goto close_fail;        /* multiple links - don't dump */
-        if (!ispipe && d_unhashed(file->f_path.dentry))
+        if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
                goto close_fail;
        /* AK: actually i see no reason to not allow this for named pipes etc.,
@@ -1899,21 +1904,22 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
         */
        if (inode->i_uid != current_fsuid())
                goto close_fail;
-        if (!file->f_op)
+        if (!cprm.file->f_op)
                goto close_fail;
-        if (!file->f_op->write)
+        if (!cprm.file->f_op->write)
                goto close_fail;
-        if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0)
+        if (!ispipe &&
+            do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
                goto close_fail;
-        retval = binfmt->core_dump(signr, regs, file, core_limit);
+        retval = binfmt->core_dump(&cprm);
        if (retval)
                current->signal->group_exit_code |= 0x80;
 close_fail:
        if (ispipe && core_pipe_limit)
-                wait_for_dump_helpers(file);
+                wait_for_dump_helpers(cprm.file);
-        filp_close(file, NULL);
+        filp_close(cprm.file, NULL);
 fail_dropcount:
        if (dump_count)
                atomic_dec(&core_dump_count);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 698a8636d39c..2afbcebeda71 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -738,13 +738,28 @@ static int exofs_write_begin_export(struct file *file,
                                        fsdata);
 }
+static int exofs_write_end(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned copied,
+                        struct page *page, void *fsdata)
+{
+        struct inode *inode = mapping->host;
+        /*
+         * Wrapper around simple_write_end() that marks the inode dirty
+         * whenever the call changed i_size.  All arguments are passed
+         * through unchanged and simple_write_end()'s return value is
+         * returned as-is.
+         *
+         * i_size is sampled before the call so it can be compared
+         * afterwards.  According to the comment in simple_write_end
+         * i_mutex is held (presumably that keeps the size stable against
+         * concurrent writers - confirm against simple_write_end).
+         */
+        loff_t i_size = inode->i_size;
+        int ret;
+        ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata);
+        if (i_size != inode->i_size)
+                mark_inode_dirty(inode);
+        return ret;
+}
 const struct address_space_operations exofs_aops = {
        .readpage       = exofs_readpage,
        .readpages      = exofs_readpages,
        .writepage      = exofs_writepage,
        .writepages     = exofs_writepages,
        .write_begin    = exofs_write_begin_export,
-        .write_end      = simple_write_end,
+        .write_end      = exofs_write_end,
 };
 /******************************************************************************
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
index 423033addd1f..c52e9888b8ab 100644
--- a/fs/exofs/pnfs.h
+++ b/fs/exofs/pnfs.h
@@ -15,13 +15,7 @@
 #ifndef __EXOFS_PNFS_H__
 #define __EXOFS_PNFS_H__
-#if defined(CONFIG_PNFS)
+#if ! defined(__PNFS_OSD_XDR_H__)
-/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
-#include "../nfs/objlayout/pnfs_osd_xdr.h"
-#else /* defined(CONFIG_PNFS) */
 enum pnfs_iomode {
        IOMODE_READ = 1,
@@ -46,6 +40,6 @@ struct pnfs_osd_data_map {
        u32     odm_raid_algorithm;
 };
-#endif /* else defined(CONFIG_PNFS) */
+#endif /* ! defined(__PNFS_OSD_XDR_H__) */
 #endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 197c7db583c7..e9e175949a63 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -6,7 +6,7 @@
 * and for mapping back from file handles to dentries.
 *
 * For details on why we do all the strange and hairy things in here
- * take a look at Documentation/filesystems/Exporting.
+ * take a look at Documentation/filesystems/nfs/Exporting.
 */
 #include <linux/exportfs.h>
 #include <linux/fs.h>
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a63d44256a70..a99e54318c3d 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -339,12 +339,12 @@ ext2_acl_chmod(struct inode *inode)
 * Extended attribut handlers
 */
 static size_t
-ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size,
+ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size,
-                           const char *name, size_t name_len)
+                           const char *name, size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return 0;
        if (list && size <= list_size)
                memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -352,12 +352,12 @@ ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size,
 }
 static size_t
-ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size,
+ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size,
-                            const char *name, size_t name_len)
+                            const char *name, size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return 0;
        if (list && size <= list_size)
                memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -365,15 +365,18 @@ ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+                   size_t size, int type)
 {
        struct posix_acl *acl;
        int error;
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return -EOPNOTSUPP;
-        acl = ext2_get_acl(inode, type);
+        acl = ext2_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl == NULL)
@@ -385,33 +388,17 @@ ext2_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 }
 static int
-ext2_xattr_get_acl_access(struct inode *inode, const char *name,
+ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-                          void *buffer, size_t size)
+                   size_t size, int flags, int type)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-static int
-ext2_xattr_get_acl_default(struct inode *inode, const char *name,
-                           void *buffer, size_t size)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-static int
-ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
-                   size_t size)
 {
        struct posix_acl *acl;
        int error;
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return -EOPNOTSUPP;
-        if (!is_owner_or_cap(inode))
+        if (!is_owner_or_cap(dentry->d_inode))
                return -EPERM;
        if (value) {
@@ -426,41 +413,25 @@ ext2_xattr_set_acl(struct inode *inode, int type, const void *value,
        } else
                acl = NULL;
-        error = ext2_set_acl(inode, type, acl);
+        error = ext2_set_acl(dentry->d_inode, type, acl);
 release_and_out:
        posix_acl_release(acl);
        return error;
 }
-static int
-ext2_xattr_set_acl_access(struct inode *inode, const char *name,
-                          const void *value, size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-static int
-ext2_xattr_set_acl_default(struct inode *inode, const char *name,
-                           const void *value, size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
 struct xattr_handler ext2_xattr_acl_access_handler = {
        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags  = ACL_TYPE_ACCESS,
        .list   = ext2_xattr_list_acl_access,
-        .get    = ext2_xattr_get_acl_access,
+        .get    = ext2_xattr_get_acl,
-        .set    = ext2_xattr_set_acl_access,
+        .set    = ext2_xattr_set_acl,
 };
 struct xattr_handler ext2_xattr_acl_default_handler = {
        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags  = ACL_TYPE_DEFAULT,
        .list   = ext2_xattr_list_acl_default,
-        .get    = ext2_xattr_get_acl_default,
+        .get    = ext2_xattr_get_acl,
-        .set    = ext2_xattr_set_acl_default,
+        .set    = ext2_xattr_set_acl,
 };
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index fc2bd05d3559..7516957273ed 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -721,5 +721,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ext2_compat_ioctl,
 #endif
-        .fsync          = simple_fsync,
+        .fsync          = ext2_fsync,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index da318b0fa637..061914add3cf 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -155,6 +155,7 @@ extern void ext2_write_super (struct super_block *);
 extern const struct file_operations ext2_dir_operations;
 /* file.c */
+extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a2f3afd1a1c1..586e3589d4c2 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -19,6 +19,7 @@
 */
 #include <linux/time.h>
+#include <linux/pagemap.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -38,6 +39,22 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
        return 0;
 }
+/*
+ * ext2_fsync - fsync handler that also reports metadata write errors.
+ *
+ * Delegates the actual sync to simple_fsync() and then checks two error
+ * sources: an -EIO result from simple_fsync() itself, and the AS_EIO flag
+ * on the page-cache mapping of the block device backing this superblock
+ * (the flag is cleared as it is tested).  If either is set, the error is
+ * reported through ext2_error() and -EIO is returned; otherwise the
+ * result of simple_fsync() is returned unchanged.
+ */
+int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+        int ret;
+        struct super_block *sb = dentry->d_inode->i_sb;
+        /* Mapping of the underlying block device's inode, not of the file. */
+        struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
+        ret = simple_fsync(file, dentry, datasync);
+        if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
+                /* We don't really know where the IO error happened... */
+                ext2_error(sb, __func__,
+                           "detected IO error when writing metadata buffers");
+                ret = -EIO;
+        }
+        return ret;
+}
 /*
 * We have mostly NULL's here: the current defaults are ok for
 * the ext2 filesystem.
@@ -55,7 +72,7 @@ const struct file_operations ext2_file_operations = {
        .mmap           = generic_file_mmap,
        .open           = generic_file_open,
        .release        = ext2_release_file,
-        .fsync          = simple_fsync,
+        .fsync          = ext2_fsync,
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
 };
@@ -72,7 +89,7 @@ const struct file_operations ext2_xip_file_operations = {
        .mmap           = xip_file_mmap,
        .open           = generic_file_open,
        .release        = ext2_release_file,
-        .fsync          = simple_fsync,
+        .fsync          = ext2_fsync,
 };
 #endif
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1388802b7803..f9cb54a585ce 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1105,9 +1105,30 @@ failed_sbi:
        return ret;
 }
+/*
+ * ext2_clear_super_error - reset a stale write-error state on the
+ * superblock buffer before it is written again.
+ *
+ * If the superblock buffer head (EXT2_SB(sb)->s_sbh) carries the
+ * write-I/O-error flag, log the condition, clear the flag and mark the
+ * buffer up to date again.  Called by ext2_commit_super() and
+ * ext2_sync_super() before they modify the superblock.
+ */
+static void ext2_clear_super_error(struct super_block *sb)
+{
+        struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
+        if (buffer_write_io_error(sbh)) {
+                /*
+                 * Oh, dear.  A previous attempt to write the
+                 * superblock failed.  This could happen because the
+                 * USB device was yanked out.  Or it could happen to
+                 * be a transient write error and maybe the block will
+                 * be remapped.  Nothing we can do but to retry the
+                 * write and hope for the best.
+                 */
+                /* Terminate the message with '\n' so the log record is
+                 * complete and does not run into the next printk. */
+                printk(KERN_ERR "EXT2-fs: %s previous I/O error to "
+                       "superblock detected\n", sb->s_id);
+                clear_buffer_write_io_error(sbh);
+                /* NOTE(review): presumably needed so the retried write
+                 * is accepted -- confirm against buffer-head rules. */
+                set_buffer_uptodate(sbh);
+        }
+}
 static void ext2_commit_super (struct super_block * sb,
                               struct ext2_super_block * es)
 {
+        ext2_clear_super_error(sb);
        es->s_wtime = cpu_to_le32(get_seconds());
        mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
        sb->s_dirt = 0;
@@ -1115,6 +1136,7 @@ static void ext2_commit_super (struct super_block * sb,
 static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
 {
+        ext2_clear_super_error(sb);
        es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
        es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
        es->s_wtime = cpu_to_le32(get_seconds());
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 7913531ec6d5..904f00642f84 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -60,6 +60,7 @@
 #include <linux/mbcache.h>
 #include <linux/quotaops.h>
 #include <linux/rwsem.h>
+#include <linux/security.h>
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
@@ -249,8 +250,9 @@ cleanup:
 * used / required on success.
 */
 static int
-ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+        struct inode *inode = dentry->d_inode;
        struct buffer_head *bh = NULL;
        struct ext2_xattr_entry *entry;
        char *end;
@@ -300,9 +302,10 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
                        ext2_xattr_handler(entry->e_name_index);
                if (handler) {
-                        size_t size = handler->list(inode, buffer, rest,
+                        size_t size = handler->list(dentry, buffer, rest,
                                                    entry->e_name,
-                                                    entry->e_name_len);
+                                                    entry->e_name_len,
+                                                    handler->flags);
                        if (buffer) {
                                if (size > rest) {
                                        error = -ERANGE;
@@ -330,7 +333,7 @@ cleanup:
 ssize_t
 ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-        return ext2_xattr_list(dentry->d_inode, buffer, size);
+        return ext2_xattr_list(dentry, buffer, size);
 }
 /*
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 70c0dbdcdcb7..c8155845ac05 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -11,8 +11,8 @@
 #include "xattr.h"
 static size_t
-ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size,
+ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
-                         const char *name, size_t name_len)
+                         const char *name, size_t name_len, int type)
 {
        const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
@@ -26,22 +26,22 @@ ext2_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext2_xattr_security_get(struct inode *inode, const char *name,
+ext2_xattr_security_get(struct dentry *dentry, const char *name,
-                       void *buffer, size_t size)
+                       void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext2_xattr_get(inode, EXT2_XATTR_INDEX_SECURITY, name,
+        return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
                              buffer, size);
 }
 static int
-ext2_xattr_security_set(struct inode *inode, const char *name,
+ext2_xattr_security_set(struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name,
+        return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name,
                              value, size, flags);
 }
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index e8219f8eae9f..2a26d71f4771 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -13,8 +13,8 @@
 #include "xattr.h"
 static size_t
-ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
+ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
-                        const char *name, size_t name_len)
+                const char *name, size_t name_len, int type)
 {
        const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
@@ -31,22 +31,22 @@ ext2_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext2_xattr_trusted_get(struct inode *inode, const char *name,
+ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
-                       void *buffer, size_t size)
+                void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext2_xattr_get(inode, EXT2_XATTR_INDEX_TRUSTED, name,
+        return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
                              buffer, size);
 }
 static int
-ext2_xattr_trusted_set(struct inode *inode, const char *name,
+ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name,
+        return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name,
                              value, size, flags);
 }
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 92495d28c62f..3f6caf3684b4 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -12,13 +12,13 @@
 #include "xattr.h"
 static size_t
-ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size,
+ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
-                     const char *name, size_t name_len)
+                const char *name, size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return 0;
        if (list && total_len <= list_size) {
@@ -30,27 +30,28 @@ ext2_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext2_xattr_user_get(struct inode *inode, const char *name,
+ext2_xattr_user_get(struct dentry *dentry, const char *name,
-                    void *buffer, size_t size)
+                void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return -EOPNOTSUPP;
-        return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name, buffer, size);
+        return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER,
+                              name, buffer, size);
 }
 static int
-ext2_xattr_user_set(struct inode *inode, const char *name,
+ext2_xattr_user_set(struct dentry *dentry, const char *name,
-                    const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return -EOPNOTSUPP;
-        return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
+        return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER,
-                              value, size, flags);
+                              name, value, size, flags);
 }
 struct xattr_handler ext2_xattr_user_handler = {
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index c9b0df376b5f..82ba34158661 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -366,12 +366,12 @@ out:
 * Extended attribute handlers
 */
 static size_t
-ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
+ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
-                           const char *name, size_t name_len)
+                           const char *name, size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return 0;
        if (list && size <= list_len)
                memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -379,12 +379,12 @@ ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
 }
 static size_t
-ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
+ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
-                            const char *name, size_t name_len)
+                            const char *name, size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return 0;
        if (list && size <= list_len)
                memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -392,15 +392,18 @@ ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
 }
 static int
-ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+                   size_t size, int type)
 {
        struct posix_acl *acl;
        int error;
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return -EOPNOTSUPP;
-        acl = ext3_get_acl(inode, type);
+        acl = ext3_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl == NULL)
@@ -412,31 +415,16 @@ ext3_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 }
 static int
-ext3_xattr_get_acl_access(struct inode *inode, const char *name,
+ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-                          void *buffer, size_t size)
+                   size_t size, int flags, int type)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext3_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-static int
-ext3_xattr_get_acl_default(struct inode *inode, const char *name,
-                           void *buffer, size_t size)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext3_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-static int
-ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
-                   size_t size)
 {
+        struct inode *inode = dentry->d_inode;
        handle_t *handle;
        struct posix_acl *acl;
        int error, retries = 0;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
        if (!test_opt(inode->i_sb, POSIX_ACL))
                return -EOPNOTSUPP;
        if (!is_owner_or_cap(inode))
@@ -468,34 +456,18 @@ release_and_out:
        return error;
 }
-static int
-ext3_xattr_set_acl_access(struct inode *inode, const char *name,
-                          const void *value, size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext3_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-static int
-ext3_xattr_set_acl_default(struct inode *inode, const char *name,
-                           const void *value, size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext3_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
 struct xattr_handler ext3_xattr_acl_access_handler = {
        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags  = ACL_TYPE_ACCESS,
        .list   = ext3_xattr_list_acl_access,
-        .get    = ext3_xattr_get_acl_access,
+        .get    = ext3_xattr_get_acl,
-        .set    = ext3_xattr_set_acl_access,
+        .set    = ext3_xattr_set_acl,
 };
 struct xattr_handler ext3_xattr_acl_default_handler = {
        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags  = ACL_TYPE_DEFAULT,
        .list   = ext3_xattr_list_acl_default,
-        .get    = ext3_xattr_get_acl_default,
+        .get    = ext3_xattr_get_acl,
-        .set    = ext3_xattr_set_acl_default,
+        .set    = ext3_xattr_set_acl,
 };
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ad14227f509e..455e6e6e5cb9 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -970,7 +970,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock,
                if (max_blocks > DIO_MAX_BLOCKS)
                        max_blocks = DIO_MAX_BLOCKS;
                handle = ext3_journal_start(inode, DIO_CREDITS +
-                                2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb));
+                                EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb));
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out;
@@ -3146,8 +3146,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
-                handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
+                handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
-                                        EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
+                                        EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
@@ -3239,7 +3239,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
 #ifdef CONFIG_QUOTA
        /* We know that structure was already allocated during vfs_dq_init so
         * we will be updating only the data blocks + inodes */
-        ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
+        ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
 #endif
        return ret;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index aad6400c9b77..7b0e44f7d66f 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1699,7 +1699,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
 retry:
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -1733,7 +1733,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
 retry:
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -1769,7 +1769,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 retry:
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-                                        2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
@@ -1920,7 +1920,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
        struct ext3_iloc iloc;
        int err = 0, rc;
-        lock_super(sb);
+        mutex_lock(&EXT3_SB(sb)->s_orphan_lock);
        if (!list_empty(&EXT3_I(inode)->i_orphan))
                goto out_unlock;
@@ -1929,9 +1929,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
        /* @@@ FIXME: Observation from aviro:
         * I think I can trigger J_ASSERT in ext3_orphan_add().  We block
-         * here (on lock_super()), so race with ext3_link() which might bump
+         * here (on s_orphan_lock), so race with ext3_link() which might bump
         * ->i_nlink. For, say it, character device. Not a regular file,
         * not a directory, not a symlink and ->i_nlink > 0.
+         *
+         * tytso, 4/25/2009: I'm not sure how that could happen;
+         * shouldn't the fs core protect us from these sort of
+         * unlink()/link() races?
         */
        J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -1968,7 +1972,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode)
        jbd_debug(4, "orphan inode %lu will point to %d\n",
                        inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-        unlock_super(sb);
+        mutex_unlock(&EXT3_SB(sb)->s_orphan_lock);
        ext3_std_error(inode->i_sb, err);
        return err;
 }
@@ -1986,11 +1990,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
        struct ext3_iloc iloc;
        int err = 0;
-        lock_super(inode->i_sb);
+        mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
-        if (list_empty(&ei->i_orphan)) {
+        if (list_empty(&ei->i_orphan))
-                unlock_super(inode->i_sb);
+                goto out;
-                return 0;
-        }
        ino_next = NEXT_ORPHAN(inode);
        prev = ei->i_orphan.prev;
@@ -2040,7 +2042,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode)
 out_err:
        ext3_std_error(inode->i_sb, err);
 out:
-        unlock_super(inode->i_sb);
+        mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock);
        return err;
 out_brelse:
@@ -2175,7 +2177,7 @@ static int ext3_symlink (struct inode * dir,
 retry:
        handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
                                        EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-                                        2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
+                                        EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 5f83b6179178..54351ac7cef9 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb,
        if (IS_ERR(handle))
                return PTR_ERR(handle);
-        lock_super(sb);
+        mutex_lock(&sbi->s_resize_lock);
        if (input->group != sbi->s_groups_count) {
                err = -EBUSY;
                goto exit_journal;
@@ -324,7 +324,7 @@ exit_bh:
        brelse(bh);
 exit_journal:
-        unlock_super(sb);
+        mutex_unlock(&sbi->s_resize_lock);
        if ((err2 = ext3_journal_stop(handle)) && !err)
                err = err2;
@@ -662,11 +662,12 @@ exit_free:
 * important part is that the new block and inode counts are in the backup
 * superblocks, and the location of the new group metadata in the GDT backups.
 *
- * We do not need lock_super() for this, because these blocks are not
+ * We do not need to take the s_resize_lock for this, because these
- * otherwise touched by the filesystem code when it is mounted.  We don't
+ * blocks are not otherwise touched by the filesystem code when it is
- * need to worry about last changing from sbi->s_groups_count, because the
+ * mounted.  We don't need to worry about last changing from
- * worst that can happen is that we do not copy the full number of backups
+ * sbi->s_groups_count, because the worst that can happen is that we
- * at this time.  The resize which changed s_groups_count will backup again.
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
 */
 static void update_backups(struct super_block *sb,
                           int blk_off, char *data, int size)
@@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
                goto exit_put;
        }
-        lock_super(sb);
+        mutex_lock(&sbi->s_resize_lock);
        if (input->group != sbi->s_groups_count) {
                ext3_warning(sb, __func__,
                             "multiple resizers run on filesystem!");
@@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
        /*
         * OK, now we've set up the new group.  Time to make it active.
         *
-         * Current kernels don't lock all allocations via lock_super(),
+         * We do not lock all allocations via s_resize_lock
         * so we have to be safe wrt. concurrent accesses the group
         * data.  So we need to be careful to set all of the relevant
         * group descriptor data etc. *before* we enable the group.
@@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
         *
         * The precise rules we use are:
         *
-         * * Writers of s_groups_count *must* hold lock_super
+         * * Writers of s_groups_count *must* hold s_resize_lock
         * AND
         * * Writers must perform a smp_wmb() after updating all dependent
         *   data and before modifying the groups count
         *
-         * * Readers must hold lock_super() over the access
+         * * Readers must hold s_resize_lock over the access
         * OR
         * * Readers must perform an smp_rmb() after reading the groups count
         *   and before reading any dependent data.
@@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
        ext3_journal_dirty_metadata(handle, sbi->s_sbh);
 exit_journal:
-        unlock_super(sb);
+        mutex_unlock(&sbi->s_resize_lock);
        if ((err2 = ext3_journal_stop(handle)) && !err)
                err = err2;
        if (!err) {
@@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
-         * taking lock_super() below. */
+         * taking the s_resize_lock below. */
        o_blocks_count = le32_to_cpu(es->s_blocks_count);
        o_groups_count = EXT3_SB(sb)->s_groups_count;
@@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
                goto exit_put;
        }
-        lock_super(sb);
+        mutex_lock(&EXT3_SB(sb)->s_resize_lock);
        if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
                ext3_warning(sb, __func__,
                             "multiple resizers run on filesystem!");
-                unlock_super(sb);
+                mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
                ext3_journal_stop(handle);
                err = -EBUSY;
                goto exit_put;
@@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
                                                 EXT3_SB(sb)->s_sbh))) {
                ext3_warning(sb, __func__,
                             "error %d on journal write access", err);
-                unlock_super(sb);
+                mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
                ext3_journal_stop(handle);
                goto exit_put;
        }
        es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
        ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-        unlock_super(sb);
+        mutex_unlock(&EXT3_SB(sb)->s_resize_lock);
        ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
                   o_blocks_count + add);
        ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 7ad1e8c30bd0..afa2b569da10 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1928,6 +1928,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        sb->dq_op = &ext3_quota_operations;
 #endif
        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+        mutex_init(&sbi->s_orphan_lock);
+        mutex_init(&sbi->s_resize_lock);
        sb->s_root = NULL;
@@ -2014,14 +2016,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
        }
        ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
-        /*
-         * akpm: core read_super() calls in here with the superblock locked.
-         * That deadlocks, because orphan cleanup needs to lock the superblock
-         * in numerous places.  Here we just pop the lock - it's relatively
-         * harmless, because we are now ready to accept write_super() requests,
-         * and aviro says that's the only reason for hanging onto the
-         * superblock lock.
-         */
        EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
        ext3_orphan_cleanup(sb, es);
        EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
@@ -2403,13 +2398,11 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
        if (journal_flush(journal) < 0)
                goto out;
-        lock_super(sb);
        if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
            sb->s_flags & MS_RDONLY) {
                EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
                ext3_commit_super(sb, es, 1);
        }
-        unlock_super(sb);
 out:
        journal_unlock_updates(journal);
@@ -2601,13 +2594,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
                            (sbi->s_mount_state & EXT3_VALID_FS))
                                es->s_state = cpu_to_le16(sbi->s_mount_state);
-                        /*
-                         * We have to unlock super so that we can wait for
-                         * transactions.
-                         */
-                        unlock_super(sb);
                        ext3_mark_recovery_complete(sb, es);
-                        lock_super(sb);
                } else {
                        __le32 ret;
                        if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 387d92d00b97..66895ccf76c7 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -99,7 +99,7 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *,
                                                 struct mb_cache_entry **);
 static void ext3_xattr_rehash(struct ext3_xattr_header *,
                              struct ext3_xattr_entry *);
-static int ext3_xattr_list(struct inode *inode, char *buffer,
+static int ext3_xattr_list(struct dentry *dentry, char *buffer,
                           size_t buffer_size);
 static struct mb_cache *ext3_xattr_cache;
@@ -147,7 +147,7 @@ ext3_xattr_handler(int name_index)
 ssize_t
 ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-        return ext3_xattr_list(dentry->d_inode, buffer, size);
+        return ext3_xattr_list(dentry, buffer, size);
 }
 static int
@@ -332,7 +332,7 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name,
 }
 static int
-ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
+ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
                        char *buffer, size_t buffer_size)
 {
        size_t rest = buffer_size;
@@ -342,9 +342,10 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
                        ext3_xattr_handler(entry->e_name_index);
                if (handler) {
-                        size_t size = handler->list(inode, buffer, rest,
+                        size_t size = handler->list(dentry, buffer, rest,
                                                    entry->e_name,
-                                                    entry->e_name_len);
+                                                    entry->e_name_len,
+                                                    handler->flags);
                        if (buffer) {
                                if (size > rest)
                                        return -ERANGE;
@@ -357,8 +358,9 @@ ext3_xattr_list_entries(struct inode *inode, struct ext3_xattr_entry *entry,
 }
 static int
-ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+        struct inode *inode = dentry->d_inode;
        struct buffer_head *bh = NULL;
        int error;
@@ -383,7 +385,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
                goto cleanup;
        }
        ext3_xattr_cache_insert(bh);
-        error = ext3_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+        error = ext3_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 cleanup:
        brelse(bh);
@@ -392,8 +394,9 @@ cleanup:
 }
 static int
-ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+        struct inode *inode = dentry->d_inode;
        struct ext3_xattr_ibody_header *header;
        struct ext3_inode *raw_inode;
        struct ext3_iloc iloc;
@@ -411,7 +414,7 @@ ext3_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
        error = ext3_xattr_check_names(IFIRST(header), end);
        if (error)
                goto cleanup;
-        error = ext3_xattr_list_entries(inode, IFIRST(header),
+        error = ext3_xattr_list_entries(dentry, IFIRST(header),
                                        buffer, buffer_size);
 cleanup:
@@ -430,12 +433,12 @@ cleanup:
 * used / required on success.
 */
 static int
-ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
        int i_error, b_error;
-        down_read(&EXT3_I(inode)->xattr_sem);
+        down_read(&EXT3_I(dentry->d_inode)->xattr_sem);
-        i_error = ext3_xattr_ibody_list(inode, buffer, buffer_size);
+        i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size);
        if (i_error < 0) {
                b_error = 0;
        } else {
@@ -443,11 +446,11 @@ ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
                        buffer += i_error;
                        buffer_size -= i_error;
                }
-                b_error = ext3_xattr_block_list(inode, buffer, buffer_size);
+                b_error = ext3_xattr_block_list(dentry, buffer, buffer_size);
                if (b_error < 0)
                        i_error = 0;
        }
-        up_read(&EXT3_I(inode)->xattr_sem);
+        up_read(&EXT3_I(dentry->d_inode)->xattr_sem);
        return i_error + b_error;
 }
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 37b81097bdf2..474348788dd9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -12,8 +12,8 @@
 #include "xattr.h"
 static size_t
-ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
+ext3_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
-                         const char *name, size_t name_len)
+                         const char *name, size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
@@ -28,23 +28,23 @@ ext3_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext3_xattr_security_get(struct inode *inode, const char *name,
+ext3_xattr_security_get(struct dentry *dentry, const char *name,
-                       void *buffer, size_t size)
+                void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext3_xattr_get(inode, EXT3_XATTR_INDEX_SECURITY, name,
+        return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
-                              buffer, size);
+                              name, buffer, size);
 }
 static int
-ext3_xattr_security_set(struct inode *inode, const char *name,
+ext3_xattr_security_set(struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SECURITY, name,
+        return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY,
-                              value, size, flags);
+                              name, value, size, flags);
 }
 int
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index c7c41a410c4b..e5562845ed96 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -14,8 +14,8 @@
 #include "xattr.h"
 static size_t
-ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
+ext3_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
-                        const char *name, size_t name_len)
+                const char *name, size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
@@ -32,22 +32,22 @@ ext3_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext3_xattr_trusted_get(struct inode *inode, const char *name,
+ext3_xattr_trusted_get(struct dentry *dentry, const char *name,
-                       void *buffer, size_t size)
+                       void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, name,
+        return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED,
-                              buffer, size);
+                              name, buffer, size);
 }
 static int
-ext3_xattr_trusted_set(struct inode *inode, const char *name,
+ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext3_xattr_set(inode, EXT3_XATTR_INDEX_TRUSTED, name,
+        return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name,
                              value, size, flags);
 }
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 430fe63b31b3..3bcfe9ee0a68 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -13,13 +13,13 @@
 #include "xattr.h"
 static size_t
-ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
+ext3_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
-                     const char *name, size_t name_len)
+                const char *name, size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return 0;
        if (list && total_len <= list_size) {
@@ -31,26 +31,27 @@ ext3_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext3_xattr_user_get(struct inode *inode, const char *name,
+ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer,
-                    void *buffer, size_t size)
+                size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return -EOPNOTSUPP;
-        return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name, buffer, size);
+        return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER,
+                              name, buffer, size);
 }
 static int
-ext3_xattr_user_set(struct inode *inode, const char *name,
+ext3_xattr_user_set(struct dentry *dentry, const char *name,
-                    const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return -EOPNOTSUPP;
-        return ext3_xattr_set(inode, EXT3_XATTR_INDEX_USER, name,
+        return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER,
-                              value, size, flags);
+                              name, value, size, flags);
 }
 struct xattr_handler ext3_xattr_user_handler = {
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index e5f6774846e4..9ed1bb1f319f 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -2,7 +2,6 @@ config EXT4_FS
        tristate "The Extended 4 (ext4) filesystem"
        select JBD2
        select CRC16
-        select FS_JOURNAL_INFO
        help
          This is the next generation of the ext3 filesystem.
@@ -29,6 +28,7 @@ config EXT4_FS
 config EXT4_USE_FOR_EXT23
        bool "Use ext4 for ext2/ext3 file systems"
+        depends on EXT4_FS
        depends on EXT3_FS=n || EXT2_FS=n
        default y
        help
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 0df88b2a69b0..8a2a29d35a6f 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -364,12 +364,12 @@ out:
 * Extended attribute handlers
 */
 static size_t
-ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
+ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
-                           const char *name, size_t name_len)
+                           const char *name, size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return 0;
        if (list && size <= list_len)
                memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -377,12 +377,12 @@ ext4_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
 }
 static size_t
-ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
+ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
-                            const char *name, size_t name_len)
+                            const char *name, size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return 0;
        if (list && size <= list_len)
                memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -390,15 +390,18 @@ ext4_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
 }
 static int
-ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
+                   size_t size, int type)
 {
        struct posix_acl *acl;
        int error;
-        if (!test_opt(inode->i_sb, POSIX_ACL))
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        if (!test_opt(dentry->d_sb, POSIX_ACL))
                return -EOPNOTSUPP;
-        acl = ext4_get_acl(inode, type);
+        acl = ext4_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl == NULL)
@@ -410,31 +413,16 @@ ext4_xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 }
 static int
-ext4_xattr_get_acl_access(struct inode *inode, const char *name,
+ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
-                          void *buffer, size_t size)
+                   size_t size, int flags, int type)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext4_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-static int
-ext4_xattr_get_acl_default(struct inode *inode, const char *name,
-                           void *buffer, size_t size)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext4_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-static int
-ext4_xattr_set_acl(struct inode *inode, int type, const void *value,
-                   size_t size)
 {
+        struct inode *inode = dentry->d_inode;
        handle_t *handle;
        struct posix_acl *acl;
        int error, retries = 0;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
        if (!test_opt(inode->i_sb, POSIX_ACL))
                return -EOPNOTSUPP;
        if (!is_owner_or_cap(inode))
@@ -466,34 +454,18 @@ release_and_out:
        return error;
 }
-static int
-ext4_xattr_set_acl_access(struct inode *inode, const char *name,
-                          const void *value, size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext4_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-static int
-ext4_xattr_set_acl_default(struct inode *inode, const char *name,
-                           const void *value, size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ext4_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
 struct xattr_handler ext4_xattr_acl_access_handler = {
        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags  = ACL_TYPE_ACCESS,
        .list   = ext4_xattr_list_acl_access,
-        .get    = ext4_xattr_get_acl_access,
+        .get    = ext4_xattr_get_acl,
-        .set    = ext4_xattr_set_acl_access,
+        .set    = ext4_xattr_set_acl,
 };
 struct xattr_handler ext4_xattr_acl_default_handler = {
        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags  = ACL_TYPE_DEFAULT,
        .list   = ext4_xattr_list_acl_default,
-        .get    = ext4_xattr_get_acl_default,
+        .get    = ext4_xattr_get_acl,
-        .set    = ext4_xattr_set_acl_default,
+        .set    = ext4_xattr_set_acl,
 };
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 4df8621ec31c..a60ab9aad57d 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -16,7 +16,6 @@
 #include <linux/module.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
-#include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
 #include "ext4.h"
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ab31e65d46d0..af7b62699ea9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -699,11 +699,17 @@ struct ext4_inode_info {
        unsigned int i_reserved_meta_blocks;
        unsigned int i_allocated_meta_blocks;
        unsigned short i_delalloc_reserved_flag;
+        sector_t i_da_metadata_calc_last_lblock;
+        int i_da_metadata_calc_len;
        /* on-disk additional length */
        __u16 i_extra_isize;
        spinlock_t i_block_reservation_lock;
+#ifdef CONFIG_QUOTA
+        /* quota space reservation, managed internally by quota code */
+        qsize_t i_reserved_quota;
+#endif
        /* completed async DIOs that might need unwritten extents handling */
        struct list_head i_aio_dio_complete_list;
@@ -1435,7 +1441,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
-extern qsize_t ext4_get_reserved_space(struct inode *inode);
+extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern int flush_aio_dio_completed_IO(struct inode *inode);
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 2ca686454e87..bdb6ce7e2eb4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
        ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
 }
-extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+                                         sector_t lblocks);
 extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3a7928f825e4..7d7b74e94687 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 * to allocate @blocks
 * Worse case is one block per extent
 */
-int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-        int lcap, icap, rcap, leafs, idxs, num;
+        struct ext4_inode_info *ei = EXT4_I(inode);
-        int newextents = blocks;
+        int idxs, num = 0;
-        rcap = ext4_ext_space_root_idx(inode, 0);
-        lcap = ext4_ext_space_block(inode, 0);
-        icap = ext4_ext_space_block_idx(inode, 0);
-        /* number of new leaf blocks needed */
+        idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header))
-        num = leafs = (newextents + lcap - 1) / lcap;
+                / sizeof(struct ext4_extent_idx));
        /*
-         * Worse case, we need separate index block(s)
+         * If the new delayed allocation block is contiguous with the
-         * to link all new leaf blocks
+         * previous da block, it can share index blocks with the
+         * previous block, so we only need to allocate a new index
+         * block every idxs leaf blocks.  At idxs**2 blocks, we need
+         * an additional index block, and at idxs**3 blocks, yet
+         * another index block.
         */
-        idxs = (leafs + icap - 1) / icap;
+        if (ei->i_da_metadata_calc_len &&
-        do {
+            ei->i_da_metadata_calc_last_lblock+1 == lblock) {
-                num += idxs;
+                if ((ei->i_da_metadata_calc_len % idxs) == 0)
-                idxs = (idxs + icap - 1) / icap;
+                        num++;
-        } while (idxs > rcap);
+                if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0)
+                        num++;
+                if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) {
+                        num++;
+                        ei->i_da_metadata_calc_len = 0;
+                } else
+                        ei->i_da_metadata_calc_len++;
+                ei->i_da_metadata_calc_last_lblock++;
+                return num;
+        }
-        return num;
+        /*
+         * In the worst case we need a new set of index blocks at
+         * every level of the inode's extent tree.
+         */
+        ei->i_da_metadata_calc_len = 1;
+        ei->i_da_metadata_calc_last_lblock = lblock;
+        return ext_depth(inode) + 1;
 }
 static int
@@ -3023,6 +3038,14 @@ out:
        return err;
 }
+static void unmap_underlying_metadata_blocks(struct block_device *bdev,
+                        sector_t block, int count)
+{
+        int i;
+        for (i = 0; i < count; i++)
+                unmap_underlying_metadata(bdev, block + i);
+}
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock, unsigned int max_blocks,
@@ -3098,6 +3121,18 @@ out:
        } else
                allocated = ret;
        set_buffer_new(bh_result);
+        /*
+         * If we allocated more blocks than requested,
+         * we need to make sure we unmap the extra blocks
+         * allocated. The blocks actually requested will get
+         * unmapped later when we find the buffer_head marked
+         * new.
+         */
+        if (allocated > max_blocks) {
+                unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
+                                        newblock + max_blocks,
+                                        allocated - max_blocks);
+        }
 map_out:
        set_buffer_mapped(bh_result);
 out1:
@@ -3190,7 +3225,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * this situation is possible, though, _during_ tree modification;
         * this is why assert can't be put in ext4_ext_find_extent()
         */
-        BUG_ON(path[depth].p_ext == NULL && depth != 0);
+        if (path[depth].p_ext == NULL && depth != 0) {
+                ext4_error(inode->i_sb, __func__, "bad extent address "
+                           "inode: %lu, iblock: %d, depth: %d",
+                           inode->i_ino, iblock, depth);
+                err = -EIO;
+                goto out2;
+        }
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0b22497d92e1..98bd140aad01 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
                return ext4_force_commit(inode->i_sb);
        commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
-        if (jbd2_log_start_commit(journal, commit_tid))
+        if (jbd2_log_start_commit(journal, commit_tid)) {
+                /*
+                 * When the journal is on a different device than the
+                 * fs data disk, we need to issue the barrier in
+                 * writeback mode.  (In ordered mode, the jbd2 layer
+                 * will take care of issuing the barrier.  In
+                 * data=journal, all of the data blocks are written to
+                 * the journal device.)
+                 */
+                if (ext4_should_writeback_data(inode) &&
+                    (journal->j_fs_dev != journal->j_dev) &&
+                    (journal->j_flags & JBD2_BARRIER))
+                        blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
                jbd2_log_wait_commit(journal, commit_tid);
-        else if (journal->j_flags & JBD2_BARRIER)
+        } else if (journal->j_flags & JBD2_BARRIER)
                blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
        return ret;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5352db1a3086..c818972c8302 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1003,83 +1003,94 @@ out:
        return err;
 }
-qsize_t ext4_get_reserved_space(struct inode *inode)
+#ifdef CONFIG_QUOTA
+qsize_t *ext4_get_reserved_space(struct inode *inode)
 {
-        unsigned long long total;
+        return &EXT4_I(inode)->i_reserved_quota;
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks +
-                EXT4_I(inode)->i_reserved_meta_blocks;
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        return (total << inode->i_blkbits);
 }
+#endif
 /*
 * Calculate the number of metadata blocks need to reserve
- * to allocate @blocks for non extent file based file
+ * to allocate a new block at @lblock for a non-extent-based file
 */
-static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_indirect_calc_metadata_amount(struct inode *inode,
+                                              sector_t lblock)
 {
-        int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+        struct ext4_inode_info *ei = EXT4_I(inode);
-        int ind_blks, dind_blks, tind_blks;
+        int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1;
+        int blk_bits;
-        /* number of new indirect blocks needed */
-        ind_blks = (blocks + icap - 1) / icap;
-        dind_blks = (ind_blks + icap - 1) / icap;
+        if (lblock < EXT4_NDIR_BLOCKS)
+                return 0;
-        tind_blks = 1;
+        lblock -= EXT4_NDIR_BLOCKS;
-        return ind_blks + dind_blks + tind_blks;
+        if (ei->i_da_metadata_calc_len &&
+            (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) {
+                ei->i_da_metadata_calc_len++;
+                return 0;
+        }
+        ei->i_da_metadata_calc_last_lblock = lblock & dind_mask;
+        ei->i_da_metadata_calc_len = 1;
+        blk_bits = roundup_pow_of_two(lblock + 1);
+        return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
 }
 /*
 * Calculate the number of metadata blocks need to reserve
- * to allocate given number of blocks
+ * to allocate a block located at @lblock
 */
-static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-        if (!blocks)
-                return 0;
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
-                return ext4_ext_calc_metadata_amount(inode, blocks);
+                return ext4_ext_calc_metadata_amount(inode, lblock);
-        return ext4_indirect_calc_metadata_amount(inode, blocks);
+        return ext4_indirect_calc_metadata_amount(inode, lblock);
 }
+/*
+ * Called with i_data_sem down, which is important since we can call
+ * ext4_discard_preallocations() from here.
+ */
 static void ext4_da_update_reserve_space(struct inode *inode, int used)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        int mdb_free = 0;
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        /* recalculate the number of metablocks still need to be reserved */
+        spin_lock(&ei->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks - used;
+        if (unlikely(used > ei->i_reserved_data_blocks)) {
-        mdb = ext4_calc_metadata_amount(inode, total);
+                ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
+                         "with only %d reserved data blocks\n",
-        /* figure out how many metablocks to release */
+                         __func__, inode->i_ino, used,
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+                         ei->i_reserved_data_blocks);
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+                WARN_ON(1);
+                used = ei->i_reserved_data_blocks;
-        if (mdb_free) {
+        }
-                /* Account for allocated meta_blocks */
-                mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+        /* Update per-inode reservations */
+        ei->i_reserved_data_blocks -= used;
-                /* update fs dirty blocks counter */
+        used += ei->i_allocated_meta_blocks;
+        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
+        ei->i_allocated_meta_blocks = 0;
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
+        if (ei->i_reserved_data_blocks == 0) {
+                /*
+                 * We can release all of the reserved metadata blocks
+                 * only when we have written all of the delayed
+                 * allocation blocks.
+                 */
+                mdb_free = ei->i_reserved_meta_blocks;
+                ei->i_reserved_meta_blocks = 0;
+                ei->i_da_metadata_calc_len = 0;
                percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
-                EXT4_I(inode)->i_allocated_meta_blocks = 0;
-                EXT4_I(inode)->i_reserved_meta_blocks = mdb;
        }
-        /* update per-inode reservations */
-        BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
-        EXT4_I(inode)->i_reserved_data_blocks -= used;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        /*
+        /* Update quota subsystem */
-         * free those over-booking quota for metadata blocks
+        vfs_dq_claim_block(inode, used);
-         */
        if (mdb_free)
                vfs_dq_release_reservation_block(inode, mdb_free);
@@ -1088,7 +1099,8 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
         * there aren't any writers on the inode, we can discard the
         * inode's preallocations.
         */
-        if (!total && (atomic_read(&inode->i_writecount) == 0))
+        if ((ei->i_reserved_data_blocks == 0) &&
+            (atomic_read(&inode->i_writecount) == 0))
                ext4_discard_preallocations(inode);
 }
@@ -1797,11 +1809,15 @@ static int ext4_journalled_write_end(struct file *file,
        return ret ? ret : copied;
 }
-static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+/*
+ * Reserve a single block located at lblock
+ */
+static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
 {
        int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        unsigned long md_needed, mdblocks, total = 0;
+        struct ext4_inode_info *ei = EXT4_I(inode);
+        unsigned long md_needed, md_reserved;
        /*
         * recalculate the amount of metadata blocks to reserve
@@ -1809,86 +1825,90 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
         * worse case is one extent per block
         */
 repeat:
-        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        spin_lock(&ei->i_block_reservation_lock);
-        total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+        md_reserved = ei->i_reserved_meta_blocks;
-        mdblocks = ext4_calc_metadata_amount(inode, total);
+        md_needed = ext4_calc_metadata_amount(inode, lblock);
-        BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+        spin_unlock(&ei->i_block_reservation_lock);
-        md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
-        total = md_needed + nrblocks;
        /*
         * Make quota reservation here to prevent quota overflow
         * later. Real quota accounting is done at pages writeout
         * time.
         */
-        if (vfs_dq_reserve_block(inode, total)) {
+        if (vfs_dq_reserve_block(inode, md_needed + 1)) {
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                /*
+                 * We tend to badly over-estimate the number of
+                 * metadata blocks which are needed, so if we have
+                 * reserved any metadata blocks, try to force out the
+                 * inode and see if we have any better luck.
+                 */
+                if (md_reserved && retries++ <= 3)
+                        goto retry;
                return -EDQUOT;
        }
-        if (ext4_claim_free_blocks(sbi, total)) {
+        if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                vfs_dq_release_reservation_block(inode, md_needed + 1);
-                vfs_dq_release_reservation_block(inode, total);
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
+                retry:
+                        if (md_reserved)
+                                write_inode_now(inode, (retries == 3));
                        yield();
                        goto repeat;
                }
                return -ENOSPC;
        }
-        EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+        spin_lock(&ei->i_block_reservation_lock);
-        EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+        ei->i_reserved_data_blocks++;
+        ei->i_reserved_meta_blocks += md_needed;
+        spin_unlock(&ei->i_block_reservation_lock);
-        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
        return 0;       /* success */
 }
 static void ext4_da_release_space(struct inode *inode, int to_free)
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-        int total, mdb, mdb_free, release;
+        struct ext4_inode_info *ei = EXT4_I(inode);
        if (!to_free)
                return;         /* Nothing to release, exit */
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
-        if (!EXT4_I(inode)->i_reserved_data_blocks) {
+        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
-                 * if there is no reserved blocks, but we try to free some
+                 * if there aren't enough reserved blocks, then the
-                 * then the counter is messed up somewhere.
+                 * counter is messed up somewhere.  Since this
-                 * but since this function is called from invalidate
+                 * function is called from invalidate page, it's
-                 * page, it's harmless to return without any action
+                 * harmless to warn, clamp to_free and carry on.
                 */
-                printk(KERN_INFO "ext4 delalloc try to release %d reserved "
+                ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
-                            "blocks for inode %lu, but there is no reserved "
+                         "ino %lu, to_free %d with only %d reserved "
-                            "data blocks\n", to_free, inode->i_ino);
+                         "data blocks\n", inode->i_ino, to_free,
-                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                         ei->i_reserved_data_blocks);
-                return;
+                WARN_ON(1);
+                to_free = ei->i_reserved_data_blocks;
        }
+        ei->i_reserved_data_blocks -= to_free;
-        /* recalculate the number of metablocks still need to be reserved */
+        if (ei->i_reserved_data_blocks == 0) {
-        total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
+                /*
-        mdb = ext4_calc_metadata_amount(inode, total);
+                 * We can release all of the reserved metadata blocks
+                 * only when we have written all of the delayed
-        /* figure out how many metablocks to release */
+                 * allocation blocks.
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+                 */
-        mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+                to_free += ei->i_reserved_meta_blocks;
+                ei->i_reserved_meta_blocks = 0;
-        release = to_free + mdb_free;
+                ei->i_da_metadata_calc_len = 0;
+        }
-        /* update fs dirty blocks counter for truncate case */
-        percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
-        /* update per-inode reservations */
+        /* update fs dirty blocks counter */
-        BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
-        EXT4_I(inode)->i_reserved_data_blocks -= to_free;
-        BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-        EXT4_I(inode)->i_reserved_meta_blocks = mdb;
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        vfs_dq_release_reservation_block(inode, release);
+        vfs_dq_release_reservation_block(inode, to_free);
 }
 static void ext4_da_page_release_reservation(struct page *page,
@@ -2494,7 +2514,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
                 */
-                ret = ext4_da_reserve_space(inode, 1);
+                ret = ext4_da_reserve_space(inode, iblock);
                if (ret)
                        /* not enough space to reserve */
                        return ret;
@@ -2968,8 +2988,7 @@ retry:
 out_writepages:
        if (!no_nrwrite_index_update)
                wbc->no_nrwrite_index_update = 0;
-        if (wbc->nr_to_write > nr_to_writebump)
+        wbc->nr_to_write -= nr_to_writebump;
-                wbc->nr_to_write -= nr_to_writebump;
        wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
        return ret;
@@ -2994,11 +3013,18 @@ static int ext4_nonda_switch(struct super_block *sb)
        if (2 * free_blocks < 3 * dirty_blocks ||
                free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) {
                /*
-                 * free block count is less that 150% of dirty blocks
+                 * free block count is less than 150% of dirty blocks
-                 * or free blocks is less that watermark
+                 * or free blocks is less than watermark
                 */
                return 1;
        }
+        /*
+         * Even if we don't switch but are nearing capacity,
+         * start pushing delalloc when 1/2 of free blocks are dirty.
+         */
+        if (free_blocks < 2 * dirty_blocks)
+                writeback_inodes_sb_if_idle(sb);
        return 0;
 }
@@ -4794,6 +4820,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(raw_inode);
        ei->i_disksize = inode->i_size;
+#ifdef CONFIG_QUOTA
+        ei->i_reserved_quota = 0;
+#endif
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
        ei->i_last_alloc_group = ~0;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b1fd3daadc9c..d34afad3e137 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2755,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
                /* release all the reserved blocks if non delalloc */
                percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
-        else {
-                percpu_counter_sub(&sbi->s_dirtyblocks_counter,
-                                                ac->ac_b_ex.fe_len);
-                /* convert reserved quota blocks to real quota blocks */
-                vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
-        }
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 0ca811061bc7..436521cae456 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -17,7 +17,6 @@
 #include <linux/proc_fs.h>
 #include <linux/pagemap.h>
 #include <linux/seq_file.h>
-#include <linux/version.h>
 #include <linux/blkdev.h>
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 827bde1f2594..735c20d5fd56 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -702,8 +702,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_reserved_data_blocks = 0;
        ei->i_reserved_meta_blocks = 0;
        ei->i_allocated_meta_blocks = 0;
+        ei->i_da_metadata_calc_len = 0;
        ei->i_delalloc_reserved_flag = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
+#ifdef CONFIG_QUOTA
+        ei->i_reserved_quota = 0;
+#endif
        INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
        ei->cur_aio_dio = NULL;
        ei->i_sync_tid = 0;
@@ -1014,7 +1018,9 @@ static const struct dquot_operations ext4_quota_operations = {
        .reserve_space  = dquot_reserve_space,
        .claim_space    = dquot_claim_space,
        .release_rsv    = dquot_release_reserved_space,
+#ifdef CONFIG_QUOTA
        .get_reserved_space = ext4_get_reserved_space,
+#endif
        .alloc_inode    = dquot_alloc_inode,
        .free_space     = dquot_free_space,
        .free_inode     = dquot_free_inode,
@@ -2169,9 +2175,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
        struct super_block *sb = sbi->s_buddy_cache->i_sb;
        return snprintf(buf, PAGE_SIZE, "%llu\n",
-                        sbi->s_kbytes_written + 
+                        (unsigned long long)(sbi->s_kbytes_written +
                        ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
-                          EXT4_SB(sb)->s_sectors_written_start) >> 1));
+                          EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
@@ -4000,6 +4006,7 @@ static inline void unregister_as_ext2(void)
 {
        unregister_filesystem(&ext2_fs_type);
 }
+MODULE_ALIAS("ext2");
 #else
 static inline void register_as_ext2(void) { }
 static inline void unregister_as_ext2(void) { }
@@ -4026,6 +4033,7 @@ static inline void unregister_as_ext3(void)
 {
        unregister_filesystem(&ext3_fs_type);
 }
+MODULE_ALIAS("ext3");
 #else
 static inline void register_as_ext3(void) { }
 static inline void unregister_as_ext3(void) { }
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 910bf9a59cb3..f3a2f7ed45aa 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -92,7 +92,7 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
                                                 struct mb_cache_entry **);
 static void ext4_xattr_rehash(struct ext4_xattr_header *,
                              struct ext4_xattr_entry *);
-static int ext4_xattr_list(struct inode *inode, char *buffer,
+static int ext4_xattr_list(struct dentry *dentry, char *buffer,
                           size_t buffer_size);
 static struct mb_cache *ext4_xattr_cache;
@@ -140,7 +140,7 @@ ext4_xattr_handler(int name_index)
 ssize_t
 ext4_listxattr(struct dentry *dentry, char *buffer, size_t size)
 {
-        return ext4_xattr_list(dentry->d_inode, buffer, size);
+        return ext4_xattr_list(dentry, buffer, size);
 }
 static int
@@ -325,7 +325,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
 }
 static int
-ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
+ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
                        char *buffer, size_t buffer_size)
 {
        size_t rest = buffer_size;
@@ -335,9 +335,10 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
                        ext4_xattr_handler(entry->e_name_index);
                if (handler) {
-                        size_t size = handler->list(inode, buffer, rest,
+                        size_t size = handler->list(dentry, buffer, rest,
                                                    entry->e_name,
-                                                    entry->e_name_len);
+                                                    entry->e_name_len,
+                                                    handler->flags);
                        if (buffer) {
                                if (size > rest)
                                        return -ERANGE;
@@ -350,8 +351,9 @@ ext4_xattr_list_entries(struct inode *inode, struct ext4_xattr_entry *entry,
 }
 static int
-ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+        struct inode *inode = dentry->d_inode;
        struct buffer_head *bh = NULL;
        int error;
@@ -376,7 +378,7 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size)
                goto cleanup;
        }
        ext4_xattr_cache_insert(bh);
-        error = ext4_xattr_list_entries(inode, BFIRST(bh), buffer, buffer_size);
+        error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 cleanup:
        brelse(bh);
@@ -385,8 +387,9 @@ cleanup:
 }
 static int
-ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
+        struct inode *inode = dentry->d_inode;
        struct ext4_xattr_ibody_header *header;
        struct ext4_inode *raw_inode;
        struct ext4_iloc iloc;
@@ -404,7 +407,7 @@ ext4_xattr_ibody_list(struct inode *inode, char *buffer, size_t buffer_size)
        error = ext4_xattr_check_names(IFIRST(header), end);
        if (error)
                goto cleanup;
-        error = ext4_xattr_list_entries(inode, IFIRST(header),
+        error = ext4_xattr_list_entries(dentry, IFIRST(header),
                                        buffer, buffer_size);
 cleanup:
@@ -423,12 +426,12 @@ cleanup:
 * used / required on success.
 */
 static int
-ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
+ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
        int i_error, b_error;
-        down_read(&EXT4_I(inode)->xattr_sem);
+        down_read(&EXT4_I(dentry->d_inode)->xattr_sem);
-        i_error = ext4_xattr_ibody_list(inode, buffer, buffer_size);
+        i_error = ext4_xattr_ibody_list(dentry, buffer, buffer_size);
        if (i_error < 0) {
                b_error = 0;
        } else {
@@ -436,11 +439,11 @@ ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
                        buffer += i_error;
                        buffer_size -= i_error;
                }
-                b_error = ext4_xattr_block_list(inode, buffer, buffer_size);
+                b_error = ext4_xattr_block_list(dentry, buffer, buffer_size);
                if (b_error < 0)
                        i_error = 0;
        }
-        up_read(&EXT4_I(inode)->xattr_sem);
+        up_read(&EXT4_I(dentry->d_inode)->xattr_sem);
        return i_error + b_error;
 }
@@ -1329,6 +1332,8 @@ retry:
                        goto cleanup;
                kfree(b_entry_name);
                kfree(buffer);
+                b_entry_name = NULL;
+                buffer = NULL;
                brelse(is->iloc.bh);
                kfree(is);
                kfree(bs);
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index ca5f89fc6cae..983c253999a7 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -12,8 +12,8 @@
 #include "xattr.h"
 static size_t
-ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
+ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
-                         const char *name, size_t name_len)
+                const char *name, size_t name_len, int type)
 {
        const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
        const size_t total_len = prefix_len + name_len + 1;
@@ -28,23 +28,23 @@ ext4_xattr_security_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext4_xattr_security_get(struct inode *inode, const char *name,
+ext4_xattr_security_get(struct dentry *dentry, const char *name,
-                       void *buffer, size_t size)
+                       void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, name,
+        return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
-                              buffer, size);
+                              name, buffer, size);
 }
 static int
-ext4_xattr_security_set(struct inode *inode, const char *name,
+ext4_xattr_security_set(struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name,
+        return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY,
-                              value, size, flags);
+                              name, value, size, flags);
 }
 int
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index ac1a52cf2a37..15b50edc6587 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -14,8 +14,8 @@
 #include "xattr.h"
 static size_t
-ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
+ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
-                        const char *name, size_t name_len)
+                const char *name, size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
@@ -32,23 +32,23 @@ ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext4_xattr_trusted_get(struct inode *inode, const char *name,
+ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
-                       void *buffer, size_t size)
+                size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED, name,
+        return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
-                              buffer, size);
+                              name, buffer, size);
 }
 static int
-ext4_xattr_trusted_set(struct inode *inode, const char *name,
+ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name,
+        return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED,
-                              value, size, flags);
+                              name, value, size, flags);
 }
 struct xattr_handler ext4_xattr_trusted_handler = {
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d91aa61b42aa..c4ce05746ce1 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -13,13 +13,13 @@
 #include "xattr.h"
 static size_t
-ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
+ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
-                     const char *name, size_t name_len)
+                     const char *name, size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return 0;
        if (list && total_len <= list_size) {
@@ -31,26 +31,27 @@ ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
 }
 static int
-ext4_xattr_user_get(struct inode *inode, const char *name,
+ext4_xattr_user_get(struct dentry *dentry, const char *name,
-                    void *buffer, size_t size)
+                    void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return -EOPNOTSUPP;
-        return ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER, name, buffer, size);
+        return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER,
+                              name, buffer, size);
 }
 static int
-ext4_xattr_user_set(struct inode *inode, const char *name,
+ext4_xattr_user_set(struct dentry *dentry, const char *name,
-                    const void *value, size_t size, int flags)
+                    const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        if (!test_opt(inode->i_sb, XATTR_USER))
+        if (!test_opt(dentry->d_sb, XATTR_USER))
                return -EOPNOTSUPP;
-        return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name,
+        return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER,
-                              value, size, flags);
+                              name, value, size, flags);
 }
 struct xattr_handler ext4_xattr_user_handler = {
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 7db0979c6b72..e6efdfa0f6db 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -44,7 +44,8 @@ struct fat_mount_options {
                 nocase:1,        /* Does this need case conversion? 0=need case conversion*/
                 usefree:1,       /* Use free_clusters for FAT32 */
                 tz_utc:1,        /* Filesystem timestamps are in UTC */
-                 rodir:1;         /* allow ATTR_RO for directory */
+                 rodir:1,         /* allow ATTR_RO for directory */
+                 discard:1;       /* Issue discard requests on deletions */
 };
 #define FAT_HASH_BITS   8
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index a81037721a6f..81184d3b75a3 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -566,16 +566,21 @@ int fat_free_clusters(struct inode *inode, int cluster)
                        goto error;
                }
-                /* 
+                if (sbi->options.discard) {
-                 * Issue discard for the sectors we no longer care about,
+                        /*
-                 * batching contiguous clusters into one request
+                         * Issue discard for the sectors we no longer
-                 */
+                         * care about, batching contiguous clusters
-                if (cluster != fatent.entry + 1) {
+                         * into one request
-                        int nr_clus = fatent.entry - first_cl + 1;
+                         */
+                        if (cluster != fatent.entry + 1) {
-                        sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl),
+                                int nr_clus = fatent.entry - first_cl + 1;
-                                         nr_clus * sbi->sec_per_clus);
-                        first_cl = cluster;
+                                sb_issue_discard(sb,
+                                        fat_clus_to_blknr(sbi, first_cl),
+                                        nr_clus * sbi->sec_per_clus);
+                                first_cl = cluster;
+                        }
                }
                ops->ent_put(&fatent, FAT_ENT_FREE);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 76b7961ab663..14da530b05ca 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -858,6 +858,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt)
                seq_puts(m, ",errors=panic");
        else
                seq_puts(m, ",errors=remount-ro");
+        if (opts->discard)
+                seq_puts(m, ",discard");
        return 0;
 }
@@ -871,7 +873,7 @@ enum {
        Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes,
        Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
        Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
-        Opt_err_panic, Opt_err_ro, Opt_err,
+        Opt_err_panic, Opt_err_ro, Opt_discard, Opt_err,
 };
 static const match_table_t fat_tokens = {
@@ -899,6 +901,7 @@ static const match_table_t fat_tokens = {
        {Opt_err_cont, "errors=continue"},
        {Opt_err_panic, "errors=panic"},
        {Opt_err_ro, "errors=remount-ro"},
+        {Opt_discard, "discard"},
        {Opt_obsolate, "conv=binary"},
        {Opt_obsolate, "conv=text"},
        {Opt_obsolate, "conv=auto"},
@@ -1136,6 +1139,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug,
                case Opt_rodir:
                        opts->rodir = 1;
                        break;
+                case Opt_discard:
+                        opts->discard = 1;
+                        break;
                /* obsolete mount options */
                case Opt_obsolate:
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 0f55f5cb732f..d3da05f26465 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/time.h>
 #include "fat.h"
 /*
@@ -157,10 +158,6 @@ extern struct timezone sys_tz;
 #define SECS_PER_MIN    60
 #define SECS_PER_HOUR   (60 * 60)
 #define SECS_PER_DAY    (SECS_PER_HOUR * 24)
-#define UNIX_SECS_1980  315532800L
-#if BITS_PER_LONG == 64
-#define UNIX_SECS_2108  4354819200L
-#endif
 /* days between 1.1.70 and 1.1.80 (2 leap days) */
 #define DAYS_DELTA      (365 * 10 + 2)
 /* 120 (2100 - 1980) isn't leap year */
@@ -213,58 +210,35 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
 void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts,
                       __le16 *time, __le16 *date, u8 *time_cs)
 {
-        time_t second = ts->tv_sec;
+        struct tm tm;
-        time_t day, leap_day, month, year;
+        time_to_tm(ts->tv_sec, sbi->options.tz_utc ? 0 :
+                   -sys_tz.tz_minuteswest * 60, &tm);
-        if (!sbi->options.tz_utc)
+        /* FAT can only represent years from 1980 to 2107 */
-                second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
+        if (tm.tm_year < 1980 - 1900) {
-        /* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
-        if (second < UNIX_SECS_1980) {
                *time = 0;
                *date = cpu_to_le16((0 << 9) | (1 << 5) | 1);
                if (time_cs)
                        *time_cs = 0;
                return;
        }
-#if BITS_PER_LONG == 64
+        if (tm.tm_year > 2107 - 1900) {
-        if (second >= UNIX_SECS_2108) {
                *time = cpu_to_le16((23 << 11) | (59 << 5) | 29);
                *date = cpu_to_le16((127 << 9) | (12 << 5) | 31);
                if (time_cs)
                        *time_cs = 199;
                return;
        }
-#endif
-        day = second / SECS_PER_DAY - DAYS_DELTA;
+        /* from 1900 -> from 1980 */
-        year = day / 365;
+        tm.tm_year -= 80;
-        leap_day = (year + 3) / 4;
+        /* 0~11 -> 1~12 */
-        if (year > YEAR_2100)           /* 2100 isn't leap year */
+        tm.tm_mon++;
-                leap_day--;
+        /* 0~59 -> 0~29(2sec counts) */
-        if (year * 365 + leap_day > day)
+        tm.tm_sec >>= 1;
-                year--;
-        leap_day = (year + 3) / 4;
-        if (year > YEAR_2100)           /* 2100 isn't leap year */
-                leap_day--;
-        day -= year * 365 + leap_day;
-        if (IS_LEAP_YEAR(year) && day == days_in_year[3]) {
-                month = 2;
-        } else {
-                if (IS_LEAP_YEAR(year) && day > days_in_year[3])
-                        day--;
-                for (month = 1; month < 12; month++) {
-                        if (days_in_year[month + 1] > day)
-                                break;
-                }
-        }
-        day -= days_in_year[month];
-        *time = cpu_to_le16(((second / SECS_PER_HOUR) % 24) << 11
+        *time = cpu_to_le16(tm.tm_hour << 11 | tm.tm_min << 5 | tm.tm_sec);
-                            | ((second / SECS_PER_MIN) % 60) << 5
+        *date = cpu_to_le16(tm.tm_year << 9 | tm.tm_mon << 5 | tm.tm_mday);
-                            | (second % SECS_PER_MIN) >> 1);
-        *date = cpu_to_le16((year << 9) | (month << 5) | (day + 1));
        if (time_cs)
                *time_cs = (ts->tv_sec & 1) * 100 + ts->tv_nsec / 10000000;
 }
@@ -285,4 +259,3 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
        }
        return err;
 }
diff --git a/fs/file_table.c b/fs/file_table.c
index 4bef4c01ec6f..69652c5bd5f0 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -21,9 +21,12 @@
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
 #include <linux/percpu_counter.h>
+#include <linux/ima.h>
 #include <asm/atomic.h>
+#include "internal.h"
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
        .max_files = NR_FILE
@@ -147,8 +150,6 @@ fail:
        return NULL;
 }
-EXPORT_SYMBOL(get_empty_filp);
 /**
 * alloc_file - allocate and initialize a 'struct file'
 * @mnt: the vfsmount on which the file will reside
@@ -164,8 +165,8 @@ EXPORT_SYMBOL(get_empty_filp);
 * If all the callers of init_file() are eliminated, its
 * code should be moved into this function.
 */
-struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
+struct file *alloc_file(struct path *path, fmode_t mode,
-                fmode_t mode, const struct file_operations *fop)
+                const struct file_operations *fop)
 {
        struct file *file;
@@ -173,35 +174,8 @@ struct file *alloc_file(struct vfsmount *mnt, struct dentry *dentry,
        if (!file)
                return NULL;
-        init_file(file, mnt, dentry, mode, fop);
+        file->f_path = *path;
-        return file;
+        file->f_mapping = path->dentry->d_inode->i_mapping;
-}
-EXPORT_SYMBOL(alloc_file);
-/**
- * init_file - initialize a 'struct file'
- * @file: the already allocated 'struct file' to initialized
- * @mnt: the vfsmount on which the file resides
- * @dentry: the dentry representing this file
- * @mode: the mode the file is opened with
- * @fop: the 'struct file_operations' for this file
- *
- * Use this instead of setting the members directly.  Doing so
- * avoids making mistakes like forgetting the mntget() or
- * forgetting to take a write on the mnt.
- *
- * Note: This is a crappy interface.  It is here to make
- * merging with the existing users of get_empty_filp()
- * who have complex failure logic easier.  All users
- * of this should be moving to alloc_file().
- */
-int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
-           fmode_t mode, const struct file_operations *fop)
-{
-        int error = 0;
-        file->f_path.dentry = dentry;
-        file->f_path.mnt = mntget(mnt);
-        file->f_mapping = dentry->d_inode->i_mapping;
        file->f_mode = mode;
        file->f_op = fop;
@@ -211,14 +185,14 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
         * visible.  We do this for consistency, and so
         * that we can do debugging checks at __fput()
         */
-        if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
+        if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
                file_take_write(file);
-                error = mnt_clone_write(mnt);
+                WARN_ON(mnt_clone_write(path->mnt));
-                WARN_ON(error);
        }
-        return error;
+        ima_counts_get(file);
+        return file;
 }
-EXPORT_SYMBOL(init_file);
+EXPORT_SYMBOL(alloc_file);
 void fput(struct file *file)
 {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 49bc1b8e8f19..1a7c42c64ff4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -242,6 +242,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
 /**
 * bdi_start_writeback - start writeback
 * @bdi: the backing device to write from
+ * @sb: write inodes from this super_block
 * @nr_pages: the number of pages to write
 *
 * Description:
@@ -1187,6 +1188,23 @@ void writeback_inodes_sb(struct super_block *sb)
 EXPORT_SYMBOL(writeback_inodes_sb);
 /**
+ * writeback_inodes_sb_if_idle  -       start writeback if none underway
+ * @sb: the superblock
+ *
+ * Invoke writeback_inodes_sb if no writeback is currently underway.
+ * Returns 1 if writeback was started, 0 if not.
+ */
+int writeback_inodes_sb_if_idle(struct super_block *sb)
+{
+        if (!writeback_in_progress(sb->s_bdi)) {
+                writeback_inodes_sb(sb);
+                return 1;
+        } else
+                return 0;
+}
+EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
+/**
 * sync_inodes_sb       -       sync sb inode pages
 * @sb: the superblock
 *
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index e590242fa41a..3221a0c7944e 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -91,7 +91,7 @@ EXPORT_SYMBOL(fscache_object_destroy);
 */
 static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
 {
-        struct fscache_object *pobj, *obj, *minobj = NULL;
+        struct fscache_object *pobj, *obj = NULL, *minobj = NULL;
        struct rb_node *p;
        unsigned long pos;
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index e0b53aa7bbec..55458031e501 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -1,62 +1,58 @@
 /*
- * fs/generic_acl.c
- *
 * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
 *
 * This file is released under the GPL.
+ *
+ * Generic ACL support for in-memory filesystems.
 */
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/generic_acl.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
-/**
- * generic_acl_list  -  Generic xattr_handler->list() operation
+static size_t
- * @ops:        Filesystem specific getacl and setacl callbacks
+generic_acl_list(struct dentry *dentry, char *list, size_t list_size,
- */
+                const char *name, size_t name_len, int type)
-size_t
-generic_acl_list(struct inode *inode, struct generic_acl_operations *ops,
-                 int type, char *list, size_t list_size)
 {
        struct posix_acl *acl;
-        const char *name;
+        const char *xname;
        size_t size;
-        acl = ops->getacl(inode, type);
+        acl = get_cached_acl(dentry->d_inode, type);
        if (!acl)
                return 0;
        posix_acl_release(acl);
-        switch(type) {
+        switch (type) {
-                case ACL_TYPE_ACCESS:
+        case ACL_TYPE_ACCESS:
-                        name = POSIX_ACL_XATTR_ACCESS;
+                xname = POSIX_ACL_XATTR_ACCESS;
-                        break;
+                break;
+        case ACL_TYPE_DEFAULT:
-                case ACL_TYPE_DEFAULT:
+                xname = POSIX_ACL_XATTR_DEFAULT;
-                        name = POSIX_ACL_XATTR_DEFAULT;
+                break;
-                        break;
+        default:
+                return 0;
-                default:
-                        return 0;
        }
-        size = strlen(name) + 1;
+        size = strlen(xname) + 1;
        if (list && size <= list_size)
-                memcpy(list, name, size);
+                memcpy(list, xname, size);
        return size;
 }
-/**
+static int
- * generic_acl_get  -  Generic xattr_handler->get() operation
+generic_acl_get(struct dentry *dentry, const char *name, void *buffer,
- * @ops:        Filesystem specific getacl and setacl callbacks
+                     size_t size, int type)
- */
-int
-generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
-                int type, void *buffer, size_t size)
 {
        struct posix_acl *acl;
        int error;
-        acl = ops->getacl(inode, type);
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
+        acl = get_cached_acl(dentry->d_inode, type);
        if (!acl)
                return -ENODATA;
        error = posix_acl_to_xattr(acl, buffer, size);
@@ -65,17 +61,16 @@ generic_acl_get(struct inode *inode, struct generic_acl_operations *ops,
        return error;
 }
-/**
+static int
- * generic_acl_set  -  Generic xattr_handler->set() operation
+generic_acl_set(struct dentry *dentry, const char *name, const void *value,
- * @ops:        Filesystem specific getacl and setacl callbacks
+                     size_t size, int flags, int type)
- */
-int
-generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
-                int type, const void *value, size_t size)
 {
+        struct inode *inode = dentry->d_inode;
        struct posix_acl *acl = NULL;
        int error;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
        if (S_ISLNK(inode->i_mode))
                return -EOPNOTSUPP;
        if (!is_owner_or_cap(inode))
@@ -91,28 +86,27 @@ generic_acl_set(struct inode *inode, struct generic_acl_operations *ops,
                error = posix_acl_valid(acl);
                if (error)
                        goto failed;
-                switch(type) {
+                switch (type) {
-                        case ACL_TYPE_ACCESS:
+                case ACL_TYPE_ACCESS:
-                                mode = inode->i_mode;
+                        mode = inode->i_mode;
-                                error = posix_acl_equiv_mode(acl, &mode);
+                        error = posix_acl_equiv_mode(acl, &mode);
-                                if (error < 0)
+                        if (error < 0)
-                                        goto failed;
+                                goto failed;
-                                inode->i_mode = mode;
+                        inode->i_mode = mode;
-                                if (error == 0) {
+                        if (error == 0) {
-                                        posix_acl_release(acl);
+                                posix_acl_release(acl);
-                                        acl = NULL;
+                                acl = NULL;
-                                }
+                        }
-                                break;
+                        break;
+                case ACL_TYPE_DEFAULT:
-                        case ACL_TYPE_DEFAULT:
+                        if (!S_ISDIR(inode->i_mode)) {
-                                if (!S_ISDIR(inode->i_mode)) {
+                                error = -EINVAL;
-                                        error = -EINVAL;
+                                goto failed;
-                                        goto failed;
+                        }
-                                }
+                        break;
-                                break;
                }
        }
-        ops->setacl(inode, type, acl);
+        set_cached_acl(inode, type, acl);
        error = 0;
 failed:
        posix_acl_release(acl);
@@ -121,14 +115,12 @@ failed:
 /**
 * generic_acl_init  -  Take care of acl inheritance at @inode create time
- * @ops:        Filesystem specific getacl and setacl callbacks
 *
 * Files created inside a directory with a default ACL inherit the
 * directory's default ACL.
 */
 int
-generic_acl_init(struct inode *inode, struct inode *dir,
+generic_acl_init(struct inode *inode, struct inode *dir)
-                 struct generic_acl_operations *ops)
 {
        struct posix_acl *acl = NULL;
        mode_t mode = inode->i_mode;
@@ -136,7 +128,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
        inode->i_mode = mode & ~current_umask();
        if (!S_ISLNK(inode->i_mode))
-                acl = ops->getacl(dir, ACL_TYPE_DEFAULT);
+                acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
        if (acl) {
                struct posix_acl *clone;
@@ -145,7 +137,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
                        error = -ENOMEM;
                        if (!clone)
                                goto cleanup;
-                        ops->setacl(inode, ACL_TYPE_DEFAULT, clone);
+                        set_cached_acl(inode, ACL_TYPE_DEFAULT, clone);
                        posix_acl_release(clone);
                }
                clone = posix_acl_clone(acl, GFP_KERNEL);
@@ -156,7 +148,7 @@ generic_acl_init(struct inode *inode, struct inode *dir,
                if (error >= 0) {
                        inode->i_mode = mode;
                        if (error > 0)
-                                ops->setacl(inode, ACL_TYPE_ACCESS, clone);
+                                set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
                }
                posix_acl_release(clone);
        }
@@ -169,20 +161,19 @@ cleanup:
 /**
 * generic_acl_chmod  -  change the access acl of @inode upon chmod()
- * @ops:        FIlesystem specific getacl and setacl callbacks
 *
 * A chmod also changes the permissions of the owner, group/mask, and
 * other ACL entries.
 */
 int
-generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops)
+generic_acl_chmod(struct inode *inode)
 {
        struct posix_acl *acl, *clone;
        int error = 0;
        if (S_ISLNK(inode->i_mode))
                return -EOPNOTSUPP;
-        acl = ops->getacl(inode, ACL_TYPE_ACCESS);
+        acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
        if (acl) {
                clone = posix_acl_clone(acl, GFP_KERNEL);
                posix_acl_release(acl);
@@ -190,8 +181,37 @@ generic_acl_chmod(struct inode *inode, struct generic_acl_operations *ops)
                        return -ENOMEM;
                error = posix_acl_chmod_masq(clone, inode->i_mode);
                if (!error)
-                        ops->setacl(inode, ACL_TYPE_ACCESS, clone);
+                        set_cached_acl(inode, ACL_TYPE_ACCESS, clone);
                posix_acl_release(clone);
        }
        return error;
 }
+int
+generic_check_acl(struct inode *inode, int mask)
+{
+        struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
+        if (acl) {
+                int error = posix_acl_permission(inode, acl, mask);
+                posix_acl_release(acl);
+                return error;
+        }
+        return -EAGAIN;
+}
+struct xattr_handler generic_acl_access_handler = {
+        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags  = ACL_TYPE_ACCESS,
+        .list   = generic_acl_list,
+        .get    = generic_acl_get,
+        .set    = generic_acl_set,
+};
+struct xattr_handler generic_acl_default_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags  = ACL_TYPE_DEFAULT,
+        .list   = generic_acl_list,
+        .get    = generic_acl_get,
+        .set    = generic_acl_set,
+};
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index b192c661caa6..4dcddf83326f 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -10,7 +10,6 @@ config GFS2_FS
        select SLOW_WORK
        select QUOTA
        select QUOTACTL
-        select FS_JOURNAL_INFO
        help
          A cluster filesystem.
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3eb1ea846173..87ee309d4c24 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -126,7 +126,7 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
        error = posix_acl_to_xattr(acl, data, len);
        if (error < 0)
                goto out;
-        error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, data, len, 0);
+        error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
        if (!error)
                set_cached_acl(inode, type, acl);
 out:
@@ -232,9 +232,10 @@ static int gfs2_acl_type(const char *name)
        return -EINVAL;
 }
-static int gfs2_xattr_system_get(struct inode *inode, const char *name,
+static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
-                                 void *buffer, size_t size)
+                                 void *buffer, size_t size, int xtype)
 {
+        struct inode *inode = dentry->d_inode;
        struct posix_acl *acl;
        int type;
        int error;
@@ -255,9 +256,11 @@ static int gfs2_xattr_system_get(struct inode *inode, const char *name,
        return error;
 }
-static int gfs2_xattr_system_set(struct inode *inode, const char *name,
+static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
-                                 const void *value, size_t size, int flags)
+                                 const void *value, size_t size, int flags,
+                                 int xtype)
 {
+        struct inode *inode = dentry->d_inode;
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct posix_acl *acl = NULL;
        int error = 0, type;
@@ -319,7 +322,7 @@ static int gfs2_xattr_system_set(struct inode *inode, const char *name,
        }
 set_acl:
-        error = gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, 0);
+        error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS);
        if (!error) {
                if (acl)
                        set_cached_acl(inode, type, acl);
@@ -334,6 +337,7 @@ out:
 struct xattr_handler gfs2_xattr_system_handler = {
        .prefix = XATTR_SYSTEM_PREFIX,
+        .flags  = GFS2_EATYPE_SYS,
        .get    = gfs2_xattr_system_get,
        .set    = gfs2_xattr_system_set,
 };
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 26ba2a4c4a2d..6e220f4eee7d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -125,7 +125,7 @@ static struct inode *gfs2_iget_skip(struct super_block *sb,
 * directory entry when gfs2_inode_lookup() is invoked. Part of the code
 * segment inside gfs2_inode_lookup code needs to get moved around.
 *
- * Clean up I_LOCK and I_NEW as well.
+ * Clears I_NEW as well.
 **/
 void gfs2_set_iop(struct inode *inode)
@@ -801,7 +801,8 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip)
                return err;
        }
-        err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0);
+        err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
+                               GFS2_EATYPE_SECURITY);
        kfree(value);
        kfree(name);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 912f5cbc4740..8a04108e0c22 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -567,18 +567,17 @@ out:
 /**
 * gfs2_xattr_get - Get a GFS2 extended attribute
 * @inode: The inode
- * @type: The type of extended attribute
 * @name: The name of the extended attribute
 * @buffer: The buffer to write the result into
 * @size: The size of the buffer
+ * @type: The type of extended attribute
 *
 * Returns: actual size of data on success, -errno on error
 */
+static int gfs2_xattr_get(struct dentry *dentry, const char *name,
-int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+                void *buffer, size_t size, int type)
-                   void *buffer, size_t size)
 {
-        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
        struct gfs2_ea_location el;
        int error;
@@ -1119,7 +1118,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 /**
 * gfs2_xattr_remove - Remove a GFS2 extended attribute
- * @inode: The inode
+ * @ip: The inode
 * @type: The type of the extended attribute
 * @name: The name of the extended attribute
 *
@@ -1130,9 +1129,8 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
 * Returns: 0, or errno on failure
 */
-static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
+static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name)
 {
-        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_ea_location el;
        int error;
@@ -1156,24 +1154,24 @@ static int gfs2_xattr_remove(struct inode *inode, int type, const char *name)
 }
 /**
- * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
- * @inode: The inode
+ * @inode: The inode
- * @type: The type of the extended attribute
 * @name: The name of the extended attribute
 * @value: The value of the extended attribute (NULL for remove)
 * @size: The size of the @value argument
 * @flags: Create or Replace
+ * @type: The type of the extended attribute
 *
 * See gfs2_xattr_remove() for details of the removal of xattrs.
 *
 * Returns: 0 or errno on failure
 */
-int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+int __gfs2_xattr_set(struct inode *inode, const char *name,
-                   const void *value, size_t size, int flags)
+                   const void *value, size_t size, int flags, int type)
 {
-        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct gfs2_inode *ip = GFS2_I(inode);
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct gfs2_ea_location el;
        unsigned int namel = strlen(name);
        int error;
@@ -1184,7 +1182,7 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name,
                return -ERANGE;
        if (value == NULL)
-                return gfs2_xattr_remove(inode, type, name);
+                return gfs2_xattr_remove(ip, type, name);
        if (ea_check_size(sdp, namel, size))
                return -ERANGE;
@@ -1224,6 +1222,13 @@ int gfs2_xattr_set(struct inode *inode, int type, const char *name,
        return error;
 }
+static int gfs2_xattr_set(struct dentry *dentry, const char *name,
+                const void *value, size_t size, int flags, int type)
+{
+        return __gfs2_xattr_set(dentry->d_inode, name, value,
+                                size, flags, type);
+}
 static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
                                  struct gfs2_ea_header *ea, char *data)
 {
@@ -1529,40 +1534,18 @@ out_alloc:
        return error;
 }
-static int gfs2_xattr_user_get(struct inode *inode, const char *name,
-                               void *buffer, size_t size)
-{
-        return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size);
-}
-static int gfs2_xattr_user_set(struct inode *inode, const char *name,
-                               const void *value, size_t size, int flags)
-{
-        return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags);
-}
-static int gfs2_xattr_security_get(struct inode *inode, const char *name,
-                                   void *buffer, size_t size)
-{
-        return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size);
-}
-static int gfs2_xattr_security_set(struct inode *inode, const char *name,
-                                   const void *value, size_t size, int flags)
-{
-        return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags);
-}
 static struct xattr_handler gfs2_xattr_user_handler = {
        .prefix = XATTR_USER_PREFIX,
-        .get    = gfs2_xattr_user_get,
+        .flags  = GFS2_EATYPE_USR,
-        .set    = gfs2_xattr_user_set,
+        .get    = gfs2_xattr_get,
+        .set    = gfs2_xattr_set,
 };
 static struct xattr_handler gfs2_xattr_security_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
-        .get    = gfs2_xattr_security_get,
+        .flags  = GFS2_EATYPE_SECURITY,
-        .set    = gfs2_xattr_security_set,
+        .get    = gfs2_xattr_get,
+        .set    = gfs2_xattr_set,
 };
 struct xattr_handler *gfs2_xattr_handlers[] = {
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index 8d6ae5813c4d..d392f8358f2f 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -53,10 +53,9 @@ struct gfs2_ea_location {
        struct gfs2_ea_header *el_prev;
 };
-extern int gfs2_xattr_get(struct inode *inode, int type, const char *name,
+extern int __gfs2_xattr_set(struct inode *inode, const char *name,
-                          void *buffer, size_t size);
+                            const void *value, size_t size,
-extern int gfs2_xattr_set(struct inode *inode, int type, const char *name,
+                            int flags, int type);
-                          const void *value, size_t size, int flags);
 extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
 extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f2feaa06bf26..cadc4ce48656 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -14,6 +14,7 @@
 #include <linux/magic.h>
 #include <linux/sched.h>
 #include <linux/smp_lock.h>
+#include <linux/bitmap.h>
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
@@ -115,15 +116,13 @@ static void hpfs_put_super(struct super_block *s)
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
 {
        struct quad_buffer_head qbh;
-        unsigned *bits;
+        unsigned long *bits;
-        unsigned i, count;
+        unsigned count;
-        if (!(bits = hpfs_map_4sectors(s, secno, &qbh, 4))) return 0;
-        count = 0;
+        bits = hpfs_map_4sectors(s, secno, &qbh, 4);
-        for (i = 0; i < 2048 / sizeof(unsigned); i++) {
+        if (!bits)
-                unsigned b; 
+                return 0;
-                if (!bits[i]) continue;
+        count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
-                for (b = bits[i]; b; b>>=1) count += b & 1;
-        }
        hpfs_brelse4(&qbh);
        return count;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 87a1258953b8..a0bbd3d1b41a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,7 +30,6 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/magic.h>
 #include <asm/uaccess.h>
@@ -922,7 +921,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
        int error = -ENOMEM;
        struct file *file;
        struct inode *inode;
-        struct dentry *dentry, *root;
+        struct path path;
+        struct dentry *root;
        struct qstr quick_string;
        *user = NULL;
@@ -944,10 +944,11 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
        quick_string.name = name;
        quick_string.len = strlen(quick_string.name);
        quick_string.hash = 0;
-        dentry = d_alloc(root, &quick_string);
+        path.dentry = d_alloc(root, &quick_string);
-        if (!dentry)
+        if (!path.dentry)
                goto out_shm_unlock;
+        path.mnt = mntget(hugetlbfs_vfsmount);
        error = -ENOSPC;
        inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(),
                                current_fsgid(), S_IFREG | S_IRWXUGO, 0);
@@ -960,24 +961,22 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag,
                        acctflag))
                goto out_inode;
-        d_instantiate(dentry, inode);
+        d_instantiate(path.dentry, inode);
        inode->i_size = size;
        inode->i_nlink = 0;
        error = -ENFILE;
-        file = alloc_file(hugetlbfs_vfsmount, dentry,
+        file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
-                        FMODE_WRITE | FMODE_READ,
                        &hugetlbfs_file_operations);
        if (!file)
                goto out_dentry; /* inode is already attached */
-        ima_counts_get(file);
        return file;
 out_inode:
        iput(inode);
 out_dentry:
-        dput(dentry);
+        path_put(&path);
 out_shm_unlock:
        if (*user) {
                user_shm_unlock(size, *user);
diff --git a/fs/inode.c b/fs/inode.c
index 06c1f02de611..03dfeb2e3928 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -113,7 +113,7 @@ static void wake_up_inode(struct inode *inode)
         * Prevent speculative execution through spin_unlock(&inode_lock);
         */
        smp_mb();
-        wake_up_bit(&inode->i_state, __I_LOCK);
+        wake_up_bit(&inode->i_state, __I_NEW);
 }
 /**
@@ -690,17 +690,17 @@ void unlock_new_inode(struct inode *inode)
        }
 #endif
        /*
-         * This is special!  We do not need the spinlock when clearing I_LOCK,
+         * This is special!  We do not need the spinlock when clearing I_NEW,
         * because we're guaranteed that nobody else tries to do anything about
         * the state of the inode when it is locked, as we just created it (so
-         * there can be no old holders that haven't tested I_LOCK).
+         * there can be no old holders that haven't tested I_NEW).
         * However we must emit the memory barrier so that other CPUs reliably
-         * see the clearing of I_LOCK after the other inode initialisation has
+         * see the clearing of I_NEW after the other inode initialisation has
         * completed.
         */
        smp_mb();
-        WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
+        WARN_ON(!(inode->i_state & I_NEW));
-        inode->i_state &= ~(I_LOCK|I_NEW);
+        inode->i_state &= ~I_NEW;
        wake_up_inode(inode);
 }
 EXPORT_SYMBOL(unlock_new_inode);
@@ -731,7 +731,7 @@ static struct inode *get_new_inode(struct super_block *sb,
                                goto set_failed;
                        __inode_add_to_lists(sb, head, inode);
-                        inode->i_state = I_LOCK|I_NEW;
+                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
                        /* Return the locked inode with I_NEW set, the
@@ -778,7 +778,7 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
                if (!old) {
                        inode->i_ino = ino;
                        __inode_add_to_lists(sb, head, inode);
-                        inode->i_state = I_LOCK|I_NEW;
+                        inode->i_state = I_NEW;
                        spin_unlock(&inode_lock);
                        /* Return the locked inode with I_NEW set, the
@@ -1083,7 +1083,7 @@ int insert_inode_locked(struct inode *inode)
        ino_t ino = inode->i_ino;
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
-        inode->i_state |= I_LOCK|I_NEW;
+        inode->i_state |= I_NEW;
        while (1) {
                struct hlist_node *node;
                struct inode *old = NULL;
@@ -1120,7 +1120,7 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
        struct super_block *sb = inode->i_sb;
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
-        inode->i_state |= I_LOCK|I_NEW;
+        inode->i_state |= I_NEW;
        while (1) {
                struct hlist_node *node;
@@ -1510,7 +1510,7 @@ EXPORT_SYMBOL(inode_wait);
 * until the deletion _might_ have completed.  Callers are responsible
 * to recheck inode state.
 *
- * It doesn't matter if I_LOCK is not set initially, a call to
+ * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_inode() after removing from the hash list will DTRT.
 *
 * This is called with inode_lock held.
@@ -1518,8 +1518,8 @@ EXPORT_SYMBOL(inode_wait);
 static void __wait_on_freeing_inode(struct inode *inode)
 {
        wait_queue_head_t *wq;
-        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK);
+        DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
-        wq = bit_waitqueue(&inode->i_state, __I_LOCK);
+        wq = bit_waitqueue(&inode->i_state, __I_NEW);
        prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode_lock);
        schedule();
diff --git a/fs/internal.h b/fs/internal.h
index 515175b8b72e..e96a1667d749 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -79,8 +79,16 @@ extern void chroot_fs_refs(struct path *, struct path *);
 * file_table.c
 */
 extern void mark_files_ro(struct super_block *);
+extern struct file *get_empty_filp(void);
 /*
 * super.c
 */
 extern int do_remount_sb(struct super_block *, int, void *, int);
+/*
+ * open.c
+ */
+struct nameidata;
+extern struct file *nameidata_to_filp(struct nameidata *);
+extern void release_open_intent(struct nameidata *);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index e81a30593ba9..ed752cb38474 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -9,7 +9,7 @@
 *
 * The following files are helpful:
 *
- *     Documentation/filesystems/Exporting
+ *     Documentation/filesystems/nfs/Exporting
 *     fs/exportfs/expfs.c.
 */
diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig
index a8408983abd4..4e28beeed157 100644
--- a/fs/jbd/Kconfig
+++ b/fs/jbd/Kconfig
@@ -1,6 +1,5 @@
 config JBD
        tristate
-        select FS_JOURNAL_INFO
        help
          This is a generic journalling layer for block devices.  It is
          currently used by the ext3 file system, but it could also be
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 4160afad6d00..bd224eec9b07 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void)
 {
        jbd_debugfs_dir = debugfs_create_dir("jbd", NULL);
        if (jbd_debugfs_dir)
-                jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO,
+                jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR,
                                               jbd_debugfs_dir,
                                               &journal_enable_debug);
 }
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig
index 0f7d1ceafdfd..f32f346f4b0a 100644
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,7 +1,6 @@
 config JBD2
        tristate
        select CRC32
-        select FS_JOURNAL_INFO
        help
          This is a generic journaling layer for block devices that support
          both 32-bit and 64-bit block numbers.  It is currently used by
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index ca0f5eb62b20..886849370950 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -22,6 +22,7 @@
 #include <linux/jbd2.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
 #include <trace/events/jbd2.h>
 /*
@@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
        journal->j_tail_sequence = first_tid;
        journal->j_tail = blocknr;
        spin_unlock(&journal->j_state_lock);
+        /*
+         * If there is an external journal, we need to make sure that
+         * any data blocks that were recently written out --- perhaps
+         * by jbd2_log_do_checkpoint() --- are flushed out before we
+         * drop the transactions from the external journal.  It's
+         * unlikely this will be necessary, especially with an
+         * appropriately sized journal, but we need this to guarantee
+         * correctness.  Fortunately jbd2_cleanup_journal_tail()
+         * doesn't get called all that often.
+         */
+        if ((journal->j_fs_dev != journal->j_dev) &&
+            (journal->j_flags & JBD2_BARRIER))
+                blkdev_issue_flush(journal->j_fs_dev, NULL);
        if (!(journal->j_flags & JBD2_ABORT))
                jbd2_journal_update_superblock(journal, 1);
        return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6a10238d2c63..1bc74b6f26d2 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal,
                        ret = err;
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
+                commit_transaction->t_flushed_data_blocks = 1;
                jinode->i_flags &= ~JI_COMMIT_RUNNING;
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
@@ -708,8 +709,17 @@ start_journal_io:
                }
        }
-        /* Done it all: now write the commit record asynchronously. */
+        /*
+         * If the journal is not located on the file system device,
+         * then we must flush the file system device before we issue
+         * the commit record.
+         */
+        if (commit_transaction->t_flushed_data_blocks &&
+            (journal->j_fs_dev != journal->j_dev) &&
+            (journal->j_flags & JBD2_BARRIER))
+                blkdev_issue_flush(journal->j_fs_dev, NULL);
+        /* Done it all: now write the commit record asynchronously. */
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
@@ -720,13 +730,6 @@ start_journal_io:
                        blkdev_issue_flush(journal->j_dev, NULL);
        }
-        /*
-         * This is the right place to wait for data buffers both for ASYNC
-         * and !ASYNC commit. If commit is ASYNC, we need to wait only after
-         * the commit block went to disk (which happens above). If commit is
-         * SYNC, we need to wait for data buffers before we start writing
-         * commit block, which happens below in such setting.
-         */
        err = journal_finish_inode_data_buffers(journal, commit_transaction);
        if (err) {
                printk(KERN_WARNING
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b7ca3a92a4db..ac0d027595d0 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -814,7 +814,7 @@ static journal_t * journal_init_common (void)
        journal_t *journal;
        int err;
-        journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL);
+        journal = kzalloc(sizeof(*journal), GFP_KERNEL);
        if (!journal)
                goto fail;
@@ -2115,7 +2115,8 @@ static void __init jbd2_create_debugfs_entry(void)
 {
        jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL);
        if (jbd2_debugfs_dir)
-                jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO,
+                jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME,
+                                               S_IRUGO | S_IWUSR,
                                               jbd2_debugfs_dir,
                                               &jbd2_journal_enable_debug);
 }
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7edb62e97419..7cdc3196476a 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -350,8 +350,8 @@ int jffs2_acl_chmod(struct inode *inode)
        return rc;
 }
-static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t list_size,
+static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list,
-                                         const char *name, size_t name_len)
+                size_t list_size, const char *name, size_t name_len, int type)
 {
        const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
@@ -360,8 +360,8 @@ static size_t jffs2_acl_access_listxattr(struct inode *inode, char *list, size_t
        return retlen;
 }
-static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_t list_size,
+static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list,
-                                          const char *name, size_t name_len)
+                size_t list_size, const char *name, size_t name_len, int type)
 {
        const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
@@ -370,12 +370,16 @@ static size_t jffs2_acl_default_listxattr(struct inode *inode, char *list, size_
        return retlen;
 }
-static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_t size)
+static int jffs2_acl_getxattr(struct dentry *dentry, const char *name,
+                void *buffer, size_t size, int type)
 {
        struct posix_acl *acl;
        int rc;
-        acl = jffs2_get_acl(inode, type);
+        if (name[0] != '\0')
+                return -EINVAL;
+        acl = jffs2_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (!acl)
@@ -386,26 +390,15 @@ static int jffs2_acl_getxattr(struct inode *inode, int type, void *buffer, size_
        return rc;
 }
-static int jffs2_acl_access_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
+static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
-{
+                const void *value, size_t size, int flags, int type)
-        if (name[0] != '\0')
-                return -EINVAL;
-        return jffs2_acl_getxattr(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-static int jffs2_acl_default_getxattr(struct inode *inode, const char *name, void *buffer, size_t size)
-{
-        if (name[0] != '\0')
-                return -EINVAL;
-        return jffs2_acl_getxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value, size_t size)
 {
        struct posix_acl *acl;
        int rc;
-        if (!is_owner_or_cap(inode))
+        if (name[0] != '\0')
+                return -EINVAL;
+        if (!is_owner_or_cap(dentry->d_inode))
                return -EPERM;
        if (value) {
@@ -420,38 +413,24 @@ static int jffs2_acl_setxattr(struct inode *inode, int type, const void *value,
        } else {
                acl = NULL;
        }
-        rc = jffs2_set_acl(inode, type, acl);
+        rc = jffs2_set_acl(dentry->d_inode, type, acl);
 out:
        posix_acl_release(acl);
        return rc;
 }
-static int jffs2_acl_access_setxattr(struct inode *inode, const char *name,
-                                     const void *buffer, size_t size, int flags)
-{
-        if (name[0] != '\0')
-                return -EINVAL;
-        return jffs2_acl_setxattr(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-static int jffs2_acl_default_setxattr(struct inode *inode, const char *name,
-                                      const void *buffer, size_t size, int flags)
-{
-        if (name[0] != '\0')
-                return -EINVAL;
-        return jffs2_acl_setxattr(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
 struct xattr_handler jffs2_acl_access_xattr_handler = {
         .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags  = ACL_TYPE_ACCESS,
         .list   = jffs2_acl_access_listxattr,
-        .get    = jffs2_acl_access_getxattr,
+        .get    = jffs2_acl_getxattr,
-        .set    = jffs2_acl_access_setxattr,
+        .set    = jffs2_acl_setxattr,
 };
 struct xattr_handler jffs2_acl_default_xattr_handler = {
        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags  = ACL_TYPE_DEFAULT,
        .list   = jffs2_acl_default_listxattr,
-        .get    = jffs2_acl_default_getxattr,
+        .get    = jffs2_acl_getxattr,
-        .set    = jffs2_acl_default_setxattr,
+        .set    = jffs2_acl_setxattr,
 };
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 090c556ffed2..3b6f2fa12cff 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -700,7 +700,8 @@ static int jffs2_garbage_collect_metadata(struct jffs2_sb_info *c, struct jffs2_
        struct jffs2_raw_inode ri;
        struct jffs2_node_frag *last_frag;
        union jffs2_device_node dev;
-        char *mdata = NULL, mdatalen = 0;
+        char *mdata = NULL;
+        int mdatalen = 0;
        uint32_t alloclen, ilen;
        int ret;
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 378991cfe40f..e22de8397b74 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -1284,7 +1284,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c,
                                f->target = NULL;
                                mutex_unlock(&f->sem);
                                jffs2_do_clear_inode(c, f);
-                                return -ret;
+                                return ret;
                        }
                        f->target[je32_to_cpu(latest_node->csize)] = '\0';
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 02c39c64ecb3..eaccee058583 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -44,26 +44,28 @@ int jffs2_init_security(struct inode *inode, struct inode *dir)
 }
 /* ---- XATTR Handler for "security.*" ----------------- */
-static int jffs2_security_getxattr(struct inode *inode, const char *name,
+static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
-                                   void *buffer, size_t size)
+                                   void *buffer, size_t size, int type)
 {
        if (!strcmp(name, ""))
                return -EINVAL;
-        return do_jffs2_getxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size);
+        return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
+                                 name, buffer, size);
 }
-static int jffs2_security_setxattr(struct inode *inode, const char *name, const void *buffer,
+static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
-                                   size_t size, int flags)
+                const void *buffer, size_t size, int flags, int type)
 {
        if (!strcmp(name, ""))
                return -EINVAL;
-        return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags);
+        return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY,
+                                 name, buffer, size, flags);
 }
-static size_t jffs2_security_listxattr(struct inode *inode, char *list, size_t list_size,
+static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
-                                       const char *name, size_t name_len)
+                size_t list_size, const char *name, size_t name_len, int type)
 {
        size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 6caf1e1ee26d..800171dca53b 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -23,7 +23,7 @@
 int jffs2_sum_init(struct jffs2_sb_info *c)
 {
-        uint32_t sum_size = max_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
+        uint32_t sum_size = min_t(uint32_t, c->sector_size, MAX_SUMMARY_SIZE);
        c->summary = kzalloc(sizeof(struct jffs2_summary), GFP_KERNEL);
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4b107881acd5..9e75c62c85d6 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -990,9 +990,11 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
                if (!xhandle)
                        continue;
                if (buffer) {
-                        rc = xhandle->list(inode, buffer+len, size-len, xd->xname, xd->name_len);
+                        rc = xhandle->list(dentry, buffer+len, size-len,
+                                           xd->xname, xd->name_len, xd->flags);
                } else {
-                        rc = xhandle->list(inode, NULL, 0, xd->xname, xd->name_len);
+                        rc = xhandle->list(dentry, NULL, 0, xd->xname,
+                                           xd->name_len, xd->flags);
                }
                if (rc < 0)
                        goto out;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 8ec5765ef348..3e5a5e356e05 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,24 +16,26 @@
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
-static int jffs2_trusted_getxattr(struct inode *inode, const char *name,
+static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
-                                  void *buffer, size_t size)
+                void *buffer, size_t size, int type)
 {
        if (!strcmp(name, ""))
                return -EINVAL;
-        return do_jffs2_getxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size);
+        return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
+                                 name, buffer, size);
 }
-static int jffs2_trusted_setxattr(struct inode *inode, const char *name, const void *buffer,
+static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
-                                  size_t size, int flags)
+                const void *buffer, size_t size, int flags, int type)
 {
        if (!strcmp(name, ""))
                return -EINVAL;
-        return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags);
+        return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED,
+                                 name, buffer, size, flags);
 }
-static size_t jffs2_trusted_listxattr(struct inode *inode, char *list, size_t list_size,
+static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
-                                      const char *name, size_t name_len)
+                size_t list_size, const char *name, size_t name_len, int type)
 {
        size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8bbeab90ada1..8544af67dffe 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,24 +16,26 @@
 #include <linux/mtd/mtd.h>
 #include "nodelist.h"
-static int jffs2_user_getxattr(struct inode *inode, const char *name,
+static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
-                               void *buffer, size_t size)
+                               void *buffer, size_t size, int type)
 {
        if (!strcmp(name, ""))
                return -EINVAL;
-        return do_jffs2_getxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size);
+        return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
+                                 name, buffer, size);
 }
-static int jffs2_user_setxattr(struct inode *inode, const char *name, const void *buffer,
+static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
-                               size_t size, int flags)
+                const void *buffer, size_t size, int flags, int type)
 {
        if (!strcmp(name, ""))
                return -EINVAL;
-        return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags);
+        return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER,
+                                 name, buffer, size, flags);
 }
-static size_t jffs2_user_listxattr(struct inode *inode, char *list, size_t list_size,
+static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
-                                   const char *name, size_t name_len)
+                size_t list_size, const char *name, size_t name_len, int type)
 {
        size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index f26e4d03ada5..d945ea76b445 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -1292,7 +1292,7 @@ int txCommit(tid_t tid,		/* transaction identifier */
                 */
                /*
                 * I believe this code is no longer needed.  Splitting I_LOCK
-                 * into two bits, I_LOCK and I_SYNC should prevent this
+                 * into two bits, I_NEW and I_SYNC should prevent this
                 * deadlock as well.  But since I don't have a JFS testload
                 * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done.
                 * Joern
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 2234c73fc577..d929a822a74e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -524,7 +524,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
         * Page cache is indexed by long.
         * I would use MAX_LFS_FILESIZE, but it's only half as big
         */
-        sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes);
+        sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes);
 #endif
        sb->s_time_gran = 1;
        return 0;
diff --git a/fs/libfs.c b/fs/libfs.c
index 219576c52d80..6e8d17e1dc4c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -848,7 +848,6 @@ EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_dir_inode_operations);
 EXPORT_SYMBOL(simple_dir_operations);
 EXPORT_SYMBOL(simple_empty);
-EXPORT_SYMBOL(d_alloc_name);
 EXPORT_SYMBOL(simple_fill_super);
 EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index bd173a6ca3b1..a7966eed3c17 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -11,10 +11,6 @@
 #include <linux/time.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
-#include <linux/in.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index e1d28ddd2169..56c9519d900a 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -11,10 +11,6 @@
 #include <linux/time.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
-#include <linux/in.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/nfsd/nfsd.h>
 #include <linux/lockd/lockd.h>
 #include <linux/lockd/share.h>
diff --git a/fs/namei.c b/fs/namei.c
index 87f97ba90ad1..b55440baf7ab 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -35,7 +35,7 @@
 #include <linux/fs_struct.h>
 #include <asm/uaccess.h>
-#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
+#include "internal.h"
 /* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
@@ -108,8 +108,6 @@
 * any extra contention...
 */
-static int __link_path_walk(const char *name, struct nameidata *nd);
 /* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them..
@@ -234,6 +232,7 @@ int generic_permission(struct inode *inode, int mask,
        /*
         * Searching includes executable on directories, else just read.
         */
+        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
                if (capable(CAP_DAC_READ_SEARCH))
                        return 0;
@@ -414,36 +413,55 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 }
 /*
- * Internal lookup() using the new generic dcache.
+ * force_reval_path - force revalidation of a dentry
- * SMP-safe
+ *
+ * In some situations the path walking code will trust dentries without
+ * revalidating them. This causes problems for filesystems that depend on
+ * d_revalidate to handle file opens (e.g. NFSv4). When FS_REVAL_DOT is set
+ * (which indicates that it's possible for the dentry to go stale), force
+ * a d_revalidate call before proceeding.
+ *
+ * Returns 0 if the revalidation was successful. If the revalidation fails,
+ * either return the error returned by d_revalidate or -ESTALE if the
+ * revalidation just returned 0. If d_revalidate returns 0, we attempt to
+ * invalidate the dentry. It's up to the caller to handle putting references
+ * to the path if necessary.
 */
-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
+static int
+force_reval_path(struct path *path, struct nameidata *nd)
 {
-        struct dentry * dentry = __d_lookup(parent, name);
+        int status;
+        struct dentry *dentry = path->dentry;
-        /* lockess __d_lookup may fail due to concurrent d_move() 
+        /*
-         * in some unrelated directory, so try with d_lookup
+         * only check on filesystems where it's possible for the dentry to
+         * become stale. It's assumed that if this flag is set then the
+         * d_revalidate op will also be defined.
         */
-        if (!dentry)
+        if (!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT))
-                dentry = d_lookup(parent, name);
+                return 0;
-        if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
+        status = dentry->d_op->d_revalidate(dentry, nd);
-                dentry = do_revalidate(dentry, nd);
+        if (status > 0)
+                return 0;
-        return dentry;
+        if (!status) {
+                d_invalidate(dentry);
+                status = -ESTALE;
+        }
+        return status;
 }
 /*
- * Short-cut version of permission(), for calling by
+ * Short-cut version of permission(), for calling on directories
- * path_walk(), when dcache lock is held.  Combines parts
+ * during pathname resolution.  Combines parts of permission()
- * of permission() and generic_permission(), and tests ONLY for
+ * and generic_permission(), and tests ONLY for MAY_EXEC permission.
- * MAY_EXEC permission.
 *
 * If appropriate, check DAC only.  If not appropriate, or
- * short-cut DAC fails, then call permission() to do more
+ * short-cut DAC fails, then call ->permission() to do more
 * complete permission check.
 */
-static int exec_permission_lite(struct inode *inode)
+static int exec_permission(struct inode *inode)
 {
        int ret;
@@ -465,99 +483,6 @@ ok:
        return security_inode_permission(inode, MAY_EXEC);
 }
-/*
- * This is called when everything else fails, and we actually have
- * to go to the low-level filesystem to find out what we should do..
- *
- * We get the directory semaphore, and after getting that we also
- * make sure that nobody added the entry to the dcache in the meantime..
- * SMP-safe
- */
-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, struct nameidata *nd)
-{
-        struct dentry * result;
-        struct inode *dir = parent->d_inode;
-        mutex_lock(&dir->i_mutex);
-        /*
-         * First re-do the cached lookup just in case it was created
-         * while we waited for the directory semaphore..
-         *
-         * FIXME! This could use version numbering or similar to
-         * avoid unnecessary cache lookups.
-         *
-         * The "dcache_lock" is purely to protect the RCU list walker
-         * from concurrent renames at this point (we mustn't get false
-         * negatives from the RCU list walk here, unlike the optimistic
-         * fast walk).
-         *
-         * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
-         */
-        result = d_lookup(parent, name);
-        if (!result) {
-                struct dentry *dentry;
-                /* Don't create child dentry for a dead directory. */
-                result = ERR_PTR(-ENOENT);
-                if (IS_DEADDIR(dir))
-                        goto out_unlock;
-                dentry = d_alloc(parent, name);
-                result = ERR_PTR(-ENOMEM);
-                if (dentry) {
-                        result = dir->i_op->lookup(dir, dentry, nd);
-                        if (result)
-                                dput(dentry);
-                        else
-                                result = dentry;
-                }
-out_unlock:
-                mutex_unlock(&dir->i_mutex);
-                return result;
-        }
-        /*
-         * Uhhuh! Nasty case: the cache was re-populated while
-         * we waited on the semaphore. Need to revalidate.
-         */
-        mutex_unlock(&dir->i_mutex);
-        if (result->d_op && result->d_op->d_revalidate) {
-                result = do_revalidate(result, nd);
-                if (!result)
-                        result = ERR_PTR(-ENOENT);
-        }
-        return result;
-}
-/*
- * Wrapper to retry pathname resolution whenever the underlying
- * file system returns an ESTALE.
- *
- * Retry the whole path once, forcing real lookup requests
- * instead of relying on the dcache.
- */
-static __always_inline int link_path_walk(const char *name, struct nameidata *nd)
-{
-        struct path save = nd->path;
-        int result;
-        /* make sure the stuff we saved doesn't go away */
-        path_get(&save);
-        result = __link_path_walk(name, nd);
-        if (result == -ESTALE) {
-                /* nd->path had been dropped */
-                nd->path = save;
-                path_get(&nd->path);
-                nd->flags |= LOOKUP_REVAL;
-                result = __link_path_walk(name, nd);
-        }
-        path_put(&save);
-        return result;
-}
 static __always_inline void set_root(struct nameidata *nd)
 {
        if (!nd->root.mnt) {
@@ -569,6 +494,8 @@ static __always_inline void set_root(struct nameidata *nd)
        }
 }
+static int link_path_walk(const char *, struct nameidata *);
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
        int res = 0;
@@ -641,11 +568,14 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata
                error = 0;
                if (s)
                        error = __vfs_follow_link(nd, s);
+                else if (nd->last_type == LAST_BIND) {
+                        error = force_reval_path(&nd->path, nd);
+                        if (error)
+                                path_put(&nd->path);
+                }
                if (dentry->d_inode->i_op->put_link)
                        dentry->d_inode->i_op->put_link(dentry, nd, cookie);
        }
-        path_put(path);
        return error;
 }
@@ -672,6 +602,7 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
        current->total_link_count++;
        nd->depth++;
        err = __do_follow_link(path, nd);
+        path_put(path);
        current->link_count--;
        nd->depth--;
        return err;
@@ -797,8 +728,19 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
                     struct path *path)
 {
        struct vfsmount *mnt = nd->path.mnt;
-        struct dentry *dentry = __d_lookup(nd->path.dentry, name);
+        struct dentry *dentry, *parent;
+        struct inode *dir;
+        /*
+         * See if the low-level filesystem might want
+         * to use its own hash..
+         */
+        if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
+                int err = nd->path.dentry->d_op->d_hash(nd->path.dentry, name);
+                if (err < 0)
+                        return err;
+        }
+        dentry = __d_lookup(nd->path.dentry, name);
        if (!dentry)
                goto need_lookup;
        if (dentry->d_op && dentry->d_op->d_revalidate)
@@ -810,7 +752,59 @@ done:
        return 0;
 need_lookup:
-        dentry = real_lookup(nd->path.dentry, name, nd);
+        parent = nd->path.dentry;
+        dir = parent->d_inode;
+        mutex_lock(&dir->i_mutex);
+        /*
+         * First re-do the cached lookup just in case it was created
+         * while we waited for the directory semaphore..
+         *
+         * FIXME! This could use version numbering or similar to
+         * avoid unnecessary cache lookups.
+         *
+         * The "dcache_lock" is purely to protect the RCU list walker
+         * from concurrent renames at this point (we mustn't get false
+         * negatives from the RCU list walk here, unlike the optimistic
+         * fast walk).
+         *
+         * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
+         */
+        dentry = d_lookup(parent, name);
+        if (!dentry) {
+                struct dentry *new;
+                /* Don't create child dentry for a dead directory. */
+                dentry = ERR_PTR(-ENOENT);
+                if (IS_DEADDIR(dir))
+                        goto out_unlock;
+                new = d_alloc(parent, name);
+                dentry = ERR_PTR(-ENOMEM);
+                if (new) {
+                        dentry = dir->i_op->lookup(dir, new, nd);
+                        if (dentry)
+                                dput(new);
+                        else
+                                dentry = new;
+                }
+out_unlock:
+                mutex_unlock(&dir->i_mutex);
+                if (IS_ERR(dentry))
+                        goto fail;
+                goto done;
+        }
+        /*
+         * Uhhuh! Nasty case: the cache was re-populated while
+         * we waited on the semaphore. Need to revalidate.
+         */
+        mutex_unlock(&dir->i_mutex);
+        if (dentry->d_op && dentry->d_op->d_revalidate) {
+                dentry = do_revalidate(dentry, nd);
+                if (!dentry)
+                        dentry = ERR_PTR(-ENOENT);
+        }
        if (IS_ERR(dentry))
                goto fail;
        goto done;
@@ -835,7 +829,7 @@ fail:
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
-static int __link_path_walk(const char *name, struct nameidata *nd)
+static int link_path_walk(const char *name, struct nameidata *nd)
 {
        struct path next;
        struct inode *inode;
@@ -858,7 +852,7 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                unsigned int c;
                nd->flags |= LOOKUP_CONTINUE;
-                err = exec_permission_lite(inode);
+                err = exec_permission(inode);
                if (err)
                        break;
@@ -898,16 +892,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                        case 1:
                                continue;
                }
-                /*
-                 * See if the low-level filesystem might want
-                 * to use its own hash..
-                 */
-                if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
-                        err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
-                                                            &this);
-                        if (err < 0)
-                                break;
-                }
                /* This does the actual lookups.. */
                err = do_lookup(nd, &this, &next);
                if (err)
@@ -953,12 +937,6 @@ last_component:
                        case 1:
                                goto return_reval;
                }
-                if (nd->path.dentry->d_op && nd->path.dentry->d_op->d_hash) {
-                        err = nd->path.dentry->d_op->d_hash(nd->path.dentry,
-                                                            &this);
-                        if (err < 0)
-                                break;
-                }
                err = do_lookup(nd, &this, &next);
                if (err)
                        break;
@@ -1017,8 +995,27 @@ return_err:
 static int path_walk(const char *name, struct nameidata *nd)
 {
+        struct path save = nd->path;
+        int result;
        current->total_link_count = 0;
-        return link_path_walk(name, nd);
+        /* make sure the stuff we saved doesn't go away */
+        path_get(&save);
+        result = link_path_walk(name, nd);
+        if (result == -ESTALE) {
+                /* nd->path had been dropped */
+                current->total_link_count = 0;
+                nd->path = save;
+                path_get(&nd->path);
+                nd->flags |= LOOKUP_REVAL;
+                result = link_path_walk(name, nd);
+        }
+        path_put(&save);
+        return result;
 }
 static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
@@ -1141,36 +1138,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
        return retval;
 }
-/**
- * path_lookup_open - lookup a file path with open intent
- * @dfd: the directory to use as base, or AT_FDCWD
- * @name: pointer to file name
- * @lookup_flags: lookup intent flags
- * @nd: pointer to nameidata
- * @open_flags: open intent flags
- */
-static int path_lookup_open(int dfd, const char *name,
-                unsigned int lookup_flags, struct nameidata *nd, int open_flags)
-{
-        struct file *filp = get_empty_filp();
-        int err;
-        if (filp == NULL)
-                return -ENFILE;
-        nd->intent.open.file = filp;
-        nd->intent.open.flags = open_flags;
-        nd->intent.open.create_mode = 0;
-        err = do_path_lookup(dfd, name, lookup_flags|LOOKUP_OPEN, nd);
-        if (IS_ERR(nd->intent.open.file)) {
-                if (err == 0) {
-                        err = PTR_ERR(nd->intent.open.file);
-                        path_put(&nd->path);
-                }
-        } else if (err != 0)
-                release_open_intent(nd);
-        return err;
-}
 static struct dentry *__lookup_hash(struct qstr *name,
                struct dentry *base, struct nameidata *nd)
 {
@@ -1191,7 +1158,17 @@ static struct dentry *__lookup_hash(struct qstr *name,
                        goto out;
        }
-        dentry = cached_lookup(base, name, nd);
+        dentry = __d_lookup(base, name);
+        /* lockless __d_lookup may fail due to concurrent d_move()
+         * in some unrelated directory, so try with d_lookup
+         */
+        if (!dentry)
+                dentry = d_lookup(base, name);
+        if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
+                dentry = do_revalidate(dentry, nd);
        if (!dentry) {
                struct dentry *new;
@@ -1223,7 +1200,7 @@ static struct dentry *lookup_hash(struct nameidata *nd)
 {
        int err;
-        err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC);
+        err = exec_permission(nd->path.dentry->d_inode);
        if (err)
                return ERR_PTR(err);
        return __lookup_hash(&nd->last, nd->path.dentry, nd);
@@ -1273,7 +1250,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
        if (err)
                return ERR_PTR(err);
-        err = inode_permission(base->d_inode, MAY_EXEC);
+        err = exec_permission(base->d_inode);
        if (err)
                return ERR_PTR(err);
        return __lookup_hash(&this, base, NULL);
@@ -1511,69 +1488,45 @@ int may_open(struct path *path, int acc_mode, int flag)
        if (error)
                return error;
-        error = ima_path_check(path, acc_mode ?
-                               acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
-                               ACC_MODE(flag) & (MAY_READ | MAY_WRITE),
-                               IMA_COUNT_UPDATE);
-        if (error)
-                return error;
        /*
         * An append-only file must be opened in append mode for writing.
         */
        if (IS_APPEND(inode)) {
-                error = -EPERM;
                if  ((flag & FMODE_WRITE) && !(flag & O_APPEND))
-                        goto err_out;
+                        return -EPERM;
                if (flag & O_TRUNC)
-                        goto err_out;
+                        return -EPERM;
        }
        /* O_NOATIME can only be set by the owner or superuser */
-        if (flag & O_NOATIME)
+        if (flag & O_NOATIME && !is_owner_or_cap(inode))
-                if (!is_owner_or_cap(inode)) {
+                return -EPERM;
-                        error = -EPERM;
-                        goto err_out;
-                }
        /*
         * Ensure there are no outstanding leases on the file.
         */
-        error = break_lease(inode, flag);
+        return break_lease(inode, flag);
-        if (error)
+}
-                goto err_out;
-        if (flag & O_TRUNC) {
-                error = get_write_access(inode);
-                if (error)
-                        goto err_out;
-                /*
-                 * Refuse to truncate files with mandatory locks held on them.
-                 */
-                error = locks_verify_locked(inode);
-                if (!error)
-                        error = security_path_truncate(path, 0,
-                                               ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
-                if (!error) {
-                        vfs_dq_init(inode);
-                        error = do_truncate(dentry, 0,
-                                            ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
-                                            NULL);
-                }
-                put_write_access(inode);
-                if (error)
-                        goto err_out;
-        } else
-                if (flag & FMODE_WRITE)
-                        vfs_dq_init(inode);
-        return 0;
+static int handle_truncate(struct path *path)
-err_out:
+{
-        ima_counts_put(path, acc_mode ?
+        struct inode *inode = path->dentry->d_inode;
-                       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) :
+        int error = get_write_access(inode);
-                       ACC_MODE(flag) & (MAY_READ | MAY_WRITE));
+        if (error)
+                return error;
+        /*
+         * Refuse to truncate files with mandatory locks held on them.
+         */
+        error = locks_verify_locked(inode);
+        if (!error)
+                error = security_path_truncate(path, 0,
+                                       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
+        if (!error) {
+                error = do_truncate(path->dentry, 0,
+                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
+                                    NULL);
+        }
+        put_write_access(inode);
        return error;
 }
@@ -1628,7 +1581,7 @@ static inline int open_to_namei_flags(int flag)
        return flag;
 }
-static int open_will_write_to_fs(int flag, struct inode *inode)
+static int open_will_truncate(int flag, struct inode *inode)
 {
        /*
         * We'll never write to the fs underlying
@@ -1650,10 +1603,10 @@ struct file *do_filp_open(int dfd, const char *pathname,
        struct file *filp;
        struct nameidata nd;
        int error;
-        struct path path;
+        struct path path, save;
        struct dentry *dir;
        int count = 0;
-        int will_write;
+        int will_truncate;
        int flag = open_to_namei_flags(open_flag);
        /*
@@ -1681,8 +1634,23 @@ struct file *do_filp_open(int dfd, const char *pathname,
         * The simplest case - just a plain lookup.
         */
        if (!(flag & O_CREAT)) {
-                error = path_lookup_open(dfd, pathname, lookup_flags(flag),
+                filp = get_empty_filp();
-                                         &nd, flag);
+                if (filp == NULL)
+                        return ERR_PTR(-ENFILE);
+                nd.intent.open.file = filp;
+                filp->f_flags = open_flag;
+                nd.intent.open.flags = flag;
+                nd.intent.open.create_mode = 0;
+                error = do_path_lookup(dfd, pathname,
+                                        lookup_flags(flag)|LOOKUP_OPEN, &nd);
+                if (IS_ERR(nd.intent.open.file)) {
+                        if (error == 0) {
+                                error = PTR_ERR(nd.intent.open.file);
+                                path_put(&nd.path);
+                        }
+                } else if (error)
+                        release_open_intent(&nd);
                if (error)
                        return ERR_PTR(error);
                goto ok;
@@ -1717,6 +1685,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
        if (filp == NULL)
                goto exit_parent;
        nd.intent.open.file = filp;
+        filp->f_flags = open_flag;
        nd.intent.open.flags = flag;
        nd.intent.open.create_mode = mode;
        dir = nd.path.dentry;
@@ -1757,14 +1726,18 @@ do_last:
                        mnt_drop_write(nd.path.mnt);
                        goto exit;
                }
-                filp = nameidata_to_filp(&nd, open_flag);
+                filp = nameidata_to_filp(&nd);
-                if (IS_ERR(filp))
-                        ima_counts_put(&nd.path,
-                                       acc_mode & (MAY_READ | MAY_WRITE |
-                                                   MAY_EXEC));
                mnt_drop_write(nd.path.mnt);
                if (nd.root.mnt)
                        path_put(&nd.root);
+                if (!IS_ERR(filp)) {
+                        error = ima_path_check(&filp->f_path, filp->f_mode &
+                                       (MAY_READ | MAY_WRITE | MAY_EXEC));
+                        if (error) {
+                                fput(filp);
+                                filp = ERR_PTR(error);
+                        }
+                }
                return filp;
        }
@@ -1792,7 +1765,7 @@ do_last:
        path_to_nameidata(&path, &nd);
        error = -EISDIR;
-        if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
+        if (S_ISDIR(path.dentry->d_inode->i_mode))
                goto exit;
 ok:
        /*
@@ -1805,28 +1778,45 @@ ok:
         * be avoided. Taking this mnt write here
         * ensures that (2) can not occur.
         */
-        will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
+        will_truncate = open_will_truncate(flag, nd.path.dentry->d_inode);
-        if (will_write) {
+        if (will_truncate) {
                error = mnt_want_write(nd.path.mnt);
                if (error)
                        goto exit;
        }
        error = may_open(&nd.path, acc_mode, flag);
        if (error) {
-                if (will_write)
+                if (will_truncate)
                        mnt_drop_write(nd.path.mnt);
                goto exit;
        }
-        filp = nameidata_to_filp(&nd, open_flag);
+        filp = nameidata_to_filp(&nd);
-        if (IS_ERR(filp))
+        if (!IS_ERR(filp)) {
-                ima_counts_put(&nd.path,
+                error = ima_path_check(&filp->f_path, filp->f_mode &
-                               acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+                               (MAY_READ | MAY_WRITE | MAY_EXEC));
+                if (error) {
+                        fput(filp);
+                        filp = ERR_PTR(error);
+                }
+        }
+        if (!IS_ERR(filp)) {
+                if (acc_mode & MAY_WRITE)
+                        vfs_dq_init(nd.path.dentry->d_inode);
+                if (will_truncate) {
+                        error = handle_truncate(&nd.path);
+                        if (error) {
+                                fput(filp);
+                                filp = ERR_PTR(error);
+                        }
+                }
+        }
        /*
         * It is now safe to drop the mnt write
         * because the filp has had a write taken
         * on its behalf.
         */
-        if (will_write)
+        if (will_truncate)
                mnt_drop_write(nd.path.mnt);
        if (nd.root.mnt)
                path_put(&nd.root);
@@ -1863,7 +1853,18 @@ do_link:
        error = security_inode_follow_link(path.dentry, &nd);
        if (error)
                goto exit_dput;
+        save = nd.path;
+        path_get(&save);
        error = __do_follow_link(&path, &nd);
+        if (error == -ESTALE) {
+                /* nd.path had been dropped */
+                nd.path = save;
+                path_get(&nd.path);
+                nd.flags |= LOOKUP_REVAL;
+                error = __do_follow_link(&path, &nd);
+        }
+        path_put(&save);
+        path_put(&path);
        if (error) {
                /* Does someone understand code flow here? Or it is only
                 * me so stupid? Anathema to whoever designed this non-sense
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 2a77bc25d5af..59e5673b4597 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -90,7 +90,7 @@ config ROOT_NFS
          If you want your system to mount its root file system via NFS,
          choose Y here.  This is common practice for managing systems
          without local permanent storage.  For details, read
-          <file:Documentation/filesystems/nfsroot.txt>.
+          <file:Documentation/filesystems/nfs/nfsroot.txt>.
          Most people say N here.
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7e57b04e4014..865265bdca03 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -108,6 +108,10 @@ enum {
        NFS_OWNER_RECLAIM_NOGRACE
 };
+#define NFS_LOCK_NEW            0
+#define NFS_LOCK_RECLAIM        1
+#define NFS_LOCK_EXPIRED        2
 /*
 * struct nfs4_state maintains the client-side state for a given
 * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -282,6 +286,7 @@ extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter);
 extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task);
 extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
+extern void nfs_release_seqid(struct nfs_seqid *seqid);
 extern void nfs_free_seqid(struct nfs_seqid *seqid);
 extern const nfs4_stateid zero_stateid;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 9f5f11ecfd93..198d51d17c13 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -64,6 +64,7 @@
 struct nfs4_opendata;
 static int _nfs4_proc_open(struct nfs4_opendata *data);
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
 static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
 static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
 static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
@@ -341,6 +342,27 @@ nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
                free_slotid, tbl->highest_used_slotid);
 }
+/*
+ * Signal state manager thread if session is drained
+ */
+static void nfs41_check_drain_session_complete(struct nfs4_session *ses)
+{
+        struct rpc_task *task;
+        if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) {
+                task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq);
+                if (task)
+                        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+                return;
+        }
+        if (ses->fc_slot_table.highest_used_slotid != -1)
+                return;
+        dprintk("%s COMPLETE: Session Drained\n", __func__);
+        complete(&ses->complete);
+}
 static void nfs41_sequence_free_slot(const struct nfs_client *clp,
                              struct nfs4_sequence_res *res)
 {
@@ -356,15 +378,7 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp,
        spin_lock(&tbl->slot_tbl_lock);
        nfs4_free_slot(tbl, res->sr_slotid);
+        nfs41_check_drain_session_complete(clp->cl_session);
-        /* Signal state manager thread if session is drained */
-        if (test_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) {
-                if (tbl->highest_used_slotid == -1) {
-                        dprintk("%s COMPLETE: Session Drained\n", __func__);
-                        complete(&clp->cl_session->complete);
-                }
-        } else
-                rpc_wake_up_next(&tbl->slot_tbl_waitq);
        spin_unlock(&tbl->slot_tbl_lock);
        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
 }
@@ -421,7 +435,7 @@ out:
 * Note: must be called with under the slot_tbl_lock.
 */
 static u8
-nfs4_find_slot(struct nfs4_slot_table *tbl, struct rpc_task *task)
+nfs4_find_slot(struct nfs4_slot_table *tbl)
 {
        int slotid;
        u8 ret_id = NFS4_MAX_SLOT_TABLE;
@@ -463,7 +477,8 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        tbl = &session->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
-        if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state)) {
+        if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) &&
+            !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
                /*
                 * The state manager will wait until the slot table is empty.
                 * Schedule the reset thread
@@ -474,7 +489,15 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
                return -EAGAIN;
        }
-        slotid = nfs4_find_slot(tbl, task);
+        if (!rpc_queue_empty(&tbl->slot_tbl_waitq) &&
+            !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
+                rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
+                spin_unlock(&tbl->slot_tbl_lock);
+                dprintk("%s enforce FIFO order\n", __func__);
+                return -EAGAIN;
+        }
+        slotid = nfs4_find_slot(tbl);
        if (slotid == NFS4_MAX_SLOT_TABLE) {
                rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
                spin_unlock(&tbl->slot_tbl_lock);
@@ -483,6 +506,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
        }
        spin_unlock(&tbl->slot_tbl_lock);
+        rpc_task_set_priority(task, RPC_PRIORITY_NORMAL);
        slot = tbl->slots + slotid;
        args->sa_session = session;
        args->sa_slotid = slotid;
@@ -545,6 +569,12 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata)
        rpc_call_start(task);
 }
+static void nfs41_call_priv_sync_prepare(struct rpc_task *task, void *calldata)
+{
+        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+        nfs41_call_sync_prepare(task, calldata);
+}
 static void nfs41_call_sync_done(struct rpc_task *task, void *calldata)
 {
        struct nfs41_call_sync_data *data = calldata;
@@ -557,12 +587,18 @@ struct rpc_call_ops nfs41_call_sync_ops = {
        .rpc_call_done = nfs41_call_sync_done,
 };
+struct rpc_call_ops nfs41_call_priv_sync_ops = {
+        .rpc_call_prepare = nfs41_call_priv_sync_prepare,
+        .rpc_call_done = nfs41_call_sync_done,
+};
 static int nfs4_call_sync_sequence(struct nfs_client *clp,
                                   struct rpc_clnt *clnt,
                                   struct rpc_message *msg,
                                   struct nfs4_sequence_args *args,
                                   struct nfs4_sequence_res *res,
-                                   int cache_reply)
+                                   int cache_reply,
+                                   int privileged)
 {
        int ret;
        struct rpc_task *task;
@@ -580,6 +616,8 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp,
        };
        res->sr_slotid = NFS4_MAX_SLOT_TABLE;
+        if (privileged)
+                task_setup.callback_ops = &nfs41_call_priv_sync_ops;
        task = rpc_run_task(&task_setup);
        if (IS_ERR(task))
                ret = PTR_ERR(task);
@@ -597,7 +635,7 @@ int _nfs4_call_sync_session(struct nfs_server *server,
                            int cache_reply)
 {
        return nfs4_call_sync_sequence(server->nfs_client, server->client,
-                                       msg, args, res, cache_reply);
+                                       msg, args, res, cache_reply, 0);
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -1035,7 +1073,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
        memset(&opendata->o_res, 0, sizeof(opendata->o_res));
        memset(&opendata->c_res, 0, sizeof(opendata->c_res));
        nfs4_init_opendata_res(opendata);
-        ret = _nfs4_proc_open(opendata);
+        ret = _nfs4_recover_proc_open(opendata);
        if (ret != 0)
                return ret; 
        newstate = nfs4_opendata_to_nfs4_state(opendata);
@@ -1326,6 +1364,12 @@ out_no_action:
 }
+static void nfs4_recover_open_prepare(struct rpc_task *task, void *calldata)
+{
+        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+        nfs4_open_prepare(task, calldata);
+}
 static void nfs4_open_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_opendata *data = calldata;
@@ -1384,10 +1428,13 @@ static const struct rpc_call_ops nfs4_open_ops = {
        .rpc_release = nfs4_open_release,
 };
-/*
+static const struct rpc_call_ops nfs4_recover_open_ops = {
- * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
+        .rpc_call_prepare = nfs4_recover_open_prepare,
- */
+        .rpc_call_done = nfs4_open_done,
-static int _nfs4_proc_open(struct nfs4_opendata *data)
+        .rpc_release = nfs4_open_release,
+};
+static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover)
 {
        struct inode *dir = data->dir->d_inode;
        struct nfs_server *server = NFS_SERVER(dir);
@@ -1414,21 +1461,57 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
        data->rpc_done = 0;
        data->rpc_status = 0;
        data->cancelled = 0;
+        if (isrecover)
+                task_setup_data.callback_ops = &nfs4_recover_open_ops;
        task = rpc_run_task(&task_setup_data);
-        if (IS_ERR(task))
+        if (IS_ERR(task))
-                return PTR_ERR(task);
+                return PTR_ERR(task);
-        status = nfs4_wait_for_completion_rpc_task(task);
+        status = nfs4_wait_for_completion_rpc_task(task);
-        if (status != 0) {
+        if (status != 0) {
-                data->cancelled = 1;
+                data->cancelled = 1;
-                smp_wmb();
+                smp_wmb();
-        } else
+        } else
-                status = data->rpc_status;
+                status = data->rpc_status;
-        rpc_put_task(task);
+        rpc_put_task(task);
+        return status;
+}
+static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
+{
+        struct inode *dir = data->dir->d_inode;
+        struct nfs_openres *o_res = &data->o_res;
+        int status;
+        status = nfs4_run_open_task(data, 1);
        if (status != 0 || !data->rpc_done)
                return status;
-        if (o_res->fh.size == 0)
+        nfs_refresh_inode(dir, o_res->dir_attr);
-                _nfs4_proc_lookup(dir, o_arg->name, &o_res->fh, o_res->f_attr);
+        if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
+                status = _nfs4_proc_open_confirm(data);
+                if (status != 0)
+                        return status;
+        }
+        return status;
+}
+/*
+ * Note: On error, nfs4_proc_open will free the struct nfs4_opendata
+ */
+static int _nfs4_proc_open(struct nfs4_opendata *data)
+{
+        struct inode *dir = data->dir->d_inode;
+        struct nfs_server *server = NFS_SERVER(dir);
+        struct nfs_openargs *o_arg = &data->o_arg;
+        struct nfs_openres *o_res = &data->o_res;
+        int status;
+        status = nfs4_run_open_task(data, 0);
+        if (status != 0 || !data->rpc_done)
+                return status;
        if (o_arg->open_flags & O_CREAT) {
                update_changeattr(dir, &o_res->cinfo);
@@ -1752,11 +1835,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
                        if (calldata->arg.fmode == 0)
                                break;
                default:
-                        if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
+                        if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
-                                nfs_restart_rpc(task, server->nfs_client);
+                                rpc_restart_call_prepare(task);
-                                return;
-                        }
        }
+        nfs_release_seqid(calldata->arg.seqid);
        nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 }
@@ -1848,8 +1930,6 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        calldata->state = state;
        calldata->arg.fh = NFS_FH(state->inode);
        calldata->arg.stateid = &state->open_stateid;
-        if (nfs4_has_session(server->nfs_client))
-                memset(calldata->arg.stateid->data, 0, 4);    /* clear seqid */
        /* Serialization for the sequence id */
        calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
        if (calldata->arg.seqid == NULL)
@@ -3941,6 +4021,12 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
        dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
 }
+static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
+{
+        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
+        nfs4_lock_prepare(task, calldata);
+}
 static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 {
        struct nfs4_lockdata *data = calldata;
@@ -3996,7 +4082,13 @@ static const struct rpc_call_ops nfs4_lock_ops = {
        .rpc_release = nfs4_lock_release,
 };
-static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int reclaim)
+static const struct rpc_call_ops nfs4_recover_lock_ops = {
+        .rpc_call_prepare = nfs4_recover_lock_prepare,
+        .rpc_call_done = nfs4_lock_done,
+        .rpc_release = nfs4_lock_release,
+};
+static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type)
 {
        struct nfs4_lockdata *data;
        struct rpc_task *task;
@@ -4020,8 +4112,11 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
                return -ENOMEM;
        if (IS_SETLKW(cmd))
                data->arg.block = 1;
-        if (reclaim != 0)
+        if (recovery_type > NFS_LOCK_NEW) {
-                data->arg.reclaim = 1;
+                if (recovery_type == NFS_LOCK_RECLAIM)
+                        data->arg.reclaim = NFS_LOCK_RECLAIM;
+                task_setup_data.callback_ops = &nfs4_recover_lock_ops;
+        }
        msg.rpc_argp = &data->arg,
        msg.rpc_resp = &data->res,
        task_setup_data.callback_data = data;
@@ -4048,7 +4143,7 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
                /* Cache the lock if possible... */
                if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
                        return 0;
-                err = _nfs4_do_setlk(state, F_SETLK, request, 1);
+                err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
                if (err != -NFS4ERR_DELAY)
                        break;
                nfs4_handle_exception(server, err, &exception);
@@ -4068,7 +4163,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
        do {
                if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
                        return 0;
-                err = _nfs4_do_setlk(state, F_SETLK, request, 0);
+                err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
                switch (err) {
                default:
                        goto out;
@@ -4104,7 +4199,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
                status = do_vfs_lock(request->fl_file, request);
                goto out_unlock;
        }
-        status = _nfs4_do_setlk(state, cmd, request, 0);
+        status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
        if (status != 0)
                goto out_unlock;
        /* Note: we always want to sleep here! */
@@ -4187,7 +4282,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
        if (err != 0)
                goto out;
        do {
-                err = _nfs4_do_setlk(state, F_SETLK, fl, 0);
+                err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW);
                switch (err) {
                        default:
                                printk(KERN_ERR "%s: unhandled error %d.\n",
@@ -4395,11 +4490,12 @@ static void nfs4_get_lease_time_prepare(struct rpc_task *task,
                        (struct nfs4_get_lease_time_data *)calldata;
        dprintk("--> %s\n", __func__);
+        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
        /* just setup sequence, do not trigger session recovery
           since we're invoked within one */
        ret = nfs41_setup_sequence(data->clp->cl_session,
-                                        &data->args->la_seq_args,
+                                   &data->args->la_seq_args,
-                                        &data->res->lr_seq_res, 0, task);
+                                   &data->res->lr_seq_res, 0, task);
        BUG_ON(ret == -EAGAIN);
        rpc_call_start(task);
@@ -4619,7 +4715,7 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
        tbl = &session->fc_slot_table;
        tbl->highest_used_slotid = -1;
        spin_lock_init(&tbl->slot_tbl_lock);
-        rpc_init_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
+        rpc_init_priority_wait_queue(&tbl->slot_tbl_waitq, "ForeChannel Slot table");
        tbl = &session->bc_slot_table;
        tbl->highest_used_slotid = -1;
@@ -4838,14 +4934,22 @@ int nfs4_init_session(struct nfs_server *server)
 {
        struct nfs_client *clp = server->nfs_client;
        struct nfs4_session *session;
+        unsigned int rsize, wsize;
        int ret;
        if (!nfs4_has_session(clp))
                return 0;
+        rsize = server->rsize;
+        if (rsize == 0)
+                rsize = NFS_MAX_FILE_IO_SIZE;
+        wsize = server->wsize;
+        if (wsize == 0)
+                wsize = NFS_MAX_FILE_IO_SIZE;
        session = clp->cl_session;
-        session->fc_attrs.max_rqst_sz = server->wsize + nfs41_maxwrite_overhead;
+        session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead;
-        session->fc_attrs.max_resp_sz = server->rsize + nfs41_maxread_overhead;
+        session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead;
        ret = nfs4_recover_expired_lease(server);
        if (!ret)
@@ -4871,7 +4975,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
        args.sa_cache_this = 0;
        return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args,
-                                       &res, 0);
+                                       &res, args.sa_cache_this, 1);
 }
 void nfs41_sequence_call_done(struct rpc_task *task, void *data)
@@ -4953,6 +5057,7 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data)
 {
        struct nfs4_reclaim_complete_data *calldata = data;
+        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED);
        if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args,
                                &calldata->res.seq_res, 0, task))
                return;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e76427e6346f..6d263ed79e92 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -135,16 +135,30 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
        return status;
 }
-static void nfs41_end_drain_session(struct nfs_client *clp,
+static void nfs4_end_drain_session(struct nfs_client *clp) /* end a session drain and wake queued fore-channel waiters */
-                struct nfs4_session *ses)
 {
-        if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state))
+        struct nfs4_session *ses = clp->cl_session; /* session is now taken from the client instead of a parameter */
-                rpc_wake_up(&ses->fc_slot_table.slot_tbl_waitq);
+        int max_slots; /* upper bound on the number of tasks woken below */
+        if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { /* act only if the draining bit was set */
+                spin_lock(&ses->fc_slot_table.slot_tbl_lock);
+                max_slots = ses->fc_slot_table.max_slots;
+                while (max_slots--) { /* wake at most max_slots tasks, one per iteration */
+                        struct rpc_task *task;
+                        task = rpc_wake_up_next(&ses->fc_slot_table.
+                                                slot_tbl_waitq);
+                        if (!task) /* nothing was woken; presumably the wait queue is empty */
+                                break;
+                        rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); /* every woken task is marked privileged */
+                }
+                spin_unlock(&ses->fc_slot_table.slot_tbl_lock);
+        }
 }
-static int nfs41_begin_drain_session(struct nfs_client *clp,
+static int nfs4_begin_drain_session(struct nfs_client *clp)
-                struct nfs4_session *ses)
 {
+        struct nfs4_session *ses = clp->cl_session;
        struct nfs4_slot_table *tbl = &ses->fc_slot_table;
        spin_lock(&tbl->slot_tbl_lock);
@@ -162,16 +176,13 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred)
 {
        int status;
-        status = nfs41_begin_drain_session(clp, clp->cl_session);
+        nfs4_begin_drain_session(clp);
-        if (status != 0)
-                goto out;
        status = nfs4_proc_exchange_id(clp, cred);
        if (status != 0)
                goto out;
        status = nfs4_proc_create_session(clp);
        if (status != 0)
                goto out;
-        nfs41_end_drain_session(clp, clp->cl_session);
        nfs41_setup_state_renewal(clp);
        nfs_mark_client_ready(clp, NFS_CS_READY);
 out:
@@ -755,16 +766,21 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter)
        return new;
 }
-void nfs_free_seqid(struct nfs_seqid *seqid)
+void nfs_release_seqid(struct nfs_seqid *seqid) /* unlink seqid from its sequence list and wake waiters; does not free it */
 {
         if (!list_empty(&seqid->list)) {
                 struct rpc_sequence *sequence = seqid->sequence->sequence;
                 spin_lock(&sequence->lock);
-                list_del(&seqid->list);
+                list_del_init(&seqid->list); /* re-initialise the node so the list_empty() test above skips a repeat call */
                 spin_unlock(&sequence->lock);
                 rpc_wake_up(&sequence->wait);
         }
+}
+void nfs_free_seqid(struct nfs_seqid *seqid) /* release the seqid (if still queued) and then free it */
+{
+        nfs_release_seqid(seqid); /* does nothing when seqid->list is already empty */
         kfree(seqid);
 }
@@ -1257,13 +1273,9 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 static int nfs4_reset_session(struct nfs_client *clp)
 {
-        struct nfs4_session *ses = clp->cl_session;
        int status;
-        status = nfs41_begin_drain_session(clp, ses);
+        nfs4_begin_drain_session(clp);
-        if (status != 0)
-                return status;
        status = nfs4_proc_destroy_session(clp->cl_session);
        if (status && status != -NFS4ERR_BADSESSION &&
            status != -NFS4ERR_DEADSESSION) {
@@ -1279,19 +1291,17 @@ static int nfs4_reset_session(struct nfs_client *clp)
 out:
        /*
         * Let the state manager reestablish state
-         * without waking other tasks yet.
         */
-        if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+        if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
-                /* Wake up the next rpc task */
+            status == 0)
-                nfs41_end_drain_session(clp, ses);
+                nfs41_setup_state_renewal(clp);
-                if (status == 0)
-                        nfs41_setup_state_renewal(clp);
-        }
        return status;
 }
 #else /* CONFIG_NFS_V4_1 */
 static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
+/* No session exists without CONFIG_NFS_V4_1, so there is nothing to wake; void to match the NFSv4.1 definition. */
+static void nfs4_end_drain_session(struct nfs_client *clp) { }
 #endif /* CONFIG_NFS_V4_1 */
 /* Set NFS4CLNT_LEASE_EXPIRED for all v4.0 errors and for recoverable errors
@@ -1382,6 +1392,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
                                goto out_error;
                }
+                nfs4_end_drain_session(clp);
                if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
                        nfs_client_return_marked_delegations(clp);
                        continue;
@@ -1398,6 +1409,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
 out_error:
        printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
                        " with error %d\n", clp->cl_hostname, -status);
+        nfs4_end_drain_session(clp);
        nfs4_clear_state_manager_bit(clp);
 }
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index 8f9a20556f79..d3854d94b7cf 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -7,8 +7,6 @@
 #include <linux/types.h>
 #include <linux/file.h>
 #include <linux/fs.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/cred.h>
 #include <linux/sched.h>
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 36fcabbf5186..79717a40daba 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -1,15 +1,7 @@
-/*
+/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
- * linux/fs/nfsd/auth.c
- *
- * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
- */
-#include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/sunrpc/svc.h>
+#include "nfsd.h"
-#include <linux/sunrpc/svcauth.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/export.h>
 #include "auth.h"
 int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
new file mode 100644
index 000000000000..d892be61016c
--- /dev/null
+++ b/fs/nfsd/cache.h
@@ -0,0 +1,83 @@
+/*
+ * Request reply cache. This was heavily inspired by the
+ * implementation in 4.3BSD/4.4BSD.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef NFSCACHE_H
+#define NFSCACHE_H
+#include <linux/sunrpc/svc.h>
+/*
+ * Representation of a reply cache entry.
+ *
+ * c_state holds one of the RC_UNUSED/RC_INPROG/RC_DONE values and
+ * c_type one of the RC_NOCACHE/RC_REPLSTAT/RC_REPLBUFF values defined
+ * further down in this header.  c_secure is a one-bit bit-field of
+ * unsigned char type, which ISO C leaves implementation-defined.
+ *
+ * The c_u union carries either a kvec (reached through the c_replvec
+ * macro) or a bare __be32 status (c_replstat); presumably c_type says
+ * which member is valid -- TODO confirm against nfscache.c.
+ *
+ * c_addr, c_xid, c_prot, c_proc and c_vers presumably identify the
+ * cached request, and c_timestamp is presumably in jiffies (compare
+ * RC_DELAY) -- both to be confirmed against nfsd_cache_lookup().
+ */
+struct svc_cacherep {
+        struct hlist_node       c_hash;
+        struct list_head        c_lru;
+        unsigned char           c_state,        /* unused, inprog, done */
+                                c_type,         /* status, buffer */
+                                c_secure : 1;   /* req came from port < 1024 */
+        struct sockaddr_in      c_addr;
+        __be32                  c_xid;
+        u32                     c_prot;
+        u32                     c_proc;
+        u32                     c_vers;
+        unsigned long           c_timestamp;
+        union {
+                struct kvec     u_vec;
+                __be32          u_status;
+        }                       c_u;
+};
+#define c_replvec               c_u.u_vec       /* shorthand for the kvec member of c_u */
+#define c_replstat              c_u.u_status    /* shorthand for the status member of c_u */
+/* cache entry states (the values kept in svc_cacherep.c_state) */
+enum {
+        RC_UNUSED,
+        RC_INPROG,
+        RC_DONE
+};
+/* return values (presumably those of nfsd_cache_lookup() -- TODO confirm) */
+enum {
+        RC_DROPIT,
+        RC_REPLY,
+        RC_DOIT,
+        RC_INTR
+};
+/*
+ * Cache types (the values kept in svc_cacherep.c_type).
+ * We may want to add more types one day, e.g. for diropres and
+ * attrstat replies. Using cache entries with fixed length instead
+ * of buffer pointers may be more efficient.
+ */
+enum {
+        RC_NOCACHE,
+        RC_REPLSTAT,
+        RC_REPLBUFF,
+};
+/*
+ * If requests are retransmitted within this interval, they're dropped.
+ * The interval is HZ/5 timer ticks, i.e. one fifth of a second (the
+ * division is integer division).
+ */
+#define RC_DELAY                (HZ/5)
+int     nfsd_reply_cache_init(void);
+void    nfsd_reply_cache_shutdown(void);
+int     nfsd_cache_lookup(struct svc_rqst *, int);      /* int argument: presumably an RC_* cache type -- TODO confirm */
+void    nfsd_cache_update(struct svc_rqst *, int, __be32 *);    /* int argument: presumably an RC_* cache type -- TODO confirm */
+#ifdef CONFIG_NFSD_V4
+void    nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
+#else  /* CONFIG_NFSD_V4 */
+static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp) /* empty stub used when CONFIG_NFSD_V4 is not set */
+{
+}
+#endif /* CONFIG_NFSD_V4 */
+#endif /* NFSCACHE_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index c1c9e035d4a4..c487810a2366 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1,7 +1,5 @@
 #define MSNFS   /* HACK HACK */
 /*
- * linux/fs/nfsd/export.c
- *
 * NFS exporting and validation.
 *
 * We maintain a list of clients, each of which has a list of
@@ -14,29 +12,16 @@
 * Copyright (C) 1995, 1996 Olaf Kirch, <okir@monad.swb.de>
 */
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/stat.h>
-#include <linux/in.h>
-#include <linux/seq_file.h>
-#include <linux/syscalls.h>
-#include <linux/rwsem.h>
-#include <linux/dcache.h>
 #include <linux/namei.h>
-#include <linux/mount.h>
-#include <linux/hash.h>
 #include <linux/module.h>
 #include <linux/exportfs.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/nfsfh.h>
 #include <linux/nfsd/syscall.h>
-#include <linux/lockd/bind.h>
-#include <linux/sunrpc/msg_prot.h>
-#include <linux/sunrpc/gss_api.h>
 #include <net/ipv6.h>
+#include "nfsd.h"
+#include "nfsfh.h"
 #define NFSDDBG_FACILITY        NFSDDBG_EXPORT
 typedef struct auth_domain      svc_client;
@@ -369,16 +354,25 @@ static struct svc_export *svc_export_update(struct svc_export *new,
                                            struct svc_export *old);
 static struct svc_export *svc_export_lookup(struct svc_export *);
-static int check_export(struct inode *inode, int flags, unsigned char *uuid)
+static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
 {
-        /* We currently export only dirs and regular files.
+        /*
-         * This is what umountd does.
+         * We currently export only dirs, regular files, and (for v4
+         * pseudoroot) symlinks.
         */
        if (!S_ISDIR(inode->i_mode) &&
+            !S_ISLNK(inode->i_mode) &&
            !S_ISREG(inode->i_mode))
                return -ENOTDIR;
+        /*
+         * Mountd should never pass down a writeable V4ROOT export, but,
+         * just to make sure:
+         */
+        if (*flags & NFSEXP_V4ROOT)
+                *flags |= NFSEXP_READONLY;
        /* There are two requirements on a filesystem to be exportable.
         * 1:  We must be able to identify the filesystem from a number.
         *       either a device number (so FS_REQUIRES_DEV needed)
@@ -387,7 +381,7 @@ static int check_export(struct inode *inode, int flags, unsigned char *uuid)
         *       This means that s_export_op must be set.
         */
        if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) &&
-            !(flags & NFSEXP_FSID) &&
+            !(*flags & NFSEXP_FSID) &&
            uuid == NULL) {
                dprintk("exp_export: export of non-dev fs without fsid\n");
                return -EINVAL;
@@ -602,7 +596,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
                                goto out4;
                }
-                err = check_export(exp.ex_path.dentry->d_inode, exp.ex_flags,
+                err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags,
                                   exp.ex_uuid);
                if (err)
                        goto out4;
@@ -1041,7 +1035,7 @@ exp_export(struct nfsctl_export *nxp)
                goto finish;
        }
-        err = check_export(path.dentry->d_inode, nxp->ex_flags, NULL);
+        err = check_export(path.dentry->d_inode, &nxp->ex_flags, NULL);
        if (err) goto finish;
        err = -ENOMEM;
@@ -1320,6 +1314,23 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
        return exp;
 }
+static struct svc_export *find_fsidzero_export(struct svc_rqst *rqstp) /* look up the FSID_NUM export whose key is built from all-zero arguments */
+{
+        struct svc_export *exp;
+        u32 fsidv[2];
+        mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
+        exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
+        /*
+         * We shouldn't have accepted an nfsv4 request at all if we
+         * don't have a pseudoexport, so a missing export (-ENOENT) is
+         * reported as -ESERVERFAULT.  Any other result, whether an
+         * export or a different error pointer, is returned unchanged.
+         */
+        if (IS_ERR(exp) && PTR_ERR(exp) == -ENOENT)
+                exp = ERR_PTR(-ESERVERFAULT);
+        return exp;
+}
 /*
 * Called when we need the filehandle for the root of the pseudofs,
 * for a given NFSv4 client.   The root is defined to be the
@@ -1330,11 +1341,8 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp)
 {
        struct svc_export *exp;
        __be32 rv;
-        u32 fsidv[2];
-        mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL);
+        exp = find_fsidzero_export(rqstp);
-        exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
        if (IS_ERR(exp))
                return nfserrno(PTR_ERR(exp));
        rv = fh_compose(fhp, exp, exp->ex_path.dentry, NULL);
@@ -1425,6 +1433,7 @@ static struct flags {
        { NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
        { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
        { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}},
+        { NFSEXP_V4ROOT, {"v4root", ""}},
 #ifdef MSNFS
        { NFSEXP_MSNFS, {"msnfs", ""}},
 #endif
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index b2786a5f9afe..0c6d81670137 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -1,6 +1,4 @@
 /*
- * linux/fs/nfsd/lockd.c
- *
 * This file contains all the stubs needed when communicating with lockd.
 * This level of indirection is necessary so we can run nfsd+lockd without
 * requiring the nfs client to be compiled in/loaded, and vice versa.
@@ -8,14 +6,10 @@
 * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/types.h>
-#include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
 #include <linux/lockd/bind.h>
+#include "nfsd.h"
+#include "vfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_LOCKD
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 4e3219e84116..f20589d2ae27 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -1,19 +1,15 @@
 /*
- * linux/fs/nfsd/nfs2acl.c
- *
 * Process version 2 NFSACL requests.
 *
 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
 */
-#include <linux/sunrpc/svc.h>
+#include "nfsd.h"
-#include <linux/nfs.h>
+/* FIXME: nfsacl.h is a broken header */
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/cache.h>
-#include <linux/nfsd/xdr.h>
-#include <linux/nfsd/xdr3.h>
-#include <linux/posix_acl.h>
 #include <linux/nfsacl.h>
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 #define RETURN_STATUS(st)       { resp->status = (st); return (st); }
@@ -217,6 +213,16 @@ static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
 * XDR encode functions
 */
+/*
+ * There must be an encoding function for void results so svc_process
+ * will work properly.
+ *
+ * Nothing is encoded here: the function only returns the result of
+ * xdr_ressize_check() for the current position @p, and @dummy is
+ * not used.
+ *
+ * NOTE(review): unlike nfsaclsvc_encode_getaclres() just below, this
+ * function is not declared static -- confirm that external linkage is
+ * intended.
+ */
+int
+nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p, void *dummy)
+{
+        return xdr_ressize_check(rqstp, p);
+}
 /* GETACL */
 static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p,
                struct nfsd3_getaclres *resp)
@@ -308,7 +314,6 @@ static int nfsaclsvc_release_access(struct svc_rqst *rqstp, __be32 *p,
 }
 #define nfsaclsvc_decode_voidargs       NULL
-#define nfsaclsvc_encode_voidres        NULL
 #define nfsaclsvc_release_void          NULL
 #define nfsd3_fhandleargs       nfsd_fhandle
 #define nfsd3_attrstatres       nfsd_attrstat
@@ -346,5 +351,5 @@ struct svc_version	nfsd_acl_version2 = {
                .vs_proc        = nfsd_acl_procedures2,
                .vs_dispatch    = nfsd_dispatch,
                .vs_xdrsize     = NFS3_SVC_XDRSIZE,
-                .vs_hidden      = 1,
+                .vs_hidden      = 0,
 };
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9981dbb377a3..e0c4846bad92 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -1,18 +1,15 @@
 /*
- * linux/fs/nfsd/nfs3acl.c
- *
 * Process version 3 NFSACL requests.
 *
 * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
 */
-#include <linux/sunrpc/svc.h>
+#include "nfsd.h"
-#include <linux/nfs3.h>
+/* FIXME: nfsacl.h is a broken header */
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/cache.h>
-#include <linux/nfsd/xdr3.h>
-#include <linux/posix_acl.h>
 #include <linux/nfsacl.h>
+#include "cache.h"
+#include "xdr3.h"
+#include "vfs.h"
 #define RETURN_STATUS(st)       { resp->status = (st); return (st); }
@@ -264,6 +261,6 @@ struct svc_version	nfsd_acl_version3 = {
                .vs_proc        = nfsd_acl_procedures3,
                .vs_dispatch    = nfsd_dispatch,
                .vs_xdrsize     = NFS3_SVC_XDRSIZE,
-                .vs_hidden      = 1,
+                .vs_hidden      = 0,
 };
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index a713c418a922..3d68f45a37b9 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -1,30 +1,16 @@
 /*
- * linux/fs/nfsd/nfs3proc.c
- *
 * Process version 3 NFS requests.
 *
 * Copyright (C) 1996, 1997, 1998 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/linkage.h>
-#include <linux/time.h>
-#include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/ext2_fs.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/in.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/major.h>
 #include <linux/magic.h>
-#include <linux/sunrpc/svc.h>
+#include "cache.h"
-#include <linux/nfsd/nfsd.h>
+#include "xdr3.h"
-#include <linux/nfsd/cache.h>
+#include "vfs.h"
-#include <linux/nfsd/xdr3.h>
-#include <linux/nfs3.h>
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index d0a2ce1b4324..2a533a0af2a9 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -1,6 +1,4 @@
 /*
- * linux/fs/nfsd/nfs3xdr.c
- *
 * XDR support for nfsd/protocol version 3.
 *
 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
@@ -8,19 +6,8 @@
 * 2003-08-09 Jamie Lokier: Use htonl() for nanoseconds, not htons()!
 */
-#include <linux/types.h>
-#include <linux/time.h>
-#include <linux/nfs3.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/dcache.h>
 #include <linux/namei.h>
-#include <linux/mm.h>
+#include "xdr3.h"
-#include <linux/vfs.h>
-#include <linux/sunrpc/xdr.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/xdr3.h>
 #include "auth.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 725d02f210e2..88150685df34 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -1,6 +1,4 @@
 /*
- *  fs/nfs4acl/acl.c
- *
 *  Common NFSv4 ACL handling code.
 *
 *  Copyright (c) 2002, 2003 The Regents of the University of Michigan.
@@ -36,15 +34,7 @@
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/module.h>
 #include <linux/nfs_fs.h>
-#include <linux/posix_acl.h>
-#include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
@@ -389,7 +379,7 @@ sort_pacl(struct posix_acl *pacl)
        sort_pacl_range(pacl, 1, i-1);
        BUG_ON(pacl->a_entries[i].e_tag != ACL_GROUP_OBJ);
-        j = i++;
+        j = ++i;
        while (pacl->a_entries[j].e_tag == ACL_GROUP)
                j++;
        sort_pacl_range(pacl, i, j-1);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 24e8d78f8dde..c6eed2a3b093 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1,6 +1,4 @@
 /*
- *  linux/fs/nfsd/nfs4callback.c
- *
 *  Copyright (c) 2001 The Regents of the University of Michigan.
 *  All rights reserved.
 *
@@ -33,22 +31,9 @@
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/inet.h>
-#include <linux/errno.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/kthread.h>
-#include <linux/sunrpc/xdr.h>
-#include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/svcsock.h>
+#include "nfsd.h"
-#include <linux/nfsd/nfsd.h>
+#include "state.h"
-#include <linux/nfsd/state.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/nfs4.h>
-#include <linux/sunrpc/xprtsock.h>
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index ba2c199592fd..6e2983b27f3c 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -1,6 +1,4 @@
 /*
- *  fs/nfsd/nfs4idmap.c
- *
 *  Mapping of UID/GIDs to name and vice versa.
 *
 *  Copyright (c) 2002, 2003 The Regents of the University of
@@ -35,22 +33,9 @@
 */
 #include <linux/module.h>
-#include <linux/init.h>
-#include <linux/mm.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/nfs.h>
-#include <linux/nfs4.h>
-#include <linux/nfs_fs.h>
-#include <linux/nfs_page.h>
-#include <linux/sunrpc/cache.h>
 #include <linux/nfsd_idmap.h>
-#include <linux/list.h>
-#include <linux/time.h>
 #include <linux/seq_file.h>
-#include <linux/sunrpc/svcauth.h>
+#include <linux/sched.h>
 /*
 * Cache entry
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index bebc0c2e1b0a..37514c469846 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1,6 +1,4 @@
 /*
- *  fs/nfsd/nfs4proc.c
- *
 *  Server-side procedures for NFSv4.
 *
 *  Copyright (c) 2002 The Regents of the University of Michigan.
@@ -34,20 +32,11 @@
 *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-#include <linux/param.h>
-#include <linux/major.h>
-#include <linux/slab.h>
 #include <linux/file.h>
-#include <linux/sunrpc/svc.h>
+#include "cache.h"
-#include <linux/nfsd/nfsd.h>
+#include "xdr4.h"
-#include <linux/nfsd/cache.h>
+#include "vfs.h"
-#include <linux/nfs4.h>
-#include <linux/nfsd/state.h>
-#include <linux/nfsd/xdr4.h>
-#include <linux/nfs4_acl.h>
-#include <linux/sunrpc/gss_api.h>
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
@@ -170,7 +159,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
                accmode |= NFSD_MAY_READ;
        if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
                accmode |= (NFSD_MAY_WRITE | NFSD_MAY_TRUNC);
-        if (open->op_share_deny & NFS4_SHARE_DENY_WRITE)
+        if (open->op_share_deny & NFS4_SHARE_DENY_READ)
                accmode |= NFSD_MAY_WRITE;
        status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index b5348405046b..5a754f7b71ed 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -1,6 +1,4 @@
 /*
-*  linux/fs/nfsd/nfs4recover.c
-*
 *  Copyright (c) 2004 The Regents of the University of Michigan.
 *  All rights reserved.
 *
@@ -33,20 +31,14 @@
 *
 */
-#include <linux/err.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfs4.h>
-#include <linux/nfsd/state.h>
-#include <linux/nfsd/xdr4.h>
-#include <linux/param.h>
 #include <linux/file.h>
 #include <linux/namei.h>
-#include <asm/uaccess.h>
-#include <linux/scatterlist.h>
 #include <linux/crypto.h>
 #include <linux/sched.h>
-#include <linux/mount.h>
+#include "nfsd.h"
+#include "state.h"
+#include "vfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2153f9bdbebd..f19ed866c95f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1,6 +1,4 @@
 /*
-*  linux/fs/nfsd/nfs4state.c
-*
 *  Copyright (c) 2001 The Regents of the University of Michigan.
 *  All rights reserved.
 *
@@ -34,28 +32,14 @@
 *
 */
-#include <linux/param.h>
-#include <linux/major.h>
-#include <linux/slab.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/cache.h>
 #include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/workqueue.h>
 #include <linux/smp_lock.h>
-#include <linux/kthread.h>
-#include <linux/nfs4.h>
-#include <linux/nfsd/state.h>
-#include <linux/nfsd/xdr4.h>
 #include <linux/namei.h>
 #include <linux/swap.h>
-#include <linux/mutex.h>
-#include <linux/lockd/bind.h>
-#include <linux/module.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/clnt.h>
+#include "xdr4.h"
+#include "vfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
@@ -477,13 +461,14 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan)
 /*
 * fchan holds the client values on input, and the server values on output
+ * sv_max_mesg is the maximum payload plus one page for overhead.
 */
 static int init_forechannel_attrs(struct svc_rqst *rqstp,
                                  struct nfsd4_channel_attrs *session_fchan,
                                  struct nfsd4_channel_attrs *fchan)
 {
        int status = 0;
-        __u32   maxcount = svc_max_payload(rqstp);
+        __u32   maxcount = nfsd_serv->sv_max_mesg;
        /* headerpadsz set to zero in encode routine */
@@ -523,6 +508,15 @@ free_session_slots(struct nfsd4_session *ses)
                kfree(ses->se_slots[i]);
 }
+/*
+ * We don't actually need to cache the rpc and session headers, so we
+ * can allocate a little less for each slot:
+ *
+ * Returns ca->maxresp_cached minus NFSD_MIN_HDR_SEQ_SZ as an int.
+ *
+ * NOTE(review): the subtraction is not range-checked here; this
+ * assumes maxresp_cached is never smaller than NFSD_MIN_HDR_SEQ_SZ by
+ * the time it is called -- TODO confirm in the channel attribute
+ * negotiation.
+ */
+static inline int slot_bytes(struct nfsd4_channel_attrs *ca)
+{
+        return ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+}
 static int
 alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
                   struct nfsd4_create_session *cses)
@@ -554,7 +548,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
        memcpy(new, &tmp, sizeof(*new));
        /* allocate each struct nfsd4_slot and data cache in one piece */
-        cachesize = new->se_fchannel.maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+        cachesize = slot_bytes(&new->se_fchannel);
        for (i = 0; i < new->se_fchannel.maxreqs; i++) {
                sp = kzalloc(sizeof(*sp) + cachesize, GFP_KERNEL);
                if (!sp)
@@ -628,10 +622,12 @@ void
 free_session(struct kref *kref)
 {
        struct nfsd4_session *ses;
+        int mem;
        ses = container_of(kref, struct nfsd4_session, se_ref);
        spin_lock(&nfsd_drc_lock);
-        nfsd_drc_mem_used -= ses->se_fchannel.maxreqs * NFSD_SLOT_CACHE_SIZE;
+        mem = ses->se_fchannel.maxreqs * slot_bytes(&ses->se_fchannel);
+        nfsd_drc_mem_used -= mem;
        spin_unlock(&nfsd_drc_lock);
        free_session_slots(ses);
        kfree(ses);
@@ -2404,11 +2400,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
        memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
-        dprintk("NFSD: delegation stateid=(%08x/%08x/%08x/%08x)\n\n",
+        dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
-                     dp->dl_stateid.si_boot,
+                STATEID_VAL(&dp->dl_stateid));
-                     dp->dl_stateid.si_stateownerid,
-                     dp->dl_stateid.si_fileid,
-                     dp->dl_stateid.si_generation);
 out:
        if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
                        && flag == NFS4_OPEN_DELEGATE_NONE
@@ -2498,9 +2491,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
        status = nfs_ok;
-        dprintk("nfs4_process_open2: stateid=(%08x/%08x/%08x/%08x)\n",
+        dprintk("%s: stateid=" STATEID_FMT "\n", __func__,
-                    stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid,
+                STATEID_VAL(&stp->st_stateid));
-                    stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
 out:
        if (fp)
                put_nfs4_file(fp);
@@ -2666,9 +2658,8 @@ STALE_STATEID(stateid_t *stateid)
 {
        if (time_after((unsigned long)boot_time,
                        (unsigned long)stateid->si_boot)) {
-                dprintk("NFSD: stale stateid (%08x/%08x/%08x/%08x)!\n",
+                dprintk("NFSD: stale stateid " STATEID_FMT "!\n",
-                        stateid->si_boot, stateid->si_stateownerid,
+                        STATEID_VAL(stateid));
-                        stateid->si_fileid, stateid->si_generation);
                return 1;
        }
        return 0;
@@ -2680,9 +2671,8 @@ EXPIRED_STATEID(stateid_t *stateid)
        if (time_before((unsigned long)boot_time,
                        ((unsigned long)stateid->si_boot)) &&
            time_before((unsigned long)(stateid->si_boot + lease_time), get_seconds())) {
-                dprintk("NFSD: expired stateid (%08x/%08x/%08x/%08x)!\n",
+                dprintk("NFSD: expired stateid " STATEID_FMT "!\n",
-                        stateid->si_boot, stateid->si_stateownerid,
+                        STATEID_VAL(stateid));
-                        stateid->si_fileid, stateid->si_generation);
                return 1;
        }
        return 0;
@@ -2696,9 +2686,8 @@ stateid_error_map(stateid_t *stateid)
        if (EXPIRED_STATEID(stateid))
                return nfserr_expired;
-        dprintk("NFSD: bad stateid (%08x/%08x/%08x/%08x)!\n",
+        dprintk("NFSD: bad stateid " STATEID_FMT "!\n",
-                stateid->si_boot, stateid->si_stateownerid,
+                STATEID_VAL(stateid));
-                stateid->si_fileid, stateid->si_generation);
        return nfserr_bad_stateid;
 }
@@ -2884,10 +2873,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
        struct svc_fh *current_fh = &cstate->current_fh;
        __be32 status;
-        dprintk("NFSD: preprocess_seqid_op: seqid=%d " 
+        dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
-                        "stateid = (%08x/%08x/%08x/%08x)\n", seqid,
+                seqid, STATEID_VAL(stateid));
-                stateid->si_boot, stateid->si_stateownerid, stateid->si_fileid,
-                stateid->si_generation);
        *stpp = NULL;
        *sopp = NULL;
@@ -3019,12 +3006,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        sop->so_confirmed = 1;
        update_stateid(&stp->st_stateid);
        memcpy(&oc->oc_resp_stateid, &stp->st_stateid, sizeof(stateid_t));
-        dprintk("NFSD: nfsd4_open_confirm: success, seqid=%d " 
+        dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
-                "stateid=(%08x/%08x/%08x/%08x)\n", oc->oc_seqid,
+                __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stateid));
-                         stp->st_stateid.si_boot,
-                         stp->st_stateid.si_stateownerid,
-                         stp->st_stateid.si_fileid,
-                         stp->st_stateid.si_generation);
        nfsd4_create_clid_dir(sop->so_client);
 out:
@@ -3283,9 +3266,8 @@ find_delegation_stateid(struct inode *ino, stateid_t *stid)
        struct nfs4_file *fp;
        struct nfs4_delegation *dl;
-        dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n",
+        dprintk("NFSD: %s: stateid=" STATEID_FMT "\n", __func__,
-                    stid->si_boot, stid->si_stateownerid,
+                STATEID_VAL(stid));
-                    stid->si_fileid, stid->si_generation);
        fp = find_file(ino);
        if (!fp)
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0fbd50cee1f6..a8587e90fd5a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -40,24 +40,16 @@
 * at the end of nfs4svc_decode_compoundargs.
 */
-#include <linux/param.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
 #include <linux/namei.h>
-#include <linux/vfs.h>
+#include <linux/statfs.h>
 #include <linux/utsname.h>
-#include <linux/sunrpc/xdr.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/state.h>
-#include <linux/nfsd/xdr4.h>
 #include <linux/nfsd_idmap.h>
-#include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
-#include <linux/sunrpc/gss_api.h>
 #include <linux/sunrpc/svcauth_gss.h>
+#include "xdr4.h"
+#include "vfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
 /*
@@ -2204,11 +2196,14 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
         * we will not follow the cross mount and will fill the attribtutes
         * directly from the mountpoint dentry.
         */
-        if (d_mountpoint(dentry) && !attributes_need_mount(cd->rd_bmval))
+        if (nfsd_mountpoint(dentry, exp)) {
-                ignore_crossmnt = 1;
-        else if (d_mountpoint(dentry)) {
                int err;
+                if (!(exp->ex_flags & NFSEXP_V4ROOT)
+                                && !attributes_need_mount(cd->rd_bmval)) {
+                        ignore_crossmnt = 1;
+                        goto out_encode;
+                }
                /*
                 * Why the heck aren't we just using nfsd_lookup??
                 * Different "."/".." handling?  Something else?
@@ -2224,6 +2219,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
                        goto out_put;
        }
+out_encode:
        nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
                                        cd->rd_rqstp, ignore_crossmnt);
 out_put:
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 4638635c5d87..da08560c4818 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -1,6 +1,4 @@
 /*
- * linux/fs/nfsd/nfscache.c
- *
 * Request reply cache. This is currently a global cache, but this may
 * change in the future and be a per-client cache.
 *
@@ -10,16 +8,8 @@
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/kernel.h>
+#include "nfsd.h"
-#include <linux/time.h>
+#include "cache.h"
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/cache.h>
 /* Size of reply cache. Common values are:
 * 4.3BSD:      128
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 5c01fc148ce8..2604c3e70ea5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1,46 +1,20 @@
 /*
- * linux/fs/nfsd/nfsctl.c
- *
 * Syscall interface to knfsd.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/module.h>
-#include <linux/linkage.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
 #include <linux/namei.h>
-#include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/in.h>
-#include <linux/syscalls.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/inet.h>
-#include <linux/string.h>
 #include <linux/ctype.h>
-#include <linux/nfs.h>
 #include <linux/nfsd_idmap.h>
-#include <linux/lockd/bind.h>
-#include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/cache.h>
-#include <linux/nfsd/xdr.h>
 #include <linux/nfsd/syscall.h>
 #include <linux/lockd/lockd.h>
 #include <linux/sunrpc/clnt.h>
-#include <asm/uaccess.h>
+#include "nfsd.h"
-#include <net/ipv6.h>
+#include "cache.h"
 /*
 *      We have a single directory with 9 nodes in it.
@@ -55,6 +29,7 @@ enum {
        NFSD_Getfd,
        NFSD_Getfs,
        NFSD_List,
+        NFSD_Export_features,
        NFSD_Fh,
        NFSD_FO_UnlockIP,
        NFSD_FO_UnlockFS,
@@ -173,6 +148,24 @@ static const struct file_operations exports_operations = {
        .owner          = THIS_MODULE,
 };
+static int export_features_show(struct seq_file *m, void *v)
+{
+        seq_printf(m, "0x%x 0x%x\n", NFSEXP_ALLFLAGS, NFSEXP_SECINFO_FLAGS);
+        return 0;
+}
+static int export_features_open(struct inode *inode, struct file *file)
+{
+        return single_open(file, export_features_show, NULL);
+}
+/* Read-only seq_file operations for the "export_features" nfsd fs node. */
+static const struct file_operations export_features_operations = {
+        .open           = export_features_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = single_release,
+};
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
@@ -1330,6 +1323,8 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
                [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR},
                [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR},
                [NFSD_List] = {"exports", &exports_operations, S_IRUGO},
+                [NFSD_Export_features] = {"export_features",
+                                        &export_features_operations, S_IRUGO},
                [NFSD_FO_UnlockIP] = {"unlock_ip",
                                        &transaction_ops, S_IWUSR|S_IRUSR},
                [NFSD_FO_UnlockFS] = {"unlock_filesystem",
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
new file mode 100644
index 000000000000..e942a1aaac92
--- /dev/null
+++ b/fs/nfsd/nfsd.h
@@ -0,0 +1,338 @@
+/*
+ * Hodge-podge collection of knfsd-related stuff.
+ * I will sort this out later.
+ *
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef LINUX_NFSD_NFSD_H
+#define LINUX_NFSD_NFSD_H
+#include <linux/types.h>
+#include <linux/mount.h>
+#include <linux/nfsd/debug.h>
+#include <linux/nfsd/export.h>
+#include <linux/nfsd/stats.h>
+/*
+ * nfsd version
+ */
+#define NFSD_SUPPORTED_MINOR_VERSION    1
+struct readdir_cd {
+        __be32                  err;    /* 0, nfserr, or nfserr_eof */
+};
+extern struct svc_program       nfsd_program;
+extern struct svc_version       nfsd_version2, nfsd_version3,
+                                nfsd_version4;
+extern u32                      nfsd_supported_minorversion;
+extern struct mutex             nfsd_mutex;
+extern struct svc_serv          *nfsd_serv;
+extern spinlock_t               nfsd_drc_lock;
+extern unsigned int             nfsd_drc_max_mem;
+extern unsigned int             nfsd_drc_mem_used;
+extern const struct seq_operations nfs_exports_op;
+/*
+ * Function prototypes.
+ */
+int             nfsd_svc(unsigned short port, int nrservs);
+int             nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp);
+int             nfsd_nrthreads(void);
+int             nfsd_nrpools(void);
+int             nfsd_get_nrthreads(int n, int *);
+int             nfsd_set_nrthreads(int n, int *);
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+#ifdef CONFIG_NFSD_V2_ACL
+extern struct svc_version nfsd_acl_version2;
+#else
+#define nfsd_acl_version2 NULL
+#endif
+#ifdef CONFIG_NFSD_V3_ACL
+extern struct svc_version nfsd_acl_version3;
+#else
+#define nfsd_acl_version3 NULL
+#endif
+#endif
+enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL };
+int nfsd_vers(int vers, enum vers_op change);
+int nfsd_minorversion(u32 minorversion, enum vers_op change);
+void nfsd_reset_versions(void);
+int nfsd_create_serv(void);
+extern int nfsd_max_blksize;
+static inline int nfsd_v4client(struct svc_rqst *rq)
+{
+        return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
+}
+/* 
+ * NFSv4 State
+ */
+#ifdef CONFIG_NFSD_V4
+extern unsigned int max_delegations;
+int nfs4_state_init(void);
+void nfsd4_free_slabs(void);
+int nfs4_state_start(void);
+void nfs4_state_shutdown(void);
+time_t nfs4_lease_time(void);
+void nfs4_reset_lease(time_t leasetime);
+int nfs4_reset_recoverydir(char *recdir);
+#else
+static inline int nfs4_state_init(void) { return 0; }
+static inline void nfsd4_free_slabs(void) { }
+static inline int nfs4_state_start(void) { return 0; }
+static inline void nfs4_state_shutdown(void) { }
+static inline time_t nfs4_lease_time(void) { return 0; }
+static inline void nfs4_reset_lease(time_t leasetime) { }
+static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
+#endif
+/*
+ * lockd binding
+ */
+void            nfsd_lockd_init(void);
+void            nfsd_lockd_shutdown(void);
+/*
+ * These macros provide pre-xdr'ed values for faster operation.
+ */
+#define nfs_ok                  cpu_to_be32(NFS_OK)
+#define nfserr_perm             cpu_to_be32(NFSERR_PERM)
+#define nfserr_noent            cpu_to_be32(NFSERR_NOENT)
+#define nfserr_io               cpu_to_be32(NFSERR_IO)
+#define nfserr_nxio             cpu_to_be32(NFSERR_NXIO)
+#define nfserr_eagain           cpu_to_be32(NFSERR_EAGAIN)
+#define nfserr_acces            cpu_to_be32(NFSERR_ACCES)
+#define nfserr_exist            cpu_to_be32(NFSERR_EXIST)
+#define nfserr_xdev             cpu_to_be32(NFSERR_XDEV)
+#define nfserr_nodev            cpu_to_be32(NFSERR_NODEV)
+#define nfserr_notdir           cpu_to_be32(NFSERR_NOTDIR)
+#define nfserr_isdir            cpu_to_be32(NFSERR_ISDIR)
+#define nfserr_inval            cpu_to_be32(NFSERR_INVAL)
+#define nfserr_fbig             cpu_to_be32(NFSERR_FBIG)
+#define nfserr_nospc            cpu_to_be32(NFSERR_NOSPC)
+#define nfserr_rofs             cpu_to_be32(NFSERR_ROFS)
+#define nfserr_mlink            cpu_to_be32(NFSERR_MLINK)
+#define nfserr_opnotsupp        cpu_to_be32(NFSERR_OPNOTSUPP)
+#define nfserr_nametoolong      cpu_to_be32(NFSERR_NAMETOOLONG)
+#define nfserr_notempty         cpu_to_be32(NFSERR_NOTEMPTY)
+#define nfserr_dquot            cpu_to_be32(NFSERR_DQUOT)
+#define nfserr_stale            cpu_to_be32(NFSERR_STALE)
+#define nfserr_remote           cpu_to_be32(NFSERR_REMOTE)
+#define nfserr_wflush           cpu_to_be32(NFSERR_WFLUSH)
+#define nfserr_badhandle        cpu_to_be32(NFSERR_BADHANDLE)
+#define nfserr_notsync          cpu_to_be32(NFSERR_NOT_SYNC)
+#define nfserr_badcookie        cpu_to_be32(NFSERR_BAD_COOKIE)
+#define nfserr_notsupp          cpu_to_be32(NFSERR_NOTSUPP)
+#define nfserr_toosmall         cpu_to_be32(NFSERR_TOOSMALL)
+#define nfserr_serverfault      cpu_to_be32(NFSERR_SERVERFAULT)
+#define nfserr_badtype          cpu_to_be32(NFSERR_BADTYPE)
+#define nfserr_jukebox          cpu_to_be32(NFSERR_JUKEBOX)
+#define nfserr_denied           cpu_to_be32(NFSERR_DENIED)
+#define nfserr_deadlock         cpu_to_be32(NFSERR_DEADLOCK)
+#define nfserr_expired          cpu_to_be32(NFSERR_EXPIRED)
+#define nfserr_bad_cookie       cpu_to_be32(NFSERR_BAD_COOKIE)
+#define nfserr_same             cpu_to_be32(NFSERR_SAME)
+#define nfserr_clid_inuse       cpu_to_be32(NFSERR_CLID_INUSE)
+#define nfserr_stale_clientid   cpu_to_be32(NFSERR_STALE_CLIENTID)
+#define nfserr_resource         cpu_to_be32(NFSERR_RESOURCE)
+#define nfserr_moved            cpu_to_be32(NFSERR_MOVED)
+#define nfserr_nofilehandle     cpu_to_be32(NFSERR_NOFILEHANDLE)
+#define nfserr_minor_vers_mismatch      cpu_to_be32(NFSERR_MINOR_VERS_MISMATCH)
+#define nfserr_share_denied     cpu_to_be32(NFSERR_SHARE_DENIED)
+#define nfserr_stale_stateid    cpu_to_be32(NFSERR_STALE_STATEID)
+#define nfserr_old_stateid      cpu_to_be32(NFSERR_OLD_STATEID)
+#define nfserr_bad_stateid      cpu_to_be32(NFSERR_BAD_STATEID)
+#define nfserr_bad_seqid        cpu_to_be32(NFSERR_BAD_SEQID)
+#define nfserr_symlink          cpu_to_be32(NFSERR_SYMLINK)
+#define nfserr_not_same         cpu_to_be32(NFSERR_NOT_SAME)
+#define nfserr_restorefh        cpu_to_be32(NFSERR_RESTOREFH)
+#define nfserr_attrnotsupp      cpu_to_be32(NFSERR_ATTRNOTSUPP)
+#define nfserr_bad_xdr          cpu_to_be32(NFSERR_BAD_XDR)
+#define nfserr_openmode         cpu_to_be32(NFSERR_OPENMODE)
+#define nfserr_locks_held       cpu_to_be32(NFSERR_LOCKS_HELD)
+#define nfserr_op_illegal       cpu_to_be32(NFSERR_OP_ILLEGAL)
+#define nfserr_grace            cpu_to_be32(NFSERR_GRACE)
+#define nfserr_no_grace         cpu_to_be32(NFSERR_NO_GRACE)
+#define nfserr_reclaim_bad      cpu_to_be32(NFSERR_RECLAIM_BAD)
+#define nfserr_badname          cpu_to_be32(NFSERR_BADNAME)
+#define nfserr_cb_path_down     cpu_to_be32(NFSERR_CB_PATH_DOWN)
+#define nfserr_locked           cpu_to_be32(NFSERR_LOCKED)
+#define nfserr_wrongsec         cpu_to_be32(NFSERR_WRONGSEC)
+#define nfserr_badiomode                cpu_to_be32(NFS4ERR_BADIOMODE)
+#define nfserr_badlayout                cpu_to_be32(NFS4ERR_BADLAYOUT)
+#define nfserr_bad_session_digest       cpu_to_be32(NFS4ERR_BAD_SESSION_DIGEST)
+#define nfserr_badsession               cpu_to_be32(NFS4ERR_BADSESSION)
+#define nfserr_badslot                  cpu_to_be32(NFS4ERR_BADSLOT)
+#define nfserr_complete_already         cpu_to_be32(NFS4ERR_COMPLETE_ALREADY)
+#define nfserr_conn_not_bound_to_session cpu_to_be32(NFS4ERR_CONN_NOT_BOUND_TO_SESSION)
+#define nfserr_deleg_already_wanted     cpu_to_be32(NFS4ERR_DELEG_ALREADY_WANTED)
+#define nfserr_back_chan_busy           cpu_to_be32(NFS4ERR_BACK_CHAN_BUSY)
+#define nfserr_layouttrylater           cpu_to_be32(NFS4ERR_LAYOUTTRYLATER)
+#define nfserr_layoutunavailable        cpu_to_be32(NFS4ERR_LAYOUTUNAVAILABLE)
+#define nfserr_nomatching_layout        cpu_to_be32(NFS4ERR_NOMATCHING_LAYOUT)
+#define nfserr_recallconflict           cpu_to_be32(NFS4ERR_RECALLCONFLICT)
+#define nfserr_unknown_layouttype       cpu_to_be32(NFS4ERR_UNKNOWN_LAYOUTTYPE)
+#define nfserr_seq_misordered           cpu_to_be32(NFS4ERR_SEQ_MISORDERED)
+#define nfserr_sequence_pos             cpu_to_be32(NFS4ERR_SEQUENCE_POS)
+#define nfserr_req_too_big              cpu_to_be32(NFS4ERR_REQ_TOO_BIG)
+#define nfserr_rep_too_big              cpu_to_be32(NFS4ERR_REP_TOO_BIG)
+#define nfserr_rep_too_big_to_cache     cpu_to_be32(NFS4ERR_REP_TOO_BIG_TO_CACHE)
+#define nfserr_retry_uncached_rep       cpu_to_be32(NFS4ERR_RETRY_UNCACHED_REP)
+#define nfserr_unsafe_compound          cpu_to_be32(NFS4ERR_UNSAFE_COMPOUND)
+#define nfserr_too_many_ops             cpu_to_be32(NFS4ERR_TOO_MANY_OPS)
+#define nfserr_op_not_in_session        cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION)
+#define nfserr_hash_alg_unsupp          cpu_to_be32(NFS4ERR_HASH_ALG_UNSUPP)
+#define nfserr_clientid_busy            cpu_to_be32(NFS4ERR_CLIENTID_BUSY)
+#define nfserr_pnfs_io_hole             cpu_to_be32(NFS4ERR_PNFS_IO_HOLE)
+#define nfserr_seq_false_retry          cpu_to_be32(NFS4ERR_SEQ_FALSE_RETRY)
+#define nfserr_bad_high_slot            cpu_to_be32(NFS4ERR_BAD_HIGH_SLOT)
+#define nfserr_deadsession              cpu_to_be32(NFS4ERR_DEADSESSION)
+#define nfserr_encr_alg_unsupp          cpu_to_be32(NFS4ERR_ENCR_ALG_UNSUPP)
+#define nfserr_pnfs_no_layout           cpu_to_be32(NFS4ERR_PNFS_NO_LAYOUT)
+#define nfserr_not_only_op              cpu_to_be32(NFS4ERR_NOT_ONLY_OP)
+#define nfserr_wrong_cred               cpu_to_be32(NFS4ERR_WRONG_CRED)
+#define nfserr_wrong_type               cpu_to_be32(NFS4ERR_WRONG_TYPE)
+#define nfserr_dirdeleg_unavail         cpu_to_be32(NFS4ERR_DIRDELEG_UNAVAIL)
+#define nfserr_reject_deleg             cpu_to_be32(NFS4ERR_REJECT_DELEG)
+#define nfserr_returnconflict           cpu_to_be32(NFS4ERR_RETURNCONFLICT)
+#define nfserr_deleg_revoked            cpu_to_be32(NFS4ERR_DELEG_REVOKED)
+/* error codes for internal use */
+/* if a request fails due to kmalloc failure, it gets dropped.
+ *  Client should resend eventually
+ */
+#define nfserr_dropit           cpu_to_be32(30000)
+/* end-of-file indicator in readdir */
+#define nfserr_eof              cpu_to_be32(30001)
+/* replay detected */
+#define nfserr_replay_me        cpu_to_be32(11001)
+/* nfs41 replay detected */
+#define nfserr_replay_cache     cpu_to_be32(11002)
+/* Check for dir entries '.' and '..'; n is the name, l its length (both evaluated more than once) */
+#define isdotent(n, l)  ((l) < 3 && (n)[0] == '.' && ((l) == 1 || (n)[1] == '.'))
+/*
+ * Time of server startup
+ */
+extern struct timeval   nfssvc_boot;
+#ifdef CONFIG_NFSD_V4
+/* before processing a COMPOUND operation, we have to check that there
+ * is enough space in the buffer for XDR encode to succeed.  otherwise,
+ * we might process an operation with side effects, and be unable to
+ * tell the client that the operation succeeded.
+ *
+ * COMPOUND_SLACK_SPACE - this is the minimum bytes of buffer space
+ * needed to encode an "ordinary" _successful_ operation.  (GETATTR,
+ * READ, READDIR, and READLINK have their own buffer checks.)  if we
+ * fall below this level, we fail the next operation with NFS4ERR_RESOURCE.
+ *
+ * COMPOUND_ERR_SLACK_SPACE - this is the minimum bytes of buffer space
+ * needed to encode an operation which has failed with NFS4ERR_RESOURCE.
+ * care is taken to ensure that we never fall below this level for any
+ * reason.
+ */
+#define COMPOUND_SLACK_SPACE            140    /* OP_GETFH */
+#define COMPOUND_ERR_SLACK_SPACE        12     /* OP_SETATTR */
+#define NFSD_LEASE_TIME                 (nfs4_lease_time())
+#define NFSD_LAUNDROMAT_MINTIMEOUT      10   /* seconds */
+/*
+ * The following attributes are currently not supported by the NFSv4 server:
+ *    ARCHIVE       (deprecated anyway)
+ *    HIDDEN        (unlikely to be supported any time soon)
+ *    MIMETYPE      (unlikely to be supported any time soon)
+ *    QUOTA_*       (will be supported in a forthcoming patch)
+ *    SYSTEM        (unlikely to be supported any time soon)
+ *    TIME_BACKUP   (unlikely to be supported any time soon)
+ *    TIME_CREATE   (unlikely to be supported any time soon)
+ */
+#define NFSD4_SUPPORTED_ATTRS_WORD0                                                         \
+(FATTR4_WORD0_SUPPORTED_ATTRS   | FATTR4_WORD0_TYPE         | FATTR4_WORD0_FH_EXPIRE_TYPE   \
+ | FATTR4_WORD0_CHANGE          | FATTR4_WORD0_SIZE         | FATTR4_WORD0_LINK_SUPPORT     \
+ | FATTR4_WORD0_SYMLINK_SUPPORT | FATTR4_WORD0_NAMED_ATTR   | FATTR4_WORD0_FSID             \
+ | FATTR4_WORD0_UNIQUE_HANDLES  | FATTR4_WORD0_LEASE_TIME   | FATTR4_WORD0_RDATTR_ERROR     \
+ | FATTR4_WORD0_ACLSUPPORT      | FATTR4_WORD0_CANSETTIME   | FATTR4_WORD0_CASE_INSENSITIVE \
+ | FATTR4_WORD0_CASE_PRESERVING | FATTR4_WORD0_CHOWN_RESTRICTED                             \
+ | FATTR4_WORD0_FILEHANDLE      | FATTR4_WORD0_FILEID       | FATTR4_WORD0_FILES_AVAIL      \
+ | FATTR4_WORD0_FILES_FREE      | FATTR4_WORD0_FILES_TOTAL  | FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_HOMOGENEOUS      \
+ | FATTR4_WORD0_MAXFILESIZE     | FATTR4_WORD0_MAXLINK      | FATTR4_WORD0_MAXNAME          \
+ | FATTR4_WORD0_MAXREAD         | FATTR4_WORD0_MAXWRITE     | FATTR4_WORD0_ACL)
+#define NFSD4_SUPPORTED_ATTRS_WORD1                                                         \
+(FATTR4_WORD1_MODE              | FATTR4_WORD1_NO_TRUNC     | FATTR4_WORD1_NUMLINKS         \
+ | FATTR4_WORD1_OWNER           | FATTR4_WORD1_OWNER_GROUP  | FATTR4_WORD1_RAWDEV           \
+ | FATTR4_WORD1_SPACE_AVAIL     | FATTR4_WORD1_SPACE_FREE   | FATTR4_WORD1_SPACE_TOTAL      \
+ | FATTR4_WORD1_SPACE_USED      | FATTR4_WORD1_TIME_ACCESS  | FATTR4_WORD1_TIME_ACCESS_SET  \
+ | FATTR4_WORD1_TIME_DELTA   | FATTR4_WORD1_TIME_METADATA    \
+ | FATTR4_WORD1_TIME_MODIFY     | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
+#define NFSD4_SUPPORTED_ATTRS_WORD2 0
+#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
+        NFSD4_SUPPORTED_ATTRS_WORD0
+#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
+        NFSD4_SUPPORTED_ATTRS_WORD1
+#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
+        (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+static inline u32 nfsd_suppattrs0(u32 minorversion)
+{
+        return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0
+                            : NFSD4_SUPPORTED_ATTRS_WORD0;
+}
+static inline u32 nfsd_suppattrs1(u32 minorversion)
+{
+        return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1
+                            : NFSD4_SUPPORTED_ATTRS_WORD1;
+}
+static inline u32 nfsd_suppattrs2(u32 minorversion)
+{
+        return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD2
+                            : NFSD4_SUPPORTED_ATTRS_WORD2;
+}
+/* These will return ERR_INVAL if specified in GETATTR or READDIR. */
+#define NFSD_WRITEONLY_ATTRS_WORD1                                                          \
+(FATTR4_WORD1_TIME_ACCESS_SET   | FATTR4_WORD1_TIME_MODIFY_SET)
+/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */
+#define NFSD_WRITEABLE_ATTRS_WORD0                                                          \
+(FATTR4_WORD0_SIZE              | FATTR4_WORD0_ACL                                         )
+#define NFSD_WRITEABLE_ATTRS_WORD1                                                          \
+(FATTR4_WORD1_MODE              | FATTR4_WORD1_OWNER         | FATTR4_WORD1_OWNER_GROUP     \
+ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
+#define NFSD_WRITEABLE_ATTRS_WORD2 0
+#define NFSD_SUPPATTR_EXCLCREAT_WORD0 \
+        NFSD_WRITEABLE_ATTRS_WORD0
+/*
+ * we currently store the exclusive create verifier in the v_{a,m}time
+ * attributes so the client can't set these at create time using EXCLUSIVE4_1
+ */
+#define NFSD_SUPPATTR_EXCLCREAT_WORD1 \
+        (NFSD_WRITEABLE_ATTRS_WORD1 & \
+         ~(FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET))
+#define NFSD_SUPPATTR_EXCLCREAT_WORD2 \
+        NFSD_WRITEABLE_ATTRS_WORD2
+#endif /* CONFIG_NFSD_V4 */
+#endif /* LINUX_NFSD_NFSD_H */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 01965b2f3a76..55c8e63af0be 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -1,6 +1,4 @@
 /*
- * linux/fs/nfsd/nfsfh.c
- *
 * NFS server file handle treatment.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
@@ -9,19 +7,11 @@
 * ... and again Southern-Winter 2001 to support export_operations
 */
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/unistd.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/dcache.h>
 #include <linux/exportfs.h>
-#include <linux/mount.h>
-#include <linux/sunrpc/clnt.h>
-#include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcauth_gss.h>
-#include <linux/nfsd/nfsd.h>
+#include "nfsd.h"
+#include "vfs.h"
 #include "auth.h"
 #define NFSDDBG_FACILITY                NFSDDBG_FH
@@ -96,8 +86,10 @@ nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int type)
 static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
                                          struct svc_export *exp)
 {
+        int flags = nfsexp_flags(rqstp, exp);
        /* Check if the request originated from a secure port. */
-        if (!rqstp->rq_secure && EX_SECURE(exp)) {
+        if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) {
                RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
                dprintk(KERN_WARNING
                       "nfsd: request from insecure port %s!\n",
@@ -109,6 +101,36 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
        return nfserrno(nfsd_setuser(rqstp, exp));
 }
+static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
+        struct dentry *dentry, struct svc_export *exp)
+{
+        if (!(exp->ex_flags & NFSEXP_V4ROOT))
+                return nfs_ok;
+        /*
+         * v2/v3 clients have no need for the V4ROOT export--they use
+         * the mount protocol instead; also, further V4ROOT checks may be
+         * in v4-specific code, in which case v2/v3 clients could bypass
+         * them.
+         */
+        if (!nfsd_v4client(rqstp))
+                return nfserr_stale;
+        /*
+         * We're exposing only the directories and symlinks that have to be
+         * traversed on the way to real exports:
+         */
+        if (unlikely(!S_ISDIR(dentry->d_inode->i_mode) &&
+                     !S_ISLNK(dentry->d_inode->i_mode)))
+                return nfserr_stale;
+        /*
+         * A pseudoroot export gives permission to access only one
+         * single directory; the kernel has to make another upcall
+         * before granting access to anything else under it:
+         */
+        if (unlikely(dentry != exp->ex_path.dentry))
+                return nfserr_stale;
+        return nfs_ok;
+}
 /*
 * Use the given filehandle to look up the corresponding export and
 * dentry.  On success, the results are used to set fh_export and
@@ -232,14 +254,6 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
                goto out;
        }
-        if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) {
-                error = nfsd_setuser_and_check_port(rqstp, exp);
-                if (error) {
-                        dput(dentry);
-                        goto out;
-                }
-        }
        if (S_ISDIR(dentry->d_inode->i_mode) &&
                        (dentry->d_flags & DCACHE_DISCONNECTED)) {
                printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n",
@@ -294,28 +308,32 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
                error = nfsd_set_fh_dentry(rqstp, fhp);
                if (error)
                        goto out;
-                dentry = fhp->fh_dentry;
-                exp = fhp->fh_export;
-        } else {
-                /*
-                 * just rechecking permissions
-                 * (e.g. nfsproc_create calls fh_verify, then nfsd_create
-                 * does as well)
-                 */
-                dprintk("nfsd: fh_verify - just checking\n");
-                dentry = fhp->fh_dentry;
-                exp = fhp->fh_export;
-                /*
-                 * Set user creds for this exportpoint; necessary even
-                 * in the "just checking" case because this may be a
-                 * filehandle that was created by fh_compose, and that
-                 * is about to be used in another nfsv4 compound
-                 * operation.
-                 */
-                error = nfsd_setuser_and_check_port(rqstp, exp);
-                if (error)
-                        goto out;
        }
+        dentry = fhp->fh_dentry;
+        exp = fhp->fh_export;
+        /*
+         * We still have to do all these permission checks, even when
+         * fh_dentry is already set:
+         *      - fh_verify may be called multiple times with different
+         *        "access" arguments (e.g. nfsd_proc_create calls
+         *        fh_verify(...,NFSD_MAY_EXEC) first, then later (in
+         *        nfsd_create) calls fh_verify(...,NFSD_MAY_CREATE)).
+         *      - in the NFSv4 case, the filehandle may have been filled
+         *        in by fh_compose, and given a dentry, but further
+         *        compound operations performed with that filehandle
+         *        still need permissions checks.  In the worst case, a
+         *        mountpoint crossing may have changed the export
+         *        options, and we may now need to use a different uid
+         *        (for example, if different id-squashing options are in
+         *        effect on the new filesystem).
+         */
+        error = check_pseudo_root(rqstp, dentry, exp);
+        if (error)
+                goto out;
+        error = nfsd_setuser_and_check_port(rqstp, exp);
+        if (error)
+                goto out;
        error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type);
        if (error)
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
new file mode 100644
index 000000000000..cdfb8c6a4206
--- /dev/null
+++ b/fs/nfsd/nfsfh.h
@@ -0,0 +1,208 @@
+/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */
+#ifndef _LINUX_NFSD_FH_INT_H
+#define _LINUX_NFSD_FH_INT_H
+#include <linux/nfsd/nfsfh.h>
+enum nfsd_fsid {
+        FSID_DEV = 0,
+        FSID_NUM,
+        FSID_MAJOR_MINOR,
+        FSID_ENCODE_DEV,
+        FSID_UUID4_INUM,
+        FSID_UUID8,
+        FSID_UUID16,
+        FSID_UUID16_INUM,
+};
+enum fsid_source {
+        FSIDSOURCE_DEV,
+        FSIDSOURCE_FSID,
+        FSIDSOURCE_UUID,
+};
+extern enum fsid_source fsid_source(struct svc_fh *fhp);
+/* This might look a little large to "inline" but in all calls except
+ * one, 'vers' is constant so most of the function disappears.
+ */
+static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
+                           u32 fsid, unsigned char *uuid)
+{
+        u32 *up;
+        switch(vers) {
+        case FSID_DEV:
+                fsidv[0] = htonl((MAJOR(dev)<<16) |
+                                 MINOR(dev));
+                fsidv[1] = ino_t_to_u32(ino);
+                break;
+        case FSID_NUM:
+                fsidv[0] = fsid;
+                break;
+        case FSID_MAJOR_MINOR:
+                fsidv[0] = htonl(MAJOR(dev));
+                fsidv[1] = htonl(MINOR(dev));
+                fsidv[2] = ino_t_to_u32(ino);
+                break;
+        case FSID_ENCODE_DEV:
+                fsidv[0] = new_encode_dev(dev);
+                fsidv[1] = ino_t_to_u32(ino);
+                break;
+        case FSID_UUID4_INUM:
+                /* 4 byte fsid and inode number */
+                up = (u32*)uuid;
+                fsidv[0] = ino_t_to_u32(ino);
+                fsidv[1] = up[0] ^ up[1] ^ up[2] ^ up[3];
+                break;
+        case FSID_UUID8:
+                /* 8 byte fsid  */
+                up = (u32*)uuid;
+                fsidv[0] = up[0] ^ up[2];
+                fsidv[1] = up[1] ^ up[3];
+                break;
+        case FSID_UUID16:
+                /* 16 byte fsid - NFSv3+ only */
+                memcpy(fsidv, uuid, 16);
+                break;
+        case FSID_UUID16_INUM:
+                /* 8 byte inode and 16 byte fsid */
+                *(u64*)fsidv = (u64)ino;
+                memcpy(fsidv+2, uuid, 16);
+                break;
+        default: BUG();
+        }
+}
+static inline int key_len(int type)
+{
+        switch(type) {
+        case FSID_DEV:          return 8;
+        case FSID_NUM:          return 4;
+        case FSID_MAJOR_MINOR:  return 12;
+        case FSID_ENCODE_DEV:   return 8;
+        case FSID_UUID4_INUM:   return 8;
+        case FSID_UUID8:        return 8;
+        case FSID_UUID16:       return 16;
+        case FSID_UUID16_INUM:  return 24;
+        default: return 0;
+        }
+}
+/*
+ * Shorthand for dprintk()'s
+ */
+extern char * SVCFH_fmt(struct svc_fh *fhp);
+/*
+ * Function prototypes
+ */
+__be32  fh_verify(struct svc_rqst *, struct svc_fh *, int, int);
+__be32  fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
+__be32  fh_update(struct svc_fh *);
+void    fh_put(struct svc_fh *);
+static __inline__ struct svc_fh *
+fh_copy(struct svc_fh *dst, struct svc_fh *src)
+{
+        WARN_ON(src->fh_dentry || src->fh_locked);
+                        
+        *dst = *src;
+        return dst;
+}
+static inline void
+fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
+{
+        dst->fh_size = src->fh_size;
+        memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
+}
+static __inline__ struct svc_fh *
+fh_init(struct svc_fh *fhp, int maxsize)
+{
+        memset(fhp, 0, sizeof(*fhp));
+        fhp->fh_maxsize = maxsize;
+        return fhp;
+}
+#ifdef CONFIG_NFSD_V3
+/*
+ * Fill in the pre_op attr for the wcc data
+ */
+static inline void
+fill_pre_wcc(struct svc_fh *fhp)
+{
+        struct inode    *inode;
+        inode = fhp->fh_dentry->d_inode;
+        if (!fhp->fh_pre_saved) {
+                fhp->fh_pre_mtime = inode->i_mtime;
+                fhp->fh_pre_ctime = inode->i_ctime;
+                fhp->fh_pre_size  = inode->i_size;
+                fhp->fh_pre_change = inode->i_version;
+                fhp->fh_pre_saved = 1;
+        }
+}
+extern void fill_post_wcc(struct svc_fh *);
+#else
+#define fill_pre_wcc(ignored)
+#define fill_post_wcc(notused)
+#endif /* CONFIG_NFSD_V3 */
+/*
+ * Lock a file handle/inode
+ * NOTE: both fh_lock and fh_unlock are done "by hand" in
+ * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
+ * so, any changes here should be reflected there.
+ */
+static inline void
+fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
+{
+        struct dentry   *dentry = fhp->fh_dentry;
+        struct inode    *inode;
+        BUG_ON(!dentry);
+        if (fhp->fh_locked) {
+                printk(KERN_WARNING "fh_lock: %s/%s already locked!\n",
+                        dentry->d_parent->d_name.name, dentry->d_name.name);
+                return;
+        }
+        inode = dentry->d_inode;
+        mutex_lock_nested(&inode->i_mutex, subclass);
+        fill_pre_wcc(fhp);
+        fhp->fh_locked = 1;
+}
+static inline void
+fh_lock(struct svc_fh *fhp)
+{
+        fh_lock_nested(fhp, I_MUTEX_NORMAL);
+}
+/*
+ * Unlock a file handle/inode
+ */
+static inline void
+fh_unlock(struct svc_fh *fhp)
+{
+        BUG_ON(!fhp->fh_dentry);
+        if (fhp->fh_locked) {
+                fill_post_wcc(fhp);
+                mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex);
+                fhp->fh_locked = 0;
+        }
+}
+#endif /* _LINUX_NFSD_FH_INT_H */
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 0eb9c820b7a6..a047ad6111ef 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -1,29 +1,14 @@
 /*
- * nfsproc2.c   Process version 2 NFS requests.
- * linux/fs/nfsd/nfs2proc.c
- * 
 * Process version 2 NFS requests.
 *
 * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/linkage.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/in.h>
 #include <linux/namei.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/sunrpc/clnt.h>
+#include "cache.h"
-#include <linux/sunrpc/svc.h>
+#include "xdr.h"
-#include <linux/nfsd/nfsd.h>
+#include "vfs.h"
-#include <linux/nfsd/cache.h>
-#include <linux/nfsd/xdr.h>
 typedef struct svc_rqst svc_rqst;
 typedef struct svc_buf  svc_buf;
@@ -758,6 +743,7 @@ nfserrno (int errno)
                { nfserr_io, -ETXTBSY },
                { nfserr_notsupp, -EOPNOTSUPP },
                { nfserr_toosmall, -ETOOSMALL },
+                { nfserr_serverfault, -ESERVERFAULT },
        };
        int     i;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 67ea83eedd43..171699eb07c8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -1,6 +1,4 @@
 /*
- * linux/fs/nfsd/nfssvc.c
- *
 * Central processing for nfsd.
 *
 * Authors:     Olaf Kirch (okir@monad.swb.de)
@@ -8,33 +6,19 @@
 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/time.h>
-#include <linux/errno.h>
-#include <linux/nfs.h>
-#include <linux/in.h>
-#include <linux/uio.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/smp.h>
 #include <linux/freezer.h>
 #include <linux/fs_struct.h>
-#include <linux/kthread.h>
 #include <linux/swap.h>
-#include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
-#include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
-#include <linux/sunrpc/cache.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/stats.h>
-#include <linux/nfsd/cache.h>
-#include <linux/nfsd/syscall.h>
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
 #include <linux/seq_file.h>
+#include "nfsd.h"
+#include "cache.h"
+#include "vfs.h"
 #define NFSDDBG_FACILITY        NFSDDBG_SVC
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index afd08e2c90a5..4ce005dbf3e6 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -1,20 +1,10 @@
 /*
- * linux/fs/nfsd/nfsxdr.c
- *
 * XDR support for nfsd
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/types.h>
+#include "xdr.h"
-#include <linux/time.h>
-#include <linux/nfs.h>
-#include <linux/vfs.h>
-#include <linux/sunrpc/xdr.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/xdr.h>
-#include <linux/mm.h>
 #include "auth.h"
 #define NFSDDBG_FACILITY                NFSDDBG_XDR
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
new file mode 100644
index 000000000000..fefeae27f25e
--- /dev/null
+++ b/fs/nfsd/state.h
@@ -0,0 +1,408 @@
+/*
+ *  Copyright (c) 2001 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson <andros@umich.edu>
+ *  
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *  
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _NFSD4_STATE_H
+#define _NFSD4_STATE_H
+#include <linux/nfsd/nfsfh.h>
+#include "nfsfh.h"
+typedef struct {
+        u32             cl_boot;
+        u32             cl_id;
+} clientid_t;   /* stored as nfs4_client.cl_clientid ("generated by server") */
+typedef struct {
+        u32             so_boot;
+        u32             so_stateownerid;
+        u32             so_fileid;
+} stateid_opaque_t;
+typedef struct {
+        u32                     si_generation;  /* incremented by update_stateid() */
+        stateid_opaque_t        si_opaque;
+} stateid_t;
+#define si_boot           si_opaque.so_boot     /* shorthand: (s)->si_boot reaches into si_opaque */
+#define si_stateownerid   si_opaque.so_stateownerid
+#define si_fileid         si_opaque.so_fileid
+#define STATEID_FMT     "(%08x/%08x/%08x/%08x)" /* pairs with STATEID_VAL(): boot/stateownerid/fileid/generation */
+#define STATEID_VAL(s) \
+        (s)->si_boot, \
+        (s)->si_stateownerid, \
+        (s)->si_fileid, \
+        (s)->si_generation
+struct nfsd4_cb_sequence {
+        /*
+         * args/res
+         * NOTE(review): presumably the arguments/results of the 4.1
+         * CB_SEQUENCE callback operation -- confirm against the users.
+         */
+        u32                     cbs_minorversion;
+        struct nfs4_client      *cbs_clp;
+};
+struct nfs4_delegation {
+        struct list_head        dl_perfile;     /* presumably on nfs4_file.fi_delegations -- TODO confirm */
+        struct list_head        dl_perclnt;     /* presumably on nfs4_client.cl_delegations -- TODO confirm */
+        struct list_head        dl_recall_lru;  /* delegation recalled */
+        atomic_t                dl_count;       /* ref count */
+        struct nfs4_client      *dl_client;
+        struct nfs4_file        *dl_file;
+        struct file_lock        *dl_flock;
+        struct file             *dl_vfs_file;
+        u32                     dl_type;
+        time_t                  dl_time;
+/* For recall: */
+        u32                     dl_ident;
+        stateid_t               dl_stateid;
+        struct knfsd_fh         dl_fh;
+        int                     dl_retries;
+};
+/*
+ * client delegation callback info
+ *
+ * Embedded in struct nfs4_client as cl_cb_conn.
+ */
+struct nfs4_cb_conn {
+        /* SETCLIENTID info */
+        struct sockaddr_storage cb_addr;
+        size_t                  cb_addrlen;     /* presumably the used length of cb_addr -- TODO confirm */
+        u32                     cb_prog;
+        u32                     cb_minorversion;
+        u32                     cb_ident;       /* minorversion 0 only */
+        /* RPC client info */
+        atomic_t                cb_set;     /* successful CB_NULL call */
+        struct rpc_clnt *       cb_client;
+};
+/* Maximum number of slots per session. 160 is useful for long haul TCP */
+#define NFSD_MAX_SLOTS_PER_SESSION     160
+/* Maximum number of operations per session compound */
+#define NFSD_MAX_OPS_PER_COMPOUND       16
+/* Maximum cache size per session slot */
+#define NFSD_SLOT_CACHE_SIZE            1024
+/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
+#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION       32
+#define NFSD_MAX_MEM_PER_SESSION  \
+                (NFSD_CACHE_SIZE_SLOTS_PER_SESSION * NFSD_SLOT_CACHE_SIZE)
+struct nfsd4_slot {     /* one forward channel slot; see nfsd4_session.se_slots */
+        bool    sl_inuse;
+        bool    sl_cachethis;
+        u16     sl_opcnt;
+        u32     sl_seqid;
+        __be32  sl_status;
+        u32     sl_datalen;
+        char    sl_data[];      /* flexible array; presumably sl_datalen bytes are valid -- TODO confirm */
+};
+struct nfsd4_channel_attrs {    /* one set per channel: used for both fore and back channels */
+        u32             headerpadsz;
+        u32             maxreq_sz;
+        u32             maxresp_sz;
+        u32             maxresp_cached;
+        u32             maxops;
+        u32             maxreqs;
+        u32             nr_rdma_attrs;
+        u32             rdma_attrs;
+};
+struct nfsd4_create_session {   /* also embedded in nfsd4_clid_slot as sl_cr_ses */
+        clientid_t                      clientid;
+        struct nfs4_sessionid           sessionid;
+        u32                             seqid;
+        u32                             flags;
+        struct nfsd4_channel_attrs      fore_channel;
+        struct nfsd4_channel_attrs      back_channel;
+        u32                             callback_prog;
+        u32                             uid;
+        u32                             gid;
+};
+/*
+ * The single slot clientid cache structure
+ *
+ * Embedded in struct nfs4_client as cl_cs_slot, where it is described
+ * as the "create_session slot"; it holds a seqid, a status and a whole
+ * struct nfsd4_create_session.
+ */
+struct nfsd4_clid_slot {
+        u32                             sl_seqid;
+        __be32                          sl_status;
+        struct nfsd4_create_session     sl_cr_ses;
+};
+struct nfsd4_session {
+        struct kref             se_ref;         /* see nfsd4_get_session()/nfsd4_put_session() */
+        struct list_head        se_hash;        /* hash by sessionid */
+        struct list_head        se_perclnt;     /* presumably on nfs4_client.cl_sessions -- TODO confirm */
+        u32                     se_flags;
+        struct nfs4_client      *se_client;     /* for expire_client */
+        struct nfs4_sessionid   se_sessionid;
+        struct nfsd4_channel_attrs se_fchannel;
+        struct nfsd4_channel_attrs se_bchannel;
+        struct nfsd4_slot       *se_slots[];    /* forward channel slots */
+};
+static inline void
+nfsd4_put_session(struct nfs4_session_placeholder_never_used *ses_unused);
+/*
+ * formatted contents of nfs4_sessionid
+ *
+ * A clientid_t (two u32s) followed by two more u32s: 16 bytes in all.
+ */
+struct nfsd4_sessionid {
+        clientid_t      clientid;
+        u32             sequence;
+        u32             reserved;
+};
+#define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
+/*
+ * struct nfs4_client - one per client.  Clientids live here.
+ *      o Each nfs4_client is hashed by clientid.
+ *
+ *      o Each nfs4_client is also hashed by name
+ *        (the opaque quantity initially sent by the client to identify itself).
+ *
+ *      o cl_perclient list is used to ensure no dangling stateowner references
+ *        when we expire the nfs4_client
+ *        NOTE(review): the struct below has no member named cl_perclient;
+ *        this presumably refers to cl_openowners -- confirm.
+ */
+struct nfs4_client {
+        struct list_head        cl_idhash;      /* hash by cl_clientid.id */
+        struct list_head        cl_strhash;     /* hash by cl_name */
+        struct list_head        cl_openowners;
+        struct list_head        cl_delegations;
+        struct list_head        cl_lru;         /* tail queue */
+        struct xdr_netobj       cl_name;        /* id generated by client */
+        char                    cl_recdir[HEXDIR_LEN]; /* recovery dir */
+        nfs4_verifier           cl_verifier;    /* generated by client */
+        time_t                  cl_time;        /* time of last lease renewal */
+        struct sockaddr_storage cl_addr;        /* client ipaddress */
+        u32                     cl_flavor;      /* setclientid pseudoflavor */
+        char                    *cl_principal;  /* setclientid principal name */
+        struct svc_cred         cl_cred;        /* setclientid principal */
+        clientid_t              cl_clientid;    /* generated by server */
+        nfs4_verifier           cl_confirm;     /* generated by server */
+        struct nfs4_cb_conn     cl_cb_conn;     /* callback info */
+        atomic_t                cl_count;       /* ref count */
+        u32                     cl_firststate;  /* recovery dir creation */
+        /* for nfs41 */
+        struct list_head        cl_sessions;
+        struct nfsd4_clid_slot  cl_cs_slot;     /* create_session slot */
+        u32                     cl_exchange_flags;
+        struct nfs4_sessionid   cl_sessionid;
+        /* for nfs41 callbacks */
+        /* We currently support a single back channel with a single slot */
+        unsigned long           cl_cb_slot_busy;
+        u32                     cl_cb_seq_nr;
+        struct svc_xprt         *cl_cb_xprt;    /* 4.1 callback transport */
+        struct rpc_wait_queue   cl_cb_waitq;    /* backchannel callers may */
+                                                /* wait here for slots */
+};
+/* struct nfs4_client_reclaim
+ * one per old client. Populates reset_str_hashtbl. Filled from conf_id_hashtbl
+ * upon lease reset, or from upcall to state_daemon (to read in state
+ * from non-volatile storage) upon reboot.
+ */
+struct nfs4_client_reclaim {
+        struct list_head        cr_strhash;     /* hash by cr_name */
+        char                    cr_recdir[HEXDIR_LEN]; /* recover dir */
+};
+static inline void
+update_stateid(stateid_t *stateid)
+{
+        stateid->si_generation++;       /* only the generation changes; si_opaque is left alone */
+}
+/* A reasonable value for NFSD4_REPLAY_ISIZE was estimated as follows:
+ * The OPEN response, typically the largest, requires
+ *   4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) +  8(verifier) +
+ *   4(deleg. type) + 8(deleg. stateid) + 4(deleg. recall flag) +
+ *   20(deleg. space limit) + ~32(deleg. ace) = 112 bytes
+ */
+#define NFSD4_REPLAY_ISIZE       112 
+/*
+ * Replay buffer, where the result of the last seqid-mutating operation
+ * is cached.
+ *
+ * rp_ibuf is an inline buffer of NFSD4_REPLAY_ISIZE bytes embedded in
+ * the structure.  rp_buf and rp_buflen presumably describe the cached
+ * reply, with rp_buf normally pointing at rp_ibuf -- confirm in the
+ * users.
+ *
+ * NOTE(review): "unsigned intrp_allocated" declares a member named
+ * intrp_allocated of type unsigned.  It looks like a typo for
+ * "unsigned int rp_allocated"; it is left as is because renaming a
+ * member would change the structure's interface.
+ */
+struct nfs4_replay {
+        __be32                  rp_status;
+        unsigned int            rp_buflen;
+        char                    *rp_buf;
+        unsigned                intrp_allocated;
+        struct knfsd_fh         rp_openfh;
+        char                    rp_ibuf[NFSD4_REPLAY_ISIZE];
+};
+/*
+* nfs4_stateowner can either be an open_owner, or a lock_owner
+*
+*    so_idhash:  stateid_hashtbl[] for open owner, lockstateid_hashtbl[]
+*         for lock_owner
+*    so_strhash: ownerstr_hashtbl[] for open_owner, lock_ownerstr_hashtbl[]
+*         for lock_owner
+*    so_perclient: nfs4_client->cl_perclient entry - used when nfs4_client
+*         struct is reaped.
+*    so_perfilestate: heads the list of nfs4_stateid (either open or lock)
+*         and is used to ensure no dangling nfs4_stateid references when we
+*         release a stateowner.
+*    so_perlockowner: (open) nfs4_stateid->st_perlockowner entry - used when
+*         close is called to reap associated byte-range locks
+*    so_close_lru: (open) stateowner is placed on this list instead of being
+*         reaped (when so_perfilestate is empty) to hold the last close replay.
+*         reaped by laundromat thread after lease period.
+*
+*    NOTE(review): so_perfilestate and so_perlockowner are not members of
+*    the struct below; they presumably correspond to so_stateids and
+*    so_perstateid respectively -- confirm.
+*/
+struct nfs4_stateowner {
+        struct kref             so_ref;
+        struct list_head        so_idhash;   /* hash by so_id */
+        struct list_head        so_strhash;   /* hash by op_name */
+        struct list_head        so_perclient;
+        struct list_head        so_stateids;
+        struct list_head        so_perstateid; /* for lockowners only */
+        struct list_head        so_close_lru; /* tail queue */
+        time_t                  so_time; /* time of placement on so_close_lru */
+        int                     so_is_open_owner; /* 1=openowner,0=lockowner */
+        u32                     so_id;
+        struct nfs4_client *    so_client;
+        /* after increment in ENCODE_SEQID_OP_TAIL, represents the next
+         * sequence id expected from the client: */
+        u32                     so_seqid;
+        struct xdr_netobj       so_owner;     /* open owner name */
+        int                     so_confirmed; /* successful OPEN_CONFIRM? */
+        struct nfs4_replay      so_replay;
+};
+/*
+*  nfs4_file: a file opened by some number of (open) nfs4_stateowners.
+*    o fi_perfile list is used to search for conflicting
+*      share_access, share_deny on the file.
+*      NOTE(review): the struct below has no fi_perfile member;
+*      presumably fi_stateids is meant -- confirm.
+*/
+struct nfs4_file {
+        atomic_t                fi_ref;
+        struct list_head        fi_hash;    /* hash by "struct inode *" */
+        struct list_head        fi_stateids;
+        struct list_head        fi_delegations;
+        struct inode            *fi_inode;
+        u32                     fi_id;      /* used with stateowner->so_id
+                                             * for stateid_hashtbl hash */
+        bool                    fi_had_conflict;
+};
+/*
+* nfs4_stateid can either be an open stateid or (eventually) a lock stateid
+*
+* (open)nfs4_stateid: one per (open)nfs4_stateowner, nfs4_file
+*
+*       st_hash: stateid_hashtbl[] entry or lockstateid_hashtbl entry
+*       st_perfile: file_hashtbl[] entry.
+*       st_perfile_state: nfs4_stateowner->so_perfilestate
+*       st_perlockowner: (open stateid) list of lock nfs4_stateowners
+*       st_access_bmap: used only for open stateid
+*       st_deny_bmap: used only for open stateid
+*       st_openstp: open stateid lock stateid was derived from
+*
+* NOTE(review): st_perfile_state and st_perlockowner are not members of
+* the struct below; they presumably correspond to st_perstateowner and
+* st_lockowners respectively -- confirm.
+*
+* XXX: open stateids and lock stateids have diverged sufficiently that
+* we should consider defining separate structs for the two cases.
+*/
+struct nfs4_stateid {
+        struct list_head              st_hash; 
+        struct list_head              st_perfile;
+        struct list_head              st_perstateowner;
+        struct list_head              st_lockowners;
+        struct nfs4_stateowner      * st_stateowner;
+        struct nfs4_file            * st_file;
+        stateid_t                     st_stateid;
+        struct file                 * st_vfs_file;
+        unsigned long                 st_access_bmap;
+        unsigned long                 st_deny_bmap;
+        struct nfs4_stateid         * st_openstp;
+};
+/*
+ * flags for preprocess_seqid_op()
+ *
+ * Each flag is a distinct single bit, so they can be OR-ed together.
+ *
+ * seqid_mutating_err(err), defined after the flags, is true for every
+ * value of err except nfserr_stale_clientid, nfserr_bad_seqid,
+ * nfserr_stale_stateid and nfserr_bad_stateid.  Its argument is
+ * evaluated four times, so it must not have side effects.
+ */
+#define HAS_SESSION             0x00000001
+#define CONFIRM                 0x00000002
+#define OPEN_STATE              0x00000004
+#define LOCK_STATE              0x00000008
+#define RD_STATE                0x00000010
+#define WR_STATE                0x00000020
+#define CLOSE_STATE             0x00000040
+#define seqid_mutating_err(err)                       \
+        (((err) != nfserr_stale_clientid) &&    \
+        ((err) != nfserr_bad_seqid) &&          \
+        ((err) != nfserr_stale_stateid) &&      \
+        ((err) != nfserr_bad_stateid))
+struct nfsd4_compound_state;
+extern __be32 nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
+                stateid_t *stateid, int flags, struct file **filp);
+extern void nfs4_lock_state(void);
+extern void nfs4_unlock_state(void);
+extern int nfs4_in_grace(void);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid);
+extern void put_nfs4_client(struct nfs4_client *clp);
+extern void nfs4_free_stateowner(struct kref *kref);
+extern int set_callback_cred(void);
+extern void nfsd4_probe_callback(struct nfs4_client *clp);
+extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfs4_put_delegation(struct nfs4_delegation *dp);
+extern __be32 nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
+extern void nfsd4_init_recdir(char *recdir_name);
+extern int nfsd4_recdir_load(void);
+extern void nfsd4_shutdown_recdir(void);
+extern int nfs4_client_to_reclaim(const char *name);
+extern int nfs4_has_reclaimed_state(const char *name, bool use_exchange_id);
+extern void nfsd4_recdir_purge_old(void);
+extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
+extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
+static inline void
+nfs4_put_stateowner(struct nfs4_stateowner *so)
+{
+        kref_put(&so->so_ref, nfs4_free_stateowner);    /* nfs4_free_stateowner() is the kref release callback */
+}
+static inline void
+nfs4_get_stateowner(struct nfs4_stateowner *so)
+{
+        kref_get(&so->so_ref);  /* take an extra reference on so_ref */
+}
+#endif   /* _NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 71944cddf680..5232d3e8fb2f 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -1,6 +1,4 @@
 /*
- * linux/fs/nfsd/stats.c
- *
 * procfs-based user access to knfsd statistics
 *
 * /proc/net/rpc/nfsd
@@ -23,18 +21,13 @@
 * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
 */
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/stat.h>
 #include <linux/module.h>
-#include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/stats.h>
-#include <linux/nfsd/nfsd.h>
 #include <linux/nfsd/stats.h>
+#include "nfsd.h"
 struct nfsd_stats       nfsdstats;
 struct svc_stat         nfsd_svcstats = {
        .program        = &nfsd_program,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index a293f0273263..c194793b642b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1,7 +1,5 @@
 #define MSNFS   /* HACK HACK */
 /*
- * linux/fs/nfsd/vfs.c
- *
 * File operations used by nfsd. Some of these have been ripped from
 * other parts of the kernel because they weren't exported, others
 * are partial duplicates with added or changed functionality.
@@ -16,48 +14,31 @@
 * Zerocpy NFS support (C) 2002 Hirokazu Takahashi <taka@valinux.co.jp>
 */
-#include <linux/string.h>
-#include <linux/time.h>
-#include <linux/errno.h>
 #include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/major.h>
 #include <linux/splice.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
 #include <linux/fcntl.h>
-#include <linux/net.h>
-#include <linux/unistd.h>
-#include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/in.h>
-#include <linux/module.h>
 #include <linux/namei.h>
-#include <linux/vfs.h>
 #include <linux/delay.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#ifdef CONFIG_NFSD_V3
-#include <linux/nfs3.h>
-#include <linux/nfsd/xdr3.h>
-#endif /* CONFIG_NFSD_V3 */
-#include <linux/nfsd/nfsfh.h>
 #include <linux/quotaops.h>
 #include <linux/fsnotify.h>
-#include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
+#include <linux/jhash.h>
+#include <linux/ima.h>
+#include <asm/uaccess.h>
+#ifdef CONFIG_NFSD_V3
+#include "xdr3.h"
+#endif /* CONFIG_NFSD_V3 */
 #ifdef CONFIG_NFSD_V4
-#include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
 #include <linux/nfsd_idmap.h>
-#include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
-#include <linux/jhash.h>
-#include <linux/ima.h>
-#include <asm/uaccess.h>
+#include "nfsd.h"
+#include "vfs.h"
 #define NFSDDBG_FACILITY                NFSDDBG_FILEOP
@@ -89,12 +70,6 @@ struct raparm_hbucket {
 #define RAPARM_HASH_MASK        (RAPARM_HASH_SIZE-1)
 static struct raparm_hbucket    raparm_hash[RAPARM_HASH_SIZE];
-static inline int
-nfsd_v4client(struct svc_rqst *rq)
-{
-    return rq->rq_prog == NFS_PROGRAM && rq->rq_vers == 4;
-}
 /* 
 * Called from nfsd_lookup and encode_dirent. Check if we have crossed 
 * a mount point.
@@ -116,8 +91,16 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
        exp2 = rqst_exp_get_by_name(rqstp, &path);
        if (IS_ERR(exp2)) {
-                if (PTR_ERR(exp2) != -ENOENT)
+                err = PTR_ERR(exp2);
-                        err = PTR_ERR(exp2);
+                /*
+                 * We normally allow NFS clients to continue
+                 * "underneath" a mountpoint that is not exported.
+                 * The exception is V4ROOT, where no traversal is ever
+                 * allowed without an explicit export of the new
+                 * directory.
+                 */
+                if (err == -ENOENT && !(exp->ex_flags & NFSEXP_V4ROOT))
+                        err = 0;
                path_put(&path);
                goto out;
        }
@@ -141,6 +124,53 @@ out:
        return err;
 }
+static void follow_to_parent(struct path *path)
+{       /*
+         * Replace path->dentry with its parent, in place.  While the
+         * dentry is the root of path->mnt, follow_up() is called first,
+         * repeatedly, until it fails or the dentry is no longer a
+         * mount root.
+         */
+        struct dentry *dp;
+        while (path->dentry == path->mnt->mnt_root && follow_up(path))
+                ;
+        dp = dget_parent(path->dentry);
+        dput(path->dentry);     /* drop the old dentry; dp carries the new reference */
+        path->dentry = dp;
+}
+static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, struct svc_export **exp, struct dentry **dentryp)
+{       /*
+         * Look up the parent of @dparent, starting from the mount of
+         * *exp, and ask rqst_exp_parent() for the export covering it.
+         * Returns 0 with a referenced dentry stored in *dentryp, or the
+         * error from rqst_exp_parent() (other than -ENOENT), in which
+         * case *dentryp and *exp are left untouched.
+         */
+        struct svc_export *exp2;
+        struct path path = {.mnt = mntget((*exp)->ex_path.mnt),
+                            .dentry = dget(dparent)};   /* path owns its own mnt and dentry references */
+        follow_to_parent(&path);
+        exp2 = rqst_exp_parent(rqstp, &path);
+        if (PTR_ERR(exp2) == -ENOENT) { /* -ENOENT: stay on dparent, *exp unchanged */
+                *dentryp = dget(dparent);
+        } else if (IS_ERR(exp2)) {      /* any other error: release path and fail */
+                path_put(&path);
+                return PTR_ERR(exp2);
+        } else {        /* found: hand back the parent and replace *exp with exp2 */
+                *dentryp = dget(path.dentry);
+                exp_put(*exp);
+                *exp = exp2;
+        }
+        path_put(&path);        /* *dentryp holds its own reference */
+        return 0;
+}
+/*
+ * For nfsd purposes, we treat V4ROOT exports as though there was an
+ * export at *every* directory.
+ *
+ * Returns 1 if d_mountpoint() reports @dentry as a mountpoint.
+ * Otherwise returns 0 when @exp does not have NFSEXP_V4ROOT set, and
+ * for a V4ROOT export returns nonzero exactly when @dentry has an
+ * inode.
+ */
+int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp)
+{
+        if (d_mountpoint(dentry))
+                return 1;
+        if (!(exp->ex_flags & NFSEXP_V4ROOT))
+                return 0;
+        return dentry->d_inode != NULL;
+}
 __be32
 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
                   const char *name, unsigned int len,
@@ -169,35 +199,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
                        dentry = dget(dparent);
                else if (dparent != exp->ex_path.dentry)
                        dentry = dget_parent(dparent);
-                else if (!EX_NOHIDE(exp))
+                else if (!EX_NOHIDE(exp) && !nfsd_v4client(rqstp))
                        dentry = dget(dparent); /* .. == . just like at / */
                else {
                        /* checking mountpoint crossing is very different when stepping up */
-                        struct svc_export *exp2 = NULL;
+                        host_err = nfsd_lookup_parent(rqstp, dparent, &exp, &dentry);
-                        struct dentry *dp;
+                        if (host_err)
-                        struct path path = {.mnt = mntget(exp->ex_path.mnt),
-                                            .dentry = dget(dparent)};
-                        while (path.dentry == path.mnt->mnt_root &&
-                               follow_up(&path))
-                                ;
-                        dp = dget_parent(path.dentry);
-                        dput(path.dentry);
-                        path.dentry = dp;
-                        exp2 = rqst_exp_parent(rqstp, &path);
-                        if (PTR_ERR(exp2) == -ENOENT) {
-                                dentry = dget(dparent);
-                        } else if (IS_ERR(exp2)) {
-                                host_err = PTR_ERR(exp2);
-                                path_put(&path);
                                goto out_nfserr;
-                        } else {
-                                dentry = dget(path.dentry);
-                                exp_put(exp);
-                                exp = exp2;
-                        }
-                        path_put(&path);
                }
        } else {
                fh_lock(fhp);
@@ -208,7 +216,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
                /*
                 * check if we have crossed a mount point ...
                 */
-                if (d_mountpoint(dentry)) {
+                if (nfsd_mountpoint(dentry, exp)) {
                        if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
                                dput(dentry);
                                goto out_nfserr;
@@ -744,8 +752,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                            flags, current_cred());
        if (IS_ERR(*filp))
                host_err = PTR_ERR(*filp);
-        else
-                ima_counts_get(*filp);
 out_nfserr:
        err = nfserrno(host_err);
 out:
@@ -774,12 +780,9 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
        int (*fsync) (struct file *, struct dentry *, int);
        int err;
-        err = filemap_fdatawrite(inode->i_mapping);
+        err = filemap_write_and_wait(inode->i_mapping);
        if (err == 0 && fop && (fsync = fop->fsync))
                err = fsync(filp, dp, 0);
-        if (err == 0)
-                err = filemap_fdatawait(inode->i_mapping);
        return err;
 }
@@ -2124,8 +2127,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
         */
        path.mnt = exp->ex_path.mnt;
        path.dentry = dentry;
-        err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
+        err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC));
-                             IMA_COUNT_LEAVE);
 nfsd_out:
        return err? nfserrno(err) : 0;
 }
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
new file mode 100644
index 000000000000..4b1de0a9ea75
--- /dev/null
+++ b/fs/nfsd/vfs.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef LINUX_NFSD_VFS_H
+#define LINUX_NFSD_VFS_H
+#include "nfsfh.h"
+/*
+ * Flags for nfsd_permission
+ *
+ * The three lowest bits are marked as equal to the generic MAY_EXEC,
+ * MAY_WRITE and MAY_READ values; the remaining bits are nfsd-specific.
+ * NFSD_MAY_CREATE and NFSD_MAY_REMOVE are combinations of the single-bit
+ * flags rather than bits of their own.
+ */
+#define NFSD_MAY_NOP            0
+#define NFSD_MAY_EXEC           1 /* == MAY_EXEC */
+#define NFSD_MAY_WRITE          2 /* == MAY_WRITE */
+#define NFSD_MAY_READ           4 /* == MAY_READ */
+#define NFSD_MAY_SATTR          8
+#define NFSD_MAY_TRUNC          16
+#define NFSD_MAY_LOCK           32
+#define NFSD_MAY_OWNER_OVERRIDE 64
+#define NFSD_MAY_LOCAL_ACCESS   128 /* IRIX doing local access check on device special file*/
+#define NFSD_MAY_BYPASS_GSS_ON_ROOT 256
+#define NFSD_MAY_CREATE         (NFSD_MAY_EXEC|NFSD_MAY_WRITE)
+#define NFSD_MAY_REMOVE         (NFSD_MAY_EXEC|NFSD_MAY_WRITE|NFSD_MAY_TRUNC)
+/*
+ * Callback function for readdir
+ */
+typedef int (*nfsd_dirop_t)(struct inode *, struct dentry *, int, int);
+/* nfsd/vfs.c */
+int             fh_lock_parent(struct svc_fh *, struct dentry *);
+int             nfsd_racache_init(int);
+void            nfsd_racache_shutdown(void);
+int             nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
+                                struct svc_export **expp);
+__be32          nfsd_lookup(struct svc_rqst *, struct svc_fh *,
+                                const char *, unsigned int, struct svc_fh *);
+__be32           nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
+                                const char *, unsigned int,
+                                struct svc_export **, struct dentry **);
+__be32          nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+                                struct iattr *, int, time_t);
+int nfsd_mountpoint(struct dentry *, struct svc_export *);
+#ifdef CONFIG_NFSD_V4
+__be32          nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
+                    struct nfs4_acl *);
+int             nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
+#endif /* CONFIG_NFSD_V4 */
+__be32          nfsd_create(struct svc_rqst *, struct svc_fh *,
+                                char *name, int len, struct iattr *attrs,
+                                int type, dev_t rdev, struct svc_fh *res);
+#ifdef CONFIG_NFSD_V3
+__be32          nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
+__be32          nfsd_create_v3(struct svc_rqst *, struct svc_fh *,
+                                char *name, int len, struct iattr *attrs,
+                                struct svc_fh *res, int createmode,
+                                u32 *verifier, int *truncp, int *created);
+__be32          nfsd_commit(struct svc_rqst *, struct svc_fh *,
+                                loff_t, unsigned long);
+#endif /* CONFIG_NFSD_V3 */
+__be32          nfsd_open(struct svc_rqst *, struct svc_fh *, int,
+                                int, struct file **);
+void            nfsd_close(struct file *);
+__be32          nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *,
+                                loff_t, struct kvec *, int, unsigned long *);
+__be32          nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
+                                loff_t, struct kvec *,int, unsigned long *, int *);
+__be32          nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+                                char *, int *);
+__be32          nfsd_symlink(struct svc_rqst *, struct svc_fh *,
+                                char *name, int len, char *path, int plen,
+                                struct svc_fh *res, struct iattr *);
+__be32          nfsd_link(struct svc_rqst *, struct svc_fh *,
+                                char *, int, struct svc_fh *);
+__be32          nfsd_rename(struct svc_rqst *,
+                                struct svc_fh *, char *, int,
+                                struct svc_fh *, char *, int);
+__be32          nfsd_remove(struct svc_rqst *,
+                                struct svc_fh *, char *, int);
+__be32          nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
+                                char *name, int len);
+int             nfsd_truncate(struct svc_rqst *, struct svc_fh *,
+                                unsigned long size);
+__be32          nfsd_readdir(struct svc_rqst *, struct svc_fh *,
+                             loff_t *, struct readdir_cd *, filldir_t);
+__be32          nfsd_statfs(struct svc_rqst *, struct svc_fh *,
+                                struct kstatfs *, int access);
+int             nfsd_notify_change(struct inode *, struct iattr *);
+__be32          nfsd_permission(struct svc_rqst *, struct svc_export *,
+                                struct dentry *, int);
+int             nfsd_sync_dir(struct dentry *dp);
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
+int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
+#endif
+#endif /* LINUX_NFSD_VFS_H */
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
new file mode 100644
index 000000000000..53b1863dd8f6
--- /dev/null
+++ b/fs/nfsd/xdr.h
@@ -0,0 +1,173 @@
+/* XDR types for nfsd. This is mainly a typing exercise. NOTE(review): the include guard below is LINUX_NFSD_H, not an xdr-specific name; confirm "nfsd.h" uses a different guard macro, otherwise its body is skipped when it is first included from here. */
+#ifndef LINUX_NFSD_H
+#define LINUX_NFSD_H
+#include <linux/vfs.h>
+#include "nfsd.h"
+#include "nfsfh.h"
+struct nfsd_fhandle {
+        struct svc_fh           fh;
+};
+struct nfsd_sattrargs {
+        struct svc_fh           fh;
+        struct iattr            attrs;
+};
+struct nfsd_diropargs {
+        struct svc_fh           fh;
+        char *                  name;
+        unsigned int            len;
+};
+struct nfsd_readargs {
+        struct svc_fh           fh;
+        __u32                   offset;
+        __u32                   count;
+        int                     vlen;
+};
+struct nfsd_writeargs {        /* argument type of nfssvc_decode_writeargs() */
+        struct svc_fh           fh;     /* struct tag spelled out, as in every sibling args struct */
+        __u32                   offset;
+        int                     len;
+        int                     vlen;
+};
+struct nfsd_createargs {
+        struct svc_fh           fh;
+        char *                  name;
+        unsigned int            len;
+        struct iattr            attrs;
+};
+struct nfsd_renameargs {
+        struct svc_fh           ffh;
+        char *                  fname;
+        unsigned int            flen;
+        struct svc_fh           tfh;
+        char *                  tname;
+        unsigned int            tlen;
+};
+struct nfsd_readlinkargs {     /* argument type of nfssvc_decode_readlinkargs() */
+        struct svc_fh           fh;
+        char *                  buffer;
+};
+
+struct nfsd_linkargs {
+        struct svc_fh           ffh;
+        struct svc_fh           tfh;
+        char *                  tname;
+        unsigned int            tlen;
+};
+struct nfsd_symlinkargs {
+        struct svc_fh           ffh;
+        char *                  fname;
+        unsigned int            flen;
+        char *                  tname;
+        unsigned int            tlen;
+        struct iattr            attrs;
+};
+struct nfsd_readdirargs {
+        struct svc_fh           fh;
+        __u32                   cookie;
+        __u32                   count;
+        __be32 *                buffer;
+};
+struct nfsd_attrstat {
+        struct svc_fh           fh;
+        struct kstat            stat;
+};
+struct nfsd_diropres  {
+        struct svc_fh           fh;
+        struct kstat            stat;
+};
+struct nfsd_readlinkres {
+        int                     len;
+};
+struct nfsd_readres {
+        struct svc_fh           fh;
+        unsigned long           count;
+        struct kstat            stat;
+};
+struct nfsd_readdirres {
+        int                     count;
+        struct readdir_cd       common;
+        __be32 *                buffer;
+        int                     buflen;
+        __be32 *                offset;
+};
+struct nfsd_statfsres {
+        struct kstatfs          stats;
+};
+/*
+ * Storage requirements for XDR arguments and results.
+ */
+union nfsd_xdrstore {
+        struct nfsd_sattrargs   sattr;
+        struct nfsd_diropargs   dirop;
+        struct nfsd_readargs    read;
+        struct nfsd_writeargs   write;
+        struct nfsd_createargs  create;
+        struct nfsd_renameargs  rename;
+        struct nfsd_linkargs    link;
+        struct nfsd_symlinkargs symlink;
+        struct nfsd_readdirargs readdir;
+};
+#define NFS2_SVC_XDRSIZE        sizeof(union nfsd_xdrstore)
+int nfssvc_decode_void(struct svc_rqst *, __be32 *, void *);
+int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
+int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_sattrargs *);
+int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_diropargs *);
+int nfssvc_decode_readargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_readargs *);
+int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_writeargs *);
+int nfssvc_decode_createargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_createargs *);
+int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_renameargs *);
+int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_readlinkargs *);
+int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_linkargs *);
+int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_symlinkargs *);
+int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *,
+                                struct nfsd_readdirargs *);
+int nfssvc_encode_void(struct svc_rqst *, __be32 *, void *);
+int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *, struct nfsd_attrstat *);
+int nfssvc_encode_diropres(struct svc_rqst *, __be32 *, struct nfsd_diropres *);
+int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *, struct nfsd_readlinkres *);
+int nfssvc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd_readres *);
+int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *, struct nfsd_statfsres *);
+int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *, struct nfsd_readdirres *);
+int nfssvc_encode_entry(void *, const char *name,
+                        int namlen, loff_t offset, u64 ino, unsigned int);
+int nfssvc_release_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
+/* Helper functions for NFSv2 ACL code */
+__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp);
+__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+#endif /* LINUX_NFSD_H */
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
new file mode 100644
index 000000000000..7df980eb0562
--- /dev/null
+++ b/fs/nfsd/xdr3.h
@@ -0,0 +1,344 @@
+/*
+ * XDR types for NFSv3 in nfsd.
+ *
+ * Copyright (C) 1996-1998, Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef _LINUX_NFSD_XDR3_H
+#define _LINUX_NFSD_XDR3_H
+#include "xdr.h"
+struct nfsd3_sattrargs {
+        struct svc_fh           fh;
+        struct iattr            attrs;
+        int                     check_guard;
+        time_t                  guardtime;
+};
+struct nfsd3_diropargs {
+        struct svc_fh           fh;
+        char *                  name;
+        unsigned int            len;
+};
+struct nfsd3_accessargs {
+        struct svc_fh           fh;
+        unsigned int            access;
+};
+struct nfsd3_readargs {
+        struct svc_fh           fh;
+        __u64                   offset;
+        __u32                   count;
+        int                     vlen;
+};
+struct nfsd3_writeargs {       /* argument type of nfs3svc_decode_writeargs() */
+        struct svc_fh           fh;     /* struct tag spelled out, as in every sibling args struct */
+        __u64                   offset;
+        __u32                   count;
+        int                     stable;
+        __u32                   len;
+        int                     vlen;
+};
+struct nfsd3_createargs {
+        struct svc_fh           fh;
+        char *                  name;
+        unsigned int            len;
+        int                     createmode;
+        struct iattr            attrs;
+        __be32 *                verf;
+};
+struct nfsd3_mknodargs {
+        struct svc_fh           fh;
+        char *                  name;
+        unsigned int            len;
+        __u32                   ftype;
+        __u32                   major, minor;
+        struct iattr            attrs;
+};
+struct nfsd3_renameargs {
+        struct svc_fh           ffh;
+        char *                  fname;
+        unsigned int            flen;
+        struct svc_fh           tfh;
+        char *                  tname;
+        unsigned int            tlen;
+};
+struct nfsd3_readlinkargs {
+        struct svc_fh           fh;
+        char *                  buffer;
+};
+struct nfsd3_linkargs {
+        struct svc_fh           ffh;
+        struct svc_fh           tfh;
+        char *                  tname;
+        unsigned int            tlen;
+};
+struct nfsd3_symlinkargs {
+        struct svc_fh           ffh;
+        char *                  fname;
+        unsigned int            flen;
+        char *                  tname;
+        unsigned int            tlen;
+        struct iattr            attrs;
+};
+struct nfsd3_readdirargs {
+        struct svc_fh           fh;
+        __u64                   cookie;
+        __u32                   dircount;
+        __u32                   count;
+        __be32 *                verf;
+        __be32 *                buffer;
+};
+struct nfsd3_commitargs {
+        struct svc_fh           fh;
+        __u64                   offset;
+        __u32                   count;
+};
+struct nfsd3_getaclargs {
+        struct svc_fh           fh;
+        int                     mask;
+};
+struct posix_acl;
+struct nfsd3_setaclargs {
+        struct svc_fh           fh;
+        int                     mask;
+        struct posix_acl        *acl_access;
+        struct posix_acl        *acl_default;
+};
+struct nfsd3_attrstat {
+        __be32                  status;
+        struct svc_fh           fh;
+        struct kstat            stat;
+};
+/* LOOKUP, CREATE, MKDIR, SYMLINK, MKNOD */
+struct nfsd3_diropres  {
+        __be32                  status;
+        struct svc_fh           dirfh;
+        struct svc_fh           fh;
+};
+struct nfsd3_accessres {
+        __be32                  status;
+        struct svc_fh           fh;
+        __u32                   access;
+};
+struct nfsd3_readlinkres {
+        __be32                  status;
+        struct svc_fh           fh;
+        __u32                   len;
+};
+struct nfsd3_readres {
+        __be32                  status;
+        struct svc_fh           fh;
+        unsigned long           count;
+        int                     eof;
+};
+struct nfsd3_writeres {
+        __be32                  status;
+        struct svc_fh           fh;
+        unsigned long           count;
+        int                     committed;
+};
+struct nfsd3_renameres {
+        __be32                  status;
+        struct svc_fh           ffh;
+        struct svc_fh           tfh;
+};
+struct nfsd3_linkres {
+        __be32                  status;
+        struct svc_fh           tfh;
+        struct svc_fh           fh;
+};
+struct nfsd3_readdirres {
+        __be32                  status;
+        struct svc_fh           fh;
+        int                     count;
+        __be32                  verf[2];
+        struct readdir_cd       common;
+        __be32 *                buffer;
+        int                     buflen;
+        __be32 *                offset;
+        __be32 *                offset1;
+        struct svc_rqst *       rqstp;
+};
+struct nfsd3_fsstatres {
+        __be32                  status;
+        struct kstatfs          stats;
+        __u32                   invarsec;
+};
+struct nfsd3_fsinfores {
+        __be32                  status;
+        __u32                   f_rtmax;
+        __u32                   f_rtpref;
+        __u32                   f_rtmult;
+        __u32                   f_wtmax;
+        __u32                   f_wtpref;
+        __u32                   f_wtmult;
+        __u32                   f_dtpref;
+        __u64                   f_maxfilesize;
+        __u32                   f_properties;
+};
+struct nfsd3_pathconfres {
+        __be32                  status;
+        __u32                   p_link_max;
+        __u32                   p_name_max;
+        __u32                   p_no_trunc;
+        __u32                   p_chown_restricted;
+        __u32                   p_case_insensitive;
+        __u32                   p_case_preserving;
+};
+struct nfsd3_commitres {
+        __be32                  status;
+        struct svc_fh           fh;
+};
+struct nfsd3_getaclres {
+        __be32                  status;
+        struct svc_fh           fh;
+        int                     mask;
+        struct posix_acl        *acl_access;
+        struct posix_acl        *acl_default;
+};
+/* dummy type for release; this is the argument type of nfs3svc_release_fhandle2() */
+struct nfsd3_fhandle_pair {
+        __u32                   dummy;  /* same position as 'status' in nfsd3_renameres/nfsd3_linkres; presumably so those results can be passed as a pair - TODO confirm */
+        struct svc_fh           fh1;
+        struct svc_fh           fh2;
+};
+/*
+ * Storage requirements for XDR arguments and results.
+ */
+union nfsd3_xdrstore {
+        struct nfsd3_sattrargs          sattrargs;
+        struct nfsd3_diropargs          diropargs;
+        struct nfsd3_readargs           readargs;
+        struct nfsd3_writeargs          writeargs;
+        struct nfsd3_createargs         createargs;
+        struct nfsd3_renameargs         renameargs;
+        struct nfsd3_linkargs           linkargs;
+        struct nfsd3_symlinkargs        symlinkargs;
+        struct nfsd3_readdirargs        readdirargs;
+        struct nfsd3_diropres           diropres;
+        struct nfsd3_accessres          accessres;
+        struct nfsd3_readlinkres        readlinkres;
+        struct nfsd3_readres            readres;
+        struct nfsd3_writeres           writeres;
+        struct nfsd3_renameres          renameres;
+        struct nfsd3_linkres            linkres;
+        struct nfsd3_readdirres         readdirres;
+        struct nfsd3_fsstatres          fsstatres;
+        struct nfsd3_fsinfores          fsinfores;
+        struct nfsd3_pathconfres        pathconfres;
+        struct nfsd3_commitres          commitres;
+        struct nfsd3_getaclres          getaclres;
+};
+#define NFS3_SVC_XDRSIZE                sizeof(union nfsd3_xdrstore)
+int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *, struct nfsd_fhandle *);
+int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_sattrargs *);
+int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_diropargs *);
+int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_accessargs *);
+int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_readargs *);
+int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_writeargs *);
+int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_createargs *);
+int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_createargs *);
+int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_mknodargs *);
+int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_renameargs *);
+int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_readlinkargs *);
+int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_linkargs *);
+int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_symlinkargs *);
+int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_readdirargs *);
+int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_readdirargs *);
+int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *,
+                                struct nfsd3_commitargs *);
+int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
+int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *,
+                                struct nfsd3_attrstat *);
+int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *,
+                                struct nfsd3_attrstat *);
+int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_diropres *);
+int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_accessres *);
+int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_readlinkres *);
+int nfs3svc_encode_readres(struct svc_rqst *, __be32 *, struct nfsd3_readres *);
+int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *, struct nfsd3_writeres *);
+int nfs3svc_encode_createres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_diropres *);
+int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_renameres *);
+int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_linkres *);
+int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_readdirres *);
+int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_fsstatres *);
+int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *,
+                                struct nfsd3_fsinfores *);
+int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_pathconfres *);
+int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *,
+                                struct nfsd3_commitres *);
+int nfs3svc_release_fhandle(struct svc_rqst *, __be32 *,
+                                struct nfsd3_attrstat *);
+int nfs3svc_release_fhandle2(struct svc_rqst *, __be32 *,
+                                struct nfsd3_fhandle_pair *);
+int nfs3svc_encode_entry(void *, const char *name,
+                                int namlen, loff_t offset, u64 ino,
+                                unsigned int);
+int nfs3svc_encode_entry_plus(void *, const char *name,
+                                int namlen, loff_t offset, u64 ino,
+                                unsigned int);
+/* Helper functions for NFSv3 ACL code */
+__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
+                                struct svc_fh *fhp);
+__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+#endif /* _LINUX_NFSD_XDR3_H */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
new file mode 100644
index 000000000000..efa337739534
--- /dev/null
+++ b/fs/nfsd/xdr4.h
@@ -0,0 +1,562 @@
+/*
+ *  Server-side types for NFSv4.
+ *
+ *  Copyright (c) 2002 The Regents of the University of Michigan.
+ *  All rights reserved.
+ *
+ *  Kendrick Smith <kmsmith@umich.edu>
+ *  Andy Adamson   <andros@umich.edu>
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *  3. Neither the name of the University nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ *  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _LINUX_NFSD_XDR4_H
+#define _LINUX_NFSD_XDR4_H
+#include "state.h"
+#include "nfsd.h"
+#define NFSD4_MAX_TAGLEN        128
+#define XDR_LEN(n)                     (((n) + 3) & ~3) /* for non-negative n: n rounded up to the next multiple of 4 */
+struct nfsd4_compound_state {
+        struct svc_fh           current_fh;
+        struct svc_fh           save_fh;
+        struct nfs4_stateowner  *replay_owner;
+        /* For sessions DRC */
+        struct nfsd4_session    *session;
+        struct nfsd4_slot       *slot;
+        __be32                  *datap;
+        size_t                  iovlen;
+        u32                     minorversion;
+        u32                     status;
+};
+static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
+{       /* Reports whether cs has a slot attached; cs->session itself is not examined. */
+        return !!cs->slot;
+}
+struct nfsd4_change_info {
+        u32             atomic;
+        bool            change_supported;
+        u32             before_ctime_sec;
+        u32             before_ctime_nsec;
+        u64             before_change;
+        u32             after_ctime_sec;
+        u32             after_ctime_nsec;
+        u64             after_change;
+};
+struct nfsd4_access {
+        u32             ac_req_access;      /* request */
+        u32             ac_supported;       /* response */
+        u32             ac_resp_access;     /* response */
+};
+struct nfsd4_close {
+        u32             cl_seqid;           /* request */
+        stateid_t       cl_stateid;         /* request+response */
+        struct nfs4_stateowner * cl_stateowner; /* response */
+};
+struct nfsd4_commit {
+        u64             co_offset;          /* request */
+        u32             co_count;           /* request */
+        nfs4_verifier   co_verf;            /* response */
+};
+struct nfsd4_create {
+        u32             cr_namelen;         /* request */
+        char *          cr_name;            /* request */
+        u32             cr_type;            /* request */
+        union {                             /* request */
+                struct {
+                        u32 namelen;
+                        char *name;
+                } link;   /* NF4LNK */
+                struct {
+                        u32 specdata1;
+                        u32 specdata2;
+                } dev;    /* NF4BLK, NF4CHR */
+        } u;
+        u32             cr_bmval[3];        /* request */
+        struct iattr    cr_iattr;           /* request */
+        struct nfsd4_change_info  cr_cinfo; /* response */
+        struct nfs4_acl *cr_acl;
+};
+#define cr_linklen      u.link.namelen
+#define cr_linkname     u.link.name
+#define cr_specdata1    u.dev.specdata1
+#define cr_specdata2    u.dev.specdata2
+struct nfsd4_delegreturn {
+        stateid_t       dr_stateid;
+};
+struct nfsd4_getattr {
+        u32             ga_bmval[3];        /* request */
+        struct svc_fh   *ga_fhp;            /* response */
+};
+struct nfsd4_link {
+        u32             li_namelen;         /* request */
+        char *          li_name;            /* request */
+        struct nfsd4_change_info  li_cinfo; /* response */
+};
+struct nfsd4_lock_denied {
+        clientid_t      ld_clientid;
+        struct nfs4_stateowner   *ld_sop;
+        u64             ld_start;
+        u64             ld_length;
+        u32             ld_type;
+};
+struct nfsd4_lock {
+        /* request */
+        u32             lk_type;
+        u32             lk_reclaim;         /* boolean */
+        u64             lk_offset;
+        u64             lk_length;
+        u32             lk_is_new;
+        union {
+                struct {
+                        u32             open_seqid;
+                        stateid_t       open_stateid;
+                        u32             lock_seqid;
+                        clientid_t      clientid;
+                        struct xdr_netobj owner;
+                } new;
+                struct {
+                        stateid_t       lock_stateid;
+                        u32             lock_seqid;
+                } old;
+        } v;
+        /* response */
+        union {
+                struct {
+                        stateid_t               stateid;
+                } ok;
+                struct nfsd4_lock_denied        denied;
+        } u;
+        /* The lk_replay_owner is the open owner in the open_to_lock_owner
+         * case and the lock owner otherwise: */
+        struct nfs4_stateowner *lk_replay_owner;
+};
+#define lk_new_open_seqid       v.new.open_seqid        /* request, v.new arm */
+#define lk_new_open_stateid     v.new.open_stateid
+#define lk_new_lock_seqid       v.new.lock_seqid
+#define lk_new_clientid         v.new.clientid
+#define lk_new_owner            v.new.owner
+#define lk_old_lock_stateid     v.old.lock_stateid      /* request, v.old arm */
+#define lk_old_lock_seqid       v.old.lock_seqid
+/* Response accessors for struct nfsd4_lock: u.ok holds only a stateid, so no rflags alias exists. */
+#define lk_resp_stateid u.ok.stateid
+#define lk_denied       u.denied
+struct nfsd4_lockt {           /* member 'lockt' of the nfsd4_op union */
+        u32                             lt_type;
+        clientid_t                      lt_clientid;
+        struct xdr_netobj               lt_owner;
+        u64                             lt_offset;
+        u64                             lt_length;
+        struct nfs4_stateowner *        lt_stateowner;
+        struct nfsd4_lock_denied        lt_denied;      /* same type as the 'denied' arm of nfsd4_lock's response union */
+};
+
+struct nfsd4_locku {
+        u32             lu_type;
+        u32             lu_seqid;
+        stateid_t       lu_stateid;
+        u64             lu_offset;
+        u64             lu_length;
+        struct nfs4_stateowner  *lu_stateowner;
+};
+struct nfsd4_lookup {
+        u32             lo_len;             /* request */
+        char *          lo_name;            /* request */
+};
+struct nfsd4_putfh {
+        u32             pf_fhlen;           /* request */
+        char            *pf_fhval;          /* request */
+};
+struct nfsd4_open {
+        u32             op_claim_type;      /* request */
+        struct xdr_netobj op_fname;         /* request - everything but CLAIM_PREV */
+        u32             op_delegate_type;   /* request - CLAIM_PREV only */
+        stateid_t       op_delegate_stateid; /* request - response */
+        u32             op_create;          /* request */
+        u32             op_createmode;      /* request */
+        u32             op_bmval[3];        /* request */
+        struct iattr    iattr;              /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
+        nfs4_verifier   verf;               /* EXCLUSIVE4 */
+        clientid_t      op_clientid;        /* request */
+        struct xdr_netobj op_owner;           /* request */
+        u32             op_seqid;           /* request */
+        u32             op_share_access;    /* request */
+        u32             op_share_deny;      /* request */
+        stateid_t       op_stateid;         /* response */
+        u32             op_recall;          /* recall */
+        struct nfsd4_change_info  op_cinfo; /* response */
+        u32             op_rflags;          /* response */
+        int             op_truncate;        /* used during processing */
+        struct nfs4_stateowner *op_stateowner; /* used during processing */
+        struct nfs4_acl *op_acl;
+};
+#define op_iattr        iattr
+#define op_verf         verf
+struct nfsd4_open_confirm {    /* member 'open_confirm' of the nfsd4_op union */
+        stateid_t       oc_req_stateid;         /* request */
+        u32             oc_seqid;               /* request */
+        stateid_t       oc_resp_stateid;        /* response */
+        struct nfs4_stateowner *oc_stateowner;  /* response */
+};
+struct nfsd4_open_downgrade {
+        stateid_t       od_stateid;
+        u32             od_seqid;
+        u32             od_share_access;
+        u32             od_share_deny;
+        struct nfs4_stateowner *od_stateowner;
+};
+struct nfsd4_read {            /* member 'read' of the nfsd4_op union */
+        stateid_t       rd_stateid;         /* request */
+        u64             rd_offset;          /* request */
+        u32             rd_length;          /* request */
+        int             rd_vlen;
+        struct file     *rd_filp;
+
+        struct svc_rqst *rd_rqstp;          /* response */
+        struct svc_fh * rd_fhp;             /* response */
+};
+struct nfsd4_readdir {
+        u64             rd_cookie;          /* request */
+        nfs4_verifier   rd_verf;            /* request */
+        u32             rd_dircount;        /* request */
+        u32             rd_maxcount;        /* request */
+        u32             rd_bmval[3];        /* request */
+        struct svc_rqst *rd_rqstp;          /* response */
+        struct svc_fh * rd_fhp;             /* response */
+        struct readdir_cd       common;
+        __be32 *                buffer;
+        int                     buflen;
+        __be32 *                offset;
+};
+struct nfsd4_release_lockowner {
+        clientid_t        rl_clientid;
+        struct xdr_netobj rl_owner;
+};
+struct nfsd4_readlink {
+        struct svc_rqst *rl_rqstp;          /* request */
+        struct svc_fh * rl_fhp;             /* request */
+};
+struct nfsd4_remove {
+        u32             rm_namelen;         /* request */
+        char *          rm_name;            /* request */
+        struct nfsd4_change_info  rm_cinfo; /* response */
+};
+struct nfsd4_rename {
+        u32             rn_snamelen;        /* request */
+        char *          rn_sname;           /* request */
+        u32             rn_tnamelen;        /* request */
+        char *          rn_tname;           /* request */
+        struct nfsd4_change_info  rn_sinfo; /* response */
+        struct nfsd4_change_info  rn_tinfo; /* response */
+};
+struct nfsd4_secinfo {
+        u32 si_namelen;                                 /* request */
+        char *si_name;                                  /* request */
+        struct svc_export *si_exp;                      /* response */
+};
+struct nfsd4_setattr {
+        stateid_t       sa_stateid;         /* request */
+        u32             sa_bmval[3];        /* request */
+        struct iattr    sa_iattr;           /* request */
+        struct nfs4_acl *sa_acl;
+};
+struct nfsd4_setclientid {
+        nfs4_verifier   se_verf;            /* request */
+        u32             se_namelen;         /* request */
+        char *          se_name;            /* request */
+        u32             se_callback_prog;   /* request */
+        u32             se_callback_netid_len;  /* request */
+        char *          se_callback_netid_val;  /* request */
+        u32             se_callback_addr_len;   /* request */
+        char *          se_callback_addr_val;   /* request */
+        u32             se_callback_ident;  /* request */
+        clientid_t      se_clientid;        /* response */
+        nfs4_verifier   se_confirm;         /* response */
+};
+struct nfsd4_setclientid_confirm {
+        clientid_t      sc_clientid;
+        nfs4_verifier   sc_confirm;
+};
+/* also used for NVERIFY */
+struct nfsd4_verify {
+        u32             ve_bmval[3];        /* request */
+        u32             ve_attrlen;         /* request */
+        char *          ve_attrval;         /* request */
+};
+struct nfsd4_write {
+        stateid_t       wr_stateid;         /* request */
+        u64             wr_offset;          /* request */
+        u32             wr_stable_how;      /* request */
+        u32             wr_buflen;          /* request */
+        int             wr_vlen;
+        u32             wr_bytes_written;   /* response */
+        u32             wr_how_written;     /* response */
+        nfs4_verifier   wr_verifier;        /* response */
+};
+struct nfsd4_exchange_id {
+        nfs4_verifier   verifier;
+        struct xdr_netobj clname;
+        u32             flags;
+        clientid_t      clientid;
+        u32             seqid;
+        int             spa_how;
+};
+/*
+ * Per-operation data carried in the "sequence" member of the nfsd4_op
+ * union (NFSv4.1 section).  Most fields are used in both directions,
+ * as their tags say.  The two response-only fields inside "#if 0" are
+ * compiled out ("not yet").
+ */
+struct nfsd4_sequence {
+        struct nfs4_sessionid   sessionid;              /* request/response */
+        u32                     seqid;                  /* request/response */
+        u32                     slotid;                 /* request/response */
+        u32                     maxslots;               /* request/response */
+        u32                     cachethis;              /* request */
+#if 0
+        u32                     target_maxslots;        /* response */
+        u32                     status_flags;           /* response */
+#endif /* not yet */
+};
+/*
+ * Per-operation data carried in the "destroy_session" member of the
+ * nfsd4_op union (NFSv4.1 section): just the session id.
+ */
+struct nfsd4_destroy_session {
+        struct nfs4_sessionid   sessionid;
+};
+/*
+ * One operation: an operation number, a __be32 status, a union holding
+ * the operation-specific data, and a replay pointer.  Only one member
+ * of the union is meaningful for a given op (presumably selected by
+ * opnum -- confirm against the decoder).  Most members are structs;
+ * "getfh" is a pointer to a struct svc_fh and "renew" is a bare
+ * clientid_t.
+ */
+struct nfsd4_op {
+        int                                     opnum;
+        __be32                                  status;
+        union {
+                struct nfsd4_access             access;
+                struct nfsd4_close              close;
+                struct nfsd4_commit             commit;
+                struct nfsd4_create             create;
+                struct nfsd4_delegreturn        delegreturn;
+                struct nfsd4_getattr            getattr;
+                struct svc_fh *                 getfh;
+                struct nfsd4_link               link;
+                struct nfsd4_lock               lock;
+                struct nfsd4_lockt              lockt;
+                struct nfsd4_locku              locku;
+                struct nfsd4_lookup             lookup;
+                struct nfsd4_verify             nverify;
+                struct nfsd4_open               open;
+                struct nfsd4_open_confirm       open_confirm;
+                struct nfsd4_open_downgrade     open_downgrade;
+                struct nfsd4_putfh              putfh;
+                struct nfsd4_read               read;
+                struct nfsd4_readdir            readdir;
+                struct nfsd4_readlink           readlink;
+                struct nfsd4_remove             remove;
+                struct nfsd4_rename             rename;
+                clientid_t                      renew;
+                struct nfsd4_secinfo            secinfo;
+                struct nfsd4_setattr            setattr;
+                struct nfsd4_setclientid        setclientid;
+                struct nfsd4_setclientid_confirm setclientid_confirm;
+                struct nfsd4_verify             verify;
+                struct nfsd4_write              write;
+                struct nfsd4_release_lockowner  release_lockowner;
+                /* NFSv4.1 */
+                struct nfsd4_exchange_id        exchange_id;
+                struct nfsd4_create_session     create_session;
+                struct nfsd4_destroy_session    destroy_session;
+                struct nfsd4_sequence           sequence;
+        } u;
+        struct nfs4_replay *                    replay;
+};
+/*
+ * Decoded arguments: XDR decode scratch state, a list of buffers to
+ * release, the owning request, the tag and its length, the minor
+ * version, and the operations themselves (opcnt entries reached
+ * through ops).
+ */
+struct nfsd4_compoundargs {
+        /* scratch variables for XDR decode */
+        __be32 *                        p;
+        __be32 *                        end;
+        struct page **                  pagelist;
+        int                             pagelen;
+        __be32                          tmp[8];
+        __be32 *                        tmpp;
+        /* singly linked (via next) list; each node has a buffer and a release callback */
+        struct tmpbuf {
+                struct tmpbuf *next;
+                void (*release)(const void *);
+                void *buf;
+        }                               *to_free;
+        struct svc_rqst                 *rqstp;
+        u32                             taglen;
+        char *                          tag;
+        u32                             minorversion;
+        u32                             opcnt;
+        struct nfsd4_op                 *ops;
+        /* NOTE(review): presumably inline storage that ops points at for small requests -- confirm */
+        struct nfsd4_op                 iops[8];
+};
+/*
+ * Result side: XDR encode scratch state, the owning request, the tag
+ * and its length, the operation count, the location where tag and
+ * opcount are encoded, and an embedded struct nfsd4_compound_state.
+ */
+struct nfsd4_compoundres {
+        /* scratch variables for XDR encode */
+        __be32 *                        p;
+        __be32 *                        end;
+        struct xdr_buf *                xbuf;
+        struct svc_rqst *               rqstp;
+        u32                             taglen;
+        char *                          tag;
+        u32                             opcnt;
+        __be32 *                        tagp; /* tag, opcount encode location */
+        struct nfsd4_compound_state     cstate;
+};
+/*
+ * Return true when the response counts exactly one operation and the
+ * first operation in the request's argument array is OP_SEQUENCE.
+ */
+static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp)
+{
+        struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp;
+        bool single_op = resp->opcnt == 1;
+        /* ops[0] is only examined when there is exactly one op */
+        return single_op && argp->ops[0].opnum == OP_SEQUENCE;
+}
+/*
+ * Return true when the slot's sl_cachethis flag is clear, or when the
+ * compound is a lone SEQUENCE operation (see nfsd4_is_solo_sequence()).
+ */
+static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp)
+{
+        bool cache_requested = resp->cstate.slot->sl_cachethis;
+        /* the solo-sequence check only runs when caching was requested */
+        return !cache_requested || nfsd4_is_solo_sequence(resp);
+}
+/* Size in bytes of struct nfsd4_compoundargs. NOTE(review): presumably the argument size handed to the RPC layer -- confirm at the use site. */
+#define NFS4_SVC_XDRSIZE                sizeof(struct nfsd4_compoundargs)
+/*
+ * Fill in @cinfo from the pre- and post-operation values saved in @fhp.
+ * Both fh_pre_saved and fh_post_saved must be set (BUG otherwise).
+ * cinfo->atomic is always set to 1.  If IS_I_VERSION() holds for the
+ * inode, the saved change values are copied; otherwise the saved ctime
+ * seconds and nanoseconds are copied instead.
+ */
+static inline void
+set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
+{
+        BUG_ON(!(fhp->fh_pre_saved && fhp->fh_post_saved));
+        cinfo->atomic = 1;
+        cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode);
+        if (!cinfo->change_supported) {
+                /* IS_I_VERSION() not set: report ctime before/after */
+                cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+                cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+                cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+                cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+                return;
+        }
+        cinfo->before_change = fhp->fh_pre_change;
+        cinfo->after_change = fhp->fh_post_change;
+}
+/*
+ * XDR encode/decode entry points and per-operation encoders.  The
+ * definitions live outside this header; by their names these marshal
+ * compound arguments/results and file attributes.
+ */
+int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
+int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
+                struct nfsd4_compoundargs *);
+int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
+                struct nfsd4_compoundres *);
+void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
+void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
+__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
+                       struct dentry *dentry, __be32 *buffer, int *countp,
+                       u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
+/*
+ * Client-id operation handlers, each taking the request, the compound
+ * state and the op-specific argument struct and returning a __be32
+ * (presumably an NFS status -- confirm), followed by the two
+ * reply-cache helpers that work on a struct nfsd4_compoundres.
+ */
+extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_setclientid *setclid);
+extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_setclientid_confirm *setclientid_confirm);
+extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
+extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+                struct nfsd4_sequence *seq);
+/*
+ * Handlers for the exchange_id and create_session operations (both
+ * listed in the NFSv4.1 section of the nfsd4_op union).  Each takes
+ * the request, the compound state and the op-specific argument
+ * struct, and returns a __be32.
+ */
+extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_exchange_id *);
+extern __be32 nfsd4_create_session(struct svc_rqst *,
+                struct nfsd4_compound_state *,
+                struct nfsd4_create_session *);
+/*
+ * Remaining operation handlers declared by this header.  Most take the
+ * request, the compound state and a pointer to the op-specific data
+ * and return a __be32; exceptions are nfsd4_process_open1() (no
+ * request argument), nfsd4_process_open2() (takes the current file
+ * handle instead of the compound state) and
+ * nfsd4_release_compoundargs() (returns void).
+ */
+extern __be32 nfsd4_sequence(struct svc_rqst *,
+                struct nfsd4_compound_state *,
+                struct nfsd4_sequence *);
+extern __be32 nfsd4_destroy_session(struct svc_rqst *,
+                struct nfsd4_compound_state *,
+                struct nfsd4_destroy_session *);
+extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
+                struct nfsd4_open *open);
+extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
+                struct svc_fh *current_fh, struct nfsd4_open *open);
+extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
+extern __be32 nfsd4_close(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_close *close);
+extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_open_downgrade *od);
+extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *,
+                struct nfsd4_lock *lock);
+extern __be32 nfsd4_lockt(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_lockt *lockt);
+extern __be32 nfsd4_locku(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_locku *locku);
+extern __be32
+nfsd4_release_lockowner(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *,
+                struct nfsd4_release_lockowner *rlockowner);
+extern void nfsd4_release_compoundargs(struct nfsd4_compoundargs *);
+extern __be32 nfsd4_delegreturn(struct svc_rqst *rqstp,
+                struct nfsd4_compound_state *, struct nfsd4_delegreturn *dr);
+extern __be32 nfsd4_renew(struct svc_rqst *rqstp,
+                          struct nfsd4_compound_state *, clientid_t *clid);
+#endif
+/*
+ * Local variables:
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig
index 1225af7b2166..251da07b2a1d 100644
--- a/fs/nilfs2/Kconfig
+++ b/fs/nilfs2/Kconfig
@@ -2,7 +2,6 @@ config NILFS2_FS
        tristate "NILFS2 file system support (EXPERIMENTAL)"
        depends on EXPERIMENTAL
        select CRC32
-        select FS_JOURNAL_INFO
        help
          NILFS2 is a log-structured file system (LFS) supporting continuous
          snapshotting.  In addition to versioning capability of the entire
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index f4a14ea2ed9c..effdbdbe6c11 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -417,8 +417,8 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
        key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
                                         bmap->b_inode->i_blkbits);
-        for (pbh = page_buffers(bh->b_page); pbh != bh;
+        for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
-             pbh = pbh->b_this_page, key++);
+                key++;
        return key;
 }
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index d5ad54e204a5..18737818db63 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -328,19 +328,24 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
                        tnicps += nicps;
                        nilfs_mdt_mark_buffer_dirty(cp_bh);
                        nilfs_mdt_mark_dirty(cpfile);
-                        if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
+                        if (!nilfs_cpfile_is_in_first(cpfile, cno)) {
-                            (count = nilfs_cpfile_block_sub_valid_checkpoints(
+                                count =
-                                    cpfile, cp_bh, kaddr, nicps)) == 0) {
+                                  nilfs_cpfile_block_sub_valid_checkpoints(
-                                /* make hole */
+                                                cpfile, cp_bh, kaddr, nicps);
-                                kunmap_atomic(kaddr, KM_USER0);
+                                if (count == 0) {
-                                brelse(cp_bh);
+                                        /* make hole */
-                                ret = nilfs_cpfile_delete_checkpoint_block(
+                                        kunmap_atomic(kaddr, KM_USER0);
-                                        cpfile, cno);
+                                        brelse(cp_bh);
-                                if (ret == 0)
+                                        ret =
-                                        continue;
+                                          nilfs_cpfile_delete_checkpoint_block(
-                                printk(KERN_ERR "%s: cannot delete block\n",
+                                                                   cpfile, cno);
-                                       __func__);
+                                        if (ret == 0)
-                                break;
+                                                continue;
+                                        printk(KERN_ERR
+                                               "%s: cannot delete block\n",
+                                               __func__);
+                                        break;
+                                }
                        }
                }
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index d369ac718277..236753df5cdf 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -51,11 +51,11 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
        struct nilfs_direct *direct;
        __u64 ptr;
-        direct = (struct nilfs_direct *)bmap;
+        direct = (struct nilfs_direct *)bmap;  /* XXX: use macro for level 1 */
-        if ((key > NILFS_DIRECT_KEY_MAX) ||
+        if (key > NILFS_DIRECT_KEY_MAX || level != 1)
-            (level != 1) ||     /* XXX: use macro for level 1 */
+                return -ENOENT;
-            ((ptr = nilfs_direct_get_ptr(direct, key)) ==
+        ptr = nilfs_direct_get_ptr(direct, key);
-             NILFS_BMAP_INVALID_PTR))
+        if (ptr == NILFS_BMAP_INVALID_PTR)
                return -ENOENT;
        if (ptrp != NULL)
@@ -73,9 +73,10 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap,
        sector_t blocknr;
        int ret, cnt;
-        if (key > NILFS_DIRECT_KEY_MAX ||
+        if (key > NILFS_DIRECT_KEY_MAX)
-            (ptr = nilfs_direct_get_ptr(direct, key)) ==
+                return -ENOENT;
-            NILFS_BMAP_INVALID_PTR)
+        ptr = nilfs_direct_get_ptr(direct, key);
+        if (ptr == NILFS_BMAP_INVALID_PTR)
                return -ENOENT;
        if (NILFS_BMAP_USE_VBN(bmap)) {
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index f6af76042d80..d6b2b83de363 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -480,7 +480,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
                                      unsigned int cmd, void __user *argp)
 {
        struct nilfs_argv argv[5];
-        const static size_t argsz[5] = {
+        static const size_t argsz[5] = {
                sizeof(struct nilfs_vdesc),
                sizeof(struct nilfs_period),
                sizeof(__u64),
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5403b3ef3a42..8173faee31e6 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1118,8 +1118,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
        /* Abandoning the newly allocated superblock */
        mutex_unlock(&nilfs->ns_mount_mutex);
        put_nilfs(nilfs);
-        up_write(&s->s_umount);
+        deactivate_locked_super(s);
-        deactivate_super(s);
        /*
         * deactivate_super() invokes close_bdev_exclusive().
         * We must finish all post-cleaning before this call;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5ef5f365a5c8..8271cf05c957 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -646,6 +646,7 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
        struct fsnotify_group *group;
        struct user_struct *user;
        struct file *filp;
+        struct path path;
        int fd, ret;
        /* Check the IN_* constants for consistency.  */
@@ -659,12 +660,6 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
        if (fd < 0)
                return fd;
-        filp = get_empty_filp();
-        if (!filp) {
-                ret = -ENFILE;
-                goto out_put_fd;
-        }
        user = get_current_user();
        if (unlikely(atomic_read(&user->inotify_devs) >=
                        inotify_max_user_instances)) {
@@ -679,24 +674,28 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
                goto out_free_uid;
        }
-        filp->f_op = &inotify_fops;
+        atomic_inc(&user->inotify_devs);
-        filp->f_path.mnt = mntget(inotify_mnt);
-        filp->f_path.dentry = dget(inotify_mnt->mnt_root);
+        path.mnt = inotify_mnt;
-        filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
+        path.dentry = inotify_mnt->mnt_root;
-        filp->f_mode = FMODE_READ;
+        path_get(&path);
+        filp = alloc_file(&path, FMODE_READ, &inotify_fops);
+        if (!filp)
+                goto Enfile;
        filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
        filp->private_data = group;
-        atomic_inc(&user->inotify_devs);
        fd_install(fd, filp);
        return fd;
+Enfile:
+        ret = -ENFILE;
+        path_put(&path);
+        atomic_dec(&user->inotify_devs);
 out_free_uid:
        free_uid(user);
-        put_filp(filp);
-out_put_fd:
        put_unused_fd(fd);
        return ret;
 }
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 9938034762cc..dc2505abb6d7 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -530,7 +530,7 @@ err_corrupt_attr:
 * the ntfs inode.
 *
 * Q: What locks are held when the function is called?
- * A: i_state has I_LOCK set, hence the inode is locked, also
+ * A: i_state has I_NEW set, hence the inode is locked, also
 *    i_count is set to 1, so it is not going to go away
 *    i_flags is set to 0 and we have no business touching it.  Only an ioctl()
 *    is allowed to write to them. We should of course be honouring them but
@@ -1207,7 +1207,7 @@ err_out:
 * necessary fields in @vi as well as initializing the ntfs inode.
 *
 * Q: What locks are held when the function is called?
- * A: i_state has I_LOCK set, hence the inode is locked, also
+ * A: i_state has I_NEW set, hence the inode is locked, also
 *    i_count is set to 1, so it is not going to go away
 *
 * Return 0 on success and -errno on error.  In the error case, the inode will
@@ -1474,7 +1474,7 @@ err_out:
 * normal directory inodes.
 *
 * Q: What locks are held when the function is called?
- * A: i_state has I_LOCK set, hence the inode is locked, also
+ * A: i_state has I_NEW set, hence the inode is locked, also
 *    i_count is set to 1, so it is not going to go away
 *
 * Return 0 on success and -errno on error.  In the error case, the inode will
diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig
index 701b7a3a872e..0d840669698e 100644
--- a/fs/ocfs2/Kconfig
+++ b/fs/ocfs2/Kconfig
@@ -6,6 +6,7 @@ config OCFS2_FS
        select CRC32
        select QUOTA
        select QUOTA_TREE
+        select FS_POSIX_ACL
        help
          OCFS2 is a general purpose extent based shared disk cluster file
          system with many similarities to ext3. It supports 64 bit inode
@@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS
          This option will enable expensive consistency checks. Enable
          this option for debugging only as it is likely to decrease
          performance of the filesystem.
-config OCFS2_FS_POSIX_ACL
-        bool "OCFS2 POSIX Access Control Lists"
-        depends on OCFS2_FS
-        select FS_POSIX_ACL
-        default n
-        help
-          Posix Access Control Lists (ACLs) support permissions for users and
-          groups beyond the owner/group/world scheme.
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 31f25ce32c97..600d2d2ade11 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -39,11 +39,8 @@ ocfs2-objs := \
        ver.o                   \
        quota_local.o           \
        quota_global.o          \
-        xattr.o
+        xattr.o                 \
+        acl.o
-ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
-ocfs2-objs += acl.o
-endif
 ocfs2_stackglue-objs := stackglue.o
 ocfs2_stack_o2cb-objs := stack_o2cb.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index fbeaec762103..0501974bedd0 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -98,15 +98,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
                                              int type,
                                              struct buffer_head *di_bh)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int name_index;
        char *value = NULL;
        struct posix_acl *acl;
        int retval;
-        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
-                return NULL;
        switch (type) {
        case ACL_TYPE_ACCESS:
                name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -331,13 +327,14 @@ cleanup:
        return ret;
 }
-static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
                                          char *list,
                                          size_t list_len,
                                          const char *name,
-                                          size_t name_len)
+                                          size_t name_len,
+                                          int type)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
        const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -348,13 +345,14 @@ static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
        return size;
 }
-static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
                                           char *list,
                                           size_t list_len,
                                           const char *name,
-                                           size_t name_len)
+                                           size_t name_len,
+                                           int type)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
        const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
@@ -365,19 +363,19 @@ static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
        return size;
 }
-static int ocfs2_xattr_get_acl(struct inode *inode,
+static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
-                               int type,
+                void *buffer, size_t size, int type)
-                               void *buffer,
-                               size_t size)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
        struct posix_acl *acl;
        int ret;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
                return -EOPNOTSUPP;
-        acl = ocfs2_get_acl(inode, type);
+        acl = ocfs2_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl == NULL)
@@ -388,35 +386,16 @@ static int ocfs2_xattr_get_acl(struct inode *inode,
        return ret;
 }
-static int ocfs2_xattr_get_acl_access(struct inode *inode,
+static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
-                                      const char *name,
+                const void *value, size_t size, int flags, int type)
-                                      void *buffer,
-                                      size_t size)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-static int ocfs2_xattr_get_acl_default(struct inode *inode,
-                                       const char *name,
-                                       void *buffer,
-                                       size_t size)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-static int ocfs2_xattr_set_acl(struct inode *inode,
-                               int type,
-                               const void *value,
-                               size_t size)
 {
+        struct inode *inode = dentry->d_inode;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct posix_acl *acl;
        int ret = 0;
+        if (strcmp(name, "") != 0)
+                return -EINVAL;
        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
                return -EOPNOTSUPP;
@@ -442,38 +421,18 @@ cleanup:
        return ret;
 }
-static int ocfs2_xattr_set_acl_access(struct inode *inode,
-                                      const char *name,
-                                      const void *value,
-                                      size_t size,
-                                      int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-static int ocfs2_xattr_set_acl_default(struct inode *inode,
-                                       const char *name,
-                                       const void *value,
-                                       size_t size,
-                                       int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
 struct xattr_handler ocfs2_xattr_acl_access_handler = {
        .prefix = POSIX_ACL_XATTR_ACCESS,
+        .flags  = ACL_TYPE_ACCESS,
        .list   = ocfs2_xattr_list_acl_access,
-        .get    = ocfs2_xattr_get_acl_access,
+        .get    = ocfs2_xattr_get_acl,
-        .set    = ocfs2_xattr_set_acl_access,
+        .set    = ocfs2_xattr_set_acl,
 };
 struct xattr_handler ocfs2_xattr_acl_default_handler = {
        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags  = ACL_TYPE_DEFAULT,
        .list   = ocfs2_xattr_list_acl_default,
-        .get    = ocfs2_xattr_get_acl_default,
+        .get    = ocfs2_xattr_get_acl,
-        .set    = ocfs2_xattr_set_acl_default,
+        .set    = ocfs2_xattr_set_acl,
 };
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 8f6389ed4da5..5c5d31f05853 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,8 +26,6 @@ struct ocfs2_acl_entry {
        __le32 e_id;
 };
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
 extern int ocfs2_check_acl(struct inode *, int);
 extern int ocfs2_acl_chmod(struct inode *);
 extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
@@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
                          struct ocfs2_alloc_context *,
                          struct ocfs2_alloc_context *);
-#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
-#define ocfs2_check_acl NULL
-static inline int ocfs2_acl_chmod(struct inode *inode)
-{
-        return 0;
-}
-static inline int ocfs2_init_acl(handle_t *handle,
-                                 struct inode *inode,
-                                 struct inode *dir,
-                                 struct buffer_head *di_bh,
-                                 struct buffer_head *dir_bh,
-                                 struct ocfs2_alloc_context *meta_ac,
-                                 struct ocfs2_alloc_context *data_ac)
-{
-        return 0;
-}
-#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
 #endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 7c7198a5bc90..d17bdc718f74 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1765,9 +1765,9 @@ set_and_inc:
 *
 * The array index of the subtree root is passed back.
 */
-static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
-                                   struct ocfs2_path *left,
+                            struct ocfs2_path *left,
-                                   struct ocfs2_path *right)
+                            struct ocfs2_path *right)
 {
        int i = 0;
@@ -2872,8 +2872,8 @@ out:
 * This looks similar, but is subtly different to
 * ocfs2_find_cpos_for_left_leaf().
 */
-static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
-                                          struct ocfs2_path *path, u32 *cpos)
+                                   struct ocfs2_path *path, u32 *cpos)
 {
        int i, j, ret = 0;
        u64 blkno;
@@ -7190,8 +7190,8 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
         * wait on them - the truncate_inode_pages() call later will
         * do that for us.
         */
-        ret = do_sync_mapping_range(inode->i_mapping, range_start,
+        ret = filemap_fdatawrite_range(inode->i_mapping, range_start,
-                                    range_end - 1, SYNC_FILE_RANGE_WRITE);
+                                       range_end - 1);
        if (ret)
                mlog_errno(ret);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 9c122d574464..1db4359ccb90 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle,
 int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
                              handle_t *handle,
                              struct ocfs2_path *path);
+int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+                                   struct ocfs2_path *path, u32 *cpos);
+int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
+                            struct ocfs2_path *left,
+                            struct ocfs2_path *right);
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index deb2b132ae5e..3dae4a13f6e4 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -547,6 +547,9 @@ bail:
 *
 * called like this: dio->get_blocks(dio->inode, fs_startblk,
 *                                      fs_count, map_bh, dio->rw == WRITE);
+ *
+ * Note that we never bother to allocate blocks here, and thus ignore the
+ * create argument.
 */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                                     struct buffer_head *bh_result, int create)
@@ -563,14 +566,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
        inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-        /*
-         * Any write past EOF is not allowed because we'd be extending.
-         */
-        if (create && (iblock + max_blocks) > inode_blocks) {
-                ret = -EIO;
-                goto bail;
-        }
        /* This figures out the size of the next contiguous block, and
         * our logical offset */
        ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
@@ -582,15 +577,6 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
                goto bail;
        }
-        if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) {
-                ocfs2_error(inode->i_sb,
-                            "Inode %llu has a hole at block %llu\n",
-                            (unsigned long long)OCFS2_I(inode)->ip_blkno,
-                            (unsigned long long)iblock);
-                ret = -EROFS;
-                goto bail;
-        }
        /* We should already CoW the refcounted extent. */
        BUG_ON(ext_flags & OCFS2_EXT_REFCOUNTED);
        /*
@@ -601,20 +587,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
         */
        if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
                map_bh(bh_result, inode->i_sb, p_blkno);
-        else {
+        else
-                /*
-                 * ocfs2_prepare_inode_for_write() should have caught
-                 * the case where we'd be filling a hole and triggered
-                 * a buffered write instead.
-                 */
-                if (create) {
-                        ret = -EIO;
-                        mlog_errno(ret);
-                        goto bail;
-                }
                clear_buffer_mapped(bh_result);
-        }
        /* make sure we don't map more than max_blocks blocks here as
           that's all the kernel will handle at this point. */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index c452d116b892..eda5b8bcddd5 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -176,7 +176,8 @@ static void o2hb_write_timeout(struct work_struct *work)
 static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 {
-        mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);
+        mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
+             O2HB_MAX_WRITE_TIMEOUT_MS);
        cancel_delayed_work(&reg->hr_write_timeout_work);
        reg->hr_last_timeout_start = jiffies;
@@ -874,7 +875,8 @@ static int o2hb_thread(void *data)
                do_gettimeofday(&after_hb);
                elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
-                mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
+                mlog(ML_HEARTBEAT,
+                     "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
                     before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
                     after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
                     elapsed_msec);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 7ee6188bc79a..c81142e3ef84 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -35,6 +35,10 @@
 * cluster references throughout where nodes are looked up */
 struct o2nm_cluster *o2nm_single_cluster = NULL;
+char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = {
+                "reset",        /* O2NM_FENCE_RESET */
+                "panic",        /* O2NM_FENCE_PANIC */
+};
 struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
 {
@@ -579,6 +583,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write(
        return o2nm_cluster_attr_write(page, count,
                                       &cluster->cl_reconnect_delay_ms);
 }
+/*
+ * Show the cluster's current fence method as its name from
+ * o2nm_fence_method_desc[] followed by a newline.  Writes nothing and
+ * returns 0 when no cluster is given.
+ */
+static ssize_t o2nm_cluster_attr_fence_method_read(
+        struct o2nm_cluster *cluster, char *page)
+{
+        if (!cluster)
+                return 0;
+        return sprintf(page, "%s\n",
+                       o2nm_fence_method_desc[cluster->cl_fence_method]);
+}
+/*
+ * Parse a fence method name written to the "fence_method" attribute.
+ *
+ * The input must be one of the names in o2nm_fence_method_desc[]
+ * (compared case-insensitively) followed by exactly one newline.
+ * Returns @count when the name is recognised, -EINVAL otherwise.
+ */
+static ssize_t o2nm_cluster_attr_fence_method_write(
+        struct o2nm_cluster *cluster, const char *page, size_t count)
+{
+        unsigned int i;
+        /*
+         * Reject empty input first: with count == 0 the index below
+         * would wrap around and read outside the buffer.
+         */
+        if (!count || page[count - 1] != '\n')
+                goto bail;
+        for (i = 0; i < O2NM_FENCE_METHODS; ++i) {
+                /* Length must match the name plus the trailing newline. */
+                if (count != strlen(o2nm_fence_method_desc[i]) + 1)
+                        continue;
+                if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1))
+                        continue;
+                /* Only log and store when the method actually changes. */
+                if (cluster->cl_fence_method != i) {
+                        printk(KERN_INFO "ocfs2: Changing fence method to %s\n",
+                               o2nm_fence_method_desc[i]);
+                        cluster->cl_fence_method = i;
+                }
+                return count;
+        }
+bail:
+        return -EINVAL;
+}
 static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = {
        .attr   = { .ca_owner = THIS_MODULE,
                    .ca_name = "idle_timeout_ms",
@@ -603,10 +644,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = {
        .store  = o2nm_cluster_attr_reconnect_delay_ms_write,
 };
+static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = {
+        .attr   = { .ca_owner = THIS_MODULE,
+                    .ca_name = "fence_method",
+                    .ca_mode = S_IRUGO | S_IWUSR },
+        .show   = o2nm_cluster_attr_fence_method_read,
+        .store  = o2nm_cluster_attr_fence_method_write,
+};
 static struct configfs_attribute *o2nm_cluster_attrs[] = {
        &o2nm_cluster_attr_idle_timeout_ms.attr,
        &o2nm_cluster_attr_keepalive_delay_ms.attr,
        &o2nm_cluster_attr_reconnect_delay_ms.attr,
+        &o2nm_cluster_attr_fence_method.attr,
        NULL,
 };
 static ssize_t o2nm_cluster_show(struct config_item *item,
@@ -778,6 +828,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g
        cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT;
        cluster->cl_idle_timeout_ms    = O2NET_IDLE_TIMEOUT_MS_DEFAULT;
        cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT;
+        cluster->cl_fence_method       = O2NM_FENCE_RESET;
        ret = &cluster->cl_group;
        o2nm_single_cluster = cluster;
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index c992ea0da4ad..09ea2d388bbb 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,6 +33,12 @@
 #include <linux/configfs.h>
 #include <linux/rbtree.h>
+enum o2nm_fence_method {
+        O2NM_FENCE_RESET        = 0,
+        O2NM_FENCE_PANIC,
+        O2NM_FENCE_METHODS,     /* Number of fence methods */
+};
 struct o2nm_node {
        spinlock_t              nd_lock;
        struct config_item      nd_item;
@@ -58,6 +64,7 @@ struct o2nm_cluster {
        unsigned int            cl_idle_timeout_ms;
        unsigned int            cl_keepalive_delay_ms;
        unsigned int            cl_reconnect_delay_ms;
+        enum o2nm_fence_method  cl_fence_method;
        /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */
        unsigned long   cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index bbacf7da48a4..639024033fce 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -74,8 +74,20 @@ static void o2quo_fence_self(void)
         * threads can still schedule, etc, etc */
        o2hb_stop_all_regions();
-        printk("ocfs2 is very sorry to be fencing this system by restarting\n");
+        switch (o2nm_single_cluster->cl_fence_method) {
-        emergency_restart();
+        case O2NM_FENCE_PANIC:
+                panic("*** ocfs2 is very sorry to be fencing this system by "
+                      "panicing ***\n");
+                break;
+        default:
+                WARN_ON(o2nm_single_cluster->cl_fence_method >=
+                        O2NM_FENCE_METHODS);
+        case O2NM_FENCE_RESET:
+                printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
+                       "system by restarting ***\n");
+                emergency_restart();
+                break;
+        };
 }
 /* Indicate that a timeout occured on a hearbeat region write. The
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index d9fa3d22e17c..2f9e4e19a4f2 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2589,6 +2589,14 @@ retry:
                             "begin reco msg (%d)\n", dlm->name, nodenum, ret);
                        ret = 0;
                }
+                if (ret == -EAGAIN) {
+                        mlog(0, "%s: trying to start recovery of node "
+                             "%u, but node %u is waiting for last recovery "
+                             "to complete, backoff for a bit\n", dlm->name,
+                             dead_node, nodenum);
+                        msleep(100);
+                        goto retry;
+                }
                if (ret < 0) {
                        struct dlm_lock_resource *res;
                        /* this is now a serious problem, possibly ENOMEM 
@@ -2608,14 +2616,6 @@ retry:
                         * another ENOMEM */
                        msleep(100);
                        goto retry;
-                } else if (ret == EAGAIN) {
-                        mlog(0, "%s: trying to start recovery of node "
-                             "%u, but node %u is waiting for last recovery "
-                             "to complete, backoff for a bit\n", dlm->name,
-                             dead_node, nodenum);
-                        /* TODO Look into replacing msleep with cond_resched() */
-                        msleep(100);
-                        goto retry;
                }
        }
@@ -2639,7 +2639,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
                     dlm->name, br->node_idx, br->dead_node,
                     dlm->reco.dead_node, dlm->reco.new_master);
                spin_unlock(&dlm->spinlock);
-                return EAGAIN;
+                return -EAGAIN;
        }
        spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 843db64e9d4a..d35a27f4523e 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -37,6 +37,7 @@
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
+#include "symlink.h"
 #include "buffer_head_io.h"
@@ -703,6 +704,12 @@ out:
        return ret;
 }
+/*
+ * The name ocfs2_fiemap_inline() may be a little misleading: besides
+ * handling fiemap for inline-data files, it also handles fast
+ * symlinks, because the two are no different as far as extent
+ * mapping is concerned.
+ */
 static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
                               struct fiemap_extent_info *fieinfo,
                               u64 map_start)
@@ -715,11 +722,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh,
        struct ocfs2_inode_info *oi = OCFS2_I(inode);
        di = (struct ocfs2_dinode *)di_bh->b_data;
-        id_count = le16_to_cpu(di->id2.i_data.id_count);
+        if (ocfs2_inode_is_fast_symlink(inode))
+                id_count = ocfs2_fast_symlink_chars(inode->i_sb);
+        else
+                id_count = le16_to_cpu(di->id2.i_data.id_count);
        if (map_start < id_count) {
                phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits;
-                phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data);
+                if (ocfs2_inode_is_fast_symlink(inode))
+                        phys += offsetof(struct ocfs2_dinode, id2.i_symlink);
+                else
+                        phys += offsetof(struct ocfs2_dinode,
+                                         id2.i_data.id_data);
                ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count,
                                              flags);
@@ -756,9 +770,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        down_read(&OCFS2_I(inode)->ip_alloc_sem);
        /*
-         * Handle inline-data separately.
+         * Handle inline-data and fast symlink separately.
         */
-        if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+        if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
+            ocfs2_inode_is_fast_symlink(inode)) {
                ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start);
                goto out_unlock;
        }
@@ -786,6 +801,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                fe_flags = 0;
                if (rec.e_flags & OCFS2_EXT_UNWRITTEN)
                        fe_flags |= FIEMAP_EXTENT_UNWRITTEN;
+                if (rec.e_flags & OCFS2_EXT_REFCOUNTED)
+                        fe_flags |= FIEMAP_EXTENT_SHARED;
                if (is_last)
                        fe_flags |= FIEMAP_EXTENT_LAST;
                len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3d30a1c974a8..06ccf6a86d35 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1772,7 +1772,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                                         loff_t *ppos,
                                         size_t count,
                                         int appending,
-                                         int *direct_io)
+                                         int *direct_io,
+                                         int *has_refcount)
 {
        int ret = 0, meta_level = 0;
        struct inode *inode = dentry->d_inode;
@@ -1833,6 +1834,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                                                               saved_pos,
                                                               count,
                                                               &meta_level);
+                        if (has_refcount)
+                                *has_refcount = 1;
                }
                if (ret < 0) {
@@ -1856,6 +1859,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
                        break;
                }
+                if (has_refcount && *has_refcount == 1) {
+                        *direct_io = 0;
+                        break;
+                }
                /*
                 * Allowing concurrent direct writes means
                 * i_size changes wouldn't be synchronized, so
@@ -1899,7 +1906,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
                                    loff_t pos)
 {
        int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
-        int can_do_direct;
+        int can_do_direct, has_refcount = 0;
        ssize_t written = 0;
        size_t ocount;          /* original count */
        size_t count;           /* after file limit checks */
@@ -1942,7 +1949,7 @@ relock:
        can_do_direct = direct_io;
        ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
                                            iocb->ki_left, appending,
-                                            &can_do_direct);
+                                            &can_do_direct, &has_refcount);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -2006,14 +2013,16 @@ out_dio:
        /* buffered aio wouldn't have proper lock coverage today */
        BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
-        if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode)) {
+        if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode) ||
+            (file->f_flags & O_DIRECT && has_refcount)) {
                ret = filemap_fdatawrite_range(file->f_mapping, pos,
                                               pos + count - 1);
                if (ret < 0)
                        written = ret;
                if (!ret && (old_size != i_size_read(inode) ||
-                    old_clusters != OCFS2_I(inode)->ip_clusters)) {
+                    old_clusters != OCFS2_I(inode)->ip_clusters ||
+                    has_refcount)) {
                        ret = jbd2_journal_force_commit(osb->journal->j_journal);
                        if (ret < 0)
                                written = ret;
@@ -2062,7 +2071,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
        int ret;
        ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos,
-                                            sd->total_len, 0, NULL);
+                                            sd->total_len, 0, NULL, NULL);
        if (ret < 0) {
                mlog_errno(ret);
                return ret;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index f010b22b1c44..50fb26a6a5f5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2108,6 +2108,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
        }
        did_quota_inode = 1;
+        inode->i_nlink = 0;
        /* do the real work now. */
        status = ocfs2_mknod_locked(osb, dir, inode,
                                    0, &new_di_bh, parent_di_bh, handle,
@@ -2136,6 +2137,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
        if (status < 0)
                mlog_errno(status);
+        insert_inode_hash(inode);
 leave:
        if (status < 0 && did_quota_inode)
                vfs_dq_free_inode(inode);
@@ -2267,6 +2269,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
        di = (struct ocfs2_dinode *)di_bh->b_data;
        le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL);
        di->i_orphaned_slot = 0;
+        inode->i_nlink = 1;
+        ocfs2_set_links_count(di, inode->i_nlink);
        ocfs2_journal_dirty(handle, di_bh);
        status = ocfs2_add_entry(handle, dentry, inode,
@@ -2284,7 +2288,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
                goto out_commit;
        }
-        insert_inode_hash(inode);
        dentry->d_op = &ocfs2_dentry_ops;
        d_instantiate(dentry, inode);
        status = 0;
@@ -2326,4 +2329,5 @@ const struct inode_operations ocfs2_dir_iops = {
        .getxattr       = generic_getxattr,
        .listxattr      = ocfs2_listxattr,
        .removexattr    = generic_removexattr,
+        .fiemap         = ocfs2_fiemap,
 };
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index d963d8638709..9362eea7424b 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -245,9 +245,11 @@ enum ocfs2_mount_options
        OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
        OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
        OCFS2_MOUNT_INODE64 = 1 << 7,   /* Allow inode numbers > 2^32 */
-        OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
+        OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */
-        OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+        OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9,      /* Disable POSIX access
-        OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
+                                                   control lists */
+        OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
+        OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
 };
 #define OCFS2_OSB_SOFT_RO                       0x0001
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e9431e4a5e7c..1a1a679e51b5 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1202,7 +1202,7 @@ struct ocfs2_local_disk_dqinfo {
 /* Header of one chunk of a quota file */
 struct ocfs2_local_disk_chunk {
        __le32 dqc_free;        /* Number of free entries in the bitmap */
-        u8 dqc_bitmap[0];       /* Bitmap of entries in the corresponding
+        __u8 dqc_bitmap[0];     /* Bitmap of entries in the corresponding
                                 * chunk of quota file */
 };
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 30967e3f5e43..74db2be75dd6 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -276,7 +276,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
        spin_unlock(&osb->osb_lock);
 }
-void ocfs2_kref_remove_refcount_tree(struct kref *kref)
+static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
 {
        struct ocfs2_refcount_tree *tree =
                container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
@@ -524,23 +524,6 @@ out:
        return ret;
 }
-int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw,
-                                      struct ocfs2_refcount_tree **ret_tree,
-                                      struct buffer_head **ref_bh)
-{
-        int ret;
-        u64 ref_blkno;
-        ret = ocfs2_get_refcount_block(inode, &ref_blkno);
-        if (ret) {
-                mlog_errno(ret);
-                return ret;
-        }
-        return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno,
-                                        rw, ret_tree, ref_bh);
-}
 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
                                struct ocfs2_refcount_tree *tree, int rw)
 {
@@ -969,6 +952,103 @@ out:
 }
 /*
+ * Find the end range for a leaf refcount block indicated by
+ * el->l_recs[index].e_blkno.
+ *
+ * On success *cpos_end is set to the e_cpos of the extent record that
+ * follows the one at @index, or to UINT_MAX when nothing follows it.
+ * Returns 0 on success, -ENOMEM if a path cannot be allocated, or the
+ * error returned by one of the path lookup helpers.
+ */
+static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
+                                       struct buffer_head *ref_root_bh,
+                                       struct ocfs2_extent_block *eb,
+                                       struct ocfs2_extent_list *el,
+                                       int index, u32 *cpos_end)
+{
+        int ret, i, subtree_root;
+        u32 cpos;
+        u64 blkno;
+        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
+        struct ocfs2_path *left_path = NULL, *right_path = NULL;
+        struct ocfs2_extent_tree et;
+        struct ocfs2_extent_list *tmp_el;
+        if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
+                /*
+                 * We have an extent rec after index, so just use the e_cpos
+                 * of the next extent rec.
+                 */
+                *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
+                return 0;
+        }
+        if (!eb || !eb->h_next_leaf_blk) {
+                /*
+                 * We are the last extent rec, so any high cpos should
+                 * be stored in this leaf refcount block.
+                 */
+                *cpos_end = UINT_MAX;
+                return 0;
+        }
+        /*
+         * If the extent block isn't the last one, we have to find
+         * the subtree root between this extent block and the next
+         * leaf extent block and get the corresponding e_cpos from
+         * the subroot. Otherwise we may corrupt the b-tree.
+         */
+        ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
+        left_path = ocfs2_new_path_from_et(&et);
+        if (!left_path) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
+        ret = ocfs2_find_path(ci, left_path, cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        right_path = ocfs2_new_path_from_path(left_path);
+        if (!right_path) {
+                ret = -ENOMEM;
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        ret = ocfs2_find_path(ci, right_path, cpos);
+        if (ret) {
+                mlog_errno(ret);
+                goto out;
+        }
+        subtree_root = ocfs2_find_subtree_root(&et, left_path,
+                                               right_path);
+        tmp_el = left_path->p_node[subtree_root].el;
+        blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
+        /*
+         * l_next_free_rec is converted with le16_to_cpu() above; using a
+         * 32-bit conversion here would give a wrong bound on big-endian
+         * hosts.
+         */
+        for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
+                if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
+                        *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
+                        break;
+                }
+        }
+        /* The left path's child must be listed in the subtree root. */
+        BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
+out:
+        ocfs2_free_path(left_path);
+        ocfs2_free_path(right_path);
+        return ret;
+}
+/*
 * Given a cpos and len, try to find the refcount record which contains cpos.
 * 1. If cpos can be found in one refcount record, return the record.
 * 2. If cpos can't be found, return a fake record which start from cpos
@@ -983,10 +1063,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
                                  struct buffer_head **ret_bh)
 {
        int ret = 0, i, found;
-        u32 low_cpos;
+        u32 low_cpos, uninitialized_var(cpos_end);
        struct ocfs2_extent_list *el;
-        struct ocfs2_extent_rec *tmp, *rec = NULL;
+        struct ocfs2_extent_rec *rec = NULL;
-        struct ocfs2_extent_block *eb;
+        struct ocfs2_extent_block *eb = NULL;
        struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
        struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
        struct ocfs2_refcount_block *rb =
@@ -1034,12 +1114,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
                }
        }
-        /* adjust len when we have ocfs2_extent_rec after it. */
+        if (found) {
-        if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) {
+                ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
-                tmp = &el->l_recs[i+1];
+                                                  eb, el, i, &cpos_end);
+                if (ret) {
+                        mlog_errno(ret);
+                        goto out;
+                }
-                if (le32_to_cpu(tmp->e_cpos) < cpos + len)
+                if (cpos_end < low_cpos + len)
-                        len = le32_to_cpu(tmp->e_cpos) - cpos;
+                        len = cpos_end - low_cpos;
        }
        ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
@@ -1418,7 +1502,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
        /* change old and new rl_used accordingly. */
        le16_add_cpu(&rl->rl_used, -num_moved);
-        new_rl->rl_used = cpu_to_le32(num_moved);
+        new_rl->rl_used = cpu_to_le16(num_moved);
        sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
             sizeof(struct ocfs2_refcount_rec),
@@ -1797,7 +1881,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
                recs_need++;
        /* If the leaf block don't have enough record, expand it. */
-        if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) {
+        if (le16_to_cpu(rf_list->rl_used) + recs_need >
+                                         le16_to_cpu(rf_list->rl_count)) {
                struct ocfs2_refcount_rec tmp_rec;
                u64 cpos = le64_to_cpu(orig_rec->r_cpos);
                len = le32_to_cpu(orig_rec->r_clusters);
@@ -1859,7 +1944,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
                memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
                le64_add_cpu(&tail_rec->r_cpos,
                             le32_to_cpu(tail_rec->r_clusters) - len);
-                tail_rec->r_clusters = le32_to_cpu(len);
+                tail_rec->r_clusters = cpu_to_le32(len);
        }
        /*
@@ -3840,8 +3925,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode,
        }
        ret = ocfs2_insert_extent(handle, et, cpos,
-                        cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
+                        ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
-                                                             p_cluster)),
                        num_clusters, ext_flags, meta_ac);
        if (ret) {
                mlog_errno(ret);
@@ -4253,8 +4337,8 @@ static int ocfs2_user_path_parent(const char __user *path,
 * @new_dentry:        target dentry
 * @preserve:  if true, preserve all file attributes
 */
-int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
+static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
-                      struct dentry *new_dentry, bool preserve)
+                             struct dentry *new_dentry, bool preserve)
 {
        struct inode *inode = old_dentry->d_inode;
        int error;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ff4c798a5635..da78a2a334fd 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -814,7 +814,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
 static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
 {
        dlm_lockspace_t *fsdlm;
-        struct ocfs2_live_connection *control;
+        struct ocfs2_live_connection *uninitialized_var(control);
        int rc = 0;
        BUG_ON(conn == NULL);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 14f47d2bfe02..26069917a9f5 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -100,6 +100,8 @@ struct mount_options
 static int ocfs2_parse_options(struct super_block *sb, char *options,
                               struct mount_options *mopt,
                               int is_remount);
+static int ocfs2_check_set_options(struct super_block *sb,
+                                   struct mount_options *options);
 static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt);
 static void ocfs2_put_super(struct super_block *sb);
 static int ocfs2_mount_volume(struct super_block *sb);
@@ -600,7 +602,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
        lock_kernel();
-        if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
+        if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
+            !ocfs2_check_set_options(sb, &parsed_options)) {
                ret = -EINVAL;
                goto out;
        }
@@ -691,8 +694,6 @@ unlock_osb:
        if (!ret) {
                /* Only save off the new mount options in case of a successful
                 * remount. */
-                if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
-                        parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
                osb->s_mount_opt = parsed_options.mount_opt;
                osb->s_atime_quantum = parsed_options.atime_quantum;
                osb->preferred_slot = parsed_options.slot;
@@ -701,6 +702,10 @@ unlock_osb:
                if (!ocfs2_is_hard_readonly(osb))
                        ocfs2_set_journal_params(osb);
+                sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+                        ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ?
+                                                        MS_POSIXACL : 0);
        }
 out:
        unlock_kernel();
@@ -1011,31 +1016,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
        brelse(bh);
        bh = NULL;
-        if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+        if (!ocfs2_check_set_options(sb, &parsed_options)) {
-                parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+                status = -EINVAL;
+                goto read_super_error;
+        }
        osb->s_mount_opt = parsed_options.mount_opt;
        osb->s_atime_quantum = parsed_options.atime_quantum;
        osb->preferred_slot = parsed_options.slot;
        osb->osb_commit_interval = parsed_options.commit_interval;
        osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
        osb->local_alloc_bits = osb->local_alloc_default_bits;
-        if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
-            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
-                                         OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
-                status = -EINVAL;
-                mlog(ML_ERROR, "User quotas were requested, but this "
-                     "filesystem does not have the feature enabled.\n");
-                goto read_super_error;
-        }
-        if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
-            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
-                                         OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
-                status = -EINVAL;
-                mlog(ML_ERROR, "Group quotas were requested, but this "
-                     "filesystem does not have the feature enabled.\n");
-                goto read_super_error;
-        }
        status = ocfs2_verify_userspace_stack(osb, &parsed_options);
        if (status)
@@ -1245,6 +1235,40 @@ static struct file_system_type ocfs2_fs_type = {
        .next           = NULL
 };
+/*
+ * Check the parsed mount options in @options against the feature bits of
+ * @sb and fill in a default ACL setting when none was requested.
+ *
+ * A request is rejected (an error is logged with mlog() and 0 is returned)
+ * when:
+ *   - OCFS2_MOUNT_USRQUOTA is set but OCFS2_HAS_RO_COMPAT_FEATURE() reports
+ *     that OCFS2_FEATURE_RO_COMPAT_USRQUOTA is missing,
+ *   - OCFS2_MOUNT_GRPQUOTA is set but OCFS2_FEATURE_RO_COMPAT_GRPQUOTA is
+ *     missing, or
+ *   - OCFS2_MOUNT_POSIX_ACL is set but OCFS2_HAS_INCOMPAT_FEATURE() reports
+ *     that OCFS2_FEATURE_INCOMPAT_XATTR is missing.
+ *
+ * If neither OCFS2_MOUNT_POSIX_ACL nor OCFS2_MOUNT_NO_POSIX_ACL is set,
+ * options->mount_opt is modified in place: POSIX_ACL is turned on when the
+ * XATTR feature is present, NO_POSIX_ACL otherwise.
+ *
+ * Returns 1 when the options are acceptable, 0 when they are not.  Note
+ * that this is a boolean result, not a 0/-errno status.
+ */
+static int ocfs2_check_set_options(struct super_block *sb,
+                                   struct mount_options *options)
+{
+        if (options->mount_opt & OCFS2_MOUNT_USRQUOTA &&
+            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                                         OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+                mlog(ML_ERROR, "User quotas were requested, but this "
+                     "filesystem does not have the feature enabled.\n");
+                return 0;
+        }
+        if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+            !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+                                         OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+                mlog(ML_ERROR, "Group quotas were requested, but this "
+                     "filesystem does not have the feature enabled.\n");
+                return 0;
+        }
+        if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL &&
+            !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) {
+                mlog(ML_ERROR, "ACL support requested but extended attributes "
+                     "feature is not enabled\n");
+                return 0;
+        }
+        /* No ACL setting specified? Use XATTR feature... */
+        if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL |
+                                    OCFS2_MOUNT_NO_POSIX_ACL))) {
+                if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR))
+                        options->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+                else
+                        options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
+        }
+        return 1;
+}
 static int ocfs2_parse_options(struct super_block *sb,
                               char *options,
                               struct mount_options *mopt,
@@ -1392,40 +1416,19 @@ static int ocfs2_parse_options(struct super_block *sb,
                        mopt->mount_opt |= OCFS2_MOUNT_INODE64;
                        break;
                case Opt_usrquota:
-                        /* We check only on remount, otherwise features
-                         * aren't yet initialized. */
-                        if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
-                            OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
-                                mlog(ML_ERROR, "User quota requested but "
-                                     "filesystem feature is not set\n");
-                                status = 0;
-                                goto bail;
-                        }
                        mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
                        break;
                case Opt_grpquota:
-                        if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
-                            OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
-                                mlog(ML_ERROR, "Group quota requested but "
-                                     "filesystem feature is not set\n");
-                                status = 0;
-                                goto bail;
-                        }
                        mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
                        break;
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
                case Opt_acl:
                        mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+                        mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
                        break;
                case Opt_noacl:
+                        mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL;
                        mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
                        break;
-#else
-                case Opt_acl:
-                case Opt_noacl:
-                        printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
-                        break;
-#endif
                default:
                        mlog(ML_ERROR,
                             "Unrecognized mount option \"%s\" "
@@ -1502,12 +1505,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        if (opts & OCFS2_MOUNT_INODE64)
                seq_printf(s, ",inode64");
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
        if (opts & OCFS2_MOUNT_POSIX_ACL)
                seq_printf(s, ",acl");
        else
                seq_printf(s, ",noacl");
-#endif
        return 0;
 }
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index e3421030a69f..49b133ccbf11 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
        .getxattr       = generic_getxattr,
        .listxattr      = ocfs2_listxattr,
        .removexattr    = generic_removexattr,
+        .fiemap         = ocfs2_fiemap,
 };
 const struct inode_operations ocfs2_fast_symlink_inode_operations = {
        .readlink       = ocfs2_readlink,
@@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = {
        .getxattr       = generic_getxattr,
        .listxattr      = ocfs2_listxattr,
        .removexattr    = generic_removexattr,
+        .fiemap         = ocfs2_fiemap,
 };
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index fe3419068df2..8fc6fb071c6d 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
 struct xattr_handler *ocfs2_xattr_handlers[] = {
        &ocfs2_xattr_user_handler,
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
        &ocfs2_xattr_acl_access_handler,
        &ocfs2_xattr_acl_default_handler,
-#endif
        &ocfs2_xattr_trusted_handler,
        &ocfs2_xattr_security_handler,
        NULL
@@ -109,12 +107,10 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
 static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
        [OCFS2_XATTR_INDEX_USER]        = &ocfs2_xattr_user_handler,
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
        [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
                                        = &ocfs2_xattr_acl_access_handler,
        [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
                                        = &ocfs2_xattr_acl_default_handler,
-#endif
        [OCFS2_XATTR_INDEX_TRUSTED]     = &ocfs2_xattr_trusted_handler,
        [OCFS2_XATTR_INDEX_SECURITY]    = &ocfs2_xattr_security_handler,
 };
@@ -205,8 +201,6 @@ static int ocfs2_get_xattr_tree_value_root(struct super_block *sb,
                                           int offset,
                                           struct ocfs2_xattr_value_root **xv,
                                           struct buffer_head **bh);
-static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
-                                    const void *value, size_t size, int flags);
 static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
 {
@@ -6066,7 +6060,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
                 * to the extent block, so just calculate a maximum record num.
                 */
                if (!xv->xr_list.l_tree_depth)
-                        *num_recs += xv->xr_list.l_next_free_rec;
+                        *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec);
                else
                        *num_recs += ocfs2_clusters_for_bytes(sb,
                                                              XATTR_SIZE_MAX);
@@ -6978,9 +6972,9 @@ int ocfs2_init_security_and_acl(struct inode *dir,
        ret = ocfs2_init_security_get(inode, dir, &si);
        if (!ret) {
-                ret = ocfs2_xattr_security_set(inode, si.name,
+                ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
-                                               si.value, si.value_len,
+                                      si.name, si.value, si.value_len,
-                                               XATTR_CREATE);
+                                      XATTR_CREATE);
                if (ret) {
                        mlog_errno(ret);
                        goto leave;
@@ -7008,9 +7002,9 @@ leave:
 /*
 * 'security' attributes support
 */
-static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
                                        size_t list_size, const char *name,
-                                        size_t name_len)
+                                        size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
@@ -7023,23 +7017,23 @@ static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
        return total_len;
 }
-static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
+static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
-                                    void *buffer, size_t size)
+                                    void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
+        return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
-                               buffer, size);
+                               name, buffer, size);
 }
-static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
-                                    const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
+        return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY,
-                               size, flags);
+                               name, value, size, flags);
 }
 int ocfs2_init_security_get(struct inode *inode,
@@ -7076,9 +7070,9 @@ struct xattr_handler ocfs2_xattr_security_handler = {
 /*
 * 'trusted' attributes support
 */
-static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
+static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
                                       size_t list_size, const char *name,
-                                       size_t name_len)
+                                       size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
@@ -7091,23 +7085,23 @@ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
        return total_len;
 }
-static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name,
+static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
-                                   void *buffer, size_t size)
+                void *buffer, size_t size, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name,
+        return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
-                               buffer, size);
+                               name, buffer, size);
 }
-static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name,
+static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
-                                   const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value,
+        return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED,
-                               size, flags);
+                               name, value, size, flags);
 }
 struct xattr_handler ocfs2_xattr_trusted_handler = {
@@ -7120,13 +7114,13 @@ struct xattr_handler ocfs2_xattr_trusted_handler = {
 /*
 * 'user' attributes support
 */
-static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
+static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
                                    size_t list_size, const char *name,
-                                    size_t name_len)
+                                    size_t name_len, int type)
 {
        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
        const size_t total_len = prefix_len + name_len + 1;
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
        if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
                return 0;
@@ -7139,31 +7133,31 @@ static size_t ocfs2_xattr_user_list(struct inode *inode, char *list,
        return total_len;
 }
-static int ocfs2_xattr_user_get(struct inode *inode, const char *name,
+static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
-                                void *buffer, size_t size)
+                void *buffer, size_t size, int type)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
        if (strcmp(name, "") == 0)
                return -EINVAL;
        if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
                return -EOPNOTSUPP;
-        return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name,
+        return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name,
                               buffer, size);
 }
-static int ocfs2_xattr_user_set(struct inode *inode, const char *name,
+static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
-                                const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
        if (strcmp(name, "") == 0)
                return -EINVAL;
        if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
                return -EOPNOTSUPP;
-        return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value,
+        return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER,
-                               size, flags);
+                               name, value, size, flags);
 }
 struct xattr_handler ocfs2_xattr_user_handler = {
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 08e36389f56d..abd72a47f520 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info {
 extern struct xattr_handler ocfs2_xattr_user_handler;
 extern struct xattr_handler ocfs2_xattr_trusted_handler;
 extern struct xattr_handler ocfs2_xattr_security_handler;
-#ifdef CONFIG_OCFS2_FS_POSIX_ACL
 extern struct xattr_handler ocfs2_xattr_acl_access_handler;
 extern struct xattr_handler ocfs2_xattr_acl_default_handler;
-#endif
 extern struct xattr_handler *ocfs2_xattr_handlers[];
 ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/open.c b/fs/open.c
index b4b31d277f3a..040cef72bc00 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -30,6 +30,9 @@
 #include <linux/audit.h>
 #include <linux/falloc.h>
 #include <linux/fs_struct.h>
+#include <linux/ima.h>
+#include "internal.h"
 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
@@ -818,15 +821,14 @@ static inline int __get_file_write_access(struct inode *inode,
 }
 static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
-                                        int flags, struct file *f,
+                                        struct file *f,
                                        int (*open)(struct inode *, struct file *),
                                        const struct cred *cred)
 {
        struct inode *inode;
        int error;
-        f->f_flags = flags;
+        f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
-        f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK |
                                FMODE_PREAD | FMODE_PWRITE;
        inode = dentry->d_inode;
        if (f->f_mode & FMODE_WRITE) {
@@ -855,6 +857,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
                if (error)
                        goto cleanup_all;
        }
+        ima_counts_get(f);
        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
@@ -926,7 +929,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry
        if (IS_ERR(dentry))
                goto out_err;
        nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt),
-                                             nd->intent.open.flags - 1,
                                             nd->intent.open.file,
                                             open, cred);
 out:
@@ -945,7 +947,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp);
 *
 * Note that this function destroys the original nameidata
 */
-struct file *nameidata_to_filp(struct nameidata *nd, int flags)
+struct file *nameidata_to_filp(struct nameidata *nd)
 {
        const struct cred *cred = current_cred();
        struct file *filp;
@@ -954,7 +956,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags)
        filp = nd->intent.open.file;
        /* Has the filesystem initialised the file for us? */
        if (filp->f_path.dentry == NULL)
-                filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp,
+                filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
                                     NULL, cred);
        else
                path_put(&nd->path);
@@ -993,7 +995,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
                return ERR_PTR(error);
        }
-        return __dentry_open(dentry, mnt, flags, f, NULL, cred);
+        f->f_flags = flags;
+        return __dentry_open(dentry, mnt, f, NULL, cred);
 }
 EXPORT_SYMBOL(dentry_open);
diff --git a/fs/pipe.c b/fs/pipe.c
index ae17d026aaa3..37ba29ff3158 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -906,17 +906,6 @@ void free_pipe_info(struct inode *inode)
 }
 static struct vfsmount *pipe_mnt __read_mostly;
-static int pipefs_delete_dentry(struct dentry *dentry)
-{
-        /*
-         * At creation time, we pretended this dentry was hashed
-         * (by clearing DCACHE_UNHASHED bit in d_flags)
-         * At delete time, we restore the truth : not hashed.
-         * (so that dput() can proceed correctly)
-         */
-        dentry->d_flags |= DCACHE_UNHASHED;
-        return 0;
-}
 /*
 * pipefs_dname() is called from d_path().
@@ -928,7 +917,6 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
 }
 static const struct dentry_operations pipefs_dentry_operations = {
-        .d_delete       = pipefs_delete_dentry,
        .d_dname        = pipefs_dname,
 };
@@ -974,7 +962,7 @@ struct file *create_write_pipe(int flags)
        int err;
        struct inode *inode;
        struct file *f;
-        struct dentry *dentry;
+        struct path path;
        struct qstr name = { .name = "" };
        err = -ENFILE;
@@ -983,21 +971,16 @@ struct file *create_write_pipe(int flags)
                goto err;
        err = -ENOMEM;
-        dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
+        path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
-        if (!dentry)
+        if (!path.dentry)
                goto err_inode;
+        path.mnt = mntget(pipe_mnt);
-        dentry->d_op = &pipefs_dentry_operations;
+        path.dentry->d_op = &pipefs_dentry_operations;
-        /*
+        d_instantiate(path.dentry, inode);
-         * We dont want to publish this dentry into global dentry hash table.
-         * We pretend dentry is already hashed, by unsetting DCACHE_UNHASHED
-         * This permits a working /proc/$pid/fd/XXX on pipes
-         */
-        dentry->d_flags &= ~DCACHE_UNHASHED;
-        d_instantiate(dentry, inode);
        err = -ENFILE;
-        f = alloc_file(pipe_mnt, dentry, FMODE_WRITE, &write_pipefifo_fops);
+        f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
        if (!f)
                goto err_dentry;
        f->f_mapping = inode->i_mapping;
@@ -1009,7 +992,7 @@ struct file *create_write_pipe(int flags)
 err_dentry:
        free_pipe_info(inode);
-        dput(dentry);
+        path_put(&path);
        return ERR_PTR(err);
 err_inode:
@@ -1028,20 +1011,14 @@ void free_write_pipe(struct file *f)
 struct file *create_read_pipe(struct file *wrf, int flags)
 {
-        struct file *f = get_empty_filp();
+        /* Grab pipe from the writer */
+        struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
+                                    &read_pipefifo_fops);
        if (!f)
                return ERR_PTR(-ENFILE);
-        /* Grab pipe from the writer */
-        f->f_path = wrf->f_path;
        path_get(&wrf->f_path);
-        f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
-        f->f_pos = 0;
        f->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-        f->f_op = &read_pipefifo_fops;
-        f->f_mode = FMODE_READ;
-        f->f_version = 0;
        return f;
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 4badde179b18..f560325c444f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -134,13 +134,16 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
 * simple bit tests.
 */
 static const char *task_state_array[] = {
-        "R (running)",          /*  0 */
+        "R (running)",          /*   0 */
-        "S (sleeping)",         /*  1 */
+        "S (sleeping)",         /*   1 */
-        "D (disk sleep)",       /*  2 */
+        "D (disk sleep)",       /*   2 */
-        "T (stopped)",          /*  4 */
+        "T (stopped)",          /*   4 */
-        "T (tracing stop)",     /*  8 */
+        "t (tracing stop)",     /*   8 */
-        "Z (zombie)",           /* 16 */
+        "Z (zombie)",           /*  16 */
-        "X (dead)"              /* 32 */
+        "X (dead)",             /*  32 */
+        "x (dead)",             /*  64 */
+        "K (wakekill)",         /* 128 */
+        "W (waking)",           /* 256 */
 };
 static inline const char *get_task_state(struct task_struct *tsk)
@@ -148,6 +151,8 @@ static inline const char *get_task_state(struct task_struct *tsk)
        unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
        const char **p = &task_state_array[0];
+        BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
        while (state) {
                p++;
                state >>= 1;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 4df4a464a919..18d5cc62d8ed 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2266,7 +2266,7 @@ static const struct inode_operations proc_attr_dir_inode_operations = {
 #endif
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
                                         size_t count, loff_t *ppos)
 {
@@ -2623,7 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
 #endif
-#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
+#ifdef CONFIG_ELF_CORE
        REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
 #endif
 #ifdef CONFIG_TASK_IO_ACCOUNTING
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index fa678abc9db1..480cb1065eec 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -429,7 +429,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
                        unsigned int ino;
                        ino = de->low_ino;
-                        de_get(de);
+                        pde_get(de);
                        spin_unlock(&proc_subdir_lock);
                        error = -EINVAL;
                        inode = proc_get_inode(dir->i_sb, ino, de);
@@ -445,7 +445,7 @@ out_unlock:
                return NULL;
        }
        if (de)
-                de_put(de);
+                pde_put(de);
        return ERR_PTR(error);
 }
@@ -509,17 +509,17 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
                                struct proc_dir_entry *next;
                                /* filldir passes info to user space */
-                                de_get(de);
+                                pde_get(de);
                                spin_unlock(&proc_subdir_lock);
                                if (filldir(dirent, de->name, de->namelen, filp->f_pos,
                                            de->low_ino, de->mode >> 12) < 0) {
-                                        de_put(de);
+                                        pde_put(de);
                                        goto out;
                                }
                                spin_lock(&proc_subdir_lock);
                                filp->f_pos++;
                                next = de->next;
-                                de_put(de);
+                                pde_put(de);
                                de = next;
                        } while (de);
                        spin_unlock(&proc_subdir_lock);
@@ -763,7 +763,7 @@ out:
        return NULL;
 }
-void free_proc_entry(struct proc_dir_entry *de)
+static void free_proc_entry(struct proc_dir_entry *de)
 {
        unsigned int ino = de->low_ino;
@@ -777,6 +777,12 @@ void free_proc_entry(struct proc_dir_entry *de)
        kfree(de);
 }
+/*
+ * Drop one reference on @pde: decrement pde->count and, when
+ * atomic_dec_and_test() reports true, release the entry through
+ * free_proc_entry().  The caller must not touch @pde afterwards.
+ */
+void pde_put(struct proc_dir_entry *pde)
+{
+        if (atomic_dec_and_test(&pde->count))
+                free_proc_entry(pde);
+}
 /*
 * Remove a /proc entry and free it if it's not currently in use.
 */
@@ -845,6 +851,5 @@ continue_removing:
        WARN(de->subdir, KERN_WARNING "%s: removing non-empty directory "
                        "'%s/%s', leaking at least '%s'\n", __func__,
                        de->parent->name, de->name, de->subdir->name);
-        if (atomic_dec_and_test(&de->count))
+        pde_put(de);
-                free_proc_entry(de);
 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d78ade305541..445a02bcaab3 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -24,29 +24,6 @@
 #include "internal.h"
-struct proc_dir_entry *de_get(struct proc_dir_entry *de)
-{
-        atomic_inc(&de->count);
-        return de;
-}
-/*
- * Decrements the use count and checks for deferred deletion.
- */
-void de_put(struct proc_dir_entry *de)
-{
-        if (!atomic_read(&de->count)) {
-                printk("de_put: entry %s already free!\n", de->name);
-                return;
-        }
-        if (atomic_dec_and_test(&de->count))
-                free_proc_entry(de);
-}
-/*
- * Decrement the use count of the proc_dir_entry.
- */
 static void proc_delete_inode(struct inode *inode)
 {
        struct proc_dir_entry *de;
@@ -59,7 +36,7 @@ static void proc_delete_inode(struct inode *inode)
        /* Let go of any associated proc directory entry */
        de = PROC_I(inode)->pde;
        if (de)
-                de_put(de);
+                pde_put(de);
        if (PROC_I(inode)->sysctl)
                sysctl_head_put(PROC_I(inode)->sysctl);
        clear_inode(inode);
@@ -480,7 +457,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
                }
                unlock_new_inode(inode);
        } else
-               de_put(de);
+               pde_put(de);
        return inode;
 }                       
@@ -495,7 +472,7 @@ int proc_fill_super(struct super_block *s)
        s->s_op = &proc_sops;
        s->s_time_gran = 1;
        
-        de_get(&proc_root);
+        pde_get(&proc_root);
        root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root);
        if (!root_inode)
                goto out_no_root;
@@ -509,6 +486,6 @@ int proc_fill_super(struct super_block *s)
 out_no_root:
        printk("proc_read_super: get root inode failed\n");
        iput(root_inode);
-        de_put(&proc_root);
+        pde_put(&proc_root);
        return -ENOMEM;
 }
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 753ca37002c8..1f24a3eddd12 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -61,8 +61,6 @@ extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
 extern const struct inode_operations proc_net_inode_operations;
-void free_proc_entry(struct proc_dir_entry *de);
 void proc_init_inodecache(void);
 static inline struct pid *proc_pid(struct inode *inode)
@@ -101,8 +99,12 @@ unsigned long task_vsize(struct mm_struct *);
 int task_statm(struct mm_struct *, int *, int *, int *, int *);
 void task_mem(struct seq_file *, struct mm_struct *);
-struct proc_dir_entry *de_get(struct proc_dir_entry *de);
+/*
+ * Take one reference on @pde by incrementing pde->count.
+ * Returns @pde itself so the call can be used inline in an expression.
+ */
+static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
-void de_put(struct proc_dir_entry *de);
+{
+        atomic_inc(&pde->count);
+        return pde;
+}
+void pde_put(struct proc_dir_entry *pde);
 extern struct vfsmount *proc_mnt;
 int proc_fill_super(struct super_block *);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 5033ce0d254b..180cf5a0bd67 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
+#include <linux/kernel-page-flags.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -71,52 +72,12 @@ static const struct file_operations proc_kpagecount_operations = {
 * physical page flags.
 */
-/* These macros are used to decouple internal flags from exported ones */
-#define KPF_LOCKED              0
-#define KPF_ERROR               1
-#define KPF_REFERENCED          2
-#define KPF_UPTODATE            3
-#define KPF_DIRTY               4
-#define KPF_LRU                 5
-#define KPF_ACTIVE              6
-#define KPF_SLAB                7
-#define KPF_WRITEBACK           8
-#define KPF_RECLAIM             9
-#define KPF_BUDDY               10
-/* 11-20: new additions in 2.6.31 */
-#define KPF_MMAP                11
-#define KPF_ANON                12
-#define KPF_SWAPCACHE           13
-#define KPF_SWAPBACKED          14
-#define KPF_COMPOUND_HEAD       15
-#define KPF_COMPOUND_TAIL       16
-#define KPF_HUGE                17
-#define KPF_UNEVICTABLE         18
-#define KPF_HWPOISON            19
-#define KPF_NOPAGE              20
-#define KPF_KSM                 21
-/* kernel hacking assistances
- * WARNING: subject to change, never rely on them!
- */
-#define KPF_RESERVED            32
-#define KPF_MLOCKED             33
-#define KPF_MAPPEDTODISK        34
-#define KPF_PRIVATE             35
-#define KPF_PRIVATE_2           36
-#define KPF_OWNER_PRIVATE       37
-#define KPF_ARCH                38
-#define KPF_UNCACHED            39
 static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit)
 {
        return ((kflags >> kbit) & 1) << ubit;
 }
-static u64 get_uflags(struct page *page)
+u64 stable_page_flags(struct page *page)
 {
        u64 k;
        u64 u;
@@ -219,7 +180,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
                else
                        ppage = NULL;
-                if (put_user(get_uflags(ppage), out)) {
+                if (put_user(stable_page_flags(ppage), out)) {
                        ret = -EFAULT;
                        break;
                }
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 32f5d131a644..22e0d60e53ef 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -17,13 +17,6 @@
 #include <linux/bitops.h>
 #include "qnx4.h"
-#if 0
-int qnx4_new_block(struct super_block *sb)
-{
-        return 0;
-}
-#endif  /*  0  */
 static void count_bits(register const char *bmPart, register int size,
                       int *const tf)
 {
@@ -35,22 +28,7 @@ static void count_bits(register const char *bmPart, register int size,
        }
        do {
                b = *bmPart++;
-                if ((b & 1) == 0)
+                tot += 8 - hweight8(b);
-                        tot++;
-                if ((b & 2) == 0)
-                        tot++;
-                if ((b & 4) == 0)
-                        tot++;
-                if ((b & 8) == 0)
-                        tot++;
-                if ((b & 16) == 0)
-                        tot++;
-                if ((b & 32) == 0)
-                        tot++;
-                if ((b & 64) == 0)
-                        tot++;
-                if ((b & 128) == 0)
-                        tot++;
                size--;
        } while (size != 0);
        *tf = tot;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 449f5a66dd34..ebf3440d28ca 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -64,25 +64,7 @@ static struct buffer_head *qnx4_getblk(struct inode *inode, int nr,
                result = sb_getblk(inode->i_sb, nr);
                return result;
        }
-        if (!create) {
+        return NULL;
-                return NULL;
-        }
-#if 0
-        tmp = qnx4_new_block(inode->i_sb);
-        if (!tmp) {
-                return NULL;
-        }
-        result = sb_getblk(inode->i_sb, tmp);
-        if (tst) {
-                qnx4_free_block(inode->i_sb, tmp);
-                brelse(result);
-                goto repeat;
-        }
-        tst = tmp;
-#endif
-        inode->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(inode);
-        return result;
 }
 struct buffer_head *qnx4_bread(struct inode *inode, int block, int create)
@@ -113,8 +95,6 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
        if ( phys ) {
                // logical block is before EOF
                map_bh(bh, inode->i_sb, phys);
-        } else if ( create ) {
-                // to be done.
        }
        return 0;
 }
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index cd6bb9a33c13..dea86abdf2e7 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -323,6 +323,30 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
 }
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
+/* Dirtify all the dquots - this can block when journalling */
+static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
+{
+        int ret, err, cnt;
+        ret = err = 0;
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+                if (dquot[cnt])
+                        /* Even in case of error we have to continue */
+                        ret = mark_dquot_dirty(dquot[cnt]);
+                if (!err)
+                        err = ret;
+        }
+        return err;
+}
+static inline void dqput_all(struct dquot **dquot)
+{
+        unsigned int cnt;
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+                dqput(dquot[cnt]);
+}
 /* This function needs dq_list_lock */
 static inline int clear_dquot_dirty(struct dquot *dquot)
 {
@@ -1268,8 +1292,7 @@ int dquot_initialize(struct inode *inode, int type)
 out_err:
        up_write(&sb_dqopt(sb)->dqptr_sem);
        /* Drop unused references */
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+        dqput_all(got);
-                dqput(got[cnt]);
        return ret;
 }
 EXPORT_SYMBOL(dquot_initialize);
@@ -1288,9 +1311,7 @@ int dquot_drop(struct inode *inode)
                inode->i_dquot[cnt] = NULL;
        }
        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+        dqput_all(put);
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                dqput(put[cnt]);
        return 0;
 }
 EXPORT_SYMBOL(dquot_drop);
@@ -1319,6 +1340,67 @@ void vfs_dq_drop(struct inode *inode)
 EXPORT_SYMBOL(vfs_dq_drop);
 /*
+ * inode_reserved_space is managed internally by quota, and protected by
+ * i_lock similar to i_blocks+i_bytes.
+ */
+static qsize_t *inode_reserved_space(struct inode * inode)
+{
+        /* Filesystem must explicitly define its own method in order to use
+         * quota reservation interface */
+        BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
+        return inode->i_sb->dq_op->get_reserved_space(inode);
+}
+static void inode_add_rsv_space(struct inode *inode, qsize_t number)
+{
+        spin_lock(&inode->i_lock);
+        *inode_reserved_space(inode) += number;
+        spin_unlock(&inode->i_lock);
+}
+static void inode_claim_rsv_space(struct inode *inode, qsize_t number)
+{
+        spin_lock(&inode->i_lock);
+        *inode_reserved_space(inode) -= number;
+        __inode_add_bytes(inode, number);
+        spin_unlock(&inode->i_lock);
+}
+static void inode_sub_rsv_space(struct inode *inode, qsize_t number)
+{
+        spin_lock(&inode->i_lock);
+        *inode_reserved_space(inode) -= number;
+        spin_unlock(&inode->i_lock);
+}
+static qsize_t inode_get_rsv_space(struct inode *inode)
+{
+        qsize_t ret;
+        spin_lock(&inode->i_lock);
+        ret = *inode_reserved_space(inode);
+        spin_unlock(&inode->i_lock);
+        return ret;
+}
+static void inode_incr_space(struct inode *inode, qsize_t number,
+                                int reserve)
+{
+        if (reserve)
+                inode_add_rsv_space(inode, number);
+        else
+                inode_add_bytes(inode, number);
+}
+static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
+{
+        if (reserve)
+                inode_sub_rsv_space(inode, number);
+        else
+                inode_sub_bytes(inode, number);
+}
+/*
 * Following four functions update i_blocks+i_bytes fields and
 * quota information (together with appropriate checks)
 * NOTE: We absolutely rely on the fact that caller dirties
@@ -1336,6 +1418,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
        int cnt, ret = QUOTA_OK;
        char warntype[MAXQUOTAS];
+        /*
+         * First test before acquiring mutex - solves deadlocks when we
+         * re-enter the quota code and are already holding the mutex
+         */
+        if (IS_NOQUOTA(inode)) {
+                inode_incr_space(inode, number, reserve);
+                goto out;
+        }
+        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+        if (IS_NOQUOTA(inode)) {
+                inode_incr_space(inode, number, reserve);
+                goto out_unlock;
+        }
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warntype[cnt] = QUOTA_NL_NOWARN;
@@ -1346,7 +1443,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
                if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
                    == NO_QUOTA) {
                        ret = NO_QUOTA;
-                        goto out_unlock;
+                        spin_unlock(&dq_data_lock);
+                        goto out_flush_warn;
                }
        }
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1357,64 +1455,29 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
                else
                        dquot_incr_space(inode->i_dquot[cnt], number);
        }
-        if (!reserve)
+        inode_incr_space(inode, number, reserve);
-                inode_add_bytes(inode, number);
-out_unlock:
        spin_unlock(&dq_data_lock);
+        if (reserve)
+                goto out_flush_warn;
+        mark_all_dquot_dirty(inode->i_dquot);
+out_flush_warn:
        flush_warnings(inode->i_dquot, warntype);
+out_unlock:
+        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
        return ret;
 }
 int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
 {
-        int cnt, ret = QUOTA_OK;
+        return __dquot_alloc_space(inode, number, warn, 0);
-        /*
-         * First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex
-         */
-        if (IS_NOQUOTA(inode)) {
-                inode_add_bytes(inode, number);
-                goto out;
-        }
-        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-        if (IS_NOQUOTA(inode)) {
-                inode_add_bytes(inode, number);
-                goto out_unlock;
-        }
-        ret = __dquot_alloc_space(inode, number, warn, 0);
-        if (ret == NO_QUOTA)
-                goto out_unlock;
-        /* Dirtify all the dquots - this can block when journalling */
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                if (inode->i_dquot[cnt])
-                        mark_dquot_dirty(inode->i_dquot[cnt]);
-out_unlock:
-        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
-        return ret;
 }
 EXPORT_SYMBOL(dquot_alloc_space);
 int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
 {
-        int ret = QUOTA_OK;
+        return __dquot_alloc_space(inode, number, warn, 1);
-        if (IS_NOQUOTA(inode))
-                goto out;
-        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-        if (IS_NOQUOTA(inode))
-                goto out_unlock;
-        ret = __dquot_alloc_space(inode, number, warn, 1);
-out_unlock:
-        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
-        return ret;
 }
 EXPORT_SYMBOL(dquot_reserve_space);
@@ -1455,10 +1518,7 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 warn_put_all:
        spin_unlock(&dq_data_lock);
        if (ret == QUOTA_OK)
-                /* Dirtify all the dquots - this can block when journalling */
+                mark_all_dquot_dirty(inode->i_dquot);
-                for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                        if (inode->i_dquot[cnt])
-                                mark_dquot_dirty(inode->i_dquot[cnt]);
        flush_warnings(inode->i_dquot, warntype);
        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        return ret;
@@ -1471,14 +1531,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
        int ret = QUOTA_OK;
        if (IS_NOQUOTA(inode)) {
-                inode_add_bytes(inode, number);
+                inode_claim_rsv_space(inode, number);
                goto out;
        }
        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        if (IS_NOQUOTA(inode))  {
                up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-                inode_add_bytes(inode, number);
+                inode_claim_rsv_space(inode, number);
                goto out;
        }
@@ -1490,12 +1550,9 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
                                                        number);
        }
        /* Update inode bytes */
-        inode_add_bytes(inode, number);
+        inode_claim_rsv_space(inode, number);
        spin_unlock(&dq_data_lock);
-        /* Dirtify all the dquots - this can block when journalling */
+        mark_all_dquot_dirty(inode->i_dquot);
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                if (inode->i_dquot[cnt])
-                        mark_dquot_dirty(inode->i_dquot[cnt]);
        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 out:
        return ret;
@@ -1503,38 +1560,9 @@ out:
 EXPORT_SYMBOL(dquot_claim_space);
 /*
- * Release reserved quota space
- */
-void dquot_release_reserved_space(struct inode *inode, qsize_t number)
-{
-        int cnt;
-        if (IS_NOQUOTA(inode))
-                goto out;
-        down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-        if (IS_NOQUOTA(inode))
-                goto out_unlock;
-        spin_lock(&dq_data_lock);
-        /* Release reserved dquots */
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                if (inode->i_dquot[cnt])
-                        dquot_free_reserved_space(inode->i_dquot[cnt], number);
-        }
-        spin_unlock(&dq_data_lock);
-out_unlock:
-        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-out:
-        return;
-}
-EXPORT_SYMBOL(dquot_release_reserved_space);
-/*
 * This operation can block, but only after everything is updated
 */
-int dquot_free_space(struct inode *inode, qsize_t number)
+int __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
 {
        unsigned int cnt;
        char warntype[MAXQUOTAS];
@@ -1543,7 +1571,7 @@ int dquot_free_space(struct inode *inode, qsize_t number)
         * re-enter the quota code and are already holding the mutex */
        if (IS_NOQUOTA(inode)) {
 out_sub:
-                inode_sub_bytes(inode, number);
+                inode_decr_space(inode, number, reserve);
                return QUOTA_OK;
        }
@@ -1558,21 +1586,40 @@ out_sub:
                if (!inode->i_dquot[cnt])
                        continue;
                warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
-                dquot_decr_space(inode->i_dquot[cnt], number);
+                if (reserve)
+                        dquot_free_reserved_space(inode->i_dquot[cnt], number);
+                else
+                        dquot_decr_space(inode->i_dquot[cnt], number);
        }
-        inode_sub_bytes(inode, number);
+        inode_decr_space(inode, number, reserve);
        spin_unlock(&dq_data_lock);
-        /* Dirtify all the dquots - this can block when journalling */
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+        if (reserve)
-                if (inode->i_dquot[cnt])
+                goto out_unlock;
-                        mark_dquot_dirty(inode->i_dquot[cnt]);
+        mark_all_dquot_dirty(inode->i_dquot);
+out_unlock:
        flush_warnings(inode->i_dquot, warntype);
        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        return QUOTA_OK;
 }
+int dquot_free_space(struct inode *inode, qsize_t number)
+{
+        return  __dquot_free_space(inode, number, 0);
+}
 EXPORT_SYMBOL(dquot_free_space);
 /*
+ * Release reserved quota space
+ */
+void dquot_release_reserved_space(struct inode *inode, qsize_t number)
+{
+        __dquot_free_space(inode, number, 1);
+}
+EXPORT_SYMBOL(dquot_release_reserved_space);
+/*
 * This operation can block, but only after everything is updated
 */
 int dquot_free_inode(const struct inode *inode, qsize_t number)
@@ -1599,10 +1646,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
                dquot_decr_inodes(inode->i_dquot[cnt], number);
        }
        spin_unlock(&dq_data_lock);
-        /* Dirtify all the dquots - this can block when journalling */
+        mark_all_dquot_dirty(inode->i_dquot);
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                if (inode->i_dquot[cnt])
-                        mark_dquot_dirty(inode->i_dquot[cnt]);
        flush_warnings(inode->i_dquot, warntype);
        up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
        return QUOTA_OK;
@@ -1610,19 +1654,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
 EXPORT_SYMBOL(dquot_free_inode);
 /*
- * call back function, get reserved quota space from underlying fs
- */
-qsize_t dquot_get_reserved_space(struct inode *inode)
-{
-        qsize_t reserved_space = 0;
-        if (sb_any_quota_active(inode->i_sb) &&
-            inode->i_sb->dq_op->get_reserved_space)
-                reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
-        return reserved_space;
-}
-/*
 * Transfer the number of inode and blocks from one diskquota to an other.
 *
 * This operation can block, but only after everything is updated
@@ -1665,7 +1696,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
        }
        spin_lock(&dq_data_lock);
        cur_space = inode_get_bytes(inode);
-        rsv_space = dquot_get_reserved_space(inode);
+        rsv_space = inode_get_rsv_space(inode);
        space = cur_space + rsv_space;
        /* Build the transfer_from list and check the limits */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1709,25 +1740,18 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
        spin_unlock(&dq_data_lock);
        up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-        /* Dirtify all the dquots - this can block when journalling */
+        mark_all_dquot_dirty(transfer_from);
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+        mark_all_dquot_dirty(transfer_to);
-                if (transfer_from[cnt])
+        /* The reference we got is transferred to the inode */
-                        mark_dquot_dirty(transfer_from[cnt]);
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-                if (transfer_to[cnt]) {
+                transfer_to[cnt] = NULL;
-                        mark_dquot_dirty(transfer_to[cnt]);
-                        /* The reference we got is transferred to the inode */
-                        transfer_to[cnt] = NULL;
-                }
-        }
 warn_put_all:
        flush_warnings(transfer_to, warntype_to);
        flush_warnings(transfer_from, warntype_from_inodes);
        flush_warnings(transfer_from, warntype_from_space);
 put_all:
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+        dqput_all(transfer_from);
-                dqput(transfer_from[cnt]);
+        dqput_all(transfer_to);
-                dqput(transfer_to[cnt]);
-        }
        return ret;
 over_quota:
        spin_unlock(&dq_data_lock);
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 3dfc23e02135..e3da02f4986f 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -97,8 +97,11 @@ static int v2_read_file_info(struct super_block *sb, int type)
        unsigned int version;
        if (!v2_read_header(sb, type, &dqhead))
-                return 0;
+                return -1;
        version = le32_to_cpu(dqhead.dqh_version);
+        if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) ||
+            (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1))
+                return -1;
        size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
               sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
@@ -120,8 +123,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
                info->dqi_maxilimit = 0xffffffff;
        } else {
                /* used space is stored as unsigned 64-bit value */
-                info->dqi_maxblimit = 0xffffffffffffffff;       /* 2^64-1 */
+                info->dqi_maxblimit = 0xffffffffffffffffULL;    /* 2^64-1 */
-                info->dqi_maxilimit = 0xffffffffffffffff;
+                info->dqi_maxilimit = 0xffffffffffffffffULL;
        }
        info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
        info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 32fae4040ebf..2efc57173fd7 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -60,7 +60,7 @@ const struct inode_operations ramfs_file_inode_operations = {
 */
 int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 {
-        unsigned long npages, xpages, loop, limit;
+        unsigned long npages, xpages, loop;
        struct page *pages;
        unsigned order;
        void *data;
diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig
index ac7cd75c86f8..513f431038f9 100644
--- a/fs/reiserfs/Kconfig
+++ b/fs/reiserfs/Kconfig
@@ -1,7 +1,6 @@
 config REISERFS_FS
        tristate "Reiserfs support"
        select CRC32
-        select FS_JOURNAL_INFO
        help
          Stores not just filenames but the files themselves in a balanced
          tree.  Uses journalling.
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 6a9e30c041dd..792b3cb2cd18 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,11 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
 reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
                 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
                 hashes.o tail_conversion.o journal.o resize.o \
-                 item_ops.o ioctl.o procfs.o xattr.o lock.o
+                 item_ops.o ioctl.o xattr.o lock.o
+ifeq ($(CONFIG_REISERFS_PROC_INFO),y)
+reiserfs-objs += procfs.o
+endif
 ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
 reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 685495707181..65c872761177 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1277,7 +1277,10 @@ int reiserfs_init_bitmap_cache(struct super_block *sb)
        struct reiserfs_bitmap_info *bitmap;
        unsigned int bmap_nr = reiserfs_bmap_count(sb);
+        /* Avoid lock recursion in fault case */
+        reiserfs_write_unlock(sb);
        bitmap = vmalloc(sizeof(*bitmap) * bmap_nr);
+        reiserfs_write_lock(sb);
        if (bitmap == NULL)
                return -ENOMEM;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 3a28e7751b3c..1150ebb2536f 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -31,11 +31,12 @@ void reiserfs_delete_inode(struct inode *inode)
            JOURNAL_PER_BALANCE_CNT * 2 +
            2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
        struct reiserfs_transaction_handle th;
+        int depth;
        int err;
        truncate_inode_pages(&inode->i_data, 0);
-        reiserfs_write_lock(inode->i_sb);
+        depth = reiserfs_write_lock_once(inode->i_sb);
        /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
        if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {  /* also handles bad_inode case */
@@ -74,7 +75,7 @@ void reiserfs_delete_inode(struct inode *inode)
      out:
        clear_inode(inode);     /* note this must go after the journal_end to prevent deadlock */
        inode->i_blocks = 0;
-        reiserfs_write_unlock(inode->i_sb);
+        reiserfs_write_unlock_once(inode->i_sb, depth);
 }
 static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
@@ -2538,6 +2539,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
        return reiserfs_write_full_page(page, wbc);
 }
+static void reiserfs_truncate_failed_write(struct inode *inode)
+{
+        truncate_inode_pages(inode->i_mapping, inode->i_size);
+        reiserfs_truncate_file(inode, 0);
+}
 static int reiserfs_write_begin(struct file *file,
                                struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned flags,
@@ -2604,6 +2611,8 @@ static int reiserfs_write_begin(struct file *file,
        if (ret) {
                unlock_page(page);
                page_cache_release(page);
+                /* Truncate allocated blocks */
+                reiserfs_truncate_failed_write(inode);
        }
        return ret;
 }
@@ -2701,9 +2710,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
         ** transaction tracking stuff when the size changes.  So, we have
         ** to do the i_size updates here.
         */
-        pos += copied;
+        if (pos + copied > inode->i_size) {
-        if (pos > inode->i_size) {
                struct reiserfs_transaction_handle myth;
                lock_depth = reiserfs_write_lock_once(inode->i_sb);
                locked = true;
@@ -2721,7 +2728,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                        goto journal_error;
                reiserfs_update_inode_transaction(inode);
-                inode->i_size = pos;
+                inode->i_size = pos + copied;
                /*
                 * this will just nest into our transaction.  It's important
                 * to use mark_inode_dirty so the inode gets pushed around on the
@@ -2751,6 +2758,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
                reiserfs_write_unlock_once(inode->i_sb, lock_depth);
        unlock_page(page);
        page_cache_release(page);
+        if (pos + len > inode->i_size)
+                reiserfs_truncate_failed_write(inode);
        return ret == 0 ? copied : ret;
      journal_error:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 2f8a7e7b8dab..83ac4d3b3cb0 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2009,10 +2009,11 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
                destroy_workqueue(commit_wq);
                commit_wq = NULL;
        }
-        reiserfs_write_lock(sb);
        free_journal_ram(sb);
+        reiserfs_write_lock(sb);
        return 0;
 }
@@ -2758,11 +2759,18 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        struct reiserfs_journal *journal;
        struct reiserfs_journal_list *jl;
        char b[BDEVNAME_SIZE];
+        int ret;
+        /*
+         * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS
+         * dependency inversion warnings.
+         */
+        reiserfs_write_unlock(sb);
        journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal));
        if (!journal) {
                reiserfs_warning(sb, "journal-1256",
                                 "unable to get memory for journal structure");
+                reiserfs_write_lock(sb);
                return 1;
        }
        memset(journal, 0, sizeof(struct reiserfs_journal));
@@ -2771,10 +2779,12 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
        INIT_LIST_HEAD(&journal->j_working_list);
        INIT_LIST_HEAD(&journal->j_journal_list);
        journal->j_persistent_trans = 0;
-        if (reiserfs_allocate_list_bitmaps(sb,
+        ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap,
-                                           journal->j_list_bitmap,
+                                           reiserfs_bmap_count(sb));
-                                           reiserfs_bmap_count(sb)))
+        reiserfs_write_lock(sb);
+        if (ret)
                goto free_and_return;
        allocate_bitmap_nodes(sb);
        /* reserved for journal area support */
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index ee2cfc0fd8a7..b87aa2c1afc1 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -86,3 +86,12 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
                reiserfs_panic(sb, "%s called without kernel lock held %d",
                               caller);
 }
+#ifdef CONFIG_REISERFS_CHECK
+void reiserfs_lock_check_recursive(struct super_block *sb)
+{
+        struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
+        WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n");
+}
+#endif
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index e296ff72a6cc..9d4dcf0b07cb 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -921,6 +921,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
        struct reiserfs_transaction_handle th;
        int jbegin_count;
        unsigned long savelink;
+        int depth;
        inode = dentry->d_inode;
@@ -932,7 +933,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
            JOURNAL_PER_BALANCE_CNT * 2 + 2 +
            4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-        reiserfs_write_lock(dir->i_sb);
+        depth = reiserfs_write_lock_once(dir->i_sb);
        retval = journal_begin(&th, dir->i_sb, jbegin_count);
        if (retval)
                goto out_unlink;
@@ -993,7 +994,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
        retval = journal_end(&th, dir->i_sb, jbegin_count);
        reiserfs_check_path(&path);
-        reiserfs_write_unlock(dir->i_sb);
+        reiserfs_write_unlock_once(dir->i_sb, depth);
        return retval;
      end_unlink:
@@ -1003,7 +1004,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
        if (err)
                retval = err;
      out_unlink:
-        reiserfs_write_unlock(dir->i_sb);
+        reiserfs_write_unlock_once(dir->i_sb, depth);
        return retval;
 }
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 9229e5514a4e..7a9981196c1c 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -17,8 +17,6 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
-#ifdef CONFIG_REISERFS_PROC_INFO
 /*
 * LOCKING:
 *
@@ -48,14 +46,6 @@ static int show_version(struct seq_file *m, struct super_block *sb)
        return 0;
 }
-int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
-                                    int count, int *eof, void *data)
-{
-        *start = buffer;
-        *eof = 1;
-        return 0;
-}
 #define SF( x ) ( r -> x )
 #define SFP( x ) SF( s_proc_info_data.x )
 #define SFPL( x ) SFP( x[ level ] )
@@ -538,19 +528,6 @@ int reiserfs_proc_info_done(struct super_block *sb)
        return 0;
 }
-struct proc_dir_entry *reiserfs_proc_register_global(char *name,
-                                                     read_proc_t * func)
-{
-        return (proc_info_root) ? create_proc_read_entry(name, 0,
-                                                         proc_info_root,
-                                                         func, NULL) : NULL;
-}
-void reiserfs_proc_unregister_global(const char *name)
-{
-        remove_proc_entry(name, proc_info_root);
-}
 int reiserfs_proc_info_global_init(void)
 {
        if (proc_info_root == NULL) {
@@ -572,48 +549,6 @@ int reiserfs_proc_info_global_done(void)
        }
        return 0;
 }
-/* REISERFS_PROC_INFO */
-#else
-int reiserfs_proc_info_init(struct super_block *sb)
-{
-        return 0;
-}
-int reiserfs_proc_info_done(struct super_block *sb)
-{
-        return 0;
-}
-struct proc_dir_entry *reiserfs_proc_register_global(char *name,
-                                                     read_proc_t * func)
-{
-        return NULL;
-}
-void reiserfs_proc_unregister_global(const char *name)
-{;
-}
-int reiserfs_proc_info_global_init(void)
-{
-        return 0;
-}
-int reiserfs_proc_info_global_done(void)
-{
-        return 0;
-}
-int reiserfs_global_version_in_proc(char *buffer, char **start,
-                                    off_t offset,
-                                    int count, int *eof, void *data)
-{
-        return 0;
-}
-/* REISERFS_PROC_INFO */
-#endif
 /*
 * Revision 1.1.8.2  2001/07/15 17:08:42  god
 *  . use get_super() in procfs.c
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 339b0baf2af6..b4a7dd03bdb9 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2222,8 +2222,6 @@ static int __init init_reiserfs_fs(void)
        }
        reiserfs_proc_info_global_init();
-        reiserfs_proc_register_global("version",
-                                      reiserfs_global_version_in_proc);
        ret = register_filesystem(&reiserfs_fs_type);
@@ -2231,7 +2229,6 @@ static int __init init_reiserfs_fs(void)
                return 0;
        }
-        reiserfs_proc_unregister_global("version");
        reiserfs_proc_info_global_done();
        destroy_inodecache();
@@ -2240,7 +2237,6 @@ static int __init init_reiserfs_fs(void)
 static void __exit exit_reiserfs_fs(void)
 {
-        reiserfs_proc_unregister_global("version");
        reiserfs_proc_info_global_done();
        unregister_filesystem(&reiserfs_fs_type);
        destroy_inodecache();
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 58aa8e75f7f5..c3b004ee627b 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -48,6 +48,7 @@
 #include <net/checksum.h>
 #include <linux/stat.h>
 #include <linux/quotaops.h>
+#include <linux/security.h>
 #define PRIVROOT_NAME ".reiserfs_priv"
 #define XAROOT_NAME   "xattrs"
@@ -82,7 +83,8 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
        BUG_ON(!mutex_is_locked(&dir->i_mutex));
        vfs_dq_init(dir);
-        mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+        reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
+                                        I_MUTEX_CHILD, dir->i_sb);
        error = dir->i_op->unlink(dir, dentry);
        mutex_unlock(&dentry->d_inode->i_mutex);
@@ -97,7 +99,8 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
        BUG_ON(!mutex_is_locked(&dir->i_mutex));
        vfs_dq_init(dir);
-        mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
+        reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex,
+                                        I_MUTEX_CHILD, dir->i_sb);
        dentry_unhash(dentry);
        error = dir->i_op->rmdir(dir, dentry);
        if (!error)
@@ -234,16 +237,22 @@ static int reiserfs_for_each_xattr(struct inode *inode,
        if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1)
                return 0;
+        reiserfs_write_unlock(inode->i_sb);
        dir = open_xa_dir(inode, XATTR_REPLACE);
        if (IS_ERR(dir)) {
                err = PTR_ERR(dir);
+                reiserfs_write_lock(inode->i_sb);
                goto out;
        } else if (!dir->d_inode) {
                err = 0;
+                reiserfs_write_lock(inode->i_sb);
                goto out_dir;
        }
        mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR);
+        reiserfs_write_lock(inode->i_sb);
        buf.xadir = dir;
        err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos);
        while ((err == 0 || err == -ENOSPC) && buf.count) {
@@ -282,8 +291,9 @@ static int reiserfs_for_each_xattr(struct inode *inode,
                err = journal_begin(&th, inode->i_sb, blocks);
                if (!err) {
                        int jerror;
-                        mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
+                        reiserfs_mutex_lock_nested_safe(
-                                          I_MUTEX_XATTR);
+                                          &dir->d_parent->d_inode->i_mutex,
+                                          I_MUTEX_XATTR, inode->i_sb);
                        err = action(dir, data);
                        jerror = journal_end(&th, inode->i_sb, blocks);
                        mutex_unlock(&dir->d_parent->d_inode->i_mutex);
@@ -479,11 +489,16 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
        if (!buffer)
                return lookup_and_delete_xattr(inode, name);
+        reiserfs_write_unlock(inode->i_sb);
        dentry = xattr_lookup(inode, name, flags);
-        if (IS_ERR(dentry))
+        if (IS_ERR(dentry)) {
+                reiserfs_write_lock(inode->i_sb);
                return PTR_ERR(dentry);
+        }
-        down_write(&REISERFS_I(inode)->i_xattr_sem);
+        down_read(&REISERFS_I(inode)->i_xattr_sem);
+        reiserfs_write_lock(inode->i_sb);
        xahash = xattr_hash(buffer, buffer_size);
        while (buffer_pos < buffer_size || buffer_pos == 0) {
@@ -726,15 +741,14 @@ ssize_t
 reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
                  size_t size)
 {
-        struct inode *inode = dentry->d_inode;
        struct xattr_handler *handler;
-        handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+        handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
-        if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+        if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
                return -EOPNOTSUPP;
-        return handler->get(inode, name, buffer, size);
+        return handler->get(dentry, name, buffer, size, handler->flags);
 }
 /*
@@ -746,15 +760,14 @@ int
 reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
                  size_t size, int flags)
 {
-        struct inode *inode = dentry->d_inode;
        struct xattr_handler *handler;
-        handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+        handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
-        if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+        if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
                return -EOPNOTSUPP;
-        return handler->set(inode, name, value, size, flags);
+        return handler->set(dentry, name, value, size, flags, handler->flags);
 }
 /*
@@ -764,21 +777,20 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 */
 int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-        struct inode *inode = dentry->d_inode;
        struct xattr_handler *handler;
-        handler = find_xattr_handler_prefix(inode->i_sb->s_xattr, name);
+        handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
-        if (!handler || get_inode_sd_version(inode) == STAT_DATA_V1)
+        if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
                return -EOPNOTSUPP;
-        return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
+        return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
 }
 struct listxattr_buf {
        size_t size;
        size_t pos;
        char *buf;
-        struct inode *inode;
+        struct dentry *dentry;
 };
 static int listxattr_filler(void *buf, const char *name, int namelen,
@@ -789,17 +801,19 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
        if (name[0] != '.' ||
            (namelen != 1 && (name[1] != '.' || namelen != 2))) {
                struct xattr_handler *handler;
-                handler = find_xattr_handler_prefix(b->inode->i_sb->s_xattr,
+                handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
                                                    name);
                if (!handler)   /* Unsupported xattr name */
                        return 0;
                if (b->buf) {
-                        size = handler->list(b->inode, b->buf + b->pos,
+                        size = handler->list(b->dentry, b->buf + b->pos,
-                                         b->size, name, namelen);
+                                         b->size, name, namelen,
+                                         handler->flags);
                        if (size > b->size)
                                return -ERANGE;
                } else {
-                        size = handler->list(b->inode, NULL, 0, name, namelen);
+                        size = handler->list(b->dentry, NULL, 0, name,
+                                             namelen, handler->flags);
                }
                b->pos += size;
@@ -820,7 +834,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
        int err = 0;
        loff_t pos = 0;
        struct listxattr_buf buf = {
-                .inode = dentry->d_inode,
+                .dentry = dentry,
                .buf = buffer,
                .size = buffer ? size : 0,
        };
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 35d6e672a279..cc32e6ada67b 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -15,8 +15,10 @@ static int reiserfs_set_acl(struct reiserfs_transaction_handle *th,
                            struct posix_acl *acl);
 static int
-xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
+posix_acl_set(struct dentry *dentry, const char *name, const void *value,
+                size_t size, int flags, int type)
 {
+        struct inode *inode = dentry->d_inode;
        struct posix_acl *acl;
        int error, error2;
        struct reiserfs_transaction_handle th;
@@ -60,15 +62,16 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
 }
 static int
-xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
+posix_acl_get(struct dentry *dentry, const char *name, void *buffer,
+                size_t size, int type)
 {
        struct posix_acl *acl;
        int error;
-        if (!reiserfs_posixacl(inode->i_sb))
+        if (!reiserfs_posixacl(dentry->d_sb))
                return -EOPNOTSUPP;
-        acl = reiserfs_get_acl(inode, type);
+        acl = reiserfs_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl == NULL)
@@ -482,30 +485,12 @@ int reiserfs_acl_chmod(struct inode *inode)
        return error;
 }
-static int
+static size_t posix_acl_access_list(struct dentry *dentry, char *list,
-posix_acl_access_get(struct inode *inode, const char *name,
-                     void *buffer, size_t size)
-{
-        if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
-                return -EINVAL;
-        return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
-}
-static int
-posix_acl_access_set(struct inode *inode, const char *name,
-                     const void *value, size_t size, int flags)
-{
-        if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
-                return -EINVAL;
-        return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
-}
-static size_t posix_acl_access_list(struct inode *inode, char *list,
                                    size_t list_size, const char *name,
-                                    size_t name_len)
+                                    size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-        if (!reiserfs_posixacl(inode->i_sb))
+        if (!reiserfs_posixacl(dentry->d_sb))
                return 0;
        if (list && size <= list_size)
                memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
@@ -514,35 +499,18 @@ static size_t posix_acl_access_list(struct inode *inode, char *list,
 struct xattr_handler reiserfs_posix_acl_access_handler = {
        .prefix = POSIX_ACL_XATTR_ACCESS,
-        .get = posix_acl_access_get,
+        .flags = ACL_TYPE_ACCESS,
-        .set = posix_acl_access_set,
+        .get = posix_acl_get,
+        .set = posix_acl_set,
        .list = posix_acl_access_list,
 };
-static int
+static size_t posix_acl_default_list(struct dentry *dentry, char *list,
-posix_acl_default_get(struct inode *inode, const char *name,
-                      void *buffer, size_t size)
-{
-        if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
-                return -EINVAL;
-        return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
-}
-static int
-posix_acl_default_set(struct inode *inode, const char *name,
-                      const void *value, size_t size, int flags)
-{
-        if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
-                return -EINVAL;
-        return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
-}
-static size_t posix_acl_default_list(struct inode *inode, char *list,
                                     size_t list_size, const char *name,
-                                     size_t name_len)
+                                     size_t name_len, int type)
 {
        const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-        if (!reiserfs_posixacl(inode->i_sb))
+        if (!reiserfs_posixacl(dentry->d_sb))
                return 0;
        if (list && size <= list_size)
                memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
@@ -551,7 +519,8 @@ static size_t posix_acl_default_list(struct inode *inode, char *list,
 struct xattr_handler reiserfs_posix_acl_default_handler = {
        .prefix = POSIX_ACL_XATTR_DEFAULT,
-        .get = posix_acl_default_get,
+        .flags = ACL_TYPE_DEFAULT,
-        .set = posix_acl_default_set,
+        .get = posix_acl_get,
+        .set = posix_acl_set,
        .list = posix_acl_default_list,
 };
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index a92c8792c0f6..d8b5bfcbdd30 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -8,36 +8,37 @@
 #include <asm/uaccess.h>
 static int
-security_get(struct inode *inode, const char *name, void *buffer, size_t size)
+security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+                int handler_flags)
 {
        if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
                return -EINVAL;
-        if (IS_PRIVATE(inode))
+        if (IS_PRIVATE(dentry->d_inode))
                return -EPERM;
-        return reiserfs_xattr_get(inode, name, buffer, size);
+        return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 static int
-security_set(struct inode *inode, const char *name, const void *buffer,
+security_set(struct dentry *dentry, const char *name, const void *buffer,
-             size_t size, int flags)
+             size_t size, int flags, int handler_flags)
 {
        if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
                return -EINVAL;
-        if (IS_PRIVATE(inode))
+        if (IS_PRIVATE(dentry->d_inode))
                return -EPERM;
-        return reiserfs_xattr_set(inode, name, buffer, size, flags);
+        return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
-static size_t security_list(struct inode *inode, char *list, size_t list_len,
+static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
-                            const char *name, size_t namelen)
+                            const char *name, size_t namelen, int handler_flags)
 {
        const size_t len = namelen + 1;
-        if (IS_PRIVATE(inode))
+        if (IS_PRIVATE(dentry->d_inode))
                return 0;
        if (list && len <= list_len) {
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index a865042f75e2..5b08aaca3daf 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,36 +8,37 @@
 #include <asm/uaccess.h>
 static int
-trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
+trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+            int handler_flags)
 {
        if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
                return -EINVAL;
-        if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+        if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
                return -EPERM;
-        return reiserfs_xattr_get(inode, name, buffer, size);
+        return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 static int
-trusted_set(struct inode *inode, const char *name, const void *buffer,
+trusted_set(struct dentry *dentry, const char *name, const void *buffer,
-            size_t size, int flags)
+            size_t size, int flags, int handler_flags)
 {
        if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
                return -EINVAL;
-        if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+        if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
                return -EPERM;
-        return reiserfs_xattr_set(inode, name, buffer, size, flags);
+        return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
-static size_t trusted_list(struct inode *inode, char *list, size_t list_size,
+static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
-                           const char *name, size_t name_len)
+                           const char *name, size_t name_len, int handler_flags)
 {
        const size_t len = name_len + 1;
-        if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
+        if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode))
                return 0;
        if (list && len <= list_size) {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index e3238dc4f3db..75d59c49b911 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,34 +7,35 @@
 #include <asm/uaccess.h>
 static int
-user_get(struct inode *inode, const char *name, void *buffer, size_t size)
+user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
+         int handler_flags)
 {
        if (strlen(name) < sizeof(XATTR_USER_PREFIX))
                return -EINVAL;
-        if (!reiserfs_xattrs_user(inode->i_sb))
+        if (!reiserfs_xattrs_user(dentry->d_sb))
                return -EOPNOTSUPP;
-        return reiserfs_xattr_get(inode, name, buffer, size);
+        return reiserfs_xattr_get(dentry->d_inode, name, buffer, size);
 }
 static int
-user_set(struct inode *inode, const char *name, const void *buffer,
+user_set(struct dentry *dentry, const char *name, const void *buffer,
-         size_t size, int flags)
+         size_t size, int flags, int handler_flags)
 {
        if (strlen(name) < sizeof(XATTR_USER_PREFIX))
                return -EINVAL;
-        if (!reiserfs_xattrs_user(inode->i_sb))
+        if (!reiserfs_xattrs_user(dentry->d_sb))
                return -EOPNOTSUPP;
-        return reiserfs_xattr_set(inode, name, buffer, size, flags);
+        return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags);
 }
-static size_t user_list(struct inode *inode, char *list, size_t list_size,
+static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
-                        const char *name, size_t name_len)
+                        const char *name, size_t name_len, int handler_flags)
 {
        const size_t len = name_len + 1;
-        if (!reiserfs_xattrs_user(inode->i_sb))
+        if (!reiserfs_xattrs_user(dentry->d_sb))
                return 0;
        if (list && len <= list_size) {
                memcpy(list, name, name_len);
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b07565c94386..1dabe4ee02fe 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -236,7 +236,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
                 * anon_inode_getfd() will install the fd.
                 */
                ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx,
-                                       flags & (O_CLOEXEC | O_NONBLOCK));
+                                       O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK)));
                if (ufd < 0)
                        kfree(ctx);
        } else {
diff --git a/fs/stack.c b/fs/stack.c
index 67716f6a1a4a..4a6f7f440658 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -7,18 +7,63 @@
 * This function cannot be inlined since i_size_{read,write} is rather
 * heavy-weight on 32-bit systems
 */
-void fsstack_copy_inode_size(struct inode *dst, const struct inode *src)
+void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
 {
-        i_size_write(dst, i_size_read((struct inode *)src));
+        loff_t i_size;
-        dst->i_blocks = src->i_blocks;
+        blkcnt_t i_blocks;
+        /*
+         * i_size_read() includes its own seqlocking and protection from
+         * preemption (see include/linux/fs.h): we need nothing extra for
+         * that here, and prefer to avoid nesting locks rather than attempt to
+         * keep i_size and i_blocks in sync together.
+         */
+        i_size = i_size_read(src);
+        /*
+         * But if CONFIG_LBDAF (on 32-bit), we ought to make an effort to
+         * keep the two halves of i_blocks in sync despite SMP or PREEMPT -
+         * though stat's generic_fillattr() doesn't bother, and we won't be
+         * applying quotas (where i_blocks does become important) at the
+         * upper level.
+         *
+         * We don't actually know what locking is used at the lower level;
+         * but if it's a filesystem that supports quotas, it will be using
+         * i_lock as in inode_add_bytes().  tmpfs uses other locking, and
+         * its 32-bit is (just) able to exceed 2TB i_size with the aid of
+         * holes; but its i_blocks cannot carry into the upper long without
+         * almost 2TB swap - let's ignore that case.
+         */
+        if (sizeof(i_blocks) > sizeof(long))
+                spin_lock(&src->i_lock);
+        i_blocks = src->i_blocks;
+        if (sizeof(i_blocks) > sizeof(long))
+                spin_unlock(&src->i_lock);
+        /*
+         * If CONFIG_SMP or CONFIG_PREEMPT on 32-bit, it's vital for
+         * fsstack_copy_inode_size() to hold some lock around
+         * i_size_write(), otherwise i_size_read() may spin forever (see
+         * include/linux/fs.h).  We don't necessarily hold i_mutex when this
+         * is called, so take i_lock for that case.
+         *
+         * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
+         * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
+         * for that case too, and do both at once by combining the tests.
+         *
+         * There is none of this locking overhead in the 64-bit case.
+         */
+        if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+                spin_lock(&dst->i_lock);
+        i_size_write(dst, i_size);
+        dst->i_blocks = i_blocks;
+        if (sizeof(i_size) > sizeof(long) || sizeof(i_blocks) > sizeof(long))
+                spin_unlock(&dst->i_lock);
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_inode_size);
-/* copy all attributes; get_nlinks is optional way to override the i_nlink
+/* copy all attributes */
- * copying
+void fsstack_copy_attr_all(struct inode *dest, const struct inode *src)
- */
-void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
-                                int (*get_nlinks)(struct inode *))
 {
        dest->i_mode = src->i_mode;
        dest->i_uid = src->i_uid;
@@ -29,14 +74,6 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src,
        dest->i_ctime = src->i_ctime;
        dest->i_blkbits = src->i_blkbits;
        dest->i_flags = src->i_flags;
+        dest->i_nlink = src->i_nlink;
-        /*
-         * Update the nlinks AFTER updating the above fields, because the
-         * get_links callback may depend on them.
-         */
-        if (!get_nlinks)
-                dest->i_nlink = src->i_nlink;
-        else
-                dest->i_nlink = (*get_nlinks)(dest);
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
diff --git a/fs/stat.c b/fs/stat.c
index 075694e31d8b..c4ecd52c5737 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename,
 }
 #endif /* __ARCH_WANT_STAT64 */
-void inode_add_bytes(struct inode *inode, loff_t bytes)
+/* The caller is responsible for sufficient locking (i.e. inode->i_lock) */
+void __inode_add_bytes(struct inode *inode, loff_t bytes)
 {
-        spin_lock(&inode->i_lock);
        inode->i_blocks += bytes >> 9;
        bytes &= 511;
        inode->i_bytes += bytes;
@@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes)
                inode->i_blocks++;
                inode->i_bytes -= 512;
        }
+}
+void inode_add_bytes(struct inode *inode, loff_t bytes)
+{
+        spin_lock(&inode->i_lock);
+        __inode_add_bytes(inode, bytes);
        spin_unlock(&inode->i_lock);
 }
diff --git a/fs/super.c b/fs/super.c
index 19eb70b374bc..aff046b0fe78 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -901,8 +901,9 @@ int get_sb_single(struct file_system_type *fs_type,
                        return error;
                }
                s->s_flags |= MS_ACTIVE;
+        } else {
+                do_remount_sb(s, flags, data, 0);
        }
-        do_remount_sb(s, flags, data, 0);
        simple_set_mnt(mnt, s);
        return 0;
 }
diff --git a/fs/sync.c b/fs/sync.c
index 36752a683481..418727a2a239 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -355,6 +355,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
 {
        int ret;
        struct file *file;
+        struct address_space *mapping;
        loff_t endbyte;                 /* inclusive */
        int fput_needed;
        umode_t i_mode;
@@ -405,7 +406,28 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
                        !S_ISLNK(i_mode))
                goto out_put;
-        ret = do_sync_mapping_range(file->f_mapping, offset, endbyte, flags);
+        mapping = file->f_mapping;
+        if (!mapping) {
+                ret = -EINVAL;
+                goto out_put;
+        }
+        ret = 0;
+        if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
+                ret = filemap_fdatawait_range(mapping, offset, endbyte);
+                if (ret < 0)
+                        goto out_put;
+        }
+        if (flags & SYNC_FILE_RANGE_WRITE) {
+                ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+                if (ret < 0)
+                        goto out_put;
+        }
+        if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
+                ret = filemap_fdatawait_range(mapping, offset, endbyte);
 out_put:
        fput_light(file, fput_needed);
 out:
@@ -437,38 +459,3 @@ asmlinkage long SyS_sync_file_range2(long fd, long flags,
 }
 SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2);
 #endif
-/*
- * `endbyte' is inclusive
- */
-int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
-                          loff_t endbyte, unsigned int flags)
-{
-        int ret;
-        if (!mapping) {
-                ret = -EINVAL;
-                goto out;
-        }
-        ret = 0;
-        if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
-                ret = filemap_fdatawait_range(mapping, offset, endbyte);
-                if (ret < 0)
-                        goto out;
-        }
-        if (flags & SYNC_FILE_RANGE_WRITE) {
-                ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
-                                                WB_SYNC_ALL);
-                if (ret < 0)
-                        goto out;
-        }
-        if (flags & SYNC_FILE_RANGE_WAIT_AFTER) {
-                ret = filemap_fdatawait_range(mapping, offset, endbyte);
-        }
-out:
-        return ret;
-}
-EXPORT_SYMBOL_GPL(do_sync_mapping_range);
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 60c702bc10ae..a0a500af24a1 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd)
 *      @attr:  attribute descriptor.
 */
-int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+int sysfs_create_bin_file(struct kobject *kobj,
+                          const struct bin_attribute *attr)
 {
        BUG_ON(!kobj || !kobj->sd || !attr);
@@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr)
 *      @attr:  attribute descriptor.
 */
-void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr)
+void sysfs_remove_bin_file(struct kobject *kobj,
+                           const struct bin_attribute *attr)
 {
        sysfs_hash_and_remove(kobj->sd, attr->attr.name);
 }
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index f05f2303a8b8..699f371b9f12 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -106,8 +106,10 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
                        return NULL;
                t = atomic_cmpxchg(&sd->s_active, v, v + 1);
-                if (likely(t == v))
+                if (likely(t == v)) {
+                        rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
                        return sd;
+                }
                if (t < 0)
                        return NULL;
@@ -130,6 +132,7 @@ static void sysfs_put_active(struct sysfs_dirent *sd)
        if (unlikely(!sd))
                return;
+        rwsem_release(&sd->dep_map, 1, _RET_IP_);
        v = atomic_dec_return(&sd->s_active);
        if (likely(v != SD_DEACTIVATED_BIAS))
                return;
@@ -194,15 +197,21 @@ static void sysfs_deactivate(struct sysfs_dirent *sd)
        BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED));
        sd->s_sibling = (void *)&wait;
+        rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
        /* atomic_add_return() is a mb(), put_active() will always see
         * the updated sd->s_sibling.
         */
        v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
-        if (v != SD_DEACTIVATED_BIAS)
+        if (v != SD_DEACTIVATED_BIAS) {
+                lock_contended(&sd->dep_map, _RET_IP_);
                wait_for_completion(&wait);
+        }
        sd->s_sibling = NULL;
+        lock_acquired(&sd->dep_map, _RET_IP_);
+        rwsem_release(&sd->dep_map, 1, _RET_IP_);
 }
 static int sysfs_alloc_ino(ino_t *pino)
@@ -345,6 +354,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
        atomic_set(&sd->s_count, 1);
        atomic_set(&sd->s_active, 0);
+        sysfs_dirent_init_lockdep(sd);
        sd->s_name = name;
        sd->s_mode = mode;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index ca52e7b9d8f8..cdd9377a6e06 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,6 +8,7 @@
 * This file is released under the GPLv2.
 */
+#include <linux/lockdep.h>
 #include <linux/fs.h>
 struct sysfs_open_dirent;
@@ -50,6 +51,9 @@ struct sysfs_inode_attrs {
 struct sysfs_dirent {
        atomic_t                s_count;
        atomic_t                s_active;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+        struct lockdep_map      dep_map;
+#endif
        struct sysfs_dirent     *s_parent;
        struct sysfs_dirent     *s_sibling;
        const char              *s_name;
@@ -84,6 +88,17 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
        return sd->s_flags & SYSFS_TYPE_MASK;
 }
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define sysfs_dirent_init_lockdep(sd)                           \
+do {                                                            \
+        static struct lock_class_key __key;                     \
+                                                                \
+        lockdep_init_map(&sd->dep_map, "s_active", &__key, 0);  \
+} while(0)
+#else
+#define sysfs_dirent_init_lockdep(sd) do {} while(0)
+#endif
 /*
 * Context structure to be used while adding/removing nodes.
 */
diff --git a/fs/timerfd.c b/fs/timerfd.c
index b042bd7034b1..1bfc95ad5f71 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -200,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
        hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
        ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
-                               flags & TFD_SHARED_FCNTL_FLAGS);
+                               O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
        if (ufd < 0)
                kfree(ctx);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 39849f887e72..16a6444330ec 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -45,7 +45,7 @@
 *
 * Similarly, @i_mutex is not always locked in 'ubifs_readpage()', e.g., the
 * read-ahead path does not lock it ("sys_read -> generic_file_aio_read ->
- * ondemand_readahead -> readpage"). In case of readahead, @I_LOCK flag is not
+ * ondemand_readahead -> readpage"). In case of readahead, @I_SYNC flag is not
 * set as well. However, UBIFS disables readahead.
 */
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6f671f1ac271..22af68f8b682 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -70,13 +70,13 @@ static inline unsigned long ufs_dir_pages(struct inode *inode)
        return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
 }
-ino_t ufs_inode_by_name(struct inode *dir, struct dentry *dentry)
+ino_t ufs_inode_by_name(struct inode *dir, struct qstr *qstr)
 {
        ino_t res = 0;
        struct ufs_dir_entry *de;
        struct page *page;
        
-        de = ufs_find_entry(dir, dentry, &page);
+        de = ufs_find_entry(dir, qstr, &page);
        if (de) {
                res = fs32_to_cpu(dir->i_sb, de->d_ino);
                ufs_put_page(page);
@@ -249,12 +249,12 @@ struct ufs_dir_entry *ufs_dotdot(struct inode *dir, struct page **p)
 * (as a parameter - res_dir). Page is returned mapped and unlocked.
 * Entry is guaranteed to be valid.
 */
-struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry,
+struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct qstr *qstr,
                                     struct page **res_page)
 {
        struct super_block *sb = dir->i_sb;
-        const char *name = dentry->d_name.name;
+        const char *name = qstr->name;
-        int namelen = dentry->d_name.len;
+        int namelen = qstr->len;
        unsigned reclen = UFS_DIR_REC_LEN(namelen);
        unsigned long start, n;
        unsigned long npages = ufs_dir_pages(dir);
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 23119fe7ad62..4c26d9e8bc94 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -56,7 +56,7 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
                return ERR_PTR(-ENAMETOOLONG);
        lock_kernel();
-        ino = ufs_inode_by_name(dir, dentry);
+        ino = ufs_inode_by_name(dir, &dentry->d_name);
        if (ino) {
                inode = ufs_iget(dir->i_sb, ino);
                if (IS_ERR(inode)) {
@@ -237,7 +237,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
        struct page *page;
        int err = -ENOENT;
-        de = ufs_find_entry(dir, dentry, &page);
+        de = ufs_find_entry(dir, &dentry->d_name, &page);
        if (!de)
                goto out;
@@ -281,7 +281,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct ufs_dir_entry *old_de;
        int err = -ENOENT;
-        old_de = ufs_find_entry(old_dir, old_dentry, &old_page);
+        old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
        if (!old_de)
                goto out;
@@ -301,7 +301,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
                        goto out_dir;
                err = -ENOENT;
-                new_de = ufs_find_entry(new_dir, new_dentry, &new_page);
+                new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
                if (!new_de)
                        goto out_dir;
                inode_inc_link_count(old_inode);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 5faed7954d0a..143c20bfb04b 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -66,6 +66,7 @@
 */
+#include <linux/exportfs.h>
 #include <linux/module.h>
 #include <linux/bitops.h>
@@ -96,6 +97,56 @@
 #include "swab.h"
 #include "util.h"
+static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
+{
+        struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+        struct inode *inode;
+        if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
+                return ERR_PTR(-ESTALE);
+        inode = ufs_iget(sb, ino);
+        if (IS_ERR(inode))
+                return ERR_CAST(inode);
+        if (generation && inode->i_generation != generation) {
+                iput(inode);
+                return ERR_PTR(-ESTALE);
+        }
+        return inode;
+}
+static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                                       int fh_len, int fh_type)
+{
+        return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
+}
+static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid,
+                                       int fh_len, int fh_type)
+{
+        return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode);
+}
+static struct dentry *ufs_get_parent(struct dentry *child)
+{
+        struct qstr dot_dot = {
+                .name   = "..",
+                .len    = 2,
+        };
+        ino_t ino;
+        ino = ufs_inode_by_name(child->d_inode, &dot_dot);
+        if (!ino)
+                return ERR_PTR(-ENOENT);
+        return d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino));
+}
+static const struct export_operations ufs_export_ops = {
+        .fh_to_dentry   = ufs_fh_to_dentry,
+        .fh_to_parent   = ufs_fh_to_parent,
+        .get_parent     = ufs_get_parent,
+};
 #ifdef CONFIG_UFS_DEBUG
 /*
 * Print contents of ufs_super_block, useful for debugging
@@ -990,6 +1041,7 @@ magic_found:
         * Read ufs_super_block into internal data structures
         */
        sb->s_op = &ufs_super_ops;
+        sb->s_export_op = &ufs_export_ops;
        sb->dq_op = NULL; /***/
        sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 644e77e13599..0b4c39bc0d9e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -86,9 +86,9 @@ extern void ufs_put_cylinder (struct super_block *, unsigned);
 /* dir.c */
 extern const struct inode_operations ufs_dir_inode_operations;
 extern int ufs_add_link (struct dentry *, struct inode *);
-extern ino_t ufs_inode_by_name(struct inode *, struct dentry *);
+extern ino_t ufs_inode_by_name(struct inode *, struct qstr *);
 extern int ufs_make_empty(struct inode *, struct inode *);
-extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct dentry *, struct page **);
+extern struct ufs_dir_entry *ufs_find_entry(struct inode *, struct qstr *, struct page **);
 extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page *);
 extern int ufs_empty_dir (struct inode *);
 extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **);
diff --git a/fs/xattr.c b/fs/xattr.c
index 6d4f6d3449fb..46f87e828b48 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -615,12 +615,11 @@ ssize_t
 generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
 {
        struct xattr_handler *handler;
-        struct inode *inode = dentry->d_inode;
-        handler = xattr_resolve_name(inode->i_sb->s_xattr, &name);
+        handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
        if (!handler)
                return -EOPNOTSUPP;
-        return handler->get(inode, name, buffer, size);
+        return handler->get(dentry, name, buffer, size, handler->flags);
 }
 /*
@@ -630,18 +629,20 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
 ssize_t
 generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 {
-        struct inode *inode = dentry->d_inode;
+        struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
-        struct xattr_handler *handler, **handlers = inode->i_sb->s_xattr;
        unsigned int size = 0;
        if (!buffer) {
-                for_each_xattr_handler(handlers, handler)
+                for_each_xattr_handler(handlers, handler) {
-                        size += handler->list(inode, NULL, 0, NULL, 0);
+                        size += handler->list(dentry, NULL, 0, NULL, 0,
+                                              handler->flags);
+                }
        } else {
                char *buf = buffer;
                for_each_xattr_handler(handlers, handler) {
-                        size = handler->list(inode, buf, buffer_size, NULL, 0);
+                        size = handler->list(dentry, buf, buffer_size,
+                                             NULL, 0, handler->flags);
                        if (size > buffer_size)
                                return -ERANGE;
                        buf += size;
@@ -659,14 +660,13 @@ int
 generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
 {
        struct xattr_handler *handler;
-        struct inode *inode = dentry->d_inode;
        if (size == 0)
                value = "";  /* empty EA, do not remove */
-        handler = xattr_resolve_name(inode->i_sb->s_xattr, &name);
+        handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
        if (!handler)
                return -EOPNOTSUPP;
-        return handler->set(inode, name, value, size, flags);
+        return handler->set(dentry, name, value, size, flags, handler->flags);
 }
 /*
@@ -677,12 +677,12 @@ int
 generic_removexattr(struct dentry *dentry, const char *name)
 {
        struct xattr_handler *handler;
-        struct inode *inode = dentry->d_inode;
-        handler = xattr_resolve_name(inode->i_sb->s_xattr, &name);
+        handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
        if (!handler)
                return -EOPNOTSUPP;
-        return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
+        return handler->set(dentry, name, NULL, 0,
+                            XATTR_REPLACE, handler->flags);
 }
 EXPORT_SYMBOL(generic_getxattr);
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index 69e598b6986f..2512125dfa7c 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -354,37 +354,14 @@ xfs_acl_chmod(struct inode *inode)
        return error;
 }
-/*
- * System xattr handlers.
- *
- * Currently Posix ACLs are the only system namespace extended attribute
- * handlers supported by XFS, so we just implement the handlers here.
- * If we ever support other system extended attributes this will need
- * some refactoring.
- */
 static int
-xfs_decode_acl(const char *name)
+xfs_xattr_acl_get(struct dentry *dentry, const char *name,
-{
+                void *value, size_t size, int type)
-        if (strcmp(name, "posix_acl_access") == 0)
-                return ACL_TYPE_ACCESS;
-        else if (strcmp(name, "posix_acl_default") == 0)
-                return ACL_TYPE_DEFAULT;
-        return -EINVAL;
-}
-static int
-xfs_xattr_system_get(struct inode *inode, const char *name,
-                void *value, size_t size)
 {
        struct posix_acl *acl;
-        int type, error;
+        int error;
-        type = xfs_decode_acl(name);
-        if (type < 0)
-                return type;
-        acl = xfs_get_acl(inode, type);
+        acl = xfs_get_acl(dentry->d_inode, type);
        if (IS_ERR(acl))
                return PTR_ERR(acl);
        if (acl == NULL)
@@ -397,15 +374,13 @@ xfs_xattr_system_get(struct inode *inode, const char *name,
 }
 static int
-xfs_xattr_system_set(struct inode *inode, const char *name,
+xfs_xattr_acl_set(struct dentry *dentry, const char *name,
-                const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int type)
 {
+        struct inode *inode = dentry->d_inode;
        struct posix_acl *acl = NULL;
-        int error = 0, type;
+        int error = 0;
-        type = xfs_decode_acl(name);
-        if (type < 0)
-                return type;
        if (flags & XATTR_CREATE)
                return -EINVAL;
        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
@@ -462,8 +437,16 @@ xfs_xattr_system_set(struct inode *inode, const char *name,
        return error;
 }
-struct xattr_handler xfs_xattr_system_handler = {
+struct xattr_handler xfs_xattr_acl_access_handler = {
-        .prefix = XATTR_SYSTEM_PREFIX,
+        .prefix = POSIX_ACL_XATTR_ACCESS,
-        .get    = xfs_xattr_system_get,
+        .flags  = ACL_TYPE_ACCESS,
-        .set    = xfs_xattr_system_set,
+        .get    = xfs_xattr_acl_get,
+        .set    = xfs_xattr_acl_set,
+};
+struct xattr_handler xfs_xattr_acl_default_handler = {
+        .prefix = POSIX_ACL_XATTR_DEFAULT,
+        .flags  = ACL_TYPE_DEFAULT,
+        .get    = xfs_xattr_acl_get,
+        .set    = xfs_xattr_acl_set,
 };
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index d798c54296eb..66abe36c1213 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1474,19 +1474,13 @@ xfs_vm_direct_IO(
        bdev = xfs_find_bdev_for_inode(XFS_I(inode));
-        if (rw == WRITE) {
+        iocb->private = xfs_alloc_ioend(inode, rw == WRITE ?
-                iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
+                                        IOMAP_UNWRITTEN : IOMAP_READ);
-                ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
-                        bdev, iov, offset, nr_segs,
+        ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
-                        xfs_get_blocks_direct,
+                                            offset, nr_segs,
-                        xfs_end_io_direct);
+                                            xfs_get_blocks_direct,
-        } else {
+                                            xfs_end_io_direct);
-                iocb->private = xfs_alloc_ioend(inode, IOMAP_READ);
-                ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
-                        bdev, iov, offset, nr_segs,
-                        xfs_get_blocks_direct,
-                        xfs_end_io_direct);
-        }
        if (unlikely(ret != -EIOCBQUEUED && iocb->private))
                xfs_destroy_ioend(iocb->private);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index b4c7d4248aac..77b8be81c769 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -292,6 +292,7 @@ _xfs_buf_free_pages(
 {
        if (bp->b_pages != bp->b_page_array) {
                kmem_free(bp->b_pages);
+                bp->b_pages = NULL;
        }
 }
@@ -323,9 +324,8 @@ xfs_buf_free(
                                ASSERT(!PagePrivate(page));
                        page_cache_release(page);
                }
-                _xfs_buf_free_pages(bp);
        }
+        _xfs_buf_free_pages(bp);
        xfs_buf_deallocate(bp);
 }
@@ -1149,10 +1149,14 @@ _xfs_buf_ioapply(
        if (bp->b_flags & XBF_ORDERED) {
                ASSERT(!(bp->b_flags & XBF_READ));
                rw = WRITE_BARRIER;
-        } else if (bp->b_flags & _XBF_RUN_QUEUES) {
+        } else if (bp->b_flags & XBF_LOG_BUFFER) {
                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
                bp->b_flags &= ~_XBF_RUN_QUEUES;
                rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
+        } else if (bp->b_flags & _XBF_RUN_QUEUES) {
+                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
+                bp->b_flags &= ~_XBF_RUN_QUEUES;
+                rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META;
        } else {
                rw = (bp->b_flags & XBF_WRITE) ? WRITE :
                     (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a509f4addc2a..a34c7b54822d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -55,6 +55,7 @@ typedef enum {
        XBF_FS_MANAGED = (1 << 8),  /* filesystem controls freeing memory  */
        XBF_ORDERED = (1 << 11),    /* use ordered writes                  */
        XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead             */
+        XBF_LOG_BUFFER = (1 << 13), /* this is a buffer used for the log   */
        /* flags used only as arguments to access routines */
        XBF_LOCK = (1 << 14),       /* lock requested                      */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 1d5b298ba8b2..225946012d0b 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -794,7 +794,7 @@ xfs_setup_inode(
        struct inode            *inode = &ip->i_vnode;
        inode->i_ino = ip->i_ino;
-        inode->i_state = I_NEW|I_LOCK;
+        inode->i_state = I_NEW;
        inode_add_to_lists(ip->i_mount->m_super, inode);
        inode->i_mode   = ip->i_d.di_mode;
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index 497c7fb75cc1..0b1878857fc3 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -30,10 +30,10 @@
 static int
-__xfs_xattr_get(struct inode *inode, const char *name,
+xfs_xattr_get(struct dentry *dentry, const char *name,
                void *value, size_t size, int xflags)
 {
-        struct xfs_inode *ip = XFS_I(inode);
+        struct xfs_inode *ip = XFS_I(dentry->d_inode);
        int error, asize = size;
        if (strcmp(name, "") == 0)
@@ -52,10 +52,10 @@ __xfs_xattr_get(struct inode *inode, const char *name,
 }
 static int
-__xfs_xattr_set(struct inode *inode, const char *name, const void *value,
+xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
                size_t size, int flags, int xflags)
 {
-        struct xfs_inode *ip = XFS_I(inode);
+        struct xfs_inode *ip = XFS_I(dentry->d_inode);
        if (strcmp(name, "") == 0)
                return -EINVAL;
@@ -71,75 +71,34 @@ __xfs_xattr_set(struct inode *inode, const char *name, const void *value,
        return -xfs_attr_set(ip, name, (void *)value, size, xflags);
 }
-static int
-xfs_xattr_user_get(struct inode *inode, const char *name,
-                void *value, size_t size)
-{
-        return __xfs_xattr_get(inode, name, value, size, 0);
-}
-static int
-xfs_xattr_user_set(struct inode *inode, const char *name,
-                const void *value, size_t size, int flags)
-{
-        return __xfs_xattr_set(inode, name, value, size, flags, 0);
-}
 static struct xattr_handler xfs_xattr_user_handler = {
        .prefix = XATTR_USER_PREFIX,
-        .get    = xfs_xattr_user_get,
+        .flags  = 0, /* no flags implies user namespace */
-        .set    = xfs_xattr_user_set,
+        .get    = xfs_xattr_get,
+        .set    = xfs_xattr_set,
 };
-static int
-xfs_xattr_trusted_get(struct inode *inode, const char *name,
-                void *value, size_t size)
-{
-        return __xfs_xattr_get(inode, name, value, size, ATTR_ROOT);
-}
-static int
-xfs_xattr_trusted_set(struct inode *inode, const char *name,
-                const void *value, size_t size, int flags)
-{
-        return __xfs_xattr_set(inode, name, value, size, flags, ATTR_ROOT);
-}
 static struct xattr_handler xfs_xattr_trusted_handler = {
        .prefix = XATTR_TRUSTED_PREFIX,
-        .get    = xfs_xattr_trusted_get,
+        .flags  = ATTR_ROOT,
-        .set    = xfs_xattr_trusted_set,
+        .get    = xfs_xattr_get,
+        .set    = xfs_xattr_set,
 };
-static int
-xfs_xattr_secure_get(struct inode *inode, const char *name,
-                void *value, size_t size)
-{
-        return __xfs_xattr_get(inode, name, value, size, ATTR_SECURE);
-}
-static int
-xfs_xattr_secure_set(struct inode *inode, const char *name,
-                const void *value, size_t size, int flags)
-{
-        return __xfs_xattr_set(inode, name, value, size, flags, ATTR_SECURE);
-}
 static struct xattr_handler xfs_xattr_security_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
-        .get    = xfs_xattr_secure_get,
+        .flags  = ATTR_SECURE,
-        .set    = xfs_xattr_secure_set,
+        .get    = xfs_xattr_get,
+        .set    = xfs_xattr_set,
 };
 struct xattr_handler *xfs_xattr_handlers[] = {
        &xfs_xattr_user_handler,
        &xfs_xattr_trusted_handler,
        &xfs_xattr_security_handler,
 #ifdef CONFIG_XFS_POSIX_ACL
-        &xfs_xattr_system_handler,
+        &xfs_xattr_acl_access_handler,
+        &xfs_xattr_acl_default_handler,
 #endif
        NULL
 };
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 947b150df8ed..00fd357c3e46 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,7 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
 extern int posix_acl_access_exists(struct inode *inode);
 extern int posix_acl_default_exists(struct inode *inode);
-extern struct xattr_handler xfs_xattr_system_handler;
+extern struct xattr_handler xfs_xattr_acl_access_handler;
+extern struct xattr_handler xfs_xattr_acl_default_handler;
 #else
 # define xfs_check_acl                                  NULL
 # define xfs_get_acl(inode, type)                       NULL
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 5549d495947f..cf07ca7c22e7 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -46,20 +46,12 @@ typedef struct xfs_bmdr_block {
 #define BMBT_STARTBLOCK_BITLEN  52
 #define BMBT_BLOCKCOUNT_BITLEN  21
+typedef struct xfs_bmbt_rec {
-#define BMBT_USE_64     1
-typedef struct xfs_bmbt_rec_32
-{
-        __uint32_t              l0, l1, l2, l3;
-} xfs_bmbt_rec_32_t;
-typedef struct xfs_bmbt_rec_64
-{
        __be64                  l0, l1;
-} xfs_bmbt_rec_64_t;
+} xfs_bmbt_rec_t;
 typedef __uint64_t      xfs_bmbt_rec_base_t;    /* use this for casts */
-typedef xfs_bmbt_rec_64_t xfs_bmbt_rec_t, xfs_bmdr_rec_t;
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
 typedef struct xfs_bmbt_rec_host {
        __uint64_t              l0, l1;
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index f5c904a10c11..fa402a6bbbcf 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -91,7 +91,7 @@ xfs_inode_alloc(
        ip->i_new_size = 0;
        /* prevent anyone from using this yet */
-        VFS_I(ip)->i_state = I_NEW|I_LOCK;
+        VFS_I(ip)->i_state = I_NEW;
        return ip;
 }
@@ -217,7 +217,7 @@ xfs_iget_cache_hit(
                        trace_xfs_iget_reclaim(ip);
                        goto out_error;
                }
-                inode->i_state = I_LOCK|I_NEW;
+                inode->i_state = I_NEW;
        } else {
                /* If the VFS inode is being torn down, pause and try again. */
                if (!igrab(inode)) {
@@ -478,17 +478,21 @@ xfs_ireclaim(
 {
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_perag        *pag;
+        xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
        XFS_STATS_INC(xs_ig_reclaims);
        /*
-         * Remove the inode from the per-AG radix tree.  It doesn't matter
+         * Remove the inode from the per-AG radix tree.
-         * if it was never added to it because radix_tree_delete can deal
+         *
-         * with that case just fine.
+         * Because radix_tree_delete won't complain even if the item was never
+         * added to the tree, assert that it's been there before to catch
+         * problems with the inode lifetime early on.
         */
        pag = xfs_get_perag(mp, ip->i_ino);
        write_lock(&pag->pag_ici_lock);
-        radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
+        if (!radix_tree_delete(&pag->pag_ici_root, agino))
+                ASSERT(0);
        write_unlock(&pag->pag_ici_lock);
        xfs_put_perag(mp, pag);
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 65bae4c9b8bf..cc8df1ac7783 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -127,7 +127,7 @@ static inline int xfs_ilog_fdata(int w)
 #ifdef __KERNEL__
 struct xfs_buf;
-struct xfs_bmbt_rec_64;
+struct xfs_bmbt_rec;
 struct xfs_inode;
 struct xfs_mount;
@@ -140,9 +140,9 @@ typedef struct xfs_inode_log_item {
        unsigned short          ili_flags;         /* misc flags */
        unsigned short          ili_logged;        /* flushed logged data */
        unsigned int            ili_last_fields;   /* fields when flushed */
-        struct xfs_bmbt_rec_64  *ili_extents_buf;  /* array of logged
+        struct xfs_bmbt_rec     *ili_extents_buf;  /* array of logged
                                                      data exts */
-        struct xfs_bmbt_rec_64  *ili_aextents_buf; /* array of logged
+        struct xfs_bmbt_rec     *ili_aextents_buf; /* array of logged
                                                      attr exts */
        unsigned int            ili_pushbuf_flag;  /* one bit used in push_ail */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 4cb1792040e3..600b5b06aaeb 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1441,6 +1441,7 @@ xlog_sync(xlog_t		*log,
        XFS_BUF_ZEROFLAGS(bp);
        XFS_BUF_BUSY(bp);
        XFS_BUF_ASYNC(bp);
+        bp->b_flags |= XBF_LOG_BUFFER;
        /*
         * Do an ordered write for the log block.
         * Its unnecessary to flush the first split block in the log wrap case.
@@ -1478,6 +1479,7 @@ xlog_sync(xlog_t		*log,
                XFS_BUF_ZEROFLAGS(bp);
                XFS_BUF_BUSY(bp);
                XFS_BUF_ASYNC(bp);
+                bp->b_flags |= XBF_LOG_BUFFER;
                if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
                        XFS_BUF_ORDERED(bp);
                dptr = XFS_BUF_PTR(bp);