author	Dmitry Torokhov <dmitry.torokhov@gmail.com>	2010-05-19 13:12:41 -0400
committer	Dmitry Torokhov <dmitry.torokhov@gmail.com>	2010-05-19 13:12:41 -0400
commit	8d0bc2b456103a34c11e01305cd1aed1cde579e5 (patch)
tree	5e1e6ad55cc9e2b5c5617f6f320114b8cff9e3f3 /fs
parent	30ba3ead05763b172acaa65ae1be71af2a878940 (diff)
parent	e40152ee1e1c7a63f4777791863215e3faa37a86 (diff)
Merge commit 'v2.6.34' into next
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/v9fs.c	| 10
-rw-r--r--	fs/9p/v9fs.h	| 2
-rw-r--r--	fs/9p/vfs_super.c	| 1
-rw-r--r--	fs/afs/internal.h	| 2
-rw-r--r--	fs/afs/mntpt.c	| 24
-rw-r--r--	fs/afs/super.c	| 1
-rw-r--r--	fs/afs/volume.c	| 7
-rw-r--r--	fs/autofs4/root.c	| 5
-rw-r--r--	fs/binfmt_elf_fdpic.c	| 7
-rw-r--r--	fs/binfmt_flat.c	| 2
-rw-r--r--	fs/block_dev.c	| 17
-rw-r--r--	fs/btrfs/disk-io.c	| 12
-rw-r--r--	fs/btrfs/ioctl.c	| 5
-rw-r--r--	fs/cachefiles/internal.h	| 1
-rw-r--r--	fs/cachefiles/namei.c	| 98
-rw-r--r--	fs/cachefiles/security.c	| 4
-rw-r--r--	fs/ceph/addr.c	| 70
-rw-r--r--	fs/ceph/auth.c	| 1
-rw-r--r--	fs/ceph/auth_none.h	| 2
-rw-r--r--	fs/ceph/auth_x.c	| 32
-rw-r--r--	fs/ceph/caps.c	| 63
-rw-r--r--	fs/ceph/dir.c	| 16
-rw-r--r--	fs/ceph/file.c	| 3
-rw-r--r--	fs/ceph/inode.c	| 18
-rw-r--r--	fs/ceph/mds_client.c	| 34
-rw-r--r--	fs/ceph/messenger.c	| 48
-rw-r--r--	fs/ceph/messenger.h	| 1
-rw-r--r--	fs/ceph/osd_client.c	| 26
-rw-r--r--	fs/ceph/osd_client.h	| 3
-rw-r--r--	fs/ceph/osdmap.c	| 209
-rw-r--r--	fs/ceph/osdmap.h	| 3
-rw-r--r--	fs/ceph/rados.h	| 7
-rw-r--r--	fs/ceph/snap.c	| 50
-rw-r--r--	fs/ceph/super.c	| 30
-rw-r--r--	fs/ceph/super.h	| 4
-rw-r--r--	fs/cifs/cifs_fs_sb.h	| 3
-rw-r--r--	fs/cifs/cifsfs.c	| 10
-rw-r--r--	fs/cifs/cifsglob.h	| 1
-rw-r--r--	fs/cifs/inode.c	| 21
-rw-r--r--	fs/coda/inode.c	| 8
-rw-r--r--	fs/compat.c	| 2
-rw-r--r--	fs/compat_ioctl.c	| 3
-rw-r--r--	fs/configfs/dir.c	| 4
-rw-r--r--	fs/ecryptfs/crypto.c	| 37
-rw-r--r--	fs/ecryptfs/ecryptfs_kernel.h	| 15
-rw-r--r--	fs/ecryptfs/inode.c	| 129
-rw-r--r--	fs/ecryptfs/main.c	| 10
-rw-r--r--	fs/ecryptfs/mmap.c	| 38
-rw-r--r--	fs/ecryptfs/super.c	| 2
-rw-r--r--	fs/exec.c	| 2
-rw-r--r--	fs/exofs/exofs.h	| 2
-rw-r--r--	fs/exofs/super.c	| 8
-rw-r--r--	fs/ext4/extents.c	| 1
-rw-r--r--	fs/ext4/inode.c	| 3
-rw-r--r--	fs/ext4/mballoc.c	| 21
-rw-r--r--	fs/ioctl.c	| 92
-rw-r--r--	fs/jfs/inode.c	| 2
-rw-r--r--	fs/jfs/jfs_dmap.c	| 16
-rw-r--r--	fs/jfs/jfs_dmap.h	| 6
-rw-r--r--	fs/jfs/jfs_inode.h	| 1
-rw-r--r--	fs/jfs/namei.c	| 4
-rw-r--r--	fs/jfs/resize.c	| 6
-rw-r--r--	fs/jfs/super.c	| 13
-rw-r--r--	fs/jfs/symlink.c	| 14
-rw-r--r--	fs/logfs/gc.c	| 8
-rw-r--r--	fs/logfs/journal.c	| 29
-rw-r--r--	fs/logfs/logfs.h	| 15
-rw-r--r--	fs/logfs/readwrite.c	| 75
-rw-r--r--	fs/logfs/segment.c	| 8
-rw-r--r--	fs/logfs/super.c	| 25
-rw-r--r--	fs/namei.c	| 27
-rw-r--r--	fs/namespace.c	| 6
-rw-r--r--	fs/ncpfs/inode.c	| 8
-rw-r--r--	fs/nfs/client.c	| 5
-rw-r--r--	fs/nfs/delegation.c	| 86
-rw-r--r--	fs/nfs/dir.c	| 6
-rw-r--r--	fs/nfs/inode.c	| 8
-rw-r--r--	fs/nfs/nfs4proc.c	| 9
-rw-r--r--	fs/nfs/super.c	| 3
-rw-r--r--	fs/nfs/write.c	| 95
-rw-r--r--	fs/nfsd/nfs4xdr.c	| 8
-rw-r--r--	fs/nilfs2/super.c	| 1
-rw-r--r--	fs/notify/inotify/Kconfig	| 1
-rw-r--r--	fs/notify/inotify/inotify_fsnotify.c	| 2
-rw-r--r--	fs/notify/inotify/inotify_user.c	| 16
-rw-r--r--	fs/ocfs2/buffer_head_io.c	| 2
-rw-r--r--	fs/ocfs2/dlm/dlmast.c	| 5
-rw-r--r--	fs/ocfs2/dlmfs/dlmfs.c	| 14
-rw-r--r--	fs/ocfs2/file.c	| 32
-rw-r--r--	fs/ocfs2/inode.c	| 68
-rw-r--r--	fs/ocfs2/inode.h	| 2
-rw-r--r--	fs/ocfs2/namei.c	| 58
-rw-r--r--	fs/ocfs2/refcounttree.c	| 3
-rw-r--r--	fs/proc/array.c	| 3
-rw-r--r--	fs/proc/base.c	| 2
-rw-r--r--	fs/proc/task_mmu.c	| 19
-rw-r--r--	fs/quota/Kconfig	| 8
-rw-r--r--	fs/quota/dquot.c	| 16
-rw-r--r--	fs/reiserfs/dir.c	| 2
-rw-r--r--	fs/reiserfs/xattr.c	| 19
-rw-r--r--	fs/smbfs/inode.c	| 8
-rw-r--r--	fs/squashfs/block.c	| 5
-rw-r--r--	fs/squashfs/super.c	| 4
-rw-r--r--	fs/squashfs/zlib_wrapper.c	| 3
-rw-r--r--	fs/super.c	| 9
-rw-r--r--	fs/sync.c	| 3
-rw-r--r--	fs/sysv/dir.c	| 2
-rw-r--r--	fs/xfs/linux-2.6/xfs_super.c	| 5
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	| 116
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.h	| 7
-rw-r--r--	fs/xfs/quota/xfs_qm_syscalls.c	| 3
-rw-r--r--	fs/xfs/xfs_ag.h	| 1
-rw-r--r--	fs/xfs/xfs_dfrag.c	| 22
-rw-r--r--	fs/xfs/xfs_log.c	| 38
-rw-r--r--	fs/xfs/xfs_mount.h	| 1
115 files changed, 1433 insertions(+), 780 deletions(-)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 5c5bc8480070..f8b86e92cd66 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -238,6 +238,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 		return ERR_PTR(-ENOMEM);
 	}
 
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	if (rc) {
+		__putname(v9ses->aname);
+		__putname(v9ses->uname);
+		return ERR_PTR(rc);
+	}
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_add(&v9ses->slist, &v9fs_sessionlist);
 	spin_unlock(&v9fs_sessionlist_lock);
@@ -301,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	return fid;
 
 error:
+	bdi_destroy(&v9ses->bdi);
 	return ERR_PTR(retval);
 }
 
@@ -326,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
 
+	bdi_destroy(&v9ses->bdi);
+
 	spin_lock(&v9fs_sessionlist_lock);
 	list_del(&v9ses->slist);
 	spin_unlock(&v9fs_sessionlist_lock);
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a0a8d3dd1361..bec4d0bcb458 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,7 @@
  * Boston, MA 02111-1301 USA
  *
  */
+#include <linux/backing-dev.h>
 
 /**
  * enum p9_session_flags - option flags for each 9P session
@@ -102,6 +103,7 @@ struct v9fs_session_info {
 	u32 uid; /* if ACCESS_SINGLE, the uid that has access */
 	struct p9_client *clnt; /* 9p client */
 	struct list_head slist; /* list of sessions registered with v9fs */
+	struct backing_dev_info bdi;
 };
 
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 491108bd6e0d..806da5d3b3a0 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -77,6 +77,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
 	sb->s_magic = V9FS_MAGIC;
 	sb->s_op = &v9fs_super_ops;
+	sb->s_bdi = &v9ses->bdi;
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 		      MS_NOATIME;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index c54dad4e6063..a10f2582844f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
 #include <linux/workqueue.h>
 #include <linux/sched.h>
 #include <linux/fscache.h>
+#include <linux/backing-dev.h>
 
 #include "afs.h"
 #include "afs_vl.h"
@@ -313,6 +314,7 @@ struct afs_volume {
 	unsigned short rjservers;	/* number of servers discarded due to -ENOMEDIUM */
 	struct afs_server *servers[8];	/* servers on which volume resides (ordered) */
 	struct rw_semaphore server_sem;	/* lock for accessing current server */
+	struct backing_dev_info bdi;
 };
 
 /*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 5e813a816ce4..b3feddc4f7d6 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
 	struct afs_super_info *super;
 	struct vfsmount *mnt;
-	struct page *page = NULL;
+	struct page *page;
 	size_t size;
-	char *buf, *devname = NULL, *options = NULL;
+	char *buf, *devname, *options;
 	int ret;
 
 	_enter("{%s}", mntpt->d_name.name);
@@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	ret = -EINVAL;
 	size = mntpt->d_inode->i_size;
 	if (size > PAGE_SIZE - 1)
-		goto error;
+		goto error_no_devname;
 
 	ret = -ENOMEM;
 	devname = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!devname)
-		goto error;
+		goto error_no_devname;
 
 	options = (char *) get_zeroed_page(GFP_KERNEL);
 	if (!options)
-		goto error;
+		goto error_no_options;
 
 	/* read the contents of the AFS special symlink */
 	page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
-		goto error;
+		goto error_no_page;
 	}
 
 	ret = -EIO;
@@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	return mnt;
 
 error:
-	if (page)
-		page_cache_release(page);
-	if (devname)
-		free_page((unsigned long) devname);
-	if (options)
-		free_page((unsigned long) options);
+	page_cache_release(page);
+error_no_page:
+	free_page((unsigned long) options);
+error_no_options:
+	free_page((unsigned long) devname);
+error_no_devname:
 	_leave(" = %d", ret);
 	return ERR_PTR(ret);
 }
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 14f6431598ad..e932e5a3a0c1 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -311,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data)
 	sb->s_magic = AFS_FS_MAGIC;
 	sb->s_op = &afs_super_ops;
 	sb->s_fs_info = as;
+	sb->s_bdi = &as->volume->bdi;
 
 	/* allocate the root inode and dentry */
 	fid.vid = as->volume->vid;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index a353e69e2391..401eeb21869f 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell = params->cell;
 	volume->vid = vlocation->vldb.vid[params->type];
 
+	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	if (ret)
+		goto error_bdi;
+
 	init_rwsem(&volume->server_sem);
 
 	/* look up all the applicable server records */
@@ -151,6 +155,8 @@ error:
 	return ERR_PTR(ret);
 
 error_discard:
+	bdi_destroy(&volume->bdi);
+error_bdi:
 	up_write(&params->cell->vl_sem);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
@@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume)
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
 
+	bdi_destroy(&volume->bdi);
 	kfree(volume);
 
 	_leave(" [destroyed]");
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 109a6c606d92..e8e5e63ac950 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -177,8 +177,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
 		}
 		/* Trigger mount for path component or follow link */
 	} else if (ino->flags & AUTOFS_INF_PENDING ||
-		   autofs4_need_mount(flags) ||
-		   current->link_count) {
+		   autofs4_need_mount(flags)) {
 		DPRINTK("waiting for mount name=%.*s",
 			dentry->d_name.len, dentry->d_name.name);
 
@@ -262,7 +261,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	spin_unlock(&dcache_lock);
 	spin_unlock(&sbi->fs_lock);
 
-	status = try_to_fill_dentry(dentry, 0);
+	status = try_to_fill_dentry(dentry, nd->flags);
 	if (status)
 		goto out_error;
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 7ab23e006e4c..2c5f9a0e5d72 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1005,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 			}
 		} else if (!mm->start_data) {
 			mm->start_data = seg->addr;
-#ifndef CONFIG_MMU
 			mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 		}
-
-#ifdef CONFIG_MMU
-		if (seg->addr + phdr->p_memsz > mm->end_data)
-			mm->end_data = seg->addr + phdr->p_memsz;
-#endif
 	}
 
 	seg++;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e0e769bdca59..49566c1687d8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
 
 	if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
 		printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
-		       (int) r,(int)(start_brk-start_code),(int)text_len);
+		       (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
 		goto failed;
 	}
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 2a6d0193f139..6dcee88c2e5d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -406,16 +406,23 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 
 int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
-	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+	struct inode *bd_inode = filp->f_mapping->host;
+	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
 
-	error = sync_blockdev(bdev);
-	if (error)
-		return error;
-
+	/*
+	 * There is no need to serialise calls to blkdev_issue_flush with
+	 * i_mutex and doing so causes performance issues with concurrent
+	 * O_SYNC writers to a block device.
+	 */
+	mutex_unlock(&bd_inode->i_mutex);
+
 	error = blkdev_issue_flush(bdev, NULL);
 	if (error == -EOPNOTSUPP)
 		error = 0;
+
+	mutex_lock(&bd_inode->i_mutex);
+
 	return error;
 }
 EXPORT_SYMBOL(blkdev_fsync);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e7b8f2c89ccb..feca04197d02 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -44,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
 
-static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
-
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
  * complete. This is used during reads to verify checksums, and it is used
@@ -1375,19 +1373,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
-	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
-	err = bdi_init(bdi);
+	err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
 	if (err)
 		return err;
 
-	err = bdi_register(bdi, NULL, "btrfs-%d",
-			   atomic_inc_return(&btrfs_bdi_num));
-	if (err) {
-		bdi_destroy(bdi);
-		return err;
-	}
-
 	bdi->ra_pages = default_backing_dev_info.ra_pages;
 	bdi->unplug_io_fn = btrfs_unplug_io_fn;
 	bdi->unplug_io_data = info;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e84ef60ffe35..97a97839a867 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1481,12 +1481,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		ret = -EBADF;
 		goto out_drop_write;
 	}
+
 	src = src_file->f_dentry->d_inode;
 
 	ret = -EINVAL;
 	if (src == inode)
 		goto out_fput;
 
+	/* the src must be open for reading */
+	if (!(src_file->f_mode & FMODE_READ))
+		goto out_fput;
+
 	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
 		goto out_fput;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index f7c255f9c624..a8cd821226da 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -34,6 +34,7 @@ struct cachefiles_object {
 	loff_t i_size;			/* object size */
 	unsigned long flags;
 #define CACHEFILES_OBJECT_ACTIVE	0	/* T if marked active */
+#define CACHEFILES_OBJECT_BURIED	1	/* T if preemptively buried */
 	atomic_t usage;			/* object usage count */
 	uint8_t type;			/* object type */
 	uint8_t new;			/* T if object new */
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index d5db84a1ee0d..f4a7840bf42c 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -93,6 +93,59 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object,
 }
 
 /*
+ * mark the owner of a dentry, if there is one, to indicate that that dentry
+ * has been preemptively deleted
+ * - the caller must hold the i_mutex on the dentry's parent as required to
+ *   call vfs_unlink(), vfs_rmdir() or vfs_rename()
+ */
+static void cachefiles_mark_object_buried(struct cachefiles_cache *cache,
+					  struct dentry *dentry)
+{
+	struct cachefiles_object *object;
+	struct rb_node *p;
+
+	_enter(",'%*.*s'",
+	       dentry->d_name.len, dentry->d_name.len, dentry->d_name.name);
+
+	write_lock(&cache->active_lock);
+
+	p = cache->active_nodes.rb_node;
+	while (p) {
+		object = rb_entry(p, struct cachefiles_object, active_node);
+		if (object->dentry > dentry)
+			p = p->rb_left;
+		else if (object->dentry < dentry)
+			p = p->rb_right;
+		else
+			goto found_dentry;
+	}
+
+	write_unlock(&cache->active_lock);
+	_leave(" [no owner]");
+	return;
+
+	/* found the dentry for */
+found_dentry:
+	kdebug("preemptive burial: OBJ%x [%s] %p",
+	       object->fscache.debug_id,
+	       fscache_object_states[object->fscache.state],
+	       dentry);
+
+	if (object->fscache.state < FSCACHE_OBJECT_DYING) {
+		printk(KERN_ERR "\n");
+		printk(KERN_ERR "CacheFiles: Error:"
+		       " Can't preemptively bury live object\n");
+		cachefiles_printk_object(object, NULL);
+	} else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+		printk(KERN_ERR "CacheFiles: Error:"
+		       " Object already preemptively buried\n");
+	}
+
+	write_unlock(&cache->active_lock);
+	_leave(" [owner marked]");
+}
+
+/*
  * record the fact that an object is now active
  */
 static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
@@ -219,7 +272,8 @@ requeue:
  */
 static int cachefiles_bury_object(struct cachefiles_cache *cache,
 				  struct dentry *dir,
-				  struct dentry *rep)
+				  struct dentry *rep,
+				  bool preemptive)
 {
 	struct dentry *grave, *trap;
 	char nbuffer[8 + 8 + 1];
@@ -229,11 +283,16 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 	       dir->d_name.len, dir->d_name.len, dir->d_name.name,
 	       rep->d_name.len, rep->d_name.len, rep->d_name.name);
 
+	_debug("remove %p from %p", rep, dir);
+
 	/* non-directories can just be unlinked */
 	if (!S_ISDIR(rep->d_inode->i_mode)) {
 		_debug("unlink stale object");
 		ret = vfs_unlink(dir->d_inode, rep);
 
+		if (preemptive)
+			cachefiles_mark_object_buried(cache, rep);
+
 		mutex_unlock(&dir->d_inode->i_mutex);
 
 		if (ret == -EIO)
@@ -325,6 +384,9 @@ try_again:
 	if (ret != 0 && ret != -ENOMEM)
 		cachefiles_io_error(cache, "Rename failed with error %d", ret);
 
+	if (preemptive)
+		cachefiles_mark_object_buried(cache, rep);
+
 	unlock_rename(cache->graveyard, dir);
 	dput(grave);
 	_leave(" = 0");
@@ -340,7 +402,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 	struct dentry *dir;
 	int ret;
 
-	_enter(",{%p}", object->dentry);
+	_enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry);
 
 	ASSERT(object->dentry);
 	ASSERT(object->dentry->d_inode);
@@ -350,15 +412,25 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 
 	mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
 
-	/* we need to check that our parent is _still_ our parent - it may have
-	 * been renamed */
-	if (dir == object->dentry->d_parent) {
-		ret = cachefiles_bury_object(cache, dir, object->dentry);
-	} else {
-		/* it got moved, presumably by cachefilesd culling it, so it's
-		 * no longer in the key path and we can ignore it */
+	if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
+		/* object allocation for the same key preemptively deleted this
+		 * object's file so that it could create its own file */
+		_debug("object preemptively buried");
 		mutex_unlock(&dir->d_inode->i_mutex);
 		ret = 0;
+	} else {
+		/* we need to check that our parent is _still_ our parent - it
+		 * may have been renamed */
+		if (dir == object->dentry->d_parent) {
+			ret = cachefiles_bury_object(cache, dir,
+						     object->dentry, false);
+		} else {
+			/* it got moved, presumably by cachefilesd culling it,
+			 * so it's no longer in the key path and we can ignore
+			 * it */
+			mutex_unlock(&dir->d_inode->i_mutex);
+			ret = 0;
+		}
 	}
 
 	dput(dir);
@@ -381,7 +453,9 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
 	const char *name;
 	int ret, nlen;
 
-	_enter("{%p},,%s,", parent->dentry, key);
+	_enter("OBJ%x{%p},OBJ%x,%s,",
+	       parent->fscache.debug_id, parent->dentry,
+	       object->fscache.debug_id, key);
 
 	cache = container_of(parent->fscache.cache,
 			     struct cachefiles_cache, cache);
@@ -509,7 +583,7 @@ lookup_again:
 	 * mutex) */
 	object->dentry = NULL;
 
-	ret = cachefiles_bury_object(cache, dir, next);
+	ret = cachefiles_bury_object(cache, dir, next, true);
 	dput(next);
 	next = NULL;
 
@@ -828,7 +902,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
 	/* actually remove the victim (drops the dir mutex) */
 	_debug("bury");
 
-	ret = cachefiles_bury_object(cache, dir, victim);
+	ret = cachefiles_bury_object(cache, dir, victim, false);
 	if (ret < 0)
 		goto error;
 
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index b5808cdb2232..039b5011d83b 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -77,6 +77,8 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
 /*
  * check the security details of the on-disk cache
  * - must be called with security override in force
+ * - must return with a security override in force - even in the case of an
+ *   error
  */
 int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
 					struct dentry *root,
@@ -99,6 +101,8 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache,
 	 * which create files */
 	ret = set_create_files_as(new, root->d_inode);
 	if (ret < 0) {
+		abort_creds(new);
+		cachefiles_begin_secure(cache, _saved_cred);
 		_leave(" = %d [cfa]", ret);
 		return ret;
 	}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index aa3cd7cc3e40..a9005d862ed4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -337,16 +337,15 @@ out:
 /*
  * Get ref for the oldest snapc for an inode with dirty data... that is, the
  * only snap context we are allowed to write back.
- *
- * Caller holds i_lock.
  */
-static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
 						    u64 *snap_size)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_snap_context *snapc = NULL;
 	struct ceph_cap_snap *capsnap = NULL;
 
+	spin_lock(&inode->i_lock);
 	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
 		     capsnap->context, capsnap->dirty_pages);
@@ -357,21 +356,11 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
 			break;
 		}
 	}
-	if (!snapc && ci->i_snap_realm) {
-		snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+	if (!snapc && ci->i_head_snapc) {
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
 	}
-	return snapc;
-}
-
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-						    u64 *snap_size)
-{
-	struct ceph_snap_context *snapc = NULL;
-
-	spin_lock(&inode->i_lock);
-	snapc = __get_oldest_context(inode, snap_size);
 	spin_unlock(&inode->i_lock);
 	return snapc;
 }
@@ -392,7 +381,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	int len = PAGE_CACHE_SIZE;
 	loff_t i_size;
 	int err = 0;
-	struct ceph_snap_context *snapc;
+	struct ceph_snap_context *snapc, *oldest;
 	u64 snap_size = 0;
 	long writeback_stat;
 
@@ -413,13 +402,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 		dout("writepage %p page %p not dirty?\n", inode, page);
 		goto out;
 	}
-	if (snapc != get_oldest_context(inode, &snap_size)) {
+	oldest = get_oldest_context(inode, &snap_size);
+	if (snapc->seq > oldest->seq) {
 		dout("writepage %p page %p snapc %p not writeable - noop\n",
 		     inode, page, (void *)page->private);
 		/* we should only noop if called by kswapd */
 		WARN_ON((current->flags & PF_MEMALLOC) == 0);
+		ceph_put_snap_context(oldest);
 		goto out;
 	}
+	ceph_put_snap_context(oldest);
 
 	/* is this a partial page at end of file? */
 	if (snap_size)
@@ -458,7 +450,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	ClearPagePrivate(page);
 	end_page_writeback(page);
 	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-	ceph_put_snap_context(snapc);
+	ceph_put_snap_context(snapc);	/* page's reference */
 out:
 	return err;
 }
@@ -512,12 +504,11 @@ static void writepages_finish(struct ceph_osd_request *req,
 	int i;
 	struct ceph_snap_context *snapc = req->r_snapc;
 	struct address_space *mapping = inode->i_mapping;
-	struct writeback_control *wbc = req->r_wbc;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
 	struct ceph_client *client = ceph_inode_to_client(inode);
 	long writeback_stat;
-	unsigned issued = __ceph_caps_issued(ci, NULL);
+	unsigned issued = ceph_caps_issued(ci);
 
 	/* parse reply */
 	replyhead = msg->front.iov_base;
@@ -554,13 +545,9 @@ static void writepages_finish(struct ceph_osd_request *req,
 			clear_bdi_congested(&client->backing_dev_info,
 					    BLK_RW_ASYNC);
 
-		if (i >= wrote) {
-			dout("inode %p skipping page %p\n", inode, page);
-			wbc->pages_skipped++;
-		}
+		ceph_put_snap_context((void *)page->private);
 		page->private = 0;
 		ClearPagePrivate(page);
-		ceph_put_snap_context(snapc);
 		dout("unlocking %d %p\n", i, page);
 		end_page_writeback(page);
 
@@ -618,7 +605,7 @@ static int ceph_writepages_start(struct address_space *mapping,
 	int range_whole = 0;
 	int should_loop = 1;
 	pgoff_t max_pages = 0, max_pages_ever = 0;
-	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
 	struct pagevec pvec;
 	int done = 0;
 	int rc = 0;
@@ -770,9 +757,10 @@ get_more_pages:
 			}
 
 			/* only if matching snap context */
-			if (snapc != (void *)page->private) {
-				dout("page snapc %p != oldest %p\n",
-				     (void *)page->private, snapc);
+			pgsnapc = (void *)page->private;
+			if (pgsnapc->seq > snapc->seq) {
+				dout("page snapc %p %lld > oldest %p %lld\n",
+				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
 				unlock_page(page);
 				if (!locked_pages)
 					continue; /* keep looking for snap */
@@ -806,7 +794,6 @@ get_more_pages:
 			alloc_page_vec(client, req);
 			req->r_callback = writepages_finish;
 			req->r_inode = inode;
-			req->r_wbc = wbc;
 		}
 
 		/* note position of first page in pvec */
@@ -914,7 +901,10 @@ static int context_is_writeable_or_written(struct inode *inode,
 				   struct ceph_snap_context *snapc)
 {
 	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
-	return !oldest || snapc->seq <= oldest->seq;
+	int ret = !oldest || snapc->seq <= oldest->seq;
+
+	ceph_put_snap_context(oldest);
+	return ret;
 }
 
 /*
@@ -936,8 +926,8 @@ static int ceph_update_writeable_page(struct file *file,
 	int pos_in_page = pos & ~PAGE_CACHE_MASK;
 	int end_in_page = pos_in_page + len;
 	loff_t i_size;
-	struct ceph_snap_context *snapc;
 	int r;
+	struct ceph_snap_context *snapc, *oldest;
 
 retry_locked:
 	/* writepages currently holds page lock, but if we change that later, */
@@ -947,23 +937,24 @@ retry_locked:
 	BUG_ON(!ci->i_snap_realm);
 	down_read(&mdsc->snap_rwsem);
 	BUG_ON(!ci->i_snap_realm->cached_context);
-	if (page->private &&
-	    (void *)page->private != ci->i_snap_realm->cached_context) {
+	snapc = (void *)page->private;
+	if (snapc && snapc != ci->i_head_snapc) {
 		/*
 		 * this page is already dirty in another (older) snap
 		 * context! is it writeable now?
 		 */
-		snapc = get_oldest_context(inode, NULL);
+		oldest = get_oldest_context(inode, NULL);
 		up_read(&mdsc->snap_rwsem);
 
-		if (snapc != (void *)page->private) {
+		if (snapc->seq > oldest->seq) {
+			ceph_put_snap_context(oldest);
 			dout(" page %p snapc %p not current or oldest\n",
-			     page, (void *)page->private);
+			     page, snapc);
 			/*
 			 * queue for writeback, and wait for snapc to
 			 * be writeable or written
 			 */
-			snapc = ceph_get_snap_context((void *)page->private);
+			snapc = ceph_get_snap_context(snapc);
 			unlock_page(page);
 			ceph_queue_writeback(inode);
 			r = wait_event_interruptible(ci->i_cap_wq,
@@ -973,6 +964,7 @@ retry_locked:
 				return r;
 			return -EAGAIN;
 		}
+		ceph_put_snap_context(oldest);
 
 		/* yay, writeable, do it now (without dropping page lock) */
 		dout(" page %p snapc %p not current, but oldest\n",
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index f6394b94b866..818afe72e6c7 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -3,6 +3,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 #include "types.h"
 #include "auth_none.h"
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
index 56c05533a31c..8164df1a08be 100644
--- a/fs/ceph/auth_none.h
+++ b/fs/ceph/auth_none.h
@@ -1,6 +1,8 @@
 #ifndef _FS_CEPH_AUTH_NONE_H
 #define _FS_CEPH_AUTH_NONE_H
 
+#include <linux/slab.h>
+
 #include "auth.h"
 
 /*
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index d9001a4dc8cc..fee5a08da881 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -12,8 +12,6 @@
 #include "auth.h"
 #include "decode.h"
 
-struct kmem_cache *ceph_x_ticketbuf_cachep;
-
 #define TEMP_TICKET_BUF_LEN 256
 
 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
@@ -131,13 +129,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 	char *ticket_buf;
 	u8 struct_v;
 
-	dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
+	dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!dbuf)
 		return -ENOMEM;
 
 	ret = -ENOMEM;
-	ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
-				      GFP_NOFS | GFP_ATOMIC);
+	ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
 	if (!ticket_buf)
 		goto out_dbuf;
 
@@ -251,9 +248,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 
 	ret = 0;
 out:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+	kfree(ticket_buf);
 out_dbuf:
-	kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+	kfree(dbuf);
 	return ret;
 
 bad:
@@ -605,8 +602,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
 		remove_ticket_handler(ac, th);
 	}
 
-	kmem_cache_destroy(ceph_x_ticketbuf_cachep);
-
 	kfree(ac->private);
 	ac->private = NULL;
 }
@@ -641,26 +636,20 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	int ret;
 
 	dout("ceph_x_init %p\n", ac);
+	ret = -ENOMEM;
 	xi = kzalloc(sizeof(*xi), GFP_NOFS);
 	if (!xi)
-		return -ENOMEM;
+		goto out;
 
-	ret = -ENOMEM;
-	ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
-				      TEMP_TICKET_BUF_LEN, 8,
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      NULL);
-	if (!ceph_x_ticketbuf_cachep)
-		goto done_nomem;
 	ret = -EINVAL;
 	if (!ac->secret) {
 		pr_err("no secret set (for auth_x protocol)\n");
-		goto done_nomem;
+		goto out_nomem;
 	}
 
 	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
 	if (ret)
-		goto done_nomem;
+		goto out_nomem;
 
 	xi->starting = true;
 	xi->ticket_handlers = RB_ROOT;
@@ -670,10 +659,9 @@ int ceph_x_init(struct ceph_auth_client *ac)
 	ac->ops = &ceph_x_ops;
 	return 0;
 
-done_nomem:
+out_nomem:
 	kfree(xi);
-	if (ceph_x_ticketbuf_cachep)
-		kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+out:
 	return ret;
 }
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3710e077a857..d9400534b279 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -858,6 +858,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
 }
 
 /*
+ * Remove a cap. Take steps to deal with a racing iterate_session_caps.
+ *
  * caller should hold i_lock.
  * caller will not hold session s_mutex if called from destroy_inode.
  */
@@ -866,15 +868,10 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
 	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+	int removed = 0;
 
 	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
 
-	/* remove from inode list */
-	rb_erase(&cap->ci_node, &ci->i_caps);
-	cap->ci = NULL;
-	if (ci->i_auth_cap == cap)
-		ci->i_auth_cap = NULL;
-
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
 	if (session->s_cap_iterator == cap) {
@@ -885,10 +882,18 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 		list_del_init(&cap->session_caps);
 		session->s_nr_caps--;
 		cap->session = NULL;
+		removed = 1;
 	}
+	/* protect backpointer with s_cap_lock: see iterate_session_caps */
+	cap->ci = NULL;
 	spin_unlock(&session->s_cap_lock);
 
-	if (cap->session == NULL)
+	/* remove from inode list */
+	rb_erase(&cap->ci_node, &ci->i_caps);
+	if (ci->i_auth_cap == cap)
+		ci->i_auth_cap = NULL;
+
+	if (removed)
 		ceph_put_cap(cap);
 
 	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
@@ -1205,6 +1210,12 @@ retry:
 		if (capsnap->dirty_pages || capsnap->writing)
 			continue;
 
+		/*
+		 * if cap writeback already occurred, we should have dropped
+		 * the capsnap in ceph_put_wrbuffer_cap_refs.
+		 */
+		BUG_ON(capsnap->dirty == 0);
+
 		/* pick mds, take s_mutex */
 		mds = __ceph_get_cap_mds(ci, &mseq);
 		if (session && session->s_mds != mds) {
@@ -1855,8 +1866,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
 		} else {
 			pr_err("%p auth cap %p not mds%d ???\n", inode,
 			       cap, session->s_mds);
-			spin_unlock(&inode->i_lock);
 		}
+		spin_unlock(&inode->i_lock);
 	}
 }
 
@@ -2118,8 +2129,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
 	}
 	spin_unlock(&inode->i_lock);
 
-	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
-	     last ? "last" : "");
+	dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
+	     last ? " last" : "", put ? " put" : "");
 
 	if (last && !flushsnaps)
 		ceph_check_caps(ci, 0, NULL);
@@ -2143,7 +2154,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 {
 	struct inode *inode = &ci->vfs_inode;
 	int last = 0;
-	int last_snap = 0;
+	int complete_capsnap = 0;
+	int drop_capsnap = 0;
 	int found = 0;
 	struct ceph_cap_snap *capsnap = NULL;
 
@@ -2166,19 +2178,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
 			if (capsnap->context == snapc) {
 				found = 1;
-				capsnap->dirty_pages -= nr;
-				last_snap = !capsnap->dirty_pages;
 				break;
 			}
 		}
 		BUG_ON(!found);
+		capsnap->dirty_pages -= nr;
+		if (capsnap->dirty_pages == 0) {
+			complete_capsnap = 1;
+			if (capsnap->dirty == 0)
+				/* cap writeback completed before we created
+				 * the cap_snap; no FLUSHSNAP is needed */
+				drop_capsnap = 1;
+		}
 		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
-		     " snap %lld %d/%d -> %d/%d %s%s\n",
+		     " snap %lld %d/%d -> %d/%d %s%s%s\n",
 		     inode, capsnap, capsnap->context->seq,
 		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
 		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
 		     last ? " (wrbuffer last)" : "",
-		     last_snap ? " (capsnap last)" : "");
+		     complete_capsnap ? " (complete capsnap)" : "",
+		     drop_capsnap ? " (drop capsnap)" : "");
+		if (drop_capsnap) {
+			ceph_put_snap_context(capsnap->context);
+			list_del(&capsnap->ci_item);
+			list_del(&capsnap->flushing_item);
+			ceph_put_cap_snap(capsnap);
+		}
 	}
 
 	spin_unlock(&inode->i_lock);
@@ -2186,10 +2211,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 	if (last) {
 		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
 		iput(inode);
-	} else if (last_snap) {
+	} else if (complete_capsnap) {
 		ceph_flush_snaps(ci);
 		wake_up(&ci->i_cap_wq);
 	}
+	if (drop_capsnap)
+		iput(inode);
 }
 
 /*
@@ -2465,8 +2492,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
 			break;
 		}
 		WARN_ON(capsnap->dirty_pages || capsnap->writing);
-		dout(" removing cap_snap %p follows %lld\n",
-		     capsnap, follows);
+		dout(" removing %p cap_snap %p follows %lld\n",
+		     inode, capsnap, follows);
 		ceph_put_snap_context(capsnap->context);
 		list_del(&capsnap->ci_item);
 		list_del(&capsnap->flushing_item);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 7261dc6c2ead..650d2db5ed26 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -171,11 +171,11 @@ more:
 	spin_lock(&inode->i_lock);
 	spin_lock(&dcache_lock);
 
+	last = dentry;
+
 	if (err < 0)
 		goto out_unlock;
 
-	last = dentry;
-
 	p = p->prev;
 	filp->f_pos++;
 
@@ -312,7 +312,7 @@ more:
 		req->r_readdir_offset = fi->next_offset;
 		req->r_args.readdir.frag = cpu_to_le32(frag);
 		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
-		req->r_num_caps = max_entries;
+		req->r_num_caps = max_entries + 1;
 		err = ceph_mdsc_do_request(mdsc, NULL, req);
 		if (err < 0) {
 			ceph_mdsc_put_request(req);
@@ -489,6 +489,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 		struct inode *inode = ceph_get_snapdir(parent);
 		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
 		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
+		BUG_ON(!d_unhashed(dentry));
 		d_add(dentry, inode);
 		err = 0;
 	}
@@ -879,7 +880,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 * do_request, above). If there is no trace, we need
 		 * to do it here.
 		 */
+
+		/* d_move screws up d_subdirs order */
+		ceph_i_clear(new_dir, CEPH_I_COMPLETE);
+
 		d_move(old_dentry, new_dentry);
+
+		/* ensure target dentry is invalidated, despite
+		   rehashing bug in vfs_rename_dir */
+		new_dentry->d_time = jiffies;
+		ceph_dentry(new_dentry)->lease_shared_gen = 0;
 	}
 	ceph_mdsc_put_request(req);
 	return err;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4add3d5da2c1..ed6f19721d6e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -665,7 +665,8 @@ more:
 		 * throw out any page cache pages in this range. this
 		 * may block.
 		 */
-		truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
+		truncate_inode_pages_range(inode->i_mapping, pos,
+					   (pos+len) | (PAGE_CACHE_SIZE-1));
 	} else {
 		pages = alloc_page_vector(num_pages);
 		if (IS_ERR(pages)) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index aca82d55cc53..85b4d2ffdeba 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -733,6 +733,10 @@ no_change:
 			__ceph_get_fmode(ci, cap_fmode);
 			spin_unlock(&inode->i_lock);
 		}
+	} else if (cap_fmode >= 0) {
+		pr_warning("mds issued no caps on %llx.%llx\n",
+			   ceph_vinop(inode));
+		__ceph_get_fmode(ci, cap_fmode);
 	}
 
 	/* update delegation info? */
@@ -886,6 +890,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct inode *in = NULL;
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
+	struct ceph_client *client = ceph_sb_to_client(sb);
 	int i = 0;
 	int err = 0;
 
@@ -949,7 +954,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		return err;
 	}
 
-	if (rinfo->head->is_dentry && !req->r_aborted) {
+	/*
+	 * ignore null lease/binding on snapdir ENOENT, or else we
+	 * will have trouble splicing in the virtual snapdir later
+	 */
+	if (rinfo->head->is_dentry && !req->r_aborted &&
+	    (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
+					       client->mount_args->snapdir_name,
+					       req->r_dentry->d_name.len))) {
 		/*
 		 * lookup link rename : null -> possibly existing inode
 		 * mknod symlink mkdir : null -> new inode
@@ -989,6 +1001,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			     dn, dn->d_name.len, dn->d_name.name);
 			dout("fill_trace doing d_move %p -> %p\n",
 			     req->r_old_dentry, dn);
+
+			/* d_move screws up d_subdirs order */
+			ceph_i_clear(dir, CEPH_I_COMPLETE);
+
 			d_move(req->r_old_dentry, dn);
 			dout(" src %p '%.*s' dst %p '%.*s'\n",
 			     req->r_old_dentry,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 60a9a4ae47be..24561a557e01 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -736,9 +736,10 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
 }
 
 /*
- * Helper to safely iterate over all caps associated with a session.
+ * Helper to safely iterate over all caps associated with a session, with
+ * special care taken to handle a racing __ceph_remove_cap().
  *
- * caller must hold session s_mutex
+ * Caller must hold session s_mutex.
  */
 static int iterate_session_caps(struct ceph_mds_session *session,
 				 int (*cb)(struct inode *, struct ceph_cap *,
@@ -2136,7 +2137,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 	struct ceph_mds_session *session = NULL;
 	struct ceph_msg *reply;
 	struct rb_node *p;
-	int err;
+	int err = -ENOMEM;
 	struct ceph_pagelist *pagelist;
 
 	pr_info("reconnect to recovering mds%d\n", mds);
@@ -2185,7 +2186,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
 		goto fail;
 	err = iterate_session_caps(session, encode_caps_cb, pagelist);
 	if (err < 0)
-		goto out;
+		goto fail;
 
 	/*
 	 * snaprealms. we provide mds with the ino, seq (version), and
@@ -2213,28 +2214,31 @@ send:
2213 reply->nr_pages = calc_pages_for(0, pagelist->length); 2214 reply->nr_pages = calc_pages_for(0, pagelist->length);
2214 ceph_con_send(&session->s_con, reply); 2215 ceph_con_send(&session->s_con, reply);
2215 2216
2216 if (session) { 2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2217 session->s_state = CEPH_MDS_SESSION_OPEN; 2218 mutex_unlock(&session->s_mutex);
2218 __wake_requests(mdsc, &session->s_waiting); 2219
2219 } 2220 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex);
2223
2224 ceph_put_mds_session(session);
2220 2225
2221out:
2222 up_read(&mdsc->snap_rwsem); 2226 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex); 2227 mutex_lock(&mdsc->mutex);
2228 return; 2228 return;
2229 2229
2230fail: 2230fail:
2231 ceph_msg_put(reply); 2231 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2232fail_nomsg: 2235fail_nomsg:
2233 ceph_pagelist_release(pagelist); 2236 ceph_pagelist_release(pagelist);
2234 kfree(pagelist); 2237 kfree(pagelist);
2235fail_nopagelist: 2238fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds); 2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2237 goto out; 2240 mutex_lock(&mdsc->mutex);
2241 return;
2238} 2242}
2239 2243
2240 2244
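
[editor's note] The mds_client.c changes above are almost entirely error-path work: err now starts at -ENOMEM so the final pr_err() can report a real code, an iterate_session_caps() failure unwinds through the fail labels (dropping snap_rwsem, the session mutex, and the session reference), and the success path releases the session mutex before retaking mdsc->mutex to wake waiters. A compact, self-contained sketch of the cascading-label unwind idiom this relies on; the resources and the encode step are placeholders, not the kernel API:

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int encode_caps(char *buf) { (void)buf; return 0; }  /* stub step */

    static int do_reconnect(void)
    {
        int err = -ENOMEM;              /* reported if an early allocation fails */
        char *pagelist, *reply;

        pagelist = malloc(64);
        if (!pagelist)
            goto fail_nopagelist;
        reply = malloc(128);
        if (!reply)
            goto fail_nomsg;

        err = encode_caps(reply);       /* any later failure jumps to "fail" */
        if (err < 0)
            goto fail;

        free(reply);                    /* success: release in reverse order */
        free(pagelist);
        return 0;

    fail:
        free(reply);
    fail_nomsg:
        free(pagelist);
    fail_nopagelist:
        fprintf(stderr, "error %d preparing reconnect\n", err);
        return err;
    }

    int main(void)
    {
        return do_reconnect() ? 1 : 0;
    }
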
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 8f1715ffbe4b..cd4fadb6491a 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -30,6 +30,10 @@ static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK; 30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32 32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
33 37
34static void queue_con(struct ceph_connection *con); 38static void queue_con(struct ceph_connection *con);
35static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
@@ -228,6 +232,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
228 con->sock = sock; 232 con->sock = sock;
229 sock->sk->sk_allocation = GFP_NOFS; 233 sock->sk->sk_allocation = GFP_NOFS;
230 234
235#ifdef CONFIG_LOCKDEP
236 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
237#endif
238
231 set_sock_callbacks(sock, con); 239 set_sock_callbacks(sock, con);
232 240
233 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
@@ -333,6 +341,7 @@ static void reset_connection(struct ceph_connection *con)
333 con->out_msg = NULL; 341 con->out_msg = NULL;
334 } 342 }
335 con->in_seq = 0; 343 con->in_seq = 0;
344 con->in_seq_acked = 0;
336} 345}
337 346
338/* 347/*
@@ -483,7 +492,14 @@ static void prepare_write_message(struct ceph_connection *con)
483 list_move_tail(&m->list_head, &con->out_sent); 492 list_move_tail(&m->list_head, &con->out_sent);
484 } 493 }
485 494
486 m->hdr.seq = cpu_to_le64(++con->out_seq); 495 /*
496 * only assign outgoing seq # if we haven't sent this message
 497 * yet. if it is requeued, resend with its original seq.
498 */
499 if (m->needs_out_seq) {
500 m->hdr.seq = cpu_to_le64(++con->out_seq);
501 m->needs_out_seq = false;
502 }
487 503
488 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", 504 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
489 m, con->out_seq, le16_to_cpu(m->hdr.type), 505 m, con->out_seq, le16_to_cpu(m->hdr.type),
@@ -1325,6 +1341,7 @@ static int read_partial_message(struct ceph_connection *con)
1325 unsigned front_len, middle_len, data_len, data_off; 1341 unsigned front_len, middle_len, data_len, data_off;
1326 int datacrc = con->msgr->nocrc; 1342 int datacrc = con->msgr->nocrc;
1327 int skip; 1343 int skip;
1344 u64 seq;
1328 1345
1329 dout("read_partial_message con %p msg %p\n", con, m); 1346 dout("read_partial_message con %p msg %p\n", con, m);
1330 1347
@@ -1359,6 +1376,25 @@ static int read_partial_message(struct ceph_connection *con)
1359 return -EIO; 1376 return -EIO;
1360 data_off = le16_to_cpu(con->in_hdr.data_off); 1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1361 1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1362 /* allocate message? */ 1398 /* allocate message? */
1363 if (!con->in_msg) { 1399 if (!con->in_msg) {
1364 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
@@ -1370,6 +1406,7 @@ static int read_partial_message(struct ceph_connection *con)
1370 con->in_base_pos = -front_len - middle_len - data_len - 1406 con->in_base_pos = -front_len - middle_len - data_len -
1371 sizeof(m->footer); 1407 sizeof(m->footer);
1372 con->in_tag = CEPH_MSGR_TAG_READY; 1408 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++;
1373 return 0; 1410 return 0;
1374 } 1411 }
1375 if (IS_ERR(con->in_msg)) { 1412 if (IS_ERR(con->in_msg)) {
@@ -1956,6 +1993,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1956 1993
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958 1995
1996 msg->needs_out_seq = true;
1997
1959 /* queue */ 1998 /* queue */
1960 mutex_lock(&con->mutex); 1999 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head)); 2000 BUG_ON(!list_empty(&msg->list_head));
@@ -2021,6 +2060,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2021 ceph_msg_put(con->in_msg); 2060 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL; 2061 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY; 2062 con->in_tag = CEPH_MSGR_TAG_READY;
2063 con->in_seq++;
2024 } else { 2064 } else {
2025 dout("con_revoke_pages %p msg %p pages %p no-op\n", 2065 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg); 2066 con, con->in_msg, msg);
@@ -2054,15 +2094,19 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2054 kref_init(&m->kref); 2094 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head); 2095 INIT_LIST_HEAD(&m->list_head);
2056 2096
2097 m->hdr.tid = 0;
2057 m->hdr.type = cpu_to_le16(type); 2098 m->hdr.type = cpu_to_le16(type);
2099 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2100 m->hdr.version = 0;
2058 m->hdr.front_len = cpu_to_le32(front_len); 2101 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0; 2102 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len); 2103 m->hdr.data_len = cpu_to_le32(page_len);
2061 m->hdr.data_off = cpu_to_le16(page_off); 2104 m->hdr.data_off = cpu_to_le16(page_off);
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); 2105 m->hdr.reserved = 0;
2063 m->footer.front_crc = 0; 2106 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0; 2107 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0; 2108 m->footer.data_crc = 0;
2109 m->footer.flags = 0;
2066 m->front_max = front_len; 2110 m->front_max = front_len;
2067 m->front_is_vmalloc = false; 2111 m->front_is_vmalloc = false;
2068 m->more_to_follow = false; 2112 m->more_to_follow = false;
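
[editor's note] Several of the messenger.c changes above harden message sequencing: an outgoing message is stamped with a new out_seq only the first time it is queued (the needs_out_seq flag), the new in_seq accounting skips incoming messages that were already consumed before a reconnect, and a gap of more than one in the incoming seq is treated as a fatal protocol error. A small standalone model of the receive-side check, with types and return values simplified from the kernel code:

    #include <stdint.h>
    #include <stdio.h>

    enum seq_action { SEQ_ACCEPT, SEQ_SKIP, SEQ_BAD };

    /* in_seq is the last sequence number accepted on this connection. */
    static enum seq_action check_in_seq(uint64_t hdr_seq, uint64_t in_seq)
    {
        if ((int64_t)hdr_seq - (int64_t)in_seq < 1) {
            /* already seen (e.g. resent after a reconnect): skip it */
            return SEQ_SKIP;
        } else if ((int64_t)hdr_seq - (int64_t)in_seq > 1) {
            /* a message was lost in between: bad sequence, fail the read */
            return SEQ_BAD;
        }
        return SEQ_ACCEPT;              /* exactly in_seq + 1 */
    }

    int main(void)
    {
        printf("%d %d %d\n",
               check_in_seq(5, 5),      /* SEQ_SKIP   */
               check_in_seq(6, 5),      /* SEQ_ACCEPT */
               check_in_seq(8, 5));     /* SEQ_BAD    */
        return 0;
    }
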
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a343dae73cdc..a5caf91cc971 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -86,6 +86,7 @@ struct ceph_msg {
86 struct kref kref; 86 struct kref kref;
87 bool front_is_vmalloc; 87 bool front_is_vmalloc;
88 bool more_to_follow; 88 bool more_to_follow;
89 bool needs_out_seq;
89 int front_max; 90 int front_max;
90 91
91 struct ceph_msgpool *pool; 92 struct ceph_msgpool *pool;
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index c7b4dedaace6..3514f71ff85f 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -565,7 +565,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
565{ 565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid; 567 struct ceph_pg pgid;
568 int o = -1; 568 int acting[CEPH_PG_MAX_SIZE];
569 int o = -1, num = 0;
569 int err; 570 int err;
570 571
571 dout("map_osds %p tid %lld\n", req, req->r_tid); 572 dout("map_osds %p tid %lld\n", req, req->r_tid);
@@ -576,10 +577,16 @@ static int __map_osds(struct ceph_osd_client *osdc,
576 pgid = reqhead->layout.ol_pgid; 577 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid; 578 req->r_pgid = pgid;
578 579
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid); 580 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
581 if (err > 0) {
582 o = acting[0];
583 num = err;
584 }
580 585
581 if ((req->r_osd && req->r_osd->o_osd == o && 586 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) || 587 req->r_sent >= req->r_osd->o_incarnation &&
588 req->r_num_pg_osds == num &&
589 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
583 (req->r_osd == NULL && o == -1)) 590 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */ 591 return 0; /* no change */
585 592
@@ -587,6 +594,10 @@ static int __map_osds(struct ceph_osd_client *osdc,
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, 594 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1); 595 req->r_osd ? req->r_osd->o_osd : -1);
589 596
597 /* record full pg acting set */
598 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
599 req->r_num_pg_osds = num;
600
590 if (req->r_osd) { 601 if (req->r_osd) {
591 __cancel_request(req); 602 __cancel_request(req);
592 list_del_init(&req->r_osd_item); 603 list_del_init(&req->r_osd_item);
@@ -612,7 +623,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
612 __remove_osd_from_lru(req->r_osd); 623 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests); 624 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 } 625 }
615 err = 1; /* osd changed */ 626 err = 1; /* osd or pg changed */
616 627
617out: 628out:
618 return err; 629 return err;
@@ -779,16 +790,18 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
779 struct ceph_osd_request *req; 790 struct ceph_osd_request *req;
780 u64 tid; 791 u64 tid;
781 int numops, object_len, flags; 792 int numops, object_len, flags;
793 s32 result;
782 794
783 tid = le64_to_cpu(msg->hdr.tid); 795 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead)) 796 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad; 797 goto bad;
786 numops = le32_to_cpu(rhead->num_ops); 798 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len); 799 object_len = le32_to_cpu(rhead->object_len);
800 result = le32_to_cpu(rhead->result);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len + 801 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op)) 802 numops * sizeof(struct ceph_osd_op))
790 goto bad; 803 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid); 804 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
792 805
793 /* lookup */ 806 /* lookup */
794 mutex_lock(&osdc->request_mutex); 807 mutex_lock(&osdc->request_mutex);
@@ -834,7 +847,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
834 dout("handle_reply tid %llu flags %d\n", tid, flags); 847 dout("handle_reply tid %llu flags %d\n", tid, flags);
835 848
836 /* either this is a read, or we got the safe response */ 849 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) || 850 if (result < 0 ||
851 (flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 852 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req); 853 __unregister_request(osdc, req);
840 854
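
[editor's note] __map_osds() above now records the full acting set for the request's placement group and only re-targets the request when the primary, its incarnation, or the acting set itself changes; handle_reply() additionally unregisters a request as soon as the OSD returns an error result. A short sketch of the acting-set bookkeeping, reusing the CEPH_PG_MAX_SIZE bound introduced in rados.h (the other names are illustrative):

    #include <stdbool.h>
    #include <string.h>

    #define PG_MAX_SIZE 16              /* mirrors CEPH_PG_MAX_SIZE from rados.h */

    struct req_map {
        int pg_osds[PG_MAX_SIZE];       /* acting set recorded at the last map */
        int num_pg_osds;
    };

    /* True when the acting set is unchanged and the request can stay put. */
    bool mapping_unchanged(const struct req_map *req, const int *acting, int num)
    {
        return req->num_pg_osds == num &&
               memcmp(req->pg_osds, acting, sizeof(acting[0]) * num) == 0;
    }

    /* Remember the full acting set so the next osdmap update can be compared. */
    void record_mapping(struct req_map *req, const int *acting, int num)
    {
        memcpy(req->pg_osds, acting, sizeof(acting[0]) * num);
        req->num_pg_osds = num;
    }
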
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index b0759911e7c3..ce776989ef6a 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -48,6 +48,8 @@ struct ceph_osd_request {
48 struct list_head r_osd_item; 48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd; 49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid; 50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
51 53
52 struct ceph_connection *r_con_filling_msg; 54 struct ceph_connection *r_con_filling_msg;
53 55
@@ -66,7 +68,6 @@ struct ceph_osd_request {
66 struct list_head r_unsafe_item; 68 struct list_head r_unsafe_item;
67 69
68 struct inode *r_inode; /* for use by callbacks */ 70 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70 71
71 char r_oid[40]; /* object name */ 72 char r_oid[40]; /* object name */
72 int r_oid_len; 73 int r_oid_len;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 21c6623c4b07..cfdd8f4388b7 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -314,71 +314,6 @@ bad:
314 return ERR_PTR(err); 314 return ERR_PTR(err);
315} 315}
316 316
317
318/*
319 * osd map
320 */
321void ceph_osdmap_destroy(struct ceph_osdmap *map)
322{
323 dout("osdmap_destroy %p\n", map);
324 if (map->crush)
325 crush_destroy(map->crush);
326 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
327 struct ceph_pg_mapping *pg =
328 rb_entry(rb_first(&map->pg_temp),
329 struct ceph_pg_mapping, node);
330 rb_erase(&pg->node, &map->pg_temp);
331 kfree(pg);
332 }
333 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
334 struct ceph_pg_pool_info *pi =
335 rb_entry(rb_first(&map->pg_pools),
336 struct ceph_pg_pool_info, node);
337 rb_erase(&pi->node, &map->pg_pools);
338 kfree(pi);
339 }
340 kfree(map->osd_state);
341 kfree(map->osd_weight);
342 kfree(map->osd_addr);
343 kfree(map);
344}
345
346/*
347 * adjust max osd value. reallocate arrays.
348 */
349static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
350{
351 u8 *state;
352 struct ceph_entity_addr *addr;
353 u32 *weight;
354
355 state = kcalloc(max, sizeof(*state), GFP_NOFS);
356 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
357 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
358 if (state == NULL || addr == NULL || weight == NULL) {
359 kfree(state);
360 kfree(addr);
361 kfree(weight);
362 return -ENOMEM;
363 }
364
365 /* copy old? */
366 if (map->osd_state) {
367 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
368 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
369 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
370 kfree(map->osd_state);
371 kfree(map->osd_addr);
372 kfree(map->osd_weight);
373 }
374
375 map->osd_state = state;
376 map->osd_weight = weight;
377 map->osd_addr = addr;
378 map->max_osd = max;
379 return 0;
380}
381
382/* 317/*
383 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
384 * to a set of osds) 319 * to a set of osds)
@@ -482,6 +417,13 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
482 return NULL; 417 return NULL;
483} 418}
484 419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
485void __decode_pool(void **p, struct ceph_pg_pool_info *pi) 427void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
486{ 428{
487 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
@@ -490,6 +432,98 @@ void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
490 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
491} 433}
492 434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
493/* 527/*
494 * decode a full map. 528 * decode a full map.
495 */ 529 */
@@ -526,7 +560,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
526 ceph_decode_32_safe(p, end, max, bad); 560 ceph_decode_32_safe(p, end, max, bad);
527 while (max--) { 561 while (max--) {
528 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
529 pi = kmalloc(sizeof(*pi), GFP_NOFS); 563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
530 if (!pi) 564 if (!pi)
531 goto bad; 565 goto bad;
532 pi->id = ceph_decode_32(p); 566 pi->id = ceph_decode_32(p);
@@ -539,6 +573,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
539 __decode_pool(p, pi); 573 __decode_pool(p, pi);
540 __insert_pg_pool(&map->pg_pools, pi); 574 __insert_pg_pool(&map->pg_pools, pi);
541 } 575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
542 ceph_decode_32_safe(p, end, map->pool_max, bad); 580 ceph_decode_32_safe(p, end, map->pool_max, bad);
543 581
544 ceph_decode_32_safe(p, end, map->flags, bad); 582 ceph_decode_32_safe(p, end, map->flags, bad);
@@ -712,7 +750,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
712 } 750 }
713 pi = __lookup_pg_pool(&map->pg_pools, pool); 751 pi = __lookup_pg_pool(&map->pg_pools, pool);
714 if (!pi) { 752 if (!pi) {
715 pi = kmalloc(sizeof(*pi), GFP_NOFS); 753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
716 if (!pi) { 754 if (!pi) {
717 err = -ENOMEM; 755 err = -ENOMEM;
718 goto bad; 756 goto bad;
@@ -722,6 +760,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
722 } 760 }
723 __decode_pool(p, pi); 761 __decode_pool(p, pi);
724 } 762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
725 765
726 /* old_pool */ 766 /* old_pool */
727 ceph_decode_32_safe(p, end, len, bad); 767 ceph_decode_32_safe(p, end, len, bad);
@@ -730,10 +770,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
730 770
731 ceph_decode_32_safe(p, end, pool, bad); 771 ceph_decode_32_safe(p, end, pool, bad);
732 pi = __lookup_pg_pool(&map->pg_pools, pool); 772 pi = __lookup_pg_pool(&map->pg_pools, pool);
733 if (pi) { 773 if (pi)
734 rb_erase(&pi->node, &map->pg_pools); 774 __remove_pg_pool(&map->pg_pools, pi);
735 kfree(pi);
736 }
737 } 775 }
738 776
739 /* new_up */ 777 /* new_up */
@@ -1003,12 +1041,33 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1003} 1041}
1004 1042
1005/* 1043/*
1044 * Return acting set for given pgid.
1045 */
1046int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1047 int *acting)
1048{
1049 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1050 int i, o, num = CEPH_PG_MAX_SIZE;
1051
1052 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1053 if (!osds)
1054 return -1;
1055
1056 /* primary is first up osd */
1057 o = 0;
1058 for (i = 0; i < num; i++)
1059 if (ceph_osd_is_up(osdmap, osds[i]))
1060 acting[o++] = osds[i];
1061 return o;
1062}
1063
1064/*
1006 * Return primary osd for given pgid, or -1 if none. 1065 * Return primary osd for given pgid, or -1 if none.
1007 */ 1066 */
1008int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1067int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1009{ 1068{
1010 int rawosds[10], *osds; 1069 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1011 int i, num = ARRAY_SIZE(rawosds); 1070 int i, num = CEPH_PG_MAX_SIZE;
1012 1071
1013 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1072 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1014 if (!osds) 1073 if (!osds)
@@ -1016,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1016 1075
1017 /* primary is first up osd */ 1076 /* primary is first up osd */
1018 for (i = 0; i < num; i++) 1077 for (i = 0; i < num; i++)
1019 if (ceph_osd_is_up(osdmap, osds[i])) { 1078 if (ceph_osd_is_up(osdmap, osds[i]))
1020 return osds[i]; 1079 return osds[i];
1021 break;
1022 }
1023 return -1; 1080 return -1;
1024} 1081}
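
[editor's note] Beyond moving ceph_osdmap_destroy()/osdmap_set_max_osd() below the new pool-name decoding, the osdmap.c hunk adds ceph_calc_pg_acting(): it filters the raw OSD vector for a PG down to the OSDs that are currently up and returns the count, while ceph_calc_pg_primary() loses a dead break statement but keeps its first-up-OSD behaviour. A standalone model of that filter, with a dummy is_up() predicate standing in for ceph_osd_is_up():

    #include <stdbool.h>

    #define PG_MAX_SIZE 16              /* mirrors CEPH_PG_MAX_SIZE from rados.h */

    /* Placeholder for ceph_osd_is_up(osdmap, osd); illustrative only. */
    static bool is_up(int osd)
    {
        return osd >= 0;
    }

    /* Copy the up OSDs from raw[] into acting[] and return how many there are. */
    int calc_pg_acting(const int *raw, int num, int *acting)
    {
        int i, o = 0;

        for (i = 0; i < num && o < PG_MAX_SIZE; i++)
            if (is_up(raw[i]))
                acting[o++] = raw[i];
        return o;
    }

    /* The primary is simply the first up OSD, or -1 if none is up. */
    int calc_pg_primary(const int *raw, int num)
    {
        int acting[PG_MAX_SIZE];
        int n = calc_pg_acting(raw, num, acting);

        return n ? acting[0] : -1;
    }
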
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 1fb55afb2642..970b547e510d 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -23,6 +23,7 @@ struct ceph_pg_pool_info {
23 int id; 23 int id;
24 struct ceph_pg_pool v; 24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
26}; 27};
27 28
28struct ceph_pg_mapping { 29struct ceph_pg_mapping {
@@ -119,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid, 120 const char *oid,
120 struct ceph_file_layout *fl, 121 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap); 122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid); 126 struct ceph_pg pgid);
124 127
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 26ac8b89a676..fd56451a871f 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,10 @@
11/* 11/*
12 * osdmap encoding versions 12 * osdmap encoding versions
13 */ 13 */
14#define CEPH_OSDMAP_INC_VERSION 4 14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_VERSION 4 15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
16 18
17/* 19/*
18 * fs id 20 * fs id
@@ -56,6 +58,7 @@ struct ceph_timespec {
56#define CEPH_PG_LAYOUT_LINEAR 2 58#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3 59#define CEPH_PG_LAYOUT_HYBRID 3
58 60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
59 62
60/* 63/*
61 * placement group. 64 * placement group.
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e6f9bc57d472..d5114db70453 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -431,8 +431,7 @@ static int dup_array(u64 **dst, __le64 *src, int num)
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't 431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change). 432 * change).
433 */ 433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci, 434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435 struct ceph_snap_context *snapc)
436{ 435{
437 struct inode *inode = &ci->vfs_inode; 436 struct inode *inode = &ci->vfs_inode;
438 struct ceph_cap_snap *capsnap; 437 struct ceph_cap_snap *capsnap;
@@ -451,10 +450,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
451 as no new writes are allowed to start when pending, so any 450 as no new writes are allowed to start when pending, so any
452 writes in progress now were started before the previous 451 writes in progress now were started before the previous
453 cap_snap. lucky us. */ 452 cap_snap. lucky us. */
454 dout("queue_cap_snap %p snapc %p seq %llu used %d" 453 dout("queue_cap_snap %p already pending\n", inode);
455 " already pending\n", inode, snapc, snapc->seq, used);
456 kfree(capsnap); 454 kfree(capsnap);
457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
458 igrab(inode); 458 igrab(inode);
459 459
460 atomic_set(&capsnap->nref, 1); 460 atomic_set(&capsnap->nref, 1);
@@ -463,7 +463,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
463 INIT_LIST_HEAD(&capsnap->flushing_item); 463 INIT_LIST_HEAD(&capsnap->flushing_item);
464 464
465 capsnap->follows = snapc->seq - 1; 465 capsnap->follows = snapc->seq - 1;
466 capsnap->context = ceph_get_snap_context(snapc);
467 capsnap->issued = __ceph_caps_issued(ci, NULL); 466 capsnap->issued = __ceph_caps_issued(ci, NULL);
468 capsnap->dirty = __ceph_caps_dirty(ci); 467 capsnap->dirty = __ceph_caps_dirty(ci);
469 468
@@ -480,7 +479,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
480 snapshot. */ 479 snapshot. */
481 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
482 ci->i_wrbuffer_ref_head = 0; 481 ci->i_wrbuffer_ref_head = 0;
483 ceph_put_snap_context(ci->i_head_snapc); 482 capsnap->context = snapc;
484 ci->i_head_snapc = NULL; 483 ci->i_head_snapc = NULL;
485 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
486 485
@@ -522,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
522 capsnap->ctime = inode->i_ctime; 521 capsnap->ctime = inode->i_ctime;
523 capsnap->time_warp_seq = ci->i_time_warp_seq; 522 capsnap->time_warp_seq = ci->i_time_warp_seq;
524 if (capsnap->dirty_pages) { 523 if (capsnap->dirty_pages) {
525 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " 524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
526 "still has %d dirty pages\n", inode, capsnap, 525 "still has %d dirty pages\n", inode, capsnap,
527 capsnap->context, capsnap->context->seq, 526 capsnap->context, capsnap->context->seq,
528 capsnap->size, capsnap->dirty_pages); 527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
529 return 0; 529 return 0;
530 } 530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", 531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
532 inode, capsnap, capsnap->context, 532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, capsnap->size); 533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
534 535
535 spin_lock(&mdsc->snap_flush_lock); 536 spin_lock(&mdsc->snap_flush_lock);
536 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); 537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
@@ -602,7 +603,7 @@ more:
602 if (lastinode) 603 if (lastinode)
603 iput(lastinode); 604 iput(lastinode);
604 lastinode = inode; 605 lastinode = inode;
605 ceph_queue_cap_snap(ci, realm->cached_context); 606 ceph_queue_cap_snap(ci);
606 spin_lock(&realm->inodes_with_caps_lock); 607 spin_lock(&realm->inodes_with_caps_lock);
607 } 608 }
608 spin_unlock(&realm->inodes_with_caps_lock); 609 spin_unlock(&realm->inodes_with_caps_lock);
@@ -824,8 +825,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
824 spin_unlock(&realm->inodes_with_caps_lock); 825 spin_unlock(&realm->inodes_with_caps_lock);
825 spin_unlock(&inode->i_lock); 826 spin_unlock(&inode->i_lock);
826 827
827 ceph_queue_cap_snap(ci, 828 ceph_queue_cap_snap(ci);
828 ci->i_snap_realm->cached_context);
829 829
830 iput(inode); 830 iput(inode);
831 continue; 831 continue;
@@ -869,16 +869,20 @@ skip_inode:
869 continue; 869 continue;
870 ci = ceph_inode(inode); 870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock); 871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm) 872 if (list_empty(&ci->i_snap_realm_item)) {
873 goto split_skip_inode; 873 struct ceph_snap_realm *oldrealm =
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm); 874 ci->i_snap_realm;
875 spin_lock(&realm->inodes_with_caps_lock); 875
876 list_add(&ci->i_snap_realm_item, 876 dout(" moving %p to split realm %llx %p\n",
877 &realm->inodes_with_caps); 877 inode, realm->ino, realm);
878 ci->i_snap_realm = realm; 878 spin_lock(&realm->inodes_with_caps_lock);
879 spin_unlock(&realm->inodes_with_caps_lock); 879 list_add(&ci->i_snap_realm_item,
880 ceph_get_snap_realm(mdsc, realm); 880 &realm->inodes_with_caps);
881split_skip_inode: 881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
882 spin_unlock(&inode->i_lock); 886 spin_unlock(&inode->i_lock);
883 iput(inode); 887 iput(inode);
884 } 888 }
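
[editor's note] ceph_queue_cap_snap() above loses its snapc argument: the cap_snap now takes over the reference already held in ci->i_head_snapc (the context the in-flight dirty data was written under) instead of grabbing the realm's cached context and dropping the head reference separately, and the realm-split path only moves an inode whose i_snap_realm_item is not already linked. A tiny sketch of the reference hand-off, with placeholder types:

    #include <stddef.h>

    struct snap_context { int nref; };

    struct inode_info {
        struct snap_context *head_snapc;    /* context covering the dirty data */
    };

    struct cap_snap {
        struct snap_context *context;
    };

    /*
     * The cap_snap inherits the inode's head reference outright: no extra
     * get/put pair, and the context can only be the one the data belongs to.
     */
    void capsnap_take_context(struct cap_snap *capsnap, struct inode_info *ci)
    {
        capsnap->context = ci->head_snapc;
        ci->head_snapc = NULL;
    }
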
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 75d02eaa1279..110857ba9269 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -47,10 +47,20 @@ const char *ceph_file_part(const char *s, int len)
47 */ 47 */
48static void ceph_put_super(struct super_block *s) 48static void ceph_put_super(struct super_block *s)
49{ 49{
50 struct ceph_client *cl = ceph_client(s); 50 struct ceph_client *client = ceph_sb_to_client(s);
51 51
52 dout("put_super\n"); 52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc); 53 ceph_mdsc_close_sessions(&client->mdsc);
54
55 /*
56 * ensure we release the bdi before put_anon_super releases
57 * the device name.
58 */
59 if (s->s_bdi == &client->backing_dev_info) {
60 bdi_unregister(&client->backing_dev_info);
61 s->s_bdi = NULL;
62 }
63
54 return; 64 return;
55} 65}
56 66
@@ -636,6 +646,8 @@ static void ceph_destroy_client(struct ceph_client *client)
636 destroy_workqueue(client->pg_inv_wq); 646 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq); 647 destroy_workqueue(client->trunc_wq);
638 648
649 bdi_destroy(&client->backing_dev_info);
650
639 if (client->msgr) 651 if (client->msgr)
640 ceph_messenger_destroy(client->msgr); 652 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool); 653 mempool_destroy(client->wb_pagevec_pool);
@@ -876,14 +888,14 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{ 888{
877 int err; 889 int err;
878 890
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */ 891 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 892 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages = 893 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885 >> PAGE_SHIFT; 895 >> PAGE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
897 if (!err)
898 sb->s_bdi = &client->backing_dev_info;
887 return err; 899 return err;
888} 900}
889 901
@@ -957,9 +969,6 @@ static void ceph_kill_sb(struct super_block *s)
957 dout("kill_sb %p\n", s); 969 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc); 970 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */ 971 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client); 972 ceph_destroy_client(client);
964} 973}
965 974
@@ -996,9 +1005,10 @@ static int __init init_ceph(void)
996 if (ret) 1005 if (ret)
997 goto out_icache; 1006 goto out_icache;
998 1007
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", 1008 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, 1009 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); 1010 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1011 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1002 return 0; 1012 return 0;
1003 1013
1004out_icache: 1014out_icache:
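
[editor's note] The super.c hunks above fix the backing_dev_info lifecycle: sb->s_bdi is published only once bdi_register_dev() succeeds, the bdi is unregistered in put_super() before put_anon_super() releases the device name, and bdi_destroy() moves into ceph_destroy_client(). A minimal sketch of the publish-only-on-success part, with stand-in types rather than the real kernel structures:

    struct bdi    { int registered; };
    struct client { struct bdi bdi; };
    struct sb     { struct bdi *s_bdi; };

    /* Stand-in for bdi_register_dev(); returns 0 on success. */
    static int bdi_register_stub(struct bdi *bdi)
    {
        bdi->registered = 1;
        return 0;
    }

    /* Make the bdi visible to the rest of the VFS only once registration worked. */
    int register_bdi(struct sb *sb, struct client *client)
    {
        int err = bdi_register_stub(&client->bdi);

        if (!err)
            sb->s_bdi = &client->bdi;
        return err;
    }
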
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ca702c67bc66..13513b80d87f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,6 +10,7 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
13#include <linux/wait.h> 14#include <linux/wait.h>
14#include <linux/writeback.h> 15#include <linux/writeback.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
@@ -715,8 +716,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m,
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 716extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session, 717 struct ceph_mds_session *session,
717 struct ceph_msg *msg); 718 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, 719extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
719 struct ceph_snap_context *snapc);
720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
721 struct ceph_cap_snap *capsnap); 721 struct ceph_cap_snap *capsnap);
722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 4797787c6a44..246a167cb913 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -18,6 +18,8 @@
18#ifndef _CIFS_FS_SB_H 18#ifndef _CIFS_FS_SB_H
19#define _CIFS_FS_SB_H 19#define _CIFS_FS_SB_H
20 20
21#include <linux/backing-dev.h>
22
21#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ 23#define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */
22#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ 24#define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */
23#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ 25#define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */
@@ -50,5 +52,6 @@ struct cifs_sb_info {
50#ifdef CONFIG_CIFS_DFS_UPCALL 52#ifdef CONFIG_CIFS_DFS_UPCALL
51 char *mountdata; /* mount options received at mount time */ 53 char *mountdata; /* mount options received at mount time */
52#endif 54#endif
55 struct backing_dev_info bdi;
53}; 56};
54#endif /* _CIFS_FS_SB_H */ 57#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ded66be6597c..ad235d604a0b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -103,6 +103,12 @@ cifs_read_super(struct super_block *sb, void *data,
103 if (cifs_sb == NULL) 103 if (cifs_sb == NULL)
104 return -ENOMEM; 104 return -ENOMEM;
105 105
106 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
107 if (rc) {
108 kfree(cifs_sb);
109 return rc;
110 }
111
106#ifdef CONFIG_CIFS_DFS_UPCALL 112#ifdef CONFIG_CIFS_DFS_UPCALL
107 /* copy mount params to sb for use in submounts */ 113 /* copy mount params to sb for use in submounts */
108 /* BB: should we move this after the mount so we 114 /* BB: should we move this after the mount so we
@@ -115,6 +121,7 @@ cifs_read_super(struct super_block *sb, void *data,
115 int len = strlen(data); 121 int len = strlen(data);
116 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); 122 cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
117 if (cifs_sb->mountdata == NULL) { 123 if (cifs_sb->mountdata == NULL) {
124 bdi_destroy(&cifs_sb->bdi);
118 kfree(sb->s_fs_info); 125 kfree(sb->s_fs_info);
119 sb->s_fs_info = NULL; 126 sb->s_fs_info = NULL;
120 return -ENOMEM; 127 return -ENOMEM;
@@ -135,6 +142,7 @@ cifs_read_super(struct super_block *sb, void *data,
135 142
136 sb->s_magic = CIFS_MAGIC_NUMBER; 143 sb->s_magic = CIFS_MAGIC_NUMBER;
137 sb->s_op = &cifs_super_ops; 144 sb->s_op = &cifs_super_ops;
145 sb->s_bdi = &cifs_sb->bdi;
138/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) 146/* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512)
139 sb->s_blocksize = 147 sb->s_blocksize =
140 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ 148 cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */
@@ -183,6 +191,7 @@ out_mount_failed:
183 } 191 }
184#endif 192#endif
185 unload_nls(cifs_sb->local_nls); 193 unload_nls(cifs_sb->local_nls);
194 bdi_destroy(&cifs_sb->bdi);
186 kfree(cifs_sb); 195 kfree(cifs_sb);
187 } 196 }
188 return rc; 197 return rc;
@@ -214,6 +223,7 @@ cifs_put_super(struct super_block *sb)
214#endif 223#endif
215 224
216 unload_nls(cifs_sb->local_nls); 225 unload_nls(cifs_sb->local_nls);
226 bdi_destroy(&cifs_sb->bdi);
217 kfree(cifs_sb); 227 kfree(cifs_sb);
218 228
219 unlock_kernel(); 229 unlock_kernel();
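
[editor's note] The cifs hunks above give each superblock its own backing_dev_info: it is set up with bdi_setup_and_register() early in cifs_read_super(), wired into sb->s_bdi, and torn down with bdi_destroy() on every failure path and in cifs_put_super(). A condensed sketch of that per-mount setup/teardown shape, with stub functions in place of the real bdi API:

    #include <errno.h>
    #include <stdlib.h>

    struct bdi     { int up; };
    struct cifs_sb { struct bdi bdi; char *mountdata; };

    static int  bdi_setup_stub(struct bdi *b)   { b->up = 1; return 0; }
    static void bdi_destroy_stub(struct bdi *b) { b->up = 0; }

    int fill_super(struct cifs_sb *sbi)
    {
        int rc = bdi_setup_stub(&sbi->bdi);     /* set up before anything else */
        if (rc)
            return rc;

        sbi->mountdata = malloc(32);            /* stands in for later setup work */
        if (!sbi->mountdata) {
            bdi_destroy_stub(&sbi->bdi);        /* every error path destroys it */
            return -ENOMEM;
        }
        return 0;
    }

    void put_super(struct cifs_sb *sbi)
    {
        free(sbi->mountdata);
        bdi_destroy_stub(&sbi->bdi);            /* mirrors cifs_put_super() */
    }
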
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index ecf0ffbe2b64..0c2fd17439c8 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -502,6 +502,7 @@ struct dfs_info3_param {
502#define CIFS_FATTR_DFS_REFERRAL 0x1 502#define CIFS_FATTR_DFS_REFERRAL 0x1
503#define CIFS_FATTR_DELETE_PENDING 0x2 503#define CIFS_FATTR_DELETE_PENDING 0x2
504#define CIFS_FATTR_NEED_REVAL 0x4 504#define CIFS_FATTR_NEED_REVAL 0x4
505#define CIFS_FATTR_INO_COLLISION 0x8
505 506
506struct cifs_fattr { 507struct cifs_fattr {
507 u32 cf_flags; 508 u32 cf_flags;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 35ec11716213..29b9ea244c81 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -715,6 +715,16 @@ cifs_find_inode(struct inode *inode, void *opaque)
715 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) 715 if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
716 return 0; 716 return 0;
717 717
718 /*
719 * uh oh -- it's a directory. We can't use it since hardlinked dirs are
720 * verboten. Disable serverino and return it as if it were found, the
721 * caller can discard it, generate a uniqueid and retry the find
722 */
723 if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) {
724 fattr->cf_flags |= CIFS_FATTR_INO_COLLISION;
725 cifs_autodisable_serverino(CIFS_SB(inode->i_sb));
726 }
727
718 return 1; 728 return 1;
719} 729}
720 730
@@ -734,15 +744,22 @@ cifs_iget(struct super_block *sb, struct cifs_fattr *fattr)
734 unsigned long hash; 744 unsigned long hash;
735 struct inode *inode; 745 struct inode *inode;
736 746
747retry_iget5_locked:
737 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid)); 748 cFYI(1, ("looking for uniqueid=%llu", fattr->cf_uniqueid));
738 749
739 /* hash down to 32-bits on 32-bit arch */ 750 /* hash down to 32-bits on 32-bit arch */
740 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); 751 hash = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid);
741 752
742 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); 753 inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr);
743
744 /* we have fattrs in hand, update the inode */
745 if (inode) { 754 if (inode) {
755 /* was there a problematic inode number collision? */
756 if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) {
757 iput(inode);
758 fattr->cf_uniqueid = iunique(sb, ROOT_I);
759 fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION;
760 goto retry_iget5_locked;
761 }
762
746 cifs_fattr_to_inode(inode, fattr); 763 cifs_fattr_to_inode(inode, fattr);
747 if (sb->s_flags & MS_NOATIME) 764 if (sb->s_flags & MS_NOATIME)
748 inode->i_flags |= S_NOATIME | S_NOCMTIME; 765 inode->i_flags |= S_NOATIME | S_NOCMTIME;
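
[editor's note] The cifs/inode.c hunk above handles server inode-number collisions: when iget5_locked() matches an existing directory inode for the same uniqueid, cifs_find_inode() flags the collision and auto-disables serverino, and cifs_iget() drops the hit, generates a fresh number with iunique(), and retries. A standalone sketch of that flag-and-retry loop; the lookup and id generator are toy stand-ins:

    #include <stdint.h>

    #define FATTR_INO_COLLISION 0x8     /* mirrors CIFS_FATTR_INO_COLLISION */

    struct fattr {
        uint64_t uniqueid;
        unsigned int flags;
    };

    static uint64_t next_ino = 1000;

    /* Toy stand-in for iget5_locked() + cifs_find_inode(): ids below 10 are
     * treated as colliding with an existing directory inode. */
    static void *lookup_inode(struct fattr *f)
    {
        if (f->uniqueid < 10)
            f->flags |= FATTR_INO_COLLISION;
        return &next_ino;               /* any non-NULL cookie will do */
    }

    /* Toy stand-in for iunique(): hands back a number not currently in use. */
    static uint64_t fresh_ino(void)
    {
        return next_ino++;
    }

    void *cifs_iget_model(struct fattr *f)
    {
        void *inode;

    retry:
        inode = lookup_inode(f);
        if (inode && (f->flags & FATTR_INO_COLLISION)) {
            /* drop the colliding hit, pick a fresh id, and look up again */
            f->uniqueid = fresh_ino();
            f->flags &= ~FATTR_INO_COLLISION;
            goto retry;
        }
        return inode;
    }
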
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index a1695dcadd99..d97f9935a028 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -167,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
167 return -EBUSY; 167 return -EBUSY;
168 } 168 }
169 169
170 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
171 if (error)
172 goto bdi_err;
173
170 vc->vc_sb = sb; 174 vc->vc_sb = sb;
171 175
172 sb->s_fs_info = vc; 176 sb->s_fs_info = vc;
@@ -175,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
175 sb->s_blocksize_bits = 12; 179 sb->s_blocksize_bits = 12;
176 sb->s_magic = CODA_SUPER_MAGIC; 180 sb->s_magic = CODA_SUPER_MAGIC;
177 sb->s_op = &coda_super_operations; 181 sb->s_op = &coda_super_operations;
182 sb->s_bdi = &vc->bdi;
178 183
179 /* get root fid from Venus: this needs the root inode */ 184 /* get root fid from Venus: this needs the root inode */
180 error = venus_rootfid(sb, &fid); 185 error = venus_rootfid(sb, &fid);
@@ -200,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
200 return 0; 205 return 0;
201 206
202 error: 207 error:
208 bdi_destroy(&vc->bdi);
209 bdi_err:
203 if (root) 210 if (root)
204 iput(root); 211 iput(root);
205 if (vc) 212 if (vc)
@@ -210,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
210 217
211static void coda_put_super(struct super_block *sb) 218static void coda_put_super(struct super_block *sb)
212{ 219{
220 bdi_destroy(&coda_vcp(sb)->bdi);
213 coda_vcp(sb)->vc_sb = NULL; 221 coda_vcp(sb)->vc_sb = NULL;
214 sb->s_fs_info = NULL; 222 sb->s_fs_info = NULL;
215 223
diff --git a/fs/compat.c b/fs/compat.c
index 4b6ed03cc478..05448730f840 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1531,8 +1531,6 @@ int compat_do_execve(char * filename,
1531 if (retval < 0) 1531 if (retval < 0)
1532 goto out; 1532 goto out;
1533 1533
1534 current->stack_start = current->mm->start_stack;
1535
1536 /* execve succeeded */ 1534 /* execve succeeded */
1537 current->fs->in_exec = 0; 1535 current->fs->in_exec = 0;
1538 current->in_execve = 0; 1536 current->in_execve = 0;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c32a1b6a856b..641640dc7ae5 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -102,7 +102,6 @@
102#include <linux/nbd.h> 102#include <linux/nbd.h>
103#include <linux/random.h> 103#include <linux/random.h>
104#include <linux/filter.h> 104#include <linux/filter.h>
105#include <linux/pktcdvd.h>
106 105
107#include <linux/hiddev.h> 106#include <linux/hiddev.h>
108 107
@@ -1126,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE)
1126COMPATIBLE_IOCTL(PPGETPHASE) 1125COMPATIBLE_IOCTL(PPGETPHASE)
1127COMPATIBLE_IOCTL(PPGETFLAGS) 1126COMPATIBLE_IOCTL(PPGETFLAGS)
1128COMPATIBLE_IOCTL(PPSETFLAGS) 1127COMPATIBLE_IOCTL(PPSETFLAGS)
1129/* pktcdvd */
1130COMPATIBLE_IOCTL(PACKET_CTRL_CMD)
1131/* Big A */ 1128/* Big A */
1132/* sparc only */ 1129/* sparc only */
1133/* Big Q for sound/OSS */ 1130/* Big Q for sound/OSS */
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e48b52205aa..0b502f80c691 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -645,6 +645,7 @@ static void detach_groups(struct config_group *group)
645 645
646 configfs_detach_group(sd->s_element); 646 configfs_detach_group(sd->s_element);
647 child->d_inode->i_flags |= S_DEAD; 647 child->d_inode->i_flags |= S_DEAD;
648 dont_mount(child);
648 649
649 mutex_unlock(&child->d_inode->i_mutex); 650 mutex_unlock(&child->d_inode->i_mutex);
650 651
@@ -840,6 +841,7 @@ static int configfs_attach_item(struct config_item *parent_item,
840 mutex_lock(&dentry->d_inode->i_mutex); 841 mutex_lock(&dentry->d_inode->i_mutex);
841 configfs_remove_dir(item); 842 configfs_remove_dir(item);
842 dentry->d_inode->i_flags |= S_DEAD; 843 dentry->d_inode->i_flags |= S_DEAD;
844 dont_mount(dentry);
843 mutex_unlock(&dentry->d_inode->i_mutex); 845 mutex_unlock(&dentry->d_inode->i_mutex);
844 d_delete(dentry); 846 d_delete(dentry);
845 } 847 }
@@ -882,6 +884,7 @@ static int configfs_attach_group(struct config_item *parent_item,
882 if (ret) { 884 if (ret) {
883 configfs_detach_item(item); 885 configfs_detach_item(item);
884 dentry->d_inode->i_flags |= S_DEAD; 886 dentry->d_inode->i_flags |= S_DEAD;
887 dont_mount(dentry);
885 } 888 }
886 configfs_adjust_dir_dirent_depth_after_populate(sd); 889 configfs_adjust_dir_dirent_depth_after_populate(sd);
887 mutex_unlock(&dentry->d_inode->i_mutex); 890 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1725,6 +1728,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
1725 mutex_unlock(&configfs_symlink_mutex); 1728 mutex_unlock(&configfs_symlink_mutex);
1726 configfs_detach_group(&group->cg_item); 1729 configfs_detach_group(&group->cg_item);
1727 dentry->d_inode->i_flags |= S_DEAD; 1730 dentry->d_inode->i_flags |= S_DEAD;
1731 dont_mount(dentry);
1728 mutex_unlock(&dentry->d_inode->i_mutex); 1732 mutex_unlock(&dentry->d_inode->i_mutex);
1729 1733
1730 d_delete(dentry); 1734 d_delete(dentry);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index efb2b9400391..1cc087635a5e 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -382,8 +382,8 @@ out:
382static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, 382static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num,
383 struct ecryptfs_crypt_stat *crypt_stat) 383 struct ecryptfs_crypt_stat *crypt_stat)
384{ 384{
385 (*offset) = (crypt_stat->num_header_bytes_at_front 385 (*offset) = ecryptfs_lower_header_size(crypt_stat)
386 + (crypt_stat->extent_size * extent_num)); 386 + (crypt_stat->extent_size * extent_num);
387} 387}
388 388
389/** 389/**
@@ -835,13 +835,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat)
835 set_extent_mask_and_shift(crypt_stat); 835 set_extent_mask_and_shift(crypt_stat);
836 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; 836 crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES;
837 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 837 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
838 crypt_stat->num_header_bytes_at_front = 0; 838 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
839 else { 839 else {
840 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) 840 if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)
841 crypt_stat->num_header_bytes_at_front = 841 crypt_stat->metadata_size =
842 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; 842 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
843 else 843 else
844 crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE; 844 crypt_stat->metadata_size = PAGE_CACHE_SIZE;
845 } 845 }
846} 846}
847 847
@@ -1108,9 +1108,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written)
1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; 1108 (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES;
1109} 1109}
1110 1110
1111static void 1111void ecryptfs_write_crypt_stat_flags(char *page_virt,
1112write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, 1112 struct ecryptfs_crypt_stat *crypt_stat,
1113 size_t *written) 1113 size_t *written)
1114{ 1114{
1115 u32 flags = 0; 1115 u32 flags = 0;
1116 int i; 1116 int i;
@@ -1238,8 +1238,7 @@ ecryptfs_write_header_metadata(char *virt,
1238 1238
1239 header_extent_size = (u32)crypt_stat->extent_size; 1239 header_extent_size = (u32)crypt_stat->extent_size;
1240 num_header_extents_at_front = 1240 num_header_extents_at_front =
1241 (u16)(crypt_stat->num_header_bytes_at_front 1241 (u16)(crypt_stat->metadata_size / crypt_stat->extent_size);
1242 / crypt_stat->extent_size);
1243 put_unaligned_be32(header_extent_size, virt); 1242 put_unaligned_be32(header_extent_size, virt);
1244 virt += 4; 1243 virt += 4;
1245 put_unaligned_be16(num_header_extents_at_front, virt); 1244 put_unaligned_be16(num_header_extents_at_front, virt);
@@ -1292,7 +1291,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
1292 offset = ECRYPTFS_FILE_SIZE_BYTES; 1291 offset = ECRYPTFS_FILE_SIZE_BYTES;
1293 write_ecryptfs_marker((page_virt + offset), &written); 1292 write_ecryptfs_marker((page_virt + offset), &written);
1294 offset += written; 1293 offset += written;
1295 write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); 1294 ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat,
1295 &written);
1296 offset += written; 1296 offset += written;
1297 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, 1297 ecryptfs_write_header_metadata((page_virt + offset), crypt_stat,
1298 &written); 1298 &written);
@@ -1382,7 +1382,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1382 rc = -EINVAL; 1382 rc = -EINVAL;
1383 goto out; 1383 goto out;
1384 } 1384 }
1385 virt_len = crypt_stat->num_header_bytes_at_front; 1385 virt_len = crypt_stat->metadata_size;
1386 order = get_order(virt_len); 1386 order = get_order(virt_len);
1387 /* Released in this function */ 1387 /* Released in this function */
1388 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); 1388 virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
@@ -1428,16 +1428,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1428 header_extent_size = get_unaligned_be32(virt); 1428 header_extent_size = get_unaligned_be32(virt);
1429 virt += sizeof(__be32); 1429 virt += sizeof(__be32);
1430 num_header_extents_at_front = get_unaligned_be16(virt); 1430 num_header_extents_at_front = get_unaligned_be16(virt);
1431 crypt_stat->num_header_bytes_at_front = 1431 crypt_stat->metadata_size = (((size_t)num_header_extents_at_front
1432 (((size_t)num_header_extents_at_front 1432 * (size_t)header_extent_size));
1433 * (size_t)header_extent_size));
1434 (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); 1433 (*bytes_read) = (sizeof(__be32) + sizeof(__be16));
1435 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) 1434 if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE)
1436 && (crypt_stat->num_header_bytes_at_front 1435 && (crypt_stat->metadata_size
1437 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { 1436 < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) {
1438 rc = -EINVAL; 1437 rc = -EINVAL;
1439 printk(KERN_WARNING "Invalid header size: [%zd]\n", 1438 printk(KERN_WARNING "Invalid header size: [%zd]\n",
1440 crypt_stat->num_header_bytes_at_front); 1439 crypt_stat->metadata_size);
1441 } 1440 }
1442 return rc; 1441 return rc;
1443} 1442}
@@ -1452,8 +1451,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat,
1452 */ 1451 */
1453static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) 1452static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat)
1454{ 1453{
1455 crypt_stat->num_header_bytes_at_front = 1454 crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1456 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
1457} 1455}
1458 1456
1459/** 1457/**
@@ -1607,6 +1605,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry)
1607 ecryptfs_dentry, 1605 ecryptfs_dentry,
1608 ECRYPTFS_VALIDATE_HEADER_SIZE); 1606 ECRYPTFS_VALIDATE_HEADER_SIZE);
1609 if (rc) { 1607 if (rc) {
1608 memset(page_virt, 0, PAGE_CACHE_SIZE);
1610 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); 1609 rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode);
1611 if (rc) { 1610 if (rc) {
1612 printk(KERN_DEBUG "Valid eCryptfs headers not found in " 1611 printk(KERN_DEBUG "Valid eCryptfs headers not found in "
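
[editor's note] The ecryptfs/crypto.c changes above rename num_header_bytes_at_front to metadata_size and route lower-file offset math through ecryptfs_lower_header_size(), which is zero when the metadata lives in an xattr; ecryptfs_read_metadata() also re-zeroes the scratch page before falling back to the xattr region. A small model of the offset calculation, with the flag and sizes as illustrative constants:

    #include <stddef.h>

    #define METADATA_IN_XATTR 0x1       /* stands in for ECRYPTFS_METADATA_IN_XATTR */

    struct crypt_stat {
        unsigned int flags;
        size_t metadata_size;           /* bytes reserved for headers */
        size_t extent_size;             /* data extent size, e.g. 4096 */
    };

    /* Header bytes actually stored at the front of the lower file. */
    static size_t lower_header_size(const struct crypt_stat *cs)
    {
        if (cs->flags & METADATA_IN_XATTR)
            return 0;                   /* headers live in the xattr instead */
        return cs->metadata_size;
    }

    /* Byte offset of a given data extent in the lower (encrypted) file. */
    long long lower_offset_for_extent(const struct crypt_stat *cs,
                                      long long extent_num)
    {
        return (long long)lower_header_size(cs)
               + (long long)cs->extent_size * extent_num;
    }
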
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 542f625312f3..bfc2e0f78f00 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -35,6 +35,7 @@
35#include <linux/scatterlist.h> 35#include <linux/scatterlist.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/nsproxy.h> 37#include <linux/nsproxy.h>
38#include <linux/backing-dev.h>
38 39
39/* Version verification for shared data structures w/ userspace */ 40/* Version verification for shared data structures w/ userspace */
40#define ECRYPTFS_VERSION_MAJOR 0x00 41#define ECRYPTFS_VERSION_MAJOR 0x00
@@ -273,7 +274,7 @@ struct ecryptfs_crypt_stat {
273 u32 flags; 274 u32 flags;
274 unsigned int file_version; 275 unsigned int file_version;
275 size_t iv_bytes; 276 size_t iv_bytes;
276 size_t num_header_bytes_at_front; 277 size_t metadata_size;
277 size_t extent_size; /* Data extent size; default is 4096 */ 278 size_t extent_size; /* Data extent size; default is 4096 */
278 size_t key_size; 279 size_t key_size;
279 size_t extent_shift; 280 size_t extent_shift;
@@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat {
393struct ecryptfs_sb_info { 394struct ecryptfs_sb_info {
394 struct super_block *wsi_sb; 395 struct super_block *wsi_sb;
395 struct ecryptfs_mount_crypt_stat mount_crypt_stat; 396 struct ecryptfs_mount_crypt_stat mount_crypt_stat;
397 struct backing_dev_info bdi;
396}; 398};
397 399
398/* file private data. */ 400/* file private data. */
@@ -464,6 +466,14 @@ struct ecryptfs_daemon {
464 466
465extern struct mutex ecryptfs_daemon_hash_mux; 467extern struct mutex ecryptfs_daemon_hash_mux;
466 468
469static inline size_t
470ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat)
471{
472 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
473 return 0;
474 return crypt_stat->metadata_size;
475}
476
467static inline struct ecryptfs_file_info * 477static inline struct ecryptfs_file_info *
468ecryptfs_file_to_private(struct file *file) 478ecryptfs_file_to_private(struct file *file)
469{ 479{
@@ -651,6 +661,9 @@ int ecryptfs_decrypt_page(struct page *page);
651int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); 661int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry);
652int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); 662int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
653int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); 663int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry);
664void ecryptfs_write_crypt_stat_flags(char *page_virt,
665 struct ecryptfs_crypt_stat *crypt_stat,
666 size_t *written);
654int ecryptfs_read_and_validate_header_region(char *data, 667int ecryptfs_read_and_validate_header_region(char *data,
655 struct inode *ecryptfs_inode); 668 struct inode *ecryptfs_inode);
656int ecryptfs_read_and_validate_xattr_region(char *page_virt, 669int ecryptfs_read_and_validate_xattr_region(char *page_virt,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d3362faf3852..e2d4418affac 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -324,6 +324,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
324 rc = ecryptfs_read_and_validate_header_region(page_virt, 324 rc = ecryptfs_read_and_validate_header_region(page_virt,
325 ecryptfs_dentry->d_inode); 325 ecryptfs_dentry->d_inode);
326 if (rc) { 326 if (rc) {
327 memset(page_virt, 0, PAGE_CACHE_SIZE);
327 rc = ecryptfs_read_and_validate_xattr_region(page_virt, 328 rc = ecryptfs_read_and_validate_xattr_region(page_virt,
328 ecryptfs_dentry); 329 ecryptfs_dentry);
329 if (rc) { 330 if (rc) {
@@ -336,7 +337,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
336 ecryptfs_dentry->d_sb)->mount_crypt_stat; 337 ecryptfs_dentry->d_sb)->mount_crypt_stat;
337 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { 338 if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) {
338 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) 339 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
339 file_size = (crypt_stat->num_header_bytes_at_front 340 file_size = (crypt_stat->metadata_size
340 + i_size_read(lower_dentry->d_inode)); 341 + i_size_read(lower_dentry->d_inode));
341 else 342 else
342 file_size = i_size_read(lower_dentry->d_inode); 343 file_size = i_size_read(lower_dentry->d_inode);
@@ -388,9 +389,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
388 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 389 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
389 if (IS_ERR(lower_dentry)) { 390 if (IS_ERR(lower_dentry)) {
390 rc = PTR_ERR(lower_dentry); 391 rc = PTR_ERR(lower_dentry);
391 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 392 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
392 "lower_dentry = [%s]\n", __func__, rc, 393 "[%d] on lower_dentry = [%s]\n", __func__, rc,
393 ecryptfs_dentry->d_name.name); 394 encrypted_and_encoded_name);
394 goto out_d_drop; 395 goto out_d_drop;
395 } 396 }
396 if (lower_dentry->d_inode) 397 if (lower_dentry->d_inode)
@@ -417,9 +418,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
417 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); 418 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
418 if (IS_ERR(lower_dentry)) { 419 if (IS_ERR(lower_dentry)) {
419 rc = PTR_ERR(lower_dentry); 420 rc = PTR_ERR(lower_dentry);
420 printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " 421 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
421 "lower_dentry = [%s]\n", __func__, rc, 422 "[%d] on lower_dentry = [%s]\n", __func__, rc,
422 encrypted_and_encoded_name); 423 encrypted_and_encoded_name);
423 goto out_d_drop; 424 goto out_d_drop;
424 } 425 }
425lookup_and_interpose: 426lookup_and_interpose:
@@ -456,8 +457,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
456 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); 457 rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0);
457 if (rc) 458 if (rc)
458 goto out_lock; 459 goto out_lock;
459 fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); 460 fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode);
460 fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); 461 fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode);
461 old_dentry->d_inode->i_nlink = 462 old_dentry->d_inode->i_nlink =
462 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; 463 ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink;
463 i_size_write(new_dentry->d_inode, file_size_save); 464 i_size_write(new_dentry->d_inode, file_size_save);
@@ -648,38 +649,17 @@ out_lock:
648 return rc; 649 return rc;
649} 650}
650 651
651static int 652static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
652ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) 653 size_t *bufsiz)
653{ 654{
655 struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
654 char *lower_buf; 656 char *lower_buf;
655 size_t lower_bufsiz; 657 size_t lower_bufsiz = PATH_MAX;
656 struct dentry *lower_dentry;
657 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
658 char *plaintext_name;
659 size_t plaintext_name_size;
660 mm_segment_t old_fs; 658 mm_segment_t old_fs;
661 int rc; 659 int rc;
662 660
663 lower_dentry = ecryptfs_dentry_to_lower(dentry);
664 if (!lower_dentry->d_inode->i_op->readlink) {
665 rc = -EINVAL;
666 goto out;
667 }
668 mount_crypt_stat = &ecryptfs_superblock_to_private(
669 dentry->d_sb)->mount_crypt_stat;
670 /*
671 * If the lower filename is encrypted, it will result in a significantly
672 * longer name. If needed, truncate the name after decode and decrypt.
673 */
674 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
675 lower_bufsiz = PATH_MAX;
676 else
677 lower_bufsiz = bufsiz;
678 /* Released in this function */
679 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); 661 lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
680 if (lower_buf == NULL) { 662 if (!lower_buf) {
681 printk(KERN_ERR "%s: Out of memory whilst attempting to "
682 "kmalloc [%zd] bytes\n", __func__, lower_bufsiz);
683 rc = -ENOMEM; 663 rc = -ENOMEM;
684 goto out; 664 goto out;
685 } 665 }
@@ -689,29 +669,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
689 (char __user *)lower_buf, 669 (char __user *)lower_buf,
690 lower_bufsiz); 670 lower_bufsiz);
691 set_fs(old_fs); 671 set_fs(old_fs);
692 if (rc >= 0) { 672 if (rc < 0)
693 rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, 673 goto out;
694 &plaintext_name_size, 674 lower_bufsiz = rc;
695 dentry, lower_buf, 675 rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
696 rc); 676 lower_buf, lower_bufsiz);
697 if (rc) { 677out:
698 printk(KERN_ERR "%s: Error attempting to decode and "
699 "decrypt filename; rc = [%d]\n", __func__,
700 rc);
701 goto out_free_lower_buf;
702 }
703 /* Check for bufsiz <= 0 done in sys_readlinkat() */
704 rc = copy_to_user(buf, plaintext_name,
705 min((size_t) bufsiz, plaintext_name_size));
706 if (rc)
707 rc = -EFAULT;
708 else
709 rc = plaintext_name_size;
710 kfree(plaintext_name);
711 fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode);
712 }
713out_free_lower_buf:
714 kfree(lower_buf); 678 kfree(lower_buf);
679 return rc;
680}
681
682static int
683ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
684{
685 char *kbuf;
686 size_t kbufsiz, copied;
687 int rc;
688
689 rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
690 if (rc)
691 goto out;
692 copied = min_t(size_t, bufsiz, kbufsiz);
693 rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
694 kfree(kbuf);
695 fsstack_copy_attr_atime(dentry->d_inode,
696 ecryptfs_dentry_to_lower(dentry)->d_inode);
715out: 697out:
716 return rc; 698 return rc;
717} 699}
@@ -769,7 +751,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat,
769{ 751{
770 loff_t lower_size; 752 loff_t lower_size;
771 753
772 lower_size = crypt_stat->num_header_bytes_at_front; 754 lower_size = ecryptfs_lower_header_size(crypt_stat);
773 if (upper_size != 0) { 755 if (upper_size != 0) {
774 loff_t num_extents; 756 loff_t num_extents;
775 757
@@ -1016,6 +998,28 @@ out:
1016 return rc; 998 return rc;
1017} 999}
1018 1000
1001int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
1002 struct kstat *stat)
1003{
1004 struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
1005 int rc = 0;
1006
1007 mount_crypt_stat = &ecryptfs_superblock_to_private(
1008 dentry->d_sb)->mount_crypt_stat;
1009 generic_fillattr(dentry->d_inode, stat);
1010 if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) {
1011 char *target;
1012 size_t targetsiz;
1013
1014 rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
1015 if (!rc) {
1016 kfree(target);
1017 stat->size = targetsiz;
1018 }
1019 }
1020 return rc;
1021}
1022
1019int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1023int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1020 struct kstat *stat) 1024 struct kstat *stat)
1021{ 1025{
@@ -1040,7 +1044,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1040 1044
1041 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1045 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1042 if (!lower_dentry->d_inode->i_op->setxattr) { 1046 if (!lower_dentry->d_inode->i_op->setxattr) {
1043 rc = -ENOSYS; 1047 rc = -EOPNOTSUPP;
1044 goto out; 1048 goto out;
1045 } 1049 }
1046 mutex_lock(&lower_dentry->d_inode->i_mutex); 1050 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1058,7 +1062,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
1058 int rc = 0; 1062 int rc = 0;
1059 1063
1060 if (!lower_dentry->d_inode->i_op->getxattr) { 1064 if (!lower_dentry->d_inode->i_op->getxattr) {
1061 rc = -ENOSYS; 1065 rc = -EOPNOTSUPP;
1062 goto out; 1066 goto out;
1063 } 1067 }
1064 mutex_lock(&lower_dentry->d_inode->i_mutex); 1068 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1085,7 +1089,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
1085 1089
1086 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1090 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1087 if (!lower_dentry->d_inode->i_op->listxattr) { 1091 if (!lower_dentry->d_inode->i_op->listxattr) {
1088 rc = -ENOSYS; 1092 rc = -EOPNOTSUPP;
1089 goto out; 1093 goto out;
1090 } 1094 }
1091 mutex_lock(&lower_dentry->d_inode->i_mutex); 1095 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1102,7 +1106,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
1102 1106
1103 lower_dentry = ecryptfs_dentry_to_lower(dentry); 1107 lower_dentry = ecryptfs_dentry_to_lower(dentry);
1104 if (!lower_dentry->d_inode->i_op->removexattr) { 1108 if (!lower_dentry->d_inode->i_op->removexattr) {
1105 rc = -ENOSYS; 1109 rc = -EOPNOTSUPP;
1106 goto out; 1110 goto out;
1107 } 1111 }
1108 mutex_lock(&lower_dentry->d_inode->i_mutex); 1112 mutex_lock(&lower_dentry->d_inode->i_mutex);
@@ -1133,6 +1137,7 @@ const struct inode_operations ecryptfs_symlink_iops = {
1133 .put_link = ecryptfs_put_link, 1137 .put_link = ecryptfs_put_link,
1134 .permission = ecryptfs_permission, 1138 .permission = ecryptfs_permission,
1135 .setattr = ecryptfs_setattr, 1139 .setattr = ecryptfs_setattr,
1140 .getattr = ecryptfs_getattr_link,
1136 .setxattr = ecryptfs_setxattr, 1141 .setxattr = ecryptfs_setxattr,
1137 .getxattr = ecryptfs_getxattr, 1142 .getxattr = ecryptfs_getxattr,
1138 .listxattr = ecryptfs_listxattr, 1143 .listxattr = ecryptfs_listxattr,
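The readlink rework above splits the work in two: ecryptfs_readlink_lower() reads the lower symlink into a PATH_MAX kernel buffer, decodes and decrypts it, and hands back the buffer plus its length, while ecryptfs_readlink() merely clamps to the caller's buffer and copies out, returning -EFAULT or the number of bytes copied. A minimal userspace sketch of that fill-then-clamp-and-copy pattern follows; the helper names are invented, and malloc()/memcpy() stand in for the decrypt step and copy_to_user().

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the decode+decrypt step: just duplicates the name. */
static int readlink_lower_sketch(const char *lower_target, char **buf, size_t *bufsiz)
{
	size_t n = strlen(lower_target);
	char *k = malloc(n);

	if (!k)
		return -ENOMEM;
	memcpy(k, lower_target, n);	/* decryption would happen here */
	*buf = k;
	*bufsiz = n;
	return 0;
}

/* Clamp to the caller's buffer, mimicking readlink(2) truncation rules. */
static int readlink_sketch(const char *lower_target, char *ubuf, size_t ubufsiz)
{
	char *kbuf;
	size_t kbufsiz, copied;
	int rc = readlink_lower_sketch(lower_target, &kbuf, &kbufsiz);

	if (rc)
		return rc;
	copied = ubufsiz < kbufsiz ? ubufsiz : kbufsiz;
	memcpy(ubuf, kbuf, copied);	/* copy_to_user() in the kernel */
	free(kbuf);
	return (int)copied;
}

int main(void)
{
	char out[8];
	int n = readlink_sketch("decrypted-target-name", out, sizeof(out));

	if (n < 0)
		return 1;
	printf("returned %d bytes: %.*s\n", n, n, out);
	return 0;
}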
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index af1a8f01ebac..760983d0f25e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -497,17 +497,25 @@ struct kmem_cache *ecryptfs_sb_info_cache;
497static int 497static int
498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) 498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
499{ 499{
500 struct ecryptfs_sb_info *esi;
500 int rc = 0; 501 int rc = 0;
501 502
502 /* Released in ecryptfs_put_super() */ 503 /* Released in ecryptfs_put_super() */
503 ecryptfs_set_superblock_private(sb, 504 ecryptfs_set_superblock_private(sb,
504 kmem_cache_zalloc(ecryptfs_sb_info_cache, 505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
505 GFP_KERNEL)); 506 GFP_KERNEL));
506 if (!ecryptfs_superblock_to_private(sb)) { 507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
507 ecryptfs_printk(KERN_WARNING, "Out of memory\n"); 509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
508 rc = -ENOMEM; 510 rc = -ENOMEM;
509 goto out; 511 goto out;
510 } 512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
511 sb->s_op = &ecryptfs_sops; 519 sb->s_op = &ecryptfs_sops;
512 /* Released through deactivate_super(sb) from get_sb_nodev */ 520 /* Released through deactivate_super(sb) from get_sb_nodev */
513 sb->s_root = d_alloc(NULL, &(const struct qstr) { 521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index d491237c98e7..2ee9a3a7b68c 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -83,6 +83,19 @@ out:
83 return rc; 83 return rc;
84} 84}
85 85
86static void strip_xattr_flag(char *page_virt,
87 struct ecryptfs_crypt_stat *crypt_stat)
88{
89 if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
90 size_t written;
91
92 crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR;
93 ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat,
94 &written);
95 crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR;
96 }
97}
98
86/** 99/**
87 * Header Extent: 100 * Header Extent:
88 * Octets 0-7: Unencrypted file size (big-endian) 101 * Octets 0-7: Unencrypted file size (big-endian)
@@ -98,19 +111,6 @@ out:
98 * (big-endian) 111 * (big-endian)
99 * Octet 26: Begin RFC 2440 authentication token packet set 112 * Octet 26: Begin RFC 2440 authentication token packet set
100 */ 113 */
101static void set_header_info(char *page_virt,
102 struct ecryptfs_crypt_stat *crypt_stat)
103{
104 size_t written;
105 size_t save_num_header_bytes_at_front =
106 crypt_stat->num_header_bytes_at_front;
107
108 crypt_stat->num_header_bytes_at_front =
109 ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE;
110 ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written);
111 crypt_stat->num_header_bytes_at_front =
112 save_num_header_bytes_at_front;
113}
114 114
115/** 115/**
116 * ecryptfs_copy_up_encrypted_with_header 116 * ecryptfs_copy_up_encrypted_with_header
@@ -136,8 +136,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
136 * num_extents_per_page) 136 * num_extents_per_page)
137 + extent_num_in_page); 137 + extent_num_in_page);
138 size_t num_header_extents_at_front = 138 size_t num_header_extents_at_front =
139 (crypt_stat->num_header_bytes_at_front 139 (crypt_stat->metadata_size / crypt_stat->extent_size);
140 / crypt_stat->extent_size);
141 140
142 if (view_extent_num < num_header_extents_at_front) { 141 if (view_extent_num < num_header_extents_at_front) {
143 /* This is a header extent */ 142 /* This is a header extent */
@@ -147,9 +146,14 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
147 memset(page_virt, 0, PAGE_CACHE_SIZE); 146 memset(page_virt, 0, PAGE_CACHE_SIZE);
148 /* TODO: Support more than one header extent */ 147 /* TODO: Support more than one header extent */
149 if (view_extent_num == 0) { 148 if (view_extent_num == 0) {
149 size_t written;
150
150 rc = ecryptfs_read_xattr_region( 151 rc = ecryptfs_read_xattr_region(
151 page_virt, page->mapping->host); 152 page_virt, page->mapping->host);
152 set_header_info(page_virt, crypt_stat); 153 strip_xattr_flag(page_virt + 16, crypt_stat);
154 ecryptfs_write_header_metadata(page_virt + 20,
155 crypt_stat,
156 &written);
153 } 157 }
154 kunmap_atomic(page_virt, KM_USER0); 158 kunmap_atomic(page_virt, KM_USER0);
155 flush_dcache_page(page); 159 flush_dcache_page(page);
@@ -162,7 +166,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
162 /* This is an encrypted data extent */ 166 /* This is an encrypted data extent */
163 loff_t lower_offset = 167 loff_t lower_offset =
164 ((view_extent_num * crypt_stat->extent_size) 168 ((view_extent_num * crypt_stat->extent_size)
165 - crypt_stat->num_header_bytes_at_front); 169 - crypt_stat->metadata_size);
166 170
167 rc = ecryptfs_read_lower_page_segment( 171 rc = ecryptfs_read_lower_page_segment(
168 page, (lower_offset >> PAGE_CACHE_SHIFT), 172 page, (lower_offset >> PAGE_CACHE_SHIFT),
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index fcef41c1d2cf..0c0ae491d231 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -86,7 +86,6 @@ static void ecryptfs_destroy_inode(struct inode *inode)
86 if (lower_dentry->d_inode) { 86 if (lower_dentry->d_inode) {
87 fput(inode_info->lower_file); 87 fput(inode_info->lower_file);
88 inode_info->lower_file = NULL; 88 inode_info->lower_file = NULL;
89 d_drop(lower_dentry);
90 } 89 }
91 } 90 }
92 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); 91 ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat);
@@ -123,6 +122,7 @@ static void ecryptfs_put_super(struct super_block *sb)
123 lock_kernel(); 122 lock_kernel();
124 123
125 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); 124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info); 126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
127 ecryptfs_set_superblock_private(sb, NULL); 127 ecryptfs_set_superblock_private(sb, NULL);
128 128
diff --git a/fs/exec.c b/fs/exec.c
index 49cdaa19e5b9..e6e94c626c2c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1387,8 +1387,6 @@ int do_execve(char * filename,
1387 if (retval < 0) 1387 if (retval < 0)
1388 goto out; 1388 goto out;
1389 1389
1390 current->stack_start = current->mm->start_stack;
1391
1392 /* execve succeeded */ 1390 /* execve succeeded */
1393 current->fs->in_exec = 0; 1391 current->fs->in_exec = 0;
1394 current->in_execve = 0; 1392 current->in_execve = 0;
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 8442e353309f..22721b2fd890 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -35,6 +35,7 @@
35 35
36#include <linux/fs.h> 36#include <linux/fs.h>
37#include <linux/time.h> 37#include <linux/time.h>
38#include <linux/backing-dev.h>
38#include "common.h" 39#include "common.h"
39 40
40/* FIXME: Remove once pnfs hits mainline 41/* FIXME: Remove once pnfs hits mainline
@@ -84,6 +85,7 @@ struct exofs_sb_info {
84 u32 s_next_generation; /* next gen # to use */ 85 u32 s_next_generation; /* next gen # to use */
85 atomic_t s_curr_pending; /* number of pending commands */ 86 atomic_t s_curr_pending; /* number of pending commands */
86 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ 87 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
88 struct backing_dev_info bdi; /* register our bdi with VFS */
87 89
88 struct pnfs_osd_data_map data_map; /* Default raid to use 90 struct pnfs_osd_data_map data_map; /* Default raid to use
89 * FIXME: Needed ? 91 * FIXME: Needed ?
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 18e57ea1e5b4..03149b9a5178 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -302,6 +302,7 @@ static void exofs_put_super(struct super_block *sb)
302 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], 302 _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0],
303 sbi->layout.s_pid); 303 sbi->layout.s_pid);
304 304
305 bdi_destroy(&sbi->bdi);
305 exofs_free_sbi(sbi); 306 exofs_free_sbi(sbi);
306 sb->s_fs_info = NULL; 307 sb->s_fs_info = NULL;
307} 308}
@@ -546,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
546 if (!sbi) 547 if (!sbi)
547 return -ENOMEM; 548 return -ENOMEM;
548 549
550 ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
551 if (ret)
552 goto free_bdi;
553
549 /* use mount options to fill superblock */ 554 /* use mount options to fill superblock */
550 od = osduld_path_lookup(opts->dev_name); 555 od = osduld_path_lookup(opts->dev_name);
551 if (IS_ERR(od)) { 556 if (IS_ERR(od)) {
@@ -612,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
612 } 617 }
613 618
614 /* set up operation vectors */ 619 /* set up operation vectors */
620 sb->s_bdi = &sbi->bdi;
615 sb->s_fs_info = sbi; 621 sb->s_fs_info = sbi;
616 sb->s_op = &exofs_sops; 622 sb->s_op = &exofs_sops;
617 sb->s_export_op = &exofs_export_ops; 623 sb->s_export_op = &exofs_export_ops;
@@ -643,6 +649,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
643 return 0; 649 return 0;
644 650
645free_sbi: 651free_sbi:
652 bdi_destroy(&sbi->bdi);
653free_bdi:
646 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 654 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
647 opts->dev_name, sbi->layout.s_pid, ret); 655 opts->dev_name, sbi->layout.s_pid, ret);
648 exofs_free_sbi(sbi); 656 exofs_free_sbi(sbi);
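Both eCryptfs and exofs now carry a per-mount backing_dev_info: bdi_setup_and_register() during fill_super, sb->s_bdi pointed at it, and bdi_destroy() in put_super and on every failure path reached after registration. The exofs hunk also shows the usual goto-unwind ordering, where each acquired resource is released in reverse order on error. The sketch below illustrates only that unwind idiom, with invented resource names and labels that intentionally do not match the patch's.

#include <stdio.h>

static int alloc_sbi(void)    { puts("sbi allocated");  return 0; }
static void free_sbi(void)    { puts("sbi freed");      }
static int register_bdi(void) { puts("bdi registered"); return 0; }
static void destroy_bdi(void) { puts("bdi destroyed");  }
static int lookup_dev(void)   { puts("device lookup");  return -1; /* simulate failure */ }

static int fill_super_sketch(void)
{
	int ret = alloc_sbi();

	if (ret)
		return ret;
	ret = register_bdi();
	if (ret)
		goto out_sbi;
	ret = lookup_dev();
	if (ret)
		goto out_bdi;
	return 0;

out_bdi:
	destroy_bdi();	/* undo in reverse order of setup */
out_sbi:
	free_sbi();
	return ret;
}

int main(void)
{
	return fill_super_sketch() ? 1 : 0;
}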
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 94c8ee81f5e1..236b834b4ca8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3879,6 +3879,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
3879 physical += offset; 3879 physical += offset;
3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset; 3880 length = EXT4_SB(inode->i_sb)->s_inode_size - offset;
3881 flags |= FIEMAP_EXTENT_DATA_INLINE; 3881 flags |= FIEMAP_EXTENT_DATA_INLINE;
3882 brelse(iloc.bh);
3882 } else { /* external block */ 3883 } else { /* external block */
3883 physical = EXT4_I(inode)->i_file_acl << blockbits; 3884 physical = EXT4_I(inode)->i_file_acl << blockbits;
3884 length = inode->i_sb->s_blocksize; 3885 length = inode->i_sb->s_blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5381802d6052..81d605412844 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5375,7 +5375,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5375 } else { 5375 } else {
5376 struct ext4_iloc iloc; 5376 struct ext4_iloc iloc;
5377 5377
5378 err = ext4_get_inode_loc(inode, &iloc); 5378 err = __ext4_get_inode_loc(inode, &iloc, 0);
5379 if (err) 5379 if (err)
5380 return err; 5380 return err;
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5381 if (wbc->sync_mode == WB_SYNC_ALL)
@@ -5386,6 +5386,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5386 (unsigned long long)iloc.bh->b_blocknr); 5386 (unsigned long long)iloc.bh->b_blocknr);
5387 err = -EIO; 5387 err = -EIO;
5388 } 5388 }
5389 brelse(iloc.bh);
5389 } 5390 }
5390 return err; 5391 return err;
5391} 5392}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bde9d0b170c2..b423a364dca3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2535,6 +2535,17 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2535 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2535 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2536 entry->count, entry->group, entry); 2536 entry->count, entry->group, entry);
2537 2537
2538 if (test_opt(sb, DISCARD)) {
2539 ext4_fsblk_t discard_block;
2540
2541 discard_block = entry->start_blk +
2542 ext4_group_first_block_no(sb, entry->group);
2543 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block,
2545 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count);
2547 }
2548
2538 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2539 /* we expect to find existing buddy because it's pinned */ 2550 /* we expect to find existing buddy because it's pinned */
2540 BUG_ON(err != 0); 2551 BUG_ON(err != 0);
@@ -2556,16 +2567,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2556 page_cache_release(e4b.bd_bitmap_page); 2567 page_cache_release(e4b.bd_bitmap_page);
2557 } 2568 }
2558 ext4_unlock_group(sb, entry->group); 2569 ext4_unlock_group(sb, entry->group);
2559 if (test_opt(sb, DISCARD)) {
2560 ext4_fsblk_t discard_block;
2561
2562 discard_block = entry->start_blk +
2563 ext4_group_first_block_no(sb, entry->group);
2564 trace_ext4_discard_blocks(sb,
2565 (unsigned long long)discard_block,
2566 entry->count);
2567 sb_issue_discard(sb, discard_block, entry->count);
2568 }
2569 kmem_cache_free(ext4_free_ext_cachep, entry); 2570 kmem_cache_free(ext4_free_ext_cachep, entry);
2570 ext4_mb_release_desc(&e4b); 2571 ext4_mb_release_desc(&e4b);
2571 } 2572 }
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6c751106c2e5..7faefb4da939 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
228 228
229#ifdef CONFIG_BLOCK 229#ifdef CONFIG_BLOCK
230 230
231#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) 231static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
232#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); 232{
233 return (offset >> inode->i_blkbits);
234}
235
236static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
237{
238 return (blk << inode->i_blkbits);
239}
233 240
234/** 241/**
235 * __generic_block_fiemap - FIEMAP for block based inodes (no locking) 242 * __generic_block_fiemap - FIEMAP for block based inodes (no locking)
236 * @inode - the inode to map 243 * @inode: the inode to map
237 * @arg - the pointer to userspace where we copy everything to 244 * @fieinfo: the fiemap info struct that will be passed back to userspace
238 * @get_block - the fs's get_block function 245 * @start: where to start mapping in the inode
246 * @len: how much space to map
247 * @get_block: the fs's get_block function
239 * 248 *
240 * This does FIEMAP for block based inodes. Basically it will just loop 249 * This does FIEMAP for block based inodes. Basically it will just loop
241 * through get_block until we hit the number of extents we want to map, or we 250 * through get_block until we hit the number of extents we want to map, or we
@@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
250 */ 259 */
251 260
252int __generic_block_fiemap(struct inode *inode, 261int __generic_block_fiemap(struct inode *inode,
253 struct fiemap_extent_info *fieinfo, u64 start, 262 struct fiemap_extent_info *fieinfo, loff_t start,
254 u64 len, get_block_t *get_block) 263 loff_t len, get_block_t *get_block)
255{ 264{
256 struct buffer_head tmp; 265 struct buffer_head map_bh;
257 unsigned long long start_blk; 266 sector_t start_blk, last_blk;
258 long long length = 0, map_len = 0; 267 loff_t isize = i_size_read(inode);
259 u64 logical = 0, phys = 0, size = 0; 268 u64 logical = 0, phys = 0, size = 0;
260 u32 flags = FIEMAP_EXTENT_MERGED; 269 u32 flags = FIEMAP_EXTENT_MERGED;
261 int ret = 0, past_eof = 0, whole_file = 0; 270 bool past_eof = false, whole_file = false;
271 int ret = 0;
262 272
263 if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) 273 ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
274 if (ret)
264 return ret; 275 return ret;
265 276
266 start_blk = logical_to_blk(inode, start); 277 /*
267 278 * Either the i_mutex or other appropriate locking needs to be held
268 length = (long long)min_t(u64, len, i_size_read(inode)); 279 * since we expect isize to not change at all through the duration of
269 if (length < len) 280 * this call.
270 whole_file = 1; 281 */
282 if (len >= isize) {
283 whole_file = true;
284 len = isize;
285 }
271 286
272 map_len = length; 287 start_blk = logical_to_blk(inode, start);
288 last_blk = logical_to_blk(inode, start + len - 1);
273 289
274 do { 290 do {
275 /* 291 /*
276 * we set b_size to the total size we want so it will map as 292 * we set b_size to the total size we want so it will map as
277 * many contiguous blocks as possible at once 293 * many contiguous blocks as possible at once
278 */ 294 */
279 memset(&tmp, 0, sizeof(struct buffer_head)); 295 memset(&map_bh, 0, sizeof(struct buffer_head));
280 tmp.b_size = map_len; 296 map_bh.b_size = len;
281 297
282 ret = get_block(inode, start_blk, &tmp, 0); 298 ret = get_block(inode, start_blk, &map_bh, 0);
283 if (ret) 299 if (ret)
284 break; 300 break;
285 301
286 /* HOLE */ 302 /* HOLE */
287 if (!buffer_mapped(&tmp)) { 303 if (!buffer_mapped(&map_bh)) {
288 length -= blk_to_logical(inode, 1);
289 start_blk++; 304 start_blk++;
290 305
291 /* 306 /*
292 * we want to handle the case where there is an 307 * We want to handle the case where there is an
293 * allocated block at the front of the file, and then 308 * allocated block at the front of the file, and then
294 * nothing but holes up to the end of the file properly, 309 * nothing but holes up to the end of the file properly,
295 * to make sure that extent at the front gets properly 310 * to make sure that extent at the front gets properly
296 * marked with FIEMAP_EXTENT_LAST 311 * marked with FIEMAP_EXTENT_LAST
297 */ 312 */
298 if (!past_eof && 313 if (!past_eof &&
299 blk_to_logical(inode, start_blk) >= 314 blk_to_logical(inode, start_blk) >= isize)
300 blk_to_logical(inode, 0)+i_size_read(inode))
301 past_eof = 1; 315 past_eof = 1;
302 316
303 /* 317 /*
304 * first hole after going past the EOF, this is our 318 * First hole after going past the EOF, this is our
305 * last extent 319 * last extent
306 */ 320 */
307 if (past_eof && size) { 321 if (past_eof && size) {
@@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode,
309 ret = fiemap_fill_next_extent(fieinfo, logical, 323 ret = fiemap_fill_next_extent(fieinfo, logical,
310 phys, size, 324 phys, size,
311 flags); 325 flags);
312 break; 326 } else if (size) {
327 ret = fiemap_fill_next_extent(fieinfo, logical,
328 phys, size, flags);
329 size = 0;
313 } 330 }
314 331
315 /* if we have holes up to/past EOF then we're done */ 332 /* if we have holes up to/past EOF then we're done */
316 if (length <= 0 || past_eof) 333 if (start_blk > last_blk || past_eof || ret)
317 break; 334 break;
318 } else { 335 } else {
319 /* 336 /*
320 * we have gone over the length of what we wanted to 337 * We have gone over the length of what we wanted to
321 * map, and it wasn't the entire file, so add the extent 338 * map, and it wasn't the entire file, so add the extent
322 * we got last time and exit. 339 * we got last time and exit.
323 * 340 *
@@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode,
331 * are good to go, just add the extent to the fieinfo 348 * are good to go, just add the extent to the fieinfo
332 * and break 349 * and break
333 */ 350 */
334 if (length <= 0 && !whole_file) { 351 if (start_blk > last_blk && !whole_file) {
335 ret = fiemap_fill_next_extent(fieinfo, logical, 352 ret = fiemap_fill_next_extent(fieinfo, logical,
336 phys, size, 353 phys, size,
337 flags); 354 flags);
@@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode,
351 } 368 }
352 369
353 logical = blk_to_logical(inode, start_blk); 370 logical = blk_to_logical(inode, start_blk);
354 phys = blk_to_logical(inode, tmp.b_blocknr); 371 phys = blk_to_logical(inode, map_bh.b_blocknr);
355 size = tmp.b_size; 372 size = map_bh.b_size;
356 flags = FIEMAP_EXTENT_MERGED; 373 flags = FIEMAP_EXTENT_MERGED;
357 374
358 length -= tmp.b_size;
359 start_blk += logical_to_blk(inode, size); 375 start_blk += logical_to_blk(inode, size);
360 376
361 /* 377 /*
@@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode,
363 * soon as we find a hole that the last extent we found 379 * soon as we find a hole that the last extent we found
364 * is marked with FIEMAP_EXTENT_LAST 380 * is marked with FIEMAP_EXTENT_LAST
365 */ 381 */
366 if (!past_eof && 382 if (!past_eof && logical + size >= isize)
367 logical+size >= 383 past_eof = true;
368 blk_to_logical(inode, 0)+i_size_read(inode))
369 past_eof = 1;
370 } 384 }
371 cond_resched(); 385 cond_resched();
372 } while (1); 386 } while (1);
373 387
374 /* if ret is 1 then we just hit the end of the extent array */ 388 /* If ret is 1 then we just hit the end of the extent array */
375 if (ret == 1) 389 if (ret == 1)
376 ret = 0; 390 ret = 0;
377 391
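The fiemap rework replaces the old conversion macros (the removed logical_to_blk() macro even ended in a stray semicolon) with typed inline helpers, and drops the signed byte countdown in favour of comparing start_blk against a last_blk derived from the length clamped to i_size. A standalone sketch of the block/byte conversions, assuming a 4 KiB block size (i_blkbits = 12):

#include <stdint.h>
#include <stdio.h>

/* i_blkbits for a 4 KiB block size. */
#define BLKBITS 12

static inline uint64_t logical_to_blk(uint64_t offset) { return offset >> BLKBITS; }
static inline uint64_t blk_to_logical(uint64_t blk)    { return blk << BLKBITS;    }

int main(void)
{
	uint64_t start = 5000, len = 20000;	/* byte range to map */
	uint64_t start_blk = logical_to_blk(start);
	uint64_t last_blk  = logical_to_blk(start + len - 1);

	/* Loop condition used by the rework: stop once start_blk > last_blk. */
	printf("blocks %llu..%llu, block %llu starts at byte %llu\n",
	       (unsigned long long)start_blk, (unsigned long long)last_blk,
	       (unsigned long long)start_blk,
	       (unsigned long long)blk_to_logical(start_blk));
	return 0;
}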
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9dd126276c9f..ed9ba6fe04f5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -61,7 +61,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
61 inode->i_op = &page_symlink_inode_operations; 61 inode->i_op = &page_symlink_inode_operations;
62 inode->i_mapping->a_ops = &jfs_aops; 62 inode->i_mapping->a_ops = &jfs_aops;
63 } else { 63 } else {
64 inode->i_op = &jfs_symlink_inode_operations; 64 inode->i_op = &jfs_fast_symlink_inode_operations;
65 /* 65 /*
66 * The inline data should be null-terminated, but 66 * The inline data should be null-terminated, but
67 * don't let on-disk corruption crash the kernel 67 * don't let on-disk corruption crash the kernel
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 6c4dfcbf3f55..9e2f6a721668 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -196,7 +196,7 @@ int dbMount(struct inode *ipbmap)
196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); 196 bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); 197 bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
198 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); 198 bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
199 bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); 199 bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
200 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); 200 bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
201 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); 201 bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
202 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); 202 bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
@@ -288,7 +288,7 @@ int dbSync(struct inode *ipbmap)
288 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); 288 dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
289 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); 289 dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
290 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); 290 dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
291 dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); 291 dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight);
292 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); 292 dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
293 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); 293 dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
294 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); 294 dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
@@ -1441,7 +1441,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1441 * tree index of this allocation group within the control page. 1441 * tree index of this allocation group within the control page.
1442 */ 1442 */
1443 agperlev = 1443 agperlev =
1444 (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; 1444 (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth;
1445 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); 1445 ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1446 1446
1447 /* dmap control page trees fan-out by 4 and a single allocation 1447 /* dmap control page trees fan-out by 4 and a single allocation
@@ -1460,7 +1460,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1460 * the subtree to find the leftmost leaf that describes this 1460 * the subtree to find the leftmost leaf that describes this
1461 * free space. 1461 * free space.
1462 */ 1462 */
1463 for (k = bmp->db_agheigth; k > 0; k--) { 1463 for (k = bmp->db_agheight; k > 0; k--) {
1464 for (n = 0, m = (ti << 2) + 1; n < 4; n++) { 1464 for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1465 if (l2nb <= dcp->stree[m + n]) { 1465 if (l2nb <= dcp->stree[m + n]) {
1466 ti = m + n; 1466 ti = m + n;
@@ -3607,7 +3607,7 @@ void dbFinalizeBmap(struct inode *ipbmap)
3607 } 3607 }
3608 3608
3609 /* 3609 /*
3610 * compute db_aglevel, db_agheigth, db_width, db_agstart: 3610 * compute db_aglevel, db_agheight, db_width, db_agstart:
3611 * an ag is covered in aglevel dmapctl summary tree, 3611 * an ag is covered in aglevel dmapctl summary tree,
3612 * at agheight level height (from leaf) with agwidth number of nodes 3612 * at agheight level height (from leaf) with agwidth number of nodes
3613 * each, which starts at agstart index node of the smmary tree node 3613 * each, which starts at agstart index node of the smmary tree node
@@ -3616,9 +3616,9 @@ void dbFinalizeBmap(struct inode *ipbmap)
3616 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); 3616 bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3617 l2nl = 3617 l2nl =
3618 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); 3618 bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3619 bmp->db_agheigth = l2nl >> 1; 3619 bmp->db_agheight = l2nl >> 1;
3620 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); 3620 bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1));
3621 for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; 3621 for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0;
3622 i--) { 3622 i--) {
3623 bmp->db_agstart += n; 3623 bmp->db_agstart += n;
3624 n <<= 2; 3624 n <<= 2;
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 1a6eb41569bc..6dcb906c55d8 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -210,7 +210,7 @@ struct dbmap_disk {
210 __le32 dn_maxag; /* 4: max active alloc group number */ 210 __le32 dn_maxag; /* 4: max active alloc group number */
211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */ 211 __le32 dn_agpref; /* 4: preferred alloc group (hint) */
212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ 212 __le32 dn_aglevel; /* 4: dmapctl level holding the AG */
213 __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ 213 __le32 dn_agheight; /* 4: height in dmapctl of the AG */
214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ 214 __le32 dn_agwidth; /* 4: width in dmapctl of the AG */
215 __le32 dn_agstart; /* 4: start tree index at AG height */ 215 __le32 dn_agstart; /* 4: start tree index at AG height */
216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ 216 __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */
@@ -229,7 +229,7 @@ struct dbmap {
229 int dn_maxag; /* max active alloc group number */ 229 int dn_maxag; /* max active alloc group number */
230 int dn_agpref; /* preferred alloc group (hint) */ 230 int dn_agpref; /* preferred alloc group (hint) */
231 int dn_aglevel; /* dmapctl level holding the AG */ 231 int dn_aglevel; /* dmapctl level holding the AG */
232 int dn_agheigth; /* height in dmapctl of the AG */ 232 int dn_agheight; /* height in dmapctl of the AG */
233 int dn_agwidth; /* width in dmapctl of the AG */ 233 int dn_agwidth; /* width in dmapctl of the AG */
234 int dn_agstart; /* start tree index at AG height */ 234 int dn_agstart; /* start tree index at AG height */
235 int dn_agl2size; /* l2 num of blks per alloc group */ 235 int dn_agl2size; /* l2 num of blks per alloc group */
@@ -255,7 +255,7 @@ struct bmap {
255#define db_agsize db_bmap.dn_agsize 255#define db_agsize db_bmap.dn_agsize
256#define db_agl2size db_bmap.dn_agl2size 256#define db_agl2size db_bmap.dn_agl2size
257#define db_agwidth db_bmap.dn_agwidth 257#define db_agwidth db_bmap.dn_agwidth
258#define db_agheigth db_bmap.dn_agheigth 258#define db_agheight db_bmap.dn_agheight
259#define db_agstart db_bmap.dn_agstart 259#define db_agstart db_bmap.dn_agstart
260#define db_numag db_bmap.dn_numag 260#define db_numag db_bmap.dn_numag
261#define db_maxlevel db_bmap.dn_maxlevel 261#define db_maxlevel db_bmap.dn_maxlevel
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 79e2c79661df..9e6bda30a6e8 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -48,5 +48,6 @@ extern const struct file_operations jfs_dir_operations;
48extern const struct inode_operations jfs_file_inode_operations; 48extern const struct inode_operations jfs_file_inode_operations;
49extern const struct file_operations jfs_file_operations; 49extern const struct file_operations jfs_file_operations;
50extern const struct inode_operations jfs_symlink_inode_operations; 50extern const struct inode_operations jfs_symlink_inode_operations;
51extern const struct inode_operations jfs_fast_symlink_inode_operations;
51extern const struct dentry_operations jfs_ci_dentry_operations; 52extern const struct dentry_operations jfs_ci_dentry_operations;
52#endif /* _H_JFS_INODE */ 53#endif /* _H_JFS_INODE */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 4a3e9f39c21d..a9cf8e8675be 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -956,7 +956,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
956 */ 956 */
957 957
958 if (ssize <= IDATASIZE) { 958 if (ssize <= IDATASIZE) {
959 ip->i_op = &jfs_symlink_inode_operations; 959 ip->i_op = &jfs_fast_symlink_inode_operations;
960 960
961 i_fastsymlink = JFS_IP(ip)->i_inline; 961 i_fastsymlink = JFS_IP(ip)->i_inline;
962 memcpy(i_fastsymlink, name, ssize); 962 memcpy(i_fastsymlink, name, ssize);
@@ -978,7 +978,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
978 else { 978 else {
979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); 979 jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
980 980
981 ip->i_op = &page_symlink_inode_operations; 981 ip->i_op = &jfs_symlink_inode_operations;
982 ip->i_mapping->a_ops = &jfs_aops; 982 ip->i_mapping->a_ops = &jfs_aops;
983 983
984 /* 984 /*
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 7f24a0bb08ca..1aba0039f1c9 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
81 struct inode *iplist[1]; 81 struct inode *iplist[1];
82 struct jfs_superblock *j_sb, *j_sb2; 82 struct jfs_superblock *j_sb, *j_sb2;
83 uint old_agsize; 83 uint old_agsize;
84 int agsizechanged = 0;
84 struct buffer_head *bh, *bh2; 85 struct buffer_head *bh, *bh2;
85 86
86 /* If the volume hasn't grown, get out now */ 87 /* If the volume hasn't grown, get out now */
@@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
333 */ 334 */
334 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) 335 if ((rc = dbExtendFS(ipbmap, XAddress, nblocks)))
335 goto error_out; 336 goto error_out;
337
338 agsizechanged |= (bmp->db_agsize != old_agsize);
339
336 /* 340 /*
337 * the map now has extended to cover additional nblocks: 341 * the map now has extended to cover additional nblocks:
338 * dn_mapsize = oldMapsize + nblocks; 342 * dn_mapsize = oldMapsize + nblocks;
@@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
432 * will correctly identify the new ag); 436 * will correctly identify the new ag);
433 */ 437 */
434 /* if new AG size the same as old AG size, done! */ 438 /* if new AG size the same as old AG size, done! */
435 if (bmp->db_agsize != old_agsize) { 439 if (agsizechanged) {
436 if ((rc = diExtendFS(ipimap, ipbmap))) 440 if ((rc = diExtendFS(ipimap, ipbmap)))
437 goto error_out; 441 goto error_out;
438 442
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 157382fa6256..b66832ac33ac 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -446,10 +446,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
446 /* initialize the mount flag and determine the default error handler */ 446 /* initialize the mount flag and determine the default error handler */
447 flag = JFS_ERR_REMOUNT_RO; 447 flag = JFS_ERR_REMOUNT_RO;
448 448
449 if (!parse_options((char *) data, sb, &newLVSize, &flag)) { 449 if (!parse_options((char *) data, sb, &newLVSize, &flag))
450 kfree(sbi); 450 goto out_kfree;
451 return -EINVAL;
452 }
453 sbi->flag = flag; 451 sbi->flag = flag;
454 452
455#ifdef CONFIG_JFS_POSIX_ACL 453#ifdef CONFIG_JFS_POSIX_ACL
@@ -458,7 +456,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
458 456
459 if (newLVSize) { 457 if (newLVSize) {
460 printk(KERN_ERR "resize option for remount only\n"); 458 printk(KERN_ERR "resize option for remount only\n");
461 return -EINVAL; 459 goto out_kfree;
462 } 460 }
463 461
464 /* 462 /*
@@ -478,7 +476,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
478 inode = new_inode(sb); 476 inode = new_inode(sb);
479 if (inode == NULL) { 477 if (inode == NULL) {
480 ret = -ENOMEM; 478 ret = -ENOMEM;
481 goto out_kfree; 479 goto out_unload;
482 } 480 }
483 inode->i_ino = 0; 481 inode->i_ino = 0;
484 inode->i_nlink = 1; 482 inode->i_nlink = 1;
@@ -550,9 +548,10 @@ out_mount_failed:
550 make_bad_inode(sbi->direct_inode); 548 make_bad_inode(sbi->direct_inode);
551 iput(sbi->direct_inode); 549 iput(sbi->direct_inode);
552 sbi->direct_inode = NULL; 550 sbi->direct_inode = NULL;
553out_kfree: 551out_unload:
554 if (sbi->nls_tab) 552 if (sbi->nls_tab)
555 unload_nls(sbi->nls_tab); 553 unload_nls(sbi->nls_tab);
554out_kfree:
556 kfree(sbi); 555 kfree(sbi);
557 return ret; 556 return ret;
558} 557}
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 4af1a05aad0a..205b946d8e0d 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd)
29 return NULL; 29 return NULL;
30} 30}
31 31
32const struct inode_operations jfs_symlink_inode_operations = { 32const struct inode_operations jfs_fast_symlink_inode_operations = {
33 .readlink = generic_readlink, 33 .readlink = generic_readlink,
34 .follow_link = jfs_follow_link, 34 .follow_link = jfs_follow_link,
35 .setattr = jfs_setattr,
36 .setxattr = jfs_setxattr,
37 .getxattr = jfs_getxattr,
38 .listxattr = jfs_listxattr,
39 .removexattr = jfs_removexattr,
40};
41
42const struct inode_operations jfs_symlink_inode_operations = {
43 .readlink = generic_readlink,
44 .follow_link = page_follow_link_light,
45 .put_link = page_put_link,
46 .setattr = jfs_setattr,
35 .setxattr = jfs_setxattr, 47 .setxattr = jfs_setxattr,
36 .getxattr = jfs_getxattr, 48 .getxattr = jfs_getxattr,
37 .listxattr = jfs_listxattr, 49 .listxattr = jfs_listxattr,
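The jfs symlink split gives in-inode ("fast") symlinks and page-based symlinks their own operation tables, so long symlinks no longer fall back to the generic page_symlink_inode_operations and keep jfs's setattr and xattr handlers. A minimal sketch of dispatching between two operation tables on a size threshold; the names and threshold value are invented.

#include <stdio.h>
#include <string.h>

struct symlink_ops_sketch {
	const char *name;	/* the kernel tables hold follow_link/xattr handlers */
};

static const struct symlink_ops_sketch fast_ops = { "fast (inline) symlink ops" };
static const struct symlink_ops_sketch page_ops = { "page-based symlink ops" };

#define INLINE_CAPACITY 128	/* stand-in for IDATASIZE */

static const struct symlink_ops_sketch *pick_ops(size_t target_len)
{
	return target_len <= INLINE_CAPACITY ? &fast_ops : &page_ops;
}

int main(void)
{
	printf("%s\n", pick_ops(strlen("short-target"))->name);
	printf("%s\n", pick_ops(300)->name);
	return 0;
}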
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
index 84e36f52fe95..76c242fbe1b0 100644
--- a/fs/logfs/gc.c
+++ b/fs/logfs/gc.c
@@ -459,6 +459,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target)
459 struct logfs_block *block; 459 struct logfs_block *block;
460 int round, progress, last_progress = 0; 460 int round, progress, last_progress = 0;
461 461
462 /*
463 * Doing too many changes to the segfile at once would result
464 * in a large number of aliases. Write the journal before
465 * things get out of hand.
466 */
467 if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
468 logfs_write_anchor(sb);
469
462 if (no_free_segments(sb) >= target && 470 if (no_free_segments(sb) >= target &&
463 super->s_no_object_aliases < MAX_OBJ_ALIASES) 471 super->s_no_object_aliases < MAX_OBJ_ALIASES)
464 return; 472 return;
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
index 33bd260b8309..fb0a613f885b 100644
--- a/fs/logfs/journal.c
+++ b/fs/logfs/journal.c
@@ -389,7 +389,10 @@ static void journal_get_erase_count(struct logfs_area *area)
389static int journal_erase_segment(struct logfs_area *area) 389static int journal_erase_segment(struct logfs_area *area)
390{ 390{
391 struct super_block *sb = area->a_sb; 391 struct super_block *sb = area->a_sb;
392 struct logfs_segment_header sh; 392 union {
393 struct logfs_segment_header sh;
394 unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
395 } u;
393 u64 ofs; 396 u64 ofs;
394 int err; 397 int err;
395 398
@@ -397,20 +400,21 @@ static int journal_erase_segment(struct logfs_area *area)
397 if (err) 400 if (err)
398 return err; 401 return err;
399 402
400 sh.pad = 0; 403 memset(&u, 0, sizeof(u));
401 sh.type = SEG_JOURNAL; 404 u.sh.pad = 0;
402 sh.level = 0; 405 u.sh.type = SEG_JOURNAL;
403 sh.segno = cpu_to_be32(area->a_segno); 406 u.sh.level = 0;
404 sh.ec = cpu_to_be32(area->a_erase_count); 407 u.sh.segno = cpu_to_be32(area->a_segno);
405 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); 408 u.sh.ec = cpu_to_be32(area->a_erase_count);
406 sh.crc = logfs_crc32(&sh, sizeof(sh), 4); 409 u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
410 u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
407 411
408 /* This causes a bug in segment.c. Not yet. */ 412 /* This causes a bug in segment.c. Not yet. */
409 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); 413 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
410 414
411 ofs = dev_ofs(sb, area->a_segno, 0); 415 ofs = dev_ofs(sb, area->a_segno, 0);
412 area->a_used_bytes = ALIGN(sizeof(sh), 16); 416 area->a_used_bytes = sizeof(u);
413 logfs_buf_write(area, ofs, &sh, sizeof(sh)); 417 logfs_buf_write(area, ofs, &u, sizeof(u));
414 return 0; 418 return 0;
415} 419}
416 420
@@ -494,6 +498,8 @@ static void account_shadows(struct super_block *sb)
494 498
495 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); 499 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
496 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); 500 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
501 btree_grim_visitor32(&tree->segment_map, 0, NULL);
502 tree->no_shadowed_segments = 0;
497 503
498 if (li->li_block) { 504 if (li->li_block) {
499 /* 505 /*
@@ -607,9 +613,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
607 if (len == 0) 613 if (len == 0)
608 return logfs_write_header(super, header, 0, type); 614 return logfs_write_header(super, header, 0, type);
609 615
616 BUG_ON(len > sb->s_blocksize);
610 compr_len = logfs_compress(buf, data, len, sb->s_blocksize); 617 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
611 if (compr_len < 0 || type == JE_ANCHOR) { 618 if (compr_len < 0 || type == JE_ANCHOR) {
612 BUG_ON(len > sb->s_blocksize);
613 memcpy(data, buf, len); 619 memcpy(data, buf, len);
614 compr_len = len; 620 compr_len = len;
615 compr = COMPR_NONE; 621 compr = COMPR_NONE;
@@ -661,6 +667,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
661 if (ofs < 0) 667 if (ofs < 0)
662 return ofs; 668 return ofs;
663 logfs_buf_write(area, ofs, super->s_compressed_je, len); 669 logfs_buf_write(area, ofs, super->s_compressed_je, len);
670 BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
664 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); 671 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
665 return 0; 672 return 0;
666} 673}
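In journal_erase_segment() the segment header is now wrapped in a union with a byte array padded to a 16-byte-aligned size, so the length passed to logfs_buf_write() matches the a_used_bytes accounting and the padding bytes are zeroed by the memset. A standalone sketch of the same trick, using an invented header layout whose exact size is ABI-dependent:

#include <stdio.h>
#include <string.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct seg_header_sketch {	/* invented layout, illustration only */
	unsigned char pad, type, level;
	unsigned int segno, ec, gec;
	unsigned short crc;
};

union padded_header {
	struct seg_header_sketch sh;
	unsigned char c[ALIGN_UP(sizeof(struct seg_header_sketch), 16)];
};

int main(void)
{
	union padded_header u;

	memset(&u, 0, sizeof(u));	/* zeroes the padding as well */
	u.sh.type = 1;
	printf("header %zu bytes, written/accounted %zu bytes\n",
	       sizeof(u.sh), sizeof(u));
	return 0;
}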
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index b84b0eec6024..0a3df1a0c936 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -257,10 +257,14 @@ struct logfs_shadow {
257 * struct shadow_tree 257 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs 258 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs 259 * @old: shadows where old_ofs!=0, indexed by old_ofs
260 * @segment_map: bitfield of segments containing shadows
261 * @no_shadowed_segment: number of segments containing shadows
260 */ 262 */
261struct shadow_tree { 263struct shadow_tree {
262 struct btree_head64 new; 264 struct btree_head64 new;
263 struct btree_head64 old; 265 struct btree_head64 old;
266 struct btree_head32 segment_map;
267 int no_shadowed_segments;
264}; 268};
265 269
266struct object_alias_item { 270struct object_alias_item {
@@ -305,13 +309,14 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
305 level_t level, int child_no, __be64 val); 309 level_t level, int child_no, __be64 val);
306struct logfs_block_ops { 310struct logfs_block_ops {
307 void (*write_block)(struct logfs_block *block); 311 void (*write_block)(struct logfs_block *block);
308 gc_level_t (*block_level)(struct logfs_block *block);
309 void (*free_block)(struct super_block *sb, struct logfs_block*block); 312 void (*free_block)(struct super_block *sb, struct logfs_block*block);
310 int (*write_alias)(struct super_block *sb, 313 int (*write_alias)(struct super_block *sb,
311 struct logfs_block *block, 314 struct logfs_block *block,
312 write_alias_t *write_one_alias); 315 write_alias_t *write_one_alias);
313}; 316};
314 317
318#define MAX_JOURNAL_ENTRIES 256
319
315struct logfs_super { 320struct logfs_super {
316 struct mtd_info *s_mtd; /* underlying device */ 321 struct mtd_info *s_mtd; /* underlying device */
317 struct block_device *s_bdev; /* underlying device */ 322 struct block_device *s_bdev; /* underlying device */
@@ -378,7 +383,7 @@ struct logfs_super {
378 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ 383 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
379 u64 s_last_version; 384 u64 s_last_version;
380 struct logfs_area *s_journal_area; /* open journal segment */ 385 struct logfs_area *s_journal_area; /* open journal segment */
381 __be64 s_je_array[64]; 386 __be64 s_je_array[MAX_JOURNAL_ENTRIES];
382 int s_no_je; 387 int s_no_je;
383 388
384 int s_sum_index; /* for the 12 summaries */ 389 int s_sum_index; /* for the 12 summaries */
@@ -722,4 +727,10 @@ static inline struct logfs_area *get_area(struct super_block *sb,
722 return logfs_super(sb)->s_area[(__force u8)gc_level]; 727 return logfs_super(sb)->s_area[(__force u8)gc_level];
723} 728}
724 729
730static inline void logfs_mempool_destroy(mempool_t *pool)
731{
732 if (pool)
733 mempool_destroy(pool);
734}
735
725#endif 736#endif
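logfs_mempool_destroy() centralizes the "destroy only if the pool was ever created" check that the callers previously repeated by hand, assuming (as those existing checks suggest) that mempool_destroy() must not be passed a NULL pool. A minimal sketch of the null-guarded wrapper idiom with invented names:

#include <stdio.h>
#include <stdlib.h>

struct pool_sketch { int dummy; };

/* Stand-in for mempool_destroy(): assumed to require a non-NULL pool. */
static void pool_destroy(struct pool_sketch *p)
{
	printf("destroying pool %p\n", (void *)p);
	free(p);
}

/* The wrapper the patch adds: callers no longer repeat the NULL check. */
static void safe_pool_destroy(struct pool_sketch *p)
{
	if (p)
		pool_destroy(p);
}

int main(void)
{
	struct pool_sketch *a = malloc(sizeof(*a));
	struct pool_sketch *b = NULL;	/* e.g. the allocation never happened */

	safe_pool_destroy(a);
	safe_pool_destroy(b);	/* silently does nothing */
	return 0;
}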
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index bff40253dfb2..3159db6958e5 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -430,25 +430,6 @@ static void inode_write_block(struct logfs_block *block)
430 } 430 }
431} 431}
432 432
433static gc_level_t inode_block_level(struct logfs_block *block)
434{
435 BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
436 return GC_LEVEL(LOGFS_MAX_LEVELS);
437}
438
439static gc_level_t indirect_block_level(struct logfs_block *block)
440{
441 struct page *page;
442 struct inode *inode;
443 u64 bix;
444 level_t level;
445
446 page = block->page;
447 inode = page->mapping->host;
448 logfs_unpack_index(page->index, &bix, &level);
449 return expand_level(inode->i_ino, level);
450}
451
452/* 433/*
453 * This silences a false, yet annoying gcc warning. I hate it when my editor 434 * This silences a false, yet annoying gcc warning. I hate it when my editor
454 * jumps into bitops.h each time I recompile this file. 435 * jumps into bitops.h each time I recompile this file.
@@ -587,14 +568,12 @@ static void indirect_free_block(struct super_block *sb,
587 568
588static struct logfs_block_ops inode_block_ops = { 569static struct logfs_block_ops inode_block_ops = {
589 .write_block = inode_write_block, 570 .write_block = inode_write_block,
590 .block_level = inode_block_level,
591 .free_block = inode_free_block, 571 .free_block = inode_free_block,
592 .write_alias = inode_write_alias, 572 .write_alias = inode_write_alias,
593}; 573};
594 574
595struct logfs_block_ops indirect_block_ops = { 575struct logfs_block_ops indirect_block_ops = {
596 .write_block = indirect_write_block, 576 .write_block = indirect_write_block,
597 .block_level = indirect_block_level,
598 .free_block = indirect_free_block, 577 .free_block = indirect_free_block,
599 .write_alias = indirect_write_alias, 578 .write_alias = indirect_write_alias,
600}; 579};
@@ -1241,6 +1220,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1241 mempool_free(shadow, super->s_shadow_pool); 1220 mempool_free(shadow, super->s_shadow_pool);
1242} 1221}
1243 1222
1223static void mark_segment(struct shadow_tree *tree, u32 segno)
1224{
1225 int err;
1226
1227 if (!btree_lookup32(&tree->segment_map, segno)) {
1228 err = btree_insert32(&tree->segment_map, segno, (void *)1,
1229 GFP_NOFS);
1230 BUG_ON(err);
1231 tree->no_shadowed_segments++;
1232 }
1233}
1234
1244/** 1235/**
1245 * fill_shadow_tree - Propagate shadow tree changes due to a write 1236 * fill_shadow_tree - Propagate shadow tree changes due to a write
1246 * @inode: Inode owning the page 1237 * @inode: Inode owning the page
@@ -1288,6 +1279,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page,
1288 1279
1289 super->s_dirty_used_bytes += shadow->new_len; 1280 super->s_dirty_used_bytes += shadow->new_len;
1290 super->s_dirty_free_bytes += shadow->old_len; 1281 super->s_dirty_free_bytes += shadow->old_len;
1282 mark_segment(tree, shadow->old_ofs >> super->s_segshift);
1283 mark_segment(tree, shadow->new_ofs >> super->s_segshift);
1291 } 1284 }
1292} 1285}
1293 1286
@@ -1845,19 +1838,37 @@ static int __logfs_truncate(struct inode *inode, u64 size)
1845 return logfs_truncate_direct(inode, size); 1838 return logfs_truncate_direct(inode, size);
1846} 1839}
1847 1840
1848int logfs_truncate(struct inode *inode, u64 size) 1841/*
1842 * Truncate, by changing the segment file, can consume a fair amount
1843 * of resources. So back off from time to time and do some GC.
1844 * 8 or 2048 blocks should be well within safety limits even if
1845 * every single block resided in a different segment.
1846 */
1847#define TRUNCATE_STEP (8 * 1024 * 1024)
1848int logfs_truncate(struct inode *inode, u64 target)
1849{ 1849{
1850 struct super_block *sb = inode->i_sb; 1850 struct super_block *sb = inode->i_sb;
1851 int err; 1851 u64 size = i_size_read(inode);
1852 int err = 0;
1852 1853
1853 logfs_get_wblocks(sb, NULL, 1); 1854 size = ALIGN(size, TRUNCATE_STEP);
1854 err = __logfs_truncate(inode, size); 1855 while (size > target) {
1855 if (!err) 1856 if (size > TRUNCATE_STEP)
1856 err = __logfs_write_inode(inode, 0); 1857 size -= TRUNCATE_STEP;
1857 logfs_put_wblocks(sb, NULL, 1); 1858 else
1859 size = 0;
1860 if (size < target)
1861 size = target;
1862
1863 logfs_get_wblocks(sb, NULL, 1);
 1864 err = __logfs_truncate(inode, size);
1865 if (!err)
1866 err = __logfs_write_inode(inode, 0);
1867 logfs_put_wblocks(sb, NULL, 1);
1868 }
1858 1869
1859 if (!err) 1870 if (!err)
1860 err = vmtruncate(inode, size); 1871 err = vmtruncate(inode, target);
1861 1872
1862 /* I don't trust error recovery yet. */ 1873 /* I don't trust error recovery yet. */
1863 WARN_ON(err); 1874 WARN_ON(err);
@@ -2251,8 +2262,6 @@ void logfs_cleanup_rw(struct super_block *sb)
2251 struct logfs_super *super = logfs_super(sb); 2262 struct logfs_super *super = logfs_super(sb);
2252 2263
2253 destroy_meta_inode(super->s_segfile_inode); 2264 destroy_meta_inode(super->s_segfile_inode);
2254 if (super->s_block_pool) 2265 logfs_mempool_destroy(super->s_block_pool);
2255 mempool_destroy(super->s_block_pool); 2266 logfs_mempool_destroy(super->s_shadow_pool);
2256 if (super->s_shadow_pool)
2257 mempool_destroy(super->s_shadow_pool);
2258} 2267}
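logfs_truncate() now walks the size down in TRUNCATE_STEP (8 MiB) slices, committing the inode between slices, so a huge truncate cannot accumulate an unbounded number of segment-file changes before the journal is written. A standalone sketch of just the stepping arithmetic, with invented sizes and no I/O:

#include <stdio.h>

#define TRUNCATE_STEP (8ULL * 1024 * 1024)
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	unsigned long long target = 3ULL * 1024 * 1024;		/* 3 MiB */
	unsigned long long size   = 20ULL * 1024 * 1024 + 123;	/* ~20 MiB file */

	size = ALIGN_UP(size, TRUNCATE_STEP);
	while (size > target) {
		if (size > TRUNCATE_STEP)
			size -= TRUNCATE_STEP;
		else
			size = 0;
		if (size < target)
			size = target;
		/* one __logfs_truncate()/__logfs_write_inode() round per step */
		printf("truncate to %llu\n", size);
	}
	return 0;
}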
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 801a3a141625..f77ce2b470ba 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -183,14 +183,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
183 return 0; 183 return 0;
184} 184}
185 185
186static gc_level_t btree_block_level(struct logfs_block *block)
187{
188 return expand_level(block->ino, block->level);
189}
190
191static struct logfs_block_ops btree_block_ops = { 186static struct logfs_block_ops btree_block_ops = {
192 .write_block = btree_write_block, 187 .write_block = btree_write_block,
193 .block_level = btree_block_level,
194 .free_block = __free_block, 188 .free_block = __free_block,
195 .write_alias = btree_write_alias, 189 .write_alias = btree_write_alias,
196}; 190};
@@ -919,7 +913,7 @@ err:
919 for (i--; i >= 0; i--) 913 for (i--; i >= 0; i--)
920 free_area(super->s_area[i]); 914 free_area(super->s_area[i]);
921 free_area(super->s_journal_area); 915 free_area(super->s_journal_area);
922 mempool_destroy(super->s_alias_pool); 916 logfs_mempool_destroy(super->s_alias_pool);
923 return -ENOMEM; 917 return -ENOMEM;
924} 918}
925 919
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
index b60bfac3263c..d7c23ed8349a 100644
--- a/fs/logfs/super.c
+++ b/fs/logfs/super.c
@@ -12,6 +12,7 @@
12#include "logfs.h" 12#include "logfs.h"
13#include <linux/bio.h> 13#include <linux/bio.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/blkdev.h>
15#include <linux/mtd/mtd.h> 16#include <linux/mtd/mtd.h>
16#include <linux/statfs.h> 17#include <linux/statfs.h>
17#include <linux/buffer_head.h> 18#include <linux/buffer_head.h>
@@ -137,6 +138,10 @@ static int logfs_sb_set(struct super_block *sb, void *_super)
137 sb->s_fs_info = super; 138 sb->s_fs_info = super;
138 sb->s_mtd = super->s_mtd; 139 sb->s_mtd = super->s_mtd;
139 sb->s_bdev = super->s_bdev; 140 sb->s_bdev = super->s_bdev;
141 if (sb->s_bdev)
142 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
143 if (sb->s_mtd)
144 sb->s_bdi = sb->s_mtd->backing_dev_info;
140 return 0; 145 return 0;
141} 146}
142 147
@@ -328,27 +333,27 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
328 goto fail; 333 goto fail;
329 334
330 sb->s_root = d_alloc_root(rootdir); 335 sb->s_root = d_alloc_root(rootdir);
331 if (!sb->s_root) 336 if (!sb->s_root) {
332 goto fail2; 337 iput(rootdir);
338 goto fail;
339 }
333 340
334 super->s_erase_page = alloc_pages(GFP_KERNEL, 0); 341 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
335 if (!super->s_erase_page) 342 if (!super->s_erase_page)
336 goto fail2; 343 goto fail;
337 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); 344 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
338 345
339 /* FIXME: check for read-only mounts */ 346 /* FIXME: check for read-only mounts */
340 err = logfs_make_writeable(sb); 347 err = logfs_make_writeable(sb);
341 if (err) 348 if (err)
342 goto fail3; 349 goto fail1;
343 350
344 log_super("LogFS: Finished mounting\n"); 351 log_super("LogFS: Finished mounting\n");
345 simple_set_mnt(mnt, sb); 352 simple_set_mnt(mnt, sb);
346 return 0; 353 return 0;
347 354
348fail3: 355fail1:
349 __free_page(super->s_erase_page); 356 __free_page(super->s_erase_page);
350fail2:
351 iput(rootdir);
352fail: 357fail:
353 iput(logfs_super(sb)->s_master_inode); 358 iput(logfs_super(sb)->s_master_inode);
354 return -EIO; 359 return -EIO;
@@ -452,6 +457,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only)
452 457
453 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); 458 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
454 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); 459 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
460 btree_init_mempool32(&super->s_shadow_tree.segment_map,
461 super->s_btree_pool);
455 462
456 ret = logfs_init_mapping(sb); 463 ret = logfs_init_mapping(sb);
457 if (ret) 464 if (ret)
@@ -516,8 +523,8 @@ static void logfs_kill_sb(struct super_block *sb)
516 if (super->s_erase_page) 523 if (super->s_erase_page)
517 __free_page(super->s_erase_page); 524 __free_page(super->s_erase_page);
518 super->s_devops->put_device(sb); 525 super->s_devops->put_device(sb);
519 mempool_destroy(super->s_btree_pool); 526 logfs_mempool_destroy(super->s_btree_pool);
520 mempool_destroy(super->s_alias_pool); 527 logfs_mempool_destroy(super->s_alias_pool);
521 kfree(super); 528 kfree(super);
522 log_super("LogFS: Finished unmounting\n"); 529 log_super("LogFS: Finished unmounting\n");
523} 530}
diff --git a/fs/namei.c b/fs/namei.c
index a7dce91a7e42..b86b96fe1dc3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1641,7 +1641,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1641 if (nd->last.name[nd->last.len]) { 1641 if (nd->last.name[nd->last.len]) {
1642 if (open_flag & O_CREAT) 1642 if (open_flag & O_CREAT)
1643 goto exit; 1643 goto exit;
1644 nd->flags |= LOOKUP_DIRECTORY; 1644 nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
1645 } 1645 }
1646 1646
1647 /* just plain open? */ 1647 /* just plain open? */
@@ -1830,6 +1830,8 @@ reval:
1830 } 1830 }
1831 if (open_flag & O_DIRECTORY) 1831 if (open_flag & O_DIRECTORY)
1832 nd.flags |= LOOKUP_DIRECTORY; 1832 nd.flags |= LOOKUP_DIRECTORY;
1833 if (!(open_flag & O_NOFOLLOW))
1834 nd.flags |= LOOKUP_FOLLOW;
1833 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 1835 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
1834 while (unlikely(!filp)) { /* trailing symlink */ 1836 while (unlikely(!filp)) { /* trailing symlink */
1835 struct path holder; 1837 struct path holder;
@@ -1837,7 +1839,7 @@ reval:
1837 void *cookie; 1839 void *cookie;
1838 error = -ELOOP; 1840 error = -ELOOP;
1839 /* S_ISDIR part is a temporary automount kludge */ 1841 /* S_ISDIR part is a temporary automount kludge */
1840 if ((open_flag & O_NOFOLLOW) && !S_ISDIR(inode->i_mode)) 1842 if (!(nd.flags & LOOKUP_FOLLOW) && !S_ISDIR(inode->i_mode))
1841 goto exit_dput; 1843 goto exit_dput;
1842 if (count++ == 32) 1844 if (count++ == 32)
1843 goto exit_dput; 1845 goto exit_dput;
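These hunks move the "follow the trailing symlink?" decision into the lookup flags: LOOKUP_FOLLOW is set up front unless O_NOFOLLOW was requested, names with a trailing slash force LOOKUP_DIRECTORY | LOOKUP_FOLLOW, and the trailing-symlink loop now tests the flag rather than re-testing open_flag. One userspace-visible piece of that contract is that open(2) with O_NOFOLLOW fails with ELOOP when the final component is a symlink, while a plain open follows it; a small demo (file names invented for the example):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int fd;

        /* Set up a regular file and a symlink pointing at it. */
        fd = open("target_file", O_CREAT | O_WRONLY, 0644);
        if (fd >= 0)
                close(fd);
        unlink("link_to_target");
        if (symlink("target_file", "link_to_target") < 0) {
                perror("symlink");
                return 1;
        }

        /* Default open follows the trailing symlink. */
        fd = open("link_to_target", O_RDONLY);
        printf("plain open:      %s\n", fd >= 0 ? "ok" : strerror(errno));
        if (fd >= 0)
                close(fd);

        /* O_NOFOLLOW refuses a symlink in the last component: ELOOP. */
        fd = open("link_to_target", O_RDONLY | O_NOFOLLOW);
        printf("O_NOFOLLOW open: %s\n",
               fd >= 0 ? "ok (unexpected)" : strerror(errno));
        if (fd >= 0)
                close(fd);
        return 0;
}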
@@ -2174,8 +2176,10 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2174 error = security_inode_rmdir(dir, dentry); 2176 error = security_inode_rmdir(dir, dentry);
2175 if (!error) { 2177 if (!error) {
2176 error = dir->i_op->rmdir(dir, dentry); 2178 error = dir->i_op->rmdir(dir, dentry);
2177 if (!error) 2179 if (!error) {
2178 dentry->d_inode->i_flags |= S_DEAD; 2180 dentry->d_inode->i_flags |= S_DEAD;
2181 dont_mount(dentry);
2182 }
2179 } 2183 }
2180 } 2184 }
2181 mutex_unlock(&dentry->d_inode->i_mutex); 2185 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2259,7 +2263,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
2259 if (!error) { 2263 if (!error) {
2260 error = dir->i_op->unlink(dir, dentry); 2264 error = dir->i_op->unlink(dir, dentry);
2261 if (!error) 2265 if (!error)
2262 dentry->d_inode->i_flags |= S_DEAD; 2266 dont_mount(dentry);
2263 } 2267 }
2264 } 2268 }
2265 mutex_unlock(&dentry->d_inode->i_mutex); 2269 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -2570,17 +2574,20 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
2570 return error; 2574 return error;
2571 2575
2572 target = new_dentry->d_inode; 2576 target = new_dentry->d_inode;
2573 if (target) { 2577 if (target)
2574 mutex_lock(&target->i_mutex); 2578 mutex_lock(&target->i_mutex);
2575 dentry_unhash(new_dentry);
2576 }
2577 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 2579 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
2578 error = -EBUSY; 2580 error = -EBUSY;
2579 else 2581 else {
2582 if (target)
2583 dentry_unhash(new_dentry);
2580 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2584 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2585 }
2581 if (target) { 2586 if (target) {
2582 if (!error) 2587 if (!error) {
2583 target->i_flags |= S_DEAD; 2588 target->i_flags |= S_DEAD;
2589 dont_mount(new_dentry);
2590 }
2584 mutex_unlock(&target->i_mutex); 2591 mutex_unlock(&target->i_mutex);
2585 if (d_unhashed(new_dentry)) 2592 if (d_unhashed(new_dentry))
2586 d_rehash(new_dentry); 2593 d_rehash(new_dentry);
@@ -2612,7 +2619,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
2612 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); 2619 error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
2613 if (!error) { 2620 if (!error) {
2614 if (target) 2621 if (target)
2615 target->i_flags |= S_DEAD; 2622 dont_mount(new_dentry);
2616 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 2623 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
2617 d_move(old_dentry, new_dentry); 2624 d_move(old_dentry, new_dentry);
2618 } 2625 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 8174c8ab5c70..f20cb57d1067 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1432,7 +1432,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1432 1432
1433 err = -ENOENT; 1433 err = -ENOENT;
1434 mutex_lock(&path->dentry->d_inode->i_mutex); 1434 mutex_lock(&path->dentry->d_inode->i_mutex);
1435 if (IS_DEADDIR(path->dentry->d_inode)) 1435 if (cant_mount(path->dentry))
1436 goto out_unlock; 1436 goto out_unlock;
1437 1437
1438 err = security_sb_check_sb(mnt, path); 1438 err = security_sb_check_sb(mnt, path);
@@ -1623,7 +1623,7 @@ static int do_move_mount(struct path *path, char *old_name)
1623 1623
1624 err = -ENOENT; 1624 err = -ENOENT;
1625 mutex_lock(&path->dentry->d_inode->i_mutex); 1625 mutex_lock(&path->dentry->d_inode->i_mutex);
1626 if (IS_DEADDIR(path->dentry->d_inode)) 1626 if (cant_mount(path->dentry))
1627 goto out1; 1627 goto out1;
1628 1628
1629 if (d_unlinked(path->dentry)) 1629 if (d_unlinked(path->dentry))
@@ -2234,7 +2234,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2234 if (!check_mnt(root.mnt)) 2234 if (!check_mnt(root.mnt))
2235 goto out2; 2235 goto out2;
2236 error = -ENOENT; 2236 error = -ENOENT;
2237 if (IS_DEADDIR(new.dentry->d_inode)) 2237 if (cant_mount(old.dentry))
2238 goto out2; 2238 goto out2;
2239 if (d_unlinked(new.dentry)) 2239 if (d_unlinked(new.dentry))
2240 goto out2; 2240 goto out2;
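The vfs_rmdir/vfs_unlink/rename hunks replace the S_DEAD inode flag with dont_mount() on the victim dentry, and namespace.c now gates mounts with cant_mount() instead of IS_DEADDIR(), moving the "this name is gone, refuse new mounts here" state onto the dentry itself. The helpers are defined outside this diff; the following is only a userspace model of the idea (a flag set under the dentry lock when the name is removed and tested under the same lock before grafting a mount), with an invented flag value, not the kernel's dcache code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct dentry_model {
        pthread_mutex_t d_lock;
        unsigned int d_flags;
};

#define DCACHE_CANT_MOUNT_MODEL 0x1      /* illustrative flag value */

/* Called by unlink/rmdir/rename once the name is gone. */
static void dont_mount_model(struct dentry_model *d)
{
        pthread_mutex_lock(&d->d_lock);
        d->d_flags |= DCACHE_CANT_MOUNT_MODEL;
        pthread_mutex_unlock(&d->d_lock);
}

/* Called by the mount paths before grafting onto this dentry. */
static bool cant_mount_model(struct dentry_model *d)
{
        bool ret;

        pthread_mutex_lock(&d->d_lock);
        ret = d->d_flags & DCACHE_CANT_MOUNT_MODEL;
        pthread_mutex_unlock(&d->d_lock);
        return ret;
}

int main(void)
{
        struct dentry_model d = { PTHREAD_MUTEX_INITIALIZER, 0 };

        printf("before unlink: cant_mount=%d\n", cant_mount_model(&d));
        dont_mount_model(&d);            /* vfs_unlink() succeeded */
        printf("after unlink:  cant_mount=%d\n", cant_mount_model(&d));
        return 0;
}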
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index cf98da1be23e..fa3385154023 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
526 sb->s_blocksize_bits = 10; 526 sb->s_blocksize_bits = 10;
527 sb->s_magic = NCP_SUPER_MAGIC; 527 sb->s_magic = NCP_SUPER_MAGIC;
528 sb->s_op = &ncp_sops; 528 sb->s_op = &ncp_sops;
529 sb->s_bdi = &server->bdi;
529 530
530 server = NCP_SBP(sb); 531 server = NCP_SBP(sb);
531 memset(server, 0, sizeof(*server)); 532 memset(server, 0, sizeof(*server));
532 533
534 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
535 if (error)
536 goto out_bdi;
537
533 server->ncp_filp = ncp_filp; 538 server->ncp_filp = ncp_filp;
534 server->ncp_sock = sock; 539 server->ncp_sock = sock;
535 540
@@ -719,6 +724,8 @@ out_fput2:
719 if (server->info_filp) 724 if (server->info_filp)
720 fput(server->info_filp); 725 fput(server->info_filp);
721out_fput: 726out_fput:
727 bdi_destroy(&server->bdi);
728out_bdi:
722 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>: 729 /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
723 * 730 *
 724 * The previously used put_filp(ncp_filp); was bogus, since 731 * The previously used put_filp(ncp_filp); was bogus, since
@@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb)
756 kill_pid(server->m.wdog_pid, SIGTERM, 1); 763 kill_pid(server->m.wdog_pid, SIGTERM, 1);
757 put_pid(server->m.wdog_pid); 764 put_pid(server->m.wdog_pid);
758 765
766 bdi_destroy(&server->bdi);
759 kfree(server->priv.data); 767 kfree(server->priv.data);
760 kfree(server->auth.object_name); 768 kfree(server->auth.object_name);
761 vfree(server->rxbuf); 769 vfree(server->rxbuf);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2a3d352c0bff..acc9c4943b84 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -966,6 +966,8 @@ out_error:
966static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) 966static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source)
967{ 967{
968 target->flags = source->flags; 968 target->flags = source->flags;
969 target->rsize = source->rsize;
970 target->wsize = source->wsize;
969 target->acregmin = source->acregmin; 971 target->acregmin = source->acregmin;
970 target->acregmax = source->acregmax; 972 target->acregmax = source->acregmax;
971 target->acdirmin = source->acdirmin; 973 target->acdirmin = source->acdirmin;
@@ -1294,7 +1296,8 @@ static int nfs4_init_server(struct nfs_server *server,
1294 1296
1295 /* Initialise the client representation from the mount data */ 1297 /* Initialise the client representation from the mount data */
1296 server->flags = data->flags; 1298 server->flags = data->flags;
1297 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; 1299 server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR|
1300 NFS_CAP_POSIX_LOCK;
1298 server->options = data->options; 1301 server->options = data->options;
1299 1302
1300 /* Get a client record */ 1303 /* Get a client record */
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 15671245c6ee..ea61d26e7871 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -24,6 +24,8 @@
24 24
25static void nfs_do_free_delegation(struct nfs_delegation *delegation) 25static void nfs_do_free_delegation(struct nfs_delegation *delegation)
26{ 26{
27 if (delegation->cred)
28 put_rpccred(delegation->cred);
27 kfree(delegation); 29 kfree(delegation);
28} 30}
29 31
@@ -36,13 +38,7 @@ static void nfs_free_delegation_callback(struct rcu_head *head)
36 38
37static void nfs_free_delegation(struct nfs_delegation *delegation) 39static void nfs_free_delegation(struct nfs_delegation *delegation)
38{ 40{
39 struct rpc_cred *cred;
40
41 cred = rcu_dereference(delegation->cred);
42 rcu_assign_pointer(delegation->cred, NULL);
43 call_rcu(&delegation->rcu, nfs_free_delegation_callback); 41 call_rcu(&delegation->rcu, nfs_free_delegation_callback);
44 if (cred)
45 put_rpccred(cred);
46} 42}
47 43
48void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) 44void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
@@ -129,21 +125,35 @@ again:
129 */ 125 */
130void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res) 126void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res)
131{ 127{
132 struct nfs_delegation *delegation = NFS_I(inode)->delegation; 128 struct nfs_delegation *delegation;
133 struct rpc_cred *oldcred; 129 struct rpc_cred *oldcred = NULL;
134 130
135 if (delegation == NULL) 131 rcu_read_lock();
136 return; 132 delegation = rcu_dereference(NFS_I(inode)->delegation);
137 memcpy(delegation->stateid.data, res->delegation.data, 133 if (delegation != NULL) {
138 sizeof(delegation->stateid.data)); 134 spin_lock(&delegation->lock);
139 delegation->type = res->delegation_type; 135 if (delegation->inode != NULL) {
140 delegation->maxsize = res->maxsize; 136 memcpy(delegation->stateid.data, res->delegation.data,
141 oldcred = delegation->cred; 137 sizeof(delegation->stateid.data));
142 delegation->cred = get_rpccred(cred); 138 delegation->type = res->delegation_type;
143 clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags); 139 delegation->maxsize = res->maxsize;
144 NFS_I(inode)->delegation_state = delegation->type; 140 oldcred = delegation->cred;
145 smp_wmb(); 141 delegation->cred = get_rpccred(cred);
146 put_rpccred(oldcred); 142 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
143 &delegation->flags);
144 NFS_I(inode)->delegation_state = delegation->type;
145 spin_unlock(&delegation->lock);
146 put_rpccred(oldcred);
147 rcu_read_unlock();
148 } else {
149 /* We appear to have raced with a delegation return. */
150 spin_unlock(&delegation->lock);
151 rcu_read_unlock();
152 nfs_inode_set_delegation(inode, cred, res);
153 }
154 } else {
155 rcu_read_unlock();
156 }
147} 157}
148 158
149static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) 159static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -166,9 +176,13 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation
166 return inode; 176 return inode;
167} 177}
168 178
169static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid) 179static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi,
180 const nfs4_stateid *stateid,
181 struct nfs_client *clp)
170{ 182{
171 struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); 183 struct nfs_delegation *delegation =
184 rcu_dereference_protected(nfsi->delegation,
185 lockdep_is_held(&clp->cl_lock));
172 186
173 if (delegation == NULL) 187 if (delegation == NULL)
174 goto nomatch; 188 goto nomatch;
@@ -195,7 +209,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
195{ 209{
196 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; 210 struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
197 struct nfs_inode *nfsi = NFS_I(inode); 211 struct nfs_inode *nfsi = NFS_I(inode);
198 struct nfs_delegation *delegation; 212 struct nfs_delegation *delegation, *old_delegation;
199 struct nfs_delegation *freeme = NULL; 213 struct nfs_delegation *freeme = NULL;
200 int status = 0; 214 int status = 0;
201 215
@@ -213,10 +227,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
213 spin_lock_init(&delegation->lock); 227 spin_lock_init(&delegation->lock);
214 228
215 spin_lock(&clp->cl_lock); 229 spin_lock(&clp->cl_lock);
216 if (rcu_dereference(nfsi->delegation) != NULL) { 230 old_delegation = rcu_dereference_protected(nfsi->delegation,
217 if (memcmp(&delegation->stateid, &nfsi->delegation->stateid, 231 lockdep_is_held(&clp->cl_lock));
218 sizeof(delegation->stateid)) == 0 && 232 if (old_delegation != NULL) {
219 delegation->type == nfsi->delegation->type) { 233 if (memcmp(&delegation->stateid, &old_delegation->stateid,
234 sizeof(old_delegation->stateid)) == 0 &&
235 delegation->type == old_delegation->type) {
220 goto out; 236 goto out;
221 } 237 }
222 /* 238 /*
@@ -226,12 +242,12 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
226 dfprintk(FILE, "%s: server %s handed out " 242 dfprintk(FILE, "%s: server %s handed out "
227 "a duplicate delegation!\n", 243 "a duplicate delegation!\n",
228 __func__, clp->cl_hostname); 244 __func__, clp->cl_hostname);
229 if (delegation->type <= nfsi->delegation->type) { 245 if (delegation->type <= old_delegation->type) {
230 freeme = delegation; 246 freeme = delegation;
231 delegation = NULL; 247 delegation = NULL;
232 goto out; 248 goto out;
233 } 249 }
234 freeme = nfs_detach_delegation_locked(nfsi, NULL); 250 freeme = nfs_detach_delegation_locked(nfsi, NULL, clp);
235 } 251 }
236 list_add_rcu(&delegation->super_list, &clp->cl_delegations); 252 list_add_rcu(&delegation->super_list, &clp->cl_delegations);
237 nfsi->delegation_state = delegation->type; 253 nfsi->delegation_state = delegation->type;
@@ -301,7 +317,7 @@ restart:
301 if (inode == NULL) 317 if (inode == NULL)
302 continue; 318 continue;
303 spin_lock(&clp->cl_lock); 319 spin_lock(&clp->cl_lock);
304 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 320 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
305 spin_unlock(&clp->cl_lock); 321 spin_unlock(&clp->cl_lock);
306 rcu_read_unlock(); 322 rcu_read_unlock();
307 if (delegation != NULL) { 323 if (delegation != NULL) {
@@ -330,9 +346,9 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode)
330 struct nfs_inode *nfsi = NFS_I(inode); 346 struct nfs_inode *nfsi = NFS_I(inode);
331 struct nfs_delegation *delegation; 347 struct nfs_delegation *delegation;
332 348
333 if (rcu_dereference(nfsi->delegation) != NULL) { 349 if (rcu_access_pointer(nfsi->delegation) != NULL) {
334 spin_lock(&clp->cl_lock); 350 spin_lock(&clp->cl_lock);
335 delegation = nfs_detach_delegation_locked(nfsi, NULL); 351 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
336 spin_unlock(&clp->cl_lock); 352 spin_unlock(&clp->cl_lock);
337 if (delegation != NULL) 353 if (delegation != NULL)
338 nfs_do_return_delegation(inode, delegation, 0); 354 nfs_do_return_delegation(inode, delegation, 0);
@@ -346,9 +362,9 @@ int nfs_inode_return_delegation(struct inode *inode)
346 struct nfs_delegation *delegation; 362 struct nfs_delegation *delegation;
347 int err = 0; 363 int err = 0;
348 364
349 if (rcu_dereference(nfsi->delegation) != NULL) { 365 if (rcu_access_pointer(nfsi->delegation) != NULL) {
350 spin_lock(&clp->cl_lock); 366 spin_lock(&clp->cl_lock);
351 delegation = nfs_detach_delegation_locked(nfsi, NULL); 367 delegation = nfs_detach_delegation_locked(nfsi, NULL, clp);
352 spin_unlock(&clp->cl_lock); 368 spin_unlock(&clp->cl_lock);
353 if (delegation != NULL) { 369 if (delegation != NULL) {
354 nfs_msync_inode(inode); 370 nfs_msync_inode(inode);
@@ -526,7 +542,7 @@ restart:
526 if (inode == NULL) 542 if (inode == NULL)
527 continue; 543 continue;
528 spin_lock(&clp->cl_lock); 544 spin_lock(&clp->cl_lock);
529 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL); 545 delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL, clp);
530 spin_unlock(&clp->cl_lock); 546 spin_unlock(&clp->cl_lock);
531 rcu_read_unlock(); 547 rcu_read_unlock();
532 if (delegation != NULL) 548 if (delegation != NULL)
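The delegation changes switch the updater paths to rcu_dereference_protected(..., lockdep_is_held(&clp->cl_lock)), use rcu_access_pointer() for plain NULL checks, and have nfs_inode_reclaim_delegation() revalidate delegation->inode under delegation->lock before touching the fields. A compact userspace model of the updater-side discipline, with C11 atomics standing in for the RCU primitives and an assert standing in for lockdep (all names and the model itself are mine, not NFS code):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct delegation_model { int type; };

static pthread_mutex_t cl_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(struct delegation_model *) delegation;
static int cl_lock_held;                 /* stand-in for lockdep */

/* Updater-side read: legal only while holding cl_lock. */
static struct delegation_model *deref_protected(void)
{
        assert(cl_lock_held);            /* models lockdep_is_held() */
        return atomic_load_explicit(&delegation, memory_order_relaxed);
}

/* Detach under the lock; the caller frees once readers are done. */
static struct delegation_model *detach_delegation(void)
{
        struct delegation_model *d;

        pthread_mutex_lock(&cl_lock);
        cl_lock_held = 1;
        d = deref_protected();
        if (d)
                atomic_store_explicit(&delegation, NULL,
                                      memory_order_release);
        cl_lock_held = 0;
        pthread_mutex_unlock(&cl_lock);
        return d;                        /* freed after a grace period */
}

int main(void)
{
        struct delegation_model *d = malloc(sizeof(*d));

        d->type = 1;
        atomic_store(&delegation, d);
        printf("first detach:  %p\n", (void *)detach_delegation());
        printf("second detach: %p\n", (void *)detach_delegation());
        free(d);
        return 0;
}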
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index c6f2750648f4..a7bb5c694aa3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -837,6 +837,8 @@ out_zap_parent:
837 /* If we have submounts, don't unhash ! */ 837 /* If we have submounts, don't unhash ! */
838 if (have_submounts(dentry)) 838 if (have_submounts(dentry))
839 goto out_valid; 839 goto out_valid;
840 if (dentry->d_flags & DCACHE_DISCONNECTED)
841 goto out_valid;
840 shrink_dcache_parent(dentry); 842 shrink_dcache_parent(dentry);
841 } 843 }
842 d_drop(dentry); 844 d_drop(dentry);
@@ -1025,12 +1027,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
1025 res = NULL; 1027 res = NULL;
1026 goto out; 1028 goto out;
1027 /* This turned out not to be a regular file */ 1029 /* This turned out not to be a regular file */
1030 case -EISDIR:
1028 case -ENOTDIR: 1031 case -ENOTDIR:
1029 goto no_open; 1032 goto no_open;
1030 case -ELOOP: 1033 case -ELOOP:
1031 if (!(nd->intent.open.flags & O_NOFOLLOW)) 1034 if (!(nd->intent.open.flags & O_NOFOLLOW))
1032 goto no_open; 1035 goto no_open;
1033 /* case -EISDIR: */
1034 /* case -EINVAL: */ 1036 /* case -EINVAL: */
1035 default: 1037 default:
1036 goto out; 1038 goto out;
@@ -1050,7 +1052,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
1050 struct inode *dir; 1052 struct inode *dir;
1051 int openflags, ret = 0; 1053 int openflags, ret = 0;
1052 1054
1053 if (!is_atomic_open(nd)) 1055 if (!is_atomic_open(nd) || d_mountpoint(dentry))
1054 goto no_open; 1056 goto no_open;
1055 parent = dget_parent(dentry); 1057 parent = dget_parent(dentry);
1056 dir = parent->d_inode; 1058 dir = parent->d_inode;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 737128f777f3..50a56edca0b5 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -623,10 +623,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c
623 list_for_each_entry(pos, &nfsi->open_files, list) { 623 list_for_each_entry(pos, &nfsi->open_files, list) {
624 if (cred != NULL && pos->cred != cred) 624 if (cred != NULL && pos->cred != cred)
625 continue; 625 continue;
626 if ((pos->mode & mode) == mode) { 626 if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode)
627 ctx = get_nfs_open_context(pos); 627 continue;
628 break; 628 ctx = get_nfs_open_context(pos);
629 } 629 break;
630 } 630 }
631 spin_unlock(&inode->i_lock); 631 spin_unlock(&inode->i_lock);
632 return ctx; 632 return ctx;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fe0cd9eb1d4d..071fcedd517c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1523,6 +1523,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1523 nfs_post_op_update_inode(dir, o_res->dir_attr); 1523 nfs_post_op_update_inode(dir, o_res->dir_attr);
1524 } else 1524 } else
1525 nfs_refresh_inode(dir, o_res->dir_attr); 1525 nfs_refresh_inode(dir, o_res->dir_attr);
1526 if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
1527 server->caps &= ~NFS_CAP_POSIX_LOCK;
1526 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { 1528 if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
1527 status = _nfs4_proc_open_confirm(data); 1529 status = _nfs4_proc_open_confirm(data);
1528 if (status != 0) 1530 if (status != 0)
@@ -1664,7 +1666,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in
1664 status = PTR_ERR(state); 1666 status = PTR_ERR(state);
1665 if (IS_ERR(state)) 1667 if (IS_ERR(state))
1666 goto err_opendata_put; 1668 goto err_opendata_put;
1667 if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) 1669 if (server->caps & NFS_CAP_POSIX_LOCK)
1668 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); 1670 set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
1669 nfs4_opendata_put(opendata); 1671 nfs4_opendata_put(opendata);
1670 nfs4_put_state_owner(sp); 1672 nfs4_put_state_owner(sp);
@@ -5216,9 +5218,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5216 msg.rpc_resp = &calldata->res; 5218 msg.rpc_resp = &calldata->res;
5217 task_setup_data.callback_data = calldata; 5219 task_setup_data.callback_data = calldata;
5218 task = rpc_run_task(&task_setup_data); 5220 task = rpc_run_task(&task_setup_data);
5219 if (IS_ERR(task)) 5221 if (IS_ERR(task)) {
5220 status = PTR_ERR(task); 5222 status = PTR_ERR(task);
5223 goto out;
5224 }
5221 rpc_put_task(task); 5225 rpc_put_task(task);
5226 return 0;
5222out: 5227out:
5223 dprintk("<-- %s status=%d\n", __func__, status); 5228 dprintk("<-- %s status=%d\n", __func__, status);
5224 return status; 5229 return status;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e01637240eeb..b4148fc00f9f 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2187,6 +2187,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2187 if (data->version == 4) { 2187 if (data->version == 4) {
2188 error = nfs4_try_mount(flags, dev_name, data, mnt); 2188 error = nfs4_try_mount(flags, dev_name, data, mnt);
2189 kfree(data->client_address); 2189 kfree(data->client_address);
2190 kfree(data->nfs_server.export_path);
2190 goto out; 2191 goto out;
2191 } 2192 }
2192#endif /* CONFIG_NFS_V4 */ 2193#endif /* CONFIG_NFS_V4 */
@@ -2657,7 +2658,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2657 devname = nfs_path(path->mnt->mnt_devname, 2658 devname = nfs_path(path->mnt->mnt_devname,
2658 path->mnt->mnt_root, path->dentry, 2659 path->mnt->mnt_root, path->dentry,
2659 page, PAGE_SIZE); 2660 page, PAGE_SIZE);
2660 if (devname == NULL) 2661 if (IS_ERR(devname))
2661 goto out_freepage; 2662 goto out_freepage;
2662 tmp = kstrdup(devname, GFP_KERNEL); 2663 tmp = kstrdup(devname, GFP_KERNEL);
2663 if (tmp == NULL) 2664 if (tmp == NULL)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 53ff70e23993..3aea3ca98ab7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page)
201 struct inode *inode = page->mapping->host; 201 struct inode *inode = page->mapping->host;
202 struct nfs_server *nfss = NFS_SERVER(inode); 202 struct nfs_server *nfss = NFS_SERVER(inode);
203 203
204 page_cache_get(page);
204 if (atomic_long_inc_return(&nfss->writeback) > 205 if (atomic_long_inc_return(&nfss->writeback) >
205 NFS_CONGESTION_ON_THRESH) { 206 NFS_CONGESTION_ON_THRESH) {
206 set_bdi_congested(&nfss->backing_dev_info, 207 set_bdi_congested(&nfss->backing_dev_info,
@@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page)
216 struct nfs_server *nfss = NFS_SERVER(inode); 217 struct nfs_server *nfss = NFS_SERVER(inode);
217 218
218 end_page_writeback(page); 219 end_page_writeback(page);
220 page_cache_release(page);
219 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) 221 if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
220 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); 222 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
221} 223}
@@ -421,6 +423,7 @@ static void
421nfs_mark_request_dirty(struct nfs_page *req) 423nfs_mark_request_dirty(struct nfs_page *req)
422{ 424{
423 __set_page_dirty_nobuffers(req->wb_page); 425 __set_page_dirty_nobuffers(req->wb_page);
426 __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC);
424} 427}
425 428
426#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 429#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -660,9 +663,11 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
660 req = nfs_setup_write_request(ctx, page, offset, count); 663 req = nfs_setup_write_request(ctx, page, offset, count);
661 if (IS_ERR(req)) 664 if (IS_ERR(req))
662 return PTR_ERR(req); 665 return PTR_ERR(req);
666 nfs_mark_request_dirty(req);
663 /* Update file length */ 667 /* Update file length */
664 nfs_grow_file(page, offset, count); 668 nfs_grow_file(page, offset, count);
665 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); 669 nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
670 nfs_mark_request_dirty(req);
666 nfs_clear_page_tag_locked(req); 671 nfs_clear_page_tag_locked(req);
667 return 0; 672 return 0;
668} 673}
@@ -739,8 +744,6 @@ int nfs_updatepage(struct file *file, struct page *page,
739 status = nfs_writepage_setup(ctx, page, offset, count); 744 status = nfs_writepage_setup(ctx, page, offset, count);
740 if (status < 0) 745 if (status < 0)
741 nfs_set_pageerror(page); 746 nfs_set_pageerror(page);
742 else
743 __set_page_dirty_nobuffers(page);
744 747
745 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", 748 dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
746 status, (long long)i_size_read(inode)); 749 status, (long long)i_size_read(inode));
@@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page,
749 752
750static void nfs_writepage_release(struct nfs_page *req) 753static void nfs_writepage_release(struct nfs_page *req)
751{ 754{
755 struct page *page = req->wb_page;
752 756
753 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { 757 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req))
754 nfs_end_page_writeback(req->wb_page);
755 nfs_inode_remove_request(req); 758 nfs_inode_remove_request(req);
756 } else
757 nfs_end_page_writeback(req->wb_page);
758 nfs_clear_page_tag_locked(req); 759 nfs_clear_page_tag_locked(req);
760 nfs_end_page_writeback(page);
759} 761}
760 762
761static int flush_task_priority(int how) 763static int flush_task_priority(int how)
@@ -779,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
779 int how) 781 int how)
780{ 782{
781 struct inode *inode = req->wb_context->path.dentry->d_inode; 783 struct inode *inode = req->wb_context->path.dentry->d_inode;
782 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
783 int priority = flush_task_priority(how); 784 int priority = flush_task_priority(how);
784 struct rpc_task *task; 785 struct rpc_task *task;
785 struct rpc_message msg = { 786 struct rpc_message msg = {
@@ -794,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
794 .callback_ops = call_ops, 795 .callback_ops = call_ops,
795 .callback_data = data, 796 .callback_data = data,
796 .workqueue = nfsiod_workqueue, 797 .workqueue = nfsiod_workqueue,
797 .flags = flags, 798 .flags = RPC_TASK_ASYNC,
798 .priority = priority, 799 .priority = priority,
799 }; 800 };
801 int ret = 0;
800 802
801 /* Set up the RPC argument and reply structs 803 /* Set up the RPC argument and reply structs
802 * NB: take care not to mess about with data->commit et al. */ 804 * NB: take care not to mess about with data->commit et al. */
@@ -835,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
835 (unsigned long long)data->args.offset); 837 (unsigned long long)data->args.offset);
836 838
837 task = rpc_run_task(&task_setup_data); 839 task = rpc_run_task(&task_setup_data);
838 if (IS_ERR(task)) 840 if (IS_ERR(task)) {
839 return PTR_ERR(task); 841 ret = PTR_ERR(task);
842 goto out;
843 }
844 if (how & FLUSH_SYNC) {
845 ret = rpc_wait_for_completion_task(task);
846 if (ret == 0)
847 ret = task->tk_status;
848 }
840 rpc_put_task(task); 849 rpc_put_task(task);
841 return 0; 850out:
851 return ret;
842} 852}
843 853
844/* If a nfs_flush_* function fails, it should remove reqs from @head and 854/* If a nfs_flush_* function fails, it should remove reqs from @head and
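nfs_write_rpcsetup() used to choose RPC_TASK_ASYNC only for asynchronous flushes; after this hunk the task always starts asynchronously and FLUSH_SYNC callers wait for completion and propagate the task status. The same start-async-then-optionally-wait shape in plain pthreads (the worker, its status plumbing, and the ownership rules are invented for the sketch):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define FLUSH_SYNC 0x1

struct write_task {
        int status;
        int async;      /* worker owns and frees the task when set */
};

static void *do_write(void *arg)
{
        struct write_task *t = arg;

        t->status = 0;                   /* pretend the write RPC succeeded */
        if (t->async)
                free(t);                 /* fire-and-forget: worker cleans up */
        return NULL;
}

static int write_rpcsetup_model(int how)
{
        struct write_task *task;
        pthread_t tid;
        int ret = 0;

        task = calloc(1, sizeof(*task));
        if (!task)
                return -1;
        task->async = !(how & FLUSH_SYNC);

        /* Always start the work asynchronously... */
        if (pthread_create(&tid, NULL, do_write, task)) {
                free(task);
                return -1;
        }

        if (how & FLUSH_SYNC) {
                /* ...and only synchronous callers wait and pick up status. */
                pthread_join(tid, NULL);
                ret = task->status;
                free(task);
        } else {
                pthread_detach(tid);
        }
        return ret;
}

int main(void)
{
        printf("sync flush  -> %d\n", write_rpcsetup_model(FLUSH_SYNC));
        printf("async flush -> %d\n", write_rpcsetup_model(0));
        pthread_exit(NULL);              /* let any detached worker finish */
}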
@@ -847,9 +857,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
847 */ 857 */
848static void nfs_redirty_request(struct nfs_page *req) 858static void nfs_redirty_request(struct nfs_page *req)
849{ 859{
860 struct page *page = req->wb_page;
861
850 nfs_mark_request_dirty(req); 862 nfs_mark_request_dirty(req);
851 nfs_end_page_writeback(req->wb_page);
852 nfs_clear_page_tag_locked(req); 863 nfs_clear_page_tag_locked(req);
864 nfs_end_page_writeback(page);
853} 865}
854 866
855/* 867/*
@@ -1084,16 +1096,15 @@ static void nfs_writeback_release_full(void *calldata)
1084 if (nfs_write_need_commit(data)) { 1096 if (nfs_write_need_commit(data)) {
1085 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1097 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1086 nfs_mark_request_commit(req); 1098 nfs_mark_request_commit(req);
1087 nfs_end_page_writeback(page);
1088 dprintk(" marked for commit\n"); 1099 dprintk(" marked for commit\n");
1089 goto next; 1100 goto next;
1090 } 1101 }
1091 dprintk(" OK\n"); 1102 dprintk(" OK\n");
1092remove_request: 1103remove_request:
1093 nfs_end_page_writeback(page);
1094 nfs_inode_remove_request(req); 1104 nfs_inode_remove_request(req);
1095 next: 1105 next:
1096 nfs_clear_page_tag_locked(req); 1106 nfs_clear_page_tag_locked(req);
1107 nfs_end_page_writeback(page);
1097 } 1108 }
1098 nfs_writedata_release(calldata); 1109 nfs_writedata_release(calldata);
1099} 1110}
@@ -1190,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1190 1201
1191 1202
1192#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1203#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1204static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1205{
1206 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
1207 return 1;
1208 if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags,
1209 NFS_INO_COMMIT, nfs_wait_bit_killable,
1210 TASK_KILLABLE))
1211 return 1;
1212 return 0;
1213}
1214
1215static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
1216{
1217 clear_bit(NFS_INO_COMMIT, &nfsi->flags);
1218 smp_mb__after_clear_bit();
1219 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
1220}
1221
1222
1193static void nfs_commitdata_release(void *data) 1223static void nfs_commitdata_release(void *data)
1194{ 1224{
1195 struct nfs_write_data *wdata = data; 1225 struct nfs_write_data *wdata = data;
@@ -1207,7 +1237,6 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1207{ 1237{
1208 struct nfs_page *first = nfs_list_entry(head->next); 1238 struct nfs_page *first = nfs_list_entry(head->next);
1209 struct inode *inode = first->wb_context->path.dentry->d_inode; 1239 struct inode *inode = first->wb_context->path.dentry->d_inode;
1210 int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC;
1211 int priority = flush_task_priority(how); 1240 int priority = flush_task_priority(how);
1212 struct rpc_task *task; 1241 struct rpc_task *task;
1213 struct rpc_message msg = { 1242 struct rpc_message msg = {
@@ -1222,7 +1251,7 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1222 .callback_ops = &nfs_commit_ops, 1251 .callback_ops = &nfs_commit_ops,
1223 .callback_data = data, 1252 .callback_data = data,
1224 .workqueue = nfsiod_workqueue, 1253 .workqueue = nfsiod_workqueue,
1225 .flags = flags, 1254 .flags = RPC_TASK_ASYNC,
1226 .priority = priority, 1255 .priority = priority,
1227 }; 1256 };
1228 1257
@@ -1282,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1282 BDI_RECLAIMABLE); 1311 BDI_RECLAIMABLE);
1283 nfs_clear_page_tag_locked(req); 1312 nfs_clear_page_tag_locked(req);
1284 } 1313 }
1314 nfs_commit_clear_lock(NFS_I(inode));
1285 return -ENOMEM; 1315 return -ENOMEM;
1286} 1316}
1287 1317
@@ -1337,6 +1367,7 @@ static void nfs_commit_release(void *calldata)
1337 next: 1367 next:
1338 nfs_clear_page_tag_locked(req); 1368 nfs_clear_page_tag_locked(req);
1339 } 1369 }
1370 nfs_commit_clear_lock(NFS_I(data->inode));
1340 nfs_commitdata_release(calldata); 1371 nfs_commitdata_release(calldata);
1341} 1372}
1342 1373
@@ -1351,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = {
1351static int nfs_commit_inode(struct inode *inode, int how) 1382static int nfs_commit_inode(struct inode *inode, int how)
1352{ 1383{
1353 LIST_HEAD(head); 1384 LIST_HEAD(head);
1354 int res; 1385 int may_wait = how & FLUSH_SYNC;
1386 int res = 0;
1355 1387
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out;
1356 spin_lock(&inode->i_lock); 1390 spin_lock(&inode->i_lock);
1357 res = nfs_scan_commit(inode, &head, 0, 0); 1391 res = nfs_scan_commit(inode, &head, 0, 0);
1358 spin_unlock(&inode->i_lock); 1392 spin_unlock(&inode->i_lock);
@@ -1360,7 +1394,13 @@ static int nfs_commit_inode(struct inode *inode, int how)
1360 int error = nfs_commit_list(inode, &head, how); 1394 int error = nfs_commit_list(inode, &head, how);
1361 if (error < 0) 1395 if (error < 0)
1362 return error; 1396 return error;
1363 } 1397 if (may_wait)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable,
1400 TASK_KILLABLE);
1401 } else
1402 nfs_commit_clear_lock(NFS_I(inode));
1403out:
1364 return res; 1404 return res;
1365} 1405}
1366 1406
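nfs_commit_inode() now serializes commits per inode on an NFS_INO_COMMIT bit: a caller that cannot take the bit returns immediately unless it asked for FLUSH_SYNC, in which case it sleeps until the current owner calls nfs_commit_clear_lock(), and the completion and error paths clear the bit and wake waiters. A userspace model of that bit-lock using a mutex and condition variable in place of the wait_on_bit machinery (names mirror the patch, but this is only a sketch):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct nfs_inode_model {
        pthread_mutex_t lock;
        pthread_cond_t  wq;
        bool            committing;      /* models the NFS_INO_COMMIT bit */
};

/* Returns true if we now own the commit, false for a non-waiting loser. */
static bool commit_set_lock(struct nfs_inode_model *nfsi, bool may_wait)
{
        bool got = false;

        pthread_mutex_lock(&nfsi->lock);
        while (nfsi->committing) {
                if (!may_wait)
                        goto out;
                pthread_cond_wait(&nfsi->wq, &nfsi->lock);
        }
        nfsi->committing = true;
        got = true;
out:
        pthread_mutex_unlock(&nfsi->lock);
        return got;
}

static void commit_clear_lock(struct nfs_inode_model *nfsi)
{
        pthread_mutex_lock(&nfsi->lock);
        nfsi->committing = false;
        pthread_cond_broadcast(&nfsi->wq);   /* wake_up_bit() equivalent */
        pthread_mutex_unlock(&nfsi->lock);
}

int main(void)
{
        struct nfs_inode_model nfsi = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false
        };

        printf("first caller:  %d\n", commit_set_lock(&nfsi, false)); /* 1 */
        printf("second caller: %d\n", commit_set_lock(&nfsi, false)); /* 0 */
        commit_clear_lock(&nfsi);
        printf("after clear:   %d\n", commit_set_lock(&nfsi, true));  /* 1 */
        commit_clear_lock(&nfsi);
        return 0;
}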
@@ -1432,6 +1472,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
1432 1472
1433 BUG_ON(!PageLocked(page)); 1473 BUG_ON(!PageLocked(page));
1434 for (;;) { 1474 for (;;) {
1475 wait_on_page_writeback(page);
1435 req = nfs_page_find_request(page); 1476 req = nfs_page_find_request(page);
1436 if (req == NULL) 1477 if (req == NULL)
1437 break; 1478 break;
@@ -1466,30 +1507,18 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1466 .range_start = range_start, 1507 .range_start = range_start,
1467 .range_end = range_end, 1508 .range_end = range_end,
1468 }; 1509 };
1469 struct nfs_page *req;
1470 int need_commit;
1471 int ret; 1510 int ret;
1472 1511
1473 while(PagePrivate(page)) { 1512 while(PagePrivate(page)) {
1513 wait_on_page_writeback(page);
1474 if (clear_page_dirty_for_io(page)) { 1514 if (clear_page_dirty_for_io(page)) {
1475 ret = nfs_writepage_locked(page, &wbc); 1515 ret = nfs_writepage_locked(page, &wbc);
1476 if (ret < 0) 1516 if (ret < 0)
1477 goto out_error; 1517 goto out_error;
1478 } 1518 }
1479 req = nfs_find_and_lock_request(page); 1519 ret = sync_inode(inode, &wbc);
1480 if (!req) 1520 if (ret < 0)
1481 break;
1482 if (IS_ERR(req)) {
1483 ret = PTR_ERR(req);
1484 goto out_error; 1521 goto out_error;
1485 }
1486 need_commit = test_bit(PG_CLEAN, &req->wb_flags);
1487 nfs_clear_page_tag_locked(req);
1488 if (need_commit) {
1489 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1490 if (ret < 0)
1491 goto out_error;
1492 }
1493 } 1522 }
1494 return 0; 1523 return 0;
1495out_error: 1524out_error:
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e1703175ee28..34ccf815ea8a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -161,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
161 argp->p = page_address(argp->pagelist[0]); 161 argp->p = page_address(argp->pagelist[0]);
162 argp->pagelist++; 162 argp->pagelist++;
163 if (argp->pagelen < PAGE_SIZE) { 163 if (argp->pagelen < PAGE_SIZE) {
164 argp->end = p + (argp->pagelen>>2); 164 argp->end = argp->p + (argp->pagelen>>2);
165 argp->pagelen = 0; 165 argp->pagelen = 0;
166 } else { 166 } else {
167 argp->end = p + (PAGE_SIZE>>2); 167 argp->end = argp->p + (PAGE_SIZE>>2);
168 argp->pagelen -= PAGE_SIZE; 168 argp->pagelen -= PAGE_SIZE;
169 } 169 }
170 memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); 170 memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
@@ -1426,10 +1426,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1426 argp->p = page_address(argp->pagelist[0]); 1426 argp->p = page_address(argp->pagelist[0]);
1427 argp->pagelist++; 1427 argp->pagelist++;
1428 if (argp->pagelen < PAGE_SIZE) { 1428 if (argp->pagelen < PAGE_SIZE) {
1429 argp->end = p + (argp->pagelen>>2); 1429 argp->end = argp->p + (argp->pagelen>>2);
1430 argp->pagelen = 0; 1430 argp->pagelen = 0;
1431 } else { 1431 } else {
1432 argp->end = p + (PAGE_SIZE>>2); 1432 argp->end = argp->p + (PAGE_SIZE>>2);
1433 argp->pagelen -= PAGE_SIZE; 1433 argp->pagelen -= PAGE_SIZE;
1434 } 1434 }
1435 } 1435 }
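Both read_buf() and nfsd4_decode_compound() advance argp->p to the next page but previously computed argp->end from the stale local p, so the end marker could still bound the old page; the fix derives it from argp->p. The same class of bug in miniature (buffer names and sizes are made up):

#include <stdint.h>
#include <stdio.h>

#define PAGE_WORDS 8

static uint32_t page0[PAGE_WORDS], page1[PAGE_WORDS];

static int end_bounds_page(const uint32_t *end, const uint32_t *page)
{
        return (uintptr_t)end > (uintptr_t)page &&
               (uintptr_t)end <= (uintptr_t)(page + PAGE_WORDS);
}

int main(void)
{
        uint32_t *p, *end, *old_p;

        /* Decoding starts in the first page. */
        p = page0;
        end = p + PAGE_WORDS;

        /* Buggy switch to the next page: end computed from the old p. */
        old_p = p;
        p = page1;
        end = old_p + PAGE_WORDS;        /* still bounds page0 */
        printf("buggy:   end bounds the new page? %s\n",
               end_bounds_page(end, page1) ? "yes" : "no");

        /* Fixed switch: end computed from the updated pointer. */
        end = p + PAGE_WORDS;
        printf("correct: end bounds the new page? %s\n",
               end_bounds_page(end, page1) ? "yes" : "no");
        return 0;
}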
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 0cdbc5e7655a..48145f505a6a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -749,6 +749,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
749 sb->s_export_op = &nilfs_export_ops; 749 sb->s_export_op = &nilfs_export_ops;
750 sb->s_root = NULL; 750 sb->s_root = NULL;
751 sb->s_time_gran = 1; 751 sb->s_time_gran = 1;
752 sb->s_bdi = nilfs->ns_bdi;
752 753
753 err = load_nilfs(nilfs, sbi); 754 err = load_nilfs(nilfs, sbi);
754 if (err) 755 if (err)
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 3e56dbffe729..b3a159b21cfd 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,6 +15,7 @@ config INOTIFY
15 15
16config INOTIFY_USER 16config INOTIFY_USER
17 bool "Inotify support for userspace" 17 bool "Inotify support for userspace"
18 select ANON_INODES
18 select FSNOTIFY 19 select FSNOTIFY
19 default y 20 default y
20 ---help--- 21 ---help---
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1afb0a10229f..e27960cd76ab 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -28,6 +28,7 @@
28#include <linux/path.h> /* struct path */ 28#include <linux/path.h> /* struct path */
29#include <linux/slab.h> /* kmem_* */ 29#include <linux/slab.h> /* kmem_* */
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/sched.h>
31 32
32#include "inotify.h" 33#include "inotify.h"
33 34
@@ -146,6 +147,7 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
146 idr_for_each(&group->inotify_data.idr, idr_callback, group); 147 idr_for_each(&group->inotify_data.idr, idr_callback, group);
147 idr_remove_all(&group->inotify_data.idr); 148 idr_remove_all(&group->inotify_data.idr);
148 idr_destroy(&group->inotify_data.idr); 149 idr_destroy(&group->inotify_data.idr);
150 free_uid(group->inotify_data.user);
149} 151}
150 152
151void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv) 153void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 472cdf29ef82..e46ca685b9be 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -546,21 +546,24 @@ retry:
546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) 546 if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
547 goto out_err; 547 goto out_err;
548 548
549 /* we are putting the mark on the idr, take a reference */
550 fsnotify_get_mark(&tmp_ientry->fsn_entry);
551
549 spin_lock(&group->inotify_data.idr_lock); 552 spin_lock(&group->inotify_data.idr_lock);
550 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, 553 ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
551 group->inotify_data.last_wd+1, 554 group->inotify_data.last_wd+1,
552 &tmp_ientry->wd); 555 &tmp_ientry->wd);
553 spin_unlock(&group->inotify_data.idr_lock); 556 spin_unlock(&group->inotify_data.idr_lock);
554 if (ret) { 557 if (ret) {
558 /* we didn't get on the idr, drop the idr reference */
559 fsnotify_put_mark(&tmp_ientry->fsn_entry);
560
555 /* idr was out of memory allocate and try again */ 561 /* idr was out of memory allocate and try again */
556 if (ret == -EAGAIN) 562 if (ret == -EAGAIN)
557 goto retry; 563 goto retry;
558 goto out_err; 564 goto out_err;
559 } 565 }
560 566
561 /* we put the mark on the idr, take a reference */
562 fsnotify_get_mark(&tmp_ientry->fsn_entry);
563
564 /* we are on the idr, now get on the inode */ 567 /* we are on the idr, now get on the inode */
565 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); 568 ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
566 if (ret) { 569 if (ret) {
@@ -578,16 +581,13 @@ retry:
578 /* return the watch descriptor for this new entry */ 581 /* return the watch descriptor for this new entry */
579 ret = tmp_ientry->wd; 582 ret = tmp_ientry->wd;
580 583
581 /* match the ref from fsnotify_init_markentry() */
582 fsnotify_put_mark(&tmp_ientry->fsn_entry);
583
584 /* if this mark added a new event update the group mask */ 584 /* if this mark added a new event update the group mask */
585 if (mask & ~group->mask) 585 if (mask & ~group->mask)
586 fsnotify_recalc_group_mask(group); 586 fsnotify_recalc_group_mask(group);
587 587
588out_err: 588out_err:
589 if (ret < 0) 589 /* match the ref from fsnotify_init_markentry() */
590 kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); 590 fsnotify_put_mark(&tmp_ientry->fsn_entry);
591 591
592 return ret; 592 return ret;
593} 593}
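The inotify fix takes the extra mark reference before the idr insertion that publishes the mark, drops that reference again if the insertion fails, and unconditionally drops the fsnotify_init_markentry() reference on the way out instead of freeing the object by hand on error. A tiny userspace refcount model of that grab-before-publish ordering (the fail_publish switch exists only to exercise both paths):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct mark_model {
        int refcount;
};

static struct mark_model *registry;      /* models the idr slot */

static void get_mark(struct mark_model *m) { m->refcount++; }

static void put_mark(struct mark_model *m)
{
        if (--m->refcount == 0) {
                printf("  mark freed\n");
                free(m);
        }
}

static int add_watch(bool fail_publish)
{
        struct mark_model *m = calloc(1, sizeof(*m));

        if (!m)
                return -1;
        m->refcount = 1;                 /* init reference (creator's) */

        get_mark(m);                     /* 1) ref for the registry, first */
        if (fail_publish) {
                put_mark(m);             /* 2) publish failed: drop that ref */
                put_mark(m);             /*    ...and the init reference */
                return -1;
        }
        registry = m;                    /* 3) publish while already pinned */

        put_mark(m);                     /* creator's init reference is done */
        return 0;
}

int main(void)
{
        printf("successful add: %d\n", add_watch(false));
        printf("registry refs left: %d\n", registry->refcount);   /* 1 */
        put_mark(registry);              /* later removal drops the last ref */
        registry = NULL;
        printf("failed add: %d\n", add_watch(true));
        return 0;
}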
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index ecebb2276790..f9d5d3ffc75a 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -406,6 +406,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
406 struct buffer_head *bh) 406 struct buffer_head *bh)
407{ 407{
408 int ret = 0; 408 int ret = 0;
409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
409 410
410 mlog_entry_void(); 411 mlog_entry_void();
411 412
@@ -425,6 +426,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
425 426
426 get_bh(bh); /* for end_buffer_write_sync() */ 427 get_bh(bh); /* for end_buffer_write_sync() */
427 bh->b_end_io = end_buffer_write_sync; 428 bh->b_end_io = end_buffer_write_sync;
429 ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check);
428 submit_bh(WRITE, bh); 430 submit_bh(WRITE, bh);
429 431
430 wait_on_buffer(bh); 432 wait_on_buffer(bh);
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index a795eb91f4ea..12d5eb78a11a 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -184,9 +184,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
184 BUG_ON(!lksb); 184 BUG_ON(!lksb);
185 185
186 /* only updates if this node masters the lockres */ 186 /* only updates if this node masters the lockres */
187 spin_lock(&res->spinlock);
187 if (res->owner == dlm->node_num) { 188 if (res->owner == dlm->node_num) {
188
189 spin_lock(&res->spinlock);
190 /* check the lksb flags for the direction */ 189 /* check the lksb flags for the direction */
191 if (lksb->flags & DLM_LKSB_GET_LVB) { 190 if (lksb->flags & DLM_LKSB_GET_LVB) {
192 mlog(0, "getting lvb from lockres for %s node\n", 191 mlog(0, "getting lvb from lockres for %s node\n",
@@ -201,8 +200,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
201 * here. In the future we might want to clear it at the time 200 * here. In the future we might want to clear it at the time
202 * the put is actually done. 201 * the put is actually done.
203 */ 202 */
204 spin_unlock(&res->spinlock);
205 } 203 }
204 spin_unlock(&res->spinlock);
206 205
207 /* reset any lvb flags on the lksb */ 206 /* reset any lvb flags on the lksb */
208 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); 207 lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 1b0de157a08c..b83d6107a1f5 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -112,20 +112,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES);
112 * O_RDONLY -> PRMODE level 112 * O_RDONLY -> PRMODE level
113 * O_WRONLY -> EXMODE level 113 * O_WRONLY -> EXMODE level
114 * 114 *
115 * O_NONBLOCK -> LKM_NOQUEUE 115 * O_NONBLOCK -> NOQUEUE
116 */ 116 */
117static int dlmfs_decode_open_flags(int open_flags, 117static int dlmfs_decode_open_flags(int open_flags,
118 int *level, 118 int *level,
119 int *flags) 119 int *flags)
120{ 120{
121 if (open_flags & (O_WRONLY|O_RDWR)) 121 if (open_flags & (O_WRONLY|O_RDWR))
122 *level = LKM_EXMODE; 122 *level = DLM_LOCK_EX;
123 else 123 else
124 *level = LKM_PRMODE; 124 *level = DLM_LOCK_PR;
125 125
126 *flags = 0; 126 *flags = 0;
127 if (open_flags & O_NONBLOCK) 127 if (open_flags & O_NONBLOCK)
128 *flags |= LKM_NOQUEUE; 128 *flags |= DLM_LKF_NOQUEUE;
129 129
130 return 0; 130 return 0;
131} 131}
@@ -166,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode,
 166 * to allow userspace to distinguish a 166 * to allow userspace to distinguish a
167 * valid lock request from one that simply couldn't be 167 * valid lock request from one that simply couldn't be
168 * granted. */ 168 * granted. */
169 if (flags & LKM_NOQUEUE && status == -EAGAIN) 169 if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN)
170 status = -ETXTBSY; 170 status = -ETXTBSY;
171 kfree(fp); 171 kfree(fp);
172 goto bail; 172 goto bail;
@@ -193,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode,
193 status = 0; 193 status = 0;
194 if (fp) { 194 if (fp) {
195 level = fp->fp_lock_level; 195 level = fp->fp_lock_level;
196 if (level != LKM_IVMODE) 196 if (level != DLM_LOCK_IV)
197 user_dlm_cluster_unlock(&ip->ip_lockres, level); 197 user_dlm_cluster_unlock(&ip->ip_lockres, level);
198 198
199 kfree(fp); 199 kfree(fp);
@@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
262 if ((count + *ppos) > i_size_read(inode)) 262 if ((count + *ppos) > i_size_read(inode))
263 readlen = i_size_read(inode) - *ppos; 263 readlen = i_size_read(inode) - *ppos;
264 else 264 else
265 readlen = count - *ppos; 265 readlen = count;
266 266
267 lvb_buf = kmalloc(readlen, GFP_NOFS); 267 lvb_buf = kmalloc(readlen, GFP_NOFS);
268 if (!lvb_buf) 268 if (!lvb_buf)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 17947dc8341e..a5fbd9cea968 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -684,6 +684,7 @@ restarted_transaction:
684 if (why == RESTART_META) { 684 if (why == RESTART_META) {
685 mlog(0, "restarting function.\n"); 685 mlog(0, "restarting function.\n");
686 restart_func = 1; 686 restart_func = 1;
687 status = 0;
687 } else { 688 } else {
688 BUG_ON(why != RESTART_TRANS); 689 BUG_ON(why != RESTART_TRANS);
689 690
@@ -1981,18 +1982,18 @@ relock:
1981 /* communicate with ocfs2_dio_end_io */ 1982 /* communicate with ocfs2_dio_end_io */
1982 ocfs2_iocb_set_rw_locked(iocb, rw_level); 1983 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1983 1984
1984 if (direct_io) { 1985 ret = generic_segment_checks(iov, &nr_segs, &ocount,
1985 ret = generic_segment_checks(iov, &nr_segs, &ocount, 1986 VERIFY_READ);
1986 VERIFY_READ); 1987 if (ret)
1987 if (ret) 1988 goto out_dio;
1988 goto out_dio;
1989 1989
1990 count = ocount; 1990 count = ocount;
1991 ret = generic_write_checks(file, ppos, &count, 1991 ret = generic_write_checks(file, ppos, &count,
1992 S_ISBLK(inode->i_mode)); 1992 S_ISBLK(inode->i_mode));
1993 if (ret) 1993 if (ret)
1994 goto out_dio; 1994 goto out_dio;
1995 1995
1996 if (direct_io) {
1996 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, 1997 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1997 ppos, count, ocount); 1998 ppos, count, ocount);
1998 if (written < 0) { 1999 if (written < 0) {
@@ -2007,7 +2008,10 @@ relock:
2007 goto out_dio; 2008 goto out_dio;
2008 } 2009 }
2009 } else { 2010 } else {
2010 written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); 2011 current->backing_dev_info = file->f_mapping->backing_dev_info;
2012 written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
2013 ppos, count, 0);
2014 current->backing_dev_info = NULL;
2011 } 2015 }
2012 2016
2013out_dio: 2017out_dio:
@@ -2021,9 +2025,9 @@ out_dio:
2021 if (ret < 0) 2025 if (ret < 0)
2022 written = ret; 2026 written = ret;
2023 2027
2024 if (!ret && (old_size != i_size_read(inode) || 2028 if (!ret && ((old_size != i_size_read(inode)) ||
2025 old_clusters != OCFS2_I(inode)->ip_clusters || 2029 (old_clusters != OCFS2_I(inode)->ip_clusters) ||
2026 has_refcount)) { 2030 has_refcount)) {
2027 ret = jbd2_journal_force_commit(osb->journal->j_journal); 2031 ret = jbd2_journal_force_commit(osb->journal->j_journal);
2028 if (ret < 0) 2032 if (ret < 0)
2029 written = ret; 2033 written = ret;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 07cc8bb68b6d..af189887201c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -558,6 +558,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
558 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 558 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
559 if (IS_ERR(handle)) { 559 if (IS_ERR(handle)) {
560 status = PTR_ERR(handle); 560 status = PTR_ERR(handle);
561 handle = NULL;
561 mlog_errno(status); 562 mlog_errno(status);
562 goto out; 563 goto out;
563 } 564 }
@@ -639,11 +640,13 @@ static int ocfs2_remove_inode(struct inode *inode,
639 goto bail_unlock; 640 goto bail_unlock;
640 } 641 }
641 642
642 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 643 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
643 orphan_dir_bh); 644 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
644 if (status < 0) { 645 orphan_dir_bh);
645 mlog_errno(status); 646 if (status < 0) {
646 goto bail_commit; 647 mlog_errno(status);
648 goto bail_commit;
649 }
647 } 650 }
648 651
649 /* set the inodes dtime */ 652 /* set the inodes dtime */
@@ -722,38 +725,39 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
 static int ocfs2_wipe_inode(struct inode *inode,
 			    struct buffer_head *di_bh)
 {
-	int status, orphaned_slot;
+	int status, orphaned_slot = -1;
 	struct inode *orphan_dir_inode = NULL;
 	struct buffer_head *orphan_dir_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_dinode *di;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
 
-	di = (struct ocfs2_dinode *) di_bh->b_data;
-	orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
-
-	status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
-	if (status)
-		return status;
-
-	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
-						       ORPHAN_DIR_SYSTEM_INODE,
-						       orphaned_slot);
-	if (!orphan_dir_inode) {
-		status = -EEXIST;
-		mlog_errno(status);
-		goto bail;
-	}
-
-	/* Lock the orphan dir. The lock will be held for the entire
-	 * delete_inode operation. We do this now to avoid races with
-	 * recovery completion on other nodes. */
-	mutex_lock(&orphan_dir_inode->i_mutex);
-	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
-	if (status < 0) {
-		mutex_unlock(&orphan_dir_inode->i_mutex);
-
-		mlog_errno(status);
-		goto bail;
+	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
+		orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
+
+		status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+		if (status)
+			return status;
+
+		orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+							       ORPHAN_DIR_SYSTEM_INODE,
+							       orphaned_slot);
+		if (!orphan_dir_inode) {
+			status = -EEXIST;
+			mlog_errno(status);
+			goto bail;
+		}
+
+		/* Lock the orphan dir. The lock will be held for the entire
+		 * delete_inode operation. We do this now to avoid races with
+		 * recovery completion on other nodes. */
+		mutex_lock(&orphan_dir_inode->i_mutex);
+		status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+		if (status < 0) {
+			mutex_unlock(&orphan_dir_inode->i_mutex);
+
+			mlog_errno(status);
+			goto bail;
+		}
 	}
 
 	/* we do this while holding the orphan dir lock because we
@@ -794,6 +798,9 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		mlog_errno(status);
 
 bail_unlock_dir:
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)
+		return status;
+
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
 	mutex_unlock(&orphan_dir_inode->i_mutex);
 	brelse(orphan_dir_bh);
@@ -889,7 +896,8 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
-	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) {
+	if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) &&
+	    !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
 		/*
 		 * Inodes in the orphan dir must have ORPHANED_FL. The only
 		 * inodes that come back out of the orphan dir are reflink
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ba4fe07b293c..0b28e1921a39 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -100,6 +100,8 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_MAYBE_ORPHANED	0x00000020
 /* Does someone have the file open O_DIRECT */
 #define OCFS2_INODE_OPEN_DIRECT		0x00000040
+/* Tell the inode wipe code it's not in orphan dir */
+#define OCFS2_INODE_SKIP_ORPHAN_DIR	0x00000080
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b1eb50ae4097..4cbb18f26c5f 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -408,23 +408,28 @@ static int ocfs2_mknod(struct inode *dir,
 		}
 	}
 
-	status = ocfs2_add_entry(handle, dentry, inode,
-				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
-				 &lookup);
-	if (status < 0) {
+	/*
+	 * Do this before adding the entry to the directory. We add
+	 * also set d_op after success so that ->d_iput() will cleanup
+	 * the dentry lock even if ocfs2_add_entry() fails below.
+	 */
+	status = ocfs2_dentry_attach_lock(dentry, inode,
+					  OCFS2_I(dir)->ip_blkno);
+	if (status) {
 		mlog_errno(status);
 		goto leave;
 	}
+	dentry->d_op = &ocfs2_dentry_ops;
 
-	status = ocfs2_dentry_attach_lock(dentry, inode,
-					  OCFS2_I(dir)->ip_blkno);
-	if (status) {
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 OCFS2_I(inode)->ip_blkno, parent_fe_bh,
+				 &lookup);
+	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
 	insert_inode_hash(inode);
-	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 	status = 0;
 leave:
@@ -445,11 +450,6 @@ leave:
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	if ((status < 0) && inode) {
-		clear_nlink(inode);
-		iput(inode);
-	}
-
 	if (inode_ac)
 		ocfs2_free_alloc_context(inode_ac);
 
@@ -459,6 +459,17 @@ leave:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
 
+	/*
+	 * We should call iput after the i_mutex of the bitmap been
+	 * unlocked in ocfs2_free_alloc_context, or the
+	 * ocfs2_delete_inode will mutex_lock again.
+	 */
+	if ((status < 0) && inode) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
+		clear_nlink(inode);
+		iput(inode);
+	}
+
 	mlog_exit(status);
 
 	return status;
@@ -1771,22 +1782,27 @@ static int ocfs2_symlink(struct inode *dir,
 		}
 	}
 
-	status = ocfs2_add_entry(handle, dentry, inode,
-				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
-				 &lookup);
-	if (status < 0) {
+	/*
+	 * Do this before adding the entry to the directory. We add
+	 * also set d_op after success so that ->d_iput() will cleanup
+	 * the dentry lock even if ocfs2_add_entry() fails below.
+	 */
+	status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
+	if (status) {
 		mlog_errno(status);
 		goto bail;
 	}
+	dentry->d_op = &ocfs2_dentry_ops;
 
-	status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno);
-	if (status) {
+	status = ocfs2_add_entry(handle, dentry, inode,
+				 le64_to_cpu(fe->i_blkno), parent_fe_bh,
+				 &lookup);
+	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
 
 	insert_inode_hash(inode);
-	dentry->d_op = &ocfs2_dentry_ops;
 	d_instantiate(dentry, inode);
 bail:
 	if (status < 0 && did_quota)
@@ -1811,6 +1827,7 @@ bail:
 	if (xattr_ac)
 		ocfs2_free_alloc_context(xattr_ac);
 	if ((status < 0) && inode) {
+		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
 		clear_nlink(inode);
 		iput(inode);
 	}
@@ -1976,6 +1993,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	}
 
 	le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL);
+	OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
 
 	/* Record which orphan dir our inode now resides
 	 * in. delete_inode will use this to determine which orphan
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index bd96f6c7877e..5cbcd0f008fc 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4083,6 +4083,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode,
 	di->i_attr = s_di->i_attr;
 
 	if (preserve) {
+		t_inode->i_uid = s_inode->i_uid;
+		t_inode->i_gid = s_inode->i_gid;
+		t_inode->i_mode = s_inode->i_mode;
 		di->i_uid = s_di->i_uid;
 		di->i_gid = s_di->i_gid;
 		di->i_mode = s_di->i_mode;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index e51f2ec2c5e5..885ab5513ac5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,7 +81,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
-#include <linux/swapops.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -495,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
-		(permitted && mm) ? task->stack_start : 0,
+		(permitted && mm) ? mm->start_stack : 0,
 		esp,
 		eip,
 		/* The signal information here is obsolete.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7621db800a74..8418fcc0a6ab 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2909,7 +2909,7 @@ out_no_task:
  */
 static const struct pid_entry tid_base_stuff[] = {
 	DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
-	DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations),
+	DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
 	REG("environ", S_IRUSR, proc_environ_operations),
 	INF("auxv", S_IRUSR, proc_pid_auxv),
 	ONE("status", S_IRUGO, proc_pid_status),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 070553427dd5..47f5b145f56e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -247,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 			} else if (vma->vm_start <= mm->start_stack &&
 				   vma->vm_end >= mm->start_stack) {
 				name = "[stack]";
-			} else {
-				unsigned long stack_start;
-				struct proc_maps_private *pmp;
-
-				pmp = m->private;
-				stack_start = pmp->task->stack_start;
-
-				if (vma->vm_start <= stack_start &&
-				    vma->vm_end >= stack_start) {
-					pad_len_spaces(m, len);
-					seq_printf(m,
-						   "[threadstack:%08lx]",
-#ifdef CONFIG_STACK_GROWSUP
-						   vma->vm_end - stack_start
-#else
-						   stack_start - vma->vm_start
-#endif
-						   );
-				}
 			}
 		} else {
 			name = "[vdso]";
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index dad7fb247ddc..3e21b1e2ad3a 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING
 	  Note that this behavior is currently deprecated and may go away in
 	  future. Please use notification via netlink socket instead.
 
+config QUOTA_DEBUG
+	bool "Additional quota sanity checks"
+	depends on QUOTA
+	default n
+	help
+	  If you say Y here, quota subsystem will perform some additional
+	  sanity checks of quota internal structures. If unsure, say N.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	tristate
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a0a9405b202a..788b5802a7ce 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -80,8 +80,6 @@
 
 #include <asm/uaccess.h>
 
-#define __DQUOT_PARANOIA
-
 /*
  * There are three quota SMP locks. dq_list_lock protects all lists with quotas
  * and quota formats, dqstats structure containing statistics about the lists
@@ -695,7 +693,7 @@ void dqput(struct dquot *dquot)
 
 	if (!dquot)
 		return;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (!atomic_read(&dquot->dq_count)) {
 		printk("VFS: dqput: trying to free free dquot\n");
 		printk("VFS: device %s, dquot of %s %d\n",
@@ -748,7 +746,7 @@ we_slept:
 		goto we_slept;
 	}
 	atomic_dec(&dquot->dq_count);
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	/* sanity check */
 	BUG_ON(!list_empty(&dquot->dq_free));
 #endif
@@ -845,7 +843,7 @@ we_slept:
 		dquot = NULL;
 		goto out;
 	}
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
 #endif
 out:
@@ -874,7 +872,7 @@ static int dqinit_needed(struct inode *inode, int type)
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct inode *inode, *old_inode = NULL;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	int reserved = 0;
 #endif
 
@@ -882,7 +880,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
 			continue;
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 		if (unlikely(inode_get_rsv_space(inode) > 0))
 			reserved = 1;
 #endif
@@ -907,7 +905,7 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	spin_unlock(&inode_lock);
 	iput(old_inode);
 
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 	if (reserved) {
 		printk(KERN_WARNING "VFS (%s): Writes happened before quota"
 			" was turned on thus quota information is probably "
@@ -940,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 	inode->i_dquot[type] = NULL;
 	if (dquot) {
 		if (dqput_blocks(dquot)) {
-#ifdef __DQUOT_PARANOIA
+#ifdef CONFIG_QUOTA_DEBUG
 			if (atomic_read(&dquot->dq_count) != 1)
 				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
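
The fs/quota/dquot.c hunks above replace the private, always-defined __DQUOT_PARANOIA macro with the CONFIG_QUOTA_DEBUG Kconfig option introduced in the fs/quota/Kconfig hunk; the sanity checks themselves are unchanged and are simply compiled out when the option is disabled. A minimal, self-contained sketch of the same compile-time gating pattern (illustrative only, plain userspace C rather than the kernel code; QUOTA_DEBUG here is an ordinary preprocessor define standing in for the Kconfig symbol):

	/* Build with -DQUOTA_DEBUG to include the extra check; without it the
	 * check disappears entirely, mirroring CONFIG_QUOTA_DEBUG=y vs =n. */
	#include <assert.h>
	#include <stdio.h>

	struct counted { int refs; };

	static void put_ref(struct counted *c)
	{
	#ifdef QUOTA_DEBUG
		/* debug-only sanity check, analogous to the checks in dqput() */
		assert(c->refs > 0);
	#endif
		c->refs--;
	}

	int main(void)
	{
		struct counted c = { .refs = 1 };
		put_ref(&c);
		printf("refs now %d\n", c.refs);
		return 0;
	}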
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index f8a6075abf50..07930449a958 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -46,8 +46,6 @@ static inline bool is_privroot_deh(struct dentry *dir,
 				    struct reiserfs_de_head *deh)
 {
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	if (reiserfs_expose_privroot(dir->d_sb))
-		return 0;
 	return (dir == dir->d_parent && privroot->d_inode &&
 		deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 4f9586bb7631..e7cc00e636dc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -554,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 	if (!err && new_size < i_size_read(dentry->d_inode)) {
 		struct iattr newattrs = {
 			.ia_ctime = current_fs_time(inode->i_sb),
-			.ia_size = buffer_size,
+			.ia_size = new_size,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
 
@@ -973,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask)
 	return generic_permission(inode, mask, NULL);
 }
 
-/* This will catch lookups from the fs root to .reiserfs_priv */
-static int
-xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
+static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
-	if (container_of(q1, struct dentry, d_name) == priv_root)
-		return -ENOENT;
-	if (q1->len == name->len &&
-	    !memcmp(q1->name, name->name, name->len))
-		return 0;
-	return 1;
+	return -EPERM;
 }
 
 static const struct dentry_operations xattr_lookup_poison_ops = {
-	.d_compare = xattr_lookup_poison,
+	.d_revalidate = xattr_hide_revalidate,
 };
 
 int reiserfs_lookup_privroot(struct super_block *s)
@@ -1001,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			       strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		if (!reiserfs_expose_privroot(s))
-			s->s_root->d_op = &xattr_lookup_poison_ops;
+		dentry->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index 1c4c8f089970..dfa1d67f8fca 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb)
 	if (server->conn_pid)
 		kill_pid(server->conn_pid, SIGTERM, 1);
 
+	bdi_destroy(&server->bdi);
 	kfree(server->ops);
 	smb_unload_nls(server);
 	sb->s_fs_info = NULL;
@@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
 	if (!server)
 		goto out_no_server;
 	sb->s_fs_info = server;
+
+	if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY))
+		goto out_bdi;
+
+	sb->s_bdi = &server->bdi;
 
 	server->super_block = sb;
 	server->mnt = NULL;
@@ -624,6 +630,8 @@ out_no_smbiod:
 out_bad_option:
 	kfree(mem);
 out_no_mem:
+	bdi_destroy(&server->bdi);
+out_bdi:
 	if (!server->mnt)
 		printk(KERN_ERR "smb_fill_super: allocation failure\n");
 	sb->s_fs_info = NULL;
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 1cb0d81b164b..653c030eb840 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -87,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 	u64 cur_index = index >> msblk->devblksize_log2;
 	int bytes, compressed, b = 0, k = 0, page = 0, avail;
 
-
-	bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1,
-		sizeof(*bh), GFP_KERNEL);
+	bh = kcalloc(((srclength + msblk->devblksize - 1)
+		>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
 	if (bh == NULL)
 		return -ENOMEM;
 
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 3550aec2f655..48b6f4a385a6 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -275,7 +275,8 @@ allocate_root:
 
 	err = squashfs_read_inode(root, root_inode);
 	if (err) {
-		iget_failed(root);
+		make_bad_inode(root);
+		iput(root);
 		goto failed_mount;
 	}
 	insert_inode_hash(root);
@@ -353,6 +354,7 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sbi->id_table);
 		kfree(sbi->fragment_index);
 		kfree(sbi->meta_index);
+		kfree(sbi->inode_lookup_table);
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 15a03d0fb9f3..7a603874e483 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -128,8 +128,9 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
 		goto release_mutex;
 	}
 
+	length = stream->total_out;
 	mutex_unlock(&msblk->read_data_mutex);
-	return stream->total_out;
+	return length;
 
 release_mutex:
 	mutex_unlock(&msblk->read_data_mutex);
diff --git a/fs/super.c b/fs/super.c
index f35ac6022109..1527e6a0ee35 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,7 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
+#include <linux/backing-dev.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -693,6 +694,7 @@ int set_anon_super(struct super_block *s, void *data)
 		return -EMFILE;
 	}
 	s->s_dev = MKDEV(0, dev & MINORMASK);
+	s->s_bdi = &noop_backing_dev_info;
 	return 0;
 }
 
@@ -954,10 +956,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 	if (error < 0)
 		goto out_free_secdata;
 	BUG_ON(!mnt->mnt_sb);
+	WARN_ON(!mnt->mnt_sb->s_bdi);
 
 	error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata);
 	if (error)
 		goto out_sb;
 
 	/*
 	 * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE
diff --git a/fs/sync.c b/fs/sync.c
index fc5c3d75cf3c..92b228176f7c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -14,6 +14,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/backing-dev.h>
 #include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
@@ -32,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	 * This should be safe, as we require bdi backing to actually
 	 * write out data in the first place
 	 */
-	if (!sb->s_bdi)
+	if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info)
 		return 0;
 
 	if (sb->s_qcop && sb->s_qcop->quota_sync)
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 4e50286a4cc3..1dabed286b4c 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -164,8 +164,8 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_
 					name, de->name))
 				goto found;
 		}
+		dir_put_page(page);
 	}
-	dir_put_page(page);
 
 	if (++n >= npages)
 		n = 0;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 52e06b487ced..29f1edca76de 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1209,6 +1209,7 @@ xfs_fs_put_super(
 
 	xfs_unmountfs(mp);
 	xfs_freesb(mp);
+	xfs_inode_shrinker_unregister(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_close_devices(mp);
 	xfs_dmops_put(mp);
@@ -1622,6 +1623,8 @@ xfs_fs_fill_super(
 	if (error)
 		goto fail_vnrele;
 
+	xfs_inode_shrinker_register(mp);
+
 	kfree(mtpt);
 	return 0;
 
@@ -1867,6 +1870,7 @@ init_xfs_fs(void)
 		goto out_cleanup_procfs;
 
 	vfs_initquota();
+	xfs_inode_shrinker_init();
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1894,6 +1898,7 @@ exit_xfs_fs(void)
 {
 	vfs_exitquota();
 	unregister_filesystem(&xfs_fs_type);
+	xfs_inode_shrinker_destroy();
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
 	xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 05cd85317f6f..a427c638d909 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -95,7 +95,8 @@ xfs_inode_ag_walk(
 				   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
-	int			exclusive)
+	int			exclusive,
+	int			*nr_to_scan)
 {
 	uint32_t	first_index;
 	int		last_error = 0;
@@ -134,7 +135,7 @@ restart:
 		if (error == EFSCORRUPTED)
 			break;
 
-	} while (1);
+	} while ((*nr_to_scan)--);
 
 	if (skipped) {
 		delay(1);
@@ -150,12 +151,15 @@ xfs_inode_ag_iterator(
 				   struct xfs_perag *pag, int flags),
 	int			flags,
 	int			tag,
-	int			exclusive)
+	int			exclusive,
+	int			*nr_to_scan)
 {
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
+	int			nr;
 
+	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
 	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
 		struct xfs_perag	*pag;
 
@@ -165,14 +169,18 @@ xfs_inode_ag_iterator(
 			continue;
 		}
 		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
-						exclusive);
+						exclusive, &nr);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
 			if (error == EFSCORRUPTED)
 				break;
 		}
+		if (nr <= 0)
+			break;
 	}
+	if (nr_to_scan)
+		*nr_to_scan = nr;
 	return XFS_ERROR(last_error);
 }
 
@@ -291,7 +299,7 @@ xfs_sync_data(
 	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
 
 	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
-				      XFS_ICI_NO_TAG, 0);
+				      XFS_ICI_NO_TAG, 0, NULL);
 	if (error)
 		return XFS_ERROR(error);
 
@@ -310,7 +318,7 @@ xfs_sync_attr(
 	ASSERT((flags & ~SYNC_WAIT) == 0);
 
 	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
-				     XFS_ICI_NO_TAG, 0);
+				     XFS_ICI_NO_TAG, 0, NULL);
 }
 
 STATIC int
@@ -673,6 +681,7 @@ __xfs_inode_set_reclaim_tag(
 	radix_tree_tag_set(&pag->pag_ici_root,
 			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
 			   XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable++;
 }
 
 /*
@@ -705,6 +714,7 @@ __xfs_inode_clear_reclaim_tag(
 {
 	radix_tree_tag_clear(&pag->pag_ici_root,
 			     XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	pag->pag_ici_reclaimable--;
 }
 
 /*
@@ -820,10 +830,10 @@ xfs_reclaim_inode(
 	 * call into reclaim to find it in a clean state instead of waiting for
 	 * it now. We also don't return errors here - if the error is transient
 	 * then the next reclaim pass will flush the inode, and if the error
-	 * is permanent then the next sync reclaim will relcaim the inode and
+	 * is permanent then the next sync reclaim will reclaim the inode and
 	 * pass on the error.
 	 */
-	if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
 			"inode 0x%llx background reclaim flush failed with %d",
 			(long long)ip->i_ino, error);
@@ -854,5 +864,93 @@ xfs_reclaim_inodes(
 	int		mode)
 {
 	return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
-					XFS_ICI_RECLAIM_TAG, 1);
+					XFS_ICI_RECLAIM_TAG, 1, NULL);
+}
+
+/*
+ * Shrinker infrastructure.
+ *
+ * This is all far more complex than it needs to be. It adds a global list of
+ * mounts because the shrinkers can only call a global context. We need to make
+ * the shrinkers pass a context to avoid the need for global state.
+ */
+static LIST_HEAD(xfs_mount_list);
+static struct rw_semaphore xfs_mount_list_lock;
+
+static int
+xfs_reclaim_inode_shrink(
+	int		nr_to_scan,
+	gfp_t		gfp_mask)
+{
+	struct xfs_mount *mp;
+	struct xfs_perag *pag;
+	xfs_agnumber_t	ag;
+	int		reclaimable = 0;
+
+	if (nr_to_scan) {
+		if (!(gfp_mask & __GFP_FS))
+			return -1;
+
+		down_read(&xfs_mount_list_lock);
+		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
+			if (nr_to_scan <= 0)
+				break;
+		}
+		up_read(&xfs_mount_list_lock);
+	}
+
+	down_read(&xfs_mount_list_lock);
+	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
+		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+
+			pag = xfs_perag_get(mp, ag);
+			if (!pag->pag_ici_init) {
+				xfs_perag_put(pag);
+				continue;
+			}
+			reclaimable += pag->pag_ici_reclaimable;
+			xfs_perag_put(pag);
+		}
+	}
+	up_read(&xfs_mount_list_lock);
+	return reclaimable;
+}
+
+static struct shrinker xfs_inode_shrinker = {
+	.shrink = xfs_reclaim_inode_shrink,
+	.seeks = DEFAULT_SEEKS,
+};
+
+void __init
+xfs_inode_shrinker_init(void)
+{
+	init_rwsem(&xfs_mount_list_lock);
+	register_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_destroy(void)
+{
+	ASSERT(list_empty(&xfs_mount_list));
+	unregister_shrinker(&xfs_inode_shrinker);
+}
+
+void
+xfs_inode_shrinker_register(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_add_tail(&mp->m_mplist, &xfs_mount_list);
+	up_write(&xfs_mount_list_lock);
+}
+
+void
+xfs_inode_shrinker_unregister(
+	struct xfs_mount	*mp)
+{
+	down_write(&xfs_mount_list_lock);
+	list_del(&mp->m_mplist);
+	up_write(&xfs_mount_list_lock);
 }
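
The shrinker added above relies on the shrinker contract of kernels of this vintage: ->shrink(nr_to_scan, gfp_mask) is called with nr_to_scan == 0 purely to report how many objects could be freed, with a positive count to actually reclaim, and a return of -1 declines reclaim for an unsuitable allocation context (here, when __GFP_FS is not set). A stripped-down sketch of that contract (illustrative only; my_cache_count() and my_cache_reclaim() are hypothetical helpers, not XFS functions):

	/* kernel-context sketch: a minimal 2.6.34-style shrinker callback */
	static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
	{
		if (nr_to_scan) {
			if (!(gfp_mask & __GFP_FS))
				return -1;	/* cannot recurse into the fs */
			my_cache_reclaim(nr_to_scan);	/* free up to nr_to_scan objects */
		}
		return my_cache_count();	/* objects still reclaimable */
	}

	static struct shrinker my_cache_shrinker = {
		.shrink	= my_cache_shrink,
		.seeks	= DEFAULT_SEEKS,
	};

	/* register_shrinker(&my_cache_shrinker) at init time,
	 * unregister_shrinker(&my_cache_shrinker) at teardown. */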
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index d480c346cabb..cdcbaaca9880 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
 int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags, int tag, int write_lock);
+	int flags, int tag, int write_lock, int *nr_to_scan);
+
+void xfs_inode_shrinker_init(void);
+void xfs_inode_shrinker_destroy(void);
+void xfs_inode_shrinker_register(struct xfs_mount *mp);
+void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
 
 #endif
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 5d0ee8d492db..50bee07d6b0e 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -891,7 +891,8 @@ xfs_qm_dqrele_all_inodes(
 	uint		flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags,
+				XFS_ICI_NO_TAG, 0, NULL);
 }
 
 /*------------------------------------------------------------------------*/
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index b1a5a1ff88ea..abb8222b88c9 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -223,6 +223,7 @@ typedef struct xfs_perag {
 	int		pag_ici_init;	/* incore inode cache initialised */
 	rwlock_t	pag_ici_lock;	/* incore inode lock */
 	struct radix_tree_root pag_ici_root;	/* incore inode cache root */
+	int		pag_ici_reclaimable;	/* reclaimable inodes */
 #endif
 	int		pagb_count;	/* pagb slots in use */
 	xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index cd27c9d6c71f..5bba29a07812 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -177,16 +177,26 @@ xfs_swap_extents_check_format(
 	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max)
 		return EINVAL;
 
-	/* Check root block of temp in btree form to max in target */
+	/*
+	 * If we are in a btree format, check that the temp root block will fit
+	 * in the target and that it has enough extents to be in btree format
+	 * in the target.
+	 *
+	 * Note that we have to be careful to allow btree->extent conversions
+	 * (a common defrag case) which will occur when the temp inode is in
+	 * extent format...
+	 */
 	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_BOFF(ip) &&
-	    tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip))
+	    ((XFS_IFORK_BOFF(ip) &&
+	      tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) ||
+	     XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max))
 		return EINVAL;
 
-	/* Check root block of target in btree form to max in temp */
+	/* Reciprocal target->temp btree format checks */
 	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
-	    XFS_IFORK_BOFF(tip) &&
-	    ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip))
+	    ((XFS_IFORK_BOFF(tip) &&
+	      ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) ||
+	     XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max))
 		return EINVAL;
 
 	return 0;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e8fba92d7cd9..2be019136287 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -745,9 +745,16 @@ xfs_log_move_tail(xfs_mount_t *mp,
 
 /*
  * Determine if we have a transaction that has gone to disk
- * that needs to be covered. Log activity needs to be idle (no AIL and
- * nothing in the iclogs). And, we need to be in the right state indicating
- * something has gone out.
+ * that needs to be covered. To begin the transition to the idle state
+ * firstly the log needs to be idle (no AIL and nothing in the iclogs).
+ * If we are then in a state where covering is needed, the caller is informed
+ * that dummy transactions are required to move the log into the idle state.
+ *
+ * Because this is called as part of the sync process, we should also indicate
+ * that dummy transactions should be issued in anything but the covered or
+ * idle states. This ensures that the log tail is accurately reflected in
+ * the log at the end of the sync, hence if a crash occurrs avoids replay
+ * of transactions where the metadata is already on disk.
  */
 int
 xfs_log_need_covered(xfs_mount_t *mp)
@@ -759,17 +766,24 @@ xfs_log_need_covered(xfs_mount_t *mp)
 		return 0;
 
 	spin_lock(&log->l_icloglock);
-	if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
-		(log->l_covered_state == XLOG_STATE_COVER_NEED2))
-			&& !xfs_trans_ail_tail(log->l_ailp)
-			&& xlog_iclogs_empty(log)) {
-		if (log->l_covered_state == XLOG_STATE_COVER_NEED)
-			log->l_covered_state = XLOG_STATE_COVER_DONE;
-		else {
-			ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2);
-			log->l_covered_state = XLOG_STATE_COVER_DONE2;
+	switch (log->l_covered_state) {
+	case XLOG_STATE_COVER_DONE:
+	case XLOG_STATE_COVER_DONE2:
+	case XLOG_STATE_COVER_IDLE:
+		break;
+	case XLOG_STATE_COVER_NEED:
+	case XLOG_STATE_COVER_NEED2:
+		if (!xfs_trans_ail_tail(log->l_ailp) &&
+		    xlog_iclogs_empty(log)) {
+			if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+				log->l_covered_state = XLOG_STATE_COVER_DONE;
+			else
+				log->l_covered_state = XLOG_STATE_COVER_DONE2;
 		}
+		/* FALLTHRU */
+	default:
 		needed = 1;
+		break;
 	}
 	spin_unlock(&log->l_icloglock);
 	return needed;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4fa0bc7b983e..9ff48a16a7ee 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,6 +259,7 @@ typedef struct xfs_mount {
 	wait_queue_head_t	m_wait_single_sync_task;
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
+	struct list_head	m_mplist;	/* inode shrinker mount list */
 } xfs_mount_t;
 
 /*