Merge branch 'master' into for-linus

author: Chris Metcalf <cmetcalf@tilera.com> 2010-08-13 19:59:15 -0400
committer: Chris Metcalf <cmetcalf@tilera.com> 2010-08-13 19:59:15 -0400
commit: 7d72e6fa56c4100b9669efe0044f77ed9eb785a1 (patch)
tree: 5e90bf4969809a1ab20b97432b85be20ccfaa1f4 /fs
parent: ba00376b0b13f234d839541a7b36a5bf5c2a4036 (diff)
parent: 2be1f3a73dd02e38e181cf5abacb3d45a6a2d6b8 (diff)
112 files changed, 2320 insertions, 1013 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index d97c34a24f7a..c7c23eab9440 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1263,10 +1263,19 @@ static int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
                return PTR_ERR(fid);
        retval = p9_client_setattr(fid, &p9attr);
-        if (retval >= 0)
+        if (retval < 0)
-                retval = inode_setattr(dentry->d_inode, iattr);
+                return retval;
-        return retval;
+        if ((iattr->ia_valid & ATTR_SIZE) &&
+            iattr->ia_size != i_size_read(dentry->d_inode)) {
+                retval = vmtruncate(dentry->d_inode, iattr->ia_size);
+                if (retval)
+                        return retval;
+        }
+        setattr_copy(dentry->d_inode, iattr);
+        mark_inode_dirty(dentry->d_inode);
+        return 0;
 }
 /**
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index ffea35c63879..0d5eeadf6121 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -31,21 +31,20 @@ static struct afs_cell *afs_cell_root;
 * allocate a cell record and fill in its name, VL server address list and
 * allocate an anonymous key
 */
-static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
+static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen,
+                                       char *vllist)
 {
        struct afs_cell *cell;
        struct key *key;
-        size_t namelen;
        char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
        char  *dvllist = NULL, *_vllist = NULL;
        char  delimiter = ':';
        int ret;
-        _enter("%s,%s", name, vllist);
+        _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist);
        BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
-        namelen = strlen(name);
        if (namelen > AFS_MAXCELLNAME) {
                _leave(" = -ENAMETOOLONG");
                return ERR_PTR(-ENAMETOOLONG);
@@ -73,6 +72,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
        if (!vllist || strlen(vllist) < 7) {
                ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL);
                if (ret < 0) {
+                        if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY)
+                                /* translate these errors into something
+                                 * userspace might understand */
+                                ret = -EDESTADDRREQ;
                        _leave(" = %d", ret);
                        return ERR_PTR(ret);
                }
@@ -138,26 +141,29 @@ error:
 }
 /*
- * create a cell record
+ * afs_cell_create() - create a cell record
- * - "name" is the name of the cell
+ * @name:       is the name of the cell.
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ * @namesz:     is the strlen of the cell name.
+ * @vllist:     is a colon separated list of IP addresses in "a.b.c.d" format.
+ * @retref:     is T to return the cell reference when the cell exists.
 */
-struct afs_cell *afs_cell_create(const char *name, char *vllist)
+struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
+                                 char *vllist, bool retref)
 {
        struct afs_cell *cell;
        int ret;
-        _enter("%s,%s", name, vllist);
+        _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist);
        down_write(&afs_cells_sem);
        read_lock(&afs_cells_lock);
        list_for_each_entry(cell, &afs_cells, link) {
-                if (strcasecmp(cell->name, name) == 0)
+                if (strncasecmp(cell->name, name, namesz) == 0)
                        goto duplicate_name;
        }
        read_unlock(&afs_cells_lock);
-        cell = afs_cell_alloc(name, vllist);
+        cell = afs_cell_alloc(name, namesz, vllist);
        if (IS_ERR(cell)) {
                _leave(" = %ld", PTR_ERR(cell));
                up_write(&afs_cells_sem);
@@ -197,8 +203,18 @@ error:
        return ERR_PTR(ret);
 duplicate_name:
+        if (retref && !IS_ERR(cell))
+                afs_get_cell(cell);
        read_unlock(&afs_cells_lock);
        up_write(&afs_cells_sem);
+        if (retref) {
+                _leave(" = %p", cell);
+                return cell;
+        }
+        _leave(" = -EEXIST");
        return ERR_PTR(-EEXIST);
 }
@@ -229,7 +245,7 @@ int afs_cell_init(char *rootcell)
                *cp++ = 0;
        /* allocate a cell record for the root cell */
-        new_root = afs_cell_create(rootcell, cp);
+        new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false);
        if (IS_ERR(new_root)) {
                _leave(" = %ld", PTR_ERR(new_root));
                return PTR_ERR(new_root);
@@ -249,11 +265,12 @@ int afs_cell_init(char *rootcell)
 /*
 * lookup a cell record
 */
-struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
+struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
+                                 bool dns_cell)
 {
        struct afs_cell *cell;
-        _enter("\"%*.*s\",", namesz, namesz, name ? name : "");
+        _enter("\"%*.*s\",", namesz, namesz, name ?: "");
        down_read(&afs_cells_sem);
        read_lock(&afs_cells_lock);
@@ -267,6 +284,8 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
                        }
                }
                cell = ERR_PTR(-ENOENT);
+                if (dns_cell)
+                        goto create_cell;
        found:
                ;
        } else {
@@ -289,6 +308,15 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
        up_read(&afs_cells_sem);
        _leave(" = %p", cell);
        return cell;
+create_cell:
+        read_unlock(&afs_cells_lock);
+        up_read(&afs_cells_sem);
+        cell = afs_cell_create(name, namesz, NULL, true);
+        _leave(" = %p", cell);
+        return cell;
 }
 #if 0
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b42d5cc1d6d2..0d38c09bd55e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -477,6 +477,40 @@ static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
 }
 /*
+ * Try to auto mount the mountpoint with pseudo directory, if the autocell
+ * operation is set.
+ */
+static struct inode *afs_try_auto_mntpt(
+        int ret, struct dentry *dentry, struct inode *dir, struct key *key,
+        struct afs_fid *fid)
+{
+        const char *devname = dentry->d_name.name;
+        struct afs_vnode *vnode = AFS_FS_I(dir);
+        struct inode *inode;
+        _enter("%d, %p{%s}, {%x:%u}, %p",
+               ret, dentry, devname, vnode->fid.vid, vnode->fid.vnode, key);
+        if (ret != -ENOENT ||
+            !test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
+                goto out;
+        inode = afs_iget_autocell(dir, devname, strlen(devname), key);
+        if (IS_ERR(inode)) {
+                ret = PTR_ERR(inode);
+                goto out;
+        }
+        *fid = AFS_FS_I(inode)->fid;
+        _leave("= %p", inode);
+        return inode;
+out:
+        _leave("= %d", ret);
+        return ERR_PTR(ret);
+}
+/*
 * look up an entry in a directory
 */
 static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
@@ -520,6 +554,13 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
        ret = afs_do_lookup(dir, dentry, &fid, key);
        if (ret < 0) {
+                inode = afs_try_auto_mntpt(ret, dentry, dir, key, &fid);
+                if (!IS_ERR(inode)) {
+                        key_put(key);
+                        goto success;
+                }
+                ret = PTR_ERR(inode);
                key_put(key);
                if (ret == -ENOENT) {
                        d_add(dentry, NULL);
@@ -539,6 +580,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
                return ERR_CAST(inode);
        }
+success:
        dentry->d_op = &afs_fs_dentry_operations;
        d_add(dentry, inode);
@@ -696,8 +738,9 @@ static int afs_d_delete(struct dentry *dentry)
                goto zap;
        if (dentry->d_inode &&
-            test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
+            (test_bit(AFS_VNODE_DELETED,   &AFS_FS_I(dentry->d_inode)->flags) ||
-                        goto zap;
+             test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags)))
+                goto zap;
        _leave(" = 0 [keep]");
        return 0;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 320ffef11574..0747339011c3 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,6 +19,8 @@
 #include <linux/fs.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
 #include "internal.h"
 struct afs_iget_data {
@@ -102,6 +104,16 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
 }
 /*
+ * iget5() comparator for inode created by autocell operations
+ *
+ * These pseudo inodes don't match anything.
+ */
+static int afs_iget5_autocell_test(struct inode *inode, void *opaque)
+{
+        return 0;
+}
+/*
 * iget5() inode initialiser
 */
 static int afs_iget5_set(struct inode *inode, void *opaque)
@@ -118,6 +130,67 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
 }
 /*
+ * inode retrieval for autocell
+ */
+struct inode *afs_iget_autocell(struct inode *dir, const char *dev_name,
+                                int namesz, struct key *key)
+{
+        struct afs_iget_data data;
+        struct afs_super_info *as;
+        struct afs_vnode *vnode;
+        struct super_block *sb;
+        struct inode *inode;
+        static atomic_t afs_autocell_ino;
+        _enter("{%x:%u},%*.*s,",
+               AFS_FS_I(dir)->fid.vid, AFS_FS_I(dir)->fid.vnode,
+               namesz, namesz, dev_name ?: "");
+        sb = dir->i_sb;
+        as = sb->s_fs_info;
+        data.volume = as->volume;
+        data.fid.vid = as->volume->vid;
+        data.fid.unique = 0;
+        data.fid.vnode = 0;
+        inode = iget5_locked(sb, atomic_inc_return(&afs_autocell_ino),
+                             afs_iget5_autocell_test, afs_iget5_set,
+                             &data);
+        if (!inode) {
+                _leave(" = -ENOMEM");
+                return ERR_PTR(-ENOMEM);
+        }
+        _debug("GOT INODE %p { ino=%lu, vl=%x, vn=%x, u=%x }",
+               inode, inode->i_ino, data.fid.vid, data.fid.vnode,
+               data.fid.unique);
+        vnode = AFS_FS_I(inode);
+        /* there shouldn't be an existing inode */
+        BUG_ON(!(inode->i_state & I_NEW));
+        inode->i_size           = 0;
+        inode->i_mode           = S_IFDIR | S_IRUGO | S_IXUGO;
+        inode->i_op             = &afs_autocell_inode_operations;
+        inode->i_nlink          = 2;
+        inode->i_uid            = 0;
+        inode->i_gid            = 0;
+        inode->i_ctime.tv_sec   = get_seconds();
+        inode->i_ctime.tv_nsec  = 0;
+        inode->i_atime          = inode->i_mtime = inode->i_ctime;
+        inode->i_blocks         = 0;
+        inode->i_version        = 0;
+        inode->i_generation     = 0;
+        set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
+        inode->i_flags |= S_NOATIME;
+        unlock_new_inode(inode);
+        _leave(" = %p", inode);
+        return inode;
+}
+/*
 * inode retrieval
 */
 struct inode *afs_iget(struct super_block *sb, struct key *key,
@@ -314,6 +387,19 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 }
 /*
+ * discard an AFS inode
+ */
+int afs_drop_inode(struct inode *inode)
+{
+        _enter("");
+        if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
+                return generic_delete_inode(inode);
+        else
+                return generic_drop_inode(inode);
+}
+/*
 * clear an AFS inode
 */
 void afs_evict_inode(struct inode *inode)
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 8679089ce9a1..cca8eef736fc 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -42,6 +42,7 @@ typedef enum {
 struct afs_mount_params {
        bool                    rwpath;         /* T if the parent should be considered R/W */
        bool                    force;          /* T to force cell type */
+        bool                    autocell;       /* T if set auto mount operation */
        afs_voltype_t           type;           /* type of volume requested */
        int                     volnamesz;      /* size of volume name */
        const char              *volname;       /* name of volume to mount */
@@ -358,6 +359,8 @@ struct afs_vnode {
 #define AFS_VNODE_READLOCKED    7               /* set if vnode is read-locked on the server */
 #define AFS_VNODE_WRITELOCKED   8               /* set if vnode is write-locked on the server */
 #define AFS_VNODE_UNLOCKING     9               /* set if vnode is being unlocked on the server */
+#define AFS_VNODE_AUTOCELL      10              /* set if Vnode is an auto mount point */
+#define AFS_VNODE_PSEUDODIR     11              /* set if Vnode is a pseudo directory */
        long                    acl_order;      /* ACL check count (callback break count) */
@@ -468,8 +471,8 @@ extern struct list_head afs_proc_cells;
 #define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
 extern int afs_cell_init(char *);
-extern struct afs_cell *afs_cell_create(const char *, char *);
+extern struct afs_cell *afs_cell_create(const char *, unsigned, char *, bool);
-extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned, bool);
 extern struct afs_cell *afs_grab_cell(struct afs_cell *);
 extern void afs_put_cell(struct afs_cell *);
 extern void afs_cell_purge(void);
@@ -558,6 +561,8 @@ extern int afs_fs_release_lock(struct afs_server *, struct key *,
 /*
 * inode.c
 */
+extern struct inode *afs_iget_autocell(struct inode *, const char *, int,
+                                       struct key *);
 extern struct inode *afs_iget(struct super_block *, struct key *,
                              struct afs_fid *, struct afs_file_status *,
                              struct afs_callback *);
@@ -566,6 +571,7 @@ extern int afs_validate(struct afs_vnode *, struct key *);
 extern int afs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int afs_setattr(struct dentry *, struct iattr *);
 extern void afs_evict_inode(struct inode *);
+extern int afs_drop_inode(struct inode *);
 /*
 * main.c
@@ -581,6 +587,7 @@ extern int afs_abort_to_error(u32);
 * mntpt.c
 */
 extern const struct inode_operations afs_mntpt_inode_operations;
+extern const struct inode_operations afs_autocell_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
 extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
@@ -752,12 +759,6 @@ extern unsigned afs_debug;
 #define dbgprintk(FMT,...) \
        printk("[%-6.6s] "FMT"\n", current->comm ,##__VA_ARGS__)
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf,1,2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
 #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
 #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
 #define kdebug(FMT,...) dbgprintk("    "FMT ,##__VA_ARGS__)
@@ -792,9 +793,9 @@ do {							\
 } while (0)
 #else
-#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
+#define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__)
-#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
+#define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__)
-#define _debug(FMT,...) _dbprintk("    "FMT ,##__VA_ARGS__)
+#define _debug(FMT,...) no_printk("    "FMT ,##__VA_ARGS__)
 #endif
 /*
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index a9e23039ea34..6d552686c498 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -38,6 +38,11 @@ const struct inode_operations afs_mntpt_inode_operations = {
        .getattr        = afs_getattr,
 };
+const struct inode_operations afs_autocell_inode_operations = {
+        .follow_link    = afs_mntpt_follow_link,
+        .getattr        = afs_getattr,
+};
 static LIST_HEAD(afs_vfsmounts);
 static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
@@ -136,20 +141,16 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 {
        struct afs_super_info *super;
        struct vfsmount *mnt;
+        struct afs_vnode *vnode;
        struct page *page;
-        size_t size;
+        char *devname, *options;
-        char *buf, *devname, *options;
+        bool rwpath = false;
        int ret;
        _enter("{%s}", mntpt->d_name.name);
        BUG_ON(!mntpt->d_inode);
-        ret = -EINVAL;
-        size = mntpt->d_inode->i_size;
-        if (size > PAGE_SIZE - 1)
-                goto error_no_devname;
        ret = -ENOMEM;
        devname = (char *) get_zeroed_page(GFP_KERNEL);
        if (!devname)
@@ -159,28 +160,59 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
        if (!options)
                goto error_no_options;
-        /* read the contents of the AFS special symlink */
+        vnode = AFS_FS_I(mntpt->d_inode);
-        page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
+        if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) {
-        if (IS_ERR(page)) {
+                /* if the directory is a pseudo directory, use the d_name */
-                ret = PTR_ERR(page);
+                static const char afs_root_cell[] = ":root.cell.";
-                goto error_no_page;
+                unsigned size = mntpt->d_name.len;
+                ret = -ENOENT;
+                if (size < 2 || size > AFS_MAXCELLNAME)
+                        goto error_no_page;
+                if (mntpt->d_name.name[0] == '.') {
+                        devname[0] = '#';
+                        memcpy(devname + 1, mntpt->d_name.name, size - 1);
+                        memcpy(devname + size, afs_root_cell,
+                               sizeof(afs_root_cell));
+                        rwpath = true;
+                } else {
+                        devname[0] = '%';
+                        memcpy(devname + 1, mntpt->d_name.name, size);
+                        memcpy(devname + size + 1, afs_root_cell,
+                               sizeof(afs_root_cell));
+                }
+        } else {
+                /* read the contents of the AFS special symlink */
+                loff_t size = i_size_read(mntpt->d_inode);
+                char *buf;
+                ret = -EINVAL;
+                if (size > PAGE_SIZE - 1)
+                        goto error_no_page;
+                page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL);
+                if (IS_ERR(page)) {
+                        ret = PTR_ERR(page);
+                        goto error_no_page;
+                }
+                ret = -EIO;
+                if (PageError(page))
+                        goto error;
+                buf = kmap_atomic(page, KM_USER0);
+                memcpy(devname, buf, size);
+                kunmap_atomic(buf, KM_USER0);
+                page_cache_release(page);
+                page = NULL;
        }
-        ret = -EIO;
-        if (PageError(page))
-                goto error;
-        buf = kmap_atomic(page, KM_USER0);
-        memcpy(devname, buf, size);
-        kunmap_atomic(buf, KM_USER0);
-        page_cache_release(page);
-        page = NULL;
        /* work out what options we want */
        super = AFS_FS_S(mntpt->d_sb);
        memcpy(options, "cell=", 5);
        strcpy(options + 5, super->volume->cell->name);
-        if (super->volume->type == AFSVL_RWVOL)
+        if (super->volume->type == AFSVL_RWVOL || rwpath)
                strcat(options, ",rwpath");
        /* try and do the mount */
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 852739d262a9..096b23f821a1 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -294,7 +294,7 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
        if (strcmp(kbuf, "add") == 0) {
                struct afs_cell *cell;
-                cell = afs_cell_create(name, args);
+                cell = afs_cell_create(name, strlen(name), args, false);
                if (IS_ERR(cell)) {
                        ret = PTR_ERR(cell);
                        goto done;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 67cf810e0fd6..654d8fdbf01f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -100,6 +100,7 @@ int afs_open_socket(void)
        ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
        if (ret < 0) {
                sock_release(socket);
+                destroy_workqueue(afs_async_calls);
                _leave(" = %d [bind]", ret);
                return ret;
        }
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 9cf80f02da16..77e1e5a61154 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -16,6 +16,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
@@ -48,6 +49,7 @@ struct file_system_type afs_fs_type = {
 static const struct super_operations afs_super_ops = {
        .statfs         = afs_statfs,
        .alloc_inode    = afs_alloc_inode,
+        .drop_inode     = afs_drop_inode,
        .destroy_inode  = afs_destroy_inode,
        .evict_inode    = afs_evict_inode,
        .put_super      = afs_put_super,
@@ -62,12 +64,14 @@ enum {
        afs_opt_cell,
        afs_opt_rwpath,
        afs_opt_vol,
+        afs_opt_autocell,
 };
 static const match_table_t afs_options_list = {
        { afs_opt_cell,         "cell=%s"       },
        { afs_opt_rwpath,       "rwpath"        },
        { afs_opt_vol,          "vol=%s"        },
+        { afs_opt_autocell,     "autocell"      },
        { afs_no_opt,           NULL            },
 };
@@ -151,7 +155,8 @@ static int afs_parse_options(struct afs_mount_params *params,
                switch (token) {
                case afs_opt_cell:
                        cell = afs_cell_lookup(args[0].from,
-                                               args[0].to - args[0].from);
+                                               args[0].to - args[0].from,
+                                               false);
                        if (IS_ERR(cell))
                                return PTR_ERR(cell);
                        afs_put_cell(params->cell);
@@ -166,6 +171,10 @@ static int afs_parse_options(struct afs_mount_params *params,
                        *devname = args[0].from;
                        break;
+                case afs_opt_autocell:
+                        params->autocell = 1;
+                        break;
                default:
                        printk(KERN_ERR "kAFS:"
                               " Unknown or invalid mount option: '%s'\n", p);
@@ -252,10 +261,10 @@ static int afs_parse_device_name(struct afs_mount_params *params,
        /* lookup the cell record */
        if (cellname || !params->cell) {
-                cell = afs_cell_lookup(cellname, cellnamesz);
+                cell = afs_cell_lookup(cellname, cellnamesz, true);
                if (IS_ERR(cell)) {
-                        printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
+                        printk(KERN_ERR "kAFS: unable to lookup cell '%*.*s'\n",
-                               cellname ?: "");
+                               cellnamesz, cellnamesz, cellname ?: "");
                        return PTR_ERR(cell);
                }
                afs_put_cell(params->cell);
@@ -321,6 +330,9 @@ static int afs_fill_super(struct super_block *sb, void *data)
        if (IS_ERR(inode))
                goto error_inode;
+        if (params->autocell)
+                set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags);
        ret = -ENOMEM;
        root = d_alloc_root(inode);
        if (!root)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 48e056e70fd6..cb1bd38dc08c 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -204,8 +204,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags)
        }
        /* Initialize expiry counter after successful mount */
-        if (ino)
+        ino->last_used = jiffies;
-                ino->last_used = jiffies;
        spin_lock(&sbi->fs_lock);
        ino->flags &= ~AUTOFS_INF_PENDING;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 66411463b734..50e8c8582faa 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1340,10 +1340,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
        /*
         * hooks: /n/, see "layering violations".
         */
-        ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+        if (!for_part) {
-        if (ret != 0) {
+                ret = devcgroup_inode_permission(bdev->bd_inode, perm);
-                bdput(bdev);
+                if (ret != 0) {
-                return ret;
+                        bdput(bdev);
+                        return ret;
+                }
        }
 restart:
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 24eb0d37241a..727caedcdd92 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -552,8 +552,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
 */
 static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
 {
-        struct fs_struct *fs;
+        struct path path;
-        struct dentry *dir;
        const struct cred *saved_cred;
        int ret;
@@ -573,24 +572,21 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
        }
        /* extract the directory dentry from the cwd */
-        fs = current->fs;
+        get_fs_pwd(current->fs, &path);
-        read_lock(&fs->lock);
-        dir = dget(fs->pwd.dentry);
-        read_unlock(&fs->lock);
-        if (!S_ISDIR(dir->d_inode->i_mode))
+        if (!S_ISDIR(path.dentry->d_inode->i_mode))
                goto notdir;
        cachefiles_begin_secure(cache, &saved_cred);
-        ret = cachefiles_cull(cache, dir, args);
+        ret = cachefiles_cull(cache, path.dentry, args);
        cachefiles_end_secure(cache, saved_cred);
-        dput(dir);
+        path_put(&path);
        _leave(" = %d", ret);
        return ret;
 notdir:
-        dput(dir);
+        path_put(&path);
        kerror("cull command requires dirfd to be a directory");
        return -ENOTDIR;
@@ -628,8 +624,7 @@ inval:
 */
 static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
 {
-        struct fs_struct *fs;
+        struct path path;
-        struct dentry *dir;
        const struct cred *saved_cred;
        int ret;
@@ -649,24 +644,21 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
        }
        /* extract the directory dentry from the cwd */
-        fs = current->fs;
+        get_fs_pwd(current->fs, &path);
-        read_lock(&fs->lock);
-        dir = dget(fs->pwd.dentry);
-        read_unlock(&fs->lock);
-        if (!S_ISDIR(dir->d_inode->i_mode))
+        if (!S_ISDIR(path.dentry->d_inode->i_mode))
                goto notdir;
        cachefiles_begin_secure(cache, &saved_cred);
-        ret = cachefiles_check_in_use(cache, dir, args);
+        ret = cachefiles_check_in_use(cache, path.dentry, args);
        cachefiles_end_secure(cache, saved_cred);
-        dput(dir);
+        path_put(&path);
        //_leave(" = %d", ret);
        return ret;
 notdir:
-        dput(dir);
+        path_put(&path);
        kerror("inuse command requires dirfd to be a directory");
        return -ENOTDIR;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index a8cd821226da..bd6bc1bde2d7 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -267,13 +267,6 @@ do {									\
 #define dbgprintk(FMT, ...) \
        printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline void _dbprintk(const char *fmt, ...)
-        __attribute__((format(printf, 1, 2)));
-static inline void _dbprintk(const char *fmt, ...)
-{
-}
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
@@ -304,9 +297,9 @@ do {							\
 } while (0)
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 #if 1 /* defined(__KDEBUGALL) */
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 6a660e610be8..278e1172600d 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@ ifneq ($(KERNELRELEASE),)
 obj-$(CONFIG_CEPH_FS) += ceph.o
-ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
        export.o caps.o snap.o xattr.o \
        messenger.o msgpool.o buffer.o pagelist.o \
        mds_client.o mdsmap.o \
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d9c60b84949a..5598a0d02295 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -309,7 +309,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
                        zero_user_segment(page, s, PAGE_CACHE_SIZE);
                }
-                if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
+                if (add_to_page_cache_lru(page, mapping, page->index,
+                                          GFP_NOFS)) {
                        page_cache_release(page);
                        dout("readpages %p add_to_page_cache failed %p\n",
                             inode, page);
@@ -552,7 +553,7 @@ static void writepages_finish(struct ceph_osd_request *req,
                 * page truncation thread, possibly losing some data that
                 * raced its way in
                 */
-                if ((issued & CEPH_CAP_FILE_CACHE) == 0)
+                if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
                        generic_error_remove_page(inode->i_mapping, page);
                unlock_page(page);
@@ -797,9 +798,12 @@ get_more_pages:
                        dout("%p will write page %p idx %lu\n",
                             inode, page, page->index);
-                        writeback_stat = atomic_long_inc_return(&client->writeback_count);
+                        writeback_stat =
-                        if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+                               atomic_long_inc_return(&client->writeback_count);
-                                set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+                        if (writeback_stat > CONGESTION_ON_THRESH(
+                                    client->mount_args->congestion_kb)) {
+                                set_bdi_congested(&client->backing_dev_info,
+                                                  BLK_RW_ASYNC);
                        }
                        set_page_writeback(page);
@@ -1036,7 +1040,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
                *pagep = page;
                dout("write_begin file %p inode %p page %p %d~%d\n", file,
-                inode, page, (int)pos, (int)len);
+                     inode, page, (int)pos, (int)len);
                r = ceph_update_writeable_page(file, pos, len, page);
        } while (r == -EAGAIN);
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
index 67b2c030924b..eb2a666b0be7 100644
--- a/fs/ceph/armor.c
+++ b/fs/ceph/armor.c
@@ -1,11 +1,15 @@
 #include <linux/errno.h>
+int ceph_armor(char *dst, const char *src, const char *end);
+int ceph_unarmor(char *dst, const char *src, const char *end);
 /*
 * base64 encode/decode.
 */
-const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char *pem_key =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 static int encode_bits(int c)
 {
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 89490beaf537..6d2e30600627 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -20,7 +20,7 @@ static u32 supported_protocols[] = {
        CEPH_AUTH_CEPHX
 };
-int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
 {
        switch (protocol) {
        case CEPH_AUTH_NONE:
@@ -133,8 +133,8 @@ bad:
        return -ERANGE;
 }
-int ceph_build_auth_request(struct ceph_auth_client *ac,
+static int ceph_build_auth_request(struct ceph_auth_client *ac,
-                           void *msg_buf, size_t msg_len)
+                                   void *msg_buf, size_t msg_len)
 {
        struct ceph_mon_request_header *monhdr = msg_buf;
        void *p = monhdr + 1;
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 6d44053ecff1..582e0b2caf8a 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -87,8 +87,8 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret,
 /*
 * get existing (or insert new) ticket handler
 */
-struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
+static struct ceph_x_ticket_handler *
-                                                 int service)
+get_ticket_handler(struct ceph_auth_client *ac, int service)
 {
        struct ceph_x_ticket_handler *th;
        struct ceph_x_info *xi = ac->private;
@@ -429,7 +429,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
                auth->struct_v = 1;
                auth->key = 0;
                for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
-                        auth->key ^= *u;
+                        auth->key ^= *(__le64 *)u;
                dout(" server_challenge %llx client_challenge %llx key %llx\n",
                     xi->server_challenge, le64_to_cpu(auth->client_challenge),
                     le64_to_cpu(auth->key));
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index c67535d70aa6..cd39f17021de 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -47,22 +47,6 @@ void ceph_buffer_release(struct kref *kref)
        kfree(b);
 }
-int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
-{
-        b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
-        if (b->vec.iov_base) {
-                b->is_vmalloc = false;
-        } else {
-                b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
-                b->is_vmalloc = true;
-        }
-        if (!b->vec.iov_base)
-                return -ENOMEM;
-        b->alloc_len = len;
-        b->vec.iov_len = len;
-        return 0;
-}
 int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
 {
        size_t len;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b81be9a56487..7bf182b03973 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -113,58 +113,41 @@ const char *ceph_cap_string(int caps)
        return cap_str[i];
 }
-/*
+void ceph_caps_init(struct ceph_mds_client *mdsc)
- * Cap reservations
- *
- * Maintain a global pool of preallocated struct ceph_caps, referenced
- * by struct ceph_caps_reservations.  This ensures that we preallocate
- * memory needed to successfully process an MDS response.  (If an MDS
- * sends us cap information and we fail to process it, we will have
- * problems due to the client and MDS being out of sync.)
- *
- * Reservations are 'owned' by a ceph_cap_reservation context.
- */
-static spinlock_t caps_list_lock;
-static struct list_head caps_list;  /* unused (reserved or unreserved) */
-static int caps_total_count;        /* total caps allocated */
-static int caps_use_count;          /* in use */
-static int caps_reserve_count;      /* unused, reserved */
-static int caps_avail_count;        /* unused, unreserved */
-static int caps_min_count;          /* keep at least this many (unreserved) */
-void __init ceph_caps_init(void)
 {
-        INIT_LIST_HEAD(&caps_list);
+        INIT_LIST_HEAD(&mdsc->caps_list);
-        spin_lock_init(&caps_list_lock);
+        spin_lock_init(&mdsc->caps_list_lock);
 }
-void ceph_caps_finalize(void)
+void ceph_caps_finalize(struct ceph_mds_client *mdsc)
 {
        struct ceph_cap *cap;
-        spin_lock(&caps_list_lock);
+        spin_lock(&mdsc->caps_list_lock);
-        while (!list_empty(&caps_list)) {
+        while (!list_empty(&mdsc->caps_list)) {
-                cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+                cap = list_first_entry(&mdsc->caps_list,
+                                       struct ceph_cap, caps_item);
                list_del(&cap->caps_item);
                kmem_cache_free(ceph_cap_cachep, cap);
        }
-        caps_total_count = 0;
+        mdsc->caps_total_count = 0;
-        caps_avail_count = 0;
+        mdsc->caps_avail_count = 0;
-        caps_use_count = 0;
+        mdsc->caps_use_count = 0;
-        caps_reserve_count = 0;
+        mdsc->caps_reserve_count = 0;
-        caps_min_count = 0;
+        mdsc->caps_min_count = 0;
-        spin_unlock(&caps_list_lock);
+        spin_unlock(&mdsc->caps_list_lock);
 }
-void ceph_adjust_min_caps(int delta)
+void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
 {
-        spin_lock(&caps_list_lock);
+        spin_lock(&mdsc->caps_list_lock);
-        caps_min_count += delta;
+        mdsc->caps_min_count += delta;
-        BUG_ON(caps_min_count < 0);
+        BUG_ON(mdsc->caps_min_count < 0);
-        spin_unlock(&caps_list_lock);
+        spin_unlock(&mdsc->caps_list_lock);
 }
-int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+int ceph_reserve_caps(struct ceph_mds_client *mdsc,
+                      struct ceph_cap_reservation *ctx, int need)
 {
        int i;
        struct ceph_cap *cap;
@@ -176,16 +159,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
        dout("reserve caps ctx=%p need=%d\n", ctx, need);
        /* first reserve any caps that are already allocated */
-        spin_lock(&caps_list_lock);
+        spin_lock(&mdsc->caps_list_lock);
-        if (caps_avail_count >= need)
+        if (mdsc->caps_avail_count >= need)
                have = need;
        else
-                have = caps_avail_count;
+                have = mdsc->caps_avail_count;
-        caps_avail_count -= have;
+        mdsc->caps_avail_count -= have;
-        caps_reserve_count += have;
+        mdsc->caps_reserve_count += have;
-        BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+        BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
-               caps_avail_count);
+                                         mdsc->caps_reserve_count +
-        spin_unlock(&caps_list_lock);
+                                         mdsc->caps_avail_count);
+        spin_unlock(&mdsc->caps_list_lock);
        for (i = have; i < need; i++) {
                cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
@@ -198,19 +182,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
        }
        BUG_ON(have + alloc != need);
-        spin_lock(&caps_list_lock);
+        spin_lock(&mdsc->caps_list_lock);
-        caps_total_count += alloc;
+        mdsc->caps_total_count += alloc;
-        caps_reserve_count += alloc;
+        mdsc->caps_reserve_count += alloc;
-        list_splice(&newcaps, &caps_list);
+        list_splice(&newcaps, &mdsc->caps_list);
-        BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+        BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
-               caps_avail_count);
+                                         mdsc->caps_reserve_count +
-        spin_unlock(&caps_list_lock);
+                                         mdsc->caps_avail_count);
+        spin_unlock(&mdsc->caps_list_lock);
        ctx->count = need;
        dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
-             ctx, caps_total_count, caps_use_count, caps_reserve_count,
+             ctx, mdsc->caps_total_count, mdsc->caps_use_count,
-             caps_avail_count);
+             mdsc->caps_reserve_count, mdsc->caps_avail_count);
        return 0;
 out_alloc_count:
@@ -220,26 +205,29 @@ out_alloc_count:
        return ret;
 }
-int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+                        struct ceph_cap_reservation *ctx)
 {
        dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
        if (ctx->count) {
-                spin_lock(&caps_list_lock);
+                spin_lock(&mdsc->caps_list_lock);
-                BUG_ON(caps_reserve_count < ctx->count);
+                BUG_ON(mdsc->caps_reserve_count < ctx->count);
-                caps_reserve_count -= ctx->count;
+                mdsc->caps_reserve_count -= ctx->count;
-                caps_avail_count += ctx->count;
+                mdsc->caps_avail_count += ctx->count;
                ctx->count = 0;
                dout("unreserve caps %d = %d used + %d resv + %d avail\n",
-                     caps_total_count, caps_use_count, caps_reserve_count,
+                     mdsc->caps_total_count, mdsc->caps_use_count,
-                     caps_avail_count);
+                     mdsc->caps_reserve_count, mdsc->caps_avail_count);
-                BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+                BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
-                       caps_avail_count);
+                                                 mdsc->caps_reserve_count +
-                spin_unlock(&caps_list_lock);
+                                                 mdsc->caps_avail_count);
+                spin_unlock(&mdsc->caps_list_lock);
        }
        return 0;
 }
-static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
+                                struct ceph_cap_reservation *ctx)
 {
        struct ceph_cap *cap = NULL;
@@ -247,71 +235,74 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
        if (!ctx) {
                cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
                if (cap) {
-                        caps_use_count++;
+                        mdsc->caps_use_count++;
-                        caps_total_count++;
+                        mdsc->caps_total_count++;
                }
                return cap;
        }
-        spin_lock(&caps_list_lock);
+        spin_lock(&mdsc->caps_list_lock);
        dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
-             ctx, ctx->count, caps_total_count, caps_use_count,
+             ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
-             caps_reserve_count, caps_avail_count);
+             mdsc->caps_reserve_count, mdsc->caps_avail_count);
        BUG_ON(!ctx->count);
-        BUG_ON(ctx->count > caps_reserve_count);
+        BUG_ON(ctx->count > mdsc->caps_reserve_count);
-        BUG_ON(list_empty(&caps_list));
+        BUG_ON(list_empty(&mdsc->caps_list));
        ctx->count--;
-        caps_reserve_count--;
+        mdsc->caps_reserve_count--;
-        caps_use_count++;
+        mdsc->caps_use_count++;
-        cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+        cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
        list_del(&cap->caps_item);
-        BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+        BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
-               caps_avail_count);
+               mdsc->caps_reserve_count + mdsc->caps_avail_count);
-        spin_unlock(&caps_list_lock);
+        spin_unlock(&mdsc->caps_list_lock);
        return cap;
 }
-void ceph_put_cap(struct ceph_cap *cap)
+void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
 {
-        spin_lock(&caps_list_lock);
+        spin_lock(&mdsc->caps_list_lock);
        dout("put_cap %p %d = %d used + %d resv + %d avail\n",
-             cap, caps_total_count, caps_use_count,
+             cap, mdsc->caps_total_count, mdsc->caps_use_count,
-             caps_reserve_count, caps_avail_count);
+             mdsc->caps_reserve_count, mdsc->caps_avail_count);
-        caps_use_count--;
+        mdsc->caps_use_count--;
        /*
         * Keep some preallocated caps around (ceph_min_count), to
         * avoid lots of free/alloc churn.
         */
-        if (caps_avail_count >= caps_reserve_count + caps_min_count) {
+        if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
-                caps_total_count--;
+                                      mdsc->caps_min_count) {
+                mdsc->caps_total_count--;
                kmem_cache_free(ceph_cap_cachep, cap);
        } else {
-                caps_avail_count++;
+                mdsc->caps_avail_count++;
-                list_add(&cap->caps_item, &caps_list);
+                list_add(&cap->caps_item, &mdsc->caps_list);
        }
-        BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+        BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
-               caps_avail_count);
+               mdsc->caps_reserve_count + mdsc->caps_avail_count);
-        spin_unlock(&caps_list_lock);
+        spin_unlock(&mdsc->caps_list_lock);
 }
 void ceph_reservation_status(struct ceph_client *client,
                             int *total, int *avail, int *used, int *reserved,
                             int *min)
 {
+        struct ceph_mds_client *mdsc = &client->mdsc;
        if (total)
-                *total = caps_total_count;
+                *total = mdsc->caps_total_count;
        if (avail)
-                *avail = caps_avail_count;
+                *avail = mdsc->caps_avail_count;
        if (used)
-                *used = caps_use_count;
+                *used = mdsc->caps_use_count;
        if (reserved)
-                *reserved = caps_reserve_count;
+                *reserved = mdsc->caps_reserve_count;
        if (min)
-                *min = caps_min_count;
+                *min = mdsc->caps_min_count;
 }
 /*
@@ -336,22 +327,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
        return NULL;
 }
+struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+        struct ceph_cap *cap;
+        spin_lock(&ci->vfs_inode.i_lock);
+        cap = __get_cap_for_mds(ci, mds);
+        spin_unlock(&ci->vfs_inode.i_lock);
+        return cap;
+}
 /*
- * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
- * -1.
 */
-static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
 {
        struct ceph_cap *cap;
        int mds = -1;
        struct rb_node *p;
-        /* prefer mds with WR|WRBUFFER|EXCL caps */
+        /* prefer mds with WR|BUFFER|EXCL caps */
        for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
                cap = rb_entry(p, struct ceph_cap, ci_node);
                mds = cap->mds;
-                if (mseq)
-                        *mseq = cap->mseq;
                if (cap->issued & (CEPH_CAP_FILE_WR |
                                   CEPH_CAP_FILE_BUFFER |
                                   CEPH_CAP_FILE_EXCL))
@@ -364,7 +362,7 @@ int ceph_get_cap_mds(struct inode *inode)
 {
        int mds;
        spin_lock(&inode->i_lock);
-        mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+        mds = __ceph_get_cap_mds(ceph_inode(inode));
        spin_unlock(&inode->i_lock);
        return mds;
 }
@@ -483,8 +481,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
         * Each time we receive FILE_CACHE anew, we increment
         * i_rdcache_gen.
         */
-        if ((issued & CEPH_CAP_FILE_CACHE) &&
+        if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
-            (had & CEPH_CAP_FILE_CACHE) == 0)
+            (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
                ci->i_rdcache_gen++;
        /*
@@ -543,7 +541,7 @@ retry:
                        new_cap = NULL;
                } else {
                        spin_unlock(&inode->i_lock);
-                        new_cap = get_cap(caps_reservation);
+                        new_cap = get_cap(mdsc, caps_reservation);
                        if (new_cap == NULL)
                                return -ENOMEM;
                        goto retry;
@@ -588,6 +586,7 @@ retry:
                } else {
                        pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
                               realmino);
+                        WARN_ON(!realm);
                }
        }
@@ -831,7 +830,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
 {
        int want = 0;
        int mode;
-        for (mode = 0; mode < 4; mode++)
+        for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
                if (ci->i_nr_by_mode[mode])
                        want |= ceph_caps_for_mode(mode);
        return want;
@@ -901,7 +900,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
                ci->i_auth_cap = NULL;
        if (removed)
-                ceph_put_cap(cap);
+                ceph_put_cap(mdsc, cap);
        if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
                struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -1197,6 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 */
 void __ceph_flush_snaps(struct ceph_inode_info *ci,
                        struct ceph_mds_session **psession)
+                __releases(ci->vfs_inode->i_lock)
+                __acquires(ci->vfs_inode->i_lock)
 {
        struct inode *inode = &ci->vfs_inode;
        int mds;
@@ -1232,7 +1233,13 @@ retry:
                BUG_ON(capsnap->dirty == 0);
                /* pick mds, take s_mutex */
-                mds = __ceph_get_cap_mds(ci, &mseq);
+                if (ci->i_auth_cap == NULL) {
+                        dout("no auth cap (migrating?), doing nothing\n");
+                        goto out;
+                }
+                mds = ci->i_auth_cap->session->s_mds;
+                mseq = ci->i_auth_cap->mseq;
                if (session && session->s_mds != mds) {
                        dout("oops, wrong session %p mutex\n", session);
                        mutex_unlock(&session->s_mutex);
@@ -1251,8 +1258,8 @@ retry:
                        }
                        /*
                         * if session == NULL, we raced against a cap
-                         * deletion.  retry, and we'll get a better
+                         * deletion or migration.  retry, and we'll
-                         * @mds value next time.
+                         * get a better @mds value next time.
                         */
                        spin_lock(&inode->i_lock);
                        goto retry;
@@ -1290,6 +1297,7 @@ retry:
        list_del_init(&ci->i_snap_flush_item);
        spin_unlock(&mdsc->snap_flush_lock);
+out:
        if (psession)
                *psession = session;
        else if (session) {
@@ -1435,7 +1443,6 @@ static int try_nonblocking_invalidate(struct inode *inode)
 */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
                     struct ceph_mds_session *session)
-        __releases(session->s_mutex)
 {
        struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
        struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1510,11 +1517,13 @@ retry_locked:
            ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
            ci->i_rdcache_gen &&                     /* may have cached pages */
            (file_wanted == 0 ||                     /* no open files */
-             (revoking & CEPH_CAP_FILE_CACHE)) &&     /*  or revoking cache */
+             (revoking & (CEPH_CAP_FILE_CACHE|
+                          CEPH_CAP_FILE_LAZYIO))) && /*  or revoking cache */
            !tried_invalidate) {
                dout("check_caps trying to invalidate on %p\n", inode);
                if (try_nonblocking_invalidate(inode) < 0) {
-                        if (revoking & CEPH_CAP_FILE_CACHE) {
+                        if (revoking & (CEPH_CAP_FILE_CACHE|
+                                        CEPH_CAP_FILE_LAZYIO)) {
                                dout("check_caps queuing invalidate\n");
                                queue_invalidate = 1;
                                ci->i_rdcache_revoking = ci->i_rdcache_gen;
@@ -2250,8 +2259,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
                             struct ceph_mds_session *session,
                             struct ceph_cap *cap,
                             struct ceph_buffer *xattr_buf)
-        __releases(inode->i_lock)
+                __releases(inode->i_lock)
-        __releases(session->s_mutex)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
@@ -2278,6 +2286,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
         * will invalidate _after_ writeback.)
         */
        if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+            (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
            !ci->i_wrbuffer_ref) {
                if (try_nonblocking_invalidate(inode) == 0) {
                        revoked_rdcache = 1;
@@ -2369,15 +2378,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
        /* revocation, grant, or no-op? */
        if (cap->issued & ~newcaps) {
-                dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+                int revoking = cap->issued & ~newcaps;
-                     ceph_cap_string(newcaps));
-                if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+                dout("revocation: %s -> %s (revoking %s)\n",
-                        writeback = 1; /* will delay ack */
+                     ceph_cap_string(cap->issued),
-                else if (dirty & ~newcaps)
+                     ceph_cap_string(newcaps),
-                        check_caps = 1;  /* initiate writeback in check_caps */
+                     ceph_cap_string(revoking));
-                else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+                if (revoking & used & CEPH_CAP_FILE_BUFFER)
-                           revoked_rdcache)
+                        writeback = 1;  /* initiate writeback; will delay ack */
-                        check_caps = 2;     /* send revoke ack in check_caps */
+                else if (revoking == CEPH_CAP_FILE_CACHE &&
+                         (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
+                         queue_invalidate)
+                        ; /* do nothing yet, invalidation will be queued */
+                else if (cap == ci->i_auth_cap)
+                        check_caps = 1; /* check auth cap only */
+                else
+                        check_caps = 2; /* check all caps */
                cap->issued = newcaps;
                cap->implemented |= newcaps;
        } else if (cap->issued == newcaps) {
@@ -2568,7 +2584,8 @@ static void handle_cap_trunc(struct inode *inode,
 * caller holds s_mutex
 */
 static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
-                              struct ceph_mds_session *session)
+                              struct ceph_mds_session *session,
+                              int *open_target_sessions)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        int mds = session->s_mds;
@@ -2600,6 +2617,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
                        ci->i_cap_exporting_mds = mds;
                        ci->i_cap_exporting_mseq = mseq;
                        ci->i_cap_exporting_issued = cap->issued;
+                        /*
+                         * make sure we have open sessions with all possible
+                         * export targets, so that we get the matching IMPORT
+                         */
+                        *open_target_sessions = 1;
                }
                __ceph_remove_cap(cap);
        }
@@ -2675,6 +2698,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        u64 size, max_size;
        u64 tid;
        void *snaptrace;
+        size_t snaptrace_len;
+        void *flock;
+        u32 flock_len;
+        int open_target_sessions = 0;
        dout("handle_caps from mds%d\n", mds);
@@ -2683,7 +2710,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        if (msg->front.iov_len < sizeof(*h))
                goto bad;
        h = msg->front.iov_base;
-        snaptrace = h + 1;
        op = le32_to_cpu(h->op);
        vino.ino = le64_to_cpu(h->ino);
        vino.snap = CEPH_NOSNAP;
@@ -2693,6 +2719,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
        size = le64_to_cpu(h->size);
        max_size = le64_to_cpu(h->max_size);
+        snaptrace = h + 1;
+        snaptrace_len = le32_to_cpu(h->snap_trace_len);
+        if (le16_to_cpu(msg->hdr.version) >= 2) {
+                void *p, *end;
+                p = snaptrace + snaptrace_len;
+                end = msg->front.iov_base + msg->front.iov_len;
+                ceph_decode_32_safe(&p, end, flock_len, bad);
+                flock = p;
+        } else {
+                flock = NULL;
+                flock_len = 0;
+        }
        mutex_lock(&session->s_mutex);
        session->s_seq++;
        dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2714,7 +2755,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                 * along for the mds (who clearly thinks we still have this
                 * cap).
                 */
-                ceph_add_cap_releases(mdsc, session, -1);
+                ceph_add_cap_releases(mdsc, session);
                ceph_send_cap_releases(mdsc, session);
                goto done;
        }
@@ -2726,12 +2767,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
                goto done;
        case CEPH_CAP_OP_EXPORT:
-                handle_cap_export(inode, h, session);
+                handle_cap_export(inode, h, session, &open_target_sessions);
                goto done;
        case CEPH_CAP_OP_IMPORT:
                handle_cap_import(mdsc, inode, h, session,
-                                  snaptrace, le32_to_cpu(h->snap_trace_len));
+                                  snaptrace, snaptrace_len);
                ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
                                session);
                goto done_unlocked;
@@ -2773,6 +2814,8 @@ done:
 done_unlocked:
        if (inode)
                iput(inode);
+        if (open_target_sessions)
+                ceph_mdsc_open_export_target_sessions(mdsc, session);
        return;
 bad:
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
index 793f50cb7c22..5babb8e95352 100644
--- a/fs/ceph/ceph_frag.h
+++ b/fs/ceph/ceph_frag.h
@@ -1,5 +1,5 @@
-#ifndef _FS_CEPH_FRAG_H
+#ifndef FS_CEPH_FRAG_H
-#define _FS_CEPH_FRAG_H
+#define FS_CEPH_FRAG_H
 /*
 * "Frags" are a way to describe a subset of a 32-bit number space,
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index 79d76bc4303f..3ac6cc7c1156 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -29,46 +29,44 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
 int ceph_flags_to_mode(int flags)
 {
+        int mode;
 #ifdef O_DIRECTORY  /* fixme */
        if ((flags & O_DIRECTORY) == O_DIRECTORY)
                return CEPH_FILE_MODE_PIN;
 #endif
+        if ((flags & O_APPEND) == O_APPEND)
+                flags |= O_WRONLY;
+        if ((flags & O_ACCMODE) == O_RDWR)
+                mode = CEPH_FILE_MODE_RDWR;
+        else if ((flags & O_ACCMODE) == O_WRONLY)
+                mode = CEPH_FILE_MODE_WR;
+        else
+                mode = CEPH_FILE_MODE_RD;
 #ifdef O_LAZY
        if (flags & O_LAZY)
-                return CEPH_FILE_MODE_LAZY;
+                mode |= CEPH_FILE_MODE_LAZY;
 #endif
-        if ((flags & O_APPEND) == O_APPEND)
-                flags |= O_WRONLY;
-        flags &= O_ACCMODE;
+        return mode;
-        if ((flags & O_RDWR) == O_RDWR)
-                return CEPH_FILE_MODE_RDWR;
-        if ((flags & O_WRONLY) == O_WRONLY)
-                return CEPH_FILE_MODE_WR;
-        return CEPH_FILE_MODE_RD;
 }
 int ceph_caps_for_mode(int mode)
 {
-        switch (mode) {
+        int caps = CEPH_CAP_PIN;
-        case CEPH_FILE_MODE_PIN:
-                return CEPH_CAP_PIN;
+        if (mode & CEPH_FILE_MODE_RD)
-        case CEPH_FILE_MODE_RD:
+                caps |= CEPH_CAP_FILE_SHARED |
-                return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
                        CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
-        case CEPH_FILE_MODE_RDWR:
+        if (mode & CEPH_FILE_MODE_WR)
-                return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+                caps |= CEPH_CAP_FILE_EXCL |
-                        CEPH_CAP_FILE_EXCL |
-                        CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
-                        CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
-                        CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
-                        CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-        case CEPH_FILE_MODE_WR:
-                return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
-                        CEPH_CAP_FILE_EXCL |
                        CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
                        CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
                        CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
-        }
+        if (mode & CEPH_FILE_MODE_LAZY)
-        return 0;
+                caps |= CEPH_CAP_FILE_LAZYIO;
+        return caps;
 }
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 2fa992eaf7da..d5619ac86711 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -9,27 +9,13 @@
 * LGPL2
 */
-#ifndef _FS_CEPH_CEPH_FS_H
+#ifndef CEPH_FS_H
-#define _FS_CEPH_CEPH_FS_H
+#define CEPH_FS_H
 #include "msgr.h"
 #include "rados.h"
 /*
- * Ceph release version
- */
-#define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 20
-#define CEPH_VERSION_PATCH 0
-#define _CEPH_STRINGIFY(x) #x
-#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
-#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
-        "." CEPH_STRINGIFY(z)
-#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
-                                       CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
-/*
 * subprotocol versions.  when specific messages types or high-level
 * protocols change, bump the affected components.  we keep rev
 * internal cluster protocols separately from the public,
@@ -53,18 +39,10 @@
 /*
 * feature bits
 */
-#define CEPH_FEATURE_UID        1
+#define CEPH_FEATURE_UID            (1<<0)
-#define CEPH_FEATURE_NOSRCADDR  2
+#define CEPH_FEATURE_NOSRCADDR      (1<<1)
-#define CEPH_FEATURE_FLOCK      4
+#define CEPH_FEATURE_MONCLOCKCHECK  (1<<2)
+#define CEPH_FEATURE_FLOCK          (1<<3)
-#define CEPH_FEATURE_SUPPORTED_MON  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
-#define CEPH_FEATURE_SUPPORTED_MDS  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
-#define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
-#define CEPH_FEATURE_SUPPORTED_OSD  CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
-#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
-#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
 /*
@@ -96,6 +74,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_CRYPTO_NONE 0x0
 #define CEPH_CRYPTO_AES  0x1
+#define CEPH_AES_IV "cephsageyudagreg"
 /* security/authentication protocols */
 #define CEPH_AUTH_UNKNOWN       0x0
 #define CEPH_AUTH_NONE          0x1
@@ -275,6 +255,7 @@ extern const char *ceph_mds_state_name(int s);
 #define CEPH_LOCK_IDFT        512   /* dir frag tree */
 #define CEPH_LOCK_INEST       1024  /* mds internal */
 #define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_IFLOCK      4096  /* advisory file locks */
 #define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
 /* client_session ops */
@@ -316,6 +297,8 @@ enum {
        CEPH_MDS_OP_RMXATTR    = 0x01106,
        CEPH_MDS_OP_SETLAYOUT  = 0x01107,
        CEPH_MDS_OP_SETATTR    = 0x01108,
+        CEPH_MDS_OP_SETFILELOCK= 0x01109,
+        CEPH_MDS_OP_GETFILELOCK= 0x00110,
        CEPH_MDS_OP_MKNOD      = 0x01201,
        CEPH_MDS_OP_LINK       = 0x01202,
@@ -386,6 +369,15 @@ union ceph_mds_request_args {
        struct {
                struct ceph_file_layout layout;
        } __attribute__ ((packed)) setlayout;
+        struct {
+                __u8 rule; /* currently fcntl or flock */
+                __u8 type; /* shared, exclusive, remove*/
+                __le64 pid; /* process id requesting the lock */
+                __le64 pid_namespace;
+                __le64 start; /* initial location to lock */
+                __le64 length; /* num bytes to lock from start */
+                __u8 wait; /* will caller wait for lock to become available? */
+        } __attribute__ ((packed)) filelock_change;
 } __attribute__ ((packed));
 #define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
@@ -480,6 +472,23 @@ struct ceph_mds_reply_dirfrag {
        __le32 dist[];
 } __attribute__ ((packed));
+#define CEPH_LOCK_FCNTL    1
+#define CEPH_LOCK_FLOCK    2
+#define CEPH_LOCK_SHARED   1
+#define CEPH_LOCK_EXCL     2
+#define CEPH_LOCK_UNLOCK   4
+/*
+ * Encoded form of one file lock.  All multi-byte fields are little-endian
+ * and the struct is packed, so it can be appended to a message as-is.
+ */
+struct ceph_filelock {
+        __le64 start;/* file offset to start lock at */
+        __le64 length; /* num bytes to lock; 0 for all following start */
+        __le64 client; /* which client holds the lock */
+        __le64 pid; /* process id holding the lock on the client */
+        __le64 pid_namespace; /* namespace identifier accompanying pid */
+        __u8 type; /* shared lock, exclusive lock, or unlock (CEPH_LOCK_*) */
+} __attribute__ ((packed));
 /* file access modes */
 #define CEPH_FILE_MODE_PIN        0
 #define CEPH_FILE_MODE_RD         1
@@ -508,9 +517,10 @@ int ceph_flags_to_mode(int flags);
 #define CEPH_CAP_SAUTH      2
 #define CEPH_CAP_SLINK      4
 #define CEPH_CAP_SXATTR     6
-#define CEPH_CAP_SFILE      8   /* goes at the end (uses >2 cap bits) */
+#define CEPH_CAP_SFILE      8
+#define CEPH_CAP_SFLOCK    20 
-#define CEPH_CAP_BITS       16
+#define CEPH_CAP_BITS       22
 /* composed values */
 #define CEPH_CAP_AUTH_SHARED  (CEPH_CAP_GSHARED  << CEPH_CAP_SAUTH)
@@ -528,6 +538,9 @@ int ceph_flags_to_mode(int flags);
 #define CEPH_CAP_FILE_BUFFER   (CEPH_CAP_GBUFFER   << CEPH_CAP_SFILE)
 #define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
 #define CEPH_CAP_FILE_LAZYIO   (CEPH_CAP_GLAZYIO   << CEPH_CAP_SFILE)
+#define CEPH_CAP_FLOCK_SHARED  (CEPH_CAP_GSHARED   << CEPH_CAP_SFLOCK)
+#define CEPH_CAP_FLOCK_EXCL    (CEPH_CAP_GEXCL     << CEPH_CAP_SFLOCK)
 /* cap masks (for getattr) */
 #define CEPH_STAT_CAP_INODE    CEPH_CAP_PIN
@@ -563,7 +576,8 @@ int ceph_flags_to_mode(int flags);
                              CEPH_CAP_FILE_EXCL)
 #define CEPH_CAP_ANY_WR   (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
 #define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
-                           CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
+                           CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
+                           CEPH_CAP_PIN)
 #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
                        CEPH_LOCK_IXATTR)
@@ -653,12 +667,21 @@ struct ceph_mds_cap_reconnect {
        __le64 cap_id;
        __le32 wanted;
        __le32 issued;
+        __le64 snaprealm;
+        __le64 pathbase;        /* base ino for our path to this ino */
+        __le32 flock_len;       /* size of flock state blob, if any */
+} __attribute__ ((packed));
+/* followed by flock blob */
+struct ceph_mds_cap_reconnect_v1 {
+        __le64 cap_id;
+        __le32 wanted;
+        __le32 issued;
        __le64 size;
        struct ceph_timespec mtime, atime;
        __le64 snaprealm;
        __le64 pathbase;        /* base ino for our path to this ino */
 } __attribute__ ((packed));
-/* followed by encoded string */
 struct ceph_mds_snaprealm_reconnect {
        __le64 ino;     /* snap realm base */
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
index 5ac470c433c9..d099c3f90236 100644
--- a/fs/ceph/ceph_hash.h
+++ b/fs/ceph/ceph_hash.h
@@ -1,5 +1,5 @@
-#ifndef _FS_CEPH_HASH_H
+#ifndef FS_CEPH_HASH_H
-#define _FS_CEPH_HASH_H
+#define FS_CEPH_HASH_H
 #define CEPH_STR_HASH_LINUX      0x1  /* linux dcache hash */
 #define CEPH_STR_HASH_RJENKINS   0x2  /* robert jenkins' */
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 7503aee828ce..c6179d3a26a2 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -28,6 +28,7 @@ const char *ceph_osd_op_name(int op)
        case CEPH_OSD_OP_TRUNCATE: return "truncate";
        case CEPH_OSD_OP_ZERO: return "zero";
        case CEPH_OSD_OP_WRITEFULL: return "writefull";
+        case CEPH_OSD_OP_ROLLBACK: return "rollback";
        case CEPH_OSD_OP_APPEND: return "append";
        case CEPH_OSD_OP_STARTSYNC: return "startsync";
@@ -129,6 +130,8 @@ const char *ceph_mds_op_name(int op)
        case CEPH_MDS_OP_LSSNAP: return "lssnap";
        case CEPH_MDS_OP_MKSNAP: return "mksnap";
        case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+        case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
+        case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
        }
        return "???";
 }
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
index dcd7e7523700..97e435b191f4 100644
--- a/fs/ceph/crush/crush.h
+++ b/fs/ceph/crush/crush.h
@@ -1,5 +1,5 @@
-#ifndef _CRUSH_CRUSH_H
+#ifndef CEPH_CRUSH_CRUSH_H
-#define _CRUSH_CRUSH_H
+#define CEPH_CRUSH_CRUSH_H
 #include <linux/types.h>
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
index ff48e110e4bb..91e884230d5d 100644
--- a/fs/ceph/crush/hash.h
+++ b/fs/ceph/crush/hash.h
@@ -1,5 +1,5 @@
-#ifndef _CRUSH_HASH_H
+#ifndef CEPH_CRUSH_HASH_H
-#define _CRUSH_HASH_H
+#define CEPH_CRUSH_HASH_H
 #define CRUSH_HASH_RJENKINS1   0
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
index 98e90046fd9f..c46b99c18bb0 100644
--- a/fs/ceph/crush/mapper.h
+++ b/fs/ceph/crush/mapper.h
@@ -1,5 +1,5 @@
-#ifndef _CRUSH_MAPPER_H
+#ifndef CEPH_CRUSH_MAPPER_H
-#define _CRUSH_MAPPER_H
+#define CEPH_CRUSH_MAPPER_H
 /*
 * CRUSH functions for find rules and then mapping an input to an
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index f704b3b62424..a3e627f63293 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -75,10 +75,11 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
        return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
 }
-const u8 *aes_iv = "cephsageyudagreg";
+static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
-int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+static int ceph_aes_encrypt(const void *key, int key_len,
-                     const void *src, size_t src_len)
+                            void *dst, size_t *dst_len,
+                            const void *src, size_t src_len)
 {
        struct scatterlist sg_in[2], sg_out[1];
        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -126,9 +127,10 @@ int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
        return 0;
 }
-int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
+static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
-                      const void *src1, size_t src1_len,
+                             size_t *dst_len,
-                      const void *src2, size_t src2_len)
+                             const void *src1, size_t src1_len,
+                             const void *src2, size_t src2_len)
 {
        struct scatterlist sg_in[3], sg_out[1];
        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -179,8 +181,9 @@ int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
        return 0;
 }
-int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+static int ceph_aes_decrypt(const void *key, int key_len,
-                     const void *src, size_t src_len)
+                            void *dst, size_t *dst_len,
+                            const void *src, size_t src_len)
 {
        struct scatterlist sg_in[1], sg_out[2];
        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -238,10 +241,10 @@ int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
        return 0;
 }
-int ceph_aes_decrypt2(const void *key, int key_len,
+static int ceph_aes_decrypt2(const void *key, int key_len,
-                      void *dst1, size_t *dst1_len,
+                             void *dst1, size_t *dst1_len,
-                      void *dst2, size_t *dst2_len,
+                             void *dst2, size_t *dst2_len,
-                      const void *src, size_t src_len)
+                             const void *src, size_t src_len)
 {
        struct scatterlist sg_in[1], sg_out[3];
        struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index 40b502e6bd89..bdf38607323c 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -42,7 +42,7 @@ extern int ceph_encrypt2(struct ceph_crypto_key *secret,
                         const void *src2, size_t src2_len);
 /* armor.c */
-extern int ceph_armor(char *dst, const void *src, const void *end);
+extern int ceph_armor(char *dst, const char *src, const char *end);
-extern int ceph_unarmor(void *dst, const char *src, const char *end);
+extern int ceph_unarmor(char *dst, const char *src, const char *end);
 #endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2f5332ddbba..360c4f22718d 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -291,7 +291,7 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
        return 0;
 }
-#define DEFINE_SHOW_FUNC(name)                                          \
+#define DEFINE_SHOW_FUNC(name)                                          \
 static int name##_open(struct inode *inode, struct file *file)          \
 {                                                                       \
        struct seq_file *sf;                                            \
@@ -361,8 +361,8 @@ int ceph_debugfs_client_init(struct ceph_client *client)
        int ret = 0;
        char name[80];
-        snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
+        snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
-                 PR_FSID(&client->fsid), client->monc.auth->global_id);
+                 client->monc.auth->global_id);
        client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
        if (!client->debugfs_dir)
@@ -432,11 +432,12 @@ int ceph_debugfs_client_init(struct ceph_client *client)
        if (!client->debugfs_caps)
                goto out;
-        client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
+        client->debugfs_congestion_kb =
-                                                   0600,
+                debugfs_create_file("writeback_congestion_kb",
-                                                   client->debugfs_dir,
+                                    0600,
-                                                   client,
+                                    client->debugfs_dir,
-                                                   &congestion_kb_fops);
+                                    client,
+                                    &congestion_kb_fops);
        if (!client->debugfs_congestion_kb)
                goto out;
@@ -466,7 +467,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
        debugfs_remove(client->debugfs_dir);
 }
-#else  // CONFIG_DEBUG_FS
+#else  /* CONFIG_DEBUG_FS */
 int __init ceph_debugfs_init(void)
 {
@@ -486,4 +487,4 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
 {
 }
-#endif  // CONFIG_DEBUG_FS
+#endif  /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 65b3e022eaf5..3d25415afe63 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -99,11 +99,13 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv,
 */
 static inline void ceph_encode_addr(struct ceph_entity_addr *a)
 {
-        a->in_addr.ss_family = htons(a->in_addr.ss_family);
+        __be16 ss_family = htons(a->in_addr.ss_family);
+        a->in_addr.ss_family = *(__u16 *)&ss_family;
 }
 static inline void ceph_decode_addr(struct ceph_entity_addr *a)
 {
-        a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
+        __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
+        a->in_addr.ss_family = ntohs(ss_family);
        WARN_ON(a->in_addr.ss_family == 512);
 }
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f94ed3c7f6a5..67bbb41d5526 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -27,7 +27,7 @@
 const struct inode_operations ceph_dir_iops;
 const struct file_operations ceph_dir_fops;
-struct dentry_operations ceph_dentry_ops;
+const struct dentry_operations ceph_dentry_ops;
 /*
 * Initialize ceph dentry state.
@@ -94,6 +94,8 @@ static unsigned fpos_off(loff_t p)
 */
 static int __dcache_readdir(struct file *filp,
                            void *dirent, filldir_t filldir)
+                __releases(inode->i_lock)
+                __acquires(inode->i_lock)
 {
        struct inode *inode = filp->f_dentry->d_inode;
        struct ceph_file_info *fi = filp->private_data;
@@ -1239,16 +1241,16 @@ const struct inode_operations ceph_dir_iops = {
        .create = ceph_create,
 };
-struct dentry_operations ceph_dentry_ops = {
+const struct dentry_operations ceph_dentry_ops = {
        .d_revalidate = ceph_d_revalidate,
        .d_release = ceph_dentry_release,
 };
-struct dentry_operations ceph_snapdir_dentry_ops = {
+const struct dentry_operations ceph_snapdir_dentry_ops = {
        .d_revalidate = ceph_snapdir_d_revalidate,
        .d_release = ceph_dentry_release,
 };
-struct dentry_operations ceph_snap_dentry_ops = {
+const struct dentry_operations ceph_snap_dentry_ops = {
        .d_release = ceph_dentry_release,
 };
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7c08698fad3e..8c044a4f0457 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,7 +317,7 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
 /*
 * allocate a vector new pages
 */
-struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
+static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
 {
        struct page **pages;
        int i;
@@ -665,7 +665,7 @@ more:
                 * throw out any page cache pages in this range. this
                 * may block.
                 */
-                truncate_inode_pages_range(inode->i_mapping, pos, 
+                truncate_inode_pages_range(inode->i_mapping, pos,
                                           (pos+len) | (PAGE_CACHE_SIZE-1));
        } else {
                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
@@ -740,28 +740,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
                             unsigned long nr_segs, loff_t pos)
 {
        struct file *filp = iocb->ki_filp;
+        struct ceph_file_info *fi = filp->private_data;
        loff_t *ppos = &iocb->ki_pos;
        size_t len = iov->iov_len;
        struct inode *inode = filp->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        void *base = iov->iov_base;
+        void __user *base = iov->iov_base;
        ssize_t ret;
-        int got = 0;
+        int want, got = 0;
        int checkeof = 0, read = 0;
        dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
             inode, ceph_vinop(inode), pos, (unsigned)len, inode);
 again:
        __ceph_do_pending_vmtruncate(inode);
-        ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
+        if (fi->fmode & CEPH_FILE_MODE_LAZY)
-                            &got, -1);
+                want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+        else
+                want = CEPH_CAP_FILE_CACHE;
+        ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
        if (ret < 0)
                goto out;
        dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
             inode, ceph_vinop(inode), pos, (unsigned)len,
             ceph_cap_string(got));
-        if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
+        if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
            (iocb->ki_filp->f_flags & O_DIRECT) ||
            (inode->i_sb->s_flags & MS_SYNCHRONOUS))
                /* hmm, this isn't really async... */
@@ -807,11 +811,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
                       unsigned long nr_segs, loff_t pos)
 {
        struct file *file = iocb->ki_filp;
+        struct ceph_file_info *fi = file->private_data;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
        loff_t endoff = pos + iov->iov_len;
-        int got = 0;
+        int want, got = 0;
        int ret, err;
        if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -824,8 +829,11 @@ retry_snap:
        dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
             inode->i_size);
-        ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
+        if (fi->fmode & CEPH_FILE_MODE_LAZY)
-                            &got, endoff);
+                want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+        else
+                want = CEPH_CAP_FILE_BUFFER;
+        ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
        if (ret < 0)
                goto out;
@@ -833,7 +841,7 @@ retry_snap:
             inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
             ceph_cap_string(got));
-        if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
+        if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
            (iocb->ki_filp->f_flags & O_DIRECT) ||
            (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
                ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
@@ -930,6 +938,8 @@ const struct file_operations ceph_file_fops = {
        .aio_write = ceph_aio_write,
        .mmap = ceph_mmap,
        .fsync = ceph_fsync,
+        .lock = ceph_lock,
+        .flock = ceph_flock,
        .splice_read = generic_file_splice_read,
        .splice_write = generic_file_splice_write,
        .unlocked_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 389f9dbd9949..5d893d31e399 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -442,8 +442,9 @@ int ceph_fill_file_size(struct inode *inode, int issued,
                         * the file is either opened or mmaped
                         */
                        if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
-                                      CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
+                                       CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
-                                      CEPH_CAP_FILE_EXCL)) ||
+                                       CEPH_CAP_FILE_EXCL|
+                                       CEPH_CAP_FILE_LAZYIO)) ||
                            mapping_mapped(inode->i_mapping) ||
                            __ceph_caps_file_wanted(ci)) {
                                ci->i_truncate_pending++;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index d085f07756b4..76e307d2aba1 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -143,6 +143,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        return 0;
 }
+/*
+ * CEPH_IOC_LAZYIO handler: mark this open file as "lazy".
+ *
+ * If the file's mode does not already carry CEPH_FILE_MODE_LAZY, the
+ * per-mode open counter on the inode is moved from the old mode to the
+ * new (lazy) mode under inode->i_lock, and ceph_check_caps() is then
+ * called for the inode.  Calling it again on an already-lazy file is a
+ * no-op.  Always returns 0.
+ */
+static long ceph_ioctl_lazyio(struct file *file)
+{
+        struct ceph_file_info *fi = file->private_data;
+        struct inode *inode = file->f_dentry->d_inode;
+        struct ceph_inode_info *ci = ceph_inode(inode);
+        if (fi->fmode & CEPH_FILE_MODE_LAZY) {
+                dout("ioctl_lazyio: file %p already lazy\n", file);
+                return 0;
+        }
+        /* re-account this open under its new mode while holding i_lock */
+        spin_lock(&inode->i_lock);
+        ci->i_nr_by_mode[fi->fmode]--;
+        fi->fmode |= CEPH_FILE_MODE_LAZY;
+        ci->i_nr_by_mode[fi->fmode]++;
+        spin_unlock(&inode->i_lock);
+        dout("ioctl_lazyio: file %p marked lazy\n", file);
+        /* must run after i_lock is dropped, as in the original ordering */
+        ceph_check_caps(ci, 0, NULL);
+        return 0;
+}
 long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
        dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -155,6 +176,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case CEPH_IOC_GET_DATALOC:
                return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+        case CEPH_IOC_LAZYIO:
+                return ceph_ioctl_lazyio(file);
        }
        return -ENOTTY;
 }
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 25e4f1a9d059..88451a3b6857 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -37,4 +37,6 @@ struct ceph_ioctl_dataloc {
 #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
                                   struct ceph_ioctl_dataloc)
+#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
 #endif
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 000000000000..ae85af06454f
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,256 @@
+#include "ceph_debug.h"
+#include <linux/file.h>
+#include <linux/namei.h>
+#include "super.h"
+#include "mds_client.h"
+#include "pagelist.h"
+/**
+ * Implement fcntl and flock locking functions.
+ */
+/*
+ * Send one file-lock request to the MDS and wait for the answer.
+ *
+ * lock_type is the locking rule (CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK),
+ * operation is the MDS op code, and cmd is the ceph lock type (shared,
+ * exclusive or unlock).  pid, pid_ns, start and length are sent
+ * little-endian; wait tells the MDS whether the caller will wait.
+ *
+ * Returns PTR_ERR() of the request if it cannot be created, otherwise
+ * whatever ceph_mdsc_do_request() returns.
+ */
+static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
+                             u64 pid, u64 pid_ns,
+                             int cmd, u64 start, u64 length, u8 wait)
+{
+        struct inode *ino = file->f_dentry->d_inode;
+        struct ceph_mds_client *client = &ceph_sb_to_client(ino->i_sb)->mdsc;
+        struct ceph_mds_request *request;
+        int rc;
+        request = ceph_mdsc_create_request(client, operation, USE_AUTH_MDS);
+        if (IS_ERR(request))
+                return PTR_ERR(request);
+        /* the request keeps its own reference on the inode */
+        request->r_inode = igrab(ino);
+        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
+             "length: %llu, wait: %d, type`: %d", (int)lock_type,
+             (int)operation, pid, start, length, wait, cmd);
+        /* single-byte fields need no byte swapping */
+        request->r_args.filelock_change.rule = lock_type;
+        request->r_args.filelock_change.type = cmd;
+        request->r_args.filelock_change.wait = wait;
+        /* This should be adjusted, but I'm not sure if
+           namespaces actually get id numbers*/
+        request->r_args.filelock_change.pid_namespace = cpu_to_le64(pid_ns);
+        request->r_args.filelock_change.pid = cpu_to_le64(pid);
+        request->r_args.filelock_change.start = cpu_to_le64(start);
+        request->r_args.filelock_change.length = cpu_to_le64(length);
+        rc = ceph_mdsc_do_request(client, ino, request);
+        ceph_mdsc_put_request(request);
+        dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
+             "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
+             (int)operation, pid, start, length, wait, cmd, rc);
+        return rc;
+}
+/*
+ * fcntl (POSIX) lock entry point, installed as the .lock file operation.
+ *
+ * cmd is F_GETLK, F_SETLK or F_SETLKW.  The request is first sent to the
+ * MDS; F_GETLK maps to CEPH_MDS_OP_GETFILELOCK, everything else to
+ * CEPH_MDS_OP_SETFILELOCK, and F_SETLKW additionally sets the wait flag.
+ * A range ending at LLONG_MAX is sent with length 0.
+ *
+ * For a set operation that the MDS accepts, the lock is then recorded
+ * locally with posix_lock_file(); if that fails, an unlock is sent to the
+ * MDS to undo the remote state.  F_GETLK is a query only and never touches
+ * the local lock table.
+ *
+ * Returns 0 on success or the error from the MDS / posix_lock_file().
+ *
+ * NOTE(review): the reference taken by get_pid() below does not appear to
+ * be dropped anywhere in this file -- confirm who releases fl->fl_nspid.
+ * NOTE(review): nothing here copies a conflicting lock from the MDS reply
+ * back into *fl for F_GETLK -- confirm whether that is handled elsewhere.
+ */
+int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+        u64 length;
+        u8 lock_cmd;
+        int err;
+        u8 wait = 0;
+        u16 op = CEPH_MDS_OP_SETFILELOCK;
+        fl->fl_nspid = get_pid(task_tgid(current));
+        dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+        /* set wait bit as appropriate, then make command as Ceph expects it*/
+        if (F_SETLKW == cmd)
+                wait = 1;
+        if (F_GETLK == cmd)
+                op = CEPH_MDS_OP_GETFILELOCK;
+        if (F_RDLCK == fl->fl_type)
+                lock_cmd = CEPH_LOCK_SHARED;
+        else if (F_WRLCK == fl->fl_type)
+                lock_cmd = CEPH_LOCK_EXCL;
+        else
+                lock_cmd = CEPH_LOCK_UNLOCK;
+        /* mds requires start and length rather than start and end */
+        if (LLONG_MAX == fl->fl_end)
+                length = 0;
+        else
+                length = fl->fl_end - fl->fl_start + 1;
+        err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+                                (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                lock_cmd, fl->fl_start,
+                                length, wait);
+        if (err) {
+                dout("mds returned error code %d", err);
+                return err;
+        }
+        /*
+         * A query must not acquire anything: calling posix_lock_file()
+         * here would install the tested lock in the local lock table.
+         */
+        if (CEPH_MDS_OP_GETFILELOCK == op)
+                return 0;
+        dout("mds locked, locking locally");
+        err = posix_lock_file(file, fl, NULL);
+        if (err) {
+                /* undo! This should only happen if the kernel detects
+                 * local deadlock. */
+                ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
+                                  (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                  CEPH_LOCK_UNLOCK, fl->fl_start,
+                                  length, 0);
+                dout("got %d on posix_lock_file, undid lock", err);
+        }
+        return err;
+}
+/*
+ * flock lock entry point, installed as the .flock file operation.
+ *
+ * The request goes to the MDS first (waiting unless LOCK_NB is set in
+ * cmd); on success the lock is applied locally with
+ * flock_lock_file_wait().  If the local step fails, an unlock is sent to
+ * the MDS so the two sides stay in agreement.
+ *
+ * Returns 0 on success, otherwise the error from the MDS or from
+ * flock_lock_file_wait().
+ */
+int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+        /* blocking is the default; LOCK_NB turns it off */
+        u8 wait = (cmd & LOCK_NB) ? 0 : 1;
+        u8 lock_cmd;
+        u64 length;
+        int err;
+        fl->fl_nspid = get_pid(task_tgid(current));
+        dout("ceph_flock, fl_pid:%d", fl->fl_pid);
+        /* translate the operation, ignoring every bit but SH/EX/UN */
+        switch (cmd & (LOCK_SH | LOCK_EX | LOCK_UN)) {
+        case LOCK_SH:
+                lock_cmd = CEPH_LOCK_SHARED;
+                break;
+        case LOCK_EX:
+                lock_cmd = CEPH_LOCK_EXCL;
+                break;
+        default:
+                lock_cmd = CEPH_LOCK_UNLOCK;
+                break;
+        }
+        /* the mds takes (start, length); length 0 means "to end of file" */
+        length = (LLONG_MAX == fl->fl_end) ?
+                0 : fl->fl_end - fl->fl_start + 1;
+        err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
+                                file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
+                                lock_cmd, fl->fl_start,
+                                length, wait);
+        if (err) {
+                dout("mds error code %d", err);
+                return err;
+        }
+        err = flock_lock_file_wait(file, fl);
+        if (!err)
+                return 0;
+        /* local locking failed: release what the mds just granted */
+        ceph_lock_message(CEPH_LOCK_FLOCK,
+                          CEPH_MDS_OP_SETFILELOCK,
+                          file, (u64)fl->fl_pid,
+                          (u64)fl->fl_nspid,
+                          CEPH_LOCK_UNLOCK, fl->fl_start,
+                          length, 0);
+        dout("got %d on flock_lock_file_wait, undid lock", err);
+        return err;
+}
+/*
+ * Count the fcntl (FL_POSIX) and flock (FL_FLOCK) locks currently on the
+ * inode's lock list and store the totals in *fcntl_count / *flock_count,
+ * so the caller can size its pagelist metadata before calling
+ * ceph_encode_locks.
+ *
+ * Must be called with BKL already held.
+ */
+void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
+{
+        struct file_lock *fl;
+        *fcntl_count = 0;
+        *flock_count = 0;
+        fl = inode->i_flock;
+        while (fl) {
+                /* FL_POSIX takes precedence if both flags are set */
+                if (fl->fl_flags & FL_POSIX)
+                        (*fcntl_count)++;
+                else if (fl->fl_flags & FL_FLOCK)
+                        (*flock_count)++;
+                fl = fl->fl_next;
+        }
+        dout("counted %d flock locks and %d fcntl locks",
+             *flock_count, *fcntl_count);
+}
+/*
+ * Encode the flock and fcntl locks for the given inode into the pagelist.
+ * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
+ * sequential flock locks.  Both counts are written as little-endian
+ * 32-bit values, matching the little-endian fields of struct
+ * ceph_filelock that follow them.
+ *
+ * Must be called with BKL already held, and the lock numbers should have
+ * been gathered (ceph_count_locks) under the same lock holding window so
+ * they agree with what is walked here.
+ *
+ * Returns 0 on success, or the first error reported by
+ * lock_to_ceph_filelock() or ceph_pagelist_append().
+ */
+int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
+                      int num_fcntl_locks, int num_flock_locks)
+{
+        struct file_lock *lock;
+        struct ceph_filelock cephlock;
+        __le32 nlocks;
+        int err = 0;
+        dout("encoding %d flock and %d fcntl locks", num_flock_locks,
+             num_fcntl_locks);
+        /* counts go on the wire, so fix their byte order explicitly */
+        nlocks = cpu_to_le32(num_fcntl_locks);
+        err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+        if (err)
+                goto fail;
+        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+                if (!(lock->fl_flags & FL_POSIX))
+                        continue;
+                err = lock_to_ceph_filelock(lock, &cephlock);
+                if (err)
+                        goto fail;
+                err = ceph_pagelist_append(pagelist, &cephlock,
+                                           sizeof(struct ceph_filelock));
+                if (err)
+                        goto fail;
+        }
+        nlocks = cpu_to_le32(num_flock_locks);
+        err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks));
+        if (err)
+                goto fail;
+        for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
+                if (!(lock->fl_flags & FL_FLOCK))
+                        continue;
+                err = lock_to_ceph_filelock(lock, &cephlock);
+                if (err)
+                        goto fail;
+                err = ceph_pagelist_append(pagelist, &cephlock,
+                                           sizeof(struct ceph_filelock));
+                if (err)
+                        goto fail;
+        }
+fail:
+        return err;
+}
+/*
+ * Fill *cephlock from a VFS file_lock.
+ *
+ * start, length (fl_end - fl_start + 1), pid and pid_namespace are stored
+ * little-endian; client is always written as 0.  fl_type is mapped to the
+ * matching CEPH_LOCK_* value.
+ *
+ * Returns 0, or -EINVAL for an fl_type that is none of F_RDLCK, F_WRLCK
+ * or F_UNLCK (in which case cephlock->type is left untouched).
+ */
+int lock_to_ceph_filelock(struct file_lock *lock,
+                          struct ceph_filelock *cephlock)
+{
+        cephlock->start = cpu_to_le64(lock->fl_start);
+        cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
+        cephlock->client = cpu_to_le64(0);
+        cephlock->pid = cpu_to_le64(lock->fl_pid);
+        cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
+        if (lock->fl_type == F_RDLCK) {
+                cephlock->type = CEPH_LOCK_SHARED;
+        } else if (lock->fl_type == F_WRLCK) {
+                cephlock->type = CEPH_LOCK_EXCL;
+        } else if (lock->fl_type == F_UNLCK) {
+                cephlock->type = CEPH_LOCK_UNLOCK;
+        } else {
+                dout("Have unknown lock type %d", lock->fl_type);
+                return -EINVAL;
+        }
+        return 0;
+}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index dd440bd438a9..a75ddbf9fe37 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "mds_client.h"
 #include "mon_client.h"
@@ -37,6 +38,11 @@
 * are no longer valid.
 */
+struct ceph_reconnect_state {    /* argument passed to encode_caps_cb() */
+        struct ceph_pagelist *pagelist;  /* reconnect payload being built */
+        bool flock;      /* peer has CEPH_FEATURE_FLOCK: v2 records + locks */
+};
 static void __wake_requests(struct ceph_mds_client *mdsc,
                            struct list_head *head);
@@ -449,7 +455,7 @@ void ceph_mdsc_release_request(struct kref *kref)
        kfree(req->r_path1);
        kfree(req->r_path2);
        put_request_session(req);
-        ceph_unreserve_caps(&req->r_caps_reservation);
+        ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
        kfree(req);
 }
@@ -512,7 +518,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
 {
        req->r_tid = ++mdsc->last_tid;
        if (req->r_num_caps)
-                ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
+                ceph_reserve_caps(mdsc, &req->r_caps_reservation,
+                                  req->r_num_caps);
        dout("__register_request %p tid %lld\n", req, req->r_tid);
        ceph_mdsc_get_request(req);
        __insert_request(mdsc, req);
@@ -704,6 +711,51 @@ static int __open_session(struct ceph_mds_client *mdsc,
 }
 /*
+ * open sessions for any export targets for the given mds
+ *
+ * called under mdsc->mutex
+ */
+static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
+                                          struct ceph_mds_session *session)
+{
+        struct ceph_mds_info *mi;
+        struct ceph_mds_session *ts;
+        int i, mds = session->s_mds;
+        int target;
+        /* nothing to do if this mds is beyond the current map */
+        if (mds >= mdsc->mdsmap->m_max_mds)
+                return;
+        mi = &mdsc->mdsmap->m_info[mds];
+        dout("open_export_target_sessions for mds%d (%d targets)\n",
+             session->s_mds, mi->num_export_targets);
+        for (i = 0; i < mi->num_export_targets; i++) {
+                target = mi->export_targets[i];
+                /* find, or failing that create, the target's session */
+                ts = __ceph_lookup_mds_session(mdsc, target);
+                if (!ts) {
+                        ts = register_session(mdsc, target);
+                        if (IS_ERR(ts))
+                                return;
+                }
+                /*
+                 * Act on the *target* session (ts), not on the exporting
+                 * session we were called for: open it if it is new or
+                 * closing, otherwise just report its state.
+                 */
+                if (ts->s_state == CEPH_MDS_SESSION_NEW ||
+                    ts->s_state == CEPH_MDS_SESSION_CLOSING)
+                        __open_session(mdsc, ts);
+                else
+                        dout(" mds%d target mds%d %p is %s\n", session->s_mds,
+                             target, ts, session_state_name(ts->s_state));
+                ceph_put_mds_session(ts);
+        }
+}
+void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+                                           struct ceph_mds_session *session)
+{        /* locked wrapper around __open_export_target_sessions() */
+        mutex_lock(&mdsc->mutex);        /* the __ helper runs under mdsc->mutex */
+        __open_export_target_sessions(mdsc, session);
+        mutex_unlock(&mdsc->mutex);
+}
+/*
 * session caps
 */
@@ -764,7 +816,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
                        last_inode = NULL;
                }
                if (old_cap) {
-                        ceph_put_cap(old_cap);
+                        ceph_put_cap(session->s_mdsc, old_cap);
                        old_cap = NULL;
                }
@@ -793,7 +845,7 @@ out:
        if (last_inode)
                iput(last_inode);
        if (old_cap)
-                ceph_put_cap(old_cap);
+                ceph_put_cap(session->s_mdsc, old_cap);
        return ret;
 }
@@ -1067,15 +1119,16 @@ static int trim_caps(struct ceph_mds_client *mdsc,
 * Called under s_mutex.
 */
 int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-                          struct ceph_mds_session *session,
+                          struct ceph_mds_session *session)
-                          int extra)
 {
-        struct ceph_msg *msg;
+        struct ceph_msg *msg, *partial = NULL;
        struct ceph_mds_cap_release *head;
        int err = -ENOMEM;
+        int extra = mdsc->client->mount_args->cap_release_safety;
+        int num;
-        if (extra < 0)
+        dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
-                extra = mdsc->client->mount_args->cap_release_safety;
+             extra);
        spin_lock(&session->s_cap_lock);
@@ -1084,9 +1137,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
                                       struct ceph_msg,
                                 list_head);
                head = msg->front.iov_base;
-                extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
+                num = le32_to_cpu(head->num);
+                if (num) {
+                        dout(" partial %p with (%d/%d)\n", msg, num,
+                             (int)CEPH_CAPS_PER_RELEASE);
+                        extra += CEPH_CAPS_PER_RELEASE - num;
+                        partial = msg;
+                }
        }
        while (session->s_num_cap_releases < session->s_nr_caps + extra) {
                spin_unlock(&session->s_cap_lock);
                msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
@@ -1103,19 +1161,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
                session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
        }
-        if (!list_empty(&session->s_cap_releases)) {
+        if (partial) {
-                msg = list_first_entry(&session->s_cap_releases,
+                head = partial->front.iov_base;
-                                       struct ceph_msg,
+                num = le32_to_cpu(head->num);
-                                       list_head);
+                dout(" queueing partial %p with %d/%d\n", partial, num,
-                head = msg->front.iov_base;
+                     (int)CEPH_CAPS_PER_RELEASE);
-                if (head->num) {
+                list_move_tail(&partial->list_head,
-                        dout(" queueing non-full %p (%d)\n", msg,
+                               &session->s_cap_releases_done);
-                             le32_to_cpu(head->num));
+                session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
-                        list_move_tail(&msg->list_head,
-                                      &session->s_cap_releases_done);
-                        session->s_num_cap_releases -=
-                                CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
-                }
        }
        err = 0;
        spin_unlock(&session->s_cap_lock);
@@ -1250,6 +1303,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
                return ERR_PTR(-ENOMEM);
        mutex_init(&req->r_fill_mutex);
+        req->r_mdsc = mdsc;
        req->r_started = jiffies;
        req->r_resend_mds = -1;
        INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1580,6 +1634,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        req->r_mds = mds;
        req->r_attempts++;
+        if (req->r_inode) {
+                struct ceph_cap *cap =
+                        ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
+                if (cap)
+                        req->r_sent_on_mseq = cap->mseq;
+                else
+                        req->r_sent_on_mseq = -1;
+        }
        dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
             req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
@@ -1914,21 +1977,40 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        result = le32_to_cpu(head->result);
        /*
-         * Tolerate 2 consecutive ESTALEs from the same mds.
+         * Handle an ESTALE
-         * FIXME: we should be looking at the cap migrate_seq.
+         * if we're not talking to the authority, send to them
+         * if the authority has changed while we weren't looking,
+         * send to new authority
+         * Otherwise we just have to return an ESTALE
         */
        if (result == -ESTALE) {
-                req->r_direct_mode = USE_AUTH_MDS;
+                dout("got ESTALE on request %llu", req->r_tid);
-                req->r_num_stale++;
+                if (!req->r_inode) {
-                if (req->r_num_stale <= 2) {
+                        /* do nothing; not an authority problem */
+                } else if (req->r_direct_mode != USE_AUTH_MDS) {
+                        dout("not using auth, setting for that now");
+                        req->r_direct_mode = USE_AUTH_MDS;
                        __do_request(mdsc, req);
                        mutex_unlock(&mdsc->mutex);
                        goto out;
+                } else  {
+                        struct ceph_inode_info *ci = ceph_inode(req->r_inode);
+                        struct ceph_cap *cap =
+                                ceph_get_cap_for_mds(ci, req->r_mds);;
+                        dout("already using auth");
+                        if ((!cap || cap != ci->i_auth_cap) ||
+                            (cap->mseq != req->r_sent_on_mseq)) {
+                                dout("but cap changed, so resending");
+                                __do_request(mdsc, req);
+                                mutex_unlock(&mdsc->mutex);
+                                goto out;
+                        }
                }
-        } else {
+                dout("have to return ESTALE on request %llu", req->r_tid);
-                req->r_num_stale = 0;
        }
        if (head->safe) {
                req->r_got_safe = true;
                __unregister_request(mdsc, req);
@@ -1985,7 +2067,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        if (err == 0) {
                if (result == 0 && rinfo->dir_nr)
                        ceph_readdir_prepopulate(req, req->r_session);
-                ceph_unreserve_caps(&req->r_caps_reservation);
+                ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
        }
        mutex_unlock(&req->r_fill_mutex);
@@ -2005,7 +2087,7 @@ out_err:
        }
        mutex_unlock(&mdsc->mutex);
-        ceph_add_cap_releases(mdsc, req->r_session, -1);
+        ceph_add_cap_releases(mdsc, req->r_session);
        mutex_unlock(&session->s_mutex);
        /* kick calling process */
@@ -2193,9 +2275,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
 static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
                          void *arg)
 {
-        struct ceph_mds_cap_reconnect rec;
+        union {
+                struct ceph_mds_cap_reconnect v2;
+                struct ceph_mds_cap_reconnect_v1 v1;
+        } rec;
+        size_t reclen;
        struct ceph_inode_info *ci;
-        struct ceph_pagelist *pagelist = arg;
+        struct ceph_reconnect_state *recon_state = arg;
+        struct ceph_pagelist *pagelist = recon_state->pagelist;
        char *path;
        int pathlen, err;
        u64 pathbase;
@@ -2228,17 +2315,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
        spin_lock(&inode->i_lock);
        cap->seq = 0;        /* reset cap seq */
        cap->issue_seq = 0;  /* and issue_seq */
-        rec.cap_id = cpu_to_le64(cap->cap_id);
-        rec.pathbase = cpu_to_le64(pathbase);
+        if (recon_state->flock) {
-        rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+                rec.v2.cap_id = cpu_to_le64(cap->cap_id);
-        rec.issued = cpu_to_le32(cap->issued);
+                rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
-        rec.size = cpu_to_le64(inode->i_size);
+                rec.v2.issued = cpu_to_le32(cap->issued);
-        ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
+                rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
-        ceph_encode_timespec(&rec.atime, &inode->i_atime);
+                rec.v2.pathbase = cpu_to_le64(pathbase);
-        rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+                rec.v2.flock_len = 0;
+                reclen = sizeof(rec.v2);
+        } else {
+                rec.v1.cap_id = cpu_to_le64(cap->cap_id);
+                rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
+                rec.v1.issued = cpu_to_le32(cap->issued);
+                rec.v1.size = cpu_to_le64(inode->i_size);
+                ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
+                ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
+                rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
+                rec.v1.pathbase = cpu_to_le64(pathbase);
+                reclen = sizeof(rec.v1);
+        }
        spin_unlock(&inode->i_lock);
-        err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
+        if (recon_state->flock) {
+                int num_fcntl_locks, num_flock_locks;
+                lock_kernel();
+                ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
+                rec.v2.flock_len = (2*sizeof(u32) +
+                                    (num_fcntl_locks+num_flock_locks) *
+                                    sizeof(struct ceph_filelock));
+                err = ceph_pagelist_append(pagelist, &rec, reclen);
+                if (!err)
+                        err = ceph_encode_locks(inode, pagelist,
+                                                num_fcntl_locks,
+                                                num_flock_locks);
+                unlock_kernel();
+        }
 out:
        kfree(path);
@@ -2267,6 +2381,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
        int mds = session->s_mds;
        int err = -ENOMEM;
        struct ceph_pagelist *pagelist;
+        struct ceph_reconnect_state recon_state;
        pr_info("mds%d reconnect start\n", mds);
@@ -2301,7 +2416,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
        err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
        if (err)
                goto fail;
-        err = iterate_session_caps(session, encode_caps_cb, pagelist);
+        recon_state.pagelist = pagelist;
+        recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+        err = iterate_session_caps(session, encode_caps_cb, &recon_state);
        if (err < 0)
                goto fail;
@@ -2326,6 +2444,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
        }
        reply->pagelist = pagelist;
+        if (recon_state.flock)
+                reply->hdr.version = cpu_to_le16(2);
        reply->hdr.data_len = cpu_to_le32(pagelist->length);
        reply->nr_pages = calc_pages_for(0, pagelist->length);
        ceph_con_send(&session->s_con, reply);
@@ -2376,9 +2496,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
                oldstate = ceph_mdsmap_get_state(oldmap, i);
                newstate = ceph_mdsmap_get_state(newmap, i);
-                dout("check_new_map mds%d state %s -> %s (session %s)\n",
+                dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
                     i, ceph_mds_state_name(oldstate),
+                     ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
                     ceph_mds_state_name(newstate),
+                     ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
                     session_state_name(s->s_state));
                if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2428,6 +2550,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
                        wake_up_session_caps(s, 1);
                }
        }
+        for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
+                s = mdsc->sessions[i];
+                if (!s)
+                        continue;
+                if (!ceph_mdsmap_is_laggy(newmap, i))
+                        continue;
+                if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+                    s->s_state == CEPH_MDS_SESSION_HUNG ||
+                    s->s_state == CEPH_MDS_SESSION_CLOSING) {
+                        dout(" connecting to export targets of laggy mds%d\n",
+                             i);
+                        __open_export_target_sessions(mdsc, s);
+                }
+        }
 }
@@ -2715,7 +2852,7 @@ static void delayed_work(struct work_struct *work)
                        send_renew_caps(mdsc, s);
                else
                        ceph_con_keepalive(&s->s_con);
-                ceph_add_cap_releases(mdsc, s, -1);
+                ceph_add_cap_releases(mdsc, s);
                if (s->s_state == CEPH_MDS_SESSION_OPEN ||
                    s->s_state == CEPH_MDS_SESSION_HUNG)
                        ceph_send_cap_releases(mdsc, s);
@@ -2764,6 +2901,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
        spin_lock_init(&mdsc->dentry_lru_lock);
        INIT_LIST_HEAD(&mdsc->dentry_lru);
+        ceph_caps_init(mdsc);
+        ceph_adjust_min_caps(mdsc, client->min_caps);
        return 0;
 }
@@ -2959,6 +3099,7 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
        if (mdsc->mdsmap)
                ceph_mdsmap_destroy(mdsc->mdsmap);
        kfree(mdsc->sessions);
+        ceph_caps_finalize(mdsc);
 }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 952410c60d09..ab7e89f5e344 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
 struct ceph_mds_request {
        u64 r_tid;                   /* transaction id */
        struct rb_node r_node;
+        struct ceph_mds_client *r_mdsc;
        int r_op;                    /* mds op code */
        int r_mds;
@@ -207,8 +208,8 @@ struct ceph_mds_request {
        int               r_attempts;   /* resend attempts */
        int               r_num_fwd;    /* number of forward attempts */
-        int               r_num_stale;
        int               r_resend_mds; /* mds to resend to next, if any*/
+        u32               r_sent_on_mseq; /* cap mseq request was sent at*/
        struct kref       r_kref;
        struct list_head  r_wait;
@@ -267,6 +268,27 @@ struct ceph_mds_client {
        spinlock_t        cap_dirty_lock;   /* protects above items */
        wait_queue_head_t cap_flushing_wq;
+        /*
+         * Cap reservations
+         *
+         * Maintain a per-mds-client pool of preallocated struct ceph_cap,
+         * referenced by struct ceph_cap_reservation.  This ensures that we
+         * preallocate memory needed to successfully process an MDS response.
+         * (If an MDS sends us cap information and we fail to process it, we
+         * will have problems due to the client and MDS being out of sync.)
+         *
+         * Reservations are 'owned' by a ceph_cap_reservation context.
+         */
+        spinlock_t      caps_list_lock;
+        struct          list_head caps_list; /* unused (reserved or
+                                                unreserved) */
+        int             caps_total_count;    /* total caps allocated */
+        int             caps_use_count;      /* in use */
+        int             caps_reserve_count;  /* unused, reserved */
+        int             caps_avail_count;    /* unused, unreserved */
+        int             caps_min_count;      /* keep at least this many
+                                                (unreserved) */
 #ifdef CONFIG_DEBUG_FS
        struct dentry     *debugfs_file;
 #endif
@@ -324,8 +346,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
 }
 extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
-                                 struct ceph_mds_session *session,
+                                 struct ceph_mds_session *session);
-                                 int extra);
 extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
                                   struct ceph_mds_session *session);
@@ -343,4 +364,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
 extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
                                 struct ceph_msg *msg);
+extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
+                                          struct ceph_mds_session *session);
 #endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index c4c498e6dfef..040be6d1150b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                struct ceph_entity_addr addr;
                u32 num_export_targets;
                void *pexport_targets = NULL;
+                struct ceph_timespec laggy_since;
                ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
                global_id = ceph_decode_64(p);
@@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                state_seq = ceph_decode_64(p);
                ceph_decode_copy(p, &addr, sizeof(addr));
                ceph_decode_addr(&addr);
-                *p += sizeof(struct ceph_timespec);
+                ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
                *p += sizeof(u32);
                ceph_decode_32_safe(p, end, namelen, bad);
                *p += namelen;
@@ -122,6 +123,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                        m->m_info[mds].global_id = global_id;
                        m->m_info[mds].state = state;
                        m->m_info[mds].addr = addr;
+                        m->m_info[mds].laggy =
+                                (laggy_since.tv_sec != 0 ||
+                                 laggy_since.tv_nsec != 0);
                        m->m_info[mds].num_export_targets = num_export_targets;
                        if (num_export_targets) {
                                m->m_info[mds].export_targets =
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index eacc131aa5cb..4c5cb0880bba 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -13,6 +13,7 @@ struct ceph_mds_info {
        struct ceph_entity_addr addr;
        s32 state;
        int num_export_targets;
+        bool laggy;
        u32 *export_targets;
 };
@@ -47,6 +48,13 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
        return m->m_info[w].state;
 }
+/* Report whether mds @w is flagged laggy in map @m. */
+static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
+{
+        /* ranks outside [0, m_max_mds) are never reported as laggy */
+        if (w < 0 || w >= m->m_max_mds)
+                return false;
+        return m->m_info[w].laggy;
+}
 extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
 extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
 extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 15167b2daa55..2502d76fcec1 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -108,7 +108,7 @@ void ceph_msgr_exit(void)
        destroy_workqueue(ceph_msgr_wq);
 }
-void ceph_msgr_flush()
+void ceph_msgr_flush(void)
 {
        flush_workqueue(ceph_msgr_wq);
 }
@@ -647,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
        dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
             con->connect_seq, global_seq, proto);
-        con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
+        con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
        con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
        con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
        con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1081,11 +1081,11 @@ static int process_banner(struct ceph_connection *con)
                   sizeof(con->peer_addr)) != 0 &&
            !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
              con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
-                pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
+                pr_warning("wrong peer, want %s/%d, got %s/%d\n",
                           pr_addr(&con->peer_addr.in_addr),
-                           le64_to_cpu(con->peer_addr.nonce),
+                           (int)le32_to_cpu(con->peer_addr.nonce),
                           pr_addr(&con->actual_peer_addr.in_addr),
-                           le64_to_cpu(con->actual_peer_addr.nonce));
+                           (int)le32_to_cpu(con->actual_peer_addr.nonce));
                con->error_msg = "wrong peer at address";
                return -1;
        }
@@ -1123,8 +1123,8 @@ static void fail_protocol(struct ceph_connection *con)
 static int process_connect(struct ceph_connection *con)
 {
-        u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
+        u64 sup_feat = CEPH_FEATURE_SUPPORTED;
-        u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
+        u64 req_feat = CEPH_FEATURE_REQUIRED;
        u64 server_feat = le64_to_cpu(con->in_reply.features);
        dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1302,8 +1302,8 @@ static void process_ack(struct ceph_connection *con)
 static int read_partial_message_section(struct ceph_connection *con,
-                                        struct kvec *section, unsigned int sec_len,
+                                        struct kvec *section,
-                                        u32 *crc)
+                                        unsigned int sec_len, u32 *crc)
 {
        int left;
        int ret;
@@ -1434,7 +1434,8 @@ static int read_partial_message(struct ceph_connection *con)
        /* middle */
        if (m->middle) {
-                ret = read_partial_message_section(con, &m->middle->vec, middle_len,
+                ret = read_partial_message_section(con, &m->middle->vec,
+                                                   middle_len,
                                                   &con->in_middle_crc);
                if (ret <= 0)
                        return ret;
@@ -1920,7 +1921,7 @@ out:
        /*
         * in case we faulted due to authentication, invalidate our
         * current tickets so that we can get new ones.
-         */
+         */
        if (con->auth_retry && con->ops->invalidate_authorizer) {
                dout("calling invalidate_authorizer()\n");
                con->ops->invalidate_authorizer(con);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 54fe01c50706..b2a5a3e4a671 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -349,7 +349,7 @@ out:
 }
 /*
- * statfs
+ * generic requests (e.g., statfs, poolop)
 */
 static struct ceph_mon_generic_request *__lookup_generic_req(
        struct ceph_mon_client *monc, u64 tid)
@@ -442,6 +442,35 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
        return m;
 }
+/*
+ * Register @req with the monitor client, send it, and sleep
+ * (interruptibly) until it is completed.  The request is removed from
+ * the tree again before returning, whether or not the wait finished.
+ *
+ * Returns the wait's error if it was interrupted, else req->result.
+ */
+static int do_generic_request(struct ceph_mon_client *monc,
+                              struct ceph_mon_generic_request *req)
+{
+        int ret;
+        /* assign a tid, track the request and queue it under the mutex */
+        mutex_lock(&monc->mutex);
+        req->tid = ++monc->last_tid;
+        req->request->hdr.tid = cpu_to_le64(req->tid);
+        __insert_generic_request(monc, req);
+        monc->num_generic_requests++;
+        ceph_con_send(monc->con, ceph_msg_get(req->request));
+        mutex_unlock(&monc->mutex);
+        ret = wait_for_completion_interruptible(&req->completion);
+        /* finished or interrupted: stop tracking the request */
+        mutex_lock(&monc->mutex);
+        rb_erase(&req->node, &monc->generic_request_tree);
+        monc->num_generic_requests--;
+        mutex_unlock(&monc->mutex);
+        return ret ? ret : req->result;
+}
+/*
+ * statfs
+ */
 static void handle_statfs_reply(struct ceph_mon_client *monc,
                                struct ceph_msg *msg)
 {
@@ -468,7 +497,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
        return;
 bad:
-        pr_err("corrupt generic reply, no tid\n");
+        pr_err("corrupt generic reply, tid %llu\n", tid);
        ceph_msg_dump(msg);
 }
@@ -487,6 +516,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
        kref_init(&req->kref);
        req->buf = buf;
+        req->buf_len = sizeof(*buf);
        init_completion(&req->completion);
        err = -ENOMEM;
@@ -504,33 +534,134 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
        h->monhdr.session_mon_tid = 0;
        h->fsid = monc->monmap->fsid;
-        /* register request */
+        err = do_generic_request(monc, req);
-        mutex_lock(&monc->mutex);
-        req->tid = ++monc->last_tid;
-        req->request->hdr.tid = cpu_to_le64(req->tid);
-        __insert_generic_request(monc, req);
-        monc->num_generic_requests++;
-        mutex_unlock(&monc->mutex);
-        /* send request and wait */
+out:
-        ceph_con_send(monc->con, ceph_msg_get(req->request));
+        kref_put(&req->kref, release_generic_request);
-        err = wait_for_completion_interruptible(&req->completion);
+        return err;
+}
+/*
+ * pool ops
+ */
+/*
+ * Copy the data blob of a poolop reply into the caller's buffer.
+ *
+ * @src, @src_len: a little-endian u32 length followed by the blob.
+ * @dst, @dst_len: destination; the blob must be exactly @dst_len bytes.
+ *
+ * Returns 0 on success, or -EINVAL if @src_len or the encoded length
+ * does not match @dst_len.
+ */
+static int get_poolop_reply_buf(const char *src, size_t src_len,
+                                char *dst, size_t dst_len)
+{
+        __le32 raw_len;
+        u32 buf_len;
+        if (src_len != sizeof(u32) + dst_len)
+                return -EINVAL;
+        /*
+         * @src is an offset into the message front and may not be
+         * 4-byte aligned, so copy the length out rather than
+         * dereferencing a cast pointer.
+         */
+        memcpy(&raw_len, src, sizeof(raw_len));
+        buf_len = le32_to_cpu(raw_len);
+        if (buf_len != dst_len)
+                return -EINVAL;
+        memcpy(dst, src + sizeof(u32), dst_len);
+        return 0;
+}
+static void handle_poolop_reply(struct ceph_mon_client *monc,
+                                struct ceph_msg *msg)
+{        /* complete the pending generic request matching this reply's tid */
+        struct ceph_mon_generic_request *req;
+        struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
+        u64 tid = le64_to_cpu(msg->hdr.tid);
+        if (msg->front.iov_len < sizeof(*reply))        /* too short for the fixed header */
+                goto bad;
+        dout("handle_poolop_reply %p tid %llu\n", msg, tid);
        mutex_lock(&monc->mutex);
-        rb_erase(&req->node, &monc->generic_request_tree);
+        req = __lookup_generic_req(monc, tid);
-        monc->num_generic_requests--;
+        if (req) {
+                if (req->buf_len &&        /* requester asked for the data blob */
+                    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
+                                     msg->front.iov_len - sizeof(*reply),
+                                     req->buf, req->buf_len) < 0) {
+                        mutex_unlock(&monc->mutex);        /* drop the lock before bailing out */
+                        goto bad;
+                }
+                req->result = le32_to_cpu(reply->reply_code);
+                get_generic_request(req);        /* hold a ref across complete() below */
+        }
        mutex_unlock(&monc->mutex);
+        if (req) {
+                complete(&req->completion);        /* wake the waiter in do_generic_request() */
+                put_generic_request(req);
+        }
+        return;
-        if (!err)
+bad:
-                err = req->result;
+        pr_err("corrupt generic reply, tid %llu\n", tid);
+        ceph_msg_dump(msg);
+}
+/*
+ * Do a synchronous pool op.
+ */
+int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
+                        u32 pool, u64 snapid,
+                        char *buf, int len)
+{        /* build a CEPH_MSG_POOLOP, send it, and wait for the result */
+        struct ceph_mon_generic_request *req;
+        struct ceph_mon_poolop *h;
+        int err;
+        req = kzalloc(sizeof(*req), GFP_NOFS);
+        if (!req)
+                return -ENOMEM;
+        kref_init(&req->kref);
+        req->buf = buf;                /* handle_poolop_reply() copies reply data here */
+        req->buf_len = len;        /* 0: the reply data is not copied out */
+        init_completion(&req->completion);
+        err = -ENOMEM;                /* result if either allocation below fails */
+        req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
+        if (!req->request)
+                goto out;
+        req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
+        if (!req->reply)
+                goto out;
+        /* fill out request */
+        req->request->hdr.version = cpu_to_le16(2);
+        h = req->request->front.iov_base;
+        h->monhdr.have_version = 0;
+        h->monhdr.session_mon = cpu_to_le16(-1);
+        h->monhdr.session_mon_tid = 0;
+        h->fsid = monc->monmap->fsid;
+        h->pool = cpu_to_le32(pool);
+        h->op = cpu_to_le32(op);
+        h->auid = 0;
+        h->snapid = cpu_to_le64(snapid);
+        h->name_len = 0;
+        err = do_generic_request(monc, req);        /* register, send, wait */
 out:
        kref_put(&req->kref, release_generic_request);
        return err;
 }
+/*
+ * Issue a synchronous POOL_OP_CREATE_UNMANAGED_SNAP on @pool.  The reply
+ * payload (sizeof(*snapid) bytes) is copied as-is into *@snapid.
+ * Returns whatever ceph_monc_do_poolop() returns.
+ */
+int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+                            u32 pool, u64 *snapid)
+{
+        char *reply_buf = (char *)snapid;
+        return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, pool,
+                                   0, reply_buf, sizeof(*snapid));
+}
+/*
+ * Issue a synchronous POOL_OP_DELETE_UNMANAGED_SNAP for @snapid on @pool.
+ * No reply payload is requested (NULL buffer, zero length).
+ * Returns whatever ceph_monc_do_poolop() returns.
+ */
+int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+                            u32 pool, u64 snapid)
+{
+        return ceph_monc_do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
+                                   pool, snapid, NULL, 0);
+}
 /*
- * Resend pending statfs requests.
+ * Resend pending generic requests.
 */
 static void __resend_generic_request(struct ceph_mon_client *monc)
 {
@@ -783,6 +914,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
                handle_statfs_reply(monc, msg);
                break;
+        case CEPH_MSG_POOLOP_REPLY:
+                handle_poolop_reply(monc, msg);
+                break;
        case CEPH_MSG_MON_MAP:
                ceph_monc_handle_map(monc, msg);
                break;
@@ -820,6 +955,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
        case CEPH_MSG_MON_SUBSCRIBE_ACK:
                m = ceph_msg_get(monc->m_subscribe_ack);
                break;
+        case CEPH_MSG_POOLOP_REPLY:
        case CEPH_MSG_STATFS_REPLY:
                return get_generic_reply(con, hdr, skip);
        case CEPH_MSG_AUTH_REPLY:
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 174d794321d0..8e396f2c0963 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -50,6 +50,7 @@ struct ceph_mon_generic_request {
        struct rb_node node;
        int result;
        void *buf;
+        int buf_len;
        struct completion completion;
        struct ceph_msg *request;  /* original request */
        struct ceph_msg *reply;    /* and reply */
@@ -111,6 +112,10 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc);
 extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
+                                   u32 pool, u64 *snapid);
+extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
+                                   u32 pool, u64 snapid);
 #endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 892a0298dfdf..680d3d648cac 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -1,5 +1,5 @@
-#ifndef __MSGR_H
+#ifndef CEPH_MSGR_H
-#define __MSGR_H
+#define CEPH_MSGR_H
 /*
 * Data types for message passing layer used by Ceph.
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index e38522347898..bed6391e52c7 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1276,8 +1276,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
        /* it may be a short read due to an object boundary */
        req->r_pages = pages;
-        num_pages = calc_pages_for(off, *plen);
-        req->r_num_pages = num_pages;
        dout("readpages  final extent is %llu~%llu (%d pages)\n",
             off, *plen, req->r_num_pages);
@@ -1319,7 +1317,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
        /* it may be a short write due to an object boundary */
        req->r_pages = pages;
-        req->r_num_pages = calc_pages_for(off, len);
        dout("writepages %llu~%llu (%d pages)\n", off, len,
             req->r_num_pages);
@@ -1476,8 +1473,8 @@ static void put_osd_con(struct ceph_connection *con)
 * authentication
 */
 static int get_authorizer(struct ceph_connection *con,
-                          void **buf, int *len, int *proto,
+                          void **buf, int *len, int *proto,
-                          void **reply_buf, int *reply_len, int force_new)
+                          void **reply_buf, int *reply_len, int force_new)
 {
        struct ceph_osd *o = con->private;
        struct ceph_osd_client *osdc = o->o_osdc;
@@ -1497,7 +1494,7 @@ static int get_authorizer(struct ceph_connection *con,
                        &o->o_authorizer_reply_buf,
                        &o->o_authorizer_reply_buf_len);
                if (ret)
-                return ret;
+                        return ret;
        }
        *proto = ac->protocol;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 416d46adbf87..e31f118f1392 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -424,12 +424,30 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
        kfree(pi);
 }
-void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+/*
+ * Decode one pool record at *p into @pi and advance *p past it,
+ * including the variable-length snapshot data that follows the fixed
+ * header.  The variable-length skips are bounds-checked against @end.
+ * NOTE(review): the fixed-size header copy is not checked here --
+ * confirm that callers verify it before calling.
+ *
+ * Returns 0 on success, -EINVAL if the record would run past @end.
+ */
+static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
 {
+        unsigned n, m;
        ceph_decode_copy(p, &pi->v, sizeof(pi->v));
        calc_pg_masks(pi);
-        *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+        /* num_snaps * snap_info_t */
+        n = le32_to_cpu(pi->v.num_snaps);
+        while (n--) {
+                /* fixed part plus the u32 name length read just below */
+                ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
+                                 sizeof(struct ceph_timespec) +
+                                 sizeof(u32), bad);
+                *p += sizeof(u64) +       /* key */
+                        1 + sizeof(u64) + /* u8, snapid */
+                        sizeof(struct ceph_timespec);
+                m = ceph_decode_32(p);    /* snap name */
+                /* length comes off the wire; check it before skipping */
+                ceph_decode_need(p, end, m, bad);
+                *p += m;
+        }
+        /* the interval count is also wire data; check before skipping */
+        ceph_decode_need(p, end,
+                         le32_to_cpu(pi->v.num_removed_snap_intervals) *
+                         sizeof(u64) * 2, bad);
        *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+        return 0;
+bad:
+        return -EINVAL;
 }
 static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
@@ -571,7 +589,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
                        kfree(pi);
                        goto bad;
                }
-                __decode_pool(p, pi);
+                err = __decode_pool(p, end, pi);
+                if (err < 0)
+                        goto bad;
                __insert_pg_pool(&map->pg_pools, pi);
        }
@@ -760,7 +780,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                        pi->id = pool;
                        __insert_pg_pool(&map->pg_pools, pi);
                }
-                __decode_pool(p, pi);
+                err = __decode_pool(p, end, pi);
+                if (err < 0)
+                        goto bad;
        }
        if (version >= 5 && __decode_pool_names(p, end, map) < 0)
                goto bad;
@@ -833,7 +855,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                                                node)->pgid, pgid) <= 0) {
                        struct ceph_pg_mapping *cur =
                                rb_entry(rbp, struct ceph_pg_mapping, node);
-                        
                        rbp = rb_next(rbp);
                        dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
                        rb_erase(&cur->node, &map->pg_temp);
@@ -1026,8 +1048,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
        ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
                                 pool->v.type, pool->v.size);
        if (ruleno < 0) {
-                pr_err("no crush rule pool %d type %d size %d\n",
+                pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
-                       poolid, pool->v.type, pool->v.size);
+                       poolid, pool->v.crush_ruleset, pool->v.type,
+                       pool->v.size);
                return NULL;
        }
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 8fcc023056c7..6d5247f2e81b 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -1,5 +1,5 @@
-#ifndef __RADOS_H
+#ifndef CEPH_RADOS_H
-#define __RADOS_H
+#define CEPH_RADOS_H
 /*
 * Data types for the Ceph distributed object storage layer RADOS
@@ -203,6 +203,7 @@ enum {
        CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
        CEPH_OSD_OP_CREATE  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+        CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
        /** attrs **/
        /* read */
@@ -272,6 +273,10 @@ static inline int ceph_osd_op_mode_modify(int op)
        return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
 }
+/*
+ * note that the following tmap stuff is also defined in the ceph librados.h
+ * any modification here needs to be updated there
+ */
 #define CEPH_OSD_TMAP_HDR 'h'
 #define CEPH_OSD_TMAP_SET 's'
 #define CEPH_OSD_TMAP_RM  'r'
@@ -297,6 +302,7 @@ enum {
        CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
        CEPH_OSD_FLAG_PGOP = 1024,      /* pg op, no object */
        CEPH_OSD_FLAG_EXEC = 2048,      /* op may exec */
+        CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
 };
 enum {
@@ -350,6 +356,9 @@ struct ceph_osd_op {
                struct {
                        __le64 cookie, count;
                } __attribute__ ((packed)) pgls;
+                struct {
+                        __le64 snapid;
+                } __attribute__ ((packed)) snap;
        };
        __le32 payload_len;
 } __attribute__ ((packed));
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fa87f51e38e1..9922628532b2 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -2,6 +2,7 @@
 #include "ceph_debug.h"
 #include <linux/backing-dev.h>
+#include <linux/ctype.h>
 #include <linux/fs.h>
 #include <linux/inet.h>
 #include <linux/in6.h>
@@ -101,12 +102,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
+/*
+ * sync_fs handler.
+ *
+ * Without @wait, only ceph_flush_dirty_caps() is called.  With @wait,
+ * ceph_osdc_sync() and ceph_mdsc_sync() are called.  Always returns 0.
+ */
-static int ceph_syncfs(struct super_block *sb, int wait)
+static int ceph_sync_fs(struct super_block *sb, int wait)
 {
-        dout("sync_fs %d\n", wait);
+        struct ceph_client *client = ceph_sb_to_client(sb);
+        if (!wait) {
+                dout("sync_fs (non-blocking)\n");
+                ceph_flush_dirty_caps(&client->mdsc);
+                dout("sync_fs (non-blocking) done\n");
+                return 0;
+        }
+        dout("sync_fs (blocking)\n");
        ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
        ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
-        dout("sync_fs %d done\n", wait);
+        dout("sync_fs (blocking) done\n");
        return 0;
 }
@@ -150,9 +160,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
        struct ceph_mount_args *args = client->mount_args;
        if (args->flags & CEPH_OPT_FSID)
-                seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
+                seq_printf(m, ",fsid=%pU", &args->fsid);
-                           le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
-                           le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
        if (args->flags & CEPH_OPT_NOSHARE)
                seq_puts(m, ",noshare");
        if (args->flags & CEPH_OPT_DIRSTAT)
@@ -279,7 +287,7 @@ static const struct super_operations ceph_super_ops = {
        .alloc_inode    = ceph_alloc_inode,
        .destroy_inode  = ceph_destroy_inode,
        .write_inode    = ceph_write_inode,
-        .sync_fs        = ceph_syncfs,
+        .sync_fs        = ceph_sync_fs,
        .put_super      = ceph_put_super,
        .show_options   = ceph_show_options,
        .statfs         = ceph_statfs,
@@ -322,9 +330,6 @@ const char *ceph_msg_type_name(int type)
 * mount options
 */
 enum {
-        Opt_fsidmajor,
-        Opt_fsidminor,
-        Opt_monport,
        Opt_wsize,
        Opt_rsize,
        Opt_osdtimeout,
@@ -339,6 +344,7 @@ enum {
        Opt_congestion_kb,
        Opt_last_int,
        /* int args above */
+        Opt_fsid,
        Opt_snapdirname,
        Opt_name,
        Opt_secret,
@@ -355,9 +361,6 @@ enum {
 };
 static match_table_t arg_tokens = {
-        {Opt_fsidmajor, "fsidmajor=%ld"},
-        {Opt_fsidminor, "fsidminor=%ld"},
-        {Opt_monport, "monport=%d"},
        {Opt_wsize, "wsize=%d"},
        {Opt_rsize, "rsize=%d"},
        {Opt_osdtimeout, "osdtimeout=%d"},
@@ -371,6 +374,7 @@ static match_table_t arg_tokens = {
        {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
        {Opt_congestion_kb, "write_congestion_kb=%d"},
        /* int args above */
+        {Opt_fsid, "fsid=%s"},
        {Opt_snapdirname, "snapdirname=%s"},
        {Opt_name, "name=%s"},
        {Opt_secret, "secret=%s"},
@@ -386,6 +390,36 @@ static match_table_t arg_tokens = {
        {-1, NULL}
 };
+/*
+ * Parse a textual fsid into @fsid.
+ *
+ * @str holds 16 bytes written as pairs of hex digits; punctuation
+ * characters between pairs (e.g. the dashes of a UUID) are skipped.
+ * Parsing stops once 16 bytes have been stored, so anything after
+ * that point is ignored.
+ *
+ * Returns 0 if all 16 bytes were parsed, -EINVAL otherwise (in which
+ * case @fsid may have been partially written).
+ */
+static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+{
+        int i = 0;
+        char tmp[3];
+        int err = -EINVAL;
+        unsigned int d;         /* "%x" stores through an unsigned int * */
+        dout("parse_fsid '%s'\n", str);
+        tmp[2] = 0;
+        while (*str && i < 16) {
+                /* separators between byte pairs are skipped */
+                if (ispunct(*str)) {
+                        str++;
+                        continue;
+                }
+                /* a byte needs two hex digits; anything else stops us */
+                if (!isxdigit(str[0]) || !isxdigit(str[1]))
+                        break;
+                tmp[0] = str[0];
+                tmp[1] = str[1];
+                if (sscanf(tmp, "%x", &d) < 1)
+                        break;
+                fsid->fsid[i] = d & 0xff;
+                i++;
+                str += 2;
+        }
+        if (i == 16)
+                err = 0;
+        dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
+        return err;
+}
 static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                                                const char *dev_name,
@@ -469,12 +503,6 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                        dout("got token %d\n", token);
                }
                switch (token) {
-                case Opt_fsidmajor:
-                        *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
-                        break;
-                case Opt_fsidminor:
-                        *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
-                        break;
                case Opt_ip:
                        err = ceph_parse_ips(argstr[0].from,
                                             argstr[0].to,
@@ -485,6 +513,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                        args->flags |= CEPH_OPT_MYIP;
                        break;
+                case Opt_fsid:
+                        err = parse_fsid(argstr[0].from, &args->fsid);
+                        if (err == 0)
+                                args->flags |= CEPH_OPT_FSID;
+                        break;
                case Opt_snapdirname:
                        kfree(args->snapdir_name);
                        args->snapdir_name = kstrndup(argstr[0].from,
@@ -515,6 +548,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                case Opt_osdkeepalivetimeout:
                        args->osd_keepalive_timeout = intval;
                        break;
+                case Opt_osd_idle_ttl:
+                        args->osd_idle_ttl = intval;
+                        break;
                case Opt_mount_timeout:
                        args->mount_timeout = intval;
                        break;
@@ -630,7 +666,6 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
        /* caps */
        client->min_caps = args->max_readdir;
-        ceph_adjust_min_caps(client->min_caps);
        /* subsystems */
        err = ceph_monc_init(&client->monc, client);
@@ -680,8 +715,6 @@ static void ceph_destroy_client(struct ceph_client *client)
        ceph_monc_stop(&client->monc);
-        ceph_adjust_min_caps(-client->min_caps);
        ceph_debugfs_client_cleanup(client);
        destroy_workqueue(client->wb_wq);
        destroy_workqueue(client->pg_inv_wq);
@@ -706,13 +739,13 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 {
        if (client->have_fsid) {
                if (ceph_fsid_compare(&client->fsid, fsid)) {
-                        pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
+                        pr_err("bad fsid, had %pU got %pU",
-                               PR_FSID(&client->fsid), PR_FSID(fsid));
+                               &client->fsid, fsid);
                        return -1;
                }
        } else {
-                pr_info("client%lld fsid " FSID_FORMAT "\n",
+                pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
-                        client->monc.auth->global_id, PR_FSID(fsid));
+                        fsid);
                memcpy(&client->fsid, fsid, sizeof(*fsid));
                ceph_debugfs_client_init(client);
                client->have_fsid = true;
@@ -1043,8 +1076,6 @@ static int __init init_ceph(void)
        if (ret)
                goto out_msgr;
-        ceph_caps_init();
        ret = register_filesystem(&ceph_fs_type);
        if (ret)
                goto out_icache;
@@ -1069,7 +1100,6 @@ static void __exit exit_ceph(void)
 {
        dout("exit_ceph\n");
        unregister_filesystem(&ceph_fs_type);
-        ceph_caps_finalize();
        destroy_caches();
        ceph_msgr_exit();
        ceph_debugfs_cleanup();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 10a4a406e887..2482d696f0de 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -31,6 +31,12 @@
 #define CEPH_BLOCK         (1 << CEPH_BLOCK_SHIFT)
 /*
+ * Supported features
+ *
+ * The masks are parenthesized so they expand safely inside larger
+ * expressions such as "x & CEPH_FEATURE_SUPPORTED".
+ */
+#define CEPH_FEATURE_SUPPORTED (CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK)
+#define CEPH_FEATURE_REQUIRED  (CEPH_FEATURE_NOSRCADDR)
+/*
 * mount options
 */
 #define CEPH_OPT_FSID             (1<<0)
@@ -560,11 +566,13 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
 /* what the mds thinks we want */
 extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
-extern void ceph_caps_init(void);
+extern void ceph_caps_init(struct ceph_mds_client *mdsc);
-extern void ceph_caps_finalize(void);
+extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
-extern void ceph_adjust_min_caps(int delta);
+extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
-extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
+extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
-extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
+                             struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
+                               struct ceph_cap_reservation *ctx);
 extern void ceph_reservation_status(struct ceph_client *client,
                                    int *total, int *avail, int *used,
                                    int *reserved, int *min);
@@ -738,13 +746,6 @@ extern struct kmem_cache *ceph_file_cachep;
 extern const char *ceph_msg_type_name(int type);
 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
-#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
-        "%02x%02x%02x%02x%02x%02x"
-#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
-                (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7],    \
-                (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11],  \
-                (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
 /* inode.c */
 extern const struct inode_operations ceph_file_iops;
@@ -806,13 +807,16 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
        __ceph_remove_cap(cap);
        spin_unlock(&inode->i_lock);
 }
-extern void ceph_put_cap(struct ceph_cap *cap);
+extern void ceph_put_cap(struct ceph_mds_client *mdsc,
+                         struct ceph_cap *cap);
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
 extern int ceph_fsync(struct file *file, int datasync);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                                    struct ceph_mds_session *session);
+extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
+                                             int mds);
 extern int ceph_get_cap_mds(struct inode *inode);
 extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
 extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
@@ -857,7 +861,7 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages);
 /* dir.c */
 extern const struct file_operations ceph_dir_fops;
 extern const struct inode_operations ceph_dir_iops;
-extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
        ceph_snapdir_dentry_ops;
 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -888,6 +892,14 @@ extern void ceph_debugfs_cleanup(void);
 extern int ceph_debugfs_client_init(struct ceph_client *client);
 extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+/* locks.c */
+extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
+extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
+extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
+extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
+                             int p_locks, int f_locks);
+extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
 static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
 {
        if (dentry && dentry->d_parent)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 68aeebc69681..097a2654c00f 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -337,6 +337,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
 }
 static int __build_xattrs(struct inode *inode)
+        __releases(inode->i_lock)
+        __acquires(inode->i_lock)
 {
        u32 namelen;
        u32 numattr = 0;
diff --git a/fs/cifs/README b/fs/cifs/README
index a7081eeeb85d..7099a526f775 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -301,6 +301,16 @@ A partial list of the supported mount options follows:
  gid           Set the default gid for inodes (similar to above).
  file_mode     If CIFS Unix extensions are not supported by the server
                this overrides the default mode for file inodes.
+  fsc           Enable local disk caching using FS-Cache (off by default). This
+                option could be useful to improve performance on a slow link,
+                heavily loaded server and/or network where reading from the
+                disk is faster than reading from the server (over the network).
+                This could also impact scalability positively as the
+                number of calls to the server is reduced. However, local
+                caching is not suitable for all workloads, e.g. read-once
+                type workloads. So, you need to consider carefully your
+                workload/scenario before using this option. Currently, local
+                disk caching is functional for CIFS files opened as read-only.
  dir_mode      If CIFS Unix extensions are not supported by the server 
                this overrides the default mode for directory inodes.
  port          attempt to contact the server on this tcp port, before
diff --git a/fs/dcache.c b/fs/dcache.c
index 9f2c13417969..166d35d56868 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1905,48 +1905,30 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
 }
 /**
- * __d_path - return the path of a dentry
+ * prepend_path - Prepend path string to a buffer
+ *
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry (may be modified by this function)
- * @buffer: buffer to return value in
+ * @buffer: pointer to the end of the buffer
- * @buflen: buffer length
+ * @buflen: pointer to buffer length
 *
- * Convert a dentry into an ASCII path name. If the entry has been deleted
+ * Caller holds the dcache_lock.
- * the string " (deleted)" is appended. Note that this is ambiguous.
- *
- * Returns a pointer into the buffer or an error code if the
- * path was too long.
- *
- * "buflen" should be positive. Caller holds the dcache_lock.
 *
 * If path is not reachable from the supplied root, then the value of
 * root is changed (without modifying refcounts).
 */
-char *__d_path(const struct path *path, struct path *root,
+static int prepend_path(const struct path *path, struct path *root,
-               char *buffer, int buflen)
+                        char **buffer, int *buflen)
 {
        struct dentry *dentry = path->dentry;
        struct vfsmount *vfsmnt = path->mnt;
-        char *end = buffer + buflen;
+        bool slash = false;
-        char *retval;
+        int error = 0;
        spin_lock(&vfsmount_lock);
-        prepend(&end, &buflen, "\0", 1);
+        while (dentry != root->dentry || vfsmnt != root->mnt) {
-        if (d_unlinked(dentry) &&
-                (prepend(&end, &buflen, " (deleted)", 10) != 0))
-                        goto Elong;
-        if (buflen < 1)
-                goto Elong;
-        /* Get '/' right */
-        retval = end-1;
-        *retval = '/';
-        for (;;) {
                struct dentry * parent;
-                if (dentry == root->dentry && vfsmnt == root->mnt)
-                        break;
                if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
                        /* Global root? */
                        if (vfsmnt->mnt_parent == vfsmnt) {
@@ -1958,28 +1940,88 @@ char *__d_path(const struct path *path, struct path *root,
                }
                parent = dentry->d_parent;
                prefetch(parent);
-                if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
+                error = prepend_name(buffer, buflen, &dentry->d_name);
-                    (prepend(&end, &buflen, "/", 1) != 0))
+                if (!error)
-                        goto Elong;
+                        error = prepend(buffer, buflen, "/", 1);
-                retval = end;
+                if (error)
+                        break;
+                slash = true;
                dentry = parent;
        }
 out:
+        if (!error && !slash)
+                error = prepend(buffer, buflen, "/", 1);
        spin_unlock(&vfsmount_lock);
-        return retval;
+        return error;
 global_root:
-        retval += 1;    /* hit the slash */
+        /*
-        if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
+         * Filesystems needing to implement special "root names"
-                goto Elong;
+         * should do so with ->d_dname()
+         */
+        if (IS_ROOT(dentry) &&
+            (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
+                WARN(1, "Root dentry has weird name <%.*s>\n",
+                     (int) dentry->d_name.len, dentry->d_name.name);
+        }
        root->mnt = vfsmnt;
        root->dentry = dentry;
        goto out;
+}
-Elong:
+/**
-        retval = ERR_PTR(-ENAMETOOLONG);
+ * __d_path - return the path of a dentry
-        goto out;
+ * @path: the dentry/vfsmount to report
+ * @root: root vfsmnt/dentry (may be modified by this function)
+ * @buffer: buffer to return value in
+ * @buflen: buffer length
+ *
+ * Convert a dentry into an ASCII path name.
+ *
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
+ *
+ * "buflen" should be positive. Caller holds the dcache_lock.
+ *
+ * If path is not reachable from the supplied root, then the value of
+ * root is changed (without modifying refcounts).
+ */
+char *__d_path(const struct path *path, struct path *root,
+               char *buf, int buflen)
+{
+        char *res = buf + buflen;
+        int error;
+        /* res starts at the end of buf, so the terminator goes in first */
+        prepend(&res, &buflen, "\0", 1);
+        error = prepend_path(path, root, &res, &buflen);
+        if (error)
+                return ERR_PTR(error);
+        return res;
+}
+/*
+ * Like __d_path(), but works through *buf / *buflen and puts
+ * " (deleted)" at the end of the path of an unlinked dentry.
+ *
+ * Returns 0 on success, or the error reported by prepend() or
+ * prepend_path().
+ */
+static int path_with_deleted(const struct path *path, struct path *root,
+                                 char **buf, int *buflen)
+{
+        prepend(buf, buflen, "\0", 1);
+        if (d_unlinked(path->dentry)) {
+                int error = prepend(buf, buflen, " (deleted)", 10);
+                if (error)
+                        return error;
+        }
+        return prepend_path(path, root, buf, buflen);
+}
+/* Prepend the 13-character marker "(unreachable)"; returns prepend()'s result. */
+static int prepend_unreachable(char **buffer, int *buflen)
+{
+        return prepend(buffer, buflen, "(unreachable)", 13);
 }
 /**
@@ -2000,9 +2042,10 @@ Elong:
 */
 char *d_path(const struct path *path, char *buf, int buflen)
 {
-        char *res;
+        char *res = buf + buflen;
        struct path root;
        struct path tmp;
+        int error;
        /*
         * We have various synthetic filesystems that never get mounted.  On
@@ -2014,19 +2057,51 @@ char *d_path(const struct path *path, char *buf, int buflen)
        if (path->dentry->d_op && path->dentry->d_op->d_dname)
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
-        read_lock(&current->fs->lock);
+        get_fs_root(current->fs, &root);
-        root = current->fs->root;
-        path_get(&root);
-        read_unlock(&current->fs->lock);
        spin_lock(&dcache_lock);
        tmp = root;
-        res = __d_path(path, &tmp, buf, buflen);
+        error = path_with_deleted(path, &tmp, &res, &buflen);
+        if (error)
+                res = ERR_PTR(error);
        spin_unlock(&dcache_lock);
        path_put(&root);
        return res;
 }
 EXPORT_SYMBOL(d_path);
+/**
+ * d_path_with_unreachable - return the path of a dentry
+ * @path: path to report
+ * @buf: buffer to return value in
+ * @buflen: buffer length
+ *
+ * The difference from d_path() is that this prepends "(unreachable)"
+ * to paths which are unreachable from the current process' root.
+ *
+ * Dentries that provide a ->d_dname() method are handed to that method
+ * and its return value is passed through unchanged.
+ *
+ * Returns the start of the resulting string on success, or
+ * ERR_PTR(error) if one of the prepend steps fails.
+ */
+char *d_path_with_unreachable(const struct path *path, char *buf, int buflen)
+{
+        char *res = buf + buflen;
+        struct path root;
+        struct path tmp;
+        int error;
+        if (path->dentry->d_op && path->dentry->d_op->d_dname)
+                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
+        get_fs_root(current->fs, &root);
+        spin_lock(&dcache_lock);
+        /* work on a copy: the helper may rewrite the root it is given */
+        tmp = root;
+        error = path_with_deleted(path, &tmp, &res, &buflen);
+        /* a changed tmp means the walk did not end at our root */
+        if (!error && !path_equal(&tmp, &root))
+                error = prepend_unreachable(&res, &buflen);
+        spin_unlock(&dcache_lock);
+        path_put(&root);
+        if (error)
+                res =  ERR_PTR(error);
+        return res;
+}
 /*
 * Helper function for dentry_operations.d_dname() members
 */
@@ -2129,27 +2204,30 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
        if (!page)
                return -ENOMEM;
-        read_lock(&current->fs->lock);
+        get_fs_root_and_pwd(current->fs, &root, &pwd);
-        pwd = current->fs->pwd;
-        path_get(&pwd);
-        root = current->fs->root;
-        path_get(&root);
-        read_unlock(&current->fs->lock);
        error = -ENOENT;
        spin_lock(&dcache_lock);
        if (!d_unlinked(pwd.dentry)) {
                unsigned long len;
                struct path tmp = root;
-                char * cwd;
+                char *cwd = page + PAGE_SIZE;
+                int buflen = PAGE_SIZE;
-                cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
+                prepend(&cwd, &buflen, "\0", 1);
+                error = prepend_path(&pwd, &tmp, &cwd, &buflen);
                spin_unlock(&dcache_lock);
-                error = PTR_ERR(cwd);
+                if (error)
-                if (IS_ERR(cwd))
                        goto out;
+                /* Unreachable from current root */
+                if (!path_equal(&tmp, &root)) {
+                        error = prepend_unreachable(&cwd, &buflen);
+                        if (error)
+                                goto out;
+                }
                error = -ERANGE;
                len = PAGE_SIZE + page - cwd;
                if (len <= size) {
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index f9bfe2b501d5..68cb23e3bb98 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -30,9 +30,6 @@
 * along with exofs; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
-#include <linux/buffer_head.h>
 #include "exofs.h"
 static int exofs_release_file(struct inode *inode, struct file *filp)
@@ -40,19 +37,27 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
+/* exofs_file_fsync - flush the inode to disk
+ *
+ *   Note, in exofs all metadata is written as part of the inode, regardless.
+ *   The writeout is synchronous.
+ */
 static int exofs_file_fsync(struct file *filp, int datasync)
 {
        int ret;
-        struct address_space *mapping = filp->f_mapping;
+        struct inode *inode = filp->f_mapping->host;
-        struct inode *inode = mapping->host;
+        struct writeback_control wbc = {
+                .sync_mode = WB_SYNC_ALL,
+                .nr_to_write = 0, /* metadata-only; caller takes care of data */
+        };
        struct super_block *sb;
-        ret = filemap_write_and_wait(mapping);
+        if (!(inode->i_state & I_DIRTY))
-        if (ret)
+                return 0;
-                return ret;
+        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+                return 0;
-        /* sync the inode attributes */
+        ret = sync_inode(inode, &wbc);
-        ret = write_inode_now(inode, 1);
        /* This is a good place to write the sb */
        /* TODO: Sechedule an sb-sync on create */
@@ -65,9 +70,9 @@ static int exofs_file_fsync(struct file *filp, int datasync)
 static int exofs_flush(struct file *file, fl_owner_t id)
 {
-        exofs_file_fsync(file, 1);
+        int ret = vfs_fsync(file, 0);
        /* TODO: Flush the OSD target */
-        return 0;
+        return ret;
 }
 const struct file_operations exofs_file_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 088cb476b68a..eb7368ebd8cd 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -32,9 +32,6 @@
 */
 #include <linux/slab.h>
-#include <linux/writeback.h>
-#include <linux/buffer_head.h>
-#include <scsi/scsi_device.h>
 #include "exofs.h"
@@ -773,15 +770,13 @@ static int exofs_releasepage(struct page *page, gfp_t gfp)
 {
        EXOFS_DBGMSG("page 0x%lx\n", page->index);
        WARN_ON(1);
-        return try_to_free_buffers(page);
+        return 0;
 }
 static void exofs_invalidatepage(struct page *page, unsigned long offset)
 {
-        EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
+        EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset);
        WARN_ON(1);
-        block_invalidatepage(page, offset);
 }
 const struct address_space_operations exofs_aops = {
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
index e2732203fa93..6550bf70e41d 100644
--- a/fs/exofs/ios.c
+++ b/fs/exofs/ios.c
@@ -305,8 +305,6 @@ int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
 struct _striping_info {
        u64 obj_offset;
        u64 group_length;
-        u64 total_group_length;
-        u64 Major;
        unsigned dev;
        unsigned unit_off;
 };
@@ -343,8 +341,6 @@ static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset,
                                  (M * group_depth * stripe_unit);
        si->group_length = T - H;
-        si->total_group_length = T;
-        si->Major = M;
 }
 static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
@@ -392,20 +388,19 @@ static int _add_stripe_unit(struct exofs_io_state *ios,  unsigned *cur_pg,
 }
 static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
-                              struct _striping_info *si, unsigned first_comp)
+                              struct _striping_info *si)
 {
        unsigned stripe_unit = ios->layout->stripe_unit;
        unsigned mirrors_p1 = ios->layout->mirrors_p1;
        unsigned devs_in_group = ios->layout->group_width * mirrors_p1;
        unsigned dev = si->dev;
        unsigned first_dev = dev - (dev % devs_in_group);
-        unsigned comp = first_comp + (dev - first_dev);
        unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0;
        unsigned cur_pg = ios->pages_consumed;
        int ret = 0;
        while (length) {
-                struct exofs_per_dev_state *per_dev = &ios->per_dev[comp];
+                struct exofs_per_dev_state *per_dev = &ios->per_dev[dev];
                unsigned cur_len, page_off = 0;
                if (!per_dev->length) {
@@ -424,11 +419,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
                                cur_len = stripe_unit;
                        }
-                        if (max_comp < comp)
+                        if (max_comp < dev)
-                                max_comp = comp;
+                                max_comp = dev;
-                        dev += mirrors_p1;
-                        dev = (dev % devs_in_group) + first_dev;
                } else {
                        cur_len = stripe_unit;
                }
@@ -440,8 +432,8 @@ static int _prepare_one_group(struct exofs_io_state *ios, u64 length,
                if (unlikely(ret))
                        goto out;
-                comp += mirrors_p1;
+                dev += mirrors_p1;
-                comp = (comp % devs_in_group) + first_comp;
+                dev = (dev % devs_in_group) + first_dev;
                length -= cur_len;
        }
@@ -454,18 +446,15 @@ out:
 static int _prepare_for_striping(struct exofs_io_state *ios)
 {
        u64 length = ios->length;
+        u64 offset = ios->offset;
        struct _striping_info si;
-        unsigned devs_in_group = ios->layout->group_width *
-                                 ios->layout->mirrors_p1;
-        unsigned first_comp = 0;
        int ret = 0;
-        _calc_stripe_info(ios, ios->offset, &si);
        if (!ios->pages) {
                if (ios->kern_buff) {
                        struct exofs_per_dev_state *per_dev = &ios->per_dev[0];
+                        _calc_stripe_info(ios, ios->offset, &si);
                        per_dev->offset = si.obj_offset;
                        per_dev->dev = si.dev;
@@ -479,26 +468,17 @@ static int _prepare_for_striping(struct exofs_io_state *ios)
        }
        while (length) {
+                _calc_stripe_info(ios, offset, &si);
                if (length < si.group_length)
                        si.group_length = length;
-                ret = _prepare_one_group(ios, si.group_length, &si, first_comp);
+                ret = _prepare_one_group(ios, si.group_length, &si);
                if (unlikely(ret))
                        goto out;
+                offset += si.group_length;
                length -= si.group_length;
-                si.group_length = si.total_group_length;
-                si.unit_off = 0;
-                ++si.Major;
-                si.obj_offset = si.Major * ios->layout->stripe_unit *
-                                                ios->layout->group_depth;
-                si.dev = (si.dev - (si.dev % devs_in_group)) + devs_in_group;
-                si.dev %= ios->layout->s_numdevs;
-                first_comp += devs_in_group;
-                first_comp %= ios->layout->s_numdevs;
        }
 out:
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 32cfd61def5f..047e92fa3af8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -31,7 +31,6 @@
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
-#include <linux/smp_lock.h>
 #include <linux/string.h>
 #include <linux/parser.h>
 #include <linux/vfs.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 9d175d623aab..6769fd0f35b8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -767,11 +767,22 @@ void kill_fasync(struct fasync_struct **fp, int sig, int band)
 }
 EXPORT_SYMBOL(kill_fasync);
-static int __init fasync_init(void)
+static int __init fcntl_init(void)
 {
+        /* please add new bits here to ensure allocation uniqueness */
+        BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+                O_RDONLY        | O_WRONLY      | O_RDWR        |
+                O_CREAT         | O_EXCL        | O_NOCTTY      |
+                O_TRUNC         | O_APPEND      | O_NONBLOCK    |
+                __O_SYNC        | O_DSYNC       | FASYNC        |
+                O_DIRECT        | O_LARGEFILE   | O_DIRECTORY   |
+                O_NOFOLLOW      | O_NOATIME     | O_CLOEXEC     |
+                FMODE_EXEC
+                ));
        fasync_cache = kmem_cache_create("fasync_cache",
                sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
        return 0;
 }
-module_init(fasync_init)
+module_init(fcntl_init)
diff --git a/fs/file.c b/fs/file.c
index cccaead962c2..0be344755c02 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -39,28 +39,27 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */
 */
 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-static inline void * alloc_fdmem(unsigned int size)
+static inline void *alloc_fdmem(unsigned int size)
 {
-        if (size <= PAGE_SIZE)
+        void *data;
-                return kmalloc(size, GFP_KERNEL);
-        else
+        data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
-                return vmalloc(size);
+        if (data != NULL)
+                return data;
+        return vmalloc(size);
 }
-static inline void free_fdarr(struct fdtable *fdt)
+static void free_fdmem(void *ptr)
 {
-        if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
+        is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
-                kfree(fdt->fd);
-        else
-                vfree(fdt->fd);
 }
-static inline void free_fdset(struct fdtable *fdt)
+static void __free_fdtable(struct fdtable *fdt)
 {
-        if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
+        free_fdmem(fdt->fd);
-                kfree(fdt->open_fds);
+        free_fdmem(fdt->open_fds);
-        else
+        kfree(fdt);
-                vfree(fdt->open_fds);
 }
 static void free_fdtable_work(struct work_struct *work)
@@ -75,9 +74,8 @@ static void free_fdtable_work(struct work_struct *work)
        spin_unlock_bh(&f->lock);
        while(fdt) {
                struct fdtable *next = fdt->next;
-                vfree(fdt->fd);
-                free_fdset(fdt);
+                __free_fdtable(fdt);
-                kfree(fdt);
                fdt = next;
        }
 }
@@ -98,7 +96,7 @@ void free_fdtable_rcu(struct rcu_head *rcu)
                                container_of(fdt, struct files_struct, fdtab));
                return;
        }
-        if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
+        if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
                kfree(fdt->fd);
                kfree(fdt->open_fds);
                kfree(fdt);
@@ -183,7 +181,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
        return fdt;
 out_arr:
-        free_fdarr(fdt);
+        free_fdmem(fdt->fd);
 out_fdt:
        kfree(fdt);
 out:
@@ -213,9 +211,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
         * caller and alloc_fdtable().  Cheaper to catch it here...
         */
        if (unlikely(new_fdt->max_fds <= nr)) {
-                free_fdarr(new_fdt);
+                __free_fdtable(new_fdt);
-                free_fdset(new_fdt);
-                kfree(new_fdt);
                return -EMFILE;
        }
        /*
@@ -231,9 +227,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
                        free_fdtable(cur_fdt);
        } else {
                /* Somebody else expanded, so undo our attempt */
-                free_fdarr(new_fdt);
+                __free_fdtable(new_fdt);
-                free_fdset(new_fdt);
-                kfree(new_fdt);
        }
        return 1;
 }
@@ -323,11 +317,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
        while (unlikely(open_files > new_fdt->max_fds)) {
                spin_unlock(&oldf->file_lock);
-                if (new_fdt != &newf->fdtab) {
+                if (new_fdt != &newf->fdtab)
-                        free_fdarr(new_fdt);
+                        __free_fdtable(new_fdt);
-                        free_fdset(new_fdt);
-                        kfree(new_fdt);
-                }
                new_fdt = alloc_fdtable(open_files - 1);
                if (!new_fdt) {
@@ -337,9 +328,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
                /* beyond sysctl_nr_open; nothing to do */
                if (unlikely(new_fdt->max_fds < open_files)) {
-                        free_fdarr(new_fdt);
+                        __free_fdtable(new_fdt);
-                        free_fdset(new_fdt);
-                        kfree(new_fdt);
                        *errorp = -EMFILE;
                        goto out_release;
                }
diff --git a/fs/file_table.c b/fs/file_table.c
index b8a0bb63cbd7..edecd36fed9b 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -230,15 +230,6 @@ static void __fput(struct file *file)
        might_sleep();
        fsnotify_close(file);
-        /*
-         * fsnotify_create_event may have taken one or more references on this
-         * file.  If it did so it left one reference for us to drop to make sure
-         * its calls to fput could not prematurely destroy the file.
-         */
-        if (atomic_long_read(&file->f_count))
-                return fput(file);
        /*
         * The function eventpoll_release() should be the first called
         * in the file cleanup chain.
@@ -298,11 +289,20 @@ struct file *fget(unsigned int fd)
 EXPORT_SYMBOL(fget);
 /*
- * Lightweight file lookup - no refcnt increment if fd table isn't shared. 
+ * Lightweight file lookup - no refcnt increment if fd table isn't shared.
- * You can use this only if it is guranteed that the current task already 
+ *
- * holds a refcnt to that file. That check has to be done at fget() only
+ * You can use this instead of fget if you satisfy all of the following
- * and a flag is returned to be passed to the corresponding fput_light().
+ * conditions:
- * There must not be a cloning between an fget_light/fput_light pair.
+ * 1) You must call fput_light before exiting the syscall and returning control
+ *    to userspace (i.e. you cannot remember the returned struct file * after
+ *    returning to userspace).
+ * 2) You must not call filp_close on the returned struct file * in between
+ *    calls to fget_light and fput_light.
+ * 3) You must not clone the current task in between the calls to fget_light
+ *    and fput_light.
+ *
+ * The fput_needed flag returned by fget_light should be passed to the
+ * corresponding fput_light.
 */
 struct file *fget_light(unsigned int fd, int *fput_needed)
 {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2f76c4a081a2..7d9d06ba184b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -68,7 +68,7 @@ int nr_pdflush_threads;
 */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
-        return !list_empty(&bdi->work_list);
+        return test_bit(BDI_writeback_running, &bdi->state);
 }
 static void bdi_queue_work(struct backing_dev_info *bdi,
@@ -249,10 +249,18 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 /*
 * Queue all expired dirty inodes for io, eldest first.
+ * Before
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    gf         edc     BA
+ * After
+ *         newly dirtied     b_dirty    b_io    b_more_io
+ *         =============>    g          fBAedc
+ *                                           |
+ *                                           +--> dequeue for IO
 */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-        list_splice_init(&wb->b_more_io, wb->b_io.prev);
+        list_splice_init(&wb->b_more_io, &wb->b_io);
        move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
@@ -363,62 +371,35 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
        spin_lock(&inode_lock);
        inode->i_state &= ~I_SYNC;
        if (!(inode->i_state & I_FREEING)) {
-                if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
+                if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-                        /*
-                         * More pages get dirtied by a fast dirtier.
-                         */
-                        goto select_queue;
-                } else if (inode->i_state & I_DIRTY) {
-                        /*
-                         * At least XFS will redirty the inode during the
-                         * writeback (delalloc) and on io completion (isize).
-                         */
-                        redirty_tail(inode);
-                } else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
                        /*
                         * We didn't write back all the pages.  nfs_writepages()
-                         * sometimes bales out without doing anything. Redirty
+                         * sometimes bails out without doing anything.
-                         * the inode; Move it from b_io onto b_more_io/b_dirty.
                         */
-                        /*
+                        inode->i_state |= I_DIRTY_PAGES;
-                         * akpm: if the caller was the kupdate function we put
+                        if (wbc->nr_to_write <= 0) {
-                         * this inode at the head of b_dirty so it gets first
-                         * consideration.  Otherwise, move it to the tail, for
-                         * the reasons described there.  I'm not really sure
-                         * how much sense this makes.  Presumably I had a good
-                         * reasons for doing it this way, and I'd rather not
-                         * muck with it at present.
-                         */
-                        if (wbc->for_kupdate) {
                                /*
-                                 * For the kupdate function we move the inode
+                                 * slice used up: queue for next turn
-                                 * to b_more_io so it will get more writeout as
-                                 * soon as the queue becomes uncongested.
                                 */
-                                inode->i_state |= I_DIRTY_PAGES;
+                                requeue_io(inode);
-select_queue:
-                                if (wbc->nr_to_write <= 0) {
-                                        /*
-                                         * slice used up: queue for next turn
-                                         */
-                                        requeue_io(inode);
-                                } else {
-                                        /*
-                                         * somehow blocked: retry later
-                                         */
-                                        redirty_tail(inode);
-                                }
                        } else {
                                /*
-                                 * Otherwise fully redirty the inode so that
+                                 * Writeback blocked by something other than
-                                 * other inodes on this superblock will get some
+                                 * congestion. Delay the inode for some time to
-                                 * writeout.  Otherwise heavy writing to one
+                                 * avoid spinning on the CPU (100% iowait)
-                                 * file would indefinitely suspend writeout of
+                                 * retrying writeback of the dirty page/inode
-                                 * all the other files.
+                                 * that cannot be performed immediately.
                                 */
-                                inode->i_state |= I_DIRTY_PAGES;
                                redirty_tail(inode);
                        }
+                } else if (inode->i_state & I_DIRTY) {
+                        /*
+                         * Filesystems can dirty the inode during writeback
+                         * operations, such as delayed allocation during
+                         * submission or metadata updates after data IO
+                         * completion.
+                         */
+                        redirty_tail(inode);
                } else if (atomic_read(&inode->i_count)) {
                        /*
                         * The inode is clean, inuse
@@ -590,7 +571,7 @@ static inline bool over_bground_thresh(void)
 {
        unsigned long background_thresh, dirty_thresh;
-        get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+        global_dirty_limits(&background_thresh, &dirty_thresh);
        return (global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
@@ -759,6 +740,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
        struct wb_writeback_work *work;
        long wrote = 0;
+        set_bit(BDI_writeback_running, &wb->bdi->state);
        while ((work = get_next_work_item(bdi)) != NULL) {
                /*
                 * Override sync mode, in case we must wait for completion
@@ -785,6 +767,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
         * Check for periodic writeback, kupdated() style
         */
        wrote += wb_check_old_data_flush(wb);
+        clear_bit(BDI_writeback_running, &wb->bdi->state);
        return wrote;
 }
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index eee059052db5..1ee40eb9a2c0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -106,12 +106,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
                fs->in_exec = 0;
                rwlock_init(&fs->lock);
                fs->umask = old->umask;
-                read_lock(&old->lock);
+                get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
-                fs->root = old->root;
-                path_get(&old->root);
-                fs->pwd = old->pwd;
-                path_get(&old->pwd);
-                read_unlock(&old->lock);
        }
        return fs;
 }
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 6a026441c5a6..f6aad48d38a8 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -321,17 +321,11 @@ void fscache_put_context(struct fscache_cookie *cookie, void *context)
 #define dbgprintk(FMT, ...) \
        printk(KERN_DEBUG "[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-/* make sure we maintain the format strings, even when debugging is disabled */
-static inline __attribute__((format(printf, 1, 2)))
-void _dbprintk(const char *fmt, ...)
-{
-}
 #define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
 #define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
 #define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-#define kjournal(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #ifdef __KDEBUG
 #define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
@@ -358,9 +352,9 @@ do {						\
 } while (0)
 #else
-#define _enter(FMT, ...) _dbprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
+#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) _dbprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
+#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) _dbprintk(FMT, ##__VA_ARGS__)
+#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
 #endif
 /*
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6b4dcd4f2943..5a44811b5027 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -722,7 +722,12 @@ root_found:
        }
        s->s_magic = ISOFS_SUPER_MAGIC;
-        s->s_maxbytes = 0xffffffff; /* We can handle files up to 4 GB */
+        /*
+         * With multi-extent files, file size is only limited by the maximum
+         * size of a file system, which is 8 TB.
+         */
+        s->s_maxbytes = 0x80000000000LL;
        /*
         * The CDROM is read-only, has no nodes (devices) on it, and since
diff --git a/fs/namei.c b/fs/namei.c
index 13ff4abdbdca..17ea76bf2fbe 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -483,13 +483,8 @@ ok:
 static __always_inline void set_root(struct nameidata *nd)
 {
-        if (!nd->root.mnt) {
+        if (!nd->root.mnt)
-                struct fs_struct *fs = current->fs;
+                get_fs_root(current->fs, &nd->root);
-                read_lock(&fs->lock);
-                nd->root = fs->root;
-                path_get(&nd->root);
-                read_unlock(&fs->lock);
-        }
 }
 static int link_path_walk(const char *, struct nameidata *);
@@ -1015,11 +1010,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
                nd->path = nd->root;
                path_get(&nd->root);
        } else if (dfd == AT_FDCWD) {
-                struct fs_struct *fs = current->fs;
+                get_fs_pwd(current->fs, &nd->path);
-                read_lock(&fs->lock);
-                nd->path = fs->pwd;
-                path_get(&fs->pwd);
-                read_unlock(&fs->lock);
        } else {
                struct dentry *dentry;
diff --git a/fs/namespace.c b/fs/namespace.c
index 66c4f7e781cb..2e10cb19c5b0 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -788,7 +788,6 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
                { MNT_NOATIME, ",noatime" },
                { MNT_NODIRATIME, ",nodiratime" },
                { MNT_RELATIME, ",relatime" },
-                { MNT_STRICTATIME, ",strictatime" },
                { 0, NULL }
        };
        const struct proc_fs_info *fs_infop;
@@ -2213,10 +2212,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                goto out1;
        }
-        read_lock(&current->fs->lock);
+        get_fs_root(current->fs, &root);
-        root = current->fs->root;
-        path_get(&current->fs->root);
-        read_unlock(&current->fs->lock);
        down_write(&namespace_sem);
        mutex_lock(&old.dentry->d_inode->i_mutex);
        error = -EINVAL;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index cc1bb33b59b8..26a510a7be09 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -100,3 +100,20 @@ config NFS_FSCACHE
        help
          Say Y here if you want NFS data to be cached locally on disc through
          the general filesystem cache manager
+config NFS_USE_LEGACY_DNS
+        bool "Use the legacy NFS DNS resolver"
+        depends on NFS_V4
+        help
+          The kernel now provides a method for translating a host name into an
+          IP address.  Select Y here if you would rather use your own DNS
+          resolver script.
+          If unsure, say N
+config NFS_USE_KERNEL_DNS
+        bool
+        depends on NFS_V4 && !NFS_USE_LEGACY_DNS
+        select DNS_RESOLVER
+        select KEYS
+        default y
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 36dfdae95123..e17b49e2eabd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -45,7 +45,7 @@ unsigned short nfs_callback_tcpport;
 unsigned short nfs_callback_tcpport6;
 #define NFS_CALLBACK_MAXPORTNR (65535U)
-static int param_set_portnr(const char *val, struct kernel_param *kp)
+static int param_set_portnr(const char *val, const struct kernel_param *kp)
 {
        unsigned long num;
        int ret;
@@ -58,11 +58,10 @@ static int param_set_portnr(const char *val, struct kernel_param *kp)
        *((unsigned int *)kp->arg) = num;
        return 0;
 }
+static struct kernel_param_ops param_ops_portnr = {
-static int param_get_portnr(char *buffer, struct kernel_param *kp)
+        .set = param_set_portnr,
-{
+        .get = param_get_uint,
-        return param_get_uint(buffer, kp);
+};
-}
 #define param_check_portnr(name, p) __param_check(name, p, unsigned int);
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 76fd235d0024..dba50a5625db 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -6,6 +6,29 @@
 * Resolves DNS hostnames into valid ip addresses
 */
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+#include <linux/sunrpc/clnt.h>
+#include <linux/dns_resolver.h>
+ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
+                struct sockaddr *sa, size_t salen)
+{
+        ssize_t ret;
+        char *ip_addr = NULL;
+        int ip_len;
+        ip_len = dns_query(NULL, name, namelen, NULL, &ip_addr, NULL);
+        if (ip_len > 0)
+                ret = rpc_pton(ip_addr, ip_len, sa, salen);
+        else
+                ret = -ESRCH;
+        kfree(ip_addr);
+        return ret;
+}
+#else
 #include <linux/hash.h>
 #include <linux/string.h>
 #include <linux/kmod.h>
@@ -346,3 +369,4 @@ void nfs_dns_resolver_destroy(void)
        nfs_cache_unregister(&nfs_dns_resolve);
 }
+#endif
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h
index a3f0938babf7..199bb5543a91 100644
--- a/fs/nfs/dns_resolve.h
+++ b/fs/nfs/dns_resolve.h
@@ -6,8 +6,20 @@
 #define NFS_DNS_HOSTNAME_MAXLEN (128)
+#ifdef CONFIG_NFS_USE_KERNEL_DNS
+static inline int nfs_dns_resolver_init(void)
+{
+        return 0;
+}
+static inline void nfs_dns_resolver_destroy(void)
+{}
+#else
 extern int nfs_dns_resolver_init(void);
 extern void nfs_dns_resolver_destroy(void);
+#endif
 extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen,
                struct sockaddr *sa, size_t salen);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index eb8f73c9c131..756566fe8449 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -17,9 +17,9 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
            old->data_type == new->data_type &&
            old->tgid == new->tgid) {
                switch (old->data_type) {
-                case (FSNOTIFY_EVENT_FILE):
+                case (FSNOTIFY_EVENT_PATH):
-                        if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
+                        if ((old->path.mnt == new->path.mnt) &&
-                            (old->file->f_path.dentry == new->file->f_path.dentry))
+                            (old->path.dentry == new->path.dentry))
                                return true;
                case (FSNOTIFY_EVENT_NONE):
                        return true;
@@ -174,7 +174,7 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
                return false;
        /* if we don't have enough info to send an event to userspace say no */
-        if (data_type != FSNOTIFY_EVENT_FILE)
+        if (data_type != FSNOTIFY_EVENT_PATH)
                return false;
        if (inode_mark && vfsmnt_mark) {
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 25a3b4dfcf61..032b837fcd11 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -65,7 +65,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
        if (client_fd < 0)
                return client_fd;
-        if (event->data_type != FSNOTIFY_EVENT_FILE) {
+        if (event->data_type != FSNOTIFY_EVENT_PATH) {
                WARN_ON(1);
                put_unused_fd(client_fd);
                return -EINVAL;
@@ -75,8 +75,8 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
         * we need a new file handle for the userspace program so it can read even if it was
         * originally opened O_WRONLY.
         */
-        dentry = dget(event->file->f_path.dentry);
+        dentry = dget(event->path.dentry);
-        mnt = mntget(event->file->f_path.mnt);
+        mnt = mntget(event->path.mnt);
        /* it's possible this event was an overflow event.  in that case dentry and mnt
         * are NULL;  That's fine, just don't call dentry open */
        if (dentry && mnt)
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4d2a82c1ceb1..3970392b2722 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -84,7 +84,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 /* Notify this dentry's parent about a child's events. */
-void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
+void __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 {
        struct dentry *parent;
        struct inode *p_inode;
@@ -92,7 +92,7 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
        bool should_update_children = false;
        if (!dentry)
-                dentry = file->f_path.dentry;
+                dentry = path->dentry;
        if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                return;
@@ -124,8 +124,8 @@ void __fsnotify_parent(struct file *file, struct dentry *dentry, __u32 mask)
                 * specifies these are events which came from a child. */
                mask |= FS_EVENT_ON_CHILD;
-                if (file)
+                if (path)
-                        fsnotify(p_inode, mask, file, FSNOTIFY_EVENT_FILE,
+                        fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
                                 dentry->d_name.name, 0);
                else
                        fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
@@ -217,8 +217,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
        /* global tests shouldn't care about events on child only the specific event */
        __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
-        if (data_is == FSNOTIFY_EVENT_FILE)
+        if (data_is == FSNOTIFY_EVENT_PATH)
-                mnt = ((struct file *)data)->f_path.mnt;
+                mnt = ((struct path *)data)->mnt;
        else
                mnt = NULL;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 5e73eeb2c697..a91b69a6a291 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -52,9 +52,9 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
                            !strcmp(old->file_name, new->file_name))
                                return true;
                        break;
-                case (FSNOTIFY_EVENT_FILE):
+                case (FSNOTIFY_EVENT_PATH):
-                        if ((old->file->f_path.mnt == new->file->f_path.mnt) &&
+                        if ((old->path.mnt == new->path.mnt) &&
-                            (old->file->f_path.dentry == new->file->f_path.dentry))
+                            (old->path.dentry == new->path.dentry))
                                return true;
                        break;
                case (FSNOTIFY_EVENT_NONE):
@@ -147,10 +147,10 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
                                      __u32 mask, void *data, int data_type)
 {
        if ((inode_mark->mask & FS_EXCL_UNLINK) &&
-            (data_type == FSNOTIFY_EVENT_FILE)) {
+            (data_type == FSNOTIFY_EVENT_PATH)) {
-                struct file *file  = data;
+                struct path *path = data;
-                if (d_unlinked(file->f_path.dentry))
+                if (d_unlinked(path->dentry))
                        return false;
        }
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index d6c435adc7a2..f39260f8f865 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -31,7 +31,6 @@
 * allocated and used.
 */
-#include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -90,8 +89,8 @@ void fsnotify_put_event(struct fsnotify_event *event)
        if (atomic_dec_and_test(&event->refcnt)) {
                pr_debug("%s: event=%p\n", __func__, event);
-                if (event->data_type == FSNOTIFY_EVENT_FILE)
+                if (event->data_type == FSNOTIFY_EVENT_PATH)
-                        fput(event->file);
+                        path_put(&event->path);
                BUG_ON(!list_empty(&event->private_data_list));
@@ -376,8 +375,8 @@ struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
                }
        }
        event->tgid = get_pid(old_event->tgid);
-        if (event->data_type == FSNOTIFY_EVENT_FILE)
+        if (event->data_type == FSNOTIFY_EVENT_PATH)
-                get_file(event->file);
+                path_get(&event->path);
        return event;
 }
@@ -424,22 +423,11 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
        event->data_type = data_type;
        switch (data_type) {
-        case FSNOTIFY_EVENT_FILE: {
+        case FSNOTIFY_EVENT_PATH: {
-                event->file = data;
+                struct path *path = data;
-                /*
+                event->path.dentry = path->dentry;
-                 * if this file is about to disappear hold an extra reference
+                event->path.mnt = path->mnt;
-                 * until we return to __fput so we don't have to worry about
+                path_get(&event->path);
-                 * future get/put destroying the file under us or generating
-                 * additional events.  Notice that we change f_mode without
-                 * holding f_lock.  This is safe since this is the only possible
-                 * reference to this object in the kernel (it was about to be
-                 * freed, remember?)
-                 */
-                if (!atomic_long_read(&event->file->f_count)) {
-                        event->file->f_mode |= FMODE_NONOTIFY;
-                        get_file(event->file);
-                }
-                get_file(event->file);
                break;
        }
        case FSNOTIFY_EVENT_INODE:
@@ -447,7 +435,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
                break;
        case FSNOTIFY_EVENT_NONE:
                event->inode = NULL;
-                event->file = NULL;
+                event->path.dentry = NULL;
+                event->path.mnt = NULL;
                break;
        default:
                BUG();
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index da702294d7e7..a76e0aa5cd3f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -290,12 +290,30 @@ static int ocfs2_set_acl(handle_t *handle,
 int ocfs2_check_acl(struct inode *inode, int mask)
 {
-        struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+        struct buffer_head *di_bh = NULL;
+        struct posix_acl *acl;
+        int ret = -EAGAIN;
-        if (IS_ERR(acl))
+        if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+                return ret;
+        ret = ocfs2_read_inode_block(inode, &di_bh);
+        if (ret < 0) {
+                mlog_errno(ret);
+                return ret;
+        }
+        acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, di_bh);
+        brelse(di_bh);
+        if (IS_ERR(acl)) {
+                mlog_errno(PTR_ERR(acl));
                return PTR_ERR(acl);
+        }
        if (acl) {
-                int ret = posix_acl_permission(inode, acl, mask);
+                ret = posix_acl_permission(inode, acl, mask);
                posix_acl_release(acl);
                return ret;
        }
@@ -344,7 +362,7 @@ int ocfs2_init_acl(handle_t *handle,
 {
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        struct posix_acl *acl = NULL;
-        int ret = 0;
+        int ret = 0, ret2;
        mode_t mode;
        if (!S_ISLNK(inode->i_mode)) {
@@ -381,7 +399,12 @@ int ocfs2_init_acl(handle_t *handle,
                mode = inode->i_mode;
                ret = posix_acl_create_masq(clone, &mode);
                if (ret >= 0) {
-                        ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+                        ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
+                        if (ret2) {
+                                mlog_errno(ret2);
+                                ret = ret2;
+                                goto cleanup;
+                        }
                        if (ret > 0) {
                                ret = ocfs2_set_acl(handle, inode,
                                                    di_bh, ACL_TYPE_ACCESS,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index aa75ca3f78da..1361997cf205 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1759,6 +1759,7 @@ static int o2net_accept_one(struct socket *sock)
        struct sockaddr_in sin;
        struct socket *new_sock = NULL;
        struct o2nm_node *node = NULL;
+        struct o2nm_node *local_node = NULL;
        struct o2net_sock_container *sc = NULL;
        struct o2net_node *nn;
@@ -1796,11 +1797,18 @@ static int o2net_accept_one(struct socket *sock)
                goto out;
        }
-        if (o2nm_this_node() > node->nd_num) {
-                mlog(ML_NOTICE, "unexpected connect attempted from a lower "
-                     "numbered node '%s' at " "%pI4:%d with num %u\n",
-                     node->nd_name, &sin.sin_addr.s_addr,
-                     ntohs(sin.sin_port), node->nd_num);
+        if (o2nm_this_node() >= node->nd_num) {
+                local_node = o2nm_get_node_by_num(o2nm_this_node());
+                /* guard against a failed lookup of the local node */
+                if (local_node)
+                        mlog(ML_NOTICE, "unexpected connect attempt seen at "
+                             "node '%s' (%u, %pI4:%d) from node '%s' "
+                             "(%u, %pI4:%d)\n",
+                             local_node->nd_name, local_node->nd_num,
+                             &(local_node->nd_ipv4_address),
+                             ntohs(local_node->nd_ipv4_port),
+                             node->nd_name, node->nd_num,
+                             &sin.sin_addr.s_addr, ntohs(sin.sin_port));
                ret = -EINVAL;
                goto out;
        }
@@ -1857,6 +1865,8 @@ out:
                sock_release(new_sock);
        if (node)
                o2nm_node_put(node);
+        if (local_node)
+                o2nm_node_put(local_node);
        if (sc)
                sc_put(sc);
        return ret;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 94b97fc6a88e..ffb4c68dafa4 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -511,8 +511,6 @@ static void dlm_lockres_release(struct kref *kref)
        atomic_dec(&dlm->res_cur_count);
-        dlm_put(dlm);
        if (!hlist_unhashed(&res->hash_node) ||
            !list_empty(&res->granted) ||
            !list_empty(&res->converting) ||
@@ -585,8 +583,6 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
        res->migration_pending = 0;
        res->inflight_locks = 0;
-        /* put in dlm_lockres_release */
-        dlm_grab(dlm);
        res->dlm = dlm;
        kref_init(&res->refs);
@@ -3050,8 +3046,6 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
        /* check for pre-existing lock */
        spin_lock(&dlm->spinlock);
        res = __dlm_lookup_lockres(dlm, name, namelen, hash);
-        spin_lock(&dlm->master_lock);
        if (res) {
                spin_lock(&res->spinlock);
                if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -3069,14 +3063,15 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
                spin_unlock(&res->spinlock);
        }
+        spin_lock(&dlm->master_lock);
        /* ignore status.  only nonzero status would BUG. */
        ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
                                    name, namelen,
                                    migrate->new_master,
                                    migrate->master);
-unlock:
        spin_unlock(&dlm->master_lock);
+unlock:
        spin_unlock(&dlm->spinlock);
        if (oldmle) {
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 9dfaac73b36d..aaaffbcbe916 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
        struct list_head *queue;
        struct dlm_lock *lock, *next;
+        assert_spin_locked(&dlm->spinlock);
+        assert_spin_locked(&res->spinlock);
        res->state |= DLM_LOCK_RES_RECOVERING;
        if (!list_empty(&res->recovering)) {
                mlog(0,
@@ -2326,19 +2328,15 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                        /* zero the lvb if necessary */
                        dlm_revalidate_lvb(dlm, res, dead_node);
                        if (res->owner == dead_node) {
-                                if (res->state & DLM_LOCK_RES_DROPPING_REF)
+                                if (res->state & DLM_LOCK_RES_DROPPING_REF) {
-                                        mlog(0, "%s:%.*s: owned by "
+                                        mlog(ML_NOTICE, "Ignore %.*s for "
-                                             "dead node %u, this node was "
+                                             "recovery as it is being freed\n",
-                                             "dropping its ref when it died. "
+                                             res->lockname.len,
-                                             "continue, dropping the flag.\n",
+                                             res->lockname.name);
-                                             dlm->name, res->lockname.len,
+                                } else
-                                             res->lockname.name, dead_node);
+                                        dlm_move_lockres_to_recovery_list(dlm,
+                                                                          res);
-                                /* the wake_up for this will happen when the
-                                 * RECOVERING flag is dropped later */
-                                res->state &= ~DLM_LOCK_RES_DROPPING_REF;
-                                dlm_move_lockres_to_recovery_list(dlm, res);
                        } else if (res->owner == dlm->node_num) {
                                dlm_free_dead_locks(dlm, res, dead_node);
                                __dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca68fe5..2211acf33d9b 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,19 +92,27 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
 * truly ready to be freed. */
 int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
-        if (!__dlm_lockres_has_locks(res) &&
+        int bit;
-            (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
-                /* try not to scan the bitmap unless the first two
+        if (__dlm_lockres_has_locks(res))
-                 * conditions are already true */
+                return 0;
-                int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-                if (bit >= O2NM_MAX_NODES) {
+        if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
-                        /* since the bit for dlm->node_num is not
+                return 0;
-                         * set, inflight_locks better be zero */
-                        BUG_ON(res->inflight_locks != 0);
+        if (res->state & DLM_LOCK_RES_RECOVERING)
-                        return 1;
+                return 0;
-                }
-        }
+        bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-        return 0;
+        if (bit < O2NM_MAX_NODES)
+                return 0;
+        /*
+         * since the bit for dlm->node_num is not set, inflight_locks better
+         * be zero
+         */
+        BUG_ON(res->inflight_locks != 0);
+        return 1;
 }
@@ -152,45 +160,25 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
        spin_unlock(&dlm->spinlock);
 }
-static int dlm_purge_lockres(struct dlm_ctxt *dlm,
+static void dlm_purge_lockres(struct dlm_ctxt *dlm,
                             struct dlm_lock_resource *res)
 {
        int master;
        int ret = 0;
-        spin_lock(&res->spinlock);
+        assert_spin_locked(&dlm->spinlock);
-        if (!__dlm_lockres_unused(res)) {
+        assert_spin_locked(&res->spinlock);
-                mlog(0, "%s:%.*s: tried to purge but not unused\n",
-                     dlm->name, res->lockname.len, res->lockname.name);
-                __dlm_print_one_lock_resource(res);
-                spin_unlock(&res->spinlock);
-                BUG();
-        }
-        if (res->state & DLM_LOCK_RES_MIGRATING) {
-                mlog(0, "%s:%.*s: Delay dropref as this lockres is "
-                     "being remastered\n", dlm->name, res->lockname.len,
-                     res->lockname.name);
-                /* Re-add the lockres to the end of the purge list */
-                if (!list_empty(&res->purge)) {
-                        list_del_init(&res->purge);
-                        list_add_tail(&res->purge, &dlm->purge_list);
-                }
-                spin_unlock(&res->spinlock);
-                return 0;
-        }
        master = (res->owner == dlm->node_num);
-        if (!master)
-                res->state |= DLM_LOCK_RES_DROPPING_REF;
-        spin_unlock(&res->spinlock);
        mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
             res->lockname.name, master);
        if (!master) {
+                res->state |= DLM_LOCK_RES_DROPPING_REF;
                /* drop spinlock...  retake below */
+                spin_unlock(&res->spinlock);
                spin_unlock(&dlm->spinlock);
                spin_lock(&res->spinlock);
@@ -208,31 +196,35 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
                mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
                     dlm->name, res->lockname.len, res->lockname.name, ret);
                spin_lock(&dlm->spinlock);
+                spin_lock(&res->spinlock);
        }
-        spin_lock(&res->spinlock);
        if (!list_empty(&res->purge)) {
                mlog(0, "removing lockres %.*s:%p from purgelist, "
                     "master = %d\n", res->lockname.len, res->lockname.name,
                     res, master);
                list_del_init(&res->purge);
-                spin_unlock(&res->spinlock);
                dlm_lockres_put(res);
                dlm->purge_count--;
-        } else
+        }
-                spin_unlock(&res->spinlock);
+        if (!__dlm_lockres_unused(res)) {
+                mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+                     dlm->name, res->lockname.len, res->lockname.name);
+                __dlm_print_one_lock_resource(res);
+                BUG();
+        }
        __dlm_unhash_lockres(res);
        /* lockres is not in the hash now.  drop the flag and wake up
         * any processes waiting in dlm_get_lock_resource. */
        if (!master) {
-                spin_lock(&res->spinlock);
                res->state &= ~DLM_LOCK_RES_DROPPING_REF;
                spin_unlock(&res->spinlock);
                wake_up(&res->wq);
-        }
+        } else
-        return 0;
+                spin_unlock(&res->spinlock);
 }
 static void dlm_run_purge_list(struct dlm_ctxt *dlm,
@@ -251,17 +243,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                lockres = list_entry(dlm->purge_list.next,
                                     struct dlm_lock_resource, purge);
-                /* Status of the lockres *might* change so double
-                 * check. If the lockres is unused, holding the dlm
-                 * spinlock will prevent people from getting and more
-                 * refs on it -- there's no need to keep the lockres
-                 * spinlock. */
                spin_lock(&lockres->spinlock);
-                unused = __dlm_lockres_unused(lockres);
-                spin_unlock(&lockres->spinlock);
-                if (!unused)
-                        continue;
                purge_jiffies = lockres->last_used +
                        msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +255,30 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
                         * in tail order, we can stop at the first
                         * unpurgable resource -- anyone added after
                         * him will have a greater last_used value */
+                        spin_unlock(&lockres->spinlock);
                        break;
                }
+                /* Status of the lockres *might* change so double
+                 * check. If the lockres is unused, holding the dlm
+                 * spinlock will prevent people from getting any more
+                 * refs on it. */
+                unused = __dlm_lockres_unused(lockres);
+                if (!unused ||
+                    (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+                        mlog(0, "lockres %s:%.*s: is in use or "
+                             "being remastered, used %d, state %d\n",
+                             dlm->name, lockres->lockname.len,
+                             lockres->lockname.name, !unused, lockres->state);
+                        /* requeue the lockres at the purge list tail */
+                        list_move_tail(&lockres->purge, &dlm->purge_list);
+                        spin_unlock(&lockres->spinlock);
+                        continue;
+                }
                dlm_lockres_get(lockres);
-                /* This may drop and reacquire the dlm spinlock if it
-                 * has to do migration. */
-                if (dlm_purge_lockres(dlm, lockres))
-                        BUG();
+                dlm_purge_lockres(dlm, lockres);
                dlm_lockres_put(lockres);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3ac5aa733e9c..73a11ccfd4c2 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2436,16 +2436,26 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
                len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
                          le32_to_cpu(rec.r_clusters)) - cpos;
                /*
-                 * If the refcount rec already exist, cool. We just need
-                 * to check whether there is a split. Otherwise we just need
-                 * to increase the refcount.
-                 * If we will insert one, increases recs_add.
-                 *
                 * We record all the records which will be inserted to the
                 * same refcount block, so that we can tell exactly whether
                 * we need a new refcount block or not.
+                 *
+                 * If we will insert a new one, this is easy and only happens
+                 * during adding refcounted flag to the extent, so we don't
+                 * have a chance of splitting. We just need one record.
+                 *
+                 * If the refcount rec already exists, that would be a little
+                 * complicated. We may have to:
+                 * 1) split at the beginning if the start pos isn't aligned.
+                 *    we need 1 more record in this case.
+                 * 2) split at the end if the end pos isn't aligned.
+                 *    we need 1 more record in this case.
+                 * 3) split in the middle because of file system fragmentation.
+                 *    we need 2 more records in this case(we can't detect this
+                 *    beforehand, so always think of the worst case).
                 */
                if (rec.r_refcount) {
+                        recs_add += 2;
                        /* Check whether we need a split at the beginning. */
                        if (cpos == start_cpos &&
                            cpos != le64_to_cpu(rec.r_cpos))
diff --git a/fs/open.c b/fs/open.c
index b715d06fbe36..630715f9f73d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1031,7 +1031,9 @@ EXPORT_SYMBOL(generic_file_open);
 /*
 * This is used by subsystems that don't want seekable
- * file descriptors
+ * file descriptors. The function is not supposed to ever fail, the only
+ * reason it returns an 'int' and not 'void' is so that it can be plugged
+ * directly into file_operations structure.
 */
 int nonseekable_open(struct inode *inode, struct file *filp)
 {
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index 6921e7890be6..fbeb697374d5 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -45,8 +45,11 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
        nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) |
                   (le32_to_cpu(dr->disc_size) >> 9);
-        if (name)
+        if (name) {
-                printk(" [%s]", name);
+                strlcat(state->pp_buf, " [", PAGE_SIZE);
+                strlcat(state->pp_buf, name, PAGE_SIZE);
+                strlcat(state->pp_buf, "]", PAGE_SIZE);
+        }
        put_partition(state, slot, first_sector, nr_sects);
        return dr;
 }
@@ -81,14 +84,14 @@ static int riscix_partition(struct parsed_partitions *state,
        if (!rr)
                return -1;
-        printk(" [RISCiX]");
+        strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
        if (rr->magic == RISCIX_MAGIC) {
                unsigned long size = nr_sects > 2 ? 2 : nr_sects;
                int part;
-                printk(" <");
+                strlcat(state->pp_buf, " <", PAGE_SIZE);
                put_partition(state, slot++, first_sect, size);
                for (part = 0; part < 8; part++) {
@@ -97,11 +100,13 @@ static int riscix_partition(struct parsed_partitions *state,
                                put_partition(state, slot++,
                                        le32_to_cpu(rr->part[part].start),
                                        le32_to_cpu(rr->part[part].length));
-                                printk("(%s)", rr->part[part].name);
+                                strlcat(state->pp_buf, "(", PAGE_SIZE);
+                                strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
+                                strlcat(state->pp_buf, ")", PAGE_SIZE);
                        }
                }
-                printk(" >\n");
+                strlcat(state->pp_buf, " >\n", PAGE_SIZE);
        } else {
                put_partition(state, slot++, first_sect, nr_sects);
        }
@@ -131,7 +136,7 @@ static int linux_partition(struct parsed_partitions *state,
        struct linux_part *linuxp;
        unsigned long size = nr_sects > 2 ? 2 : nr_sects;
-        printk(" [Linux]");
+        strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
        put_partition(state, slot++, first_sect, size);
@@ -139,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state,
        if (!linuxp)
                return -1;
-        printk(" <");
+        strlcat(state->pp_buf, " <", PAGE_SIZE);
        while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
               linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
                if (slot == state->limit)
@@ -149,7 +154,7 @@ static int linux_partition(struct parsed_partitions *state,
                                 le32_to_cpu(linuxp->nr_sects));
                linuxp ++;
        }
-        printk(" >");
+        strlcat(state->pp_buf, " >", PAGE_SIZE);
        put_dev_sector(sect);
        return slot;
@@ -294,7 +299,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
                        break;
                }
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        return 1;
 }
 #endif
@@ -367,7 +372,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
                return 0;
        }
-        printk(" [ICS]");
+        strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
        for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
                u32 start = le32_to_cpu(p->start);
@@ -401,7 +406,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
        }
        put_dev_sector(sect);
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        return 1;
 }
 #endif
@@ -461,7 +466,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
                return 0;
        }
-        printk(" [POWERTEC]");
+        strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
        for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
                u32 start = le32_to_cpu(p->start);
@@ -472,7 +477,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
        }
        put_dev_sector(sect);
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        return 1;
 }
 #endif
@@ -543,7 +548,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
                size = get_capacity(state->bdev->bd_disk);
                put_partition(state, slot++, start, size - start);
-                printk("\n");
+                strlcat(state->pp_buf, "\n", PAGE_SIZE);
        }
        return i ? 1 : 0;
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index ba443d4229f8..70cbf44a1560 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -69,7 +69,13 @@ int amiga_partition(struct parsed_partitions *state)
        /* blksize is blocks per 512 byte standard block */
        blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
-        printk(" RDSK (%d)", blksize * 512);    /* Be more informative */
+        {
+                char tmp[7 + 10 + 1 + 1];
+                /* Be more informative */
+                snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
+                strlcat(state->pp_buf, tmp, PAGE_SIZE);
+        }
        blk = be32_to_cpu(rdb->rdb_PartitionList);
        put_dev_sector(sect);
        for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
@@ -106,23 +112,27 @@ int amiga_partition(struct parsed_partitions *state)
                {
                        /* Be even more informative to aid mounting */
                        char dostype[4];
+                        char tmp[42];
                        __be32 *dt = (__be32 *)dostype;
                        *dt = pb->pb_Environment[16];
                        if (dostype[3] < ' ')
-                                printk(" (%c%c%c^%c)",
+                                snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
                                        dostype[0], dostype[1],
                                        dostype[2], dostype[3] + '@' );
                        else
-                                printk(" (%c%c%c%c)",
+                                snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
                                        dostype[0], dostype[1],
                                        dostype[2], dostype[3]);
-                        printk("(res %d spb %d)",
+                        strlcat(state->pp_buf, tmp, PAGE_SIZE);
+                        snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
                                be32_to_cpu(pb->pb_Environment[6]),
                                be32_to_cpu(pb->pb_Environment[4]));
+                        strlcat(state->pp_buf, tmp, PAGE_SIZE);
                }
                res = 1;
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
 rdb_done:
        return res;
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 4439ff1b6cec..9875b05e80a2 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -62,7 +62,7 @@ int atari_partition(struct parsed_partitions *state)
        }
        pi = &rs->part[0];
-        printk (" AHDI");
+        strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
        for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
                struct rootsector *xrs;
                Sector sect2;
@@ -81,7 +81,7 @@ int atari_partition(struct parsed_partitions *state)
 #ifdef ICD_PARTS
                part_fmt = 1;
 #endif
-                printk(" XGM<");
+                strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
                partsect = extensect = be32_to_cpu(pi->st);
                while (1) {
                        xrs = read_part_sector(state, partsect, &sect2);
@@ -120,14 +120,14 @@ int atari_partition(struct parsed_partitions *state)
                                break;
                        }
                }
-                printk(" >");
+                strlcat(state->pp_buf, " >", PAGE_SIZE);
        }
 #ifdef ICD_PARTS
        if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
                pi = &rs->icdpart[0];
                /* sanity check: no ICD format if first partition invalid */
                if (OK_id(pi->id)) {
-                        printk(" ICD<");
+                        strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
                        for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
                                /* accept only GEM,BGM,RAW,LNX,SWP partitions */
                                if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -137,13 +137,13 @@ int atari_partition(struct parsed_partitions *state)
                                                be32_to_cpu(pi->st),
                                                be32_to_cpu(pi->siz));
                        }
-                        printk(" >");
+                        strlcat(state->pp_buf, " >", PAGE_SIZE);
                }
        }
 #endif
        put_dev_sector(sect);
-        printk ("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        return 1;
 }
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 72c52656dc2e..79fbf3f390f0 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -164,10 +164,16 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
        state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
        if (!state)
                return NULL;
+        state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
+        if (!state->pp_buf) {
+                kfree(state);
+                return NULL;
+        }
+        state->pp_buf[0] = '\0';
        state->bdev = bdev;
        disk_name(hd, 0, state->name);
-        printk(KERN_INFO " %s:", state->name);
+        snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
        if (isdigit(state->name[strlen(state->name)-1]))
                sprintf(state->name, "p");
@@ -185,17 +191,25 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
                }
        }
-        if (res > 0)
+        if (res > 0) {
+                printk(KERN_INFO "%s", state->pp_buf);
+                free_page((unsigned long)state->pp_buf);
                return state;
+        }
        if (state->access_beyond_eod)
                err = -ENOSPC;
        if (err)
        /* The partition is unrecognized. So report I/O errors if there were any */
                res = err;
        if (!res)
-                printk(" unknown partition table\n");
+                strlcat(state->pp_buf, " unknown partition table\n", PAGE_SIZE);
        else if (warn_no_part)
-                printk(" unable to read partition table\n");
+                strlcat(state->pp_buf, " unable to read partition table\n", PAGE_SIZE);
+        printk(KERN_INFO "%s", state->pp_buf);
+        free_page((unsigned long)state->pp_buf);
        kfree(state);
        return ERR_PTR(res);
 }
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 52f8bd399396..8e4e103ba216 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -16,6 +16,7 @@ struct parsed_partitions {
        int next;
        int limit;
        bool access_beyond_eod;
+        char *pp_buf;
 };
 static inline void *read_part_sector(struct parsed_partitions *state,
@@ -32,9 +33,12 @@ static inline void
 put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
 {
        if (n < p->limit) {
+                char tmp[1 + BDEVNAME_SIZE + 10 + 1];
                p->parts[n].from = from;
                p->parts[n].size = size;
-                printk(" %s%d", p->name, n);
+                snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
+                strlcat(p->pp_buf, tmp, PAGE_SIZE);
        }
 }
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 9efb2cfe2410..dbb44d4bb8a7 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -630,6 +630,6 @@ int efi_partition(struct parsed_partitions *state)
        }
        kfree(ptes);
        kfree(gpt);
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        return 1;
 }
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc8497643fd0..d513a07f44bb 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -75,6 +75,7 @@ int ibm_partition(struct parsed_partitions *state)
        unsigned char *data;
        Sector sect;
        sector_t labelsect;
+        char tmp[64];
        res = 0;
        blocksize = bdev_logical_block_size(bdev);
@@ -144,13 +145,15 @@ int ibm_partition(struct parsed_partitions *state)
                         */
                        blocksize = label->cms.block_size;
                        if (label->cms.disk_offset != 0) {
-                                printk("CMS1/%8s(MDSK):", name);
+                                snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
+                                strlcat(state->pp_buf, tmp, PAGE_SIZE);
                                /* disk is reserved minidisk */
                                offset = label->cms.disk_offset;
                                size = (label->cms.block_count - 1)
                                        * (blocksize >> 9);
                        } else {
-                                printk("CMS1/%8s:", name);
+                                snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
+                                strlcat(state->pp_buf, tmp, PAGE_SIZE);
                                offset = (info->label_block + 1);
                                size = label->cms.block_count
                                        * (blocksize >> 9);
@@ -159,7 +162,8 @@ int ibm_partition(struct parsed_partitions *state)
                                      size-offset*(blocksize >> 9));
                } else {
                        if (strncmp(type, "LNX1", 4) == 0) {
-                                printk("LNX1/%8s:", name);
+                                snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
+                                strlcat(state->pp_buf, tmp, PAGE_SIZE);
                                if (label->lnx.ldl_version == 0xf2) {
                                        fmt_size = label->lnx.formatted_blocks
                                                * (blocksize >> 9);
@@ -178,7 +182,7 @@ int ibm_partition(struct parsed_partitions *state)
                                offset = (info->label_block + 1);
                        } else {
                                /* unlabeled disk */
-                                printk("(nonl)");
+                                strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
                                size = i_size >> 9;
                                offset = (info->label_block + 1);
                        }
@@ -197,7 +201,8 @@ int ibm_partition(struct parsed_partitions *state)
                 * if not, something is wrong, skipping partition detection
                 */
                if (strncmp(type, "VOL1",  4) == 0) {
-                        printk("VOL1/%8s:", name);
+                        snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
+                        strlcat(state->pp_buf, tmp, PAGE_SIZE);
                        /*
                         * get block number and read then go through format1
                         * labels
@@ -253,7 +258,7 @@ int ibm_partition(struct parsed_partitions *state)
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        goto out_freeall;
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 1cc928bb762f..0ea19312706b 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -50,7 +50,7 @@ int karma_partition(struct parsed_partitions *state)
                }
                slot++;
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        put_dev_sector(sect);
        return 1;
 }
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 648c9d8f3357..5bf8a04b5d9b 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -643,7 +643,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
                return false;
        }
-        printk (" [LDM]");
+        strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
        /* Create the data partitions */
        list_for_each (item, &ldb->v_part) {
@@ -658,7 +658,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
                part_num++;
        }
-        printk ("\n");
+        strlcat(pp->pp_buf, "\n", PAGE_SIZE);
        return true;
 }
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 74465ff7c263..68d6a216ee79 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -59,7 +59,7 @@ int mac_partition(struct parsed_partitions *state)
                put_dev_sector(sect);
                return 0;               /* not a MacOS disk */
        }
-        printk(" [mac]");
+        strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
        blocks_in_map = be32_to_cpu(part->map_count);
        for (blk = 1; blk <= blocks_in_map; ++blk) {
                int pos = blk * secsize;
@@ -128,6 +128,6 @@ int mac_partition(struct parsed_partitions *state)
 #endif
        put_dev_sector(sect);
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        return 1;
 }
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 15bfb7b1e044..5f79a6677c69 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -213,10 +213,18 @@ static void parse_solaris_x86(struct parsed_partitions *state,
                put_dev_sector(sect);
                return;
        }
-        printk(" %s%d: <solaris:", state->name, origin);
+        {
+                char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
+                snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
+                strlcat(state->pp_buf, tmp, PAGE_SIZE);
+        }
        if (le32_to_cpu(v->v_version) != 1) {
-                printk("  cannot handle version %d vtoc>\n",
+                char tmp[64];
-                        le32_to_cpu(v->v_version));
+                snprintf(tmp, sizeof(tmp), "  cannot handle version %d vtoc>\n",
+                         le32_to_cpu(v->v_version));
+                strlcat(state->pp_buf, tmp, PAGE_SIZE);
                put_dev_sector(sect);
                return;
        }
@@ -224,9 +232,12 @@ static void parse_solaris_x86(struct parsed_partitions *state,
        max_nparts = le16_to_cpu (v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
        for (i=0; i<max_nparts && state->next<state->limit; i++) {
                struct solaris_x86_slice *s = &v->v_slice[i];
+                char tmp[3 + 10 + 1 + 1];
                if (s->s_size == 0)
                        continue;
-                printk(" [s%d]", i);
+                snprintf(tmp, sizeof(tmp), " [s%d]", i);
+                strlcat(state->pp_buf, tmp, PAGE_SIZE);
                /* solaris partitions are relative to current MS-DOS
                 * one; must add the offset of the current partition */
                put_partition(state, state->next++,
@@ -234,7 +245,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
                                 le32_to_cpu(s->s_size));
        }
        put_dev_sector(sect);
-        printk(" >\n");
+        strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 #endif
 }
@@ -250,6 +261,7 @@ static void parse_bsd(struct parsed_partitions *state,
        Sector sect;
        struct bsd_disklabel *l;
        struct bsd_partition *p;
+        char tmp[64];
        l = read_part_sector(state, offset + 1, &sect);
        if (!l)
@@ -258,7 +270,9 @@ static void parse_bsd(struct parsed_partitions *state,
                put_dev_sector(sect);
                return;
        }
-        printk(" %s%d: <%s:", state->name, origin, flavour);
+        snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
+        strlcat(state->pp_buf, tmp, PAGE_SIZE);
        if (le16_to_cpu(l->d_npartitions) < max_partitions)
                max_partitions = le16_to_cpu(l->d_npartitions);
@@ -275,16 +289,18 @@ static void parse_bsd(struct parsed_partitions *state,
                        /* full parent partition, we have it already */
                        continue;
                if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
-                        printk("bad subpartition - ignored\n");
+                        strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
                        continue;
                }
                put_partition(state, state->next++, bsd_start, bsd_size);
        }
        put_dev_sector(sect);
-        if (le16_to_cpu(l->d_npartitions) > max_partitions)
+        if (le16_to_cpu(l->d_npartitions) > max_partitions) {
-                printk(" (ignored %d more)",
+                snprintf(tmp, sizeof(tmp), " (ignored %d more)",
-                       le16_to_cpu(l->d_npartitions) - max_partitions);
+                         le16_to_cpu(l->d_npartitions) - max_partitions);
-        printk(" >\n");
+                strlcat(state->pp_buf, tmp, PAGE_SIZE);
+        }
+        strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 }
 #endif
@@ -333,7 +349,12 @@ static void parse_unixware(struct parsed_partitions *state,
                put_dev_sector(sect);
                return;
        }
-        printk(" %s%d: <unixware:", state->name, origin);
+        {
+                char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
+                snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
+                strlcat(state->pp_buf, tmp, PAGE_SIZE);
+        }
        p = &l->vtoc.v_slice[1];
        /* I omit the 0th slice as it is the same as whole disk. */
        while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -347,7 +368,7 @@ static void parse_unixware(struct parsed_partitions *state,
                p++;
        }
        put_dev_sector(sect);
-        printk(" >\n");
+        strlcat(state->pp_buf, " >\n", PAGE_SIZE);
 #endif
 }
@@ -376,8 +397,10 @@ static void parse_minix(struct parsed_partitions *state,
         * the normal boot sector. */
        if (msdos_magic_present (data + 510) &&
            SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
+                char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
-                printk(" %s%d: <minix:", state->name, origin);
+                snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
+                strlcat(state->pp_buf, tmp, PAGE_SIZE);
                for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
                        if (state->next == state->limit)
                                break;
@@ -386,7 +409,7 @@ static void parse_minix(struct parsed_partitions *state,
                                put_partition(state, state->next++,
                                              start_sect(p), nr_sects(p));
                }
-                printk(" >\n");
+                strlcat(state->pp_buf, " >\n", PAGE_SIZE);
        }
        put_dev_sector(sect);
 #endif /* CONFIG_MINIX_SUBPARTITION */
@@ -425,7 +448,7 @@ int msdos_partition(struct parsed_partitions *state)
        if (aix_magic_present(state, data)) {
                put_dev_sector(sect);
-                printk( " [AIX]");
+                strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
                return 0;
        }
@@ -446,7 +469,7 @@ int msdos_partition(struct parsed_partitions *state)
                        fb = (struct fat_boot_sector *) data;
                        if (slot == 1 && fb->reserved && fb->fats
                                && fat_valid_media(fb->media)) {
-                                printk("\n");
+                                strlcat(state->pp_buf, "\n", PAGE_SIZE);
                                put_dev_sector(sect);
                                return 1;
                        } else {
@@ -491,21 +514,21 @@ int msdos_partition(struct parsed_partitions *state)
                        n = min(size, max(sector_size, n));
                        put_partition(state, slot, start, n);
-                        printk(" <");
+                        strlcat(state->pp_buf, " <", PAGE_SIZE);
                        parse_extended(state, start, size);
-                        printk(" >");
+                        strlcat(state->pp_buf, " >", PAGE_SIZE);
                        continue;
                }
                put_partition(state, slot, start, size);
                if (SYS_IND(p) == LINUX_RAID_PARTITION)
                        state->parts[slot].flags = ADDPART_FLAG_RAID;
                if (SYS_IND(p) == DM6_PARTITION)
-                        printk("[DM]");
+                        strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
                if (SYS_IND(p) == EZD_PARTITION)
-                        printk("[EZD]");
+                        strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        /* second pass - output for each on a separate line */
        p = (struct partition *) (0x1be + data);
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index fc22b85d436a..48cec7cbca17 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -72,7 +72,7 @@ int osf_partition(struct parsed_partitions *state)
                                le32_to_cpu(partition->p_size));
                slot++;
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        put_dev_sector(sect);
        return 1;
 }
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index 43b1df9aa16c..ea8a86dceaf4 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -76,7 +76,7 @@ int sgi_partition(struct parsed_partitions *state)
                }
                slot++;
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        put_dev_sector(sect);
        return 1;
 }
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index a32660e25f7f..b5b6fcfb3d36 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -116,7 +116,7 @@ int sun_partition(struct parsed_partitions *state)
                }
                slot++;
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        put_dev_sector(sect);
        return 1;
 }
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 9030c864428e..9627ccffc1c4 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -54,6 +54,7 @@ int sysv68_partition(struct parsed_partitions *state)
        unsigned char *data;
        struct dkblk0 *b;
        struct slice *slice;
+        char tmp[64];
        data = read_part_sector(state, 0, &sect);
        if (!data)
@@ -73,7 +74,8 @@ int sysv68_partition(struct parsed_partitions *state)
                return -1;
        slices -= 1; /* last slice is the whole disk */
-        printk("sysV68: %s(s%u)", state->name, slices);
+        snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
+        strlcat(state->pp_buf, tmp, PAGE_SIZE);
        slice = (struct slice *)data;
        for (i = 0; i < slices; i++, slice++) {
                if (slot == state->limit)
@@ -82,11 +84,12 @@ int sysv68_partition(struct parsed_partitions *state)
                        put_partition(state, slot,
                                be32_to_cpu(slice->blkoff),
                                be32_to_cpu(slice->nblocks));
-                        printk("(s%u)", i);
+                        snprintf(tmp, sizeof(tmp), "(s%u)", i);
+                        strlcat(state->pp_buf, tmp, PAGE_SIZE);
                }
                slot++;
        }
-        printk("\n");
+        strlcat(state->pp_buf, "\n", PAGE_SIZE);
        put_dev_sector(sect);
        return 1;
 }
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index db9eef260364..8dbaf9f77a99 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state)
                                              label->pt_part[i].pi_blkoff,
                                              label->pt_part[i].pi_nblocks);
                put_dev_sector(sect);
-                printk ("\n");
+                strlcat(state->pp_buf, "\n", PAGE_SIZE);
                return 1;
        } else {
                put_dev_sector(sect);
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 11a7b5c68153..2758e2afc518 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -2,7 +2,7 @@
 # Makefile for the Linux proc filesystem routines.
 #
-obj-$(CONFIG_PROC_FS) += proc.o
+obj-y   += proc.o
 proc-y                  := nommu.o task_nommu.o
 proc-$(CONFIG_MMU)      := mmu.o task_mmu.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c806dfb24e08..a1c43e7c8a7b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -149,18 +149,13 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
        return count;
 }
-static int get_fs_path(struct task_struct *task, struct path *path, bool root)
+static int get_task_root(struct task_struct *task, struct path *root)
 {
-        struct fs_struct *fs;
        int result = -ENOENT;
        task_lock(task);
-        fs = task->fs;
+        if (task->fs) {
-        if (fs) {
+                get_fs_root(task->fs, root);
-                read_lock(&fs->lock);
-                *path = root ? fs->root : fs->pwd;
-                path_get(path);
-                read_unlock(&fs->lock);
                result = 0;
        }
        task_unlock(task);
@@ -173,7 +168,12 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
        int result = -ENOENT;
        if (task) {
-                result = get_fs_path(task, path, 0);
+                task_lock(task);
+                if (task->fs) {
+                        get_fs_pwd(task->fs, path);
+                        result = 0;
+                }
+                task_unlock(task);
                put_task_struct(task);
        }
        return result;
@@ -185,7 +185,7 @@ static int proc_root_link(struct inode *inode, struct path *path)
        int result = -ENOENT;
        if (task) {
-                result = get_fs_path(task, path, 1);
+                result = get_task_root(task, path);
                put_task_struct(task);
        }
        return result;
@@ -597,7 +597,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
                                get_mnt_ns(ns);
                }
                rcu_read_unlock();
-                if (ns && get_fs_path(task, &root, 1) == 0)
+                if (ns && get_task_root(task, &root) == 0)
                        ret = 0;
                put_task_struct(task);
        }
@@ -1526,7 +1526,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
        if (!tmp)
                return -ENOMEM;
-        pathname = d_path(path, tmp, PAGE_SIZE);
+        pathname = d_path_with_unreachable(path, tmp, PAGE_SIZE);
        len = PTR_ERR(pathname);
        if (IS_ERR(pathname))
                goto out;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 19fbc810e8e7..1ec952b1f036 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -983,7 +983,6 @@ static int flush_older_commits(struct super_block *s,
 static int reiserfs_async_progress_wait(struct super_block *s)
 {
-        DEFINE_WAIT(wait);
        struct reiserfs_journal *j = SB_JOURNAL(s);
        if (atomic_read(&j->j_async_throttle)) {
diff --git a/fs/signalfd.c b/fs/signalfd.c
index f329849ce3c0..1c5a6add779d 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -88,6 +88,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
                 err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid);
                 err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun);
                 err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+                 err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
                break;
        case __SI_POLL:
                err |= __put_user(kinfo->si_band, &uinfo->ssi_band);
@@ -111,6 +112,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
                err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid);
                err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid);
                err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr);
+                err |= __put_user(kinfo->si_int, &uinfo->ssi_int);
                break;
        default:
                /*
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index cc6ce8a84c21..e5f63da64d04 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,13 +5,13 @@ config SQUASHFS
        help
          Saying Y here includes support for SquashFS 4.0 (a Compressed
          Read-Only File System).  Squashfs is a highly compressed read-only
-          filesystem for Linux.  It uses zlib compression to compress both
+          filesystem for Linux.  It uses zlib/lzo compression to compress both
          files, inodes and directories.  Inodes in the system are very small
          and all blocks are packed to minimise data overhead. Block sizes
          greater than 4K are supported up to a maximum of 1 Mbytes (default
          block size 128K).  SquashFS 4.0 supports 64 bit filesystems and files
          (larger than 4GB), full uid/gid information, hard links and
-          timestamps.  
+          timestamps.
          Squashfs is intended for general read-only filesystem use, for
          archival use (i.e. in cases where a .tar.gz file may be used), and in
@@ -26,7 +26,7 @@ config SQUASHFS
          If unsure, say N.
-config SQUASHFS_XATTRS
+config SQUASHFS_XATTR
        bool "Squashfs XATTR support"
        depends on SQUASHFS
        default n
@@ -37,9 +37,24 @@ config SQUASHFS_XATTRS
          If unsure, say N.
-config SQUASHFS_EMBEDDED
+config SQUASHFS_LZO
+        bool "Include support for LZO compressed file systems"
+        depends on SQUASHFS
+        default n
+        select LZO_DECOMPRESS
+        help
+          Saying Y here includes support for reading Squashfs file systems
+          compressed with LZO compression.  LZO compression is mainly
+          aimed at embedded systems with slower CPUs where the overheads
+          of zlib are too high.
-        bool "Additional option for memory-constrained systems" 
+          LZO is not the standard compression used in Squashfs and so most
+          file systems will be readable without selecting this option.
+          If unsure, say N.
+config SQUASHFS_EMBEDDED
+        bool "Additional option for memory-constrained systems"
        depends on SQUASHFS
        default n
        help
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 2cee3e9fa452..7672bac8d328 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
-squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
+squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 157478da6ac9..24af9ce9722f 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -40,9 +40,11 @@ static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
        NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
 };
+#ifndef CONFIG_SQUASHFS_LZO
 static const struct squashfs_decompressor squashfs_lzo_unsupported_comp_ops = {
        NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
 };
+#endif
 static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
        NULL, NULL, NULL, 0, "unknown", 0
@@ -51,7 +53,11 @@ static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
 static const struct squashfs_decompressor *decompressor[] = {
        &squashfs_zlib_comp_ops,
        &squashfs_lzma_unsupported_comp_ops,
+#ifdef CONFIG_SQUASHFS_LZO
+        &squashfs_lzo_comp_ops,
+#else
        &squashfs_lzo_unsupported_comp_ops,
+#endif
        &squashfs_unknown_comp_ops
 };
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
new file mode 100644
index 000000000000..5d87789bf1c1
--- /dev/null
+++ b/fs/squashfs/lzo_wrapper.c
@@ -0,0 +1,136 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010 LG Electronics
+ * Chan Jeong <chan.jeong@lge.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * lzo_wrapper.c
+ */
+#include <linux/mutex.h>
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/lzo.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "decompressor.h"
+struct squashfs_lzo {
+        void    *input;
+        void    *output;
+};
+static void *lzo_init(struct squashfs_sb_info *msblk)
+{
+        int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
+        struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+        if (stream == NULL)
+                goto failed;
+        stream->input = vmalloc(block_size);
+        if (stream->input == NULL)
+                goto failed;
+        stream->output = vmalloc(block_size);
+        if (stream->output == NULL)
+                goto failed2;
+        return stream;
+failed2:
+        vfree(stream->input);
+failed:
+        ERROR("Failed to allocate lzo workspace\n");
+        kfree(stream);
+        return NULL;
+}
+static void lzo_free(void *strm)
+{
+        struct squashfs_lzo *stream = strm;
+        if (stream) {
+                vfree(stream->input);
+                vfree(stream->output);
+        }
+        kfree(stream);
+}
+static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
+        struct buffer_head **bh, int b, int offset, int length, int srclength,
+        int pages)
+{
+        struct squashfs_lzo *stream = msblk->stream;
+        void *buff = stream->input;
+        int avail, i, bytes = length, res;
+        size_t out_len = srclength;
+        mutex_lock(&msblk->read_data_mutex);
+        for (i = 0; i < b; i++) {
+                wait_on_buffer(bh[i]);
+                if (!buffer_uptodate(bh[i]))
+                        goto block_release;
+                avail = min(bytes, msblk->devblksize - offset);
+                memcpy(buff, bh[i]->b_data + offset, avail);
+                buff += avail;
+                bytes -= avail;
+                offset = 0;
+                put_bh(bh[i]);
+        }
+        res = lzo1x_decompress_safe(stream->input, (size_t)length,
+                                        stream->output, &out_len);
+        if (res != LZO_E_OK)
+                goto failed;
+        res = bytes = (int)out_len;
+        for (i = 0, buff = stream->output; bytes && i < pages; i++) {
+                avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+                memcpy(buffer[i], buff, avail);
+                buff += avail;
+                bytes -= avail;
+        }
+        mutex_unlock(&msblk->read_data_mutex);
+        return res;
+block_release:
+        for (; i < b; i++)
+                put_bh(bh[i]);
+failed:
+        mutex_unlock(&msblk->read_data_mutex);
+        ERROR("lzo decompression failed, data probably corrupt\n");
+        return -EIO;
+}
+const struct squashfs_decompressor squashfs_lzo_comp_ops = {
+        .init = lzo_init,
+        .free = lzo_free,
+        .decompress = lzo_uncompress,
+        .id = LZO_COMPRESSION,
+        .name = "lzo",
+        .supported = 1
+};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 733a17c42945..5d45569d5f72 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -104,3 +104,6 @@ extern const struct xattr_handler *squashfs_xattr_handlers[];
 /* zlib_wrapper.c */
 extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
+/* lzo_wrapper.c */
+extern const struct squashfs_decompressor squashfs_lzo_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 8eabb808b78d..c5137fc9ab11 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -274,7 +274,7 @@ struct squashfs_base_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
 };
 struct squashfs_ipc_inode {
@@ -283,7 +283,7 @@ struct squashfs_ipc_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  nlink;
 };
@@ -293,7 +293,7 @@ struct squashfs_lipc_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  nlink;
        __le32                  xattr;
 };
@@ -304,7 +304,7 @@ struct squashfs_dev_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  nlink;
        __le32                  rdev;
 };
@@ -315,7 +315,7 @@ struct squashfs_ldev_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  nlink;
        __le32                  rdev;
        __le32                  xattr;
@@ -327,7 +327,7 @@ struct squashfs_symlink_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  nlink;
        __le32                  symlink_size;
        char                    symlink[0];
@@ -339,7 +339,7 @@ struct squashfs_reg_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  start_block;
        __le32                  fragment;
        __le32                  offset;
@@ -353,7 +353,7 @@ struct squashfs_lreg_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le64                  start_block;
        __le64                  file_size;
        __le64                  sparse;
@@ -370,7 +370,7 @@ struct squashfs_dir_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  start_block;
        __le32                  nlink;
        __le16                  file_size;
@@ -384,7 +384,7 @@ struct squashfs_ldir_inode {
        __le16                  uid;
        __le16                  guid;
        __le32                  mtime;
-        __le32                  inode_number;
+        __le32                  inode_number;
        __le32                  nlink;
        __le32                  file_size;
        __le32                  start_block;
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index c7655e8b31cd..652b8541f9c6 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -18,7 +18,7 @@
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
- * xattr_id.c
+ * xattr.c
 */
 #include <linux/init.h>
@@ -295,7 +295,7 @@ static const struct xattr_handler squashfs_xattr_security_handler = {
        .get    = squashfs_security_get
 };
-static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+static const struct xattr_handler *squashfs_xattr_handler(int type)
 {
        if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
                /* ignore unrecognised type */
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index 9da071ae181c..49fe0d719fbf 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -21,7 +21,7 @@
 * xattr.h
 */
-#ifdef CONFIG_SQUASHFS_XATTRS
+#ifdef CONFIG_SQUASHFS_XATTR
 extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
                u64 *, int *);
 extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
diff --git a/fs/sysv/super.c b/fs/sysv/super.c
index 0e44a6253352..a0b0cda6927e 100644
--- a/fs/sysv/super.c
+++ b/fs/sysv/super.c
@@ -434,12 +434,46 @@ Ebadsize:
        goto failed;
 }
-static int v7_fill_super(struct super_block *sb, void *data, int silent)
+static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh)
 {
-        struct sysv_sb_info *sbi;
-        struct buffer_head *bh, *bh2 = NULL;
        struct v7_super_block *v7sb;
        struct sysv_inode *v7i;
+        struct buffer_head *bh2;
+        struct sysv_sb_info *sbi;
+        sbi = sb->s_fs_info;    /* v7_fill_super sets sbi->s_bytesex before each call */
+        /*
+         * plausibility check on superblock (held in bh): the free-block and
+         * free-inode cache counts and the filesystem size must be within the
+         * V7 limits when decoded through sbi.  The function returns 1 only
+         * if this check and the root inode check below both pass, else 0.
+         */
+        v7sb = (struct v7_super_block *) bh->b_data;
+        if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
+            fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
+            fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE)
+                return 0;
+        /* plausibility check on root inode: it is a directory,
+           with a nonzero size that is a multiple of 16 and no
+           larger than V7_NFILES directory entries */
+        bh2 = sb_bread(sb, 2);
+        if (bh2 == NULL)
+                return 0;       /* a failed read counts as "not detected" */
+        v7i = (struct sysv_inode *)(bh2->b_data + 64);  /* presumably the second inode slot of block 2 - confirm inode size */
+        if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
+            (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
+            (fs32_to_cpu(sbi, v7i->i_size) & 017) ||
+            (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES *
+             sizeof(struct sysv_dir_entry))) {
+                brelse(bh2);    /* bh2 is local to this check; bh stays with the caller */
+                return 0;
+        }
+        brelse(bh2);
+        return 1;
+}
+static int v7_fill_super(struct super_block *sb, void *data, int silent)
+{
+        struct sysv_sb_info *sbi;
+        struct buffer_head *bh;
        if (440 != sizeof (struct v7_super_block))
                panic("V7 FS: bad super-block size");
@@ -453,7 +487,6 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_sb = sb;
        sbi->s_block_base = 0;
        sbi->s_type = FSTYPE_V7;
-        sbi->s_bytesex = BYTESEX_PDP;
        sb->s_fs_info = sbi;
        
        sb_set_blocksize(sb, 512);
@@ -465,32 +498,27 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent)
                goto failed;
        }
-        /* plausibility check on superblock */
+        /* Try PDP-11 UNIX */
-        v7sb = (struct v7_super_block *) bh->b_data;
+        sbi->s_bytesex = BYTESEX_PDP;
-        if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE ||
+        if (v7_sanity_check(sb, bh))
-            fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD ||
+                goto detected;
-            fs32_to_cpu(sbi, v7sb->s_time) == 0)
-                goto failed;
-        /* plausibility check on root inode: it is a directory,
+        /* Try PC/IX, v7/x86 */
-           with a nonzero size that is a multiple of 16 */
+        sbi->s_bytesex = BYTESEX_LE;
-        if ((bh2 = sb_bread(sb, 2)) == NULL)
+        if (v7_sanity_check(sb, bh))
-                goto failed;
+                goto detected;
-        v7i = (struct sysv_inode *)(bh2->b_data + 64);
-        if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR ||
-            (fs32_to_cpu(sbi, v7i->i_size) == 0) ||
-            (fs32_to_cpu(sbi, v7i->i_size) & 017) != 0)
-                goto failed;
-        brelse(bh2);
-        bh2 = NULL;
+        goto failed;
+detected:
        sbi->s_bh1 = bh;
        sbi->s_bh2 = bh;
        if (complete_read_super(sb, silent, 1))
                return 0;
 failed:
-        brelse(bh2);
+        printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n",
+                sb->s_id);
        brelse(bh);
        kfree(sbi);
        return -EINVAL;
@@ -559,4 +587,5 @@ static void __exit exit_sysv_fs(void)
 module_init(init_sysv_fs)
 module_exit(exit_sysv_fs)
+MODULE_ALIAS("v7");
 MODULE_LICENSE("GPL");
author	Chris Metcalf <cmetcalf@tilera.com>	2010-08-13 19:59:15 -0400
committer	Chris Metcalf <cmetcalf@tilera.com>	2010-08-13 19:59:15 -0400
commit	7d72e6fa56c4100b9669efe0044f77ed9eb785a1 (patch)
tree	5e90bf4969809a1ab20b97432b85be20ccfaa1f4 /fs
parent	ba00376b0b13f234d839541a7b36a5bf5c2a4036 (diff)
parent	2be1f3a73dd02e38e181cf5abacb3d45a6a2d6b8 (diff)